parent
04b59b0328
commit
da95ab11b4
@ -0,0 +1,88 @@ |
||||
#! /usr/bin/env nix-shell
#! nix-shell -I nixpkgs=. -i bash -p pandoc libxml2

# This script is temporarily needed while we transition the manual to
# CommonMark. It converts DocBook files into our CommonMark flavour.
#
# Usage: db-to-md.sh [--debug] FILE.xml...
#
# With --debug, pandoc's JSON AST is written instead of Markdown and the
# intermediate XML files are kept so that they can be inspected.
#
# libxml2 is requested above because the script calls xmllint to detect the
# root element of every input file.

debug=
files=()

# Everything that is not a recognised flag is treated as an input file.
while [ "$#" -gt 0 ]; do
    i="$1"; shift 1
    case "$i" in
        --debug)
            debug=1
            ;;
        *)
            files+=("$i")
            ;;
    esac
done

echo "WARNING: This is an experimental script and might not preserve all formatting." >&2
echo "Please report any issues you discover." >&2

outExtension="md"
if [[ $debug ]]; then
    outExtension="json"
fi

# Directory containing this script; helper scripts and filters are resolved
# relative to it so the script can be run from any working directory.
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

# Intermediate files created with mktemp. They are removed on exit unless
# --debug was given, because debug output points the user at them.
tempFiles=()
cleanup() {
    if [[ ! $debug && ${#tempFiles[@]} -gt 0 ]]; then
        rm -f -- "${tempFiles[@]}"
    fi
}
trap cleanup EXIT

# NOTE: Keep in sync with Nixpkgs manual (/doc/Makefile).
# TODO: Remove raw-attribute when we can get rid of DocBook altogether.
pandoc_commonmark_enabled_extensions=+attributes+fenced_divs+footnotes+bracketed_spans+definition_lists+pipe_tables+raw_attribute
targetLang="commonmark${pandoc_commonmark_enabled_extensions}+smart"
if [[ $debug ]]; then
    targetLang=json
fi
pandoc_flags=(
    # Not needed:
    # - diagram-generator.lua (we do not support that in NixOS manual to limit dependencies)
    # - media extraction (was only required for diagram generator)
    # - myst-reader/roles.lua (only relevant for MyST → DocBook)
    # - link-unix-man-references.lua (links should only be added to display output)
    # - docbook-writer/rst-roles.lua (only relevant for → DocBook)
    # - docbook-writer/labelless-link-is-xref.lua (only relevant for → DocBook)
    "--lua-filter=$DIR/../../doc/build-aux/pandoc-filters/docbook-reader/citerefentry-to-rst-role.lua"
    "--lua-filter=$DIR/../../doc/build-aux/pandoc-filters/myst-writer/roles.lua"
    "--lua-filter=$DIR/doc/unknown-code-language.lua"
    -f docbook
    -t "$targetLang"
    --tab-stop=2
    --wrap=none
)

for file in "${files[@]}"; do
    if [[ ! -f "$file" ]]; then
        echo "db-to-md.sh: $file does not exist" >&2
        exit 1
    fi

    # The root element decides which double extension the output file gets.
    rootElement=$(xmllint --xpath 'name(//*)' "$file")

    if [[ $rootElement = chapter ]]; then
        extension=".chapter.$outExtension"
    elif [[ $rootElement = section ]]; then
        extension=".section.$outExtension"
    else
        echo "db-to-md.sh: $file contains an unsupported root element $rootElement" >&2
        exit 1
    fi

    # Strip whichever of the known input suffixes is present, then append the
    # extension chosen above.
    outFile="${file%".section.xml"}"
    outFile="${outFile%".chapter.xml"}"
    outFile="${outFile%".xml"}$extension"

    # Step 1: escape markup nested inside code elements.
    temp1=$(mktemp) || exit 1
    tempFiles+=("$temp1")
    if ! "$DIR/doc/escape-code-markup.py" "$file" "$temp1"; then
        echo "db-to-md.sh: escape-code-markup.py failed for $file" >&2
        exit 1
    fi
    if [[ $debug ]]; then
        echo "Converted $file to $temp1" >&2
    fi

    # Step 2: turn xrefs into links with empty labels.
    temp2=$(mktemp) || exit 1
    tempFiles+=("$temp2")
    if ! "$DIR/doc/replace-xrefs-by-empty-links.py" "$temp1" "$temp2"; then
        echo "db-to-md.sh: replace-xrefs-by-empty-links.py failed for $file" >&2
        exit 1
    fi
    if [[ $debug ]]; then
        echo "Converted $temp1 to $temp2" >&2
    fi

    # Step 3: let pandoc produce the final output; only report success when
    # it actually succeeded.
    if ! pandoc "$temp2" -o "$outFile" "${pandoc_flags[@]}"; then
        echo "db-to-md.sh: pandoc failed for $file" >&2
        exit 1
    fi
    echo "Converted $file to $outFile" >&2
done
@ -0,0 +1,97 @@ |
||||
#! /usr/bin/env nix-shell |
||||
#! nix-shell -I nixpkgs=channel:nixos-unstable -i python3 -p python3 -p python3.pkgs.lxml |
||||
|
||||
""" |
||||
Pandoc will strip any markup within code elements so |
||||
let’s escape them so that they can be handled manually. |
||||
""" |
||||
|
||||
import lxml.etree as ET |
||||
import re |
||||
import sys |
||||
|
||||
def replace_element_by_text(el: ET.Element, text: str) -> None:
    """
    Substitute the element ``el`` with the string ``text`` in its parent.

    The element's tail is appended to ``text`` first, and the combined string
    is attached either to the tail of the preceding sibling or, when there is
    no preceding sibling, to the text of the parent. The element is then
    removed from the parent. An element without a parent is left untouched.

    Author: bernulf
    Source: https://stackoverflow.com/a/10520552/160386
    SPDX-License-Identifier: CC-BY-SA-3.0
    """
    replacement = text + (el.tail or "")

    container = el.getparent()
    if container is None:
        return

    sibling = el.getprevious()
    if sibling is None:
        # First child: the string becomes part of the parent's leading text.
        container.text = (container.text or "") + replacement
    else:
        # Otherwise it continues the text that follows the previous sibling.
        sibling.tail = (sibling.tail or "") + replacement
    container.remove(el)
||||
|
||||
DOCBOOK_NS = "http://docbook.org/ns/docbook" |
||||
|
||||
# List of elements that pandoc’s DocBook reader strips markup from. |
||||
# https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/Readers/DocBook.hs |
||||
code_elements = [ |
||||
# CodeBlock |
||||
"literallayout", |
||||
"screen", |
||||
"programlisting", |
||||
# Code (inline) |
||||
"classname", |
||||
"code", |
||||
"filename", |
||||
"envar", |
||||
"literal", |
||||
"computeroutput", |
||||
"prompt", |
||||
"parameter", |
||||
"option", |
||||
"markup", |
||||
"wordasword", |
||||
"command", |
||||
"varname", |
||||
"function", |
||||
"type", |
||||
"symbol", |
||||
"constant", |
||||
"userinput", |
||||
"systemitem", |
||||
] |
||||
|
||||
# A single (optionally prefixed) namespace declaration, including the
# whitespace in front of it.
XMLNS_REGEX = re.compile(r'\s+xmlns(?::[^=]+)?="[^"]*"')
# The first tag of a string, optionally preceded by whitespace.
ROOT_ELEMENT_REGEX = re.compile(r'^\s*<[^>]+>')


def remove_xmlns(match: re.Match) -> str:
    """
    Return the matched text with every xmlns attribute deleted.

    Intended as a replacement callback: ``match`` is expected to cover
    an opening tag.
    """
    opening_tag = match[0]
    return XMLNS_REGEX.sub('', opening_tag)
||||
|
||||
if __name__ == '__main__':
    # Check the command line explicitly: an `assert` is stripped when Python
    # runs with -O, which would turn a usage mistake into an IndexError.
    if len(sys.argv) < 3:
        sys.exit("usage: escape-code-markup.py <input> <output>")

    tree = ET.parse(sys.argv[1])
    name_predicate = " or ".join(f"local-name()='{el}'" for el in code_elements)

    # Every element that is a direct child of one of the DocBook code elements
    # gets replaced by its own serialization, stored as plain text.
    for markup in tree.xpath(f"//*[({name_predicate}) and namespace-uri()='{DOCBOOK_NS}']/*"):
        text = ET.tostring(markup, encoding=str)

        # tostring adds xmlns attributes to the element we want to stringify
        # as if it was supposed to be usable standalone.
        # We are just converting it to text so we do not care.
        # Let’s strip the namespace declarations to keep the code clean.
        #
        # Note that this removes even namespaces that were potentially
        # in the original file. Though, that should be very rare –
        # most of the time, we will stringify empty DocBook elements
        # like <xref> or <co> or, at worst, <link> with xlink:href attribute.
        #
        # Also note that the regex expects the root element to be first
        # thing in the string. But that should be fine, the tostring method
        # does not produce XML declaration or doctype by default.
        text = ROOT_ELEMENT_REGEX.sub(remove_xmlns, text)

        replace_element_by_text(markup, text)

    tree.write(sys.argv[2])
@ -0,0 +1,32 @@ |
||||
#! /usr/bin/env nix-shell |
||||
#! nix-shell -I nixpkgs=channel:nixos-unstable -i python3 -p python3 -p python3.pkgs.lxml |
||||
|
||||
""" |
||||
Pandoc will try to resolve xrefs and replace them with regular links.
Let’s replace them with links with empty labels, which MyST
and our pandoc filters recognize as cross-references.
||||
""" |
||||
|
||||
import lxml.etree as ET |
||||
import sys |
||||
|
||||
XLINK_NS = "http://www.w3.org/1999/xlink"

# Prefix map used for the ElementPath query below.
ns = {
    "db": "http://docbook.org/ns/docbook",
}


if __name__ == '__main__':
    # Check the command line explicitly: an `assert` is stripped when Python
    # runs with -O, which would turn a usage mistake into an IndexError.
    if len(sys.argv) < 3:
        sys.exit("usage: replace-xrefs-by-empty-links.py <input> <output>")

    tree = ET.parse(sys.argv[1])
    for xref in tree.findall(".//db:xref", ns):
        target_name = xref.get("linkend")
        if target_name is None:
            # Without a linkend there is no anchor to point at; leave the
            # element as it is instead of emitting a bogus "#None" target.
            print(
                "replace-xrefs-by-empty-links.py: skipping xref without linkend",
                file=sys.stderr,
            )
            continue

        parent = xref.getparent()
        # NOTE(review): the new element is created without a namespace, as
        # before; confirm that is intended for namespaced DocBook 5 input.
        link = parent.makeelement('link')
        link.set(f"{{{XLINK_NS}}}href", f"#{target_name}")
        # lxml stores the text that follows an element as that element's
        # tail, and replace() takes the old tail away together with the old
        # element. Carry it over so the surrounding sentence stays intact.
        link.tail = xref.tail
        parent.replace(xref, link)

    tree.write(sys.argv[2])
@ -0,0 +1,12 @@ |
||||
--[[ |
||||
Adds “unknown” class to CodeBlock AST nodes without any classes. |
||||
|
||||
This will cause Pandoc to use fenced code block, which we prefer. |
||||
]] |
||||
|
||||
-- Filter for CodeBlock nodes: tags class-less blocks with 'unknown'.
-- Blocks that already have at least one class produce no return value.
function CodeBlock(elem)
  if #elem.classes ~= 0 then
    return
  end

  elem.classes:insert('unknown')
  return elem
end
Loading…
Reference in new issue