cleanup for raw txt (grammar check)

This commit is contained in:
Jan Kowalczyk
2025-10-18 12:19:26 +02:00
parent 8697c07c0f
commit 374420727b
7 changed files with 162 additions and 19 deletions

View File

@@ -53,9 +53,9 @@
% **************************************************************************************************
% template setup -- do not change these unless you know what you are doing!
\input{./base/documentclass_\DocumentType}
\input{./base/documentclass_thesis}
\input{./base/packages}
\input{./base/layout_\DocumentType}
\input{./base/layout_thesis}
\input{./base/macros}
% **************************************************************************************************
@@ -156,26 +156,27 @@
% variable for page numbering
\newcounter{mypageno}
% **************************************************************************************************
\begin{document}
% **************************************************************************************************
\input{./base/syntax_formatting}
% for thesis: switch to frontmatter (Roman numbering, etc.)
\ifthenelse{\equal{\DocumentType}{thesis}}
\ifthenelse{\equal{thesis}{thesis}}
{
\frontmatter \pagestyle{plain} \pagenumbering{Roman}
}{}
% **************************************************************************************************
\begin{document}
% **************************************************************************************************
%title
\input{./base/titlepage_\DocumentType}
\input{./base/titlepage_thesis}
% for thesis: abstract, kurzfassung, affidavit and statutory declaration
\ifthenelse{\equal{\DocumentType}{thesis}}
\ifthenelse{\equal{thesis}{thesis}}
{
\emptydoublepage
\addcontentsline{toc}{chapter}{Statutory Declaration}
\input{./base/declaration_\DocumentLanguage}
\input{./base/declaration_en}
\emptydoublepage
\input{thesis_preamble/acknowledgements}
\emptydoublepage
@@ -187,7 +188,7 @@
\tableofcontents
\ifthenelse{\equal{\DocumentType}{thesis}}
\ifthenelse{\equal{thesis}{thesis}}
{
\emptydoublepage
\setcounter{mypageno}{\value{page}}
@@ -1148,7 +1149,7 @@ In summary, while this thesis demonstrates the feasibility of using anomaly dete
% **************************************************************************************************
\appendix
\ifthenelse{\equal{\DocumentType}{thesis}}
\ifthenelse{\equal{thesis}{thesis}}
{
\setcounter{mypageno}{\value{page}}
\frontmatter \pagestyle{plain} \pagenumbering{Roman}

View File

@@ -24,10 +24,7 @@
not used other than the declared sources/resources, and that I have
explicitly indicated all material which has been quoted either
literally or by content from the sources used.
\ifthenelse{\equal{\ThesisTitle}{master's thesis} \or
\equal{\ThesisTitle}{diploma thesis} \or
\equal{\ThesisTitle}{doctoral thesis}}
{The text document uploaded to TUGRAZonline is identical to the present \ThesisTitle.}{\reminder{TODO: fix \textbackslash ThesisTitle}}
The text document uploaded to TUGRAZonline is identical to the present \ThesisTitle.
\par\vspace*{4cm}

11
thesis/drop-images.lua Normal file
View File

@@ -0,0 +1,11 @@
-- drop-images.lua
-- Replaces all images (figures, graphics) with a short placeholder.
function Image(el) return pandoc.Str("[image omitted]") end
-- For LaTeX figures that are still raw
function RawBlock(el)
if el.format == "tex" and el.text:match("\\begin%s*{%s*figure%s*}") then
return pandoc.Plain({pandoc.Str("[figure omitted]")})
end
end

11
thesis/drop-tables.lua Normal file
View File

@@ -0,0 +1,11 @@
-- drop-tables.lua
-- Removes LaTeX tabular and tabularx environments (and their contents).
function RawBlock(el)
if el.format == "tex" then
-- Check for tabular or tabularx environment
if el.text:match("\\begin%s*{%s*tabularx?%s*}") then
return pandoc.Plain({pandoc.Str("[table omitted]")})
end
end
end

View File

@@ -28,7 +28,10 @@
zathura
wmctrl
python312
pandoc
pandoc-lua-filters
];
filtersPath = "${pkgs.pandoc-lua-filters}/share/pandoc/filters";
in
{
devShell = pkgs.mkShell {
@@ -39,6 +42,28 @@
];
};
shellHook = ''
set -eu
# local folder in your repo to reference in commands
link_target="pandoc-filters"
# refresh symlink each time you enter the shell
ln -sfn ${filtersPath} "$link_target"
echo "Linked $link_target -> ${filtersPath}"
# (optional) write a defaults file that uses the relative symlink
if [ ! -f pandoc.defaults.yaml ]; then
cat > pandoc.defaults.yaml <<'YAML'
from: latex
to: plain
wrap: none
lua-filter:
- pandoc-filters/latex-hyphen.lua
- pandoc-filters/pandoc-quotes.lua
YAML
echo "Wrote pandoc.defaults.yaml"
fi
'';
}
);
}

43
thesis/keep-citations.lua Normal file
View File

@@ -0,0 +1,43 @@
-- keep-citations.lua
-- Replace citations with a placeholder and eat any preceding space.
local PH = "[citation]"
-- Pandoc-native citations (if the reader produced Cite nodes)
function Cite(el) return pandoc.Str(PH) end
-- Raw LaTeX \cite-like macros (when not parsed as Cite)
function RawInline(el)
if el.format and el.format:match("tex") and el.text:match("\\%a-*cite%*?") then
return pandoc.Str(PH)
end
end
-- Remove a single leading Space before our placeholder
local function squash_spaces(inlines)
local out = {}
local i = 1
while i <= #inlines do
local cur = inlines[i]
local nxt = inlines[i + 1]
if cur and cur.t == "Space" and nxt and nxt.t == "Str" and nxt.text ==
PH then
table.insert(out, nxt)
i = i + 2
else
table.insert(out, cur)
i = i + 1
end
end
return out
end
function Para(el)
el.content = squash_spaces(el.content)
return el
end
function Plain(el)
el.content = squash_spaces(el.content)
return el
end

55
thesis/tex2plaintext.sh Executable file
View File

@@ -0,0 +1,55 @@
#!/usr/bin/env bash
set -euo pipefail
# Usage:
# ./tex2plaintext.sh [INPUT_TEX] [OUT_BASENAME]
#
# Defaults:
# INPUT_TEX = Main.txt (your original file name)
# OUT_BASENAME = thesis (produces thesis.txt, thesis_part1.txt, thesis_part2.txt)
INPUT_TEX="${1:-Main.tex}"
OUT_BASE="${2:-thesis}"
FLAT_TEX="flat.tex"
NO_TABLES_TEX="flat_notables.tex"
PLAIN_TXT="${OUT_BASE}.txt"
PART1_TXT="${OUT_BASE}_part1.txt"
PART2_TXT="${OUT_BASE}_part2.txt"
MARKER="Data and Preprocessing"
echo "[1/4] Flattening with latexpand -> ${FLAT_TEX}"
latexpand "${INPUT_TEX}" > "${FLAT_TEX}"
echo "[2/4] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}"
# Replace entire tabular / tabularx environments with a placeholder
perl -0777 -pe 's/\\begin\{(tabularx?)\}.*?\\end\{\1\}/[table omitted]/gs' \
"${FLAT_TEX}" > "${NO_TABLES_TEX}"
echo "[3/4] Converting to plain text with pandoc -> ${PLAIN_TXT}"
pandoc -f latex -t plain --wrap=none "${NO_TABLES_TEX}" -o "${PLAIN_TXT}"
echo "[4/4] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\""
# Ensure the marker exists exactly on its own line
if ! grep -xq "${MARKER}" "${PLAIN_TXT}"; then
echo "ERROR: Marker line not found exactly as \"${MARKER}\" in ${PLAIN_TXT}."
echo " (It must be the only content on that line.)"
exit 1
fi
# Clean previous outputs if present
rm -f -- "${PART1_TXT}" "${PART2_TXT}"
# Split so the marker line becomes the FIRST line of part 2
awk -v marker="${MARKER}" -v out1="${PART1_TXT}" -v out2="${PART2_TXT}" '
BEGIN { current = out1 }
$0 == marker { current = out2; print $0 > current; next }
{ print $0 > current }
' "${PLAIN_TXT}"
echo "Done."
echo " - ${PLAIN_TXT}"
echo " - ${PART1_TXT}"
echo " - ${PART2_TXT}"