cleanup for raw txt (grammar check)
This commit is contained in:
@@ -53,9 +53,9 @@
|
||||
|
||||
% **************************************************************************************************
|
||||
% template setup -- do not change these unless you know what you are doing!
|
||||
\input{./base/documentclass_\DocumentType}
|
||||
\input{./base/documentclass_thesis}
|
||||
\input{./base/packages}
|
||||
\input{./base/layout_\DocumentType}
|
||||
\input{./base/layout_thesis}
|
||||
\input{./base/macros}
|
||||
|
||||
% **************************************************************************************************
|
||||
@@ -156,26 +156,27 @@
|
||||
|
||||
% variable for page numbering
|
||||
\newcounter{mypageno}
|
||||
% **************************************************************************************************
|
||||
\begin{document}
|
||||
% **************************************************************************************************
|
||||
|
||||
\input{./base/syntax_formatting}
|
||||
|
||||
% for thesis: switch to frontmatter (Roman numbering, etc.)
|
||||
\ifthenelse{\equal{\DocumentType}{thesis}}
|
||||
\ifthenelse{\equal{thesis}{thesis}}
|
||||
{
|
||||
\frontmatter \pagestyle{plain} \pagenumbering{Roman}
|
||||
}{}
|
||||
% **************************************************************************************************
|
||||
\begin{document}
|
||||
% **************************************************************************************************
|
||||
|
||||
%title
|
||||
\input{./base/titlepage_\DocumentType}
|
||||
\input{./base/titlepage_thesis}
|
||||
|
||||
% for thesis: abstract, kurzfassung, affidavit and statutory declaration
|
||||
\ifthenelse{\equal{\DocumentType}{thesis}}
|
||||
\ifthenelse{\equal{thesis}{thesis}}
|
||||
{
|
||||
\emptydoublepage
|
||||
\addcontentsline{toc}{chapter}{Statutory Declaration}
|
||||
\input{./base/declaration_\DocumentLanguage}
|
||||
\input{./base/declaration_en}
|
||||
\emptydoublepage
|
||||
\input{thesis_preamble/acknowledgements}
|
||||
\emptydoublepage
|
||||
@@ -187,7 +188,7 @@
|
||||
|
||||
\tableofcontents
|
||||
|
||||
\ifthenelse{\equal{\DocumentType}{thesis}}
|
||||
\ifthenelse{\equal{thesis}{thesis}}
|
||||
{
|
||||
\emptydoublepage
|
||||
\setcounter{mypageno}{\value{page}}
|
||||
@@ -1148,7 +1149,7 @@ In summary, while this thesis demonstrates the feasibility of using anomaly dete
|
||||
% **************************************************************************************************
|
||||
|
||||
\appendix
|
||||
\ifthenelse{\equal{\DocumentType}{thesis}}
|
||||
\ifthenelse{\equal{thesis}{thesis}}
|
||||
{
|
||||
\setcounter{mypageno}{\value{page}}
|
||||
\frontmatter \pagestyle{plain} \pagenumbering{Roman}
|
||||
|
||||
@@ -24,15 +24,12 @@
|
||||
not used other than the declared sources/resources, and that I have
|
||||
explicitly indicated all material which has been quoted either
|
||||
literally or by content from the sources used.
|
||||
\ifthenelse{\equal{\ThesisTitle}{master's thesis} \or
|
||||
\equal{\ThesisTitle}{diploma thesis} \or
|
||||
\equal{\ThesisTitle}{doctoral thesis}}
|
||||
{The text document uploaded to TUGRAZonline is identical to the present \ThesisTitle.}{\reminder{TODO: fix \textbackslash ThesisTitle}}
|
||||
The text document uploaded to TUGRAZonline is identical to the present \ThesisTitle.
|
||||
|
||||
|
||||
\par\vspace*{4cm}
|
||||
\centerline{
|
||||
\begin{tabular}{m{1.5cm}cm{1.5cm}m{3cm}m{1.5cm}cm{1.5cm}}
|
||||
\cline{1-3} \cline{5-7}
|
||||
& date & & & & (signature) &\\
|
||||
\end{tabular}}
|
||||
\begin{tabular}{m{1.5cm}cm{1.5cm}m{3cm}m{1.5cm}cm{1.5cm}}
|
||||
\cline{1-3} \cline{5-7}
|
||||
& date & & & & (signature) & \\
|
||||
\end{tabular}}
|
||||
|
||||
11
thesis/drop-images.lua
Normal file
11
thesis/drop-images.lua
Normal file
@@ -0,0 +1,11 @@
|
||||
-- drop-images.lua
|
||||
-- Replaces all images (figures, graphics) with a short placeholder.
|
||||
-- Every Image inline is swapped for a fixed textual placeholder.
function Image(el)
  local placeholder = pandoc.Str("[image omitted]")
  return placeholder
end
|
||||
|
||||
-- For LaTeX figures that are still raw:
-- a raw "tex" block whose text opens a figure environment is replaced by a
-- placeholder; for every other raw block nothing is returned.
function RawBlock(el)
  if el.format ~= "tex" then
    return
  end
  if not el.text:match("\\begin%s*{%s*figure%s*}") then
    return
  end
  local placeholder = pandoc.Str("[figure omitted]")
  return pandoc.Plain({placeholder})
end
|
||||
|
||||
11
thesis/drop-tables.lua
Normal file
11
thesis/drop-tables.lua
Normal file
@@ -0,0 +1,11 @@
|
||||
-- drop-tables.lua
|
||||
-- Replaces raw LaTeX blocks that contain a tabular or tabularx environment with a "[table omitted]" placeholder.
|
||||
-- A raw "tex" block whose text opens a tabular or tabularx environment is
-- replaced by a placeholder; for every other raw block nothing is returned.
function RawBlock(el)
  if el.format ~= "tex" then
    return
  end
  -- "tabularx?" matches both "tabular" and "tabularx"
  local opens_table = el.text:match("\\begin%s*{%s*tabularx?%s*}")
  if opens_table then
    local placeholder = pandoc.Str("[table omitted]")
    return pandoc.Plain({placeholder})
  end
end
|
||||
|
||||
@@ -28,7 +28,10 @@
|
||||
zathura
|
||||
wmctrl
|
||||
python312
|
||||
pandoc
|
||||
pandoc-lua-filters
|
||||
];
|
||||
filtersPath = "${pkgs.pandoc-lua-filters}/share/pandoc/filters";
|
||||
in
|
||||
{
|
||||
devShell = pkgs.mkShell {
|
||||
@@ -39,6 +42,28 @@
|
||||
];
|
||||
};
|
||||
|
||||
# Shell-entry hook: links the packaged pandoc Lua filters into ./pandoc-filters
# and, if none exists yet, writes a pandoc.defaults.yaml that references them.
# NOTE(review): in this view the attribute follows the `};` that appears to
# close `pkgs.mkShell { ... }`; mkShell only runs a shellHook passed as one of
# its own arguments -- confirm the placement.
shellHook = ''
  # Run inside a subshell: `set -eu` then applies to the hook only and cannot
  # leak errexit/nounset into the interactive shell.
  (
    set -eu
    # local folder in your repo to reference in commands
    link_target="pandoc-filters"
    # refresh symlink each time you enter the shell
    ln -sfn "${filtersPath}" "$link_target"
    echo "Linked $link_target -> ${filtersPath}"

    # (optional) write a defaults file that uses the relative symlink
    if [ ! -f pandoc.defaults.yaml ]; then
      # The here-document body and its terminator stay at the base indentation
      # of this string so that the terminator ends up in column 0.
      cat > pandoc.defaults.yaml <<'YAML'
  from: latex
  to: plain
  wrap: none
  lua-filter:
  - pandoc-filters/latex-hyphen.lua
  - pandoc-filters/pandoc-quotes.lua
  YAML
      echo "Wrote pandoc.defaults.yaml"
    fi
  )
'';
|
||||
|
||||
}
|
||||
);
|
||||
}
|
||||
|
||||
43
thesis/keep-citations.lua
Normal file
43
thesis/keep-citations.lua
Normal file
@@ -0,0 +1,43 @@
|
||||
-- keep-citations.lua
|
||||
-- Replace citations with a placeholder and eat any preceding space.
|
||||
-- Placeholder text substituted for every citation; squash_spaces compares
-- Str text against this same value.
local PH = "[citation]"
|
||||
|
||||
-- Pandoc-native citations (if the reader produced Cite nodes)
|
||||
function Cite(el)
  -- The whole Cite node is swapped for the placeholder string.
  local placeholder = pandoc.Str(PH)
  return placeholder
end
|
||||
|
||||
-- Raw LaTeX \cite-like macros (when not parsed as Cite).
-- A raw inline whose format contains "tex" and whose text contains a
-- cite-style command (\cite, \citep, \parencite, \cite*, ...) is replaced by
-- the placeholder; for any other raw inline nothing is returned.
function RawInline(el)
  local is_tex = el.format and el.format:match("tex")
  -- "%a*" rather than "%a-*": in a Lua pattern a "*" that follows the "-"
  -- quantifier is a literal asterisk, so the previous pattern only matched
  -- text with a "*" directly in front of "cite" and missed a plain \cite.
  if is_tex and el.text:match("\\%a*cite%a*%*?") then
    return pandoc.Str(PH)
  end
end
|
||||
|
||||
-- Drop the Space that sits immediately in front of a placeholder Str.
-- Returns a new table holding the remaining inlines in their original order;
-- the input list itself is not modified.
local function squash_spaces(inlines)
  local result = {}
  local skip_next = false
  for idx = 1, #inlines do
    if skip_next then
      -- this element was already emitted together with the Space before it
      skip_next = false
    else
      local here = inlines[idx]
      local ahead = inlines[idx + 1]
      if here and here.t == "Space" and ahead and ahead.t == "Str" and ahead.text == PH then
        result[#result + 1] = ahead
        skip_next = true
      else
        result[#result + 1] = here
      end
    end
  end
  return result
end
|
||||
|
||||
-- Paragraphs: strip the Space in front of each citation placeholder.
function Para(el)
  local cleaned = squash_spaces(el.content)
  el.content = cleaned
  return el
end
|
||||
|
||||
-- Plain blocks: strip the Space in front of each citation placeholder.
function Plain(el)
  local cleaned = squash_spaces(el.content)
  el.content = cleaned
  return el
end
|
||||
|
||||
55
thesis/tex2plaintext.sh
Executable file
55
thesis/tex2plaintext.sh
Executable file
@@ -0,0 +1,55 @@
|
||||
#!/usr/bin/env bash
#
# tex2plaintext.sh -- flatten a LaTeX document with latexpand, replace its
# tabular/tabularx environments with a placeholder, convert the result to
# plain text with pandoc, and split that text into two files at a marker line.
#
# Usage:
#   ./tex2plaintext.sh [INPUT_TEX] [OUT_BASENAME]
#
# Defaults:
#   INPUT_TEX     = Main.tex
#   OUT_BASENAME  = thesis   (produces thesis.txt, thesis_part1.txt, thesis_part2.txt)
#
# The intermediate files flat.tex and flat_notables.tex are written to the
# current directory. Exits with status 1 when the marker line is not found.

set -euo pipefail

INPUT_TEX="${1:-Main.tex}"
OUT_BASE="${2:-thesis}"

FLAT_TEX="flat.tex"
NO_TABLES_TEX="flat_notables.tex"
PLAIN_TXT="${OUT_BASE}.txt"
PART1_TXT="${OUT_BASE}_part1.txt"
PART2_TXT="${OUT_BASE}_part2.txt"
MARKER="Data and Preprocessing"

echo "[1/4] Flattening with latexpand -> ${FLAT_TEX}"
latexpand "${INPUT_TEX}" > "${FLAT_TEX}"

echo "[2/4] Removing tabular/tabularx environments -> ${NO_TABLES_TEX}"
# Replace entire tabular / tabularx environments with a placeholder.
# -0777 slurps the whole file so a match can span lines; \1 pairs each
# \begin with the \end of the same environment name.
perl -0777 -pe 's/\\begin\{(tabularx?)\}.*?\\end\{\1\}/[table omitted]/gs' \
  "${FLAT_TEX}" > "${NO_TABLES_TEX}"

echo "[3/4] Converting to plain text with pandoc -> ${PLAIN_TXT}"
pandoc -f latex -t plain --wrap=none "${NO_TABLES_TEX}" -o "${PLAIN_TXT}"

echo "[4/4] Splitting ${PLAIN_TXT} before the marker line: \"${MARKER}\""

# Ensure the marker exists exactly on its own line.
# -F compares the marker as a fixed string (not a regex), which matches the
# literal `$0 == marker` comparison used by the awk split below.
if ! grep -Fxq -- "${MARKER}" "${PLAIN_TXT}"; then
  echo "ERROR: Marker line not found exactly as \"${MARKER}\" in ${PLAIN_TXT}."
  echo "       (It must be the only content on that line.)"
  exit 1
fi

# Clean previous outputs if present
rm -f -- "${PART1_TXT}" "${PART2_TXT}"

# Split so the marker line becomes the FIRST line of part 2
awk -v marker="${MARKER}" -v out1="${PART1_TXT}" -v out2="${PART2_TXT}" '
  BEGIN { current = out1 }
  $0 == marker { current = out2; print $0 > current; next }
  { print $0 > current }
' "${PLAIN_TXT}"

echo "Done."
echo " - ${PLAIN_TXT}"
echo " - ${PART1_TXT}"
echo " - ${PART2_TXT}"
|
||||
|
||||
Reference in New Issue
Block a user