Update document_converter.py
Browse files
- document_converter.py +32 -38
document_converter.py
CHANGED
|
@@ -97,14 +97,10 @@ class DocumentConverter:
|
|
| 97 |
def _detect_input_format(self, content: str, force_latex: bool) -> str:
|
| 98 |
"""
|
| 99 |
Decide whether Pandoc should treat the text as 'latex' or 'markdown'.
|
| 100 |
-
|
| 101 |
-
In our use case, we generally force LaTeX, but this function
|
| 102 |
-
is kept flexible.
|
| 103 |
"""
|
| 104 |
if force_latex:
|
| 105 |
return 'latex'
|
| 106 |
|
| 107 |
-
# Heuristics to detect LaTeX
|
| 108 |
latex_markers = [
|
| 109 |
'\\documentclass',
|
| 110 |
'\\begin{document}',
|
|
@@ -119,7 +115,6 @@ class DocumentConverter:
|
|
| 119 |
if any(marker in content for marker in latex_markers):
|
| 120 |
return 'latex'
|
| 121 |
|
| 122 |
-
# Default to markdown if no strong LaTeX signs
|
| 123 |
return 'markdown'
|
| 124 |
|
| 125 |
# =====================
|
|
@@ -137,8 +132,7 @@ class DocumentConverter:
|
|
| 137 |
Convert content (LaTeX/Markdown) directly to DOCX/PDF/etc using Pandoc.
|
| 138 |
|
| 139 |
- If include_latex=True, treat content as LaTeX.
|
| 140 |
-
- Strips ```latex ... ``` or ``` ... ``` fenced code blocks first
|
| 141 |
-
because Pandoc would otherwise render them as literal code.
|
| 142 |
|
| 143 |
Returns: bytes of the generated file (DOCX/PDF/whatever you requested).
|
| 144 |
"""
|
|
@@ -150,42 +144,47 @@ class DocumentConverter:
|
|
| 150 |
input_format = self._detect_input_format(content, force_latex=include_latex)
|
| 151 |
|
| 152 |
# 3) Decide the Pandoc output format from the extension
|
| 153 |
-
# e.g. ".docx" -> "docx", ".pdf" -> "pdf"
|
| 154 |
target = output_format.lstrip('.').lower() or 'docx'
|
| 155 |
|
| 156 |
try:
|
| 157 |
-
#
|
| 158 |
with tempfile.NamedTemporaryFile(suffix=f".{target}", delete=False) as temp_output:
|
| 159 |
output_filename = temp_output.name
|
| 160 |
|
| 161 |
-
#
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
except Exception as e:
|
| 186 |
raise RuntimeError(f"Conversion failed: {str(e)}")
|
| 187 |
|
| 188 |
-
|
| 189 |
# =====================
|
| 190 |
# ADD SIGNATURE TO DOCX
|
| 191 |
# =====================
|
|
@@ -199,11 +198,6 @@ class DocumentConverter:
|
|
| 199 |
) -> bytes:
|
| 200 |
"""
|
| 201 |
Add a signature image (base64) and optional signer name to a DOCX document.
|
| 202 |
-
|
| 203 |
-
- file_content: existing DOCX bytes
|
| 204 |
-
- signature_data: base64 image string (with or without `data:image/...;base64,` prefix)
|
| 205 |
-
- position: 'left', 'center', 'right' / 'bottom-right' / 'bottom-center' etc (only horizontal used)
|
| 206 |
-
- signer_name: optional text below the signature
|
| 207 |
"""
|
| 208 |
try:
|
| 209 |
doc = Document(io.BytesIO(file_content))
|
|
|
|
| 97 |
def _detect_input_format(self, content: str, force_latex: bool) -> str:
|
| 98 |
"""
|
| 99 |
Decide whether Pandoc should treat the text as 'latex' or 'markdown'.
|
|
|
|
|
|
|
|
|
|
| 100 |
"""
|
| 101 |
if force_latex:
|
| 102 |
return 'latex'
|
| 103 |
|
|
|
|
| 104 |
latex_markers = [
|
| 105 |
'\\documentclass',
|
| 106 |
'\\begin{document}',
|
|
|
|
| 115 |
if any(marker in content for marker in latex_markers):
|
| 116 |
return 'latex'
|
| 117 |
|
|
|
|
| 118 |
return 'markdown'
|
| 119 |
|
| 120 |
# =====================
|
|
|
|
| 132 |
Convert content (LaTeX/Markdown) directly to DOCX/PDF/etc using Pandoc.
|
| 133 |
|
| 134 |
- If include_latex=True, treat content as LaTeX.
|
| 135 |
+
- Strips ```latex ... ``` or ``` ... ``` fenced code blocks first.
|
|
|
|
| 136 |
|
| 137 |
Returns: bytes of the generated file (DOCX/PDF/whatever you requested).
|
| 138 |
"""
|
|
|
|
| 144 |
input_format = self._detect_input_format(content, force_latex=include_latex)
|
| 145 |
|
| 146 |
# 3) Decide the Pandoc output format from the extension
|
|
|
|
| 147 |
target = output_format.lstrip('.').lower() or 'docx'
|
| 148 |
|
| 149 |
try:
|
| 150 |
+
# Create temp file for Pandoc output
|
| 151 |
with tempfile.NamedTemporaryFile(suffix=f".{target}", delete=False) as temp_output:
|
| 152 |
output_filename = temp_output.name
|
| 153 |
|
| 154 |
+
# First attempt to run Pandoc
|
| 155 |
+
try:
|
| 156 |
+
pypandoc.convert_text(
|
| 157 |
+
content,
|
| 158 |
+
target,
|
| 159 |
+
format=input_format,
|
| 160 |
+
outputfile=output_filename,
|
| 161 |
+
extra_args=['--standalone']
|
| 162 |
+
)
|
| 163 |
+
except OSError:
|
| 164 |
+
# Pandoc binary not found – try to download (useful in local dev; may not work on HF)
|
| 165 |
+
import pypandoc as _pd
|
| 166 |
+
_pd.download_pandoc()
|
| 167 |
+
# Retry once
|
| 168 |
+
pypandoc.convert_text(
|
| 169 |
+
content,
|
| 170 |
+
target,
|
| 171 |
+
format=input_format,
|
| 172 |
+
outputfile=output_filename,
|
| 173 |
+
extra_args=['--standalone']
|
| 174 |
+
)
|
| 175 |
+
|
| 176 |
+
# Read the generated file back into bytes
|
| 177 |
+
with open(output_filename, "rb") as f:
|
| 178 |
+
file_bytes = f.read()
|
| 179 |
+
|
| 180 |
+
# Cleanup temp file
|
| 181 |
+
os.unlink(output_filename)
|
| 182 |
+
|
| 183 |
+
return file_bytes
|
| 184 |
|
| 185 |
except Exception as e:
|
| 186 |
raise RuntimeError(f"Conversion failed: {str(e)}")
|
| 187 |
|
|
|
|
| 188 |
# =====================
|
| 189 |
# ADD SIGNATURE TO DOCX
|
| 190 |
# =====================
|
|
|
|
| 198 |
) -> bytes:
|
| 199 |
"""
|
| 200 |
Add a signature image (base64) and optional signer name to a DOCX document.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
"""
|
| 202 |
try:
|
| 203 |
doc = Document(io.BytesIO(file_content))
|