omgy committed on
Commit
c7c2fb0
·
verified ·
1 Parent(s): bdba52f

Update document_converter.py

Browse files
Files changed (1) hide show
  1. document_converter.py +32 -38
document_converter.py CHANGED
@@ -97,14 +97,10 @@ class DocumentConverter:
97
  def _detect_input_format(self, content: str, force_latex: bool) -> str:
98
  """
99
  Decide whether Pandoc should treat the text as 'latex' or 'markdown'.
100
-
101
- In our use case, we generally force LaTeX, but this function
102
- is kept flexible.
103
  """
104
  if force_latex:
105
  return 'latex'
106
 
107
- # Heuristics to detect LaTeX
108
  latex_markers = [
109
  '\\documentclass',
110
  '\\begin{document}',
@@ -119,7 +115,6 @@ class DocumentConverter:
119
  if any(marker in content for marker in latex_markers):
120
  return 'latex'
121
 
122
- # Default to markdown if no strong LaTeX signs
123
  return 'markdown'
124
 
125
  # =====================
@@ -137,8 +132,7 @@ class DocumentConverter:
137
  Convert content (LaTeX/Markdown) directly to DOCX/PDF/etc using Pandoc.
138
 
139
  - If include_latex=True, treat content as LaTeX.
140
- - Strips ```latex ... ``` or ``` ... ``` fenced code blocks first,
141
- because Pandoc would otherwise render them as literal code.
142
 
143
  Returns: bytes of the generated file (DOCX/PDF/whatever you requested).
144
  """
@@ -150,42 +144,47 @@ class DocumentConverter:
150
  input_format = self._detect_input_format(content, force_latex=include_latex)
151
 
152
  # 3) Decide the Pandoc output format from the extension
153
- # e.g. ".docx" -> "docx", ".pdf" -> "pdf"
154
  target = output_format.lstrip('.').lower() or 'docx'
155
 
156
  try:
157
- # Use correct suffix for temp file: .docx, .pdf, etc.
158
  with tempfile.NamedTemporaryFile(suffix=f".{target}", delete=False) as temp_output:
159
  output_filename = temp_output.name
160
 
161
- # Run Pandoc conversion
162
- try:
163
- # Run Pandoc conversion
164
- pypandoc.convert_text(
165
- content,
166
- target,
167
- format=input_format,
168
- outputfile=output_filename,
169
- extra_args=['--standalone']
170
- )
171
-
172
- except OSError:
173
- # Try to download pandoc automatically (useful in local dev)
174
- import pypandoc as _pd
175
- _pd.download_pandoc()
176
- # Retry once
177
- pypandoc.convert_text(
178
- content,
179
- target,
180
- format=input_format,
181
- outputfile=output_filename,
182
- extra_args=['--standalone']
183
- )
 
 
 
 
 
 
 
184
 
185
  except Exception as e:
186
  raise RuntimeError(f"Conversion failed: {str(e)}")
187
 
188
-
189
  # =====================
190
  # ADD SIGNATURE TO DOCX
191
  # =====================
@@ -199,11 +198,6 @@ class DocumentConverter:
199
  ) -> bytes:
200
  """
201
  Add a signature image (base64) and optional signer name to a DOCX document.
202
-
203
- - file_content: existing DOCX bytes
204
- - signature_data: base64 image string (with or without `data:image/...;base64,` prefix)
205
- - position: 'left', 'center', 'right' / 'bottom-right' / 'bottom-center' etc (only horizontal used)
206
- - signer_name: optional text below the signature
207
  """
208
  try:
209
  doc = Document(io.BytesIO(file_content))
 
97
  def _detect_input_format(self, content: str, force_latex: bool) -> str:
98
  """
99
  Decide whether Pandoc should treat the text as 'latex' or 'markdown'.
 
 
 
100
  """
101
  if force_latex:
102
  return 'latex'
103
 
 
104
  latex_markers = [
105
  '\\documentclass',
106
  '\\begin{document}',
 
115
  if any(marker in content for marker in latex_markers):
116
  return 'latex'
117
 
 
118
  return 'markdown'
119
 
120
  # =====================
 
132
  Convert content (LaTeX/Markdown) directly to DOCX/PDF/etc using Pandoc.
133
 
134
  - If include_latex=True, treat content as LaTeX.
135
+ - Strips ```latex ... ``` or ``` ... ``` fenced code blocks first.
 
136
 
137
  Returns: bytes of the generated file (DOCX/PDF/whatever you requested).
138
  """
 
144
  input_format = self._detect_input_format(content, force_latex=include_latex)
145
 
146
  # 3) Decide the Pandoc output format from the extension
 
147
  target = output_format.lstrip('.').lower() or 'docx'
148
 
149
  try:
150
+ # Create temp file for Pandoc output
151
  with tempfile.NamedTemporaryFile(suffix=f".{target}", delete=False) as temp_output:
152
  output_filename = temp_output.name
153
 
154
+ # First attempt to run Pandoc
155
+ try:
156
+ pypandoc.convert_text(
157
+ content,
158
+ target,
159
+ format=input_format,
160
+ outputfile=output_filename,
161
+ extra_args=['--standalone']
162
+ )
163
+ except OSError:
164
+ # Pandoc binary not found – try to download (useful in local dev; may not work on HF)
165
+ import pypandoc as _pd
166
+ _pd.download_pandoc()
167
+ # Retry once
168
+ pypandoc.convert_text(
169
+ content,
170
+ target,
171
+ format=input_format,
172
+ outputfile=output_filename,
173
+ extra_args=['--standalone']
174
+ )
175
+
176
+ # Read the generated file back into bytes
177
+ with open(output_filename, "rb") as f:
178
+ file_bytes = f.read()
179
+
180
+ # Cleanup temp file
181
+ os.unlink(output_filename)
182
+
183
+ return file_bytes
184
 
185
  except Exception as e:
186
  raise RuntimeError(f"Conversion failed: {str(e)}")
187
 
 
188
  # =====================
189
  # ADD SIGNATURE TO DOCX
190
  # =====================
 
198
  ) -> bytes:
199
  """
200
  Add a signature image (base64) and optional signer name to a DOCX document.
 
 
 
 
 
201
  """
202
  try:
203
  doc = Document(io.BytesIO(file_content))