omgy committed on
Commit
411e5df
·
verified ·
1 Parent(s): 88fcdc8

Update document_converter.py

Browse files
Files changed (1) hide show
  1. document_converter.py +6 -6
document_converter.py CHANGED
@@ -94,13 +94,14 @@ class DocumentConverter:
94
 
95
  return content
96
 
97
- def _detect_input_format(self, content: str, include_latex: bool) -> str:
98
  """
99
  Decide whether Pandoc should treat the text as 'latex' or 'markdown'.
100
- - If include_latex is True -> always 'latex'.
101
- - Otherwise, try to sniff based on typical LaTeX patterns.
 
102
  """
103
- if include_latex:
104
  return 'latex'
105
 
106
  # Heuristics to detect LaTeX
@@ -136,7 +137,6 @@ class DocumentConverter:
136
  Convert content (LaTeX/Markdown) directly to DOCX/PDF/etc using Pandoc.
137
 
138
  - If include_latex=True, treat content as LaTeX.
139
- - Otherwise, sniff for LaTeX patterns; if none, assume Markdown.
140
  - Strips ```latex ... ``` or ``` ... ``` fenced code blocks first,
141
  because Pandoc would otherwise render them as literal code.
142
 
@@ -147,7 +147,7 @@ class DocumentConverter:
147
  content = self._strip_code_fences(content)
148
 
149
  # 2) Decide the Pandoc input format
150
- input_format = self._detect_input_format(content, include_latex=include_latex)
151
 
152
  # 3) Decide the Pandoc output format from the extension
153
  # e.g. ".docx" -> "docx", ".pdf" -> "pdf"
 
94
 
95
  return content
96
 
97
+ def _detect_input_format(self, content: str, force_latex: bool) -> str:
98
  """
99
  Decide whether Pandoc should treat the text as 'latex' or 'markdown'.
100
+
101
+ In our use case, we generally force LaTeX, but this function
102
+ is kept flexible.
103
  """
104
+ if force_latex:
105
  return 'latex'
106
 
107
  # Heuristics to detect LaTeX
 
137
  Convert content (LaTeX/Markdown) directly to DOCX/PDF/etc using Pandoc.
138
 
139
  - If include_latex=True, treat content as LaTeX.
 
140
  - Strips ```latex ... ``` or ``` ... ``` fenced code blocks first,
141
  because Pandoc would otherwise render them as literal code.
142
 
 
147
  content = self._strip_code_fences(content)
148
 
149
  # 2) Decide the Pandoc input format
150
+ input_format = self._detect_input_format(content, force_latex=include_latex)
151
 
152
  # 3) Decide the Pandoc output format from the extension
153
  # e.g. ".docx" -> "docx", ".pdf" -> "pdf"