omgy committed on
Commit
86f307d
·
verified ·
1 Parent(s): 10424de

Update latex_processor.py

Browse files
Files changed (1) hide show
  1. latex_processor.py +268 -208
latex_processor.py CHANGED
@@ -1,208 +1,268 @@
1
- import re
2
- from typing import List, Tuple
3
-
4
- class LaTeXProcessor:
5
- """Processor for LaTeX content in documents"""
6
-
7
- # Common mathematical terms and symbols that indicate math content
8
- MATH_INDICATORS = [
9
- r'\b(equation|formula|theorem|proof|lemma|corollary)\b',
10
- r'[∫∑∏√∞≤≥≠±×÷∈∉⊂⊃∪∩∀∃∇∂]',
11
- r'\d+\s*[+\-*/=]\s*\d+',
12
- r'\b(sin|cos|tan|log|ln|exp|lim|integral|derivative)\b',
13
- r'[a-z]\s*=\s*[a-z0-9]',
14
- r'\^|\d+_\d+',
15
- ]
16
-
17
- def detect_mathematical_content(self, text: str) -> bool:
18
- """
19
- Detect if text contains mathematical/scientific content
20
-
21
- Args:
22
- text: Text to analyze
23
-
24
- Returns:
25
- True if mathematical content is detected
26
- """
27
- text_lower = text.lower()
28
-
29
- for pattern in self.MATH_INDICATORS:
30
- if re.search(pattern, text_lower, re.IGNORECASE):
31
- return True
32
-
33
- return False
34
-
35
- def build_enhancement_prompt(
36
- self,
37
- content: str,
38
- user_instructions: str = "",
39
- doc_type: str = "auto",
40
- include_latex: bool = False
41
- ) -> str:
42
- """
43
- Build comprehensive enhancement prompt for Gemini
44
-
45
- Args:
46
- content: Original document content
47
- user_instructions: User's specific instructions
48
- doc_type: Type of document (auto, academic, technical, business, etc.)
49
- include_latex: Whether to include LaTeX formatting
50
-
51
- Returns:
52
- Complete prompt for Gemini
53
- """
54
- prompt_parts = [
55
- "You are an expert document editor specializing in professional and academic writing.",
56
- ""
57
- ]
58
-
59
- # Add LaTeX instructions if needed
60
- if include_latex:
61
- prompt_parts.extend([
62
- "🔬 IMPORTANT: This document contains mathematical or scientific content.",
63
- "- Format ALL equations using proper LaTeX notation",
64
- "- Use $...$ for inline equations (e.g., $E = mc^2$)",
65
- "- Use $$...$$ for display equations on their own lines",
66
- "- Use proper LaTeX commands: \\frac{}{}, \\sqrt{}, \\int, \\sum, \\alpha, \\beta, etc.",
67
- "- Number important equations as needed",
68
- "- Ensure all mathematical notation is professional and consistent",
69
- ""
70
- ])
71
-
72
- # Add document type specific instructions
73
- if doc_type == "academic":
74
- prompt_parts.extend([
75
- "📚 Document Type: Academic/Research Paper",
76
- "- Use formal academic tone",
77
- "- Structure with clear sections (Abstract, Introduction, Methods, Results, Discussion, Conclusion)",
78
- "- Include proper citations where needed (use [Author, Year] format)",
79
- "- Ensure technical accuracy",
80
- ""
81
- ])
82
- elif doc_type == "technical":
83
- prompt_parts.extend([
84
- "🔧 Document Type: Technical Documentation",
85
- "- Use clear, precise technical language",
86
- "- Include code examples in proper formatting if relevant",
87
- "- Use numbered lists for procedures",
88
- "- Add technical diagrams descriptions where helpful",
89
- ""
90
- ])
91
- elif doc_type == "business":
92
- prompt_parts.extend([
93
- "💼 Document Type: Business Document",
94
- "- Use professional business tone",
95
- "- Focus on clarity and conciseness",
96
- "- Highlight key points and actionable items",
97
- "- Use bullet points for readability",
98
- ""
99
- ])
100
-
101
- # Add user instructions
102
- if user_instructions:
103
- prompt_parts.extend([
104
- f"👤 User's Specific Instructions:",
105
- f"{user_instructions}",
106
- ""
107
- ])
108
-
109
- # Add the content
110
- prompt_parts.extend([
111
- "📄 Original Document Content:",
112
- "=" * 60,
113
- content,
114
- "=" * 60,
115
- "",
116
- "✨ Please provide the ENHANCED version following all guidelines above.",
117
- "Maintain the document structure but improve quality, clarity, and professionalism.",
118
- "Return ONLY the enhanced content, no explanations or meta-commentary.",
119
- ])
120
-
121
- return "\n".join(prompt_parts)
122
-
123
- def process_latex_content(self, content: str) -> str:
124
- """
125
- Process and validate LaTeX content
126
-
127
- Args:
128
- content: Content potentially containing LaTeX
129
-
130
- Returns:
131
- Processed content with valid LaTeX
132
- """
133
- # Ensure proper spacing around inline equations
134
- content = re.sub(r'(\S)\$', r'\1 $', content)
135
- content = re.sub(r'\$(\S)', r'$ \1', content)
136
-
137
- # Ensure display equations are on their own lines
138
- content = re.sub(r'(\S)\$\$', r'\1\n$$', content)
139
- content = re.sub(r'\$\$(\S)', r'$$\n\1', content)
140
-
141
- return content
142
-
143
- def extract_latex_equations(self, content: str) -> List[Tuple[str, str]]:
144
- """
145
- Extract LaTeX equations from content
146
-
147
- Args:
148
- content: Content containing LaTeX
149
-
150
- Returns:
151
- List of tuples (equation_type, equation_content)
152
- equation_type is either 'inline' or 'display'
153
- """
154
- equations = []
155
-
156
- # Extract display equations ($$...$$)
157
- display_pattern = r'\$\$(.*?)\$\$'
158
- for match in re.finditer(display_pattern, content, re.DOTALL):
159
- equations.append(('display', match.group(1).strip()))
160
-
161
- # Extract inline equations ($...$)
162
- inline_pattern = r'(?<!\$)\$(?!\$)(.*?)(?<!\$)\$(?!\$)'
163
- for match in re.finditer(inline_pattern, content):
164
- equations.append(('inline', match.group(1).strip()))
165
-
166
- return equations
167
-
168
- def validate_latex(self, latex_code: str) -> Tuple[bool, str]:
169
- """
170
- Basic validation of LaTeX code
171
-
172
- Args:
173
- latex_code: LaTeX code to validate
174
-
175
- Returns:
176
- Tuple of (is_valid, error_message)
177
- """
178
- # Check for balanced braces
179
- if latex_code.count('{') != latex_code.count('}'):
180
- return False, "Unbalanced braces in LaTeX code"
181
-
182
- # Check for balanced brackets
183
- if latex_code.count('[') != latex_code.count(']'):
184
- return False, "Unbalanced brackets in LaTeX code"
185
-
186
- # Check for common LaTeX commands
187
- common_commands = [
188
- r'\\frac', r'\\sqrt', r'\\sum', r'\\int', r'\\prod',
189
- r'\\alpha', r'\\beta', r'\\gamma', r'\\delta',
190
- r'\\sin', r'\\cos', r'\\tan', r'\\log', r'\\ln',
191
- ]
192
-
193
- # Basic validation passed
194
- return True, ""
195
-
196
- def enhance_equations(self, content: str) -> str:
197
- """
198
- Enhance mathematical equations in content
199
-
200
- Args:
201
- content: Content with equations
202
-
203
- Returns:
204
- Content with enhanced equations
205
- """
206
- # This is a placeholder for more sophisticated equation enhancement
207
- # For now, just ensure proper spacing
208
- return self.process_latex_content(content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import List, Tuple
3
+
4
+ class LaTeXProcessor:
5
+ """Processor for LaTeX content in documents"""
6
+
7
+ # Common mathematical terms and symbols that indicate math content
8
+ MATH_INDICATORS = [
9
+ r'\b(equation|formula|theorem|proof|lemma|corollary)\b',
10
+ r'[∫∑∏√∞≤≥≠±×÷∈∉⊂⊃∪∩∀∃∇∂]',
11
+ r'\d+\s*[+\-*/=]\s*\d+',
12
+ r'\b(sin|cos|tan|log|ln|exp|lim|integral|derivative)\b',
13
+ r'[a-z]\s*=\s*[a-z0-9]',
14
+ r'\^|\d+_\d+',
15
+ ]
16
+
17
+ def detect_mathematical_content(self, text: str) -> bool:
18
+ """
19
+ Detect if text contains mathematical/scientific content
20
+
21
+ Args:
22
+ text: Text to analyze
23
+
24
+ Returns:
25
+ True if mathematical content is detected
26
+ """
27
+ text_lower = text.lower()
28
+
29
+ for pattern in self.MATH_INDICATORS:
30
+ if re.search(pattern, text_lower, re.IGNORECASE):
31
+ return True
32
+
33
+ return False
34
+
35
+ def build_enhancement_prompt(
36
+ self,
37
+ content: str,
38
+ user_instructions: str = "",
39
+ doc_type: str = "auto",
40
+ include_latex: bool = False
41
+ ) -> str:
42
+ """
43
+ Build comprehensive enhancement prompt for Gemini
44
+ This method is kept for backward compatibility but now creates
45
+ a LaTeX-focused prompt
46
+
47
+ Args:
48
+ content: Original document content
49
+ user_instructions: User's specific instructions
50
+ doc_type: Type of document (auto, academic, technical, business, etc.)
51
+ include_latex: Whether to include LaTeX formatting
52
+
53
+ Returns:
54
+ Complete prompt for Gemini
55
+ """
56
+ prompt_parts = [
57
+ "You are an expert document editor specializing in professional and academic writing.",
58
+ "Enhance this document with proper formatting and LaTeX notation where needed.",
59
+ ""
60
+ ]
61
+
62
+ # Add LaTeX instructions if needed
63
+ if include_latex:
64
+ prompt_parts.extend([
65
+ "🔬 IMPORTANT: This document contains mathematical or scientific content.",
66
+ "- Format ALL equations using proper LaTeX notation",
67
+ "- Use $...$ for inline equations (e.g., $E = mc^2$)",
68
+ "- Use $$...$$ for display equations on their own lines",
69
+ "- Use proper LaTeX commands: \\frac{}{}, \\sqrt{}, \\int, \\sum, \\alpha, \\beta, etc.",
70
+ "- Convert all mathematical expressions to clean, compilable LaTeX code",
71
+ "- Number important equations as needed",
72
+ "- Ensure all mathematical notation is professional and consistent",
73
+ ""
74
+ ])
75
+
76
+ # Add document type specific instructions
77
+ if doc_type == "academic":
78
+ prompt_parts.extend([
79
+ "📚 Document Type: Academic/Research Paper",
80
+ "- Use formal academic tone",
81
+ "- Structure with clear sections (Abstract, Introduction, Methods, Results, Discussion, Conclusion)",
82
+ "- Include proper citations where needed (use [Author, Year] format)",
83
+ "- Ensure technical accuracy",
84
+ ""
85
+ ])
86
+ elif doc_type == "technical":
87
+ prompt_parts.extend([
88
+ "🔧 Document Type: Technical Documentation",
89
+ "- Use clear, precise technical language",
90
+ "- Include code examples in proper formatting if relevant",
91
+ "- Use numbered lists for procedures",
92
+ "- Add technical diagrams descriptions where helpful",
93
+ ""
94
+ ])
95
+ elif doc_type == "business":
96
+ prompt_parts.extend([
97
+ "💼 Document Type: Business Document",
98
+ "- Use professional business tone",
99
+ "- Focus on clarity and conciseness",
100
+ "- Highlight key points and actionable items",
101
+ "- Use bullet points for readability",
102
+ ""
103
+ ])
104
+
105
+ # Add user instructions
106
+ if user_instructions:
107
+ prompt_parts.extend([
108
+ f"👤 User's Specific Instructions:",
109
+ f"{user_instructions}",
110
+ ""
111
+ ])
112
+
113
+ # Add the content
114
+ prompt_parts.extend([
115
+ "📄 Original Document Content:",
116
+ "=" * 60,
117
+ content,
118
+ "=" * 60,
119
+ "",
120
+ "✨ Please provide the ENHANCED version following all guidelines above.",
121
+ "Maintain the document structure but improve quality, clarity, and professionalism.",
122
+ "Convert all math to proper LaTeX notation if applicable.",
123
+ "Return ONLY the enhanced content, no explanations or meta-commentary.",
124
+ ])
125
+
126
+ return "\n".join(prompt_parts)
127
+
128
+ def process_latex_content(self, content: str) -> str:
129
+ """
130
+ Process and clean LaTeX content from Gemini output
131
+
132
+ Args:
133
+ content: Content potentially containing LaTeX
134
+
135
+ Returns:
136
+ Processed content with valid LaTeX
137
+ """
138
+ # Remove markdown code blocks if Gemini wrapped the output
139
+ content = re.sub(r'```latex\n', '', content)
140
+ content = re.sub(r'```\n?', '', content)
141
+
142
+ # Ensure proper spacing around inline equations
143
+ content = re.sub(r'(\S)\$', r'\1 $', content)
144
+ content = re.sub(r'\$(\S)', r'$ \1', content)
145
+
146
+ # Ensure display equations are on their own lines
147
+ content = re.sub(r'(\S)\$\$', r'\1\n$$', content)
148
+ content = re.sub(r'\$\$(\S)', r'$$\n\1', content)
149
+
150
+ # Clean up excessive newlines
151
+ content = re.sub(r'\n{3,}', '\n\n', content)
152
+
153
+ # Fix common LaTeX spacing issues
154
+ content = re.sub(r'\$\s+\$', '$$', content) # Remove empty equations
155
+
156
+ return content.strip()
157
+
158
+ def extract_latex_equations(self, content: str) -> List[Tuple[str, str]]:
159
+ """
160
+ Extract LaTeX equations from content
161
+
162
+ Args:
163
+ content: Content containing LaTeX
164
+
165
+ Returns:
166
+ List of tuples (equation_type, equation_content)
167
+ equation_type is either 'inline' or 'display'
168
+ """
169
+ equations = []
170
+
171
+ # Extract display equations ($$...$$)
172
+ display_pattern = r'\$\$(.*?)\$\$'
173
+ for match in re.finditer(display_pattern, content, re.DOTALL):
174
+ equations.append(('display', match.group(1).strip()))
175
+
176
+ # Extract inline equations ($...$)
177
+ inline_pattern = r'(?<!\$)\$(?!\$)(.*?)(?<!\$)\$(?!\$)'
178
+ for match in re.finditer(inline_pattern, content):
179
+ equations.append(('inline', match.group(1).strip()))
180
+
181
+ return equations
182
+
183
+ def validate_latex(self, latex_code: str) -> Tuple[bool, str]:
184
+ """
185
+ Basic validation of LaTeX code
186
+
187
+ Args:
188
+ latex_code: LaTeX code to validate
189
+
190
+ Returns:
191
+ Tuple of (is_valid, error_message)
192
+ """
193
+ # Check for balanced braces
194
+ if latex_code.count('{') != latex_code.count('}'):
195
+ return False, "Unbalanced braces in LaTeX code"
196
+
197
+ # Check for balanced brackets
198
+ if latex_code.count('[') != latex_code.count(']'):
199
+ return False, "Unbalanced brackets in LaTeX code"
200
+
201
+ # Check for balanced dollar signs
202
+ single_dollars = len(re.findall(r'(?<!\$)\$(?!\$)', latex_code))
203
+ if single_dollars % 2 != 0:
204
+ return False, "Unbalanced inline equation markers ($)"
205
+
206
+ double_dollars = len(re.findall(r'\$\$', latex_code))
207
+ if double_dollars % 2 != 0:
208
+ return False, "Unbalanced display equation markers ($$)"
209
+
210
+ # Basic validation passed
211
+ return True, ""
212
+
213
+ def convert_latex_to_unicode(self, latex_code: str) -> str:
214
+ """
215
+ Convert simple LaTeX to Unicode for display in DOCX
216
+ (For equations that can be represented in Unicode)
217
+
218
+ Args:
219
+ latex_code: LaTeX code
220
+
221
+ Returns:
222
+ Unicode representation where possible
223
+ """
224
+ # Simple conversions for common symbols
225
+ conversions = {
226
+ r'\\alpha': 'α',
227
+ r'\\beta': 'β',
228
+ r'\\gamma': 'γ',
229
+ r'\\delta': 'δ',
230
+ r'\\epsilon': 'ε',
231
+ r'\\theta': 'θ',
232
+ r'\\lambda': 'λ',
233
+ r'\\mu': 'μ',
234
+ r'\\pi': 'π',
235
+ r'\\sigma': 'σ',
236
+ r'\\phi': 'φ',
237
+ r'\\omega': 'ω',
238
+ r'\\infty': '∞',
239
+ r'\\leq': '≤',
240
+ r'\\geq': '≥',
241
+ r'\\neq': '≠',
242
+ r'\\approx': '≈',
243
+ r'\\sum': '∑',
244
+ r'\\prod': '∏',
245
+ r'\\int': '∫',
246
+ r'\\sqrt': '√',
247
+ r'\\pm': '±',
248
+ r'\\times': '×',
249
+ r'\\div': '÷',
250
+ }
251
+
252
+ result = latex_code
253
+ for latex, unicode_char in conversions.items():
254
+ result = result.replace(latex, unicode_char)
255
+
256
+ return result
257
+
258
+ def enhance_equations(self, content: str) -> str:
259
+ """
260
+ Enhance mathematical equations in content
261
+
262
+ Args:
263
+ content: Content with equations
264
+
265
+ Returns:
266
+ Content with enhanced equations
267
+ """
268
+ return self.process_latex_content(content)