embedingHF committed on
Commit
b169283
·
verified ·
1 Parent(s): bd2106a

Update converters/document_converter.py

Browse files
Files changed (1) hide show
  1. converters/document_converter.py +186 -341
converters/document_converter.py CHANGED
@@ -1,382 +1,227 @@
1
  import os
2
  from pathlib import Path
3
  from typing import Callable, Dict, Any
4
- import traceback
5
 
6
- # Optional imports with error handling
7
- try:
8
- from docx import Document
 
9
 
10
- DOCX_AVAILABLE = True
11
- except ImportError:
12
- DOCX_AVAILABLE = False
13
- print("⚠ python-docx not installed. DOCX conversion will not work.")
14
 
15
- try:
16
- import fitz # PyMuPDF
17
 
18
- FITZ_AVAILABLE = True
19
- except ImportError:
20
- FITZ_AVAILABLE = False
21
- print("⚠ PyMuPDF not installed. PDF conversion will not work.")
22
 
23
- try:
24
- import markdown
 
25
 
26
- MARKDOWN_AVAILABLE = True
27
- except ImportError:
28
- MARKDOWN_AVAILABLE = False
29
- print("⚠ markdown not installed. MD conversion will not work.")
30
 
31
- try:
32
- from bs4 import BeautifulSoup
33
 
34
- BS4_AVAILABLE = True
35
- except ImportError:
36
- BS4_AVAILABLE = False
37
- print("⚠ beautifulsoup4 not installed. HTML conversion will not work.")
38
 
 
 
 
39
 
40
- class DocumentConverter:
41
- def __init__(self):
42
- pass
43
 
44
- def convert(self, input_path: str, output_path: str,
45
- options: Dict[str, Any], progress_callback: Callable = None) -> bool:
46
- """Convert document files"""
47
- input_ext = Path(input_path).suffix.lower()
48
 
49
- try:
50
- self._update_progress(progress_callback, 10)
 
51
 
52
- # Check if input file exists
53
- if not os.path.exists(input_path):
54
- print(f"Input file not found: {input_path}")
55
- return False
56
 
57
- # Create output directory if needed
58
- Path(output_path).parent.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
 
 
 
59
 
60
- result = False
61
-
62
- # PDF conversion
63
- if input_ext == '.pdf':
64
- if not FITZ_AVAILABLE:
65
- print("PyMuPDF not available for PDF conversion")
66
- return False
67
- result = self.convert_pdf(input_path, output_path, options, progress_callback)
68
-
69
- # DOCX conversion
70
- elif input_ext in ['.docx', '.doc']:
71
- if not DOCX_AVAILABLE:
72
- print("python-docx not available for DOCX conversion")
73
- return False
74
- result = self.convert_docx(input_path, output_path, options, progress_callback)
75
-
76
- # TXT conversion
77
- elif input_ext == '.txt':
78
- result = self.convert_txt(input_path, output_path, options, progress_callback)
79
-
80
- # Markdown conversion
81
- elif input_ext == '.md':
82
- if not MARKDOWN_AVAILABLE:
83
- print("markdown library not available")
84
- return False
85
- result = self.convert_markdown(input_path, output_path, options, progress_callback)
86
-
87
- # HTML conversion
88
- elif input_ext == '.html':
89
- result = self.convert_html(input_path, output_path, options, progress_callback)
90
 
91
  else:
92
- result = self.convert_generic(input_path, output_path, options, progress_callback)
93
 
94
- if result:
95
- output_ext = Path(output_path).suffix.lower()
96
- print(f"✓ Successfully converted: {os.path.basename(input_path)} → {output_ext}")
97
 
98
- return result
99
 
100
  except Exception as e:
101
- print(f"Document conversion error for {input_path}: {str(e)}")
102
- traceback.print_exc()
103
  return False
104
 
105
- def _update_progress(self, callback, value):
106
- """Safely update progress"""
107
- if callback is not None:
108
- try:
109
- callback(value)
110
- except Exception:
111
- pass
112
 
113
- def convert_pdf(self, input_path: str, output_path: str,
114
- options: Dict[str, Any], progress_callback: Callable = None) -> bool:
115
- """Convert PDF to other formats"""
116
- try:
117
- doc = fitz.open(input_path)
118
- total_pages = len(doc)
119
-
120
- self._update_progress(progress_callback, 20)
121
-
122
- if output_path.endswith('.txt'):
123
- text = ""
124
- for page_num in range(total_pages):
125
- page = doc[page_num]
126
- text += page.get_text()
127
- progress_pct = 20 + (page_num + 1) * 60 // total_pages
128
- self._update_progress(progress_callback, progress_pct)
129
-
130
- with open(output_path, 'w', encoding='utf-8') as f:
131
- f.write(text)
132
-
133
- elif output_path.endswith('.docx'):
134
- docx_doc = Document()
135
- for page_num in range(total_pages):
136
- page = doc[page_num]
137
- text = page.get_text()
138
- docx_doc.add_paragraph(text)
139
- progress_pct = 20 + (page_num + 1) * 60 // total_pages
140
- self._update_progress(progress_callback, progress_pct)
141
-
142
- docx_doc.save(output_path)
143
-
144
- elif output_path.endswith('.html'):
145
- html_content = """<!DOCTYPE html>
146
- <html>
147
- <head>
148
- <meta charset="UTF-8">
149
- <title>PDF Content</title>
150
- <style>
151
- body { font-family: Arial, sans-serif; margin: 40px; }
152
- .page { margin-bottom: 30px; page-break-after: always; }
153
- .page-number { color: #666; font-size: 12px; margin-bottom: 10px; }
154
- pre { white-space: pre-wrap; word-wrap: break-word; }
155
- </style>
156
- </head>
157
- <body>
158
- """
159
- for page_num in range(total_pages):
160
- page = doc[page_num]
161
- text = page.get_text()
162
- html_content += f"""
163
- <div class="page">
164
- <div class="page-number">Page {page_num + 1}</div>
165
- <pre>{text}</pre>
166
- </div>
167
- """
168
- progress_pct = 20 + (page_num + 1) * 60 // total_pages
169
- self._update_progress(progress_callback, progress_pct)
170
-
171
- html_content += "</body></html>"
172
-
173
- with open(output_path, 'w', encoding='utf-8') as f:
174
- f.write(html_content)
175
-
176
- doc.close()
177
- self._update_progress(progress_callback, 100)
178
- return True
179
 
180
- except Exception as e:
181
- print(f"PDF conversion error: {e}")
182
- return False
183
 
184
- def convert_docx(self, input_path: str, output_path: str,
185
- options: Dict[str, Any], progress_callback: Callable = None) -> bool:
186
- """Convert DOCX to other formats"""
187
- try:
188
- doc = Document(input_path)
189
- self._update_progress(progress_callback, 30)
190
-
191
- if output_path.endswith('.txt'):
192
- text = "\n".join([para.text for para in doc.paragraphs])
193
- with open(output_path, 'w', encoding='utf-8') as f:
194
- f.write(text)
195
-
196
- elif output_path.endswith('.html'):
197
- html_content = """<!DOCTYPE html>
198
- <html>
199
- <head><meta charset="UTF-8"><title>Document Content</title></head>
200
- <body>
201
- """
202
- for para in doc.paragraphs:
203
- if para.text.strip():
204
- html_content += f"<p>{para.text}</p>"
205
- html_content += "</body></html>"
206
-
207
- with open(output_path, 'w', encoding='utf-8') as f:
208
- f.write(html_content)
209
-
210
- elif output_path.endswith('.md'):
211
- markdown_content = "\n\n".join([para.text for para in doc.paragraphs if para.text.strip()])
212
- with open(output_path, 'w', encoding='utf-8') as f:
213
- f.write(markdown_content)
214
-
215
- elif output_path.endswith('.pdf'):
216
- # Simple PDF conversion using text extraction
217
- text = "\n".join([para.text for para in doc.paragraphs])
218
- with open(output_path.replace('.pdf', '.txt'), 'w', encoding='utf-8') as f:
219
- f.write(text)
220
- print("Note: DOCX to PDF requires additional libraries. Saved as TXT instead.")
221
-
222
- self._update_progress(progress_callback, 100)
223
- return True
224
 
225
- except Exception as e:
226
- print(f"DOCX conversion error: {e}")
227
- return False
 
 
228
 
229
- def convert_txt(self, input_path: str, output_path: str,
230
- options: Dict[str, Any], progress_callback: Callable = None) -> bool:
231
- """Convert TXT to other formats"""
232
- try:
233
- with open(input_path, 'r', encoding='utf-8') as f:
234
- content = f.read()
235
-
236
- self._update_progress(progress_callback, 40)
237
-
238
- if output_path.endswith('.md'):
239
- with open(output_path, 'w', encoding='utf-8') as f:
240
- f.write(content)
241
-
242
- elif output_path.endswith('.html'):
243
- html_content = f"""<!DOCTYPE html>
244
- <html>
245
- <head><meta charset="UTF-8"><title>Text Document</title></head>
246
- <body>
247
- <pre>{content}</pre>
248
- </body></html>"""
249
- with open(output_path, 'w', encoding='utf-8') as f:
250
- f.write(html_content)
251
-
252
- elif output_path.endswith('.docx'):
253
- if DOCX_AVAILABLE:
254
- doc = Document()
255
- doc.add_paragraph(content)
256
- doc.save(output_path)
257
- else:
258
- with open(output_path.replace('.docx', '.txt'), 'w', encoding='utf-8') as f:
259
- f.write(content)
260
- print("Note: python-docx not installed. Saved as TXT instead.")
261
-
262
- self._update_progress(progress_callback, 100)
263
- return True
264
 
265
- except Exception as e:
266
- print(f"TXT conversion error: {e}")
267
- return False
 
268
 
269
- def convert_markdown(self, input_path: str, output_path: str,
270
- options: Dict[str, Any], progress_callback: Callable = None) -> bool:
271
- """Convert Markdown to other formats"""
272
- try:
273
- with open(input_path, 'r', encoding='utf-8') as f:
274
- content = f.read()
275
-
276
- self._update_progress(progress_callback, 40)
277
-
278
- if output_path.endswith('.html'):
279
- html_content = markdown.markdown(content)
280
- full_html = f"""<!DOCTYPE html>
281
- <html>
282
- <head><meta charset="UTF-8"><title>Markdown Document</title></head>
283
- <body>
284
- {html_content}
285
- </body></html>"""
286
- with open(output_path, 'w', encoding='utf-8') as f:
287
- f.write(full_html)
288
-
289
- elif output_path.endswith('.docx'):
290
- if DOCX_AVAILABLE and BS4_AVAILABLE:
291
- html = markdown.markdown(content)
292
- soup = BeautifulSoup(html, 'html.parser')
293
- doc = Document()
294
- for para in soup.find_all('p'):
295
- if para.get_text().strip():
296
- doc.add_paragraph(para.get_text())
297
- doc.save(output_path)
298
- else:
299
- with open(output_path.replace('.docx', '.txt'), 'w', encoding='utf-8') as f:
300
- f.write(content)
301
- print("Note: Required libraries not installed. Saved as TXT instead.")
302
-
303
- self._update_progress(progress_callback, 100)
304
- return True
305
 
306
- except Exception as e:
307
- print(f"Markdown conversion error: {e}")
308
- return False
 
 
309
 
310
- def convert_html(self, input_path: str, output_path: str,
311
- options: Dict[str, Any], progress_callback: Callable = None) -> bool:
312
- """Convert HTML to other formats"""
313
- try:
314
- with open(input_path, 'r', encoding='utf-8') as f:
315
- content = f.read()
316
 
317
- self._update_progress(progress_callback, 40)
318
 
319
- if BS4_AVAILABLE:
320
- soup = BeautifulSoup(content, 'html.parser')
321
- text = soup.get_text()
322
- else:
323
- # Simple text extraction
324
- import re
325
- text = re.sub(r'<[^>]+>', ' ', content)
326
- text = re.sub(r'\s+', ' ', text).strip()
327
-
328
- if output_path.endswith('.txt'):
329
- with open(output_path, 'w', encoding='utf-8') as f:
330
- f.write(text)
331
-
332
- elif output_path.endswith('.md'):
333
- with open(output_path, 'w', encoding='utf-8') as f:
334
- f.write(f"# Converted from HTML\n\n{text}")
335
-
336
- elif output_path.endswith('.docx'):
337
- if DOCX_AVAILABLE:
338
- doc = Document()
339
- doc.add_paragraph(text)
340
- doc.save(output_path)
341
- else:
342
- with open(output_path.replace('.docx', '.txt'), 'w', encoding='utf-8') as f:
343
- f.write(text)
344
-
345
- self._update_progress(progress_callback, 100)
346
- return True
347
 
348
- except Exception as e:
349
- print(f"HTML conversion error: {e}")
350
- return False
351
 
352
- def convert_generic(self, input_path: str, output_path: str,
353
- options: Dict[str, Any], progress_callback: Callable = None) -> bool:
354
- """Generic text file conversion"""
355
- try:
356
- # Try to read as text
357
- encodings = ['utf-8', 'latin-1', 'cp1252']
358
- content = None
359
-
360
- for encoding in encodings:
361
- try:
362
- with open(input_path, 'r', encoding=encoding) as f:
363
- content = f.read()
364
- break
365
- except UnicodeDecodeError:
366
- continue
367
-
368
- if content is None:
369
- # If can't read as text, just copy binary
370
- with open(input_path, 'rb') as src:
371
- with open(output_path, 'wb') as dst:
372
- dst.write(src.read())
373
- else:
374
- with open(output_path, 'w', encoding='utf-8') as f:
375
- f.write(content)
376
 
377
- self._update_progress(progress_callback, 100)
378
- return True
379
 
380
- except Exception as e:
381
- print(f"Generic conversion error: {e}")
382
- return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  from pathlib import Path
3
  from typing import Callable, Dict, Any
 
4
 
5
+ import markdown
6
+ import fitz
7
+ from docx import Document
8
+ from bs4 import BeautifulSoup
9
 
 
 
 
 
10
 
11
+ class DocumentConverter:
 
12
 
13
+ def convert(self, input_path:str, output_path:str,
14
+ options:Dict[str,Any]|None=None,
15
+ progress_callback:Callable|None=None)->bool:
 
16
 
17
+ try:
18
+ input_ext = Path(input_path).suffix.lower()
19
+ output_ext = Path(output_path).suffix.lower()
20
 
21
+ self._update(progress_callback, 10)
 
 
 
22
 
23
+ Path(output_path).parent.mkdir(parents=True, exist_ok=True)
 
24
 
25
+ success = False
 
 
 
26
 
27
+ # TXT
28
+ if input_ext == ".txt" and output_ext == ".pdf":
29
+ success = self.txt_to_pdf(input_path, output_path)
30
 
31
+ elif input_ext == ".txt" and output_ext == ".html":
32
+ success = self.txt_to_html(input_path, output_path)
 
33
 
34
+ elif input_ext == ".txt" and output_ext == ".md":
35
+ success = self.txt_to_md(input_path, output_path)
 
 
36
 
37
+ # MD
38
+ elif input_ext == ".md" and output_ext == ".html":
39
+ success = self.md_to_html(input_path, output_path)
40
 
41
+ elif input_ext == ".md" and output_ext == ".txt":
42
+ success = self.md_to_text(input_path, output_path)
 
 
43
 
44
+ # HTML
45
+ elif input_ext == ".html" and output_ext == ".txt":
46
+ success = self.html_to_text(input_path, output_path)
47
+
48
+ elif input_ext == ".html" and output_ext == ".md":
49
+ success = self.html_to_md(input_path, output_path)
50
+
51
+ # DOCX
52
+ elif input_ext == ".docx" and output_ext == ".txt":
53
+ success = self.docx_to_text(input_path, output_path)
54
 
55
+ # PDF
56
+ elif input_ext == ".pdf" and output_ext == ".txt":
57
+ success = self.pdf_to_text(input_path, output_path)
58
+
59
+ elif input_ext == ".pdf" and output_ext == ".html":
60
+ success = self.pdf_to_html(input_path, output_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  else:
63
+ raise ValueError(f"Unsupported conversion: {input_ext} -> {output_ext}")
64
 
65
+ self._update(progress_callback, 100)
 
 
66
 
67
+ return success
68
 
69
  except Exception as e:
70
+ print(f"Document conversion error: {e}")
 
71
  return False
72
 
73
+ def txt_to_pdf(self, input_path, output_path):
74
+ pdf = fitz.open()
75
+ page = pdf.new_page()
 
 
 
 
76
 
77
+ text = Path(input_path).read_text(
78
+ encoding="utf-8",
79
+ errors="ignore"
80
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
+ page.insert_text((72, 72), text[:5000])
 
 
83
 
84
+ pdf.save(output_path)
85
+ return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
+ def txt_to_html(self, input_path, output_path):
88
+ text = Path(input_path).read_text(
89
+ encoding="utf-8",
90
+ errors="ignore"
91
+ )
92
 
93
+ html = f"<html><body><pre>{text}</pre></body></html>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
+ Path(output_path).write_text(
96
+ html,
97
+ encoding="utf-8"
98
+ )
99
 
100
+ return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
+ def txt_to_md(self, input_path, output_path):
103
+ text = Path(input_path).read_text(
104
+ encoding="utf-8",
105
+ errors="ignore"
106
+ )
107
 
108
+ Path(output_path).write_text(
109
+ text,
110
+ encoding="utf-8"
111
+ )
 
 
112
 
113
+ return True
114
 
115
+ def md_to_html(self, input_path, output_path):
116
+ md = Path(input_path).read_text(
117
+ encoding="utf-8",
118
+ errors="ignore"
119
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
+ html = markdown.markdown(md)
 
 
122
 
123
+ Path(output_path).write_text(
124
+ html,
125
+ encoding="utf-8"
126
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
+ return True
 
129
 
130
+ def md_to_text(self, input_path, output_path):
131
+ md = Path(input_path).read_text(
132
+ encoding="utf-8",
133
+ errors="ignore"
134
+ )
135
+
136
+ html = markdown.markdown(md)
137
+ soup = BeautifulSoup(html, "html.parser")
138
+
139
+ Path(output_path).write_text(
140
+ soup.get_text(),
141
+ encoding="utf-8"
142
+ )
143
+
144
+ return True
145
+
146
+ def html_to_text(self, input_path, output_path):
147
+ html = Path(input_path).read_text(
148
+ encoding="utf-8",
149
+ errors="ignore"
150
+ )
151
+
152
+ soup = BeautifulSoup(html, "html.parser")
153
+
154
+ Path(output_path).write_text(
155
+ soup.get_text(),
156
+ encoding="utf-8"
157
+ )
158
+
159
+ return True
160
+
161
+ def html_to_md(self, input_path, output_path):
162
+ html = Path(input_path).read_text(
163
+ encoding="utf-8",
164
+ errors="ignore"
165
+ )
166
+
167
+ soup = BeautifulSoup(html, "html.parser")
168
+
169
+ text = soup.get_text()
170
+
171
+ Path(output_path).write_text(
172
+ text,
173
+ encoding="utf-8"
174
+ )
175
+
176
+ return True
177
+
178
+ def docx_to_text(self, input_path, output_path):
179
+ doc = Document(input_path)
180
+
181
+ text = "\n".join(
182
+ [p.text for p in doc.paragraphs]
183
+ )
184
+
185
+ Path(output_path).write_text(
186
+ text,
187
+ encoding="utf-8"
188
+ )
189
+
190
+ return True
191
+
192
+ def pdf_to_text(self, input_path, output_path):
193
+ doc = fitz.open(input_path)
194
+
195
+ text = ""
196
+
197
+ for page in doc:
198
+ text += page.get_text()
199
+
200
+ Path(output_path).write_text(
201
+ text,
202
+ encoding="utf-8"
203
+ )
204
+
205
+ return True
206
+
207
+ def pdf_to_html(self, input_path, output_path):
208
+ doc = fitz.open(input_path)
209
+
210
+ html = ""
211
+
212
+ for page in doc:
213
+ html += page.get_text("html")
214
+
215
+ Path(output_path).write_text(
216
+ html,
217
+ encoding="utf-8"
218
+ )
219
+
220
+ return True
221
+
222
+ def _update(self, callback, value):
223
+ try:
224
+ if callback:
225
+ callback(value)
226
+ except:
227
+ pass