| import os
|
| from pathlib import Path
|
| from typing import Callable, Dict, Any
|
| import traceback
|
|
|
|
|
# Optional third-party backends.
#
# Each import is attempted once when the module loads.  The matching
# *_AVAILABLE flag records whether it succeeded so the converter can check the
# flag instead of re-importing, and a one-line warning is printed for every
# backend that is missing.  The ``try`` bodies contain only the import itself
# so that nothing else can be mistaken for a missing dependency.

try:
    from docx import Document
except ImportError:
    DOCX_AVAILABLE = False
    print("⚠ python-docx not installed. DOCX conversion will not work.")
else:
    DOCX_AVAILABLE = True

try:
    import fitz
except ImportError:
    FITZ_AVAILABLE = False
    print("⚠ PyMuPDF not installed. PDF conversion will not work.")
else:
    FITZ_AVAILABLE = True

try:
    import markdown
except ImportError:
    MARKDOWN_AVAILABLE = False
    print("⚠ markdown not installed. MD conversion will not work.")
else:
    MARKDOWN_AVAILABLE = True

try:
    from bs4 import BeautifulSoup
except ImportError:
    BS4_AVAILABLE = False
    # HTML input is still handled without BeautifulSoup (tags are stripped
    # with a regular expression); only Markdown -> DOCX depends on it and
    # degrades to a TXT file.  The warning says so instead of claiming that
    # HTML conversion is unavailable.
    print("⚠ beautifulsoup4 not installed. HTML input will use basic regex "
          "tag stripping; MD → DOCX will be saved as TXT.")
else:
    BS4_AVAILABLE = True
|
|
|
|
|
class DocumentConverter:
    """Convert documents between PDF, DOCX, TXT, Markdown and HTML.

    ``convert`` is the entry point: it picks a per-format handler from the
    input file's extension.  Every ``convert*`` method reports its outcome as
    a ``bool`` (``True`` on success, ``False`` on failure) and prints a
    message instead of raising.

    The ``options`` argument is accepted by every method but is not read by
    any of them.  ``progress_callback``, when given, is called with an integer
    percentage (10 ... 100) as the work advances.
    """

    # Page header used when a PDF is rendered to HTML.
    _PDF_HTML_HEAD = (
        "<!DOCTYPE html>\n"
        "<html>\n"
        "<head>\n"
        '<meta charset="UTF-8">\n'
        "<title>PDF Content</title>\n"
        "<style>\n"
        "body { font-family: Arial, sans-serif; margin: 40px; }\n"
        ".page { margin-bottom: 30px; page-break-after: always; }\n"
        ".page-number { color: #666; font-size: 12px; margin-bottom: 10px; }\n"
        "pre { white-space: pre-wrap; word-wrap: break-word; }\n"
        "</style>\n"
        "</head>\n"
        "<body>\n"
    )

    def __init__(self):
        pass

    # ------------------------------------------------------------------
    # Small private helpers shared by the per-format converters
    # ------------------------------------------------------------------

    @staticmethod
    def _escape_html(text: str) -> str:
        """Return *text* with ``&``, ``<`` and ``>`` replaced by HTML entities."""
        # Imported here so the class does not depend on a new file-level import.
        from html import escape
        return escape(text, quote=False)

    @staticmethod
    def _html_page(title: str, body: str) -> str:
        """Wrap an already-safe HTML *body* in a minimal UTF-8 page."""
        return (
            "<!DOCTYPE html>\n"
            "<html>\n"
            f'<head><meta charset="UTF-8"><title>{title}</title></head>\n'
            "<body>\n"
            f"{body}\n"
            "</body></html>"
        )

    @staticmethod
    def _write_text(path, text: str) -> None:
        """Write *text* to *path* as UTF-8, replacing any existing file."""
        with open(path, 'w', encoding='utf-8') as f:
            f.write(text)

    @staticmethod
    def _txt_fallback_path(output_path: str) -> str:
        """Return *output_path* with its final extension swapped for ``.txt``.

        Only the last suffix is touched, so a directory or file stem that
        happens to contain the same extension text is left alone.
        """
        return str(Path(output_path).with_suffix('.txt'))

    def _update_progress(self, callback, value):
        """Report *value* to *callback*, ignoring a missing or failing callback."""
        if callback is None:
            return
        try:
            callback(value)
        except Exception:
            # Deliberate best effort: a broken progress reporter must never
            # abort a conversion.
            pass

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    def convert(self, input_path: str, output_path: str,
                options: Dict[str, Any], progress_callback: Callable = None) -> bool:
        """Convert *input_path* to *output_path*, dispatching on the input extension.

        The output directory is created if needed.  Extensions without a
        dedicated handler go through ``convert_generic``.

        Args:
            input_path: Existing file to read.
            output_path: File to create; its extension selects the target format.
            options: Passed through to the handler (not read by any handler).
            progress_callback: Optional callable receiving an integer percentage.

        Returns:
            ``True`` if the handler reported success, ``False`` if the input is
            missing, a required library is unavailable, the target format is
            unsupported, or any exception occurred.
        """
        try:
            input_ext = Path(input_path).suffix.lower()
            self._update_progress(progress_callback, 10)

            if not os.path.exists(input_path):
                print(f"Input file not found: {input_path}")
                return False

            Path(output_path).parent.mkdir(parents=True, exist_ok=True)

            if input_ext == '.pdf':
                if not FITZ_AVAILABLE:
                    print("PyMuPDF not available for PDF conversion")
                    return False
                result = self.convert_pdf(input_path, output_path, options, progress_callback)

            # NOTE(review): '.doc' is routed to the python-docx handler as
            # well; legacy binary .doc files are presumably rejected by
            # Document() and then reported as a DOCX conversion error.
            elif input_ext in ('.docx', '.doc'):
                if not DOCX_AVAILABLE:
                    print("python-docx not available for DOCX conversion")
                    return False
                result = self.convert_docx(input_path, output_path, options, progress_callback)

            elif input_ext == '.txt':
                result = self.convert_txt(input_path, output_path, options, progress_callback)

            elif input_ext == '.md':
                if not MARKDOWN_AVAILABLE:
                    print("markdown library not available")
                    return False
                result = self.convert_markdown(input_path, output_path, options, progress_callback)

            elif input_ext == '.html':
                result = self.convert_html(input_path, output_path, options, progress_callback)

            else:
                result = self.convert_generic(input_path, output_path, options, progress_callback)

            if result:
                output_ext = Path(output_path).suffix.lower()
                print(f"✓ Successfully converted: {os.path.basename(input_path)} → {output_ext}")

            return result

        except Exception as e:
            print(f"Document conversion error for {input_path}: {str(e)}")
            traceback.print_exc()
            return False

    # ------------------------------------------------------------------
    # Per-format converters
    # ------------------------------------------------------------------

    def convert_pdf(self, input_path: str, output_path: str,
                    options: Dict[str, Any], progress_callback: Callable = None) -> bool:
        """Extract the text of a PDF and write it as TXT, DOCX or HTML.

        Returns ``False`` (writing nothing) for any other target extension,
        when DOCX output is requested without python-docx, or on any error.
        """
        try:
            target = str(output_path).lower()
            if not target.endswith(('.txt', '.docx', '.html')):
                print(f"Unsupported output format for PDF input: {output_path}")
                return False
            if target.endswith('.docx') and not DOCX_AVAILABLE:
                print("python-docx not available for PDF to DOCX conversion")
                return False

            doc = fitz.open(input_path)
            try:
                total_pages = len(doc)
                self._update_progress(progress_callback, 20)

                # Collect per-page text first; page extraction accounts for
                # the 20 % -> 80 % span of the progress range.
                page_texts = []
                for page_num in range(total_pages):
                    page_texts.append(doc[page_num].get_text())
                    progress_pct = 20 + (page_num + 1) * 60 // total_pages
                    self._update_progress(progress_callback, progress_pct)
            finally:
                # Close the PDF even when extraction fails part-way through.
                doc.close()

            if target.endswith('.txt'):
                self._write_text(output_path, "".join(page_texts))

            elif target.endswith('.docx'):
                docx_doc = Document()
                for text in page_texts:
                    docx_doc.add_paragraph(text)  # one paragraph per PDF page
                docx_doc.save(output_path)

            else:  # '.html'
                parts = [self._PDF_HTML_HEAD]
                for number, text in enumerate(page_texts, start=1):
                    parts.append(
                        '\n<div class="page">\n'
                        f'<div class="page-number">Page {number}</div>\n'
                        f'<pre>{self._escape_html(text)}</pre>\n'
                        '</div>\n'
                    )
                parts.append("</body></html>")
                self._write_text(output_path, "".join(parts))

            self._update_progress(progress_callback, 100)
            return True

        except Exception as e:
            print(f"PDF conversion error: {e}")
            return False

    def convert_docx(self, input_path: str, output_path: str,
                     options: Dict[str, Any], progress_callback: Callable = None) -> bool:
        """Write the paragraph text of a DOCX file as TXT, HTML or Markdown.

        A ``.pdf`` target is not really supported: the text is saved next to
        the requested path with a ``.txt`` extension and ``True`` is returned.
        Any other target extension returns ``False`` without writing.
        """
        try:
            target = str(output_path).lower()
            if not target.endswith(('.txt', '.html', '.md', '.pdf')):
                print(f"Unsupported output format for DOCX input: {output_path}")
                return False

            doc = Document(input_path)
            self._update_progress(progress_callback, 30)

            paragraphs = [para.text for para in doc.paragraphs]
            non_empty = [text for text in paragraphs if text.strip()]

            if target.endswith('.txt'):
                self._write_text(output_path, "\n".join(paragraphs))

            elif target.endswith('.html'):
                body = "".join(f"<p>{self._escape_html(text)}</p>" for text in non_empty)
                self._write_text(output_path, self._html_page("Document Content", body))

            elif target.endswith('.md'):
                self._write_text(output_path, "\n\n".join(non_empty))

            else:  # '.pdf'
                self._write_text(self._txt_fallback_path(output_path), "\n".join(paragraphs))
                print("Note: DOCX to PDF requires additional libraries. Saved as TXT instead.")

            self._update_progress(progress_callback, 100)
            return True

        except Exception as e:
            print(f"DOCX conversion error: {e}")
            return False

    def convert_txt(self, input_path: str, output_path: str,
                    options: Dict[str, Any], progress_callback: Callable = None) -> bool:
        """Convert a UTF-8 text file to Markdown, HTML or DOCX.

        Markdown output is a verbatim copy.  HTML output wraps the escaped
        text in ``<pre>``.  DOCX output falls back to a ``.txt`` file when
        python-docx is missing.  Other target extensions return ``False``.
        """
        try:
            target = str(output_path).lower()
            if not target.endswith(('.md', '.html', '.docx')):
                print(f"Unsupported output format for TXT input: {output_path}")
                return False

            with open(input_path, 'r', encoding='utf-8') as f:
                content = f.read()

            self._update_progress(progress_callback, 40)

            if target.endswith('.md'):
                self._write_text(output_path, content)

            elif target.endswith('.html'):
                body = f"<pre>{self._escape_html(content)}</pre>"
                self._write_text(output_path, self._html_page("Text Document", body))

            else:  # '.docx'
                if DOCX_AVAILABLE:
                    doc = Document()
                    doc.add_paragraph(content)
                    doc.save(output_path)
                else:
                    self._write_text(self._txt_fallback_path(output_path), content)
                    print("Note: python-docx not installed. Saved as TXT instead.")

            self._update_progress(progress_callback, 100)
            return True

        except Exception as e:
            print(f"TXT conversion error: {e}")
            return False

    def convert_markdown(self, input_path: str, output_path: str,
                         options: Dict[str, Any], progress_callback: Callable = None) -> bool:
        """Render a UTF-8 Markdown file to HTML or DOCX.

        DOCX output needs both python-docx and BeautifulSoup; without them the
        Markdown source is saved as ``.txt`` instead.  Other target extensions
        return ``False``.
        """
        try:
            target = str(output_path).lower()
            if not target.endswith(('.html', '.docx')):
                print(f"Unsupported output format for Markdown input: {output_path}")
                return False

            with open(input_path, 'r', encoding='utf-8') as f:
                content = f.read()

            self._update_progress(progress_callback, 40)

            if target.endswith('.html'):
                # markdown.markdown() already returns HTML, so it is embedded as is.
                rendered = markdown.markdown(content)
                self._write_text(output_path, self._html_page("Markdown Document", rendered))

            else:  # '.docx'
                if DOCX_AVAILABLE and BS4_AVAILABLE:
                    soup = BeautifulSoup(markdown.markdown(content), 'html.parser')
                    doc = Document()
                    # NOTE: only <p> elements are carried over; headings,
                    # lists and code blocks are not written to the DOCX.
                    for para in soup.find_all('p'):
                        para_text = para.get_text()
                        if para_text.strip():
                            doc.add_paragraph(para_text)
                    doc.save(output_path)
                else:
                    self._write_text(self._txt_fallback_path(output_path), content)
                    print("Note: Required libraries not installed. Saved as TXT instead.")

            self._update_progress(progress_callback, 100)
            return True

        except Exception as e:
            print(f"Markdown conversion error: {e}")
            return False

    def convert_html(self, input_path: str, output_path: str,
                     options: Dict[str, Any], progress_callback: Callable = None) -> bool:
        """Reduce a UTF-8 HTML file to its text and write TXT, Markdown or DOCX.

        Text is extracted with BeautifulSoup when available, otherwise tags
        are stripped with a regular expression and whitespace is collapsed.
        DOCX output falls back to a ``.txt`` file when python-docx is missing.
        Other target extensions return ``False``.
        """
        try:
            target = str(output_path).lower()
            if not target.endswith(('.txt', '.md', '.docx')):
                print(f"Unsupported output format for HTML input: {output_path}")
                return False

            with open(input_path, 'r', encoding='utf-8') as f:
                content = f.read()

            self._update_progress(progress_callback, 40)

            if BS4_AVAILABLE:
                text = BeautifulSoup(content, 'html.parser').get_text()
            else:
                import re
                text = re.sub(r'<[^>]+>', ' ', content)
                text = re.sub(r'\s+', ' ', text).strip()

            if target.endswith('.txt'):
                self._write_text(output_path, text)

            elif target.endswith('.md'):
                self._write_text(output_path, f"# Converted from HTML\n\n{text}")

            else:  # '.docx'
                if DOCX_AVAILABLE:
                    doc = Document()
                    doc.add_paragraph(text)
                    doc.save(output_path)
                else:
                    self._write_text(self._txt_fallback_path(output_path), text)
                    print("Note: python-docx not installed. Saved as TXT instead.")

            self._update_progress(progress_callback, 100)
            return True

        except Exception as e:
            print(f"HTML conversion error: {e}")
            return False

    def convert_generic(self, input_path: str, output_path: str,
                        options: Dict[str, Any], progress_callback: Callable = None) -> bool:
        """Re-encode an arbitrary text file as UTF-8.

        The input is decoded with the first encoding that succeeds out of
        UTF-8, Windows-1252 and Latin-1, then written to *output_path* as
        UTF-8.  The target extension is not inspected.
        """
        try:
            # Order matters: latin-1 maps every possible byte and therefore
            # never fails, so it has to be the last resort.  Trying cp1252
            # first keeps Windows punctuation (curly quotes, dashes, ...) from
            # being decoded as control characters.
            encodings = ['utf-8', 'cp1252', 'latin-1']
            content = None

            for encoding in encodings:
                try:
                    with open(input_path, 'r', encoding=encoding) as f:
                        content = f.read()
                    break
                except UnicodeDecodeError:
                    continue

            if content is None:
                # Purely defensive: with latin-1 in the list a decode always
                # succeeds, but keep the raw byte copy in case the list changes.
                with open(input_path, 'rb') as src, open(output_path, 'wb') as dst:
                    dst.write(src.read())
            else:
                self._write_text(output_path, content)

            self._update_progress(progress_callback, 100)
            return True

        except Exception as e:
            print(f"Generic conversion error: {e}")
            return False