"""Convert documents between PDF, DOCX, TXT, Markdown and HTML formats.

Third-party back-ends (python-docx, PyMuPDF, markdown, beautifulsoup4) are
optional: each one is imported defensively and a module-level ``*_AVAILABLE``
flag records whether it can be used.
"""

import html
import os
import re
import traceback
from pathlib import Path
from typing import Any, Callable, Dict, Optional

# Optional imports with error handling
try:
    from docx import Document
    DOCX_AVAILABLE = True
except ImportError:
    DOCX_AVAILABLE = False
    print("⚠ python-docx not installed. DOCX conversion will not work.")

try:
    import fitz  # PyMuPDF
    FITZ_AVAILABLE = True
except ImportError:
    FITZ_AVAILABLE = False
    print("⚠ PyMuPDF not installed. PDF conversion will not work.")

try:
    import markdown
    MARKDOWN_AVAILABLE = True
except ImportError:
    MARKDOWN_AVAILABLE = False
    print("⚠ markdown not installed. MD conversion will not work.")

try:
    from bs4 import BeautifulSoup
    BS4_AVAILABLE = True
except ImportError:
    BS4_AVAILABLE = False
    print("⚠ beautifulsoup4 not installed. HTML conversion will not work.")


class DocumentConverter:
    """Convert a document file into another document format.

    The entry point is :meth:`convert`, which dispatches on the input file's
    extension to one of the ``convert_*`` methods. Every method reports
    failure by printing a message and returning ``False`` instead of raising.
    The ``options`` argument is accepted by every method but is not read by
    any of them.
    """

    def __init__(self):
        pass

    def convert(self, input_path: str, output_path: str, options: Dict[str, Any],
                progress_callback: Optional[Callable[[int], None]] = None) -> bool:
        """Convert ``input_path`` to ``output_path``, choosing a converter by input extension.

        Args:
            input_path: Path of the file to convert; must exist.
            output_path: Destination path. Its extension selects the target
                format and its parent directory is created when missing.
            options: Passed through to the selected converter.
            progress_callback: Optional callable receiving an integer
                percentage (10 at start, 100 on success).

        Returns:
            ``True`` when the selected converter reported success, ``False``
            when the input is missing, a required library is unavailable, the
            target format is unsupported, or any exception occurred.
        """
        input_ext = Path(input_path).suffix.lower()

        try:
            self._update_progress(progress_callback, 10)

            # Check if input file exists
            if not os.path.exists(input_path):
                print(f"Input file not found: {input_path}")
                return False

            # Create output directory if needed
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)

            if input_ext == '.pdf':
                if not FITZ_AVAILABLE:
                    print("PyMuPDF not available for PDF conversion")
                    return False
                result = self.convert_pdf(input_path, output_path, options, progress_callback)
            elif input_ext in ('.docx', '.doc'):
                if not DOCX_AVAILABLE:
                    print("python-docx not available for DOCX conversion")
                    return False
                result = self.convert_docx(input_path, output_path, options, progress_callback)
            elif input_ext == '.txt':
                result = self.convert_txt(input_path, output_path, options, progress_callback)
            elif input_ext == '.md':
                if not MARKDOWN_AVAILABLE:
                    print("markdown library not available")
                    return False
                result = self.convert_markdown(input_path, output_path, options, progress_callback)
            elif input_ext == '.html':
                result = self.convert_html(input_path, output_path, options, progress_callback)
            else:
                result = self.convert_generic(input_path, output_path, options, progress_callback)

            if result:
                output_ext = Path(output_path).suffix.lower()
                print(f"✓ Successfully converted: {os.path.basename(input_path)} → {output_ext}")

            return result

        except Exception as e:
            # Top-level boundary: report the failure and signal it via the return value.
            print(f"Document conversion error for {input_path}: {str(e)}")
            traceback.print_exc()
            return False

    def _update_progress(self, callback, value):
        """Call ``callback(value)`` if a callback was supplied.

        Exceptions raised by the callback are deliberately ignored so that a
        faulty progress reporter cannot abort a conversion.
        """
        if callback is not None:
            try:
                callback(value)
            except Exception:
                pass

    @staticmethod
    def _write_text(path: str, text: str) -> None:
        """Write ``text`` to ``path`` as UTF-8, replacing any existing file."""
        with open(path, 'w', encoding='utf-8') as f:
            f.write(text)

    @staticmethod
    def _html_page(title: str, body: str) -> str:
        """Return a complete HTML document with ``title`` (escaped) around ``body`` (used as-is)."""
        return (
            "<!DOCTYPE html>\n"
            "<html>\n"
            "<head>\n"
            "<meta charset=\"utf-8\">\n"
            f"<title>{html.escape(title)}</title>\n"
            "</head>\n"
            "<body>\n"
            f"{body}\n"
            "</body>\n"
            "</html>\n"
        )

    def _save_txt_fallback(self, output_path: str, suffix: str, text: str, note: str) -> None:
        """Write ``text`` next to ``output_path`` with ``suffix`` swapped for ``.txt``.

        Only the trailing ``suffix`` is replaced, so directory or file names
        that happen to contain the same characters are left untouched.
        """
        fallback_path = output_path[:-len(suffix)] + '.txt'
        self._write_text(fallback_path, text)
        print(note)

    def convert_pdf(self, input_path: str, output_path: str, options: Dict[str, Any],
                    progress_callback: Optional[Callable[[int], None]] = None) -> bool:
        """Convert a PDF to TXT, DOCX or HTML by extracting each page's text.

        Progress goes from 20 to 80 while pages are read and to 100 once the
        output has been written.

        Returns:
            ``True`` on success; ``False`` for an unsupported target
            extension, a missing python-docx for DOCX output, or any error.
        """
        try:
            target = output_path.lower()
            if not target.endswith(('.txt', '.docx', '.html')):
                print(f"Unsupported output format for PDF conversion: {output_path}")
                return False
            if target.endswith('.docx') and not DOCX_AVAILABLE:
                print("python-docx not available for PDF to DOCX conversion")
                return False

            doc = fitz.open(input_path)
            try:
                total_pages = len(doc)
                self._update_progress(progress_callback, 20)

                page_texts = []
                for page_num, page in enumerate(doc, start=1):
                    page_texts.append(page.get_text())
                    self._update_progress(progress_callback, 20 + page_num * 60 // total_pages)
            finally:
                # Release the file handle even when text extraction fails.
                doc.close()

            if target.endswith('.txt'):
                self._write_text(output_path, "".join(page_texts))
            elif target.endswith('.docx'):
                docx_doc = Document()
                for page_text in page_texts:
                    docx_doc.add_paragraph(page_text)
                docx_doc.save(output_path)
            else:
                sections = [
                    f"<h2>Page {page_num}</h2>\n<pre>{html.escape(page_text)}</pre>"
                    for page_num, page_text in enumerate(page_texts, start=1)
                ]
                self._write_text(output_path, self._html_page("PDF Content", "\n".join(sections)))

            self._update_progress(progress_callback, 100)
            return True

        except Exception as e:
            print(f"PDF conversion error: {e}")
            return False

    def convert_docx(self, input_path: str, output_path: str, options: Dict[str, Any],
                     progress_callback: Optional[Callable[[int], None]] = None) -> bool:
        """Convert a DOCX file to TXT, HTML or Markdown using its paragraph text.

        A ``.pdf`` target is not really supported: the paragraph text is saved
        to a ``.txt`` file beside the requested path and a note is printed;
        the method still returns ``True`` in that case.

        Returns:
            ``True`` on success; ``False`` for an unsupported target extension
            or any error.
        """
        try:
            doc = Document(input_path)
            self._update_progress(progress_callback, 30)

            target = output_path.lower()
            paragraphs = [para.text for para in doc.paragraphs]

            if target.endswith('.txt'):
                self._write_text(output_path, "\n".join(paragraphs))
            elif target.endswith('.html'):
                body = "\n".join(
                    f"<p>{html.escape(text)}</p>" for text in paragraphs if text.strip()
                )
                self._write_text(output_path, self._html_page("Document Content", body))
            elif target.endswith('.md'):
                self._write_text(
                    output_path,
                    "\n\n".join(text for text in paragraphs if text.strip()),
                )
            elif target.endswith('.pdf'):
                # Simple PDF conversion using text extraction
                self._save_txt_fallback(
                    output_path, '.pdf', "\n".join(paragraphs),
                    "Note: DOCX to PDF requires additional libraries. Saved as TXT instead.",
                )
            else:
                print(f"Unsupported output format for DOCX conversion: {output_path}")
                return False

            self._update_progress(progress_callback, 100)
            return True

        except Exception as e:
            print(f"DOCX conversion error: {e}")
            return False

    def convert_txt(self, input_path: str, output_path: str, options: Dict[str, Any],
                    progress_callback: Optional[Callable[[int], None]] = None) -> bool:
        """Convert a UTF-8 text file to Markdown, HTML or DOCX.

        When python-docx is missing, a ``.docx`` target is saved as ``.txt``
        beside the requested path and the method still returns ``True``.

        Returns:
            ``True`` on success; ``False`` for an unsupported target extension
            or any error (including input that is not valid UTF-8).
        """
        try:
            with open(input_path, 'r', encoding='utf-8') as f:
                content = f.read()
            self._update_progress(progress_callback, 40)

            target = output_path.lower()

            if target.endswith('.md'):
                self._write_text(output_path, content)
            elif target.endswith('.html'):
                body = f"<pre>{html.escape(content)}</pre>"
                self._write_text(output_path, self._html_page("Text Document", body))
            elif target.endswith('.docx'):
                if DOCX_AVAILABLE:
                    doc = Document()
                    doc.add_paragraph(content)
                    doc.save(output_path)
                else:
                    self._save_txt_fallback(
                        output_path, '.docx', content,
                        "Note: python-docx not installed. Saved as TXT instead.",
                    )
            else:
                print(f"Unsupported output format for TXT conversion: {output_path}")
                return False

            self._update_progress(progress_callback, 100)
            return True

        except Exception as e:
            print(f"TXT conversion error: {e}")
            return False

    def convert_markdown(self, input_path: str, output_path: str, options: Dict[str, Any],
                         progress_callback: Optional[Callable[[int], None]] = None) -> bool:
        """Convert a UTF-8 Markdown file to HTML or DOCX.

        DOCX output keeps only the text of ``<p>`` elements of the rendered
        Markdown. When python-docx or beautifulsoup4 is missing, the raw
        Markdown is saved as ``.txt`` beside the requested path and the method
        still returns ``True``.

        Returns:
            ``True`` on success; ``False`` for an unsupported target extension
            or any error.
        """
        try:
            with open(input_path, 'r', encoding='utf-8') as f:
                content = f.read()
            self._update_progress(progress_callback, 40)

            target = output_path.lower()

            if target.endswith('.html'):
                # The rendered Markdown is already HTML, so it is not escaped.
                rendered = markdown.markdown(content)
                self._write_text(output_path, self._html_page("Markdown Document", rendered))
            elif target.endswith('.docx'):
                if DOCX_AVAILABLE and BS4_AVAILABLE:
                    rendered = markdown.markdown(content)
                    soup = BeautifulSoup(rendered, 'html.parser')
                    doc = Document()
                    for para in soup.find_all('p'):
                        para_text = para.get_text()
                        if para_text.strip():
                            doc.add_paragraph(para_text)
                    doc.save(output_path)
                else:
                    self._save_txt_fallback(
                        output_path, '.docx', content,
                        "Note: Required libraries not installed. Saved as TXT instead.",
                    )
            else:
                print(f"Unsupported output format for Markdown conversion: {output_path}")
                return False

            self._update_progress(progress_callback, 100)
            return True

        except Exception as e:
            print(f"Markdown conversion error: {e}")
            return False

    def convert_html(self, input_path: str, output_path: str, options: Dict[str, Any],
                     progress_callback: Optional[Callable[[int], None]] = None) -> bool:
        """Convert a UTF-8 HTML file to TXT, Markdown or DOCX using its visible text.

        Text is extracted with BeautifulSoup when available, otherwise by
        stripping tags with a regular expression. When python-docx is missing,
        a ``.docx`` target is saved as ``.txt`` beside the requested path and
        the method still returns ``True``.

        Returns:
            ``True`` on success; ``False`` for an unsupported target extension
            or any error.
        """
        try:
            with open(input_path, 'r', encoding='utf-8') as f:
                content = f.read()
            self._update_progress(progress_callback, 40)

            if BS4_AVAILABLE:
                text = BeautifulSoup(content, 'html.parser').get_text()
            else:
                # Simple text extraction
                text = re.sub(r'<[^>]+>', ' ', content)
                text = re.sub(r'\s+', ' ', text).strip()

            target = output_path.lower()

            if target.endswith('.txt'):
                self._write_text(output_path, text)
            elif target.endswith('.md'):
                self._write_text(output_path, f"# Converted from HTML\n\n{text}")
            elif target.endswith('.docx'):
                if DOCX_AVAILABLE:
                    doc = Document()
                    doc.add_paragraph(text)
                    doc.save(output_path)
                else:
                    self._save_txt_fallback(
                        output_path, '.docx', text,
                        "Note: python-docx not installed. Saved as TXT instead.",
                    )
            else:
                print(f"Unsupported output format for HTML conversion: {output_path}")
                return False

            self._update_progress(progress_callback, 100)
            return True

        except Exception as e:
            print(f"HTML conversion error: {e}")
            return False

    def convert_generic(self, input_path: str, output_path: str, options: Dict[str, Any],
                        progress_callback: Optional[Callable[[int], None]] = None) -> bool:
        """Copy a file of unknown type, re-encoding it as UTF-8 when it is text.

        The input is decoded as UTF-8, then as cp1252. If neither decoding
        succeeds the file is treated as binary and copied byte for byte.

        Returns:
            ``True`` on success, ``False`` on any error.
        """
        try:
            # latin-1 is intentionally not in this list: it accepts every byte
            # sequence, which would make the binary-copy fallback unreachable
            # and corrupt binary files by re-encoding them as UTF-8.
            encodings = ['utf-8', 'cp1252']
            content = None

            for encoding in encodings:
                try:
                    with open(input_path, 'r', encoding=encoding) as f:
                        content = f.read()
                    break
                except UnicodeDecodeError:
                    continue

            if content is None:
                # If can't read as text, just copy binary
                with open(input_path, 'rb') as src, open(output_path, 'wb') as dst:
                    dst.write(src.read())
            else:
                self._write_text(output_path, content)

            self._update_progress(progress_callback, 100)
            return True

        except Exception as e:
            print(f"Generic conversion error: {e}")
            return False