embedingHF
/

AFC

@@ -1,382 +1,227 @@
 import os
 from pathlib import Path
 from typing import Callable, Dict, Any
-import traceback
-# Optional imports with error handling
-try:
-    from docx import Document
-    DOCX_AVAILABLE = True
-except ImportError:
-    DOCX_AVAILABLE = False
-    print("⚠ python-docx not installed. DOCX conversion will not work.")
-try:
-    import fitz  # PyMuPDF
-    FITZ_AVAILABLE = True
-except ImportError:
-    FITZ_AVAILABLE = False
-    print("⚠ PyMuPDF not installed. PDF conversion will not work.")
-try:
-    import markdown
-    MARKDOWN_AVAILABLE = True
-except ImportError:
-    MARKDOWN_AVAILABLE = False
-    print("⚠ markdown not installed. MD conversion will not work.")
-try:
-    from bs4 import BeautifulSoup
-    BS4_AVAILABLE = True
-except ImportError:
-    BS4_AVAILABLE = False
-    print("⚠ beautifulsoup4 not installed. HTML conversion will not work.")
-class DocumentConverter:
-    def __init__(self):
-        pass
-    def convert(self, input_path: str, output_path: str,
-                options: Dict[str, Any], progress_callback: Callable = None) -> bool:
-        """Convert document files"""
-        input_ext = Path(input_path).suffix.lower()
-        try:
-            self._update_progress(progress_callback, 10)
-            # Check if input file exists
-            if not os.path.exists(input_path):
-                print(f"Input file not found: {input_path}")
-                return False
-            # Create output directory if needed
-            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
-            result = False
-            # PDF conversion
-            if input_ext == '.pdf':
-                if not FITZ_AVAILABLE:
-                    print("PyMuPDF not available for PDF conversion")
-                    return False
-                result = self.convert_pdf(input_path, output_path, options, progress_callback)
-            # DOCX conversion
-            elif input_ext in ['.docx', '.doc']:
-                if not DOCX_AVAILABLE:
-                    print("python-docx not available for DOCX conversion")
-                    return False
-                result = self.convert_docx(input_path, output_path, options, progress_callback)
-            # TXT conversion
-            elif input_ext == '.txt':
-                result = self.convert_txt(input_path, output_path, options, progress_callback)
-            # Markdown conversion
-            elif input_ext == '.md':
-                if not MARKDOWN_AVAILABLE:
-                    print("markdown library not available")
-                    return False
-                result = self.convert_markdown(input_path, output_path, options, progress_callback)
-            # HTML conversion
-            elif input_ext == '.html':
-                result = self.convert_html(input_path, output_path, options, progress_callback)
             else:
-                result = self.convert_generic(input_path, output_path, options, progress_callback)
-            if result:
-                output_ext = Path(output_path).suffix.lower()
-                print(f"✓ Successfully converted: {os.path.basename(input_path)} → {output_ext}")
-            return result
         except Exception as e:
-            print(f"Document conversion error for {input_path}: {str(e)}")
-            traceback.print_exc()
             return False
-    def _update_progress(self, callback, value):
-        """Safely update progress"""
-        if callback is not None:
-            try:
-                callback(value)
-            except Exception:
-                pass
-    def convert_pdf(self, input_path: str, output_path: str,
-                    options: Dict[str, Any], progress_callback: Callable = None) -> bool:
-        """Convert PDF to other formats"""
-        try:
-            doc = fitz.open(input_path)
-            total_pages = len(doc)
-            self._update_progress(progress_callback, 20)
-            if output_path.endswith('.txt'):
-                text = ""
-                for page_num in range(total_pages):
-                    page = doc[page_num]
-                    text += page.get_text()
-                    progress_pct = 20 + (page_num + 1) * 60 // total_pages
-                    self._update_progress(progress_callback, progress_pct)
-                with open(output_path, 'w', encoding='utf-8') as f:
-                    f.write(text)
-            elif output_path.endswith('.docx'):
-                docx_doc = Document()
-                for page_num in range(total_pages):
-                    page = doc[page_num]
-                    text = page.get_text()
-                    docx_doc.add_paragraph(text)
-                    progress_pct = 20 + (page_num + 1) * 60 // total_pages
-                    self._update_progress(progress_callback, progress_pct)
-                docx_doc.save(output_path)
-            elif output_path.endswith('.html'):
-                html_content = """<!DOCTYPE html>
-<html>
-<head>
-    <meta charset="UTF-8">
-    <title>PDF Content</title>
-    <style>
-        body { font-family: Arial, sans-serif; margin: 40px; }
-        .page { margin-bottom: 30px; page-break-after: always; }
-        .page-number { color: #666; font-size: 12px; margin-bottom: 10px; }
-        pre { white-space: pre-wrap; word-wrap: break-word; }
-    </style>
-</head>
-<body>
-"""
-                for page_num in range(total_pages):
-                    page = doc[page_num]
-                    text = page.get_text()
-                    html_content += f"""
-<div class="page">
-    <div class="page-number">Page {page_num + 1}</div>
-    <pre>{text}</pre>
-</div>
-"""
-                    progress_pct = 20 + (page_num + 1) * 60 // total_pages
-                    self._update_progress(progress_callback, progress_pct)
-                html_content += "</body></html>"
-                with open(output_path, 'w', encoding='utf-8') as f:
-                    f.write(html_content)
-            doc.close()
-            self._update_progress(progress_callback, 100)
-            return True
-        except Exception as e:
-            print(f"PDF conversion error: {e}")
-            return False
-    def convert_docx(self, input_path: str, output_path: str,
-                     options: Dict[str, Any], progress_callback: Callable = None) -> bool:
-        """Convert DOCX to other formats"""
-        try:
-            doc = Document(input_path)
-            self._update_progress(progress_callback, 30)
-            if output_path.endswith('.txt'):
-                text = "\n".join([para.text for para in doc.paragraphs])
-                with open(output_path, 'w', encoding='utf-8') as f:
-                    f.write(text)
-            elif output_path.endswith('.html'):
-                html_content = """<!DOCTYPE html>
-<html>
-<head><meta charset="UTF-8"><title>Document Content</title></head>
-<body>
-"""
-                for para in doc.paragraphs:
-                    if para.text.strip():
-                        html_content += f"<p>{para.text}</p>"
-                html_content += "</body></html>"
-                with open(output_path, 'w', encoding='utf-8') as f:
-                    f.write(html_content)
-            elif output_path.endswith('.md'):
-                markdown_content = "\n\n".join([para.text for para in doc.paragraphs if para.text.strip()])
-                with open(output_path, 'w', encoding='utf-8') as f:
-                    f.write(markdown_content)
-            elif output_path.endswith('.pdf'):
-                # Simple PDF conversion using text extraction
-                text = "\n".join([para.text for para in doc.paragraphs])
-                with open(output_path.replace('.pdf', '.txt'), 'w', encoding='utf-8') as f:
-                    f.write(text)
-                print("Note: DOCX to PDF requires additional libraries. Saved as TXT instead.")
-            self._update_progress(progress_callback, 100)
-            return True
-        except Exception as e:
-            print(f"DOCX conversion error: {e}")
-            return False
-    def convert_txt(self, input_path: str, output_path: str,
-                    options: Dict[str, Any], progress_callback: Callable = None) -> bool:
-        """Convert TXT to other formats"""
-        try:
-            with open(input_path, 'r', encoding='utf-8') as f:
-                content = f.read()
-            self._update_progress(progress_callback, 40)
-            if output_path.endswith('.md'):
-                with open(output_path, 'w', encoding='utf-8') as f:
-                    f.write(content)
-            elif output_path.endswith('.html'):
-                html_content = f"""<!DOCTYPE html>
-<html>
-<head><meta charset="UTF-8"><title>Text Document</title></head>
-<body>
-<pre>{content}</pre>
-</body></html>"""
-                with open(output_path, 'w', encoding='utf-8') as f:
-                    f.write(html_content)
-            elif output_path.endswith('.docx'):
-                if DOCX_AVAILABLE:
-                    doc = Document()
-                    doc.add_paragraph(content)
-                    doc.save(output_path)
-                else:
-                    with open(output_path.replace('.docx', '.txt'), 'w', encoding='utf-8') as f:
-                        f.write(content)
-                    print("Note: python-docx not installed. Saved as TXT instead.")
-            self._update_progress(progress_callback, 100)
-            return True
-        except Exception as e:
-            print(f"TXT conversion error: {e}")
-            return False
-    def convert_markdown(self, input_path: str, output_path: str,
-                         options: Dict[str, Any], progress_callback: Callable = None) -> bool:
-        """Convert Markdown to other formats"""
-        try:
-            with open(input_path, 'r', encoding='utf-8') as f:
-                content = f.read()
-            self._update_progress(progress_callback, 40)
-            if output_path.endswith('.html'):
-                html_content = markdown.markdown(content)
-                full_html = f"""<!DOCTYPE html>
-<html>
-<head><meta charset="UTF-8"><title>Markdown Document</title></head>
-<body>
-{html_content}
-</body></html>"""
-                with open(output_path, 'w', encoding='utf-8') as f:
-                    f.write(full_html)
-            elif output_path.endswith('.docx'):
-                if DOCX_AVAILABLE and BS4_AVAILABLE:
-                    html = markdown.markdown(content)
-                    soup = BeautifulSoup(html, 'html.parser')
-                    doc = Document()
-                    for para in soup.find_all('p'):
-                        if para.get_text().strip():
-                            doc.add_paragraph(para.get_text())
-                    doc.save(output_path)
-                else:
-                    with open(output_path.replace('.docx', '.txt'), 'w', encoding='utf-8') as f:
-                        f.write(content)
-                    print("Note: Required libraries not installed. Saved as TXT instead.")
-            self._update_progress(progress_callback, 100)
-            return True
-        except Exception as e:
-            print(f"Markdown conversion error: {e}")
-            return False
-    def convert_html(self, input_path: str, output_path: str,
-                     options: Dict[str, Any], progress_callback: Callable = None) -> bool:
-        """Convert HTML to other formats"""
-        try:
-            with open(input_path, 'r', encoding='utf-8') as f:
-                content = f.read()
-            self._update_progress(progress_callback, 40)
-            if BS4_AVAILABLE:
-                soup = BeautifulSoup(content, 'html.parser')
-                text = soup.get_text()
-            else:
-                # Simple text extraction
-                import re
-                text = re.sub(r'<[^>]+>', ' ', content)
-                text = re.sub(r'\s+', ' ', text).strip()
-            if output_path.endswith('.txt'):
-                with open(output_path, 'w', encoding='utf-8') as f:
-                    f.write(text)
-            elif output_path.endswith('.md'):
-                with open(output_path, 'w', encoding='utf-8') as f:
-                    f.write(f"# Converted from HTML\n\n{text}")
-            elif output_path.endswith('.docx'):
-                if DOCX_AVAILABLE:
-                    doc = Document()
-                    doc.add_paragraph(text)
-                    doc.save(output_path)
-                else:
-                    with open(output_path.replace('.docx', '.txt'), 'w', encoding='utf-8') as f:
-                        f.write(text)
-            self._update_progress(progress_callback, 100)
-            return True
-        except Exception as e:
-            print(f"HTML conversion error: {e}")
-            return False
-    def convert_generic(self, input_path: str, output_path: str,
-                        options: Dict[str, Any], progress_callback: Callable = None) -> bool:
-        """Generic text file conversion"""
-        try:
-            # Try to read as text
-            encodings = ['utf-8', 'latin-1', 'cp1252']
-            content = None
-            for encoding in encodings:
-                try:
-                    with open(input_path, 'r', encoding=encoding) as f:
-                        content = f.read()
-                    break
-                except UnicodeDecodeError:
-                    continue
-            if content is None:
-                # If can't read as text, just copy binary
-                with open(input_path, 'rb') as src:
-                    with open(output_path, 'wb') as dst:
-                        dst.write(src.read())
-            else:
-                with open(output_path, 'w', encoding='utf-8') as f:
-                    f.write(content)
-            self._update_progress(progress_callback, 100)
-            return True
-        except Exception as e:
-            print(f"Generic conversion error: {e}")
-            return False

 import os
 from pathlib import Path
 from typing import Callable, Dict, Any
+import markdown
+import fitz
+from docx import Document
+from bs4 import BeautifulSoup
+class DocumentConverter:
+    def convert(self, input_path: str, output_path: str,
+                options: Dict[str, Any] | None = None,
+                progress_callback: Callable | None = None) -> bool:
+        """Convert ``input_path`` into ``output_path`` based on both suffixes.
+
+        The lower-cased suffixes of the two paths select one of the
+        converter methods of this class. Progress is reported as 10 before
+        the work starts and as 100 after the selected converter returns.
+
+        Args:
+            input_path: Path of the file to read.
+            output_path: Path of the file to write; its parent directory is
+                created when missing.
+            options: Accepted for interface compatibility; not read here.
+            progress_callback: Optional callable invoked with the progress
+                value.
+
+        Returns:
+            The selected converter's result, or ``False`` when anything
+            raised (including an unsupported suffix pair); the error is
+            printed in that case.
+        """
+        # (input suffix, output suffix) -> name of the method doing the work.
+        handler_names = {
+            (".txt", ".pdf"): "txt_to_pdf",
+            (".txt", ".html"): "txt_to_html",
+            (".txt", ".md"): "txt_to_md",
+            (".md", ".html"): "md_to_html",
+            (".md", ".txt"): "md_to_text",
+            (".html", ".txt"): "html_to_text",
+            (".html", ".md"): "html_to_md",
+            (".docx", ".txt"): "docx_to_text",
+            (".pdf", ".txt"): "pdf_to_text",
+            (".pdf", ".html"): "pdf_to_html",
+        }
+        try:
+            input_ext = Path(input_path).suffix.lower()
+            target = Path(output_path)
+            output_ext = target.suffix.lower()
+            self._update(progress_callback, 10)
+            target.parent.mkdir(parents=True, exist_ok=True)
+            handler_name = handler_names.get((input_ext, output_ext))
+            if handler_name is None:
+                raise ValueError(f"Unsupported conversion: {input_ext} -> {output_ext}")
+            success = getattr(self, handler_name)(input_path, output_path)
+            self._update(progress_callback, 100)
+            return success
         except Exception as e:
+            print(f"Document conversion error: {e}")
             return False
+    def txt_to_pdf(self, input_path, output_path):
+        """Render a UTF-8 text file into a PDF, using as many pages as needed.
+
+        The text is split into lines, over-long lines are wrapped, and the
+        lines are distributed over consecutive pages so that no content is
+        dropped. Undecodable bytes in the input are ignored.
+
+        Args:
+            input_path: Path of the text file to read.
+            output_path: Path of the PDF file to write.
+
+        Returns:
+            ``True`` once the PDF has been saved; errors propagate to the
+            caller.
+        """
+        import textwrap
+        # Layout heuristics for the default page size and default font of
+        # ``insert_text`` -- deliberately conservative; TODO confirm against
+        # the installed PyMuPDF version.
+        margin = 72
+        max_chars_per_line = 80
+        lines_per_page = 45
+        text = Path(input_path).read_text(encoding="utf-8", errors="ignore")
+        lines = []
+        for raw_line in text.splitlines():
+            wrapped = textwrap.wrap(
+                raw_line,
+                width=max_chars_per_line,
+                replace_whitespace=False,
+                drop_whitespace=False,
+            )
+            # ``wrap`` returns [] for an empty line; keep it as a blank line.
+            lines.extend(wrapped or [""])
+        pdf = fitz.open()
+        try:
+            # ``max(..., 1)`` keeps one (blank) page for empty input, matching
+            # the previous single-page behaviour.
+            for start in range(0, max(len(lines), 1), lines_per_page):
+                page = pdf.new_page()
+                chunk = "\n".join(lines[start:start + lines_per_page])
+                page.insert_text((margin, margin), chunk)
+            pdf.save(output_path)
+        finally:
+            # Release the document even when rendering or saving fails.
+            pdf.close()
+        return True
+    def txt_to_html(self, input_path, output_path):
+        """Wrap the contents of a text file in a minimal HTML ``<pre>`` page.
+
+        The text is HTML-escaped before being embedded, so characters such
+        as ``<``, ``>`` and ``&`` are displayed literally instead of being
+        interpreted as markup. Undecodable bytes in the input are ignored.
+
+        Args:
+            input_path: Path of the text file to read.
+            output_path: Path of the HTML file to write (UTF-8).
+
+        Returns:
+            ``True`` once the file has been written; errors propagate to the
+            caller.
+        """
+        # Local import keeps this method self-contained.
+        from html import escape
+        text = Path(input_path).read_text(encoding="utf-8", errors="ignore")
+        # quote=False: quotes are harmless inside element content.
+        document = f"<html><body><pre>{escape(text, quote=False)}</pre></body></html>"
+        Path(output_path).write_text(document, encoding="utf-8")
+        return True
+    def txt_to_md(self, input_path, output_path):
+        """Copy a text file to a Markdown file without altering the text.
+
+        The input is decoded as UTF-8 with undecodable bytes ignored and the
+        result is written back as UTF-8.
+
+        Returns:
+            ``True`` once the file has been written.
+        """
+        with Path(input_path).open(encoding="utf-8", errors="ignore") as reader:
+            content = reader.read()
+        destination = Path(output_path)
+        destination.write_text(content, encoding="utf-8")
+        return True
+    def md_to_html(self, input_path, output_path):
+        """Render a Markdown file to HTML with ``markdown.markdown``.
+
+        The rendered markup is written as-is (no surrounding document
+        wrapper is added). Undecodable bytes in the input are ignored.
+
+        Returns:
+            ``True`` once the file has been written.
+        """
+        with Path(input_path).open(encoding="utf-8", errors="ignore") as reader:
+            source_text = reader.read()
+        rendered = markdown.markdown(source_text)
+        destination = Path(output_path)
+        destination.write_text(rendered, encoding="utf-8")
+        return True
+    def md_to_text(self, input_path, output_path):
+        """Convert a Markdown file to plain text.
+
+        The Markdown is first rendered to HTML, then the text content of
+        that HTML is extracted with BeautifulSoup and written as UTF-8.
+
+        Returns:
+            ``True`` once the file has been written.
+        """
+        with Path(input_path).open(encoding="utf-8", errors="ignore") as reader:
+            source_text = reader.read()
+        rendered = markdown.markdown(source_text)
+        plain = BeautifulSoup(rendered, "html.parser").get_text()
+        Path(output_path).write_text(plain, encoding="utf-8")
+        return True
+    def html_to_text(self, input_path, output_path):
+        """Extract the text content of an HTML file into a text file.
+
+        The input is decoded as UTF-8 (undecodable bytes ignored), parsed
+        with BeautifulSoup's ``html.parser`` and the result of ``get_text``
+        is written as UTF-8.
+
+        Returns:
+            ``True`` once the file has been written.
+        """
+        with Path(input_path).open(encoding="utf-8", errors="ignore") as reader:
+            markup = reader.read()
+        parsed = BeautifulSoup(markup, "html.parser")
+        plain = parsed.get_text()
+        Path(output_path).write_text(plain, encoding="utf-8")
+        return True
+    def html_to_md(self, input_path, output_path):
+        """Write the text content of an HTML file to a Markdown file.
+
+        Only the plain text returned by BeautifulSoup's ``get_text`` is
+        written; no Markdown syntax is generated from the HTML structure.
+
+        Returns:
+            ``True`` once the file has been written.
+        """
+        with Path(input_path).open(encoding="utf-8", errors="ignore") as reader:
+            markup = reader.read()
+        extracted = BeautifulSoup(markup, "html.parser").get_text()
+        destination = Path(output_path)
+        destination.write_text(extracted, encoding="utf-8")
+        return True
+    def docx_to_text(self, input_path, output_path):
+        """Write the paragraph texts of a DOCX file to a text file.
+
+        Each entry of ``doc.paragraphs`` contributes its ``text``; entries
+        are joined with newlines and written as UTF-8.
+
+        Returns:
+            ``True`` once the file has been written.
+        """
+        document = Document(input_path)
+        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
+        joined = "\n".join(paragraph_texts)
+        Path(output_path).write_text(joined, encoding="utf-8")
+        return True
+    def pdf_to_text(self, input_path, output_path):
+        """Extract the text of every page of a PDF into a text file.
+
+        Page texts are concatenated in page order and written as UTF-8. The
+        PDF is closed once extraction finishes, including when it fails.
+
+        Args:
+            input_path: Path of the PDF file to read.
+            output_path: Path of the text file to write.
+
+        Returns:
+            ``True`` once the file has been written; errors propagate to the
+            caller.
+        """
+        doc = fitz.open(input_path)
+        try:
+            # join() avoids rebuilding the string once per page.
+            text = "".join(page.get_text() for page in doc)
+        finally:
+            # Release the file handle even when extraction fails.
+            doc.close()
+        Path(output_path).write_text(text, encoding="utf-8")
+        return True
+    def pdf_to_html(self, input_path, output_path):
+        """Export every page of a PDF as HTML into a single file.
+
+        The output of ``page.get_text("html")`` for each page is
+        concatenated in page order and written as UTF-8. The PDF is closed
+        once extraction finishes, including when it fails.
+
+        Args:
+            input_path: Path of the PDF file to read.
+            output_path: Path of the HTML file to write.
+
+        Returns:
+            ``True`` once the file has been written; errors propagate to the
+            caller.
+        """
+        doc = fitz.open(input_path)
+        try:
+            # join() avoids rebuilding the string once per page.
+            html = "".join(page.get_text("html") for page in doc)
+        finally:
+            # Release the file handle even when extraction fails.
+            doc.close()
+        Path(output_path).write_text(html, encoding="utf-8")
+        return True
+    def _update(self, callback, value):
+        """Report ``value`` to ``callback`` on a best-effort basis.
+
+        Nothing happens when ``callback`` is falsy. Ordinary exceptions
+        raised by the callback are suppressed so that progress reporting
+        cannot abort a conversion; ``KeyboardInterrupt`` and ``SystemExit``
+        are not suppressed.
+
+        Args:
+            callback: Optional callable taking the progress value.
+            value: Progress value passed to the callback.
+        """
+        try:
+            if callback:
+                callback(value)
+        except Exception:
+            # Deliberately ignored: a failing progress hook is not fatal.
+            pass