# AFC / converters / document_converter.py
# embedingHF's picture
# Upload folder using huggingface_hub
# 8470fc3 verified
import html
import os
import shutil
import traceback
from pathlib import Path
from typing import Callable, Dict, Any

# Optional imports with error handling
try:
    from docx import Document
    DOCX_AVAILABLE = True
except ImportError:
    DOCX_AVAILABLE = False
    print("⚠ python-docx not installed. DOCX conversion will not work.")
try:
    import fitz # PyMuPDF
    FITZ_AVAILABLE = True
except ImportError:
    FITZ_AVAILABLE = False
    print("⚠ PyMuPDF not installed. PDF conversion will not work.")
try:
    import markdown
    MARKDOWN_AVAILABLE = True
except ImportError:
    MARKDOWN_AVAILABLE = False
    print("⚠ markdown not installed. MD conversion will not work.")
try:
    from bs4 import BeautifulSoup
    BS4_AVAILABLE = True
except ImportError:
    BS4_AVAILABLE = False
    print("⚠ beautifulsoup4 not installed. HTML conversion will not work.")
class DocumentConverter:
    """Convert documents between PDF, DOCX, TXT, Markdown and HTML.

    The source format is chosen from the input file extension and the target
    format from the output file extension (both case-insensitive). Every
    public method reports failure by returning False and printing a message;
    none of them raise.

    Optional third-party libraries (PyMuPDF, python-docx, markdown,
    beautifulsoup4) are detected at import time through the module-level
    ``*_AVAILABLE`` flags.
    """

    # Encodings tried in order when reading text. latin-1 maps every possible
    # byte, so it never fails and must stay last; anything listed after it
    # would be unreachable.
    _TEXT_ENCODINGS = ('utf-8', 'cp1252', 'latin-1')

    # Block-level tags copied from rendered Markdown into a DOCX document.
    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    _BLOCK_TAGS = _HEADING_TAGS + ('p', 'li', 'pre')
    # Children skipped when taking a list item's own text, because they are
    # visited (and emitted) separately.
    _NESTED_TAGS = _BLOCK_TAGS + ('ul', 'ol', 'blockquote')

    _PDF_STYLE = (
        "\n<style>\n"
        "body { font-family: Arial, sans-serif; margin: 40px; }\n"
        ".page { margin-bottom: 30px; page-break-after: always; }\n"
        ".page-number { color: #666; font-size: 12px; margin-bottom: 10px; }\n"
        "pre { white-space: pre-wrap; word-wrap: break-word; }\n"
        "</style>\n"
    )

    def __init__(self):
        # The converter keeps no state.
        pass

    def convert(self, input_path: str, output_path: str,
                options: Dict[str, Any], progress_callback: Callable = None) -> bool:
        """Convert ``input_path`` to ``output_path``, dispatching on the input extension.

        Args:
            input_path: Existing source file.
            output_path: Destination file; missing parent directories are created.
            options: Passed through to the format-specific converter (currently
                not read by any of them).
            progress_callback: Optional callable receiving an integer percentage.

        Returns:
            True when the format-specific converter reports success, False on a
            missing input file, a missing required library, or any error.
        """
        input_ext = Path(input_path).suffix.lower()
        try:
            self._update_progress(progress_callback, 10)

            if not os.path.exists(input_path):
                print(f"Input file not found: {input_path}")
                return False

            # Create output directory if needed
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)

            if input_ext == '.pdf':
                if not FITZ_AVAILABLE:
                    print("PyMuPDF not available for PDF conversion")
                    return False
                result = self.convert_pdf(input_path, output_path, options, progress_callback)
            elif input_ext in ('.docx', '.doc'):
                # NOTE: legacy binary .doc files are routed here as well; whether
                # python-docx can open a given file is left to convert_docx,
                # which returns False if loading fails.
                if not DOCX_AVAILABLE:
                    print("python-docx not available for DOCX conversion")
                    return False
                result = self.convert_docx(input_path, output_path, options, progress_callback)
            elif input_ext == '.txt':
                result = self.convert_txt(input_path, output_path, options, progress_callback)
            elif input_ext == '.md':
                if not MARKDOWN_AVAILABLE:
                    print("markdown library not available")
                    return False
                result = self.convert_markdown(input_path, output_path, options, progress_callback)
            elif input_ext in ('.html', '.htm'):
                result = self.convert_html(input_path, output_path, options, progress_callback)
            else:
                result = self.convert_generic(input_path, output_path, options, progress_callback)

            if result:
                print(f"✓ Successfully converted: {os.path.basename(input_path)}"
                      f" -> {os.path.basename(output_path)}")
            return result
        except Exception as e:
            # Top-level boundary: report and signal failure instead of raising.
            print(f"Document conversion error for {input_path}: {str(e)}")
            traceback.print_exc()
            return False

    def _update_progress(self, callback, value):
        """Report ``value`` to ``callback`` if one was supplied.

        Errors raised by the callback are deliberately ignored so that a
        faulty progress reporter cannot abort a conversion.
        """
        if callback is not None:
            try:
                callback(value)
            except Exception:
                pass

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _target_ext(output_path: str) -> str:
        """Return the lower-cased extension of ``output_path`` (e.g. '.html')."""
        return Path(output_path).suffix.lower()

    @staticmethod
    def _unsupported(source_kind: str, output_path: str) -> bool:
        """Report an unsupported target format; always returns False."""
        ext = Path(output_path).suffix or '(none)'
        print(f"Unsupported output format '{ext}' for {source_kind} input: {output_path}")
        return False

    @classmethod
    def _read_text(cls, path: str) -> str:
        """Read ``path`` as text, trying each encoding in ``_TEXT_ENCODINGS``."""
        for encoding in cls._TEXT_ENCODINGS:
            try:
                with open(path, 'r', encoding=encoding) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue
        # Not reachable while latin-1 is in the list; kept as a safeguard.
        raise ValueError(f"Could not decode {path} as text")

    @staticmethod
    def _write_text(path: str, text: str) -> None:
        """Write ``text`` to ``path`` as UTF-8."""
        with open(path, 'w', encoding='utf-8') as f:
            f.write(text)

    def _save_txt_fallback(self, output_path: str, text: str, note: str) -> None:
        """Write ``text`` next to ``output_path`` with a .txt suffix and print ``note``.

        Only the final suffix is swapped, so directory names that happen to
        contain the extension are left untouched.
        """
        self._write_text(str(Path(output_path).with_suffix('.txt')), text)
        print(note)

    @staticmethod
    def _html_page(title: str, body: str, head_extra: str = "") -> str:
        """Wrap an already-escaped/rendered ``body`` in a minimal HTML document."""
        return (
            '<!DOCTYPE html>\n'
            '<html>\n'
            f'<head><meta charset="UTF-8"><title>{title}</title>{head_extra}</head>\n'
            '<body>\n'
            f'{body}\n'
            '</body></html>'
        )

    @classmethod
    def _own_text(cls, element) -> str:
        """Return an element's text without nested block/list children."""
        parts = []
        for child in element.children:
            child_name = getattr(child, 'name', None)
            if child_name in cls._NESTED_TAGS:
                continue
            parts.append(child.get_text() if child_name else str(child))
        return "".join(parts).strip()

    def _rendered_markdown_to_docx(self, rendered: str, output_path: str) -> None:
        """Copy headings, paragraphs, list items and code blocks into a DOCX file."""
        soup = BeautifulSoup(rendered, 'html.parser')
        doc = Document()
        for element in soup.find_all(list(self._BLOCK_TAGS)):
            if element.name == 'li':
                text = self._own_text(element)
                if text:
                    list_style = 'List Number' if element.parent.name == 'ol' else 'List Bullet'
                    doc.add_paragraph(text, style=list_style)
                continue
            text = element.get_text()
            if not text.strip():
                continue
            if element.name in self._HEADING_TAGS:
                doc.add_heading(text.strip(), level=int(element.name[1]))
            else:
                doc.add_paragraph(text)
        doc.save(output_path)

    # ------------------------------------------------------------------
    # Format-specific converters
    # ------------------------------------------------------------------
    def convert_pdf(self, input_path: str, output_path: str,
                    options: Dict[str, Any], progress_callback: Callable = None) -> bool:
        """Convert a PDF to .txt, .docx or .html by extracting page text.

        Returns False for any other target, when python-docx is needed but
        missing, or when reading/writing fails.
        """
        try:
            target = self._target_ext(output_path)
            if target not in ('.txt', '.docx', '.html'):
                return self._unsupported('PDF', output_path)
            if target == '.docx' and not DOCX_AVAILABLE:
                print("python-docx not available for DOCX output")
                return False

            doc = fitz.open(input_path)
            try:
                total_pages = len(doc)
                self._update_progress(progress_callback, 20)
                page_texts = []
                for page_num in range(total_pages):
                    page_texts.append(doc[page_num].get_text())
                    # Extraction covers the 20%-80% progress range.
                    progress_pct = 20 + (page_num + 1) * 60 // total_pages
                    self._update_progress(progress_callback, progress_pct)
            finally:
                # Release the PDF handle even if extraction fails part-way.
                doc.close()

            if target == '.txt':
                self._write_text(output_path, "".join(page_texts))
            elif target == '.docx':
                docx_doc = Document()
                for text in page_texts:
                    docx_doc.add_paragraph(text)
                docx_doc.save(output_path)
            else:
                # Page text is escaped so '<' and '&' in the PDF survive as text.
                pages = "\n".join(
                    '<div class="page">\n'
                    f'<div class="page-number">Page {number}</div>\n'
                    f'<pre>{html.escape(text, quote=False)}</pre>\n'
                    '</div>'
                    for number, text in enumerate(page_texts, start=1)
                )
                self._write_text(
                    output_path,
                    self._html_page('PDF Content', pages, self._PDF_STYLE),
                )

            self._update_progress(progress_callback, 100)
            return True
        except Exception as e:
            print(f"PDF conversion error: {e}")
            return False

    def convert_docx(self, input_path: str, output_path: str,
                     options: Dict[str, Any], progress_callback: Callable = None) -> bool:
        """Convert a DOCX file to .txt, .html or .md using its paragraph text.

        A .pdf target is not supported; the text is saved to a sibling .txt
        file instead and True is returned. Other targets return False.
        """
        try:
            target = self._target_ext(output_path)
            if target not in ('.txt', '.html', '.md', '.pdf'):
                return self._unsupported('DOCX', output_path)

            doc = Document(input_path)
            self._update_progress(progress_callback, 30)
            paragraphs = [para.text for para in doc.paragraphs]

            if target == '.txt':
                self._write_text(output_path, "\n".join(paragraphs))
            elif target == '.html':
                body = "".join(
                    f"<p>{html.escape(text, quote=False)}</p>"
                    for text in paragraphs if text.strip()
                )
                self._write_text(output_path, self._html_page('Document Content', body))
            elif target == '.md':
                self._write_text(
                    output_path,
                    "\n\n".join(text for text in paragraphs if text.strip()),
                )
            else:
                self._save_txt_fallback(
                    output_path,
                    "\n".join(paragraphs),
                    "Note: DOCX to PDF requires additional libraries. Saved as TXT instead.",
                )

            self._update_progress(progress_callback, 100)
            return True
        except Exception as e:
            print(f"DOCX conversion error: {e}")
            return False

    def convert_txt(self, input_path: str, output_path: str,
                    options: Dict[str, Any], progress_callback: Callable = None) -> bool:
        """Convert a plain-text file to .md, .html or .docx.

        When python-docx is missing, a .docx target is saved as a sibling .txt
        file and True is returned. Other targets return False.
        """
        try:
            target = self._target_ext(output_path)
            if target not in ('.md', '.html', '.docx'):
                return self._unsupported('TXT', output_path)

            content = self._read_text(input_path)
            self._update_progress(progress_callback, 40)

            if target == '.md':
                self._write_text(output_path, content)
            elif target == '.html':
                body = f"<pre>{html.escape(content, quote=False)}</pre>"
                self._write_text(output_path, self._html_page('Text Document', body))
            elif DOCX_AVAILABLE:
                doc = Document()
                doc.add_paragraph(content)
                doc.save(output_path)
            else:
                self._save_txt_fallback(
                    output_path,
                    content,
                    "Note: python-docx not installed. Saved as TXT instead.",
                )

            self._update_progress(progress_callback, 100)
            return True
        except Exception as e:
            print(f"TXT conversion error: {e}")
            return False

    def convert_markdown(self, input_path: str, output_path: str,
                         options: Dict[str, Any], progress_callback: Callable = None) -> bool:
        """Convert a Markdown file to .html or .docx.

        The .docx path needs both python-docx and beautifulsoup4; without them
        the raw Markdown is saved as a sibling .txt file and True is returned.
        Other targets return False.
        """
        try:
            target = self._target_ext(output_path)
            if target not in ('.html', '.docx'):
                return self._unsupported('Markdown', output_path)

            content = self._read_text(input_path)
            self._update_progress(progress_callback, 40)

            if target == '.html':
                # The renderer's output is markup already, so it is not escaped.
                rendered = markdown.markdown(content)
                self._write_text(output_path, self._html_page('Markdown Document', rendered))
            elif DOCX_AVAILABLE and BS4_AVAILABLE:
                self._rendered_markdown_to_docx(markdown.markdown(content), output_path)
            else:
                self._save_txt_fallback(
                    output_path,
                    content,
                    "Note: Required libraries not installed. Saved as TXT instead.",
                )

            self._update_progress(progress_callback, 100)
            return True
        except Exception as e:
            print(f"Markdown conversion error: {e}")
            return False

    def convert_html(self, input_path: str, output_path: str,
                     options: Dict[str, Any], progress_callback: Callable = None) -> bool:
        """Convert an HTML file to .txt, .md or .docx by extracting its text.

        BeautifulSoup is used when available; otherwise tags are stripped with
        a regular expression. Without python-docx a .docx target is saved as a
        sibling .txt file. Other targets return False.
        """
        try:
            target = self._target_ext(output_path)
            if target not in ('.txt', '.md', '.docx'):
                return self._unsupported('HTML', output_path)

            content = self._read_text(input_path)
            self._update_progress(progress_callback, 40)

            if BS4_AVAILABLE:
                text = BeautifulSoup(content, 'html.parser').get_text()
            else:
                # Simple text extraction: drop tags, collapse whitespace.
                import re
                text = re.sub(r'<[^>]+>', ' ', content)
                text = re.sub(r'\s+', ' ', text).strip()

            if target == '.txt':
                self._write_text(output_path, text)
            elif target == '.md':
                self._write_text(output_path, f"# Converted from HTML\n\n{text}")
            elif DOCX_AVAILABLE:
                doc = Document()
                doc.add_paragraph(text)
                doc.save(output_path)
            else:
                self._save_txt_fallback(
                    output_path,
                    text,
                    "Note: python-docx not installed. Saved as TXT instead.",
                )

            self._update_progress(progress_callback, 100)
            return True
        except Exception as e:
            print(f"HTML conversion error: {e}")
            return False

    def convert_generic(self, input_path: str, output_path: str,
                        options: Dict[str, Any], progress_callback: Callable = None) -> bool:
        """Copy a file of unknown type, re-encoding text to UTF-8.

        Files whose first 8 KiB contain a NUL byte are treated as binary and
        copied byte-for-byte; everything else is decoded with the encoding
        fallbacks and rewritten as UTF-8.
        """
        try:
            with open(input_path, 'rb') as src:
                looks_binary = b'\x00' in src.read(8192)

            if looks_binary:
                # copyfile refuses to copy a file onto itself instead of
                # truncating it.
                shutil.copyfile(input_path, output_path)
            else:
                self._write_text(output_path, self._read_text(input_path))

            self._update_progress(progress_callback, 100)
            return True
        except Exception as e:
            print(f"Generic conversion error: {e}")
            return False