AFC / converters / document_converter.py

Upload folder using huggingface_hub

8470fc3 verified 12 days ago

14.2 kB

	import html
	import os
	import traceback
	from pathlib import Path
	from typing import Callable, Dict, Any

	# Optional third-party imports.
	#
	# Each guard sets a module-level *_AVAILABLE flag that the converter checks
	# before touching the library.  When an import fails, the imported name is
	# bound to None so it still exists at module level (instead of being left
	# undefined), and a warning is printed once at import time.
	try:
	    from docx import Document

	    DOCX_AVAILABLE = True
	except ImportError:
	    Document = None
	    DOCX_AVAILABLE = False
	    print("⚠ python-docx not installed. DOCX conversion will not work.")

	try:
	    import fitz  # PyMuPDF

	    FITZ_AVAILABLE = True
	except ImportError:
	    fitz = None
	    FITZ_AVAILABLE = False
	    print("⚠ PyMuPDF not installed. PDF conversion will not work.")

	try:
	    import markdown

	    MARKDOWN_AVAILABLE = True
	except ImportError:
	    markdown = None
	    MARKDOWN_AVAILABLE = False
	    print("⚠ markdown not installed. MD conversion will not work.")

	try:
	    from bs4 import BeautifulSoup

	    BS4_AVAILABLE = True
	except ImportError:
	    BeautifulSoup = None
	    BS4_AVAILABLE = False
	    print("⚠ beautifulsoup4 not installed. HTML conversion will not work.")


	class DocumentConverter:
	    """Convert documents between PDF, DOCX, TXT, Markdown and HTML.

	    Every ``convert*`` method returns ``True`` on success and ``False`` on
	    failure; errors are printed instead of being raised to the caller.
	    ``False`` is also returned when the requested output extension is not
	    handled for the given input type, so success is never reported for a
	    file that was not written.

	    Progress is reported as an integer percentage through the optional
	    ``progress_callback``.  The ``options`` argument is accepted by every
	    method but is not read by this implementation.
	    """

	    # Inline stylesheet embedded in PDF -> HTML output.
	    _PDF_PAGE_STYLE = (
	        "<style>\n"
	        "body { font-family: Arial, sans-serif; margin: 40px; }\n"
	        ".page { margin-bottom: 30px; page-break-after: always; }\n"
	        ".page-number { color: #666; font-size: 12px; margin-bottom: 10px; }\n"
	        "pre { white-space: pre-wrap; word-wrap: break-word; }\n"
	        "</style>\n"
	    )

	    def __init__(self):
	        pass

	    def convert(self, input_path: str, output_path: str,
	                options: Dict[str, Any], progress_callback: Callable = None) -> bool:
	        """Convert ``input_path`` to ``output_path``, dispatching on the input extension.

	        Args:
	            input_path: Existing source file; its lower-cased suffix selects
	                the converter.
	            output_path: Destination file; missing parent directories are
	                created.  Its extension selects the output format.
	            options: Passed through to the specific converter (unused there).
	            progress_callback: Optional callable receiving an int percentage.

	        Returns:
	            ``True`` if the selected converter reported success, else ``False``
	            (missing input, missing optional library, or any exception).
	        """
	        input_ext = Path(input_path).suffix.lower()

	        try:
	            self._update_progress(progress_callback, 10)

	            if not os.path.exists(input_path):
	                print(f"Input file not found: {input_path}")
	                return False

	            # Create output directory if needed
	            Path(output_path).parent.mkdir(parents=True, exist_ok=True)

	            if input_ext == '.pdf':
	                if not FITZ_AVAILABLE:
	                    print("PyMuPDF not available for PDF conversion")
	                    return False
	                result = self.convert_pdf(input_path, output_path, options, progress_callback)

	            # NOTE(review): '.doc' is routed to python-docx as well; legacy
	            # binary .doc files are presumably not readable by it and would
	            # then fail inside convert_docx -- confirm before relying on it.
	            elif input_ext in ('.docx', '.doc'):
	                if not DOCX_AVAILABLE:
	                    print("python-docx not available for DOCX conversion")
	                    return False
	                result = self.convert_docx(input_path, output_path, options, progress_callback)

	            elif input_ext == '.txt':
	                result = self.convert_txt(input_path, output_path, options, progress_callback)

	            elif input_ext == '.md':
	                if not MARKDOWN_AVAILABLE:
	                    print("markdown library not available")
	                    return False
	                result = self.convert_markdown(input_path, output_path, options, progress_callback)

	            elif input_ext == '.html':
	                # No availability check: convert_html has its own fallback
	                # when BeautifulSoup is missing.
	                result = self.convert_html(input_path, output_path, options, progress_callback)

	            else:
	                result = self.convert_generic(input_path, output_path, options, progress_callback)

	            if result:
	                output_ext = Path(output_path).suffix.lower()
	                print(f"✓ Successfully converted: {os.path.basename(input_path)} → {output_ext}")

	            return result

	        except Exception as e:
	            # Top-level boundary: report and signal failure instead of raising.
	            print(f"Document conversion error for {input_path}: {str(e)}")
	            traceback.print_exc()
	            return False

	    def _update_progress(self, callback, value):
	        """Call ``callback(value)`` if a callback was given, ignoring its errors."""
	        if callback is not None:
	            try:
	                callback(value)
	            except Exception:
	                # Deliberate best-effort: a failing progress hook must not
	                # abort the conversion itself.
	                pass

	    @staticmethod
	    def _write_text(path, text: str) -> None:
	        """Write ``text`` to ``path`` as UTF-8."""
	        with open(path, 'w', encoding='utf-8') as f:
	            f.write(text)

	    @staticmethod
	    def _fallback_txt_path(output_path):
	        """Return ``output_path`` with only its final extension replaced by ``.txt``."""
	        return Path(output_path).with_suffix('.txt')

	    @staticmethod
	    def _html_page(title: str, body: str, head_extra: str = "") -> str:
	        """Wrap an already-safe HTML ``body`` fragment in a minimal UTF-8 page."""
	        return (
	            "<!DOCTYPE html>\n"
	            "<html>\n"
	            "<head>\n"
	            "<meta charset=\"UTF-8\">\n"
	            f"<title>{title}</title>\n"
	            f"{head_extra}"
	            "</head>\n"
	            "<body>\n"
	            f"{body}\n"
	            "</body></html>"
	        )

	    def _read_pdf_pages(self, input_path, progress_callback):
	        """Return the extracted text of every page, reporting progress 20..80."""
	        doc = fitz.open(input_path)
	        try:
	            total_pages = len(doc)
	            self._update_progress(progress_callback, 20)

	            page_texts = []
	            for page_num in range(total_pages):
	                page_texts.append(doc[page_num].get_text())
	                progress_pct = 20 + (page_num + 1) * 60 // total_pages
	                self._update_progress(progress_callback, progress_pct)
	            return page_texts
	        finally:
	            # Close the document even when extraction raises.
	            doc.close()

	    def convert_pdf(self, input_path: str, output_path: str,
	                    options: Dict[str, Any], progress_callback: Callable = None) -> bool:
	        """Convert a PDF to ``.txt``, ``.docx`` or ``.html`` using extracted page text.

	        Returns ``False`` for any other output extension, when python-docx is
	        needed but missing, or when an exception occurs.
	        """
	        try:
	            target = os.fspath(output_path).lower()

	            if not target.endswith(('.txt', '.docx', '.html')):
	                print(f"Unsupported output format for PDF input: {output_path}")
	                return False
	            if target.endswith('.docx') and not DOCX_AVAILABLE:
	                print("python-docx not available for DOCX conversion")
	                return False

	            page_texts = self._read_pdf_pages(input_path, progress_callback)

	            if target.endswith('.txt'):
	                self._write_text(output_path, "".join(page_texts))

	            elif target.endswith('.docx'):
	                docx_doc = Document()
	                for text in page_texts:
	                    docx_doc.add_paragraph(text)
	                docx_doc.save(output_path)

	            else:  # '.html'
	                # Escape the extracted text so '<' and '&' cannot break the markup.
	                sections = [
	                    "<div class=\"page\">\n"
	                    f"<div class=\"page-number\">Page {number}</div>\n"
	                    f"<pre>{html.escape(text, quote=False)}</pre>\n"
	                    "</div>"
	                    for number, text in enumerate(page_texts, start=1)
	                ]
	                page = self._html_page("PDF Content", "\n".join(sections),
	                                       head_extra=self._PDF_PAGE_STYLE)
	                self._write_text(output_path, page)

	            self._update_progress(progress_callback, 100)
	            return True

	        except Exception as e:
	            print(f"PDF conversion error: {e}")
	            return False

	    def convert_docx(self, input_path: str, output_path: str,
	                     options: Dict[str, Any], progress_callback: Callable = None) -> bool:
	        """Convert a DOCX file to ``.txt``, ``.html`` or ``.md`` from its paragraph text.

	        A ``.pdf`` target is not produced: the text is saved next to it with a
	        ``.txt`` extension and ``True`` is still returned.  Other output
	        extensions return ``False``.
	        """
	        try:
	            target = os.fspath(output_path).lower()
	            doc = Document(input_path)
	            self._update_progress(progress_callback, 30)

	            paragraphs = [para.text for para in doc.paragraphs]

	            if target.endswith('.txt'):
	                self._write_text(output_path, "\n".join(paragraphs))

	            elif target.endswith('.html'):
	                body = "".join(
	                    f"<p>{html.escape(text, quote=False)}</p>"
	                    for text in paragraphs if text.strip()
	                )
	                self._write_text(output_path, self._html_page("Document Content", body))

	            elif target.endswith('.md'):
	                self._write_text(
	                    output_path,
	                    "\n\n".join(text for text in paragraphs if text.strip()),
	                )

	            elif target.endswith('.pdf'):
	                # Simple PDF conversion using text extraction
	                self._write_text(self._fallback_txt_path(output_path), "\n".join(paragraphs))
	                print("Note: DOCX to PDF requires additional libraries. Saved as TXT instead.")

	            else:
	                print(f"Unsupported output format for DOCX input: {output_path}")
	                return False

	            self._update_progress(progress_callback, 100)
	            return True

	        except Exception as e:
	            print(f"DOCX conversion error: {e}")
	            return False

	    def convert_txt(self, input_path: str, output_path: str,
	                    options: Dict[str, Any], progress_callback: Callable = None) -> bool:
	        """Convert a UTF-8 text file to ``.md``, ``.html`` or ``.docx``.

	        Without python-docx, a ``.docx`` target is saved as ``.txt`` instead
	        (still returning ``True``).  Other output extensions return ``False``.
	        """
	        try:
	            target = os.fspath(output_path).lower()
	            with open(input_path, 'r', encoding='utf-8') as f:
	                content = f.read()

	            self._update_progress(progress_callback, 40)

	            if target.endswith('.md'):
	                self._write_text(output_path, content)

	            elif target.endswith('.html'):
	                body = f"<pre>{html.escape(content, quote=False)}</pre>"
	                self._write_text(output_path, self._html_page("Text Document", body))

	            elif target.endswith('.docx'):
	                if DOCX_AVAILABLE:
	                    doc = Document()
	                    doc.add_paragraph(content)
	                    doc.save(output_path)
	                else:
	                    self._write_text(self._fallback_txt_path(output_path), content)
	                    print("Note: python-docx not installed. Saved as TXT instead.")

	            else:
	                print(f"Unsupported output format for TXT input: {output_path}")
	                return False

	            self._update_progress(progress_callback, 100)
	            return True

	        except Exception as e:
	            print(f"TXT conversion error: {e}")
	            return False

	    def convert_markdown(self, input_path: str, output_path: str,
	                         options: Dict[str, Any], progress_callback: Callable = None) -> bool:
	        """Convert a UTF-8 Markdown file to ``.html`` or ``.docx``.

	        The ``.docx`` path needs both python-docx and BeautifulSoup; otherwise
	        the raw Markdown is saved as ``.txt`` (still returning ``True``).
	        Other output extensions return ``False``.
	        """
	        try:
	            target = os.fspath(output_path).lower()
	            with open(input_path, 'r', encoding='utf-8') as f:
	                content = f.read()

	            self._update_progress(progress_callback, 40)

	            if target.endswith('.html'):
	                rendered = markdown.markdown(content)
	                self._write_text(output_path, self._html_page("Markdown Document", rendered))

	            elif target.endswith('.docx'):
	                if DOCX_AVAILABLE and BS4_AVAILABLE:
	                    soup = BeautifulSoup(markdown.markdown(content), 'html.parser')
	                    doc = Document()
	                    # Only <p> elements are carried over; headings, lists and
	                    # other rendered elements are not written to the DOCX.
	                    for para in soup.find_all('p'):
	                        text = para.get_text()
	                        if text.strip():
	                            doc.add_paragraph(text)
	                    doc.save(output_path)
	                else:
	                    self._write_text(self._fallback_txt_path(output_path), content)
	                    print("Note: Required libraries not installed. Saved as TXT instead.")

	            else:
	                print(f"Unsupported output format for Markdown input: {output_path}")
	                return False

	            self._update_progress(progress_callback, 100)
	            return True

	        except Exception as e:
	            print(f"Markdown conversion error: {e}")
	            return False

	    def convert_html(self, input_path: str, output_path: str,
	                     options: Dict[str, Any], progress_callback: Callable = None) -> bool:
	        """Convert a UTF-8 HTML file to ``.txt``, ``.md`` or ``.docx`` from its text.

	        Text is extracted with BeautifulSoup when available, otherwise by
	        stripping tags with a regex and collapsing whitespace.  Without
	        python-docx, a ``.docx`` target is saved as ``.txt`` instead (still
	        returning ``True``).  Other output extensions return ``False``.
	        """
	        try:
	            target = os.fspath(output_path).lower()
	            with open(input_path, 'r', encoding='utf-8') as f:
	                content = f.read()

	            self._update_progress(progress_callback, 40)

	            if BS4_AVAILABLE:
	                text = BeautifulSoup(content, 'html.parser').get_text()
	            else:
	                # Simple text extraction
	                import re
	                text = re.sub(r'<[^>]+>', ' ', content)
	                text = re.sub(r'\s+', ' ', text).strip()

	            if target.endswith('.txt'):
	                self._write_text(output_path, text)

	            elif target.endswith('.md'):
	                self._write_text(output_path, f"# Converted from HTML\n\n{text}")

	            elif target.endswith('.docx'):
	                if DOCX_AVAILABLE:
	                    doc = Document()
	                    doc.add_paragraph(text)
	                    doc.save(output_path)
	                else:
	                    self._write_text(self._fallback_txt_path(output_path), text)
	                    print("Note: python-docx not installed. Saved as TXT instead.")

	            else:
	                print(f"Unsupported output format for HTML input: {output_path}")
	                return False

	            self._update_progress(progress_callback, 100)
	            return True

	        except Exception as e:
	            print(f"HTML conversion error: {e}")
	            return False

	    def convert_generic(self, input_path: str, output_path: str,
	                        options: Dict[str, Any], progress_callback: Callable = None) -> bool:
	        """Copy any other file, re-encoding it to UTF-8 when it looks like text.

	        The file is read once as bytes.  Data containing a NUL byte is treated
	        as binary and copied unchanged.  Otherwise it is decoded as UTF-8,
	        then cp1252, then latin-1, line endings are normalised to ``\\n``, and
	        the text is written as UTF-8.
	        """
	        try:
	            with open(input_path, 'rb') as src:
	                raw = src.read()

	            content = None
	            # latin-1 can decode any byte string, so without this check the
	            # binary-copy branch below could never run.
	            if b'\x00' not in raw:
	                # cp1252 comes before latin-1: latin-1 never fails, so
	                # anything listed after it would be unreachable.
	                for encoding in ('utf-8', 'cp1252', 'latin-1'):
	                    try:
	                        content = raw.decode(encoding)
	                        break
	                    except UnicodeDecodeError:
	                        continue

	            if content is None:
	                # If can't read as text, just copy binary
	                with open(output_path, 'wb') as dst:
	                    dst.write(raw)
	            else:
	                # Same newline translation a text-mode read would perform.
	                content = content.replace('\r\n', '\n').replace('\r', '\n')
	                self._write_text(output_path, content)

	            self._update_progress(progress_callback, 100)
	            return True

	        except Exception as e:
	            print(f"Generic conversion error: {e}")
	            return False