Spaces:
Running
Running
| # pdf2html.py | |
| import fitz | |
| from pathlib import Path | |
| import sys | |
| from bs4 import BeautifulSoup | |
| import asyncio | |
| import aiofiles | |
| from tqdm.asyncio import tqdm | |
def convert_pdf_sync(pdf_path: str) -> str:
    """Convert a PDF into prettified HTML with every ``<img>`` tag removed.

    The PDF is opened with ``fitz.open``, each page is rendered through
    ``page.get_text("html")``, and the concatenated markup is parsed by
    BeautifulSoup (``lxml`` parser) so that image tags can be stripped.

    Args:
        pdf_path: Location of the PDF file to convert.

    Returns:
        The prettified HTML, or an empty string when the PDF could not be
        opened (the error is reported on stderr via ``tqdm.write``).

    Raises:
        Exception: Errors raised while extracting or parsing page content
            are not caught here and propagate to the caller.
    """
    # Guard only the open call so that unrelated failures are not reported
    # as "could not open".
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        tqdm.write(f"[!] Error: Could not open PDF file. {e}", file=sys.stderr)
        return ""

    try:
        tqdm.write(f"[*] Successfully opened PDF file: {pdf_path}")
        # Join once instead of growing a string page by page.
        full_html_content = "".join(page.get_text("html") for page in doc)
    finally:
        # Release the document even if a page fails to render.
        doc.close()

    soup = BeautifulSoup(full_html_content, "lxml")
    for img_tag in soup.find_all("img"):
        img_tag.decompose()
    return soup.prettify()
async def convert_pdf_to_text_only_html(pdf_path: str, output_path: str) -> bool:
    """Convert a PDF to image-free HTML and write it to ``output_path``.

    The conversion itself (``convert_pdf_sync``) runs in a worker thread via
    ``asyncio.to_thread``; the result is then written as UTF-8 text with
    ``aiofiles``. Missing parent directories of the output file are created.

    Args:
        pdf_path: Location of the PDF file to convert.
        output_path: Destination path for the generated HTML file.

    Returns:
        ``True`` when the HTML was written; ``False`` when the conversion
        produced no content or when writing the file failed (the write
        error is reported on stderr via ``tqdm.write``).
    """
    html_text = await asyncio.to_thread(convert_pdf_sync, pdf_path)

    if html_text:
        try:
            destination = Path(output_path)
            destination.parent.mkdir(parents=True, exist_ok=True)
            async with aiofiles.open(destination, "w", encoding="utf-8") as handle:
                await handle.write(html_text)
        except Exception as err:
            tqdm.write(f"[!] Error: Could not write HTML file. {err}", file=sys.stderr)
        else:
            return True

    # Either the conversion yielded nothing or the write failed.
    return False