# AutoPR / pragent / backend / pdf2html.py
# yzweak - Initial commit - ec3d86e
# pdf2html.py
import fitz
from pathlib import Path
import sys
from bs4 import BeautifulSoup
import asyncio
import aiofiles
from tqdm.asyncio import tqdm
def convert_pdf_sync(pdf_path: str) -> str:
    """Convert a PDF into a single prettified HTML string with images removed.

    Every page is rendered with ``page.get_text("html")``, the per-page
    fragments are concatenated, and all ``<img>`` tags are stripped before
    the markup is pretty-printed.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The prettified HTML, or an empty string when the PDF could not be
        opened (the failure is reported on stderr via ``tqdm.write``).

    Raises:
        Exception: Anything raised while extracting page HTML or parsing it
            propagates to the caller; the document is closed first.
    """
    # Only the open call is guarded: a failure here is the "could not open"
    # case that callers detect through the empty-string return value.
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        tqdm.write(f"[!] Error: Could not open PDF file. {e}", file=sys.stderr)
        return ""

    try:
        tqdm.write(f"[*] Successfully opened PDF file: {pdf_path}")
        # join() avoids repeatedly re-allocating a growing string per page.
        full_html_content = "".join(page.get_text("html") for page in doc)
    finally:
        # Release the document even if a page fails to render.
        doc.close()

    soup = BeautifulSoup(full_html_content, "lxml")
    for img_tag in soup.find_all("img"):
        img_tag.decompose()
    return soup.prettify()
async def convert_pdf_to_text_only_html(pdf_path: str, output_path: str) -> bool:
    """Convert a PDF to image-free HTML and save it to ``output_path``.

    The conversion itself is delegated to ``convert_pdf_sync`` and run via
    ``asyncio.to_thread``; the resulting markup is then written as UTF-8
    text with ``aiofiles``. Missing parent directories of the output file
    are created.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_path: Destination path for the generated HTML file.

    Returns:
        ``True`` when the HTML was written, ``False`` when the conversion
        produced no content or writing the file failed (write errors are
        reported on stderr via ``tqdm.write``).
    """
    html_text = await asyncio.to_thread(convert_pdf_sync, pdf_path)

    if html_text:
        try:
            destination = Path(output_path)
            destination.parent.mkdir(parents=True, exist_ok=True)
            async with aiofiles.open(destination, "w", encoding="utf-8") as handle:
                await handle.write(html_text)
        except Exception as e:
            tqdm.write(f"[!] Error: Could not write HTML file. {e}", file=sys.stderr)
            return False
        else:
            return True

    # An empty conversion result means there is nothing to write.
    return False