Spaces:

yzweak
/

AutoPR

Running

AutoPR/pragent/backend/pdf2html.py

Initial commit

ec3d86e 3 months ago

1.28 kB

	# pdf2html.py
	import fitz
	from pathlib import Path
	import sys
	from bs4 import BeautifulSoup
	import asyncio
	import aiofiles
	from tqdm.asyncio import tqdm
	def convert_pdf_sync(pdf_path: str) -> str:
	    """Convert a PDF into one HTML string with every ``<img>`` tag removed.

	    This function is synchronous: it opens the document with ``fitz``,
	    collects the ``"html"`` text of each page in iteration order, parses the
	    combined markup with BeautifulSoup (``lxml`` parser), drops all ``<img>``
	    elements and returns the prettified result.

	    Args:
	        pdf_path: Path of the PDF file to open.

	    Returns:
	        The prettified HTML, or an empty string if the file could not be
	        opened (the error is written to stderr through ``tqdm.write``).

	    Raises:
	        Exception: Errors raised while extracting page HTML or while parsing
	            it are not handled here and propagate to the caller. The
	            document is closed before they propagate.
	    """
	    try:
	        doc = fitz.open(pdf_path)
	    except Exception as e:
	        tqdm.write(f"[!] Error: Could not open PDF file. {e}", file=sys.stderr)
	        return ""

	    # The document is closed in ``finally`` so a failure on any page does not
	    # leak the open file handle.
	    try:
	        tqdm.write(f"[*] Successfully opened PDF file: {pdf_path}")
	        # join() avoids rebuilding an ever-growing string once per page.
	        full_html_content = "".join(page.get_text("html") for page in doc)
	    finally:
	        doc.close()

	    soup = BeautifulSoup(full_html_content, "lxml")
	    for img_tag in soup.find_all("img"):
	        img_tag.decompose()

	    return soup.prettify()

	async def convert_pdf_to_text_only_html(pdf_path: str, output_path: str) -> bool:
	    """Convert a PDF to image-free HTML and write it to ``output_path``.

	    The blocking conversion (``convert_pdf_sync``) runs in a worker thread via
	    ``asyncio.to_thread``; the result is then written as UTF-8 text with
	    ``aiofiles``. Missing parent directories of ``output_path`` are created.

	    Args:
	        pdf_path: Path of the PDF file to convert.
	        output_path: Path of the HTML file to write.

	    Returns:
	        ``True`` if the HTML file was written. ``False`` if the conversion
	        raised, produced an empty result, or the output could not be written;
	        raised errors are reported on stderr through ``tqdm.write``.
	    """
	    # Guard the conversion as well as the write: an exception raised in the
	    # worker thread is re-raised by the await, and this function signals
	    # failure through its boolean result.
	    try:
	        cleaned_html = await asyncio.to_thread(convert_pdf_sync, pdf_path)
	    except Exception as e:
	        tqdm.write(f"[!] Error: Could not convert PDF file. {e}", file=sys.stderr)
	        return False

	    if not cleaned_html:
	        return False

	    try:
	        output_file = Path(output_path)
	        output_file.parent.mkdir(parents=True, exist_ok=True)
	        async with aiofiles.open(output_file, "w", encoding="utf-8") as f:
	            await f.write(cleaned_html)
	    except Exception as e:
	        tqdm.write(f"[!] Error: Could not write HTML file. {e}", file=sys.stderr)
	        return False

	    return True