File size: 1,279 Bytes
ec3d86e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# pdf2html.py
import fitz  
from pathlib import Path
import sys
from bs4 import BeautifulSoup
import asyncio
import aiofiles
from tqdm.asyncio import tqdm
def convert_pdf_sync(pdf_path: str) -> str:
    """Convert a PDF into prettified HTML with all images removed (blocking).

    Each page is rendered with ``page.get_text("html")``, the per-page
    fragments are concatenated, every ``<img>`` tag is stripped with
    BeautifulSoup (``lxml`` parser), and the result is prettified.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The cleaned HTML as a string, or ``""`` when the PDF could not be
        opened (the error is reported on stderr via ``tqdm.write``).
    """
    # Keep the try body down to the open call so that only failures to open
    # the file are reported as such. The catch stays broad on purpose: this
    # is a best-effort boundary that signals failure by returning "".
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        tqdm.write(f"[!] Error: Could not open PDF file. {e}", file=sys.stderr)
        return ""
    tqdm.write(f"[*] Successfully opened PDF file: {pdf_path}")

    # Always release the document handle, even if rendering a page fails.
    try:
        # join() avoids repeatedly re-copying the growing string per page.
        full_html_content = "".join(page.get_text("html") for page in doc)
    finally:
        doc.close()

    soup = BeautifulSoup(full_html_content, "lxml")
    # Drop every image so only the textual markup remains.
    for img_tag in soup.find_all("img"):
        img_tag.decompose()

    return soup.prettify()

async def convert_pdf_to_text_only_html(pdf_path: str, output_path: str) -> bool:
    """Convert a PDF to image-free HTML and save it to ``output_path``.

    The conversion itself (``convert_pdf_sync``) is run through
    ``asyncio.to_thread``; the resulting HTML is then written as UTF-8 with
    ``aiofiles``. Missing parent directories of ``output_path`` are created.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_path: Path of the HTML file to write.

    Returns:
        ``True`` when the HTML file was written; ``False`` when the
        conversion produced no content or writing the file failed (the
        write error is reported on stderr via ``tqdm.write``).
    """
    html_text = await asyncio.to_thread(convert_pdf_sync, pdf_path)

    if html_text:
        try:
            destination = Path(output_path)
            destination.parent.mkdir(parents=True, exist_ok=True)
            async with aiofiles.open(destination, "w", encoding="utf-8") as handle:
                await handle.write(html_text)
        except Exception as exc:
            tqdm.write(f"[!] Error: Could not write HTML file. {exc}", file=sys.stderr)
        else:
            return True

    # Either nothing was converted or the write above failed.
    return False