File size: 1,434 Bytes
0914e96 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 |
# FILE: ai-service/core/document_parser.py
import fitz # PyMuPDF library
import requests
import io
def parse_pdf_from_url(pdf_url: str) -> str:
    """Download a PDF from a URL and return all of its text as one string.

    The file is fetched with a 30-second timeout and opened from memory with
    PyMuPDF (nothing is written to disk). The text of every page is extracted
    in page order, and each page's text is followed by a blank line
    (``"\\n\\n"``).

    Args:
        pdf_url: Address of the PDF file to download.

    Returns:
        The concatenated plain text of all pages.

    Raises:
        ConnectionError: If the HTTP request fails or returns an error status.
        ValueError: If any other error occurs, e.g. the downloaded content
            cannot be opened or read as a PDF.
    """
    print(" - 📄 Downloading and parsing PDF from URL...")
    try:
        # Step 1: Download the PDF content from the URL.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions.

        # Step 2: Open the PDF from memory using PyMuPDF.
        doc = fitz.open(stream=response.content, filetype="pdf")
        try:
            # Step 3: Extract the text of each page. Joining once avoids
            # repeatedly re-copying a growing string.
            full_text = "".join(
                page.get_text("text") + "\n\n" for page in doc
            )
        finally:
            # Release the document even when text extraction fails part-way.
            doc.close()
    except requests.exceptions.RequestException as e:
        print(f" - ❌ FAILED to download PDF: {e}")
        raise ConnectionError(
            f"Could not download the file from the provided URL: {pdf_url}"
        ) from e
    except Exception as e:
        # Anything that is not a download error is reported as a parse error.
        print(f" - ❌ FAILED to parse PDF: {e}")
        raise ValueError(
            "The provided file could not be parsed as a valid PDF."
        ) from e
    else:
        print(
            f" - ✅ PDF parsed successfully. Total characters: {len(full_text)}"
        )
        return full_text