reachify-ai-service / core /document_parser.py
amitbhatt6075's picture
Complete fresh start - FINAL UPLOAD
0914e96
# FILE: ai-service/core/document_parser.py
import fitz # PyMuPDF library
import requests
import io
def parse_pdf_from_url(pdf_url: str) -> str:
    """Download a PDF from a URL and return all of its text as one string.

    The file is fetched over HTTP with a 30-second timeout, opened from
    memory with PyMuPDF, and the plain text of every page is concatenated in
    page order. Each page's text is followed by two newline characters.

    Args:
        pdf_url: Address of the PDF document to download.

    Returns:
        The extracted text of all pages joined into a single string. The
        string is empty when the document has no pages.

    Raises:
        ConnectionError: The download failed (network error, timeout, or an
            HTTP error status).
        ValueError: Any other failure, typically the downloaded bytes not
            being parseable as a PDF.
    """
    print(" - 📑 Downloading and parsing PDF from URL...")
    try:
        # Step 1: Download the PDF content from the URL.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions.

        # Step 2: Open the PDF from memory. The context manager guarantees the
        # document is closed even if text extraction fails on some page.
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            # Step 3: Extract text page by page; join once instead of
            # repeatedly growing a string.
            full_text = "".join(page.get_text("text") + "\n\n" for page in doc)

        print(f" - ✅ PDF parsed successfully. Total characters: {len(full_text)}")
        return full_text
    except requests.exceptions.RequestException as e:
        print(f" - ❌ FAILED to download PDF: {e}")
        raise ConnectionError(
            f"Could not download the file from the provided URL: {pdf_url}"
        ) from e
    except Exception as e:
        # Boundary handler: anything that is not a download error is reported
        # to callers as an unparseable file, with the cause chained.
        print(f" - ❌ FAILED to parse PDF: {e}")
        raise ValueError("The provided file could not be parsed as a valid PDF.") from e