| # FILE: ai-service/core/document_parser.py | |
| import fitz # PyMuPDF library | |
| import requests | |
| import io | |
def parse_pdf_from_url(pdf_url: str) -> str:
    """Download a PDF from a URL and return all of its text as one string.

    The file is fetched over HTTP with a 30-second timeout and opened
    directly from the downloaded bytes. The text of every page is
    extracted in page order, and each page's text is followed by two
    newline characters in the result.

    Args:
        pdf_url: Address of the PDF file to download.

    Returns:
        The concatenated text of all pages.

    Raises:
        ConnectionError: If the download fails or the server answers with
            an error status (any ``requests`` exception).
        ValueError: If any other error occurs, e.g. the downloaded bytes
            cannot be opened or read as a PDF.
    """
    print(" - π Downloading and parsing PDF from URL...")
    try:
        # Step 1: Download the PDF content from the URL.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions.

        # Step 2: Open the PDF from memory. The context manager closes the
        # document even when text extraction fails part-way through.
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            # Step 3: Extract the text of each page; join once instead of
            # growing a string with repeated concatenation.
            full_text = "".join(
                page.get_text("text") + "\n\n" for page in doc
            )

        print(f" - β PDF parsed successfully. Total characters: {len(full_text)}")
        return full_text
    except requests.exceptions.RequestException as e:
        print(f" - β FAILED to download PDF: {e}")
        raise ConnectionError(
            f"Could not download the file from the provided URL: {pdf_url}"
        ) from e
    except Exception as e:
        # Boundary handler: anything that is not a download error is reported
        # to the caller as an unparseable file, with the cause chained.
        print(f" - β FAILED to parse PDF: {e}")
        raise ValueError("The provided file could not be parsed as a valid PDF.") from e