Spaces:

amitbhatt6075
/

reachify-ai-service

Running

File size: 1,434 Bytes

0914e96

# FILE: ai-service/core/document_parser.py

import fitz  # PyMuPDF library
import requests
import io

def parse_pdf_from_url(pdf_url: str) -> str:
    """
    Download a PDF from a URL and return the text of all its pages.

    The file is fetched into memory (30-second request timeout) and opened
    with PyMuPDF directly from the downloaded bytes. The text of every page
    is extracted in page order, and each page's text is followed by a blank
    line ("\\n\\n").

    Args:
        pdf_url: URL the PDF is downloaded from.

    Returns:
        The concatenated text of all pages as a single string.

    Raises:
        ConnectionError: If the download fails (network error, timeout, or
            a non-success HTTP status).
        ValueError: If any other error occurs, typically because the
            downloaded content cannot be opened or read as a PDF.
    """
    print("   - 📑 Downloading and parsing PDF from URL...")
    try:
        # Step 1: Download the PDF content from the URL
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Step 2: Open the PDF from memory using PyMuPDF. The context manager
        # guarantees the document is closed even if text extraction fails
        # part-way through.
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            # Step 3: Extract the text of each page; join once instead of
            # growing a string with += inside the loop.
            full_text = "".join(
                page.get_text("text") + "\n\n" for page in doc
            )

        print(f"   - ✅ PDF parsed successfully. Total characters: {len(full_text)}")
        return full_text

    except requests.exceptions.RequestException as e:
        print(f"   - ❌ FAILED to download PDF: {e}")
        raise ConnectionError(f"Could not download the file from the provided URL: {pdf_url}") from e
    except Exception as e:
        # Any non-download failure is reported to callers as an invalid PDF.
        print(f"   - ❌ FAILED to parse PDF: {e}")
        raise ValueError("The provided file could not be parsed as a valid PDF.") from e