Spaces:

amitbhatt6075
/

reachify-ai-service

Running

reachify-ai-service / core /document_parser.py

Complete fresh start - FINAL UPLOAD

0914e96 20 days ago

1.43 kB

	# FILE: ai-service/core/document_parser.py

	import fitz # PyMuPDF library
	import requests
	import io

	def parse_pdf_from_url(pdf_url: str) -> str:
	    """Download a PDF from a URL and return the text of all its pages.

	    The file is fetched with ``requests`` (30 second timeout), opened from
	    memory with PyMuPDF, and the plain text of every page is concatenated
	    in page order. Each page's text is followed by a blank-line separator
	    (``"\\n\\n"``), including the last page.

	    Args:
	        pdf_url: Location of the PDF to download.

	    Returns:
	        The extracted text of the whole document as a single string.

	    Raises:
	        ConnectionError: If the download fails (network error, timeout,
	            invalid URL, or a non-success HTTP status).
	        ValueError: If any other error occurs, typically because the
	            downloaded bytes could not be opened or read as a PDF.
	    """
	    print(" - 📑 Downloading and parsing PDF from URL...")
	    try:
	        # Step 1: Download the PDF content from the URL.
	        response = requests.get(pdf_url, timeout=30)
	        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions.

	        # Step 2: Open the PDF from memory. The context manager guarantees
	        # the document is closed even if text extraction fails part-way.
	        with fitz.open(stream=response.content, filetype="pdf") as doc:
	            # Step 3: Extract text page by page; join once instead of
	            # growing a string with repeated concatenation.
	            full_text = "".join(
	                page.get_text("text") + "\n\n" for page in doc
	            )

	        print(f" - ✅ PDF parsed successfully. Total characters: {len(full_text)}")
	        return full_text

	    except requests.exceptions.RequestException as e:
	        print(f" - ❌ FAILED to download PDF: {e}")
	        raise ConnectionError(
	            f"Could not download the file from the provided URL: {pdf_url}"
	        ) from e
	    except Exception as e:
	        # Anything that is not a download failure is reported to the caller
	        # as an unparseable file; the original error stays chained as the cause.
	        print(f" - ❌ FAILED to parse PDF: {e}")
	        raise ValueError(
	            "The provided file could not be parsed as a valid PDF."
	        ) from e