Spaces:

aneesarom
/

PDF-Text-Extractor

Running

App Files Files Community

PDF-Text-Extractor / app.py

aneesarom

Update app.py

4d17112 verified 5 months ago

raw

history blame contribute delete

2.54 kB

	import json
	import gradio as gr
	import pdfplumber
	import requests
	from io import BytesIO

	def read_pdf_from_url(url: str) -> dict:
	    """
	    Extracts text from a PDF file given a direct PDF download URL.

	    The URL is fetched with a 10 second timeout, the payload is checked for
	    the ``%PDF-`` magic header, and the text of every page that yields any
	    text is collected under a ``[Page N]`` heading.

	    Args:
	        url (str): A URL that points directly to a PDF file. It must start
	            with ``http://`` or ``https://``.

	    Returns:
	        dict: JSON-formatted dictionary containing:
	            - url (str): The PDF URL
	            - page_count (int): Number of pages in the PDF
	            - content (str): Extracted text from the PDF, with page numbers
	            - error (str, optional): Error message if extraction fails. When
	              present it is the only key in the dictionary.
	    """
	    try:
	        # Require a real scheme prefix. A bare startswith("http") would also
	        # let through strings such as "httpfoo://..." or just "http", which
	        # contradicts the error message below. Non-string input is rejected
	        # here too instead of surfacing as an AttributeError message.
	        if not isinstance(url, str) or not url.startswith(("http://", "https://")):
	            return {"error": "Invalid URL. Must start with http:// or https://"}

	        # NOTE(review): the URL comes straight from the caller and the whole
	        # body is read into memory. If this runs on a shared host, consider
	        # restricting target hosts (SSRF) and capping the download size.
	        response = requests.get(url, timeout=10)
	        response.raise_for_status()

	        # Cheap sanity check on the magic bytes before handing the payload
	        # to the PDF parser.
	        if not response.content.startswith(b"%PDF-"):
	            return {"error": "URL does not point to a valid PDF file"}

	        page_sections = []
	        with pdfplumber.open(BytesIO(response.content)) as pdf:
	            # Read the page count while the document is still open.
	            page_count = len(pdf.pages)
	            for page_num, page in enumerate(pdf.pages, start=1):
	                page_text = page.extract_text()
	                # Pages without extractable text are skipped entirely.
	                if page_text:
	                    page_sections.append(f"[Page {page_num}]\n{page_text}")

	        if page_sections:
	            content = "\n\n".join(page_sections).strip()
	        else:
	            content = "No text found in PDF."

	        return {
	            "url": url,
	            "page_count": page_count,
	            "content": content,
	        }

	    except Exception as e:
	        # Top-level boundary for the UI/tool call: report any failure
	        # (network, HTTP status, PDF parsing) as data instead of raising.
	        return {"error": str(e)}

	# Sample PDF links passed to the interface as examples. Each entry is
	# wrapped in its own one-element list: one row per example, one value per row.
	example_urls = [
	    [pdf_link]
	    for pdf_link in (
	        "https://education.github.com/git-cheat-sheet-education.pdf",
	        "https://github.com/tpn/pdfs/raw/master/A%20Journey%20in%20Creating%20an%20Operating%20System%20Kernel%20-%20The%20539Kernel%20Book%20(Nov%202022).pdf",
	    )
	]

	# Gradio interface around read_pdf_from_url: a single text input for the
	# URL, with the returned dictionary rendered as JSON.
	demo = gr.Interface(
	    # Presentation
	    title="PDF Text Extractor From Url",
	    description=(
	        "Provide a URL that directly points to a PDF file (from any server). "
	        "The server fetches the PDF and extracts the text content, returning it in JSON format."
	    ),
	    # Wiring: handler, input widget, output widget
	    fn=read_pdf_from_url,
	    inputs=gr.Textbox(
	        label="PDF URL",
	        placeholder="Enter a direct PDF URL (e.g., GitHub raw link)",
	    ),
	    outputs=gr.JSON(label="Extracted Text"),
	    # Example rows shown below the input box
	    examples=example_urls,
	    # Example caching is turned off (the original author notes this
	    # prevents a CSV write).
	    cache_examples=False,
	    # Flagging is disabled; per the original author's note this keyword
	    # replaces the older allow_flagging option.
	    flagging_mode="never",
	)

	if __name__ == "__main__":
	    # Only start the app when this file is executed directly. mcp_server=True
	    # is forwarded to Gradio's launch() to enable its MCP server mode.
	    demo.launch(mcp_server=True)