File size: 2,958 Bytes
f28740b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
Hugging Face's logo
Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Hugging Face is way more fun with friends and colleagues! 🤗 Join an organization
Spaces:
aneesarom
/
PDF-Text-Extractor
like
0
Logs
App
Files
Community
Settings
PDF-Text-Extractor
/
app.py
aneesarom's picture
aneesarom
Update app.py
4d17112
verified
5 days ago
raw
Copy download link
history
blame
edit
delete
2.54 kB
import json
import gradio as gr
import pdfplumber
import requests
from io import BytesIO
def read_pdf_from_url(url: str) -> dict:
    """
    Extracts text from a PDF file given a direct PDF download URL.

    Args:
        url (str): A URL that points directly to a PDF file. It must start
            with http:// or https://; surrounding whitespace is ignored.

    Returns:
        dict: JSON-formatted dictionary. On success it contains:
            - url (str): The PDF URL (whitespace-trimmed)
            - page_count (int): Number of pages in the PDF
            - content (str): Extracted text from the PDF, with a
              "[Page N]" marker before each page that yielded text, or
              "No text found in PDF." when no page yielded any text
        On failure it contains only:
            - error (str): Error message describing what went wrong
    """
    # Pasted URLs often carry stray whitespace; anything that is not a
    # string is treated the same way as a malformed URL.
    cleaned_url = url.strip() if isinstance(url, str) else ""

    # Require the full scheme prefix. A bare "http" prefix test would also
    # accept strings such as "httpfoo://..." that the error message rules out.
    if not cleaned_url.startswith(("http://", "https://")):
        return {"error": "Invalid URL. Must start with http:// or https://"}

    try:
        # NOTE(review): the URL is caller-supplied and fetched from this
        # process, and the whole body is held in memory. Consider host
        # restrictions and a size cap if this is exposed to untrusted users.
        response = requests.get(cleaned_url, timeout=10)
        response.raise_for_status()

        # Check the leading magic bytes instead of trusting the URL suffix
        # or the Content-Type header.
        if not response.content.startswith(b"%PDF-"):
            return {"error": "URL does not point to a valid PDF file"}

        page_chunks = []
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            # Read the page count while the document is still open.
            page_count = len(pdf.pages)
            for page_num, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text()
                # Pages without extractable text are skipped entirely.
                if page_text:
                    page_chunks.append(f"[Page {page_num}]\n{page_text}\n\n")

        content = "".join(page_chunks).strip()
        return {
            "url": cleaned_url,
            "page_count": page_count,
            "content": content or "No text found in PDF.",
        }
    except Exception as e:
        # Boundary handler: every failure (network, HTTP status, PDF parsing)
        # is reported to the caller as an error payload rather than raised.
        return {"error": str(e)}
# Sample direct-download PDF links; each inner list is one example row
example_urls = [
    ["https://education.github.com/git-cheat-sheet-education.pdf"],
    ["https://github.com/tpn/pdfs/raw/master/A%20Journey%20in%20Creating%20an%20Operating%20System%20Kernel%20-%20The%20539Kernel%20Book%20(Nov%202022).pdf"],
]

# Gradio interface wrapping read_pdf_from_url: one URL textbox in, JSON out
demo = gr.Interface(
    title="PDF Text Extractor From Url",
    description=(
        "Provide a URL that directly points to a PDF file "
        "(from any server). The server fetches the PDF and extracts "
        "the text content, returning it in JSON format."
    ),
    fn=read_pdf_from_url,
    inputs=gr.Textbox(
        placeholder="Enter a direct PDF URL (e.g., GitHub raw link)",
        label="PDF URL",
    ),
    outputs=gr.JSON(label="Extracted Text"),
    # Shown as clickable examples below the input box
    examples=example_urls,
    # Example caching is turned off (original note: prevents a CSV write)
    cache_examples=False,
    # Flagging is turned off (original note: replaces allow_flagging)
    flagging_mode="never",
)

if __name__ == "__main__":
    # Launch with the MCP server option enabled
    demo.launch(mcp_server=True)
|