|
|
Hugging Face's logo |
|
|
Hugging Face |
|
|
Models |
|
|
Datasets |
|
|
Spaces |
|
|
Community |
|
|
Docs |
|
|
Enterprise |
|
|
Pricing |
|
|
|
|
|
|
|
|
|
|
|
Hugging Face is way more fun with friends and colleagues! 🤗 Join an organization |
|
|
Spaces: |
|
|
|
|
|
aneesarom |
|
|
/ |
|
|
PDF-Text-Extractor |
|
|
|
|
|
|
|
|
like |
|
|
0 |
|
|
|
|
|
Logs |
|
|
App |
|
|
Files |
|
|
Community |
|
|
Settings |
|
|
PDF-Text-Extractor |
|
|
/ |
|
|
app.py |
|
|
|
|
|
aneesarom's picture |
|
|
aneesarom |
|
|
Update app.py |
|
|
4d17112 |
|
|
verified |
|
|
5 days ago |
|
|
raw |
|
|
|
|
|
Copy download link |
|
|
history |
|
|
blame |
|
|
edit |
|
|
delete |
|
|
|
|
|
2.54 kB |
|
|
import json |
|
|
import gradio as gr |
|
|
import pdfplumber |
|
|
import requests |
|
|
from io import BytesIO |
|
|
|
|
|
def read_pdf_from_url(url: str) -> dict:
    """
    Extracts text from a PDF file given a direct PDF download URL.

    Args:
        url (str): A URL that points directly to a PDF file. Leading and
            trailing whitespace is ignored, and the scheme check is
            case-insensitive.

    Returns:
        dict: JSON-formatted dictionary. On success it contains:
            - url (str): The PDF URL (with surrounding whitespace removed)
            - page_count (int): Number of pages in the PDF
            - content (str): Extracted text from the PDF, with page numbers
        On failure it contains only:
            - error (str): Error message describing why extraction failed

    This function never raises: every failure is reported through the
    "error" key so the caller always receives a dictionary.
    """
    invalid_url = {"error": "Invalid URL. Must start with http:// or https://"}

    # Validate up front, before any network access. A bare "http" prefix
    # check would let through strings such as "httpfoo://..." that are not
    # HTTP(S) URLs at all, so the full scheme separators are required.
    if not isinstance(url, str):
        return invalid_url
    url = url.strip()
    if not url.lower().startswith(("http://", "https://")):
        return invalid_url

    # NOTE(security): the URL comes from the user and is fetched by this
    # process, so internal/private hosts are reachable through it (SSRF).
    # Restrict the allowed hosts here if that matters for the deployment.
    # NOTE(review): the whole response body is buffered in memory with no
    # size cap — confirm that is acceptable for the expected inputs.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Check the PDF magic bytes rather than trusting Content-Type.
        if not response.content.startswith(b"%PDF-"):
            return {"error": "URL does not point to a valid PDF file"}

        page_chunks = []
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            # Read the page count while the document is still open.
            page_count = len(pdf.pages)
            for page_num, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text()
                # Pages without extractable text are skipped entirely.
                if page_text:
                    page_chunks.append(f"[Page {page_num}]\n{page_text}")

        content = "\n\n".join(page_chunks).strip()
        return {
            "url": url,
            "page_count": page_count,
            "content": content or "No text found in PDF.",
        }

    except Exception as e:
        # Deliberate catch-all at the UI/tool boundary: network, HTTP and
        # PDF-parsing failures are all surfaced to the caller as a message.
        return {"error": str(e)}
|
|
|
|
|
|
|
|
# Sample inputs offered in the UI. Each example is a one-element list
# because the interface takes a single input (the PDF URL).
example_urls = [
    [pdf_link]
    for pdf_link in (
        "https://education.github.com/git-cheat-sheet-education.pdf",
        "https://github.com/tpn/pdfs/raw/master/A%20Journey%20in%20Creating%20an%20Operating%20System%20Kernel%20-%20The%20539Kernel%20Book%20(Nov%202022).pdf",
    )
]
|
|
|
|
|
|
|
|
# Input widget: a single text field that receives the PDF link.
_pdf_url_box = gr.Textbox(
    placeholder="Enter a direct PDF URL (e.g., GitHub raw link)",
    label="PDF URL",
)

# Output widget: renders the dictionary returned by the handler as JSON.
_result_view = gr.JSON(label="Extracted Text")

# The Gradio app object: wires the extractor function to the widgets above.
demo = gr.Interface(
    title="PDF Text Extractor From Url",
    description=(
        "Provide a URL that directly points to a PDF file (from any server). "
        "The server fetches the PDF and extracts the text content, returning it in JSON format."
    ),
    fn=read_pdf_from_url,
    inputs=_pdf_url_box,
    outputs=_result_view,
    examples=example_urls,
    cache_examples=False,
    flagging_mode="never",
)
|
|
|
|
|
if __name__ == "__main__":
    # Only start the server when run as a script, not when imported.
    # mcp_server=True asks Gradio to also serve the app over MCP.
    demo.launch(mcp_server=True)
|
|
|
|
|
|