File size: 2,958 Bytes
f28740b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
Hugging Face's logo
Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing



Hugging Face is way more fun with friends and colleagues! 🤗 Join an organization
Spaces:

aneesarom
/
PDF-Text-Extractor


like
0

Logs
App
Files
Community
Settings
PDF-Text-Extractor
/
app.py

aneesarom's picture
aneesarom
Update app.py
4d17112
verified
5 days ago
raw

Copy download link
history
blame
edit
delete

2.54 kB
import json
import gradio as gr
import pdfplumber
import requests
from io import BytesIO

def read_pdf_from_url(url: str) -> dict:
    """
    Extracts text from a PDF file given a direct PDF download URL.

    The URL is fetched with a 10 second timeout, the downloaded bytes are
    checked for the ``%PDF-`` header, and the text of every page that yields
    any is collected, each prefixed with a ``[Page N]`` marker. Failures are
    never raised; they are reported through the ``error`` key instead.

    Args:
        url (str): A URL that points directly to a PDF file. Must start with http:// or https://.
    Returns:
        dict: JSON-formatted dictionary containing:
            - url (str): The PDF URL
            - page_count (int): Number of pages in the PDF
            - content (str): Extracted text from the PDF, with page numbers
            - error (str, optional): Error message if extraction fails
              (when present it is the only key in the dictionary)
    """
    try:
        # Require a full scheme prefix: a bare "http" test would also accept
        # strings such as "httpfoo://...", and non-string input (e.g. None)
        # has no startswith() at all.
        if not isinstance(url, str) or not url.startswith(("http://", "https://")):
            return {"error": "Invalid URL. Must start with http:// or https://"}

        # NOTE(review): this fetches an arbitrary caller-supplied URL from the
        # server and buffers the whole body in memory -- consider restricting
        # target hosts and capping the download size.
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        payload = response.content
        if not payload.startswith(b"%PDF-"):
            return {"error": "URL does not point to a valid PDF file"}

        page_chunks = []
        with pdfplumber.open(BytesIO(payload)) as pdf:
            # Read the page count while the document is still open instead of
            # touching the pdf object after the context manager has closed it.
            page_count = len(pdf.pages)
            for page_num, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text()
                # Pages without extractable text are skipped entirely.
                if page_text:
                    page_chunks.append(f"[Page {page_num}]\n{page_text}\n\n")

        text = "".join(page_chunks)
        return {
            "url": url,
            "page_count": page_count,
            "content": text.strip() if text else "No text found in PDF.",
        }

    except Exception as e:
        # Deliberate catch-all at the UI/tool boundary: network, HTTP and
        # parsing failures are all surfaced to the caller as a JSON error.
        return {"error": str(e)}

# Sample PDF links offered as one-click examples in the UI.
# Each inner list holds the value for the single "PDF URL" input.
example_urls = [
    ["https://education.github.com/git-cheat-sheet-education.pdf"],
    [
        "https://github.com/tpn/pdfs/raw/master/"
        "A%20Journey%20in%20Creating%20an%20Operating%20System%20Kernel"
        "%20-%20The%20539Kernel%20Book%20(Nov%202022).pdf"
    ],
]

# Gradio interface wiring read_pdf_from_url to one text input and a JSON
# output, with the sample URLs attached as examples.
demo = gr.Interface(
    title="PDF Text Extractor From Url",
    description=(
        "Provide a URL that directly points to a PDF file "
        "(from any server). The server fetches the PDF and "
        "extracts the text content, returning it in JSON format."
    ),
    fn=read_pdf_from_url,
    inputs=gr.Textbox(
        placeholder="Enter a direct PDF URL (e.g., GitHub raw link)",
        label="PDF URL",
    ),
    outputs=gr.JSON(label="Extracted Text"),
    # Shown as clickable examples below the input box.
    examples=example_urls,
    # Example results are not cached (per the original note, this avoids a CSV write).
    cache_examples=False,
    # flagging_mode is used here in place of the older allow_flagging option.
    flagging_mode="never",
)

# Start the Gradio app only when this file is executed directly, not on import.
# mcp_server=True is forwarded to launch(); per the Gradio documentation this
# additionally exposes the interface function as an MCP tool.
if __name__ == "__main__":
    demo.launch(mcp_server=True)