aneesarom committed on
Commit
f28740b
·
verified ·
1 Parent(s): 06f75b6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Hugging Face's logo
2
+ Hugging Face
3
+ Models
4
+ Datasets
5
+ Spaces
6
+ Community
7
+ Docs
8
+ Enterprise
9
+ Pricing
10
+
11
+
12
+
13
+ Hugging Face is way more fun with friends and colleagues! 🤗 Join an organization
14
+ Spaces:
15
+
16
+ aneesarom
17
+ /
18
+ PDF-Text-Extractor
19
+
20
+
21
+ like
22
+ 0
23
+
24
+ Logs
25
+ App
26
+ Files
27
+ Community
28
+ Settings
29
+ PDF-Text-Extractor
30
+ /
31
+ app.py
32
+
33
+ aneesarom's picture
34
+ aneesarom
35
+ Update app.py
36
+ 4d17112
37
+ verified
38
+ 5 days ago
39
+ raw
40
+
41
+ Copy download link
42
+ history
43
+ blame
44
+ edit
45
+ delete
46
+
47
+ 2.54 kB
48
+ import json
49
+ import gradio as gr
50
+ import pdfplumber
51
+ import requests
52
+ from io import BytesIO
53
+
54
def read_pdf_from_url(url: str) -> dict:
    """
    Extracts text from a PDF file given a direct PDF download URL.

    Args:
        url (str): A URL that points directly to a PDF file (http:// or https://).

    Returns:
        dict: JSON-formatted dictionary containing:
            - url (str): The PDF URL
            - page_count (int): Number of pages in the PDF
            - content (str): Extracted text from the PDF, with page numbers
            - error (str, optional): Error message if extraction fails
              (when present, it is the only key in the dictionary)
    """
    invalid_url = {"error": "Invalid URL. Must start with http:// or https://"}

    # Reject non-string input up front instead of letting an AttributeError
    # surface as a confusing error message.
    if not isinstance(url, str):
        return invalid_url

    # Pasted URLs often carry stray whitespace or a trailing newline.
    url = url.strip()

    # Check the full scheme: a bare "http" prefix test would also accept
    # values such as "httpx://..." that the error message says are invalid.
    if not url.lower().startswith(("http://", "https://")):
        return invalid_url

    # NOTE(review): this fetches an arbitrary user-supplied URL from the server
    # and buffers the entire response in memory. Consider restricting target
    # hosts and capping the download size before exposing this publicly.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        payload = response.content
        # Cheap sanity check on the PDF magic bytes before parsing.
        if not payload.startswith(b"%PDF-"):
            return {"error": "URL does not point to a valid PDF file"}

        page_chunks = []
        with pdfplumber.open(BytesIO(payload)) as pdf:
            # Read the page count while the document is still open rather
            # than touching the closed PDF object afterwards.
            page_count = len(pdf.pages)
            for page_num, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text()
                # Pages with no extractable text are skipped entirely.
                if page_text:
                    page_chunks.append(f"[Page {page_num}]\n{page_text}")

        if page_chunks:
            content = "\n\n".join(page_chunks).strip()
        else:
            content = "No text found in PDF."

        return {
            "url": url,
            "page_count": page_count,
            "content": content,
        }

    except Exception as e:
        # This function is the UI/tool boundary: any network or parsing
        # failure is reported to the caller as data rather than raised.
        return {"error": str(e)}
92
+
93
# Sample PDF links shown as clickable examples beneath the input box.
example_urls = [
    ["https://education.github.com/git-cheat-sheet-education.pdf"],
    ["https://github.com/tpn/pdfs/raw/master/A%20Journey%20in%20Creating%20an%20Operating%20System%20Kernel%20-%20The%20539Kernel%20Book%20(Nov%202022).pdf"],
]

# Gradio interface wrapping read_pdf_from_url: one URL textbox in, JSON out.
demo = gr.Interface(
    title="PDF Text Extractor From Url",
    description=(
        "Provide a URL that directly points to a PDF file (from any server). "
        "The server fetches the PDF and extracts the text content, returning it in JSON format."
    ),
    fn=read_pdf_from_url,
    inputs=gr.Textbox(
        placeholder="Enter a direct PDF URL (e.g., GitHub raw link)",
        label="PDF URL",
    ),
    outputs=gr.JSON(label="Extracted Text"),
    examples=example_urls,
    cache_examples=False,  # example outputs are not cached
    flagging_mode="never",  # flagging is turned off
)

# Start the app (with the MCP server option enabled) only when run as a script.
if __name__ == "__main__":
    demo.launch(mcp_server=True)
119
+