aneesarom committed on
Commit
fa078e7
·
verified ·
1 Parent(s): d4340f4

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -0
app.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import gradio as gr
3
+ import pdfplumber
4
+ import requests
5
+ from io import BytesIO
6
+
7
def read_pdf_from_url(url: str) -> dict:
    """
    Extracts text from a PDF file given a direct PDF download URL.

    Args:
        url (str): A URL that points directly to a PDF file. Surrounding
            whitespace is ignored and the scheme is matched
            case-insensitively.

    Returns:
        dict: JSON-formatted dictionary containing:
            - url (str): The PDF URL (with surrounding whitespace removed)
            - page_count (int): Number of pages in the PDF
            - content (str): Extracted text from the PDF, with page numbers
            - error (str, optional): Error message if extraction fails

        On any failure the dictionary contains only the ``error`` key; the
        function reports problems through that key instead of raising.
    """
    invalid_url = {"error": "Invalid URL. Must start with http:// or https://"}

    # Validate up front so malformed input yields the intended message
    # instead of whatever exception text happens to fall out later.
    if not isinstance(url, str):
        return invalid_url
    url = url.strip()
    # A bare "http" prefix check would also accept e.g. "httpx://..." or
    # "http" alone; require a real scheme separator.
    if not url.lower().startswith(("http://", "https://")):
        return invalid_url

    try:
        # NOTE(review): this fetches a caller-supplied URL from the server
        # side; if the app is exposed publicly, consider restricting
        # internal/private addresses and capping the download size.
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Check the PDF magic bytes rather than trusting the URL suffix or
        # the Content-Type header.
        if not response.content.startswith(b"%PDF-"):
            return {"error": "URL does not point to a valid PDF file"}

        page_chunks = []
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            # Read the page count while the document is still open.
            page_count = len(pdf.pages)
            for page_num, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text()
                # Pages with no extractable text are skipped entirely.
                if page_text:
                    page_chunks.append(f"[Page {page_num}]\n{page_text}\n\n")

        text = "".join(page_chunks).strip()
        return {
            "url": url,
            "page_count": page_count,
            "content": text or "No text found in PDF.",
        }

    except Exception as e:
        # Boundary handler: the UI/tool caller expects an error payload,
        # not a traceback, for network and parsing failures alike.
        return {"error": str(e)}
47
+
48
# Sample PDF links offered as one-click examples in the UI.
_EXAMPLE_PDF_LINKS = (
    "https://education.github.com/git-cheat-sheet-education.pdf",
    "https://github.com/tpn/pdfs/raw/master/"
    "A%20Journey%20in%20Creating%20an%20Operating%20System%20Kernel"
    "%20-%20The%20539Kernel%20Book%20(Nov%202022).pdf",
)

# Each example is a one-element row, since the interface has one input.
example_urls = [[link] for link in _EXAMPLE_PDF_LINKS]
53
+
54
# Gradio interface around read_pdf_from_url, with example inputs attached.
_url_box = gr.Textbox(
    label="PDF URL",
    placeholder="Enter a direct PDF URL (e.g., GitHub raw link)",
)
_result_view = gr.JSON(label="Extracted Text")
_blurb = (
    "Provide a URL that directly points to a PDF file (from any server). "
    "The server fetches the PDF and extracts the text content, returning it in JSON format."
)

demo = gr.Interface(
    fn=read_pdf_from_url,
    inputs=_url_box,
    outputs=_result_view,
    title="PDF URL Text Extractor",
    description=_blurb,
    # Rendered as clickable examples below the input box.
    examples=example_urls,
)

if __name__ == "__main__":
    # Launch the app with Gradio's MCP server option enabled.
    demo.launch(mcp_server=True)