Spaces:

aneesarom
/

PDF-Text-Extractor

Sleeping

App Files Files Community

aneesarom committed on Sep 20, 2025

Commit

fa078e7

verified ·

1 Parent(s): d4340f4

Create app.py

Browse files

Files changed (1) hide show

app.py +71 -0

app.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import json
+import gradio as gr
+import pdfplumber
+import requests
+from io import BytesIO
def read_pdf_from_url(url: str) -> dict:
    """Download a PDF from a direct URL and extract its text page by page.

    Args:
        url (str): A URL that points directly to a PDF file. Surrounding
            whitespace is ignored; the scheme must be ``http://`` or
            ``https://`` (matched case-insensitively).

    Returns:
        dict: On success, a JSON-serialisable dictionary containing:
            - url (str): The PDF URL that was fetched
            - page_count (int): Number of pages in the PDF
            - content (str): Extracted text, each page introduced by a
              ``[Page N]`` marker, or "No text found in PDF." when no page
              yields any text
        On failure, a dictionary with a single key:
            - error (str): Description of what went wrong
    """
    invalid_url = {"error": "Invalid URL. Must start with http:// or https://"}

    # Reject non-string input up front so the caller gets the documented
    # message instead of an AttributeError text.
    if not isinstance(url, str):
        return invalid_url
    url = url.strip()
    # Check the full scheme prefix: a bare "http" test would also let
    # through things like "httpfoo://host/file.pdf".
    if not url.lower().startswith(("http://", "https://")):
        return invalid_url

    try:
        # NOTE(review): the URL is caller-supplied and fetched from this
        # server with no host filtering or download size limit.
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Sniff the magic bytes rather than trusting the URL or headers.
        if not response.content.startswith(b"%PDF-"):
            return {"error": "URL does not point to a valid PDF file"}

        sections = []
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            # Read the page count while the document is still open.
            page_count = len(pdf.pages)
            for page_num, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text()
                # Pages without extractable text are skipped entirely.
                if page_text:
                    sections.append(f"[Page {page_num}]\n{page_text}")

        content = "\n\n".join(sections).strip()
        return {
            "url": url,
            "page_count": page_count,
            "content": content or "No text found in PDF.",
        }
    except Exception as e:
        # UI/tool boundary: report any download or parsing failure as data
        # in the JSON result instead of raising into the caller.
        return {"error": str(e)}
# Sample PDF links for the example buttons; each link is wrapped in its own
# single-element list, one list per example.
example_urls = [
    [sample_link]
    for sample_link in (
        "https://education.github.com/git-cheat-sheet-education.pdf",
        "https://github.com/tpn/pdfs/raw/master/A%20Journey%20in%20Creating%20an%20Operating%20System%20Kernel%20-%20The%20539Kernel%20Book%20(Nov%202022).pdf",
    )
]

# Gradio interface wiring: one URL text box in, one JSON viewer out, with the
# sample links attached as examples below the input box.
demo = gr.Interface(
    title="PDF URL Text Extractor",
    description=" ".join(
        [
            "Provide a URL that directly points to a PDF file (from any server).",
            "The server fetches the PDF and extracts the text content, returning it in JSON format.",
        ]
    ),
    fn=read_pdf_from_url,
    inputs=gr.Textbox(
        placeholder="Enter a direct PDF URL (e.g., GitHub raw link)",
        label="PDF URL",
    ),
    outputs=gr.JSON(label="Extracted Text"),
    examples=example_urls,
)

# Only start the app when this file is run as a script; the launch call is
# made with mcp_server=True.
if __name__ == "__main__":
    demo.launch(mcp_server=True)