Spaces:

aneesarom
/

test

Runtime error

App Files Files Community

aneesarom committed on Sep 25, 2025

Commit

f28740b

verified ·

1 Parent(s): 06f75b6

Create app.py

Browse files

Files changed (1) hide show

app.py +119 -0

app.py ADDED Viewed

	@@ -0,0 +1,119 @@

+Hugging Face's logo
+Hugging Face
+Models
+Datasets
+Spaces
+Community
+Docs
+Enterprise
+Pricing
+Hugging Face is way more fun with friends and colleagues! 🤗 Join an organization
+Spaces:
+aneesarom
+/
+PDF-Text-Extractor
+like
+0
+Logs
+App
+Files
+Community
+Settings
+PDF-Text-Extractor
+/
+app.py
+aneesarom's picture
+aneesarom
+Update app.py
+4d17112
+verified
+5 days ago
+raw
+Copy download link
+history
+blame
+edit
+delete
+2.54 kB
+import json
+import gradio as gr
+import pdfplumber
+import requests
+from io import BytesIO
def read_pdf_from_url(url: str) -> dict:
    """Download a PDF from *url* and return its text, page by page.

    Args:
        url: Direct download link to a PDF file. Must begin with
            ``http://`` or ``https://``.

    Returns:
        On success, a dict with:
            - url (str): The URL that was fetched.
            - page_count (int): Number of pages in the PDF.
            - content (str): Extracted text, each page that yields text
              introduced by a ``[Page N]`` header, or
              ``"No text found in PDF."`` when no page yields any text.
        On failure, a dict with a single ``error`` (str) entry. Exceptions
        raised while fetching or parsing are caught and reported this way
        instead of propagating.
    """
    # Test the full scheme prefix: a bare "http" check also lets through
    # strings such as "httpfoo://..." that the error message rules out.
    # The isinstance guard turns a missing/non-string value into the same
    # readable error instead of an AttributeError message.
    if not isinstance(url, str) or not url.startswith(("http://", "https://")):
        return {"error": "Invalid URL. Must start with http:// or https://"}

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        payload = response.content

        # Every PDF begins with the "%PDF-" magic bytes; reject anything
        # else (HTML error pages, redirects to landing pages, ...).
        if not payload.startswith(b"%PDF-"):
            return {"error": "URL does not point to a valid PDF file"}

        sections = []
        with pdfplumber.open(BytesIO(payload)) as pdf:
            # Take the page count while the document is still open rather
            # than reading from `pdf` after the context manager closed it.
            pages = pdf.pages
            page_count = len(pages)
            for page_num, page in enumerate(pages, start=1):
                page_text = page.extract_text()
                if page_text:
                    sections.append(f"[Page {page_num}]\n{page_text}")
    except Exception as e:
        # Tool boundary: report any failure to the caller as data.
        return {"error": str(e)}

    content = "\n\n".join(sections).strip()
    return {
        "url": url,
        "page_count": page_count,
        "content": content if sections else "No text found in PDF.",
    }
# Sample direct-download PDF links offered as one-click examples.
example_urls = [
    ["https://education.github.com/git-cheat-sheet-education.pdf"],
    ["https://github.com/tpn/pdfs/raw/master/A%20Journey%20in%20Creating%20an%20Operating%20System%20Kernel%20-%20The%20539Kernel%20Book%20(Nov%202022).pdf"],
]

# Build the input/output widgets first so the Interface call stays compact.
_url_input = gr.Textbox(
    label="PDF URL",
    placeholder="Enter a direct PDF URL (e.g., GitHub raw link)",
)
_json_output = gr.JSON(label="Extracted Text")
_description = (
    "Provide a URL that directly points to a PDF file (from any server). "
    "The server fetches the PDF and extracts the text content, "
    "returning it in JSON format."
)

# Gradio interface wrapping read_pdf_from_url; also served over MCP at launch.
demo = gr.Interface(
    fn=read_pdf_from_url,
    inputs=_url_input,
    outputs=_json_output,
    title="PDF Text Extractor From Url",
    description=_description,
    examples=example_urls,  # shown as buttons below the input box
    flagging_mode="never",  # replaces the older allow_flagging option
    cache_examples=False,  # no example caching (prevents CSV write)
)

if __name__ == "__main__":
    demo.launch(mcp_server=True)