Spaces:
Running
Running
| import json | |
| import gradio as gr | |
| import pdfplumber | |
| import requests | |
| from io import BytesIO | |
def read_pdf_from_url(url: str) -> dict:
    """
    Extracts text from a PDF file given a direct PDF download URL.

    Args:
        url (str): A URL that points directly to a PDF file. Surrounding
            whitespace is ignored and the scheme is matched
            case-insensitively.

    Returns:
        dict: JSON-formatted dictionary containing:
            - url (str): The PDF URL (whitespace-stripped)
            - page_count (int): Number of pages in the PDF
            - content (str): Extracted text from the PDF, with page numbers
            - error (str, optional): Error message if extraction fails.
              On failure this is the only key in the dictionary.
    """
    invalid_url = {"error": "Invalid URL. Must start with http:// or https://"}

    # Validate before any network access. A plain startswith("http") would
    # also accept things like "httpfoo://..." or a bare "http".
    if not isinstance(url, str):
        return invalid_url
    url = url.strip()
    if not url.lower().startswith(("http://", "https://")):
        return invalid_url

    # NOTE(review): the URL is user-supplied and fetched server-side, so this
    # can reach internal hosts; restrict targets if that matters for the
    # deployment.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Check the PDF magic bytes rather than trusting the URL suffix or
        # the Content-Type header.
        if not response.content.startswith(b"%PDF-"):
            return {"error": "URL does not point to a valid PDF file"}

        sections = []
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            # Read the page count while the document is still open.
            page_count = len(pdf.pages)
            for page_num, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text()
                # Pages without extractable text (e.g. scanned images) are
                # skipped.
                if page_text:
                    sections.append(f"[Page {page_num}]\n{page_text}")
    except Exception as e:
        # UI/tool boundary: report any download or parsing failure as a
        # JSON error instead of raising into the caller.
        return {"error": str(e)}

    content = "\n\n".join(sections).strip()
    return {
        "url": url,
        "page_count": page_count,
        "content": content or "No text found in PDF.",
    }
# Sample PDF links offered as examples in the UI. Each entry is wrapped in
# its own single-element list.
example_urls = [
    [pdf_url]
    for pdf_url in (
        "https://education.github.com/git-cheat-sheet-education.pdf",
        "https://github.com/tpn/pdfs/raw/master/A%20Journey%20in%20Creating%20an%20Operating%20System%20Kernel%20-%20The%20539Kernel%20Book%20(Nov%202022).pdf",
    )
]
# Gradio interface wiring: a single URL textbox feeds read_pdf_from_url and
# the returned dictionary is shown as JSON.
demo = gr.Interface(
    title="PDF Text Extractor From Url",
    description=(
        "Provide a URL that directly points to a PDF file (from any server). "
        "The server fetches the PDF and extracts the text content, returning it in JSON format."
    ),
    fn=read_pdf_from_url,
    inputs=gr.Textbox(
        placeholder="Enter a direct PDF URL (e.g., GitHub raw link)",
        label="PDF URL",
    ),
    outputs=gr.JSON(label="Extracted Text"),
    examples=example_urls,  # shown as clickable samples below the input box
    cache_examples=False,  # no example caching (avoids the CSV write)
    flagging_mode="never",  # replaces the older allow_flagging option
)

# Start the app, with the MCP server enabled, only when run as a script.
if __name__ == "__main__":
    demo.launch(mcp_server=True)