"""Gradio app (also exposed as an MCP server) that extracts text from a PDF URL."""

import json
from io import BytesIO

import gradio as gr
import pdfplumber
import requests


def read_pdf_from_url(url: str) -> dict:
    """
    Extracts text from a PDF file given a direct PDF download URL.

    Args:
        url (str): A URL that points directly to a PDF file. Must use the
            http:// or https:// scheme; surrounding whitespace is ignored.

    Returns:
        dict: JSON-formatted dictionary containing:
            - url (str): The PDF URL
            - page_count (int): Number of pages in the PDF
            - content (str): Extracted text from the PDF, with page numbers
            - error (str, optional): Error message if extraction fails
              (when present, it is the only key in the dictionary)
    """
    invalid_url = {"error": "Invalid URL. Must start with http:// or https://"}

    # Reject non-string input up front instead of leaking an AttributeError
    # message through the catch-all handler below.
    if not isinstance(url, str):
        return invalid_url

    url = url.strip()
    # Check the full scheme prefix: a bare "http" test would also accept
    # strings such as "httpfoo://..." that the error message claims to reject.
    if not url.lower().startswith(("http://", "https://")):
        return invalid_url

    # NOTE(review): the URL is user-supplied and fetched server-side; consider
    # restricting reachable hosts if this runs inside a private network.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Every PDF begins with the "%PDF-" magic bytes; this is cheaper and
        # more reliable than trusting the Content-Type header.
        if not response.content.startswith(b"%PDF-"):
            return {"error": "URL does not point to a valid PDF file"}

        sections = []
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            # Read the page count while the document is still open.
            page_count = len(pdf.pages)
            for page_num, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text()
                # Pages without extractable text (e.g. scanned images) are skipped.
                if page_text:
                    sections.append(f"[Page {page_num}]\n{page_text}\n\n")
    except Exception as e:
        # Tool boundary: report any network/parsing failure as JSON rather
        # than raising into the Gradio/MCP layer.
        return {"error": str(e)}

    text = "".join(sections)
    return {
        "url": url,
        "page_count": page_count,
        "content": text.strip() if text else "No text found in PDF.",
    }


# Example PDF URLs for the buttons
example_urls = [
    ["https://education.github.com/git-cheat-sheet-education.pdf"],
    ["https://github.com/tpn/pdfs/raw/master/A%20Journey%20in%20Creating%20an%20Operating%20System%20Kernel%20-%20The%20539Kernel%20Book%20(Nov%202022).pdf"],
]

# Gradio MCP interface with examples
demo = gr.Interface(
    fn=read_pdf_from_url,
    inputs=gr.Textbox(
        label="PDF URL",
        placeholder="Enter a direct PDF URL (e.g., GitHub raw link)",
    ),
    outputs=gr.JSON(label="Extracted Text"),
    title="PDF Text Extractor From Url",
    description=(
        "Provide a URL that directly points to a PDF file (from any server). "
        "The server fetches the PDF and extracts the text content, returning it in JSON format."
    ),
    examples=example_urls,  # This adds buttons below the input box
    flagging_mode="never",  # replaces the older allow_flagging option
    cache_examples=False,  # disables example caching (prevents CSV write)
)

if __name__ == "__main__":
    demo.launch(mcp_server=True)