# Spaces: aneesarom / PDF-Text-Extractor (Running)
# File size: 2,538 Bytes

import json
import gradio as gr
import pdfplumber
import requests
from io import BytesIO

def read_pdf_from_url(url: str) -> dict:
    """
    Extract the text of a PDF fetched from a direct download URL.

    Args:
        url (str): An http:// or https:// URL that points directly to a PDF
            file. Surrounding whitespace is ignored and the scheme is
            matched case-insensitively.

    Returns:
        dict: JSON-formatted dictionary. On success it contains:
            - url (str): The PDF URL that was fetched
            - page_count (int): Number of pages in the PDF
            - content (str): Extracted text, each page introduced by a
              "[Page N]" line, or "No text found in PDF." when no page
              yields any text
        On failure it contains only:
            - error (str): Error message describing what went wrong

    This function does not raise: every failure (bad URL, network or HTTP
    error, unparsable PDF) is reported through the "error" key.
    """
    invalid_url = {"error": "Invalid URL. Must start with http:// or https://"}

    # Reject non-strings up front so callers get the URL error rather than
    # an AttributeError message.
    if not isinstance(url, str):
        return invalid_url

    url = url.strip()
    # Require the full scheme separator: a bare "http" prefix would also
    # accept strings such as "httpfoo://host".
    if not url.lower().startswith(("http://", "https://")):
        return invalid_url

    try:
        # NOTE(review): the URL is caller-supplied and fetched as-is, and the
        # whole body is held in memory; consider restricting target hosts and
        # capping the download size if this is exposed publicly.
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Cheap magic-number check before handing the bytes to the parser.
        if not response.content.startswith(b"%PDF-"):
            return {"error": "URL does not point to a valid PDF file"}

        page_chunks = []
        with pdfplumber.open(BytesIO(response.content)) as pdf:
            # Read the page count while the document is still open.
            page_count = len(pdf.pages)
            for page_num, page in enumerate(pdf.pages, start=1):
                page_text = page.extract_text()
                if page_text:
                    page_chunks.append(f"[Page {page_num}]\n{page_text}")

        content = "\n\n".join(page_chunks).strip()
        return {
            "url": url,
            "page_count": page_count,
            "content": content or "No text found in PDF.",
        }

    except Exception as e:
        # Deliberate catch-all: failures are returned as data, not raised.
        return {"error": str(e)}

# Sample PDF links for the example buttons; each link becomes its own
# single-item row.
example_urls = [
    [link]
    for link in (
        "https://education.github.com/git-cheat-sheet-education.pdf",
        "https://github.com/tpn/pdfs/raw/master/A%20Journey%20in%20Creating%20an%20Operating%20System%20Kernel%20-%20The%20539Kernel%20Book%20(Nov%202022).pdf",
    )
]

# Gradio interface around read_pdf_from_url, with the example URLs attached;
# it is launched with the MCP server enabled when run as a script.
demo = gr.Interface(
    title="PDF Text Extractor From Url",
    description=(
        "Provide a URL that directly points to a PDF file (from any server). "
        "The server fetches the PDF and extracts the text content, "
        "returning it in JSON format."
    ),
    fn=read_pdf_from_url,
    inputs=gr.Textbox(
        placeholder="Enter a direct PDF URL (e.g., GitHub raw link)",
        label="PDF URL",
    ),
    outputs=gr.JSON(label="Extracted Text"),
    # Rendered as buttons below the input box.
    examples=example_urls,
    # No example caching (prevents the CSV write).
    cache_examples=False,
    # flagging_mode replaces the older allow_flagging option.
    flagging_mode="never",
)

if __name__ == "__main__":
    demo.launch(mcp_server=True)