Spaces:

GermanySutherland
/

PDF-Extract-Text

Runtime error

App Files Files Community

GermanySutherland committed on Aug 21, 2025

Commit

b003c87

verified ·

1 Parent(s): d5925a1

Create app.py

Browse files

Files changed (1) hide show

app.py +80 -0

app.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import gradio as gr
+from transformers import pipeline, TrOCRProcessor, VisionEncoderDecoderModel
+from PIL import Image
+import fitz # PyMuPDF
+import io
+# --- Hugging Face Models ---
+# 1. Optical Character Recognition (OCR) model
+# This model is specifically trained to read text from images.
+try:
+    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
+    model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
+    trocr_pipeline = pipeline("image-to-text", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor)
+except Exception as e:
+    print(f"Error loading models: {e}")
+    trocr_pipeline = None
+# --- Functions ---
+def extract_text_from_pdf(pdf_file):
+    """
+    Extracts text from a PDF file by rendering each page to an image and
+    then applying a TrOCR model for text extraction.
+    Args:
+        pdf_file: The uploaded PDF file object from Gradio.
+    Returns:
+        A formatted string of the extracted text.
+    """
+    if not trocr_pipeline:
+        return "Model failed to load. Please check your dependencies."
+    extracted_pages = []
+    # Open the PDF file using PyMuPDF (fitz)
+    try:
+        pdf_document = fitz.open(stream=pdf_file.name, filetype="pdf")
+    except Exception as e:
+        return f"Error opening PDF: {e}"
+    # Loop through each page of the PDF
+    for page_num in range(pdf_document.page_count):
+        page = pdf_document.load_page(page_num)
+        # Render the page as a high-resolution image (300 DPI)
+        pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
+        # Convert the image to a PIL Image object
+        img_bytes = pix.tobytes("png")
+        image = Image.open(io.BytesIO(img_bytes))
+        # Apply the TrOCR pipeline to the image
+        try:
+            # The pipeline automatically handles the model and tokenizer
+            extracted_text = trocr_pipeline(image, max_new_tokens=256)[0]['generated_text']
+        except Exception as e:
+            extracted_text = f"[OCR Failed on this page: {e}]"
+        extracted_pages.append(f"--- Page {page_num + 1} ---\n{extracted_text}\n")
+    # Join all page texts into a single string
+    return "\n".join(extracted_pages)
+# --- Gradio UI ---
+with gr.Blocks(title="PDF Text Extractor") as demo:
+    gr.Markdown("## 📄 AI PDF Text Extractor")
+    gr.Markdown("Upload a PDF file to extract text from its pages using a powerful OCR model. "
+                "This tool handles complex layouts, including tilted text, by "
+                "first converting each page into an image.")
+    with gr.Row():
+        pdf_input = gr.File(label="Upload PDF File", file_types=["pdf"])
+    btn = gr.Button("Extract Text")
+    text_output = gr.Textbox(label="Extracted Text", lines=20)
+    # Set up the button click event
+    btn.click(fn=extract_text_from_pdf, inputs=pdf_input, outputs=text_output)
+demo.launch()