Spaces:

GermanySutherland
/

PDF-Extract-Text

Runtime error

App Files Files Community

GermanySutherland committed on Aug 21, 2025

Commit

25f374f

verified ·

1 Parent(s): fc7a5ba

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -67

app.py CHANGED Viewed

@@ -1,80 +1,41 @@
 import gradio as gr
-from transformers import pipeline, TrOCRProcessor, VisionEncoderDecoderModel
 from PIL import Image
-import fitz # PyMuPDF
-import io
-# --- Hugging Face Models ---
-# 1. Optical Character Recognition (OCR) model
-# This model is specifically trained to read text from images.
-# Load the TrOCR processor and model once at import time and wrap them in an
-# "image-to-text" pipeline. Loading is best-effort: on any exception the error
-# is printed and `trocr_pipeline` is set to None, which extract_text_from_pdf
-# checks before doing any work.
-try:
-    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
-    model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
-    trocr_pipeline = pipeline("image-to-text", model=model, tokenizer=processor.tokenizer, feature_extractor=processor.feature_extractor)
-except Exception as e:
-    print(f"Error loading models: {e}")
-    trocr_pipeline = None
-# --- Functions ---
-def extract_text_from_pdf(pdf_file):
-    """
-    Extracts text from a PDF file by rendering each page to an image and
-    then applying a TrOCR model for text extraction.
-    Args:
-        pdf_file: The uploaded PDF file object from Gradio.
-    Returns:
-        A formatted string of the extracted text.
-    """
-    if not trocr_pipeline:
-        return "Model failed to load. Please check your dependencies."
-    extracted_pages = []
-    # Open the PDF file using PyMuPDF (fitz)
-    try:
-        pdf_document = fitz.open(stream=pdf_file.name, filetype="pdf")
-    except Exception as e:
-        return f"Error opening PDF: {e}"
-    # Loop through each page of the PDF
-    for page_num in range(pdf_document.page_count):
-        page = pdf_document.load_page(page_num)
-        # Render the page as a high-resolution image (300 DPI)
-        pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
-        # Convert the image to a PIL Image object
-        img_bytes = pix.tobytes("png")
-        image = Image.open(io.BytesIO(img_bytes))
-        # Apply the TrOCR pipeline to the image
-        try:
-            # The pipeline automatically handles the model and tokenizer
-            extracted_text = trocr_pipeline(image, max_new_tokens=256)[0]['generated_text']
-        except Exception as e:
-            extracted_text = f"[OCR Failed on this page: {e}]"
-        extracted_pages.append(f"--- Page {page_num + 1} ---\n{extracted_text}\n")
-    # Join all page texts into a single string
-    return "\n".join(extracted_pages)
-# --- Gradio UI ---
-# Build the page: a heading and description, a PDF upload inside a row, a
-# button, and a textbox that receives the return value of
-# extract_text_from_pdf when the button is clicked.
-with gr.Blocks(title="PDF Text Extractor") as demo:
-    gr.Markdown("## 📄 AI PDF Text Extractor")
-    gr.Markdown("Upload a PDF file to extract text from its pages using a powerful OCR model. "
-                "This tool handles complex layouts, including tilted text, by "
-                "first converting each page into an image.")
-    with gr.Row():
-        # NOTE(review): file_types entries are normally written with a leading
-        # dot (".pdf") — confirm that "pdf" filters uploads as intended.
-        pdf_input = gr.File(label="Upload PDF File", file_types=["pdf"])
     btn = gr.Button("Extract Text")
-    text_output = gr.Textbox(label="Extracted Text", lines=20)
-    # Set up the button click event
-    btn.click(fn=extract_text_from_pdf, inputs=pdf_input, outputs=text_output)
 demo.launch()

 import gradio as gr
+import fitz  # PyMuPDF
+from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 from PIL import Image
+from textblob import TextBlob
+# Load lightweight Hugging Face OCR model
+processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-stage1")
+model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-small-stage1")
+def pdf_to_text(pdf_file):
+    if not pdf_file:
+        return "No PDF uploaded."
+    doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
+    all_text = []
+    for page in doc:
+        pix = page.get_pixmap()
+        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+        # OCR inference
+        pixel_values = processor(images=img, return_tensors="pt").pixel_values
+        generated_ids = model.generate(pixel_values)
+        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        # Spell & grammar correction
+        corrected = str(TextBlob(text).correct())
+        all_text.append(corrected)
+    return "\n\n".join(all_text)
+# Build the page: a heading, a PDF upload, a button, and a textbox that
+# receives the return value of pdf_to_text when the button is clicked.
+with gr.Blocks() as demo:
+    gr.Markdown("## 📄 Robust PDF OCR MVP (Handles Tilted Words)")
+    # NOTE(review): recent Gradio releases appear to accept only
+    # type="filepath" or type="binary" for gr.File — confirm "file" against
+    # the Gradio version this Space pins.
+    pdf_input = gr.File(label="Upload PDF", type="file", file_types=[".pdf"])  # ✅ fixed
     btn = gr.Button("Extract Text")
+    output = gr.Textbox(label="Extracted Text", lines=15)
+    btn.click(fn=pdf_to_text, inputs=pdf_input, outputs=output)
 demo.launch()