File size: 1,410 Bytes
b003c87
25f374f
 
b003c87
25f374f
b003c87
25f374f
 
 
b003c87
25f374f
 
 
b003c87
25f374f
 
b003c87
25f374f
 
 
b003c87
25f374f
 
 
 
b003c87
25f374f
 
 
b003c87
25f374f
b003c87
25f374f
 
 
b003c87
25f374f
 
 
b003c87
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import gradio as gr
import fitz  # PyMuPDF
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
from textblob import TextBlob

# Hugging Face checkpoint used for OCR. The processor and the
# encoder-decoder model are both loaded from it once, at import time, so
# every request reuses the same objects.
# NOTE(review): the "stage1" suffix suggests a pre-training checkpoint rather
# than a fine-tuned one — confirm this is the intended model.
_OCR_CHECKPOINT = "microsoft/trocr-small-stage1"
processor = TrOCRProcessor.from_pretrained(_OCR_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(_OCR_CHECKPOINT)

def _open_pdf(pdf_file):
    """Open an upload as a PyMuPDF document, whatever form it arrives in.

    Accepts raw PDF bytes, a filesystem path string, or a file-like object.
    For file-like objects the on-disk file named by ``.name`` is preferred
    over ``.read()``: the handle passed in by the UI is not guaranteed to be
    positioned at (or even contain) the uploaded data, while its ``.name``
    points at the real file.
    """
    import os

    if isinstance(pdf_file, (bytes, bytearray)):
        return fitz.open(stream=bytes(pdf_file), filetype="pdf")

    if isinstance(pdf_file, str):
        return fitz.open(pdf_file)

    path = getattr(pdf_file, "name", None)
    if isinstance(path, str) and os.path.isfile(path):
        return fitz.open(path)

    # Nameless in-memory stream (or a name that is not a real file):
    # fall back to reading the object's contents.
    return fitz.open(stream=pdf_file.read(), filetype="pdf")


def pdf_to_text(pdf_file):
    """Run OCR over every page of a PDF and return the recognised text.

    Each page is rendered to an RGB image, passed through the module-level
    TrOCR ``processor``/``model`` pair, and the decoded text is
    spell-corrected with ``TextBlob.correct()``.

    Args:
        pdf_file: The uploaded PDF — a path string, raw bytes, or a
            file-like object (optionally exposing a ``.name`` path).
            Any falsy value is treated as "nothing uploaded".

    Returns:
        The per-page texts joined by blank lines, or the message
        ``"No PDF uploaded."`` when ``pdf_file`` is falsy.

    Raises:
        Whatever PyMuPDF raises when the input cannot be opened as a PDF.
    """
    if not pdf_file:
        return "No PDF uploaded."

    page_texts = []

    # The context manager closes the document even if OCR fails midway.
    with _open_pdf(pdf_file) as doc:
        for page in doc:
            # alpha=False keeps the sample buffer at 3 bytes per pixel,
            # which is what the "RGB" mode below expects.
            pix = page.get_pixmap(alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            # OCR inference
            # NOTE(review): the whole page is fed to the model as a single
            # image — confirm the checkpoint copes with multi-line input.
            pixel_values = processor(images=img, return_tensors="pt").pixel_values
            generated_ids = model.generate(pixel_values)
            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

            # Spelling correction (TextBlob.correct() does not fix grammar)
            page_texts.append(str(TextBlob(text).correct()))

    return "\n\n".join(page_texts)

# Build the UI: a heading, a PDF upload, a trigger button and a result box.
with gr.Blocks() as demo:
    gr.Markdown("## 📄 Robust PDF OCR MVP (Handles Tilted Words)")

    # NOTE(review): type="file" appears to be accepted only by older Gradio
    # releases; newer ones expect "filepath" or "binary" — confirm against
    # the pinned Gradio version.
    pdf_input = gr.File(
        label="Upload PDF",
        type="file",
        file_types=[".pdf"],
    )
    btn = gr.Button("Extract Text")
    output = gr.Textbox(label="Extracted Text", lines=15)

    # Clicking the button sends the upload to pdf_to_text and shows the
    # returned string in the textbox.
    btn.click(pdf_to_text, inputs=pdf_input, outputs=output)

# Launch the app when this module is executed.
demo.launch()