File size: 2,106 Bytes
0a3e928
 
0cc296e
bb94232
0a3e928
0cc296e
bb94232
0cc296e
285d260
0cc296e
4a811f6
bb94232
 
 
 
 
 
 
 
 
 
 
 
285d260
0cc296e
285d260
0a3e928
0cc296e
 
bb94232
 
 
285d260
 
 
 
 
 
 
 
 
 
 
 
 
 
bb94232
285d260
 
 
 
 
 
bb94232
0cc296e
 
 
 
bb94232
 
285d260
 
bb94232
 
285d260
bb94232
285d260
 
0cc296e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import gradio as gr
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForCausalLM, AutoConfig

# =========================
# Model Setup & Patch
# =========================
# Florence-2 is a vision-language model loaded via its own remote code
# (trust_remote_code=True), so this setup runs custom model classes
# shipped with the checkpoint, not stock transformers classes.
model_id = 'microsoft/Florence-2-large'
# Prefer GPU when available; the model runs (slowly) on CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"

# PATCH: Explicitly handle the Florence2 configuration bug
# Some Florence-2 checkpoints ship a config missing `forced_bos_token_id`,
# which generate() later reads — default it to None so generation works.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
if not hasattr(config, 'forced_bos_token_id'):
    config.forced_bos_token_id = None

# Load model and processor
# .eval() disables dropout etc.; inference-only usage below.
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    config=config,
    trust_remote_code=True
).to(device).eval()

# Processor bundles the image preprocessor and the tokenizer.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

def run_ocr(image):
    """Run Florence-2 OCR on an uploaded image and return the extracted text.

    Args:
        image: ``PIL.Image.Image`` from the Gradio input, or ``None`` when
            the user clicks the button without uploading anything.

    Returns:
        The recognized text (as parsed by the Florence-2 processor for the
        ``<OCR>`` task), or a warning string when no image was provided.
    """
    if image is None:
        return "⚠️ Please upload an image."

    # Gradio can deliver RGBA (transparent PNGs) or grayscale uploads;
    # Florence-2's image pipeline expects 3-channel RGB, so normalize first.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Using <DETAILED_CAPTION> or <OCR> task for better text flow
    # Florence-2 works best with these specific task tags
    prompt = "<OCR>"

    # .to(device) moves every tensor in the batch (input_ids, pixel_values).
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)

    # Deterministic beam search; no_grad avoids building autograd graphs.
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            do_sample=False,
            num_beams=3
        )

    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Clean up the output: post_process_generation strips the task tag and
    # returns a dict keyed by the task prompt.
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=prompt,
        image_size=(image.width, image.height)
    )

    return parsed_answer[prompt]

# =========================
# Gradio UI
# =========================
# Simple one-shot layout: image on the left, extracted text on the right,
# a single button wiring the OCR function between them.
with gr.Blocks() as demo:
    gr.Markdown("## 🖋️ Handwritten Note to Text (Florence-2)")

    with gr.Row():
        image_in = gr.Image(type="pil")
        text_out = gr.Textbox(label="Extracted Text", lines=10)

    convert_btn = gr.Button("Convert to Text", variant="primary")
    convert_btn.click(fn=run_ocr, inputs=image_in, outputs=text_out)

# Launch only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()