import gradio as gr
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForCausalLM, AutoConfig
# =========================
# Model Setup & Patch
# =========================
model_id = 'microsoft/Florence-2-large'

# Prefer the GPU when one is visible; otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# PATCH: the Florence-2 remote config can ship without `forced_bos_token_id`,
# which breaks generation — guarantee the attribute exists before loading.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
if not hasattr(config, 'forced_bos_token_id'):
    config.forced_bos_token_id = None

# Instantiate the patched model in inference mode, plus its processor.
model = (
    AutoModelForCausalLM
    .from_pretrained(model_id, config=config, trust_remote_code=True)
    .to(device)
    .eval()
)
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
def run_ocr(image):
    """Extract text from an uploaded image with Florence-2's <OCR> task.

    Args:
        image: ``PIL.Image.Image`` delivered by Gradio, or ``None`` when
            the user has not uploaded anything.

    Returns:
        str: the recognized text, or a warning message when no image was
        provided.
    """
    if image is None:
        return "⚠️ Please upload an image."
    # Gradio can hand over RGBA (e.g. PNG) or grayscale images; the
    # Florence-2 processor expects 3-channel RGB input, so normalize first.
    if image.mode != "RGB":
        image = image.convert("RGB")
    # Florence-2 is steered by special task tags; <OCR> yields a plain
    # transcription of the text in the image.
    prompt = "<OCR>"
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only — skip autograd bookkeeping
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            do_sample=False,  # deterministic decoding
            num_beams=3,      # beam search improves transcription fidelity
        )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # post_process_generation strips the task tag and returns {task: text}.
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=prompt,
        image_size=(image.width, image.height),
    )
    return parsed_answer[prompt]
# =========================
# Gradio UI
# =========================
with gr.Blocks() as demo:
    gr.Markdown("## 🖋️ Handwritten Note to Text (Florence-2)")
    # Side-by-side layout: image upload on the left, extracted text on the right.
    with gr.Row():
        image_input = gr.Image(type="pil")
        text_output = gr.Textbox(label="Extracted Text", lines=10)
    convert_button = gr.Button("Convert to Text", variant="primary")
    convert_button.click(fn=run_ocr, inputs=image_input, outputs=text_output)

if __name__ == "__main__":
    demo.launch()