Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from PIL import Image | |
| import torch | |
| from transformers import AutoProcessor, AutoModelForCausalLM, AutoConfig | |
# -------------------------------------------------
# Florence-2 model, processor and config workaround
# -------------------------------------------------
model_id = "microsoft/Florence-2-large"

# Prefer the GPU when torch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Workaround: make sure the config object exposes `forced_bos_token_id`
# before the model is built from it; the attribute is only added when absent.
# trust_remote_code=True is passed because the checkpoint is loaded through
# code shipped in the model repository.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
if not hasattr(config, "forced_bos_token_id"):
    config.forced_bos_token_id = None

# Build the model from the patched config, move it to the selected device
# and switch it to evaluation mode.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=config,
    trust_remote_code=True,
)
model = model.to(device)
model = model.eval()

# The processor prepares model inputs and decodes/post-processes outputs.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
def run_ocr(image: "Image.Image | None") -> str:
    """Extract the text found in *image* with Florence-2's ``<OCR>`` task.

    Args:
        image: PIL image delivered by the Gradio image input, or ``None``
            when the user has not uploaded anything.

    Returns:
        The value stored under the ``<OCR>`` key of the processor's
        post-processed output, or a warning message when ``image`` is
        ``None``.
    """
    if image is None:
        return "⚠️ Please upload an image."

    # Normalise the colour mode so the processor always receives a
    # three-channel image, whatever the upload format was (RGBA PNG,
    # grayscale scan, palette image, ...).
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Florence-2 selects its behaviour through a task tag used as the prompt.
    task = "<OCR>"
    inputs = processor(text=task, images=image, return_tensors="pt").to(device)

    # Inference only: no gradients needed. Deterministic beam search.
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            do_sample=False,
            num_beams=3,
        )

    generated_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True
    )[0]

    # Let the processor turn the raw decoded string into the task's result
    # structure, then return the entry for this task.
    parsed_answer = processor.post_process_generation(
        generated_text,
        task=task,
        image_size=(image.width, image.height),
    )
    return parsed_answer[task]
# -------------------------
# Gradio interface
# -------------------------
# NOTE(review): the nesting below (image and textbox side by side in one row,
# button underneath) is the conventional reading of the original statements;
# confirm against the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("## 🖋️ Handwritten Note to Text (Florence-2)")

    with gr.Row():
        # The image is handed to the callback as a PIL object.
        input_img = gr.Image(type="pil")
        output_text = gr.Textbox(lines=10, label="Extracted Text")

    btn = gr.Button("Convert to Text", variant="primary")

    # Clicking the button runs OCR on the uploaded image and shows the result
    # in the textbox.
    btn.click(run_ocr, inputs=input_img, outputs=output_text)


if __name__ == "__main__":
    # Start the app only when this file is executed as a script.
    demo.launch()