"""Fanoni Document AI - HuggingFace Space with GOT-OCR2.0 Model.""" import gradio as gr import spaces from transformers import AutoModel, AutoTokenizer from PIL import Image import torch # Load GOT-OCR2.0 model MODEL_NAME = "ucaslcl/GOT-OCR2_0" print("Loading tokenizer...") tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) print("Loading model...") model = AutoModel.from_pretrained( MODEL_NAME, trust_remote_code=True, low_cpu_mem_usage=True, torch_dtype=torch.float16 ) print("Model loaded!") @spaces.GPU def extract_text(image, output_format): """Extract text from uploaded image using GOT-OCR2.0.""" if image is None: return "Please upload an image." try: # Move model to GPU for this call device = "cuda" if torch.cuda.is_available() else "cpu" model_gpu = model.to(device).eval() # Save image temporarily temp_path = "/tmp/uploaded_image.png" if isinstance(image, str): temp_path = image else: Image.fromarray(image).save(temp_path) # OCR extraction if output_format == "Plain Text": result = model_gpu.chat(tokenizer, temp_path, ocr_type='ocr') elif output_format == "Formatted (Tables/Structure)": result = model_gpu.chat(tokenizer, temp_path, ocr_type='format') else: result = model_gpu.chat(tokenizer, temp_path, ocr_type='ocr') return result except Exception as e: return f"Error: {str(e)}" # Gradio Interface with gr.Blocks(title="Fanoni Document AI", theme=gr.themes.Soft()) as demo: gr.Markdown(""" # 📄 Fanoni Document AI ### Extract text from documents using GOT-OCR2.0 Upload an image of a document (invoice, receipt, form, etc.) to extract text. """) with gr.Row(): with gr.Column(scale=1): image_input = gr.Image( label="Upload Document", type="numpy", height=400 ) format_dropdown = gr.Dropdown( choices=["Plain Text", "Formatted (Tables/Structure)"], value="Plain Text", label="Output Format" ) extract_btn = gr.Button("Extract Text", variant="primary", size="lg") with gr.Column(scale=1): output_text = gr.Textbox( label="Extracted Text", lines=20, max_lines=50, show_copy_button=True ) extract_btn.click( fn=extract_text, inputs=[image_input, format_dropdown], outputs=output_text ) gr.Markdown(""" --- **Supported formats:** JPG, PNG, WEBP, BMP **Model:** [GOT-OCR2.0](https://huggingface.co/ucaslcl/GOT-OCR2_0) - General OCR Theory **Powered by:** Fanoni AI """) if __name__ == "__main__": demo.launch()