# Hugging Face Spaces app — hardware: ZeroGPU ("Running on Zero").
| import gradio as gr | |
| import spaces | |
| import torch | |
| from PIL import Image | |
| from transformers import AutoModel, AutoTokenizer | |
# Target model: MiniCPM-o 4.5 (9B parameter variant)
MODEL_ID = "openbmb/MiniCPM-o-4_5"

# trust_remote_code is required for MiniCPM's custom architecture
# (AutoModel resolves to modeling code shipped inside the checkpoint repo).
# Only the vision tower is initialized: this app does image OCR, so the
# audio and TTS components are skipped to save memory and load time.
model = AutoModel.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    attn_implementation="sdpa",   # PyTorch scaled-dot-product attention kernel
    torch_dtype=torch.bfloat16,   # half-ish precision; assumes bf16-capable hardware
    init_vision=True,             # needed for image inputs
    init_audio=False,             # no audio input in this Space
    init_tts=False,               # no speech output
).eval()  # inference mode (disables dropout, etc.)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# The @spaces.GPU decorator handles GPU allocation on Hugging Face Spaces.
# Bug fix: the comment above described the decorator, but it was never
# applied — without it a ZeroGPU Space runs this function on CPU only.
@spaces.GPU
def process_image(image, system_prompt, temperature, top_p, max_tokens):
    """Run MiniCPM-o on one uploaded report image and return its text reply.

    Args:
        image: PIL.Image from Gradio, or None when nothing was uploaded.
        system_prompt: Instruction text paired with the image.
        temperature: Sampling temperature (sampling=True below).
        top_p: Nucleus-sampling cutoff.
        max_tokens: Cap on generated tokens.

    Returns:
        The model's response string, or an "Error: ..." string on failure
        (the Markdown output component renders either).
    """
    # Explicit None check: Gradio passes None for an empty image input, and
    # relying on PIL Image truthiness is fragile.
    if image is None:
        return "Error: No image provided."
    # Drop any alpha channel / palette so the vision encoder gets 3-channel RGB.
    image = image.convert("RGB")
    # MiniCPM-o chat API: the content list mixes PIL images and text strings.
    msgs = [{"role": "user", "content": [image, system_prompt]}]
    try:
        # sampling=True enables temperature/top_p.
        # For strict OCR, lower temperature (0.1) is recommended.
        response = model.chat(
            image=None,  # images travel inside msgs instead
            msgs=msgs,
            tokenizer=tokenizer,
            sampling=True,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_tokens,
        )
        return response
    except Exception as e:
        # UI boundary: surface the failure as text rather than crashing the app.
        return f"Error: {str(e)}"
# Default prompt designed for dynamic table structure detection: the model
# infers headers from the image itself instead of using a fixed schema.
DEFAULT_PROMPT = """Analyze this document image.
1. Visually identify the table headers and structure.
2. Transcribe the exact content into a Markdown table.
3. Rules:
- Use the headers visible in the image.
- Preserve row alignment strictly.
- Leave empty cells blank.
- Output ONLY the Markdown table."""
# --- Gradio UI -------------------------------------------------------------
# `demo` must stay at module scope: HF Spaces auto-launches a Blocks object
# named `demo` when the script is imported rather than run directly.
with gr.Blocks(title="Universal Medical OCR") as demo:
    gr.Markdown("## Universal Medical Report Digitizer")
    with gr.Row():
        with gr.Column():
            # Left column: image input plus generation settings.
            image_in = gr.Image(
                type="pil",
                label="Upload Report",
                sources=["upload", "clipboard"],
                height=450,
            )
            with gr.Accordion("Settings", open=True):
                prompt_box = gr.TextArea(
                    label="System Prompt", value=DEFAULT_PROMPT, lines=6
                )
                temperature_in = gr.Slider(
                    0.1, 1.0, value=0.1, step=0.1, label="Temperature"
                )
                top_p_in = gr.Slider(0.1, 1.0, value=0.8, step=0.1, label="Top-P")
                max_tokens_in = gr.Slider(
                    256, 4096, value=2048, step=256, label="Max Tokens"
                )
            extract_btn = gr.Button("Extract Table", variant="primary")
        with gr.Column():
            # Right column: the model's reply rendered as Markdown,
            # so an extracted table displays as an actual table.
            result_md = gr.Markdown(label="Detected Table")

    # Wire the button to the inference function; argument order must match
    # process_image's signature.
    extract_btn.click(
        fn=process_image,
        inputs=[image_in, prompt_box, temperature_in, top_p_in, max_tokens_in],
        outputs=result_md,
    )

if __name__ == "__main__":
    demo.launch()