# vinayaka-lifesigns — feat: Add application file (commit 349bb79)
import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
# Target model: MiniCPM-o 4.5 (9B parameter variant).
MODEL_ID = "openbmb/MiniCPM-o-4_5"

# Only the vision stack is initialized; the audio and TTS heads are skipped.
# trust_remote_code is required for MiniCPM's custom architecture.
_LOAD_KWARGS = dict(
    trust_remote_code=True,
    attn_implementation="sdpa",  # scaled-dot-product attention backend
    torch_dtype=torch.bfloat16,
    init_vision=True,
    init_audio=False,
    init_tts=False,
)

model = AutoModel.from_pretrained(MODEL_ID, **_LOAD_KWARGS).eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# The @spaces.GPU decorator handles GPU allocation on Hugging Face Spaces.
@spaces.GPU
def process_image(image, system_prompt, temperature, top_p, max_tokens):
    """Run MiniCPM-o on one image and return the generated text.

    Args:
        image: PIL image from the Gradio widget, or None when nothing
            was uploaded.
        system_prompt: Instruction text sent to the model alongside the image.
        temperature: Sampling temperature forwarded to ``model.chat``.
        top_p: Nucleus-sampling threshold forwarded to ``model.chat``.
        max_tokens: Cap on the number of newly generated tokens.

    Returns:
        The model's response string, or an ``"Error: ..."`` message on
        missing input or inference failure.
    """
    # PIL images define no __bool__, so a truthiness test could only ever
    # catch the missing-upload case — say that explicitly with `is None`.
    if image is None:
        return "Error: No image provided."
    # Convert to RGB to ensure compatibility with vision encoder (removes Alpha channel)
    image = image.convert("RGB")
    # Construct the message list expected by MiniCPM-o: a single user turn
    # whose content interleaves the image and the prompt text.
    msgs = [{"role": "user", "content": [image, system_prompt]}]
    try:
        # sampling=True enables temperature/top_p.
        # For strict OCR, lower temperature (0.1) is recommended.
        response = model.chat(
            image=None,  # image travels inside `msgs`, not this argument
            msgs=msgs,
            tokenizer=tokenizer,
            sampling=True,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_tokens,
        )
        return response
    except Exception as e:
        # Surface inference failures in the UI instead of crashing the Space.
        return f"Error: {str(e)}"
# Default prompt designed for dynamic table structure detection: the model is
# told to read the headers from the image instead of assuming a fixed schema,
# and to emit nothing but the reconstructed Markdown table.
DEFAULT_PROMPT = """Analyze this document image.
1. Visually identify the table headers and structure.
2. Transcribe the exact content into a Markdown table.
3. Rules:
- Use the headers visible in the image.
- Preserve row alignment strictly.
- Leave empty cells blank.
- Output ONLY the Markdown table."""
# ---- Gradio UI -------------------------------------------------------------
with gr.Blocks(title="Universal Medical OCR") as demo:
    gr.Markdown("## Universal Medical Report Digitizer")

    with gr.Row():
        # Left column: image upload plus the generation settings.
        with gr.Column():
            image_input = gr.Image(
                type="pil",
                label="Upload Report",
                sources=["upload", "clipboard"],
                height=450,
            )
            with gr.Accordion("Settings", open=True):
                prompt_box = gr.TextArea(
                    label="System Prompt",
                    value=DEFAULT_PROMPT,
                    lines=6,
                )
                temperature_ctl = gr.Slider(
                    0.1, 1.0, value=0.1, step=0.1, label="Temperature"
                )
                top_p_ctl = gr.Slider(
                    0.1, 1.0, value=0.8, step=0.1, label="Top-P"
                )
                max_tokens_ctl = gr.Slider(
                    256, 4096, value=2048, step=256, label="Max Tokens"
                )
            extract_button = gr.Button("Extract Table", variant="primary")

        # Right column: the extracted table rendered as Markdown.
        with gr.Column():
            result_md = gr.Markdown(label="Detected Table")

    extract_button.click(
        fn=process_image,
        inputs=[image_input, prompt_box, temperature_ctl, top_p_ctl, max_tokens_ctl],
        outputs=result_md,
    )

if __name__ == "__main__":
    demo.launch()