# vinayaka-lifesigns — feat: Add application file (commit 349bb79)
import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
# Target model: MiniCPM-o 4.5 (9B parameter variant).
MODEL_ID = "openbmb/MiniCPM-o-4_5"

# Only the vision stack is initialized; the audio and TTS heads are skipped.
# trust_remote_code is required for MiniCPM's custom architecture.
_LOAD_KWARGS = dict(
    trust_remote_code=True,
    attn_implementation="sdpa",  # scaled-dot-product attention backend
    torch_dtype=torch.bfloat16,
    init_vision=True,
    init_audio=False,
    init_tts=False,
)

model = AutoModel.from_pretrained(MODEL_ID, **_LOAD_KWARGS).eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# The @spaces.GPU decorator handles GPU allocation on Hugging Face Spaces.
@spaces.GPU
def process_image(image, system_prompt, temperature, top_p, max_tokens):
    """Run MiniCPM-o on one image and return the generated text.

    Args:
        image: PIL image from the Gradio widget, or None when nothing
            was uploaded.
        system_prompt: Instruction text sent to the model alongside the image.
        temperature: Sampling temperature forwarded to ``model.chat``.
        top_p: Nucleus-sampling threshold forwarded to ``model.chat``.
        max_tokens: Cap on the number of newly generated tokens.

    Returns:
        The model's response string, or an ``"Error: ..."`` message on
        missing input or inference failure.
    """
    # PIL images define no __bool__, so a truthiness test could only ever
    # catch the missing-upload case — say that explicitly with `is None`.
    if image is None:
        return "Error: No image provided."
    # Convert to RGB to ensure compatibility with vision encoder (removes Alpha channel)
    image = image.convert("RGB")
    # Construct the message list expected by MiniCPM-o: a single user turn
    # whose content interleaves the image and the prompt text.
    msgs = [{"role": "user", "content": [image, system_prompt]}]
    try:
        # sampling=True enables temperature/top_p.
        # For strict OCR, lower temperature (0.1) is recommended.
        response = model.chat(
            image=None,  # image travels inside `msgs`, not this argument
            msgs=msgs,
            tokenizer=tokenizer,
            sampling=True,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_tokens,
        )
        return response
    except Exception as e:
        # Surface inference failures in the UI instead of crashing the Space.
        return f"Error: {str(e)}"
# Default prompt designed for dynamic table structure detection: the model is
# told to read the headers from the image instead of assuming a fixed schema,
# and to emit nothing but the reconstructed Markdown table.
DEFAULT_PROMPT = """Analyze this document image.
1. Visually identify the table headers and structure.
2. Transcribe the exact content into a Markdown table.
3. Rules:
- Use the headers visible in the image.
- Preserve row alignment strictly.
- Leave empty cells blank.
- Output ONLY the Markdown table."""
# ---- Gradio UI -------------------------------------------------------------
with gr.Blocks(title="Universal Medical OCR") as demo:
    gr.Markdown("## Universal Medical Report Digitizer")

    with gr.Row():
        # Left column: image upload plus the generation settings.
        with gr.Column():
            image_input = gr.Image(
                type="pil",
                label="Upload Report",
                sources=["upload", "clipboard"],
                height=450,
            )
            with gr.Accordion("Settings", open=True):
                prompt_box = gr.TextArea(
                    label="System Prompt",
                    value=DEFAULT_PROMPT,
                    lines=6,
                )
                temperature_ctl = gr.Slider(
                    0.1, 1.0, value=0.1, step=0.1, label="Temperature"
                )
                top_p_ctl = gr.Slider(
                    0.1, 1.0, value=0.8, step=0.1, label="Top-P"
                )
                max_tokens_ctl = gr.Slider(
                    256, 4096, value=2048, step=256, label="Max Tokens"
                )
            extract_button = gr.Button("Extract Table", variant="primary")

        # Right column: the extracted table rendered as Markdown.
        with gr.Column():
            result_md = gr.Markdown(label="Detected Table")

    extract_button.click(
        fn=process_image,
        inputs=[image_input, prompt_box, temperature_ctl, top_p_ctl, max_tokens_ctl],
        outputs=result_md,
    )

if __name__ == "__main__":
    demo.launch()