ImedHa commited on
Commit
02bb97a
·
verified ·
1 Parent(s): 505fb2e

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +128 -0
  2. requirements.txt +6 -3
app.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
import os
import sys
from contextlib import redirect_stdout
from io import StringIO

import streamlit as st
import torch
from PIL import Image
8
+
9
# --- TorchDynamo Fix for Unsloth/MedGemma ---
# Allow Dynamo to capture scalar outputs instead of graph-breaking on them.
import torch._dynamo
torch._dynamo.config.capture_scalar_outputs = True
# NOTE(review): torch.compiler.disable() called without a function argument
# returns a decorator rather than disabling compilation globally — confirm
# this bare call actually has the intended effect.
torch.compiler.disable()

# --- Dependency Handling ---
# The heavyweight ML dependencies are imported inside a guard so a missing
# install surfaces as a visible Streamlit error instead of a raw traceback;
# st.stop() halts script execution for this run.
try:
    from unsloth import FastVisionModel
    from transformers import TextStreamer
except ImportError as e:
    st.error(f"A required library is not installed. Please install dependencies. Error: {e}")
    st.stop()
21
+
22
@st.cache_resource
def load_medgemma_model():
    """Load the fine-tuned MedGemma vision-language model and its processor.

    Decorated with ``st.cache_resource`` so the weights are loaded at most
    once per server process.

    Returns:
        A ``(model, processor)`` pair on success, or ``(None, None)`` after
        reporting the failure through ``st.error``.
    """
    try:
        model, processor = FastVisionModel.from_pretrained(
            "fiqqy/MedGemma-MM-OR-FT10",
            load_in_4bit=False,
            use_gradient_checkpointing="unsloth",
        )
    except Exception as exc:
        # Surface the failure in the UI rather than crashing the app.
        st.error(f"Error loading MedGemma model: {exc}")
        return None, None
    return model, processor
35
+
36
def run_captioning(medgemma_model, processor, frames, instruction):
    """Run MedGemma inference on a sequence of frames plus a text instruction.

    Args:
        medgemma_model: Model returned by ``load_medgemma_model``.
        processor: Matching processor (chat templating + tensorization).
        frames: PIL images forming the visual context (the UI supplies three,
            but any non-empty sequence is handled).
        instruction: Free-text prompt appended after the image placeholders.

    Returns:
        The generated text, captured from the ``TextStreamer`` stdout stream.
    """
    st.write("Preparing inputs for MedGemma...")
    images = [f.convert("RGB") for f in frames]
    # One {"type": "image"} placeholder per frame (generalized from the
    # original hard-coded three), followed by the instruction text.
    content = [{"type": "image"} for _ in images]
    content.append({"type": "text", "text": instruction})
    messages = [{"role": "user", "content": content}]
    input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = processor(
        images, input_text, add_special_tokens=False, return_tensors="pt",
    ).to(device)

    # TextStreamer prints tokens to stdout as they are generated; capture
    # that stream so the result can be shown in the Streamlit UI.
    text_streamer = TextStreamer(processor, skip_prompt=True)
    captured_output = StringIO()

    st.write("Running MedGemma Analysis...")
    # NOTE(review): a bare torch._dynamo.disable() call returns a decorator
    # and may have no global effect — kept for parity with the original.
    torch._dynamo.disable()
    # BUGFIX: the original swapped sys.stdout manually and never restored it
    # when generate() raised, leaving stdout broken for the whole process.
    # redirect_stdout guarantees restoration on every exit path.
    with redirect_stdout(captured_output):
        medgemma_model.generate(
            **inputs, streamer=text_streamer, max_new_tokens=768,
            use_cache=True, temperature=1.0, top_p=0.95, top_k=64
        )
    return captured_output.getvalue()
66
+
67
def show():
    """Render the Streamlit UI for the MedGemma scene-analysis demo."""
    st.title("MedGemma Scene Analysis System")
    st.write("A system to test MedGemma vision-language captioning model.")

    # --- Section 1: model loading, kept in session state across reruns ---
    st.header("1. Load MedGemma Model")
    if "medgemma_model" not in st.session_state:
        st.session_state.medgemma_model = None
        st.session_state.processor = None
    if st.button("Load MedGemma Model"):
        with st.spinner("Loading MedGemma... This can take several minutes."):
            model, proc = load_medgemma_model()
            st.session_state.medgemma_model = model
            st.session_state.processor = proc

    if st.session_state.get("medgemma_model") and st.session_state.get("processor"):
        st.success("MedGemma model is loaded.")
    else:
        st.warning("MedGemma model is not loaded.")

    # --- Section 2: upload the three sequential frames ---
    st.header("2. Upload Data")
    st.subheader("Upload Three Sequential Surgical Video Frames")
    upload_cols = st.columns(3)
    uploads = [
        col.file_uploader(
            f"Upload Frame {idx}", type=["png", "jpg", "jpeg"], key=f"frame{idx}"
        )
        for idx, col in enumerate(upload_cols, start=1)
    ]
    frames = [Image.open(upload) for upload in uploads if upload is not None]

    display_size = (256, 256)
    if len(frames) == 3:
        st.success("All three frames have been uploaded successfully.")
        preview_cols = st.columns(3)
        for col, (idx, frame) in zip(preview_cols, enumerate(frames, start=1)):
            col.image(
                frame.resize(display_size),
                caption=f"Frame {idx}",
                use_container_width=True,
            )
    else:
        st.info("Please upload all three frames to proceed.")

    # --- Section 3: run the captioning model ---
    st.header("3. Generate Scene Analysis")
    instruction_prompt = st.text_area(
        "Enter your custom instruction prompt:",
        "Provide a detailed summary of the surgical action, noting the instruments used and their interactions."
    )

    # Gate the Run button on: model loaded, all frames present, non-empty prompt.
    can_run_analysis = (
        st.session_state.get("medgemma_model") is not None
        and len(frames) == 3
        and bool(instruction_prompt)
    )

    if st.button("Run Analysis", disabled=not can_run_analysis):
        with st.spinner("Running MedGemma analysis... This may take a moment."):
            result = run_captioning(
                st.session_state.medgemma_model,
                st.session_state.processor,
                frames,
                instruction_prompt,
            )
            st.subheader("Analysis Result")
            st.write(result)

    if not can_run_analysis:
        st.warning("Please ensure the MedGemma model is loaded, three frames are uploaded, and a prompt is provided.")


if __name__ == "__main__":
    show()
requirements.txt CHANGED
@@ -1,3 +1,6 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
1
+ # Only MedGemma dependencies required
2
+ streamlit
3
+ Pillow
4
+ torch
5
+ unsloth
6
+ transformers