Update app.py
app.py CHANGED
@@ -1,93 +1,70 @@
 import gradio as gr
 from transformers import pipeline
-import librosa
-…
-    formatted = "\n".join(
-        [f"{p['token_str']} (prob={p['score']:.4f})" for p in preds]
-    )
-    return f"Input: {text}\n\nPredictions:\n{formatted}"
-
-# ----------------
-# IMAGE SEGMENTATION
-# ----------------
-segmentation_pipeline = pipeline(
-    "image-segmentation", model="nvidia/segformer-b0-finetuned-ade-512-512"
-)
-
-def segment_image(image):
     results = segmentation_pipeline(image)
-    # Combine masks into a single image with labels
-    annotated = {}
-    for r in results:
-        annotated[r["label"]] = r["mask"]  # label → mask
-    return (image, annotated)
-
-# …
-# ----------------
-asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
-
-def transcribe(audio):
-    speech, sr = librosa.load(audio, sr=16000, duration=30)
-    return asr_pipeline({"array": speech, "sampling_rate": sr}, return_timestamps=True)["text"]
-
-# ----------------
-# …
-# ----------------
 with gr.Blocks() as demo:
-    gr.Markdown("# …"
-                "Try **Tiny LLMs, Image Segmentation, and Speech Models** all in one app!\n\n")
-…
-        run_btn = gr.Button("Run")
-        run_btn.click(fn=run_text_model, inputs=[model_choice, text_input], outputs=text_output)
-
-    # IMAGE TAB
     with gr.Tab("Image Segmentation"):
-        img_in = gr.Image(type="pil")
-        img_out = gr.AnnotatedImage(label="…")
-…
-    # AUDIO TAB
-    with gr.Tab("Speech Recognition"):
-        audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload or record audio")
-        audio_out = gr.Textbox(label="Transcription")
-        asr_btn = gr.Button("Transcribe")
-        asr_btn.click(fn=transcribe, inputs=audio_in, outputs=audio_out)
-
-demo.launch()
 import gradio as gr
 from transformers import pipeline
+from PIL import Image
+import numpy as np
+import random
+
+# ----------------------------
+# Load Pipelines
+# ----------------------------
+# Speech recognition (Whisper tiny or small recommended for edge use)
+asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=-1)
+
+# Image segmentation (SAM, DETR, or similar)
+segmentation_pipeline = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic", device=-1)
+
+# ----------------------------
+# Speech Transcription Function
+# ----------------------------
+def transcribe(audio):
+    # return_timestamps lets Whisper transcribe inputs longer than 30 s in full
+    try:
+        result = asr_pipeline(audio, return_timestamps=True)
+    except Exception as e:
+        return f"Error: {str(e)}"
+    return result["text"]
+
+# ----------------------------
+# Segmentation Function
+# ----------------------------
+def segment_image(image: Image.Image):
     results = segmentation_pipeline(image)
 
+    # Build a color overlay: one random color per detected object
+    overlay = np.array(image).copy()
+    annotations = []
+    for r in results:
+        mask = np.array(r["mask"])  # mask is a PIL image
+        label = r["label"]
+
+        # Random color per mask
+        color = [random.randint(0, 255) for _ in range(3)]
+
+        # Apply semi-transparent overlay
+        overlay[mask > 0] = (0.6 * overlay[mask > 0] + 0.4 * np.array(color)).astype(np.uint8)
+
+        # Store mask + label for Gradio
+        annotations.append((r["mask"], label))
+
+    overlay_img = Image.fromarray(overlay)
+    return (overlay_img, annotations)
+
+# ----------------------------
+# Gradio UI
+# ----------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("# 🧩 Multimodal Playground\nSpeech + Image Segmentation")
+
+    with gr.Tab("Speech to Text"):
+        audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath")
+        txt_out = gr.Textbox(label="Transcription")
+        btn1 = gr.Button("Transcribe")
+        btn1.click(transcribe, inputs=audio_in, outputs=txt_out)
+
     with gr.Tab("Image Segmentation"):
+        img_in = gr.Image(type="pil")
+        img_out = gr.AnnotatedImage(label="Segmentation")
+        btn2 = gr.Button("Segment")
+        btn2.click(segment_image, inputs=img_in, outputs=img_out)
+
+demo.launch(server_name="0.0.0.0", server_port=7860)
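
A quick way to sanity-check the two handlers without opening the browser UI is to call them directly. This is a minimal sketch, assuming the file is saved as app.py, that demo.launch() is moved behind an `if __name__ == "__main__":` guard (otherwise importing the module starts the server), and that sample.wav and sample.jpg stand in for your own test files:

# smoke_test.py -- hypothetical helper, not part of this commit.
# Assumes app.py guards demo.launch() behind `if __name__ == "__main__":`.
from PIL import Image

from app import transcribe, segment_image

# Placeholder inputs; substitute any short audio clip and RGB image.
print(transcribe("sample.wav"))

image = Image.open("sample.jpg").convert("RGB")
overlay, annotations = segment_image(image)
print("Detected:", [label for _, label in annotations])
overlay.save("overlay.png")  # color-coded segmentation overlay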
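With return_timestamps=True, Whisper decodes past its 30-second window sequentially; for long recordings, the transformers ASR pipeline also supports chunked inference via its chunk_length_s argument, which is usually faster. A possible variant (transcribe_long is a hypothetical name; batch_size=4 is an illustrative value, not something from this commit):

def transcribe_long(audio):
    # Chunked long-form decoding: the pipeline splits the audio into 30 s
    # windows and stitches the partial transcripts back together.
    # batch_size=4 is illustrative -- tune it to the available hardware.
    try:
        result = asr_pipeline(
            audio,
            chunk_length_s=30,
            batch_size=4,
            return_timestamps=True,
        )
    except Exception as e:
        return f"Error: {str(e)}"
    return result["text"]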