tenet committed on
Commit
89215aa
·
verified ·
1 Parent(s): ce470a2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -79
app.py CHANGED
@@ -1,93 +1,70 @@
1
  import gradio as gr
2
  from transformers import pipeline
3
- import librosa
4
- # ----------------
5
- # TEXT MODELS
6
- # ----------------
7
- text_models = {
8
- "TinyBERT (Fill Mask)": pipeline("fill-mask", model="prajjwal1/bert-tiny"),
9
- "DistilBERT (Fill Mask)": pipeline("fill-mask", model="distilbert-base-uncased"),
10
- "ALBERT (Fill Mask)": pipeline("fill-mask", model="albert-base-v2"),
11
- "MobileBERT (Fill Mask)": pipeline("fill-mask", model="google/mobilebert-uncased"),
12
- "GPT-2 (Text Generation)": pipeline("text-generation", model="gpt2")
13
- }
14
-
15
- def run_text_model(model_name, text):
16
- pipe = text_models[model_name]
17
-
18
- if "GPT-2" in model_name:
19
- output = pipe(text, max_length=50, do_sample=True, top_k=50, temperature=0.7)
20
- return output[0]["generated_text"]
21
-
22
- else:
23
- if "[MASK]" not in text:
24
- text = text.strip()
25
- if not text.endswith("."):
26
- text += "."
27
- text = text[:-1] + " [MASK]."
28
-
29
- preds = pipe(text, top_k=5)
30
- formatted = "\n".join(
31
- [f"{p['token_str']} (prob={p['score']:.4f})" for p in preds]
32
- )
33
- return f"Input: {text}\n\nPredictions:\n{formatted}"
34
-
35
-
36
- # ----------------
37
- # IMAGE SEGMENTATION
38
- # ----------------
39
- segmentation_pipeline = pipeline(
40
- "image-segmentation", model="nvidia/segformer-b0-finetuned-ade-512-512"
41
- )
42
-
43
- def segment_image(image):
44
  results = segmentation_pipeline(image)
45
- # Combine masks into a single image with labels
46
- annotated = {}
47
- for r in results:
48
- annotated[r["label"]] = r["mask"] # label → mask
49
- return (image, annotated)
50
 
 
 
 
 
 
 
51
 
52
- # ----------------
53
- # SPEECH RECOGNITION
54
- # ----------------
55
- asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
56
 
 
 
57
 
 
 
58
 
59
- def transcribe(audio):
60
- # Load with max 30s duration
61
- speech, sr = librosa.load(audio, sr=16000, duration=30)
62
- return asr_pipeline({"array": speech, "sampling_rate": sr}, return_timestamps=True)["text"]
63
 
64
- # ----------------
65
- # GRADIO APP
66
- # ----------------
67
  with gr.Blocks() as demo:
68
- gr.Markdown("# 🔥 Multi-Modal Playground\n"
69
- "Try **Tiny LLMs, Image Segmentation, and Speech Models** all in one app!\n\n")
70
 
71
- # TEXT TAB
72
- with gr.Tab("Text Models"):
73
- model_choice = gr.Dropdown(list(text_models.keys()), label="Choose Model")
74
- text_input = gr.Textbox(label="Enter text or prompt")
75
- text_output = gr.Textbox(label="Output", lines=8)
76
- run_btn = gr.Button("Run")
77
- run_btn.click(fn=run_text_model, inputs=[model_choice, text_input], outputs=text_output)
78
 
79
- # IMAGE TAB
80
  with gr.Tab("Image Segmentation"):
81
- img_in = gr.Image(type="pil", label="Upload an Image")
82
- img_out = gr.AnnotatedImage(label="Segmented Output")
83
- seg_btn = gr.Button("Segment Objects")
84
- seg_btn.click(fn=segment_image, inputs=img_in, outputs=img_out)
85
-
86
- # AUDIO TAB
87
- with gr.Tab("Speech Recognition"):
88
- audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Upload or record audio")
89
- audio_out = gr.Textbox(label="Transcription")
90
- asr_btn = gr.Button("Transcribe")
91
- asr_btn.click(fn=transcribe, inputs=audio_in, outputs=audio_out)
92
 
93
- demo.launch()
 
1
  import gradio as gr
2
  from transformers import pipeline
3
+ from PIL import Image
4
+ import numpy as np
5
+ import random
6
+
7
+ # ----------------------------
8
+ # Load Pipelines
9
+ # ----------------------------
10
+ # Speech recognition (Whisper tiny or small recommended for edge use)
11
+ asr_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=-1)
12
+
13
+ # Image segmentation (SAM, DETR, or similar)
14
+ segmentation_pipeline = pipeline("image-segmentation", model="facebook/detr-resnet-50-panoptic", device=-1)
15
+
16
+ # ----------------------------
17
+ # Speech Transcription Function
18
+ # ----------------------------
19
def transcribe(audio):
    """Transcribe an audio file to text with the ASR pipeline.

    Args:
        audio: Path to the audio file, as supplied by the ``gr.Audio``
            component (``type="filepath"``). ``None`` or empty when the
            user has not recorded or uploaded anything.

    Returns:
        The transcribed text, or a string starting with ``"Error: "`` when
        no audio was supplied or the pipeline raised.
    """
    # Gradio passes None when the button is clicked with no audio; answer
    # with a clear message instead of a raw pipeline exception string.
    if not audio:
        return "Error: no audio provided. Record or upload a clip first."

    try:
        # Timestamps are always requested: Whisper requires them for
        # long-form (> 30 s) input, and they are harmless for short clips.
        result = asr_pipeline(audio, return_timestamps=True)
    except Exception as e:  # UI boundary: show the failure in the textbox
        return f"Error: {e}"
    return result["text"]
26
+
27
+ # ----------------------------
28
+ # Segmentation Function
29
+ # ----------------------------
30
def segment_image(image: Image.Image):
    """Segment an image and return it in ``gr.AnnotatedImage`` format.

    Args:
        image: Input PIL image from the ``gr.Image`` component; ``None``
            when nothing was uploaded.

    Returns:
        ``(overlay_img, annotations)`` where ``overlay_img`` is the RGB
        input tinted with one random colour per segment and ``annotations``
        is a list of ``(mask, label)`` pairs, each mask a float32 array
        holding 1.0 inside the segment and 0.0 elsewhere. Returns ``None``
        when no image was given.
    """
    # Nothing uploaded: leave the output component empty instead of crashing.
    if image is None:
        return None

    # Force 3-channel RGB so the colour blend below also works for RGBA,
    # greyscale, or palette uploads.
    image = image.convert("RGB")
    results = segmentation_pipeline(image)

    overlay = np.array(image)  # np.array copies, so the input is untouched
    annotations = []
    for r in results:
        # The pipeline returns each mask as a PIL image; non-zero = segment.
        selected = np.array(r["mask"]) > 0

        # Random colour per mask, blended as a semi-transparent tint.
        color = np.array([random.randint(0, 255) for _ in range(3)])
        overlay[selected] = (0.6 * overlay[selected] + 0.4 * color).astype(np.uint8)

        # AnnotatedImage takes masks as numpy arrays with values in [0, 1]
        # (or bounding-box tuples), not PIL images.
        annotations.append((selected.astype(np.float32), r["label"]))

    return (Image.fromarray(overlay), annotations)
 
 
51
 
52
+ # ----------------------------
53
+ # Gradio UI
54
+ # ----------------------------
55
# Two-tab Gradio layout: speech transcription and image segmentation.
with gr.Blocks() as demo:
    gr.Markdown("# 🧩 Multimodal Playground\nSpeech + Image Segmentation")

    # Tab 1: microphone recording or uploaded audio file -> transcript.
    with gr.Tab("Speech to Text"):
        audio_in = gr.Audio(sources=["microphone", "upload"], type="filepath")
        txt_out = gr.Textbox(label="Transcription")
        btn1 = gr.Button("Transcribe")
        btn1.click(fn=transcribe, inputs=audio_in, outputs=txt_out)

    # Tab 2: uploaded image -> annotated segmentation view.
    with gr.Tab("Image Segmentation"):
        img_in = gr.Image(type="pil")
        img_out = gr.AnnotatedImage(label="Segmentation")
        btn2 = gr.Button("Segment")
        btn2.click(fn=segment_image, inputs=img_in, outputs=img_out)
 
 
 
 
 
 
 
69
 
70
+ demo.launch(server_name="0.0.0.0", server_port=7860)