MeysamSh committed on
Commit
910595e
·
1 Parent(s): 7d9277c
Files changed (2) hide show
  1. app.py +290 -43
  2. requirements.txt +4 -1
app.py CHANGED
@@ -3,9 +3,17 @@ import numpy as np
3
  import librosa
4
  import xgboost as xgb
5
  import random
 
 
 
 
 
6
  from sklearn.preprocessing import StandardScaler
7
  from sklearn.pipeline import Pipeline
8
- import difflib
 
 
 
9
 
10
  # --- Constants ---
11
  SAMPLE_RATE = 16000
@@ -16,6 +24,30 @@ SILENCE_EMOJI = "_"
16
  MIN_SEC = 3.0
17
  MAX_SEC = 5.0
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  def generate_challenge():
20
  length = random.randint(3, 5)
21
  seq = []
@@ -170,47 +202,262 @@ def play_game(target_display, ref_audio, p1_0, p1_1, p1_s, p2_0, p2_1, p2_s):
170
  # πŸ† WINNER: <span style="color: #ff4b4b; font-size: 40px;">{winner}</span>
171
  """
172
 
173
- # --- Gradio UI ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
175
- gr.Markdown("# πŸŽ™οΈ The AI Sequence Battle")
176
-
177
- # Store the mission in a hidden state so we can still use it for scoring even when invisible
178
- hidden_target = gr.State("")
179
-
180
- with gr.Row():
181
- target_seq_ui = gr.Textbox(label="πŸ“’ Referee's Mission (Memorize this!)", interactive=False)
182
- refresh_btn = gr.Button("πŸ”„ New Mission")
183
-
184
- # On load and on refresh, update both the UI and the State
185
- demo.load(generate_challenge, outputs=[hidden_target, target_seq_ui])
186
- refresh_btn.click(generate_challenge, outputs=[hidden_target, target_seq_ui])
187
-
188
- with gr.Accordion("βš–οΈ Step 1: The Referee", open=True):
189
- ref_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record the Mission")
190
- # Trigger hiding when audio is recorded
191
- ref_audio.change(hide_mission, inputs=ref_audio, outputs=target_seq_ui)
192
-
193
- with gr.Row():
194
- with gr.Column():
195
- gr.Markdown("### πŸ‘€ Player 1 (3-5s samples)")
196
- p1_0 = gr.Audio(sources=["microphone"], type="filepath", label="Source 0")
197
- p1_1 = gr.Audio(sources=["microphone"], type="filepath", label="Source 1")
198
- p1_s = gr.Audio(sources=["microphone"], type="filepath", label="Silence 🀫")
199
- with gr.Column():
200
- gr.Markdown("### πŸ‘€ Player 2 (3-5s samples)")
201
- p2_0 = gr.Audio(sources=["microphone"], type="filepath", label="Source 0")
202
- p2_1 = gr.Audio(sources=["microphone"], type="filepath", label="Source 1")
203
- p2_s = gr.Audio(sources=["microphone"], type="filepath", label="Silence 🀫")
204
-
205
- btn_fight = gr.Button("πŸ”₯ REVEAL WINNER", variant="primary", size="lg")
206
-
207
- # Using Markdown for large, styled text results
208
- result_display = gr.Markdown("### Results will appear here after the battle!")
209
-
210
- btn_fight.click(
211
- play_game,
212
- inputs=[hidden_target, ref_audio, p1_0, p1_1, p1_s, p2_0, p2_1, p2_s],
213
- outputs=result_display
214
- )
215
 
216
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import librosa
4
  import xgboost as xgb
5
  import random
6
+ import subprocess
7
+ import tempfile
8
+ import os
9
+ import cv2
10
+ import difflib
11
  from sklearn.preprocessing import StandardScaler
12
  from sklearn.pipeline import Pipeline
13
+ import torch
14
+ import torchvision.transforms as T
15
+ import torchvision.models as models
16
+
17
 
18
  # --- Constants ---
19
  SAMPLE_RATE = 16000
 
24
  MIN_SEC = 3.0
25
  MAX_SEC = 5.0
26
 
27
# --- Lightweight pretrained visual backbone ---
# Everything runs on CPU; MobileNetV3-Small keeps per-frame inference cheap.
device = torch.device("cpu")

mobilenet = models.mobilenet_v3_small(
    weights=models.MobileNet_V3_Small_Weights.DEFAULT
)
mobilenet = mobilenet.features  # keep only the conv trunk; the classifier head is unused
mobilenet.eval()                # inference mode: fixes batch-norm/dropout behavior
mobilenet.to(device)

# Per-frame preprocessing: small input for speed, ImageNet mean/std
# normalization to match the pretrained weights.
video_transform = T.Compose([
    T.ToPILImage(),
    T.Resize((96, 96)),  # small input for speed
    T.ToTensor(),
    T.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])
48
+
49
+
50
+
51
  def generate_challenge():
52
  length = random.randint(3, 5)
53
  seq = []
 
202
  # πŸ† WINNER: <span style="color: #ff4b4b; font-size: 40px;">{winner}</span>
203
  """
204
 
205
+
206
+ # =========================================================
207
+ # VIDEO SECTION
208
+ # =========================================================
209
+
210
def ensure_readable_video(input_path):
    """Re-encode a video to H.264 MP4 so OpenCV can read it reliably.

    Browser recordings (WEBM/Opus) are not always decodable by
    cv2.VideoCapture; transcoding with ffmpeg sidesteps that.

    Args:
        input_path: Path to the uploaded/recorded video, or None.

    Returns:
        Path to a re-encoded temporary MP4 on success, the original
        path if ffmpeg is unavailable or fails (best effort), or None
        when no video was given.
    """
    if input_path is None:
        return None

    # Named temp file so ffmpeg can write to it by path.
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp_path = tmp.name
    tmp.close()

    cmd = [
        "ffmpeg",
        "-y",                    # overwrite the (empty) temp file
        "-i", input_path,
        "-an",                   # strip audio; only frames are needed
        "-vcodec", "libx264",
        "-preset", "ultrafast",
        tmp_path,
    ]

    try:
        # check=True: a failed transcode raises instead of silently
        # handing back an empty/partial MP4.
        subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        return tmp_path
    except (OSError, subprocess.SubprocessError):
        # ffmpeg missing (FileNotFoundError) or transcode failed:
        # remove the now-useless temp file and fall back to the input.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return input_path
234
+
235
+
236
def extract_video_features(video_path, max_frames=300):
    """Extract one MobileNet embedding per frame of a video.

    Each frame is converted BGR->RGB, resized/normalized by
    `video_transform`, and passed through the frozen MobileNetV3-Small
    trunk; the feature map is global-average-pooled to a flat vector.

    Args:
        video_path: Path to the video file, or None.
        max_frames: Hard cap on frames processed, to bound latency.

    Returns:
        (features, message): features is an (n_frames, feat_dim) array
        on success, or None on failure with a human-readable message.
    """
    if video_path is None:
        return None, "No video provided"

    # Transcode first: browser WEBM recordings often fail in OpenCV.
    video_path = ensure_readable_video(video_path)

    cap = cv2.VideoCapture(video_path)
    feats = []
    frame_count = 0

    while True:
        ret, frame = cap.read()
        if not ret or frame_count >= max_frames:
            break

        # OpenCV decodes BGR; the ImageNet transform expects RGB.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        tensor = video_transform(frame_rgb).unsqueeze(0).to(device)

        with torch.no_grad():
            feat_map = mobilenet(tensor)
            # Global average pool -> flat per-frame embedding.
            feat = torch.nn.functional.adaptive_avg_pool2d(feat_map, 1)
            feat = feat.view(-1).cpu().numpy()

        feats.append(feat)
        frame_count += 1

    cap.release()

    if len(feats) == 0:
        return None, "No frames extracted"

    return np.array(feats), "OK"
281
+
282
+
283
def train_video_model(v0, v1, v_bg):
    """Fit a 3-class XGBoost pipeline on per-frame video features.

    Classes: 0 -> first symbol, 1 -> second symbol, 2 -> background.

    Returns:
        (model, "OK") on success, or (None, error_message) when any
        of the three training videos fails feature extraction.
    """
    extractions = [
        ("Class 0", extract_video_features(v0)),
        ("Class 1", extract_video_features(v1)),
        ("Background", extract_video_features(v_bg)),
    ]

    # Bail out with a labeled error if any sample video was unusable.
    for class_name, (features, message) in extractions:
        if features is None:
            return None, f"{class_name} error: {message}"

    frames_0 = extractions[0][1][0]
    frames_1 = extractions[1][1][0]
    frames_bg = extractions[2][1][0]

    # Stack all frames; label each row by its source video's class.
    feature_matrix = np.vstack([frames_0, frames_1, frames_bg])
    labels = np.concatenate([
        np.zeros(len(frames_0)),
        np.ones(len(frames_1)),
        np.full(len(frames_bg), 2),
    ])

    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", xgb.XGBClassifier(
            n_estimators=50,
            max_depth=3,
            objective='multi:softprob',
            num_class=3
        )),
    ])
    pipeline.fit(feature_matrix, labels)
    return pipeline, "OK"
311
+
312
+
313
def decode_video_sequence(model, video_path):
    """Classify each frame of a video and decode the emoji sequence.

    Args:
        model: Fitted pipeline returned by train_video_model.
        video_path: Path to the video to decode.

    Returns:
        The post-processed emoji string, or an "Error: ..." message if
        feature extraction fails.
    """
    X, msg = extract_video_features(video_path)
    if X is None:
        return f"Error: {msg}"

    preds = model.predict(X)
    # NOTE: removed a leftover `import pdb; pdb.set_trace()` here — it
    # froze every decode waiting on an interactive debugger.
    return post_process_to_emoji(preds, window_ms=100)
321
+
322
+
323
def run_video_decoder(v0, v1, v_bg, test_video):
    """Train on the three sample videos, then decode the test video.

    Returns a Markdown string with either the decoded sequence or the
    training error.
    """
    trained_model, status = train_video_model(v0, v1, v_bg)
    if trained_model is None:
        return f"❌ {status}"

    decoded = decode_video_sequence(trained_model, test_video)
    return f"### 🎬 Decoded Sequence: `{decoded}`"
330
+
331
+
332
+ # =========================================================
333
+ # GRADIO UI WITH DUAL TABS
334
+ # =========================================================
335
+
336
# App layout: two independent tabs sharing the helpers defined above.
# (The superseded single-page UI that used to be commented out below
# has been deleted — it was dead code duplicating Tab 1.)
with gr.Blocks(theme=gr.themes.Soft()) as demo:

    with gr.Tabs():

        # =====================================
        # TAB 1 — AUDIO GAME (existing)
        # =====================================
        with gr.Tab("🎙️ Audio Sequence Battle"):

            # Mission kept in State so scoring still works after the
            # visible textbox is hidden.
            hidden_target = gr.State("")

            with gr.Row():
                target_seq_ui = gr.Textbox(
                    label="📢 Referee's Mission",
                    interactive=False
                )
                refresh_btn = gr.Button("🔄 New Mission")

            # On load and on refresh, update both the UI and the State.
            demo.load(generate_challenge, outputs=[hidden_target, target_seq_ui])
            refresh_btn.click(generate_challenge, outputs=[hidden_target, target_seq_ui])

            with gr.Accordion("⚖️ Step 1: The Referee", open=True):
                ref_audio = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Record the Mission"
                )
                # Hide the mission text once the referee has recorded it.
                ref_audio.change(hide_mission, inputs=ref_audio, outputs=target_seq_ui)

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 👤 Player 1")
                    p1_0 = gr.Audio(sources=["microphone"], type="filepath", label="Source 0")
                    p1_1 = gr.Audio(sources=["microphone"], type="filepath", label="Source 1")
                    p1_s = gr.Audio(sources=["microphone"], type="filepath", label="Silence")

                with gr.Column():
                    gr.Markdown("### 👤 Player 2")
                    p2_0 = gr.Audio(sources=["microphone"], type="filepath", label="Source 0")
                    p2_1 = gr.Audio(sources=["microphone"], type="filepath", label="Source 1")
                    p2_s = gr.Audio(sources=["microphone"], type="filepath", label="Silence")

            btn_fight = gr.Button("🔥 REVEAL WINNER", variant="primary")
            result_display = gr.Markdown("### Results will appear here")

            btn_fight.click(
                play_game,
                inputs=[hidden_target, ref_audio, p1_0, p1_1, p1_s, p2_0, p2_1, p2_s],
                outputs=result_display
            )

        # =====================================
        # TAB 2 — VIDEO DECODER
        # =====================================
        with gr.Tab("🎬 Video Frame Decoder"):

            gr.Markdown("## Train video symbols and decode frame-level sequence")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Training Samples")
                    v0 = gr.Video(label="Class 0 video", format="mp4")
                    v1 = gr.Video(label="Class 1 video", format="mp4")
                    v_bg = gr.Video(label="Background video", format="mp4")

                with gr.Column():
                    gr.Markdown("### Test Video")
                    test_video = gr.Video(label="Video to decode", format="mp4")

            decode_btn = gr.Button("🎬 Decode Video", variant="primary")
            video_result = gr.Markdown("### Decoded result will appear here")

            decode_btn.click(
                run_video_decoder,
                inputs=[v0, v1, v_bg, test_video],
                outputs=video_result
            )

demo.launch()
requirements.txt CHANGED
@@ -3,4 +3,7 @@ numpy
3
  librosa
4
  scikit-learn
5
  soundfile
6
- xgboost
 
 
 
 
3
  librosa
4
  scikit-learn
5
  soundfile
6
+ xgboost
7
+ opencv-python
8
+ torch
9
+ torchvision