MeysamSh committed on
Commit
910595e
·
1 Parent(s): 7d9277c
Files changed (2) hide show
  1. app.py +290 -43
  2. requirements.txt +4 -1
app.py CHANGED
@@ -3,9 +3,17 @@ import numpy as np
3
  import librosa
4
  import xgboost as xgb
5
  import random
 
 
 
 
 
6
  from sklearn.preprocessing import StandardScaler
7
  from sklearn.pipeline import Pipeline
8
- import difflib
 
 
 
9
 
10
  # --- Constants ---
11
  SAMPLE_RATE = 16000
@@ -16,6 +24,30 @@ SILENCE_EMOJI = "_"
16
  MIN_SEC = 3.0
17
  MAX_SEC = 5.0
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  def generate_challenge():
20
  length = random.randint(3, 5)
21
  seq = []
@@ -170,47 +202,262 @@ def play_game(target_display, ref_audio, p1_0, p1_1, p1_s, p2_0, p2_1, p2_s):
170
  # πŸ† WINNER: <span style="color: #ff4b4b; font-size: 40px;">{winner}</span>
171
  """
172
 
173
- # --- Gradio UI ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
175
- gr.Markdown("# πŸŽ™οΈ The AI Sequence Battle")
176
-
177
- # Store the mission in a hidden state so we can still use it for scoring even when invisible
178
- hidden_target = gr.State("")
179
-
180
- with gr.Row():
181
- target_seq_ui = gr.Textbox(label="πŸ“’ Referee's Mission (Memorize this!)", interactive=False)
182
- refresh_btn = gr.Button("πŸ”„ New Mission")
183
-
184
- # On load and on refresh, update both the UI and the State
185
- demo.load(generate_challenge, outputs=[hidden_target, target_seq_ui])
186
- refresh_btn.click(generate_challenge, outputs=[hidden_target, target_seq_ui])
187
-
188
- with gr.Accordion("βš–οΈ Step 1: The Referee", open=True):
189
- ref_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record the Mission")
190
- # Trigger hiding when audio is recorded
191
- ref_audio.change(hide_mission, inputs=ref_audio, outputs=target_seq_ui)
192
-
193
- with gr.Row():
194
- with gr.Column():
195
- gr.Markdown("### πŸ‘€ Player 1 (3-5s samples)")
196
- p1_0 = gr.Audio(sources=["microphone"], type="filepath", label="Source 0")
197
- p1_1 = gr.Audio(sources=["microphone"], type="filepath", label="Source 1")
198
- p1_s = gr.Audio(sources=["microphone"], type="filepath", label="Silence 🀫")
199
- with gr.Column():
200
- gr.Markdown("### πŸ‘€ Player 2 (3-5s samples)")
201
- p2_0 = gr.Audio(sources=["microphone"], type="filepath", label="Source 0")
202
- p2_1 = gr.Audio(sources=["microphone"], type="filepath", label="Source 1")
203
- p2_s = gr.Audio(sources=["microphone"], type="filepath", label="Silence 🀫")
204
-
205
- btn_fight = gr.Button("πŸ”₯ REVEAL WINNER", variant="primary", size="lg")
206
-
207
- # Using Markdown for large, styled text results
208
- result_display = gr.Markdown("### Results will appear here after the battle!")
209
-
210
- btn_fight.click(
211
- play_game,
212
- inputs=[hidden_target, ref_audio, p1_0, p1_1, p1_s, p2_0, p2_1, p2_s],
213
- outputs=result_display
214
- )
215
 
216
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import librosa
4
  import xgboost as xgb
5
  import random
6
+ import subprocess
7
+ import tempfile
8
+ import os
9
+ import cv2
10
+ import difflib
11
  from sklearn.preprocessing import StandardScaler
12
  from sklearn.pipeline import Pipeline
13
+ import torch
14
+ import torchvision.transforms as T
15
+ import torchvision.models as models
16
+
17
 
18
  # --- Constants ---
19
  SAMPLE_RATE = 16000
 
24
  MIN_SEC = 3.0
25
  MAX_SEC = 5.0
26
 
27
# --- Lightweight pretrained visual backbone ---
# Everything runs on CPU; MobileNetV3-Small keeps per-frame inference cheap.
device = torch.device("cpu")

mobilenet = models.mobilenet_v3_small(
    weights=models.MobileNet_V3_Small_Weights.DEFAULT
)
mobilenet = mobilenet.features  # keep only the conv trunk; the classifier head is unused
mobilenet.eval()                # inference mode: fixes batch-norm/dropout behavior
mobilenet.to(device)

# Per-frame preprocessing: small input for speed, ImageNet mean/std
# normalization to match the pretrained weights.
video_transform = T.Compose([
    T.ToPILImage(),
    T.Resize((96, 96)),  # small input for speed
    T.ToTensor(),
    T.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])
48
+
49
+
50
+
51
  def generate_challenge():
52
  length = random.randint(3, 5)
53
  seq = []
 
202
  # πŸ† WINNER: <span style="color: #ff4b4b; font-size: 40px;">{winner}</span>
203
  """
204
 
205
+
206
+ # =========================================================
207
+ # VIDEO SECTION
208
+ # =========================================================
209
+
210
def ensure_readable_video(input_path):
    """Re-encode a video to H.264 MP4 so OpenCV can read it reliably.

    Browser recordings (WEBM/Opus) are not always decodable by
    cv2.VideoCapture; transcoding with ffmpeg sidesteps that.

    Args:
        input_path: Path to the uploaded/recorded video, or None.

    Returns:
        Path to a re-encoded temporary MP4 on success, the original
        path if ffmpeg is unavailable or fails (best effort), or None
        when no video was given.
    """
    if input_path is None:
        return None

    # Named temp file so ffmpeg can write to it by path.
    tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    tmp_path = tmp.name
    tmp.close()

    cmd = [
        "ffmpeg",
        "-y",                    # overwrite the (empty) temp file
        "-i", input_path,
        "-an",                   # strip audio; only frames are needed
        "-vcodec", "libx264",
        "-preset", "ultrafast",
        tmp_path,
    ]

    try:
        # check=True: a failed transcode raises instead of silently
        # handing back an empty/partial MP4.
        subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        return tmp_path
    except (OSError, subprocess.SubprocessError):
        # ffmpeg missing (FileNotFoundError) or transcode failed:
        # remove the now-useless temp file and fall back to the input.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return input_path
234
+
235
+
236
def extract_video_features(video_path, max_frames=300):
    """Extract one MobileNet embedding per frame of a video.

    Each frame is converted BGR->RGB, resized/normalized by
    `video_transform`, and passed through the frozen MobileNetV3-Small
    trunk; the feature map is global-average-pooled to a flat vector.

    Args:
        video_path: Path to the video file, or None.
        max_frames: Hard cap on frames processed, to bound latency.

    Returns:
        (features, message): features is an (n_frames, feat_dim) array
        on success, or None on failure with a human-readable message.
    """
    if video_path is None:
        return None, "No video provided"

    # Transcode first: browser WEBM recordings often fail in OpenCV.
    video_path = ensure_readable_video(video_path)

    cap = cv2.VideoCapture(video_path)
    feats = []
    frame_count = 0

    while True:
        ret, frame = cap.read()
        if not ret or frame_count >= max_frames:
            break

        # OpenCV decodes BGR; the ImageNet transform expects RGB.
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        tensor = video_transform(frame_rgb).unsqueeze(0).to(device)

        with torch.no_grad():
            feat_map = mobilenet(tensor)
            # Global average pool -> flat per-frame embedding.
            feat = torch.nn.functional.adaptive_avg_pool2d(feat_map, 1)
            feat = feat.view(-1).cpu().numpy()

        feats.append(feat)
        frame_count += 1

    cap.release()

    if len(feats) == 0:
        return None, "No frames extracted"

    return np.array(feats), "OK"
281
+
282
+
283
def train_video_model(v0, v1, v_bg):
    """Fit a 3-class XGBoost pipeline on per-frame video features.

    Classes: 0 -> first symbol, 1 -> second symbol, 2 -> background.

    Returns:
        (model, "OK") on success, or (None, error_message) when any
        of the three training videos fails feature extraction.
    """
    extractions = [
        ("Class 0", extract_video_features(v0)),
        ("Class 1", extract_video_features(v1)),
        ("Background", extract_video_features(v_bg)),
    ]

    # Bail out with a labeled error if any sample video was unusable.
    for class_name, (features, message) in extractions:
        if features is None:
            return None, f"{class_name} error: {message}"

    frames_0 = extractions[0][1][0]
    frames_1 = extractions[1][1][0]
    frames_bg = extractions[2][1][0]

    # Stack all frames; label each row by its source video's class.
    feature_matrix = np.vstack([frames_0, frames_1, frames_bg])
    labels = np.concatenate([
        np.zeros(len(frames_0)),
        np.ones(len(frames_1)),
        np.full(len(frames_bg), 2),
    ])

    pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", xgb.XGBClassifier(
            n_estimators=50,
            max_depth=3,
            objective='multi:softprob',
            num_class=3
        )),
    ])
    pipeline.fit(feature_matrix, labels)
    return pipeline, "OK"
311
+
312
+
313
def decode_video_sequence(model, video_path):
    """Classify each frame of a video and decode the emoji sequence.

    Args:
        model: Fitted pipeline returned by train_video_model.
        video_path: Path to the video to decode.

    Returns:
        The post-processed emoji string, or an "Error: ..." message if
        feature extraction fails.
    """
    X, msg = extract_video_features(video_path)
    if X is None:
        return f"Error: {msg}"

    preds = model.predict(X)
    # NOTE: removed a leftover `import pdb; pdb.set_trace()` here — it
    # froze every decode waiting on an interactive debugger.
    return post_process_to_emoji(preds, window_ms=100)
321
+
322
+
323
def run_video_decoder(v0, v1, v_bg, test_video):
    """Train on the three sample videos, then decode the test video.

    Returns a Markdown string with either the decoded sequence or the
    training error.
    """
    trained_model, status = train_video_model(v0, v1, v_bg)
    if trained_model is None:
        return f"❌ {status}"

    decoded = decode_video_sequence(trained_model, test_video)
    return f"### 🎬 Decoded Sequence: `{decoded}`"
330
+
331
+
332
+ # =========================================================
333
+ # GRADIO UI WITH DUAL TABS
334
+ # =========================================================
335
+
336
# App layout: two independent tabs sharing the helpers defined above.
# (The superseded single-page UI that used to be commented out below
# has been deleted — it was dead code duplicating Tab 1.)
with gr.Blocks(theme=gr.themes.Soft()) as demo:

    with gr.Tabs():

        # =====================================
        # TAB 1 — AUDIO GAME (existing)
        # =====================================
        with gr.Tab("🎙️ Audio Sequence Battle"):

            # Mission kept in State so scoring still works after the
            # visible textbox is hidden.
            hidden_target = gr.State("")

            with gr.Row():
                target_seq_ui = gr.Textbox(
                    label="📢 Referee's Mission",
                    interactive=False
                )
                refresh_btn = gr.Button("🔄 New Mission")

            # On load and on refresh, update both the UI and the State.
            demo.load(generate_challenge, outputs=[hidden_target, target_seq_ui])
            refresh_btn.click(generate_challenge, outputs=[hidden_target, target_seq_ui])

            with gr.Accordion("⚖️ Step 1: The Referee", open=True):
                ref_audio = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Record the Mission"
                )
                # Hide the mission text once the referee has recorded it.
                ref_audio.change(hide_mission, inputs=ref_audio, outputs=target_seq_ui)

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### 👤 Player 1")
                    p1_0 = gr.Audio(sources=["microphone"], type="filepath", label="Source 0")
                    p1_1 = gr.Audio(sources=["microphone"], type="filepath", label="Source 1")
                    p1_s = gr.Audio(sources=["microphone"], type="filepath", label="Silence")

                with gr.Column():
                    gr.Markdown("### 👤 Player 2")
                    p2_0 = gr.Audio(sources=["microphone"], type="filepath", label="Source 0")
                    p2_1 = gr.Audio(sources=["microphone"], type="filepath", label="Source 1")
                    p2_s = gr.Audio(sources=["microphone"], type="filepath", label="Silence")

            btn_fight = gr.Button("🔥 REVEAL WINNER", variant="primary")
            result_display = gr.Markdown("### Results will appear here")

            btn_fight.click(
                play_game,
                inputs=[hidden_target, ref_audio, p1_0, p1_1, p1_s, p2_0, p2_1, p2_s],
                outputs=result_display
            )

        # =====================================
        # TAB 2 — VIDEO DECODER
        # =====================================
        with gr.Tab("🎬 Video Frame Decoder"):

            gr.Markdown("## Train video symbols and decode frame-level sequence")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("### Training Samples")
                    v0 = gr.Video(label="Class 0 video", format="mp4")
                    v1 = gr.Video(label="Class 1 video", format="mp4")
                    v_bg = gr.Video(label="Background video", format="mp4")

                with gr.Column():
                    gr.Markdown("### Test Video")
                    test_video = gr.Video(label="Video to decode", format="mp4")

            decode_btn = gr.Button("🎬 Decode Video", variant="primary")
            video_result = gr.Markdown("### Decoded result will appear here")

            decode_btn.click(
                run_video_decoder,
                inputs=[v0, v1, v_bg, test_video],
                outputs=video_result
            )

demo.launch()
requirements.txt CHANGED
@@ -3,4 +3,7 @@ numpy
3
  librosa
4
  scikit-learn
5
  soundfile
6
- xgboost
 
 
 
 
3
  librosa
4
  scikit-learn
5
  soundfile
6
+ xgboost
7
+ opencv-python
8
+ torch
9
+ torchvision