MeysamSh committed on
Commit
e6f58e4
·
1 Parent(s): 910595e
Files changed (1) hide show
  1. app.py +80 -2
app.py CHANGED
@@ -65,6 +65,81 @@ def hide_mission(audio_data):
65
  return gr.update(visible=False)
66
  return gr.update(visible=True)
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  def post_process_to_emoji(preds, window_ms, min_silence_ms=200):
70
  """Processes raw AI output, smooths it, enforces silence gaps, and merges duplicates."""
@@ -289,6 +364,8 @@ def train_video_model(v0, v1, v_bg):
289
  if X1 is None: return None, f"Class 1 error: {msg1}"
290
  if Xbg is None: return None, f"Background error: {msgbg}"
291
 
 
 
292
  X = np.vstack([X0, X1, Xbg])
293
  y = np.concatenate([
294
  np.zeros(len(X0)),
@@ -307,6 +384,7 @@ def train_video_model(v0, v1, v_bg):
307
  ])
308
 
309
  model.fit(X, y)
 
310
  return model, "OK"
311
 
312
 
@@ -316,8 +394,8 @@ def decode_video_sequence(model, video_path):
316
  return f"Error: {msg}"
317
 
318
  preds = model.predict(X)
319
- import pdb; pdb.set_trace()
320
- return post_process_to_emoji(preds, window_ms=100)
321
 
322
 
323
  def run_video_decoder(v0, v1, v_bg, test_video):
 
65
  return gr.update(visible=False)
66
  return gr.update(visible=True)
67
 
68
def post_process_video_sequence(
    preds,
    min_segment_frames=10,
    smoothing_window=10,
    background_class=2
):
    """
    Post-process frame-level predictions into a clean symbol sequence.

    Steps:
    1. Temporal smoothing (sliding-window majority vote).
    2. Run-length compression into (class, length) segments.
    3. Drop non-background segments shorter than ``min_segment_frames``.
    4. Emit one symbol per run of kept segments; background is never emitted
       but does separate repeated symbols.

    Args:
        preds: sized iterable of per-frame class predictions; every element
            is converted with ``int()``.
        min_segment_frames: minimum frames required to accept a symbol.
        smoothing_window: neighborhood size for smoothing; the window spans
            ``smoothing_window // 2`` frames on each side of a frame.
        background_class: class index for background.

    Returns:
        The decoded class indices joined with ``"_"`` (for example ``"0_1"``),
        or ``""`` when there are no predictions or no symbol survives.
    """
    # Function-scope import so the block does not depend on the file's
    # top-level import list.
    from itertools import groupby

    # len() instead of truthiness: preds may be an array-like whose truth
    # value is ambiguous (e.g. the output of a model's predict call).
    if len(preds) == 0:
        return ""

    labels = [int(p) for p in preds]
    total = len(labels)

    # -----------------------------------
    # 1. Majority vote smoothing
    # -----------------------------------
    # Clamp at zero: a negative half-width would yield an empty neighborhood
    # and make max() raise.
    half_w = max(0, smoothing_window // 2)
    smoothed = []
    for i in range(total):
        neighborhood = labels[max(0, i - half_w):min(total, i + half_w + 1)]
        smoothed.append(max(set(neighborhood), key=neighborhood.count))

    # -----------------------------------
    # 2. Segment compression
    # -----------------------------------
    segments = [(cls, sum(1 for _ in run)) for cls, run in groupby(smoothed)]

    # -----------------------------------
    # 3 + 4. Filter short segments and collapse duplicates
    # -----------------------------------
    symbols = []
    previous_kept = None  # last class that survived filtering (int, may be background)
    for cls, length in segments:
        is_background = cls == background_class
        if not is_background and length < min_segment_frames:
            # Too short to be a real symbol; its neighbours become adjacent.
            continue
        # Compare ints with ints. Comparing against the already-stringified
        # output would always differ and never collapse anything.
        if not is_background and cls != previous_kept:
            symbols.append(str(cls))
        previous_kept = cls

    return "_".join(symbols)
142
+
143
 
144
  def post_process_to_emoji(preds, window_ms, min_silence_ms=200):
145
  """Processes raw AI output, smooths it, enforces silence gaps, and merges duplicates."""
 
364
  if X1 is None: return None, f"Class 1 error: {msg1}"
365
  if Xbg is None: return None, f"Background error: {msgbg}"
366
 
367
+ print(f"Training video model with {len(X0)} frames for Class 0, {len(X1)} frames for Class 1, and {len(Xbg)} frames for Background.")
368
+
369
  X = np.vstack([X0, X1, Xbg])
370
  y = np.concatenate([
371
  np.zeros(len(X0)),
 
384
  ])
385
 
386
  model.fit(X, y)
387
+ print("Video model trained successfully!")
388
  return model, "OK"
389
 
390
 
 
394
  return f"Error: {msg}"
395
 
396
  preds = model.predict(X)
397
+ print(f"Raw frame-level predictions: {preds}")
398
+ return post_process_video_sequence(preds)
399
 
400
 
401
  def run_video_decoder(v0, v1, v_bg, test_video):