Spaces:
Sleeping
Sleeping
update
Browse files
app.py
CHANGED
|
@@ -65,6 +65,81 @@ def hide_mission(audio_data):
|
|
| 65 |
return gr.update(visible=False)
|
| 66 |
return gr.update(visible=True)
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
def post_process_to_emoji(preds, window_ms, min_silence_ms=200):
|
| 70 |
"""Processes raw AI output, smooths it, enforces silence gaps, and merges duplicates."""
|
|
@@ -289,6 +364,8 @@ def train_video_model(v0, v1, v_bg):
|
|
| 289 |
if X1 is None: return None, f"Class 1 error: {msg1}"
|
| 290 |
if Xbg is None: return None, f"Background error: {msgbg}"
|
| 291 |
|
|
|
|
|
|
|
| 292 |
X = np.vstack([X0, X1, Xbg])
|
| 293 |
y = np.concatenate([
|
| 294 |
np.zeros(len(X0)),
|
|
@@ -307,6 +384,7 @@ def train_video_model(v0, v1, v_bg):
|
|
| 307 |
])
|
| 308 |
|
| 309 |
model.fit(X, y)
|
|
|
|
| 310 |
return model, "OK"
|
| 311 |
|
| 312 |
|
|
@@ -316,8 +394,8 @@ def decode_video_sequence(model, video_path):
|
|
| 316 |
return f"Error: {msg}"
|
| 317 |
|
| 318 |
preds = model.predict(X)
|
| 319 |
-
|
| 320 |
-
return
|
| 321 |
|
| 322 |
|
| 323 |
def run_video_decoder(v0, v1, v_bg, test_video):
|
|
|
|
| 65 |
return gr.update(visible=False)
|
| 66 |
return gr.update(visible=True)
|
| 67 |
|
| 68 |
+
def post_process_video_sequence(
    preds,
    min_segment_frames=10,
    smoothing_window=10,
    background_class=2
):
    """
    Post-process frame-level predictions into a clean symbol sequence.

    Steps:
        1. Temporal smoothing (majority vote over a sliding window).
        2. Run-length encode the smoothed stream.
        3. Drop non-background runs shorter than ``min_segment_frames``.
        4. Drop background and collapse adjacent duplicate symbols.

    Args:
        preds: per-frame class predictions (any int()-convertible sequence).
        min_segment_frames: minimum frames required to accept a symbol run.
        smoothing_window: neighborhood size for the majority-vote smoothing.
        background_class: class index treated as background (never emitted).

    Returns:
        Underscore-joined symbol string (e.g. "0_1"); "" for empty input.
    """
    if len(preds) == 0:
        return ""

    labels = [int(p) for p in preds]
    n = len(labels)
    half_w = smoothing_window // 2

    # -----------------------------------
    # 1. Majority vote smoothing: each frame takes the most frequent
    #    label within its clipped window [i - half_w, i + half_w].
    # -----------------------------------
    smoothed = []
    for i in range(n):
        window = labels[max(0, i - half_w):min(n, i + half_w + 1)]
        smoothed.append(max(set(window), key=window.count))

    # -----------------------------------
    # 2. Run-length encode into (label, run_length) pairs.
    # -----------------------------------
    segments = []
    current, length = smoothed[0], 1
    for p in smoothed[1:]:
        if p == current:
            length += 1
        else:
            segments.append((current, length))
            current, length = p, 1
    segments.append((current, length))

    # -----------------------------------
    # 3. Filter short segments. Background runs are kept unconditionally
    #    (they are discarded in step 4 anyway); other runs need enough
    #    frames to count as a real symbol.
    # -----------------------------------
    filtered = [
        label for label, length in segments
        if label == background_class or length >= min_segment_frames
    ]

    # -----------------------------------
    # 4. Drop background and collapse adjacent duplicates.
    #    BUG FIX: the previous version compared the int label against the
    #    already-stringified last entry (`cls != final_seq[-1]`), which is
    #    always True for int vs str, so duplicates were never merged.
    #    Track the last emitted label as an int and compare ints to ints.
    # -----------------------------------
    final_seq = []
    last_label = None
    for label in filtered:
        if label == background_class:
            continue
        if label != last_label:
            final_seq.append(str(label))
        last_label = label

    return "_".join(final_seq)
|
| 142 |
+
|
| 143 |
|
| 144 |
def post_process_to_emoji(preds, window_ms, min_silence_ms=200):
|
| 145 |
"""Processes raw AI output, smooths it, enforces silence gaps, and merges duplicates."""
|
|
|
|
| 364 |
if X1 is None: return None, f"Class 1 error: {msg1}"
|
| 365 |
if Xbg is None: return None, f"Background error: {msgbg}"
|
| 366 |
|
| 367 |
+
print(f"Training video model with {len(X0)} frames for Class 0, {len(X1)} frames for Class 1, and {len(Xbg)} frames for Background.")
|
| 368 |
+
|
| 369 |
X = np.vstack([X0, X1, Xbg])
|
| 370 |
y = np.concatenate([
|
| 371 |
np.zeros(len(X0)),
|
|
|
|
| 384 |
])
|
| 385 |
|
| 386 |
model.fit(X, y)
|
| 387 |
+
print("Video model trained successfully!")
|
| 388 |
return model, "OK"
|
| 389 |
|
| 390 |
|
|
|
|
| 394 |
return f"Error: {msg}"
|
| 395 |
|
| 396 |
preds = model.predict(X)
|
| 397 |
+
print(f"Raw frame-level predictions: {preds}")
|
| 398 |
+
return post_process_video_sequence(preds)
|
| 399 |
|
| 400 |
|
| 401 |
def run_video_decoder(v0, v1, v_bg, test_video):
|