Update app.py
app.py
CHANGED
@@ -303,74 +303,12 @@ def predict_single_sign(video_path):
         confidence, pred_class = torch.max(probs, 1)
 
         predicted_label = id_to_label[pred_class.item()]
-        confidence_value = confidence.item()
 
-        return predicted_label, confidence_value
+        return predicted_label  # Only return the label
 
     except Exception as e:
         print(f"❌ Prediction error: {e}")
-        return "Unknown"
+        return "Unknown"
-
-def predict_multiple_videos(video_files):
-    """
-    MAIN FUNCTION: Predict signs from multiple videos and build a sentence
-
-    Args:
-        video_files: List of video file paths or single video
-
-    Returns:
-        Complete sentence, individual predictions, detailed results
-    """
-    try:
-        # Handle single video or list
-        if not isinstance(video_files, list):
-            video_files = [video_files]
-
-        # Remove None values
-        video_files = [v for v in video_files if v is not None]
-
-        if len(video_files) == 0:
-            return "Please upload at least one video.", "", []
-
-        # Predict each video
-        predictions = []
-        detailed_results = []
-
-        for i, video_path in enumerate(video_files, 1):
-            sign, confidence = predict_single_sign(video_path)
-            predictions.append(sign)
-            detailed_results.append({
-                'video_num': i,
-                'sign': sign,
-                'confidence': confidence
-            })
-
-        # Build sentence
-        sentence = " ".join(predictions)
-
-        # Format detailed results
-        details_md = "### 📊 Individual Sign Analysis\n\n"
-        for result in detailed_results:
-            details_md += f"**Sign {result['video_num']}:** {result['sign']} ({result['confidence']*100:.1f}% confidence)\n\n"
-
-        # Final output
-        final_result = f"""
-## 🎯 Complete Sentence Translation
-
-### Detected Sentence:
-**"{sentence}"**
-
-{details_md}
-
----
-**Total Signs Detected:** {len(predictions)}
-**Model:** X-CLIP Fine-tuned on Ugandan Sign Language
-"""
-
-        return final_result, sentence, detailed_results
-
-    except Exception as e:
-        return f"**Error:** {str(e)}", "", []
 
 def analyze_joined_video(video_path, num_signs, use_auto_detect):
     """
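Note on the hunk above: `predict_single_sign` now returns a bare label string instead of a `(label, confidence)` tuple, which is why the callers in the next hunk change as well. A minimal runnable sketch of the retained prediction tail, using a dummy softmax output and a hypothetical `id_to_label` mapping in place of the app's real model state:

```python
import torch

# Dummy (1, num_classes) softmax output standing in for `probs`.
probs = torch.tensor([[0.1, 0.7, 0.2]])

# Top probability and its class index, as in the hunk above.
confidence, pred_class = torch.max(probs, 1)

id_to_label = {0: "Hello", 1: "How", 2: "Good"}  # hypothetical mapping
predicted_label = id_to_label[pred_class.item()]

print(predicted_label)    # How
print(confidence.item())  # ~0.7 (still computed, but no longer returned)
```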
@@ -411,21 +349,21 @@ def analyze_joined_video(video_path, num_signs, use_auto_detect):
 
         for i, segment_path in enumerate(segment_paths, 1):
             print(f"🔍 Analyzing segment {i}/{actual_segments}...")
-            sign, confidence = predict_single_sign(segment_path)
+            sign = predict_single_sign(segment_path)
+
             predictions.append(sign)
             detailed_results.append({
                 'video_num': i,
-                'sign': sign,
-                'confidence': confidence
+                'sign': sign
             })
 
         # STEP 3: Build sentence
         sentence = " ".join(predictions)
 
         # Format detailed results
-        details_md = "### 📊 Individual Sign Analysis (In Order)\n\n"
+        details_md = "### Individual Sign Analysis (In Order)\n\n"
         for result in detailed_results:
-            details_md += f"**Position {result['video_num']}:** {result['sign']} ({result['confidence']*100:.1f}% confidence)\n\n"
+            details_md += f"**Position {result['video_num']}:** {result['sign']}\n\n"
 
         # Determine split method used
         split_method = "Automatic Motion Detection" if use_auto_detect else "Equal Time Segments"
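With `'confidence'` dropped from `detailed_results`, each entry reduces to a position and a sign. A self-contained sketch of what the updated loop and formatter now produce, with hypothetical predictions standing in for `predict_single_sign` output:

```python
# Hypothetical per-segment predictions; in app.py these come from
# predict_single_sign(segment_path) inside the loop above.
detailed_results = [
    {'video_num': 1, 'sign': 'Hello'},
    {'video_num': 2, 'sign': 'How'},
    {'video_num': 3, 'sign': 'Good'},
]

# Same formatting as the updated code path.
sentence = " ".join(r['sign'] for r in detailed_results)
details_md = "### Individual Sign Analysis (In Order)\n\n"
for result in detailed_results:
    details_md += f"**Position {result['video_num']}:** {result['sign']}\n\n"

print(sentence)     # Hello How Good
print(details_md)
```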
@@ -433,7 +371,7 @@ def analyze_joined_video(video_path, num_signs, use_auto_detect):
 
         # Final output
         final_result = f"""
-## 🎯 Complete Sentence Translation
+## Complete Sentence Translation
 
 ### Detected Sentence:
 **"{sentence}"**
@@ -638,7 +576,49 @@ with gr.Blocks(css=custom_css, title="Sign Language Sentence Builder") as demo:
         outputs=[joined_video, auto_detect, num_signs_input, results_output, current_sentence, current_details, feedback_output]
     )
 
-
+    # Example section
+    gr.Markdown("""
+    ---
+    ### 📝 Complete Example Workflow
+
+    **Goal:** Translate "Hello how good" in sign language
+
+    **Step 1: Record Your Signs**
+    - Sign 1: "Hello" (performer holds sign for 2 seconds)
+    - Sign 2: "How" (performer holds sign for 1 second)
+    - Sign 3: "Good" (performer holds sign for 3 seconds)
+
+    **Step 2: Join in CapCut**
+    - Import all 3 videos
+    - Arrange in order: Hello → How → Good
+    - Export as ONE video (6 seconds total)
+
+    **Step 3: Upload & Analyze**
+    - Upload the 6-second video here
+    - Enable "Automatic Detection" ✅
+    - Set "Expected signs" to 3
+    - Click "Analyze Sentence"
+
+    **Step 4: Result**
+    - 🤖 AI detects 3 segments automatically:
+      - Position 1: "Hello"
+      - Position 2: "How"
+      - Position 3: "Good"
+    - **Final Sentence:** "Hello How Good" ✅
+
+    ---
+
+    ### 🆚 When to Use Each Mode
+
+    | Scenario | Recommended Mode | Why |
+    |----------|-----------------|-----|
+    | Signs have different lengths | 🤖 Automatic | Detects boundaries precisely |
+    | You pause between signs | 🤖 Automatic | Pauses help detection |
+    | All signs exactly same duration | 📏 Manual | Simple equal split works |
+    | Fast, continuous signing | 📏 Manual | Motion detection may struggle |
+    | Professional recording | 🤖 Automatic | Better accuracy |
+    | Quick test/prototype | 📏 Manual | Faster processing |
+    """)
 
 # Launch
 if __name__ == "__main__":
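The new help text is a single `gr.Markdown` call added inside the existing `Blocks` context, so it renders below the components wired up earlier. A minimal sketch of the pattern; the title, text, and layout here are placeholders, not app.py's actual interface:

```python
import gradio as gr

with gr.Blocks(title="Sign Language Sentence Builder") as demo:
    # ...inputs, outputs, and event wiring would go here...

    # A Markdown block declared last renders at the bottom of the page.
    gr.Markdown("""
    ---
    ### 📝 Complete Example Workflow
    Record one clip per sign, join them into a single video,
    upload it, then click "Analyze Sentence".
    """)

if __name__ == "__main__":
    demo.launch()
```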