Spaces:

Bliss-Ruth
/

Ugandan_sign_language_translation_tool

Sleeping

App Files Files Community

Bliss-Ruth committed 28 days ago

Commit

473d0f3

verified ·

1 Parent(s): b010867

Update app.py

Browse files

Files changed (1) hide show

app.py +454 -38

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
-# app.py - MULTI-VIDEO SENTENCE BUILDER
 import torch
 import torch.nn as nn
 from transformers import XCLIPProcessor, XCLIPModel
@@ -9,6 +10,7 @@ from PIL import Image
 import pandas as pd
 from datetime import datetime
 import os
 print("🚀 Loading Ugandan Sign Language Model...")
@@ -70,9 +72,178 @@ except Exception as e:
     exit(1)
 # ============================================================================
-# CORE FUNCTIONS
 # ============================================================================
 def extract_frames(video_path, num_frames=8):
     """Extract frames from video"""
     try:
@@ -180,7 +351,7 @@ def predict_multiple_videos(video_files):
         # Format detailed results
         details_md = "### 📊 Individual Sign Analysis\n\n"
         for result in detailed_results:
-            details_md += f"**Video {result['video_num']}:** {result['sign']} ({result['confidence']*100:.1f}% confidence)\n\n"
         # Final output
         final_result = f"""
@@ -201,6 +372,98 @@ def predict_multiple_videos(video_files):
     except Exception as e:
         return f"**Error:** {str(e)}", "", []
 # ============================================================================
 # FEEDBACK SYSTEM
 # ============================================================================
@@ -267,35 +530,180 @@ h1 {
 with gr.Blocks(css=custom_css, title="Sign Language Sentence Builder") as demo:
     gr.Markdown("""
-    # 🤟 Ugandan Sign Language Sentence Builder
-    *Upload multiple videos (one sign per video) to build complete sentences!*
-    **How it works:**
-    1. Upload 2-5 videos in sequence (each video = one sign/word)
-    2. Click "Build Sentence" to see the complete translation
-    3. Example: Video 1 (Hello) + Video 2 (How) + Video 3 (Are) → "Hello How Are"
     """)
     with gr.Row():
-        # Left side - Video uploads
         with gr.Column(scale=1):
-            gr.Markdown("### 📤 Upload Videos (In Order)")
-            video1 = gr.Video(label="Video 1 (First Sign)", sources=["upload", "webcam"])
-            video2 = gr.Video(label="Video 2 (Second Sign)", sources=["upload", "webcam"])
-            video3 = gr.Video(label="Video 3 (Third Sign)", sources=["upload", "webcam"])
-            video4 = gr.Video(label="Video 4 (Fourth Sign)", sources=["upload", "webcam"])
-            video5 = gr.Video(label="Video 5 (Fifth Sign)", sources=["upload", "webcam"])
             with gr.Row():
-                analyze_btn = gr.Button("🚀 Build Sentence", variant="primary", scale=2)
-                clear_btn = gr.Button("🗑️ Clear All", variant="secondary", scale=1)
         # Right side - Results
         with gr.Column(scale=1):
             gr.Markdown("### 🎯 Translation Results")
             results_output = gr.Markdown(
-                value="**Upload your videos and click 'Build Sentence' to see the translation.**"
             )
             gr.Markdown("### 💡 Sentence Feedback")
@@ -311,15 +719,10 @@ with gr.Blocks(css=custom_css, title="Sign Language Sentence Builder") as demo:
     current_sentence = gr.State()
     current_details = gr.State()
-    # Build sentence logic
-    def build_sentence_wrapper(v1, v2, v3, v4, v5):
-        videos = [v1, v2, v3, v4, v5]
-        result, sentence, details = predict_multiple_videos(videos)
-        return result, sentence, details
     analyze_btn.click(
-        fn=build_sentence_wrapper,
-        inputs=[video1, video2, video3, video4, video5],
         outputs=[results_output, current_sentence, current_details]
     )
@@ -339,28 +742,41 @@ with gr.Blocks(css=custom_css, title="Sign Language Sentence Builder") as demo:
     # Clear button
     def clear_all():
-        return None, None, None, None, None, "**Upload your videos and click 'Build Sentence'.**", "", [], ""
     clear_btn.click(
         fn=clear_all,
-        outputs=[video1, video2, video3, video4, video5, results_output, current_sentence, current_details, feedback_output]
     )
     # Example section
     gr.Markdown("""
     ---
-    ### 📝 Example Usage
-    **Scenario:** You want to say "Hello how are you"
-    1. **Video 1:** Record/upload sign for "Hello"
-    2. **Video 2:** Record/upload sign for "How"
-    3. **Video 3:** Record/upload sign for "Are"
-    4. **Video 4:** Record/upload sign for "You"
-    5. Click "Build Sentence"
-    6. Result: **"Hello How Are You"**
-    *Note: Each video should contain ONE sign only*
     """)
 # Launch

+# app.py - JOINED VIDEO SENTENCE ANALYZER
+# Analyzes ONE long video with multiple signs and builds a sentence
 import torch
 import torch.nn as nn
 from transformers import XCLIPProcessor, XCLIPModel
 import pandas as pd
 from datetime import datetime
 import os
+import tempfile
 print("🚀 Loading Ugandan Sign Language Model...")
     exit(1)
 # ============================================================================
+# CORE FUNCTIONS - VIDEO SPLITTING & ANALYSIS WITH MOTION DETECTION
 # ============================================================================
def _find_pause_boundaries(smoothed, neighbourhood=10, ratio=0.3):
    """Return indices of motion pauses in a smoothed motion curve.

    An index is a pause when its score is below ``ratio`` times the mean of
    both the ``neighbourhood`` scores before it and the ``neighbourhood``
    scores after it. Pauses closer together than ``neighbourhood`` indices
    are merged into one, keeping the quietest. The result always starts
    with 0.

    Args:
        smoothed: 1-D numpy array of smoothed motion scores.
        neighbourhood: Number of scores inspected on each side.
        ratio: How far below its neighbours a score must fall.
    Returns:
        List of int indices, in increasing order, beginning with 0.
    """
    boundaries = [0]
    quietest = None  # score of the most recently accepted pause
    for i in range(neighbourhood, len(smoothed) - neighbourhood):
        current = smoothed[i]
        before = smoothed[i - neighbourhood:i].mean()
        after = smoothed[i + 1:i + 1 + neighbourhood].mean()
        if not (current < before * ratio and current < after * ratio):
            continue
        if quietest is not None and i - boundaries[-1] < neighbourhood:
            # Same pause as the previous hit: keep only the quieter index so
            # one pause does not produce a burst of adjacent boundaries.
            if current < quietest:
                boundaries[-1] = i
                quietest = current
            continue
        boundaries.append(i)
        quietest = current
    return boundaries


def detect_motion_changes(video_path, threshold=30):
    """
    Detect motion changes in video to find sign boundaries
    Args:
        video_path: Path to video
        threshold: Per-pixel grey-level difference above which a pixel counts
            as "changed" between consecutive frames (higher = less sensitive)
    Returns:
        List of frame indices where a pause in motion was found. Index k
        refers to the k-th frame-to-frame difference (frames k and k+1).
        The list starts with 0 when the video could be read, is [] when the
        first frame cannot be read, and is [0] if an error occurs.
    """
    cap = None
    try:
        cap = cv2.VideoCapture(video_path)
        # Read first frame
        ret, prev_frame = cap.read()
        if not ret:
            return []
        prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
        prev_gray = cv2.GaussianBlur(prev_gray, (21, 21), 0)

        motion_scores = []
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Convert to grayscale and blur to suppress sensor noise
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            gray = cv2.GaussianBlur(gray, (21, 21), 0)
            # Binary mask of pixels that changed by more than `threshold`
            frame_delta = cv2.absdiff(prev_gray, gray)
            changed = cv2.threshold(frame_delta, threshold, 255, cv2.THRESH_BINARY)[1]
            # Motion score = fraction of pixels that changed
            motion_scores.append(np.count_nonzero(changed) / changed.size)
            prev_gray = gray

        # Too little data to look for pauses: report only the start frame
        if len(motion_scores) <= 10:
            return [0]

        # Smooth with a centred moving average (window clipped at the ends)
        window_size = 5
        raw = np.asarray(motion_scores, dtype=float)
        smoothed = np.array([
            raw[max(0, i - window_size):i + window_size + 1].mean()
            for i in range(len(raw))
        ])

        # Local minima of the smoothed curve are pauses between signs
        return _find_pause_boundaries(smoothed)
    except Exception as e:
        print(f"❌ Motion detection error: {e}")
        return [0]
    finally:
        # Release the capture on every exit path, including errors
        if cap is not None:
            cap.release()
def split_video_smart(video_path, num_signs=None, use_motion_detection=True):
    """
    Smart video splitting using motion detection OR equal segments
    Args:
        video_path: Path to the joined video
        num_signs: Expected number of signs (optional if using motion
            detection, required and >= 1 for equal splitting). Floats such
            as slider values are truncated to int.
        use_motion_detection: Whether to use automatic boundary detection
    Returns:
        List of segment video paths, written into a fresh temporary
        directory. Returns [] on any failure or when no segment could be
        written; in that case the temporary directory is removed again.
        The caller is responsible for deleting the returned files.
    """
    import shutil

    cap = None
    writer = None
    temp_dir = None
    try:
        if num_signs is not None:
            num_signs = int(num_signs)
        # Equal splitting divides by num_signs, so reject unusable values
        # explicitly instead of relying on a swallowed exception.
        if not use_motion_detection and (num_signs is None or num_signs < 1):
            print("❌ Error splitting video: num_signs must be >= 1 for equal segments")
            return []

        cap = cv2.VideoCapture(video_path)
        # Get video properties
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            # Some containers report 0/NaN; a writer needs a positive rate
            fps = 30.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        if total_frames == 0:
            return []

        # Determine split points
        if use_motion_detection:
            print("🔍 Using motion detection to find sign boundaries...")
            boundaries = detect_motion_changes(video_path)
            # Filter boundaries to get approximately num_signs segments
            if num_signs and len(boundaries) > num_signs + 1:
                # Too many boundaries detected: keep an evenly strided subset
                step = len(boundaries) // (num_signs + 1)
                boundaries = [boundaries[i * step] for i in range(num_signs + 1)]
            boundaries.append(total_frames)  # Add end frame
            boundaries = sorted(set(boundaries))  # Remove duplicates
            print(f"✅ Found {len(boundaries)-1} sign segments at frames: {boundaries}")
        else:
            # Fall back to equal segments
            print(f"📏 Splitting into {num_signs} equal segments...")
            frames_per_segment = total_frames // num_signs
            boundaries = [i * frames_per_segment for i in range(num_signs + 1)]
            boundaries[-1] = total_frames

        segment_paths = []
        temp_dir = tempfile.mkdtemp()
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        # Create segments based on consecutive boundary pairs
        for segment_idx, (start_frame, end_frame) in enumerate(zip(boundaries, boundaries[1:])):
            # Skip very short segments (less than 5 frames)
            if end_frame - start_frame < 5:
                continue
            segment_path = os.path.join(temp_dir, f"segment_{segment_idx}.mp4")
            writer = cv2.VideoWriter(segment_path, fourcc, fps, (width, height))
            # Write frames for this segment
            cap.set(cv2.CAP_PROP_POS_FRAMES, start_frame)
            frames_written = 0
            for _ in range(end_frame - start_frame):
                ret, frame = cap.read()
                if not ret:
                    break
                writer.write(frame)
                frames_written += 1
            writer.release()
            writer = None
            # Only keep segments that actually contain frames on disk
            if frames_written and os.path.exists(segment_path) and os.path.getsize(segment_path) > 0:
                segment_paths.append(segment_path)

        if not segment_paths:
            # Nothing usable was produced; do not leave the directory behind
            shutil.rmtree(temp_dir, ignore_errors=True)
        return segment_paths
    except Exception as e:
        print(f"❌ Error splitting video: {e}")
        import traceback
        traceback.print_exc()
        if temp_dir is not None:
            # Partial output is useless to the caller, who only receives []
            shutil.rmtree(temp_dir, ignore_errors=True)
        return []
    finally:
        # Release native handles on every exit path
        if writer is not None:
            writer.release()
        if cap is not None:
            cap.release()
 def extract_frames(video_path, num_frames=8):
     """Extract frames from video"""
     try:
         # Format detailed results
         details_md = "### 📊 Individual Sign Analysis\n\n"
         for result in detailed_results:
+            details_md += f"**Sign {result['video_num']}:** {result['sign']} ({result['confidence']*100:.1f}% confidence)\n\n"
         # Final output
         final_result = f"""
     except Exception as e:
         return f"**Error:** {str(e)}", "", []
def _remove_segment_files(segment_paths):
    """Best-effort removal of temporary segment files and their now-empty folders."""
    parent_dirs = set()
    for segment_path in segment_paths:
        parent_dirs.add(os.path.dirname(segment_path))
        try:
            if os.path.exists(segment_path):
                os.remove(segment_path)
        except OSError as e:
            print(f"⚠️ Could not remove temporary file {segment_path}: {e}")
    for parent_dir in parent_dirs:
        try:
            # rmdir only succeeds on an empty directory, so nothing else is lost
            os.rmdir(parent_dir)
        except OSError:
            pass


def analyze_joined_video(video_path, num_signs, use_auto_detect):
    """
    NEW MAIN FUNCTION: Analyze a JOINED video with multiple signs
    Args:
        video_path: Path to the joined video from CapCut
        num_signs: How many signs are in the video (used as hint). None or a
            value <= 0 falls back to 3; floats are truncated to int.
        use_auto_detect: Whether to use automatic motion detection
    Returns:
        Tuple of (markdown report, sentence string, list of per-segment dicts
        with keys 'video_num', 'sign', 'confidence'). On failure the report
        holds an error message and the other two items are "" and [].
    """
    segment_paths = []
    try:
        if video_path is None:
            return "Please upload a video.", "", []
        num_signs = int(num_signs) if num_signs is not None else 0
        if num_signs <= 0:
            num_signs = 3  # Default

        # STEP 1: Split the joined video into segments
        if use_auto_detect:
            print(f"🤖 Using AUTOMATIC motion detection (expected ~{num_signs} signs)...")
        else:
            print(f"📏 Using MANUAL equal split ({num_signs} segments)...")
        segment_paths = split_video_smart(
            video_path, num_signs, use_motion_detection=bool(use_auto_detect)
        )
        if len(segment_paths) == 0:
            return "Failed to split video. Please check your video file.", "", []
        actual_segments = len(segment_paths)
        print(f"✅ Created {actual_segments} segments")

        # STEP 2: Analyze each segment separately
        predictions = []
        detailed_results = []
        for i, segment_path in enumerate(segment_paths, 1):
            print(f"🔍 Analyzing segment {i}/{actual_segments}...")
            sign, confidence = predict_single_sign(segment_path)
            predictions.append(sign)
            detailed_results.append({
                'video_num': i,
                'sign': sign,
                'confidence': confidence
            })

        # STEP 3: Build sentence
        sentence = " ".join(predictions)

        # Format detailed results
        details_md = "### 📊 Individual Sign Analysis (In Order)\n\n" + "".join(
            f"**Position {result['video_num']}:** {result['sign']} ({result['confidence']*100:.1f}% confidence)\n\n"
            for result in detailed_results
        )

        # Determine split method used
        split_method = "Automatic Motion Detection" if use_auto_detect else "Equal Time Segments"
        segments_info = f"Detected {actual_segments} segments" if use_auto_detect else f"Split into {num_signs} equal segments"

        # Final output
        final_result = f"""
## 🎯 Complete Sentence Translation
### Detected Sentence:
**"{sentence}"**
{details_md}
---
**Split Method:** {split_method}
**Segments:** {segments_info}
**Model:** X-CLIP Fine-tuned on Ugandan Sign Language
*{'Signs were automatically detected by analyzing motion patterns' if use_auto_detect else 'Each sign was analyzed from equal time segments'}*
"""
        return final_result, sentence, detailed_results
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"❌ Error: {error_details}")
        return f"**Error analyzing video:** {str(e)}\n\nPlease try:\n- Using a different video\n- Toggling automatic detection\n- Adjusting number of signs", "", []
    finally:
        # Clean up temporary files on success AND failure, so a prediction
        # error does not leave segment videos behind.
        _remove_segment_files(segment_paths)
 # ============================================================================
 # FEEDBACK SYSTEM
 # ============================================================================
 with gr.Blocks(css=custom_css, title="Sign Language Sentence Builder") as demo:
     gr.Markdown("""
+    # 🤟 Ugandan Sign Language Sentence Analyzer
+    *Upload ONE joined video with multiple signs - we'll automatically detect and translate them!*
+    **Two Detection Modes:**
+    1. **🤖 Automatic (Recommended):** AI detects where each sign starts/ends (works with unequal durations!)
+    2. **📏 Manual:** Split video into equal time segments (use if signs have equal duration)
     """)
     with gr.Row():
+        # Left side - Video upload
         with gr.Column(scale=1):
+            gr.Markdown("### 📤 Upload Your Joined Video")
+            joined_video = gr.Video(
+                label="Joined Video (from CapCut or any editor)",
+                sources=["upload", "webcam"]
+            )
+            gr.Markdown("### ⚙️ Detection Settings")
+            auto_detect = gr.Checkbox(
+                label="🤖 Use Automatic Motion Detection",
+                value=True,
+                info="AI automatically finds sign boundaries (recommended!)"
+            )
+            num_signs_input = gr.Slider(
+                minimum=1,
+                maximum=10,
+                value=3,
+                step=1,
+                label="Expected number of signs (approximate)",
+                info="Helps guide the detection algorithm"
+            )
+            with gr.Accordion("💡 How It Works", open=False):
+                gr.Markdown("""
+                **Automatic Mode (🤖):**
+                - Analyzes motion patterns in your video
+                - Detects pauses/transitions between signs
+                - Works even if signs have different durations!
+                - Example: 1s + 3s + 2s signs → correctly detected
+                **Manual Mode (📏):**
+                - Splits video into equal time segments
+                - Works best when all signs take equal time
+                - Example: 2s + 2s + 2s signs → perfect split
+                **Tips:**
+                - ✅ Pause briefly between signs for best detection
+                - ✅ Keep camera angle consistent
+                - ✅ Good lighting helps accuracy
+                """)
             with gr.Row():
+                analyze_btn = gr.Button("🚀 Analyze Sentence", variant="primary", scale=2)
+                clear_btn = gr.Button("🗑️ Clear", variant="secondary", scale=1)
         # Right side - Results
         with gr.Column(scale=1):
             gr.Markdown("### 🎯 Translation Results")
             results_output = gr.Markdown(
+                value="**Upload your video, choose detection mode, and click 'Analyze Sentence'**"
+            )
+            gr.Markdown("### 💡 Feedback")
+            gr.Markdown("*Help improve accuracy by providing corrections:*")
+            correct_sentence_input = gr.Textbox(
+                label="Correct Sentence (if prediction was wrong)",
+                placeholder="e.g., Hello how are you"
+            )
+            feedback_btn = gr.Button("📝 Submit Feedback", variant="secondary")
+            feedback_output = gr.Markdown()
+    # Hidden states
+    current_sentence = gr.State()
+    current_details = gr.State()
+    # Analyze sentence logic
+    analyze_btn.click(
+        fn=analyze_joined_video,
+        inputs=[joined_video, num_signs_input, auto_detect],
+        outputs=[results_output, current_sentence, current_details]
+    )
+    # Feedback logic
+    def submit_feedback_wrapper(predicted, corrected, details):
+        if not corrected or corrected.strip() == "":
+            return "Please enter the correct sentence."
+        num_videos = len(details) if details else 0
+        return save_sentence_feedback(predicted, corrected, num_videos)
+    feedback_btn.click(
+        fn=submit_feedback_wrapper,
+        inputs=[current_sentence, correct_sentence_input, current_details],
+        outputs=[feedback_output]
+    )
+    # Clear button
+    def clear_all():
+        return None, True, 3, "**Upload your video and click 'Analyze Sentence'.**", "", [], ""
+    clear_btn.click(
+        fn=clear_all,
+        outputs=[joined_video, auto_detect, num_signs_input, results_output, current_sentence, current_details, feedback_output]
+    )
+    # Example section
+    gr.Markdown("""
+    ---
+    ### 📝 Complete Example Workflow
+    **Goal:** Translate "Hello how good" in sign language
+    **Step 1: Record Your Signs**
+    - Sign 1: "Hello" (performer holds sign for 2 seconds)
+    - Sign 2: "How" (performer holds sign for 1 second)
+    - Sign 3: "Good" (performer holds sign for 3 seconds)
+    **Step 2: Join in CapCut**
+    - Import all 3 videos
+    - Arrange in order: Hello → How → Good
+    - Export as ONE video (6 seconds total)
+    **Step 3: Upload & Analyze**
+    - Upload the 6-second video here
+    - Enable "Automatic Detection" ✅
+    - Set "Expected signs" to 3
+    - Click "Analyze Sentence"
+    **Step 4: Result**
+    - 🤖 AI detects 3 segments automatically:
+      - Position 1: "Hello" (0-2 seconds, 87% confidence)
+      - Position 2: "How" (2-3 seconds, 91% confidence)
+      - Position 3: "Good" (3-6 seconds, 85% confidence)
+    - **Final Sentence:** "Hello How Good" ✅
+    ---
+    ### 🆚 When to Use Each Mode
+    | Scenario | Recommended Mode | Why |
+    |----------|-----------------|-----|
+    | Signs have different lengths | 🤖 Automatic | Detects boundaries precisely |
+    | You pause between signs | 🤖 Automatic | Pauses help detection |
+    | All signs exactly same duration | 📏 Manual | Simple equal split works |
+    | Fast, continuous signing | 📏 Manual | Motion detection may struggle |
+    | Professional recording | 🤖 Automatic | Better accuracy |
+    | Quick test/prototype | 📏 Manual | Faster processing |
+    """)
+# Launch
+if __name__ == "__main__":
+    demo.launch(share=True)
+                info="The video will be split into this many equal parts"
+            )
+            gr.Markdown("""
+            **💡 Tip:**
+            - Make sure each sign takes roughly the same time in your joined video
+            - Example: 3 signs × 2 seconds each = 6 second video
+            - The video will be split equally into segments
+            """)
+            with gr.Row():
+                analyze_btn = gr.Button("🚀 Analyze Sentence", variant="primary", scale=2)
+                clear_btn = gr.Button("🗑️ Clear", variant="secondary", scale=1)
+        # Right side - Results
+        with gr.Column(scale=1):
+            gr.Markdown("### 🎯 Translation Results")
+            results_output = gr.Markdown(
+                value="**Upload your joined video and click 'Analyze Sentence' to see the translation.**"
             )
             gr.Markdown("### 💡 Sentence Feedback")
     current_sentence = gr.State()
     current_details = gr.State()
+    # Analyze sentence logic
     analyze_btn.click(
+        fn=analyze_joined_video,
+        inputs=[joined_video, num_signs_input],
         outputs=[results_output, current_sentence, current_details]
     )
     # Clear button
     def clear_all():
+        return None, 3, "**Upload your video and click 'Analyze Sentence'.**", "", [], ""
     clear_btn.click(
         fn=clear_all,
+        outputs=[joined_video, num_signs_input, results_output, current_sentence, current_details, feedback_output]
     )
     # Example section
     gr.Markdown("""
     ---
+    ### 📝 Step-by-Step Example
+    **Goal:** Say "Hello how are you" in sign language
+    **Method 1: Using CapCut (Recommended)**
+    1. Record/film 4 separate videos:
+       - Video 1: Sign for "Hello" (2 seconds)
+       - Video 2: Sign for "How" (2 seconds)
+       - Video 3: Sign for "Are" (2 seconds)
+       - Video 4: Sign for "You" (2 seconds)
+    2. Open CapCut and **join the 4 videos** in order
+    3. Export as ONE video (8 seconds total)
+    4. Upload here and enter "4" for number of signs
+    5. Click "Analyze Sentence"
+    6. **Result:** "Hello How Are You" ✅
+    ---
+    **Method 2: Multiple Videos** *(if you prefer separate uploads)*
+    - Use the "Multi-Video Mode" (see tabs above)
     """)
 # Launch