Updated app.py
app.py
CHANGED
Old version (removed lines are marked "-", unchanged context lines carry no marker; "…" marks text the diff view truncated):

@@ -1,4 +1,4 @@
-# app.py
 import torch
 import torch.nn as nn
 from transformers import XCLIPProcessor, XCLIPModel

@@ -6,61 +6,66 @@ import gradio as gr
 import cv2
 import numpy as np
 from PIL import Image
-import tempfile
-import os
 import pandas as pd
 from datetime import datetime

-# …
-…
         super().__init__()
-        self.xclip = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32")
-        for param in self.xclip.parameters():
-            param.requires_grad = False
         self.classifier = nn.Sequential(
-            nn.Dropout(…
-            nn.…
-            nn.Dropout(0.2), nn.Linear(64, num_classes)
         )

-    def forward(self, input_ids, attention_mask, pixel_values):
-        outputs = self.xclip(input_ids=input_ids, attention_mask=attention_mask,
-                             pixel_values=pixel_values, return_dict=True)
-        video_embeds = outputs.video_embeds
-        return self.classifier(video_embeds)
-
-print("🚀 Loading Ugandan Sign Language Model...")

-# Initialize
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")

 # Load your trained model
 try:
     checkpoint = torch.load("best_xclip_model.pth", map_location=device, weights_only=False)
-    model = …
-    …
     model.eval()
-    …
     label_to_id = {v: k for k, v in id_to_label.items()}
-    …
 except Exception as e:
     print(f"❌ Error loading model: {e}")
     exit(1)

-# …
-FEEDBACK_FILE = "user_feedback.csv"
-if not os.path.exists(FEEDBACK_FILE):
-    pd.DataFrame(columns=['timestamp', 'video_path', 'predicted_label', 'correct_label', 'confidence']).to_csv(FEEDBACK_FILE, index=False)

 def extract_frames(video_path, num_frames=8):
-    """Extract frames from video…
     try:
         cap = cv2.VideoCapture(video_path)
         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

         if total_frames <= num_frames:
             indices = list(range(total_frames)) + [total_frames-1] * (num_frames - total_frames)
         else:

@@ -77,15 +82,16 @@ def extract_frames(video_path, num_frames=8):
                 frame = cv2.resize(frame, (224, 224))
                 frames.append(Image.fromarray(frame))
             else:
-                frames.append(Image.new(…
         cap.release()
         return frames
     except Exception as e:
-        …
-        return [Image.new("RGB", (224, 224), (128, 128, 128)) for _ in range(num_frames)]

-def predict_sign_enhanced(video_path):
-    """…
     try:
         frames = extract_frames(video_path)

@@ -97,22 +103,36 @@ def predict_sign_enhanced(video_path):
         attention_mask = text_inputs['attention_mask'].to(device)

         with torch.no_grad():
-            …
         probs = torch.softmax(logits, dim=1)
         confidence, pred_class = torch.max(probs, 1)
-        all_probs = probs.cpu().numpy()[0]

         predicted_label = id_to_label[pred_class.item()]
         confidence_value = confidence.item()

-        return predicted_label, confidence_value, all_probs

     except Exception as e:
-        …

 def save_feedback(video_path, predicted_label, correct_label, confidence):
-    """Save user feedback…
     try:
         feedback_data = {
             'timestamp': datetime.now().isoformat(),

@@ -122,196 +142,211 @@ def save_feedback(video_path, predicted_label, correct_label, confidence):
             'confidence': confidence
         }

-        # Save feedback
         df = pd.read_csv(FEEDBACK_FILE)
         df = pd.concat([df, pd.DataFrame([feedback_data])], ignore_index=True)
         df.to_csv(FEEDBACK_FILE, index=False)

-        # Check if retraining is needed (5+ corrections)
         corrections = len(df[df['predicted_label'] != df['correct_label']])

         if corrections >= 5:
-            return f"✅ …
         else:
-            return f"✅ …

     except Exception as e:
         return f"❌ Error saving feedback: {str(e)}"

-def …
-    if video_file is None:
-        return "## 📹 Please upload a video file", "", gr.update(visible=False), gr.update(value=None)
-
-    predicted_label, confidence, all_probs = predict_sign_enhanced(video_file)
-
-    # Create detailed results
-    result = f"""
-## 🎯 Sign Language Translation Result:
-…
-"""

-    for i, (label, prob) in enumerate(zip(id_to_label.values(), all_probs)):
-        bar_length = int(prob * 20)
-        bar = "█" * bar_length + "░" * (20 - bar_length)
-        result += f"\n**{label}:** {bar} {prob*100:.1f}%"
-
-    # Check feedback status
-    try:
-        feedback_df = pd.read_csv(FEEDBACK_FILE)
-        corrections = len(feedback_df[feedback_df['predicted_label'] != feedback_df['correct_label']])
-        result += f"\n\n---\n**📈 Learning Progress:** {corrections}/5 corrections collected for next retraining"
-    except:
-        result += f"\n\n---\n**📈 Learning Progress:** 0/5 corrections collected for next retraining"
-
-    …
-
-    - **Model:** X-CLIP Fine-tuned on Ugandan Sign Language
-    - **Supported Signs:** {len(id_to_label)} classes
-    - **Top Confidence:** {confidence*100:.1f}%

 ---
-    …
 """

-    return result, …

     except Exception as e:
-        return f"…

-def …
     if user_correction == "" or user_correction is None:
-        return "…

     result = save_feedback(video_path, predicted_label, user_correction, confidence)

     if user_correction != predicted_label:
-        result += f"\n\…
-        result += f"\n💡 Thank you for helping improve the model accuracy!"

     return result

-def get_feedback_stats():
-    …
-        corrections = len(feedback_df[feedback_df['predicted_label'] != feedback_df['correct_label']])
-        return f"**Feedback Collected:** {corrections} corrections ({total} total)\n**Retraining Ready:** { '✅' if corrections >= 5 else '❌' }"
-    except:
-        return "**Feedback Collected:** 0 corrections\n**Retraining Ready:** ❌"
-
-# Create the enhanced interface
-with gr.Blocks(
-    theme=gr.themes.Soft(primary_hue="teal"),
-    title="🤟 Ugandan Sign Language Translator"
-) as demo:

     gr.Markdown("""
-    # 🤟 Ugandan Sign Language
-    …
     """)

     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 📤 Upload Video")
             video_input = gr.Video(
-                label="…
-                sources=["upload"],
-                height=300
             )

             with gr.Row():
-                predict_btn = gr.Button("🚀 Analyze Sign", variant="primary", …
-                clear_btn = gr.Button("…

-        …
             results_output = gr.Markdown(
-                value="…
             )

-    # …
-    current_video_path = gr.State()
-    current_confidence = gr.State()
-
-    # Feedback section
-    with gr.Row(visible=False) as feedback_row:
         with gr.Column():
-            gr.Markdown("…
             with gr.Row():
                 correction_dropdown = gr.Dropdown(
                     choices=list(id_to_label.values()),
-                    label="…
-                    …
                 )
-            feedback_btn = gr.Button("…
             feedback_output = gr.Markdown()

-    # …
-    stats_display = gr.Markdown()
-
-    # Update stats function
-    def update_stats():
-        return get_feedback_stats()

     # Prediction logic
     predict_btn.click(
-        fn=…
         inputs=[video_input],
-        outputs=[results_output, …
-    ).then(
-        lambda: update_stats(),
-        outputs=[stats_display]
     )

     # Feedback logic
     feedback_btn.click(
-        fn=…
         inputs=[current_prediction, correction_dropdown, current_video_path, current_confidence],
         outputs=[feedback_output]
-    ).then(
-        lambda: update_stats(),
-        outputs=[stats_display]
     )

-    # Clear button
-    def …
-        return None, "…

     clear_btn.click(
-        fn=…
-        outputs=[video_input, results_output, …
-    )
-
-    # Initialize stats
-    demo.load(
-        fn=update_stats,
-        outputs=[stats_display]
     )

 # Launch the app
 if __name__ == "__main__":
-    demo.launch(
New version (line 5, `import gradio as gr`, is unchanged context recovered from the hunk header above):

# app.py - CLEAN MINIMAL INTERFACE (No Confidence Bars/Tabs)
import torch
import torch.nn as nn
from transformers import XCLIPProcessor, XCLIPModel
import gradio as gr
import cv2
import numpy as np
from PIL import Image
import pandas as pd
from datetime import datetime
import os

print("🚀 Loading Ugandan Sign Language Model...")

# ============================================================================
# MODEL SETUP
# ============================================================================

# Small classification head over X-CLIP video embeddings (512-dim for the
# base model); X-CLIP itself is kept outside the module and frozen.
class MinimalClassifier(nn.Module):
    def __init__(self, input_dim=512, num_classes=85, dropout=0.5):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(input_dim, num_classes)
        )

    def forward(self, x):
        return self.classifier(x)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
xclip_model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32").to(device)
xclip_model.eval()

# Load your trained model
try:
    checkpoint = torch.load("best_xclip_model.pth", map_location=device, weights_only=False)
    model = MinimalClassifier(
        input_dim=512,
        num_classes=checkpoint['num_classes'],
        dropout=0.5
    ).to(device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    id_to_label = checkpoint['id_to_label']
    label_to_id = {v: k for k, v in id_to_label.items()}

    print(f"✅ Model loaded! Can recognize {len(id_to_label)} signs")

except Exception as e:
    print(f"❌ Error loading model: {e}")
    exit(1)
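As a reference for retraining, a compatible checkpoint could be produced roughly as sketched below; the key names (`model_state_dict`, `num_classes`, `id_to_label`) are taken from the loader above, while the surrounding training code is assumed:

# Hypothetical save step at training time (not part of app.py); the keys
# mirror exactly what the loader above reads back.
torch.save({
    'model_state_dict': model.state_dict(),  # MinimalClassifier weights
    'num_classes': len(id_to_label),         # e.g. 85
    'id_to_label': id_to_label,              # {0: 'hello', 1: 'how', ...}
}, "best_xclip_model.pth")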
# ============================================================================
# CORE FUNCTIONS
# ============================================================================

def extract_frames(video_path, num_frames=8):
    """Extract frames from video"""
    try:
        cap = cv2.VideoCapture(video_path)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Unreadable or empty video: return black placeholder frames
        if total_frames == 0:
            cap.release()
            return [Image.new('RGB', (224, 224), (0, 0, 0)) for _ in range(num_frames)]

        if total_frames <= num_frames:
            indices = list(range(total_frames)) + [total_frames-1] * (num_frames - total_frames)
        else:
            # … (frame-sampling and frame-reading loop unchanged, elided in the diff) …
                frame = cv2.resize(frame, (224, 224))
                frames.append(Image.fromarray(frame))
            else:
                frames.append(Image.new('RGB', (224, 224), (0, 0, 0)))

        cap.release()
        return frames

    except Exception as e:
        return [Image.new('RGB', (224, 224), (0, 0, 0)) for _ in range(num_frames)]
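The diff view elides the branch that samples frames from longer clips. Given the padding logic above, a plausible implementation, shown here as an assumption rather than the file's actual code, is uniform index sampling:

# Assumed shape of the elided else-branch: num_frames evenly spaced indices,
# which the (also elided) loop then reads and converts to RGB PIL images.
indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)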

def predict_sign(video_path):
    """Predict sign from video"""
    try:
        frames = extract_frames(video_path)

        # … (text/video preprocessing lines unchanged, elided in the diff) …
        attention_mask = text_inputs['attention_mask'].to(device)

        with torch.no_grad():
            outputs = xclip_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                pixel_values=pixel_values,
                return_dict=True
            )
            video_embeds = outputs.video_embeds

            logits = model(video_embeds)
            probs = torch.softmax(logits, dim=1)
            confidence, pred_class = torch.max(probs, 1)

        predicted_label = id_to_label[pred_class.item()]
        confidence_value = confidence.item()

        return predicted_label, confidence_value

    except Exception as e:
        return "Unknown", 0.0
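The preprocessing between `extract_frames` and the `attention_mask` line is also elided. With `XCLIPProcessor` it would plausibly look like the sketch below; the prompt text is invented, and only the variable names (`text_inputs`, `pixel_values`, `input_ids`) come from the surrounding code:

# Sketch of the elided preprocessing (assumption): X-CLIP needs both text
# tokens and video pixels, even though only video_embeds are used downstream.
text_inputs = processor(text=["a person signing"], videos=frames,
                        return_tensors="pt", padding=True)
pixel_values = text_inputs['pixel_values'].to(device)
input_ids = text_inputs['input_ids'].to(device)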

# ============================================================================
# FEEDBACK SYSTEM
# ============================================================================

FEEDBACK_FILE = "user_feedback.csv"
if not os.path.exists(FEEDBACK_FILE):
    pd.DataFrame(columns=['timestamp', 'video_path', 'predicted_label', 'correct_label', 'confidence']).to_csv(FEEDBACK_FILE, index=False)

def save_feedback(video_path, predicted_label, correct_label, confidence):
    """Save user feedback"""
    try:
        feedback_data = {
            'timestamp': datetime.now().isoformat(),
            'video_path': video_path,
            'predicted_label': predicted_label,
            'correct_label': correct_label,
            'confidence': confidence
        }

        df = pd.read_csv(FEEDBACK_FILE)
        df = pd.concat([df, pd.DataFrame([feedback_data])], ignore_index=True)
        df.to_csv(FEEDBACK_FILE, index=False)

        corrections = len(df[df['predicted_label'] != df['correct_label']])

        if corrections >= 5:
            return f"✅ Thank you! Ready for model improvement ({corrections}/5)"
        else:
            return f"✅ Thank you! {5-corrections} more needed for retraining"

    except Exception as e:
        return f"❌ Error saving feedback: {str(e)}"
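Each submission appends one row to `user_feedback.csv`; an illustrative row (values invented for the example):

timestamp,video_path,predicted_label,correct_label,confidence
2025-01-15T10:32:07,/tmp/gradio/clip.mp4,hello,good,0.62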

# ============================================================================
# CLEAN GRADIO INTERFACE - MINIMAL
# ============================================================================

# Custom CSS for clean orange/black theme
custom_css = """
.gradio-container {
    background: linear-gradient(135deg, #1a1a1a 0%, #2d2d2d 100%);
    font-family: 'Arial', sans-serif;
    max-width: 900px !important;
    margin: 0 auto !important;
}

h1 {
    color: #ff6b35 !important;
    text-align: center;
    margin-bottom: 10px !important;
}

.gr-markdown p {
    color: #cccccc !important;
    text-align: center;
    font-size: 16px !important;
}

.gr-box {
    border: 2px dashed #ff6b35 !important;
    background: #2d2d2d !important;
    border-radius: 10px !important;
}

.primary {
    background: #ff6b35 !important;
    border: none !important;
    color: white !important;
    font-weight: bold !important;
}

.primary:hover {
    background: #e55a2b !important;
}

.secondary {
    background: #444444 !important;
    border: 1px solid #ff6b35 !important;
    color: white !important;
}

.secondary:hover {
    background: #555555 !important;
}

.gr-dropdown {
    background: #2d2d2d !important;
    color: white !important;
    border: 1px solid #ff6b35 !important;
}

/* Results styling */
.results-box {
    background: #2d2d2d !important;
    padding: 20px !important;
    border-radius: 10px !important;
    border-left: 4px solid #ff6b35 !important;
    margin-top: 20px !important;
}
"""

def predict_video_clean(video_file):
    """Clean prediction function - simple output only"""
    try:
        # Every branch must return five values, one per component wired in
        # predict_btn.click's outputs list.
        if video_file is None:
            return ("**Please upload a sign language video to get started.**",
                    gr.update(visible=False), "", None, 0.0)

        predicted_label, confidence = predict_sign(video_file)

        # SIMPLE CLEAN RESULTS - NO CONFIDENCE BARS
        result = f"""
## Sign Language Translation Result

**Detected Sign:** {predicted_label}

**Confidence:** {confidence*100:.1f}%

**Translation:** This sign means "{predicted_label}" in Ugandan Sign Language

---

**Model Information:**
- Model: X-CLIP Fine-tuned
- Classes: {len(id_to_label)} signs
- Training: Ugandan Sign Language Dataset

*Think the prediction is wrong? Help improve the model below.*
"""

        return result, gr.update(visible=True), predicted_label, video_file, confidence

    except Exception as e:
        return f"**Error processing video:** {str(e)}", gr.update(visible=False), "", None, 0.0

def submit_feedback_clean(predicted_label, user_correction, video_path, confidence):
    """Clean feedback submission"""
    if user_correction == "" or user_correction is None:
        return "Please select what the sign actually was."

    result = save_feedback(video_path, predicted_label, user_correction, confidence)

    if user_correction != predicted_label:
        result += f"\n\nCorrection recorded: **{predicted_label}** → **{user_correction}**"

    return result

# ============================================================================
# CREATE CLEAN MINIMAL INTERFACE
# ============================================================================

with gr.Blocks(css=custom_css, title="Ugandan Sign Language Translator") as demo:

    gr.Markdown("""
    # 🤟 Ugandan Sign Language Translator
    *Upload a video of Ugandan Sign Language and get instant translation!*

    **Supported signs:** hello, how, good, please, sign language, and more...
    """)

    # Main content - simple two-column layout
    with gr.Row():
        # Left column - Upload
        with gr.Column(scale=1):
            gr.Markdown("### 📤 Upload Video")
            video_input = gr.Video(
                label="",
                sources=["upload"]
            )

            # Action buttons
            with gr.Row():
                predict_btn = gr.Button("🚀 Analyze Sign", variant="primary", scale=2)
                clear_btn = gr.Button("🗑️ Clear", variant="secondary", scale=1)

        # Right column - Results
        with gr.Column(scale=1):
            gr.Markdown("### 🎯 Results")
            results_output = gr.Markdown(
                value="**Upload a sign language video to begin analysis.**"
            )

    # Feedback section (hidden until needed)
    with gr.Row(visible=False) as feedback_section:
        with gr.Column():
            gr.Markdown("### 💡 Help Improve Accuracy")
            with gr.Row():
                correction_dropdown = gr.Dropdown(
                    choices=list(id_to_label.values()),
                    label="If the prediction was wrong, select the correct sign:",
                    value=""
                )
            feedback_btn = gr.Button("📝 Submit Correction", variant="secondary")
            feedback_output = gr.Markdown()

    # Hidden states
    current_prediction = gr.State()
    current_video_path = gr.State()
    current_confidence = gr.State()

    # Prediction logic
    predict_btn.click(
        fn=predict_video_clean,
        inputs=[video_input],
        outputs=[results_output, feedback_section, current_prediction, current_video_path, current_confidence]
    )

    # Feedback logic
    feedback_btn.click(
        fn=submit_feedback_clean,
        inputs=[current_prediction, correction_dropdown, current_video_path, current_confidence],
        outputs=[feedback_output]
    )

    # Clear button - resets everything
    def clear_interface():
        return None, "**Upload a sign language video to begin analysis.**", gr.update(visible=False), "", None, 0.0, ""

    clear_btn.click(
        fn=clear_interface,
        outputs=[video_input, results_output, feedback_section, current_prediction, current_video_path, current_confidence, feedback_output]
    )
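Gradio maps a handler's return tuple onto its `outputs` list positionally, which is why `predict_video_clean` and `clear_interface` return exactly as many values as their wiring lists components; for illustration:

# clear_interface's seven return values land on the seven output components:
#   None                     -> video_input        (clears the player)
#   "**Upload a sign...**"   -> results_output     (placeholder text)
#   gr.update(visible=False) -> feedback_section   (hides the feedback row)
#   "", None, 0.0            -> the three gr.State holders
#   ""                       -> feedback_output    (clears old messages)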

# Launch the app
if __name__ == "__main__":
    demo.launch(
        share=True
    )
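A note on the launch flags: `share=True` asks Gradio for a temporary public gradio.live link alongside the local server. If this file runs as a Hugging Face Space (an assumption), the Space is already served publicly and the flag is typically ignored, so the minimal equivalent there is:

# Equivalent when hosted on Spaces (assumed deployment target):
demo.launch()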
|