updated app.py
app.py CHANGED
@@ -8,6 +8,9 @@ import numpy as np
 from PIL import Image
 import tempfile
 import os
+import json
+from datetime import datetime
+import pandas as pd
 
 # Your exact model class
 class XCLIPSignLanguageClassifier(nn.Module):
@@ -38,15 +41,21 @@ processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
 # Load your trained model
 try:
     checkpoint = torch.load("best_xclip_model.pth", map_location=device, weights_only=False)
-    model = XCLIPSignLanguageClassifier(num_classes=len(checkpoint["id_to_label"]))
+    model = XCLIPSignLanguageClassifier(num_classes=len(checkpoint["id_to_label"])).to(device)
     model.load_state_dict(checkpoint["model_state_dict"])
     model.eval()
     id_to_label = checkpoint["id_to_label"]
+    label_to_id = {v: k for k, v in id_to_label.items()}
     print(f"✅ Model loaded! Can recognize {len(id_to_label)} signs: {list(id_to_label.values())}")
 except Exception as e:
     print(f"❌ Error loading model: {e}")
     exit(1)
 
+# Continuous learning storage
+FEEDBACK_FILE = "user_feedback.csv"
+if not os.path.exists(FEEDBACK_FILE):
+    pd.DataFrame(columns=['timestamp', 'video_path', 'predicted_label', 'correct_label', 'confidence']).to_csv(FEEDBACK_FILE, index=False)
+
 def extract_frames(video_path, num_frames=8):
     """Extract frames from video file"""
     try:
@@ -76,24 +85,8 @@ def extract_frames(video_path, num_frames=8):
         print(f"Frame extraction error: {e}")
         return [Image.new("RGB", (224, 224), (128, 128, 128)) for _ in range(num_frames)]
 
-def predict_video(video_file, user_correction=None):
-    """Predict sign language from uploaded video"""
-    try:
-        # Get prediction
-        predicted_label, confidence = predict_sign(video_file, model, processor, id_to_label, device)
-
-        # Format results - EXACT SAME as our Colab interface
-        result = f"🎯 **Prediction**: {predicted_label}\n"
-        result += f"📊 **Confidence**: {confidence*100:.1f}%\n"
-        result += f"🔍 **Model**: X-CLIP Fine-tuned"
-
-        return result
-
-    except Exception as e:
-        return f"❌ Error processing video: {str(e)}"
-
 def predict_sign(video_path, model, processor, id_to_label, device):
-    """Core prediction function"""
+    """Core prediction function with detailed outputs"""
     try:
         # Sample frames
         frames = extract_frames(video_path)
@@ -110,23 +103,190 @@ def predict_sign(video_path, model, processor, id_to_label, device):
         logits = model(input_ids, attention_mask, pixel_values)
         probs = torch.softmax(logits, dim=1)
         confidence, pred_class = torch.max(probs, 1)
+
+        # Get all probabilities for detailed analysis
+        all_probs = probs.cpu().numpy()[0]
 
-        return id_to_label[pred_class.item()], confidence.item()
+        predicted_label = id_to_label[pred_class.item()]
+        confidence_value = confidence.item()
+
+        # Create confidence breakdown
+        confidence_details = []
+        for i, prob in enumerate(all_probs):
+            confidence_details.append(f"{id_to_label[i]}: {prob*100:.1f}%")
+
+        return predicted_label, confidence_value, confidence_details, all_probs
 
     except Exception as e:
        print(f"❌ Prediction error: {e}")
-        return "Unknown", 0.0
-
-
-
-
-
-
-
-
-
-
+        return "Unknown", 0.0, [], []
+
+def save_feedback(video_path, predicted_label, correct_label, confidence):
+    """Save user feedback for continuous learning"""
+    try:
+        feedback_data = {
+            'timestamp': datetime.now().isoformat(),
+            'video_path': video_path,
+            'predicted_label': predicted_label,
+            'correct_label': correct_label,
+            'confidence': confidence
+        }
+
+        # Append to CSV
+        df = pd.read_csv(FEEDBACK_FILE)
+        df = pd.concat([df, pd.DataFrame([feedback_data])], ignore_index=True)
+        df.to_csv(FEEDBACK_FILE, index=False)
+
+        return f"✅ Feedback saved! We'll use this to improve the model."
+    except Exception as e:
+        return f"❌ Error saving feedback: {str(e)}"
+
+def predict_video(video_file):
+    """Predict sign language from uploaded video with detailed results"""
+    try:
+        if video_file is None:
+            return "## 📹 Please upload a video file", "", gr.update(visible=False)
+
+        # Get detailed prediction
+        predicted_label, confidence, confidence_details, all_probs = predict_sign(
+            video_file, model, processor, id_to_label, device
+        )
+
+        # Create detailed results
+        result = f"""
+## **Sign Language Translation Result**:
+
+### **Detected Sign:** {predicted_label}
+
+### **Confidence Level:** {confidence*100:.1f}%
+
+### **Translation:** This sign means "{predicted_label}" in Ugandan Sign Language
+
+---
+
+## Detailed Analysis:
+
+**Confidence Breakdown:**
+"""
+
+        # Add confidence bars for each class
+        for i, (label, prob) in enumerate(zip(id_to_label.values(), all_probs)):
+            bar_length = int(prob * 20)  # Scale to 20 characters
+            bar = "█" * bar_length + "░" * (20 - bar_length)
+            result += f"\n**{label}:** {bar} {prob*100:.1f}%"
+
+        result += f"""
+
+---
+
+### 🔧 Model Information:
+- **Model:** X-CLIP Fine-tuned on Ugandan Sign Language
+- **Supported Signs:** {len(id_to_label)} classes
+- **Top Confidence:** {confidence*100:.1f}%
+- **All Classes:** {', '.join(id_to_label.values())}
+
+---
+
+**🤔 Was this prediction correct?** Use the feedback section below to help improve the model!
+"""
+
+        # Show feedback section
+        feedback_section = gr.update(visible=True)
+
+        return result, predicted_label, feedback_section
+
+    except Exception as e:
+        return f"## ❌ Error Processing Video\n\n**Error:** {str(e)}\n\nPlease try another video file.", "", gr.update(visible=False)
+
+def submit_feedback(predicted_label, user_correction, video_path):
+    """Handle user feedback for continuous learning"""
+    if user_correction == "" or user_correction is None:
+        return "⚠️ Please select the correct sign label"
+
+    if user_correction == predicted_label:
+        return "✅ Thank you for confirming the prediction was correct!"
+
+    # Save correction feedback
+    result = save_feedback(video_path, predicted_label, user_correction, 0.0)
+
+    # Additional improvement message
+    result += f"\n\n📈 **Model Improvement:** The model will learn from this correction!"
+    result += f"\n**Wrong:** {predicted_label} → **Correct:** {user_correction}"
+    result += f"\n\n💡 This feedback will be used to retrain and improve the model accuracy."
+
+    return result
+
+# Create the enhanced interface
+with gr.Blocks(theme=gr.themes.Soft(), title="Ugandan Sign Language Translator") as demo:
+    gr.Markdown("""
+    # 🤟 Ugandan Sign Language Translation Tool
+
+    **Upload a video of Ugandan Sign Language and get instant translation with detailed analysis!**
+
+    *Supported signs: hello, how, good, please, sign language*
+    """)
+
+    with gr.Row():
+        with gr.Column():
+            video_input = gr.Video(
+                label="📹 Upload Sign Language Video",
+                sources=["upload"],
+                type="filepath"
+            )
+            predict_btn = gr.Button("🚀 Analyze Sign Language", variant="primary")
+
+        with gr.Column():
+            results_output = gr.Markdown(
+                label="🎯 Translation Results",
+                value="## 📤 Upload a video to get started..."
+            )
+
+    # Hidden state for current prediction
+    current_prediction = gr.State()
+    current_video_path = gr.State()
+
+    # Feedback section (initially hidden)
+    with gr.Row(visible=False) as feedback_row:
+        with gr.Column():
+            gr.Markdown("## 💡 Help Improve The Model")
+            correction_dropdown = gr.Dropdown(
+                choices=list(id_to_label.values()),
+                label="What was the correct sign?",
+                info="Select the actual sign in the video"
+            )
+            feedback_btn = gr.Button("📈 Submit Correction", variant="secondary")
+            feedback_output = gr.Markdown()
+
+    # Prediction logic
+    predict_btn.click(
+        fn=predict_video,
+        inputs=[video_input],
+        outputs=[results_output, current_prediction, feedback_row]
+    ).then(
+        lambda video: video,
+        inputs=[video_input],
+        outputs=[current_video_path]
+    )
+
+    # Feedback logic
+    feedback_btn.click(
+        fn=submit_feedback,
+        inputs=[current_prediction, correction_dropdown, current_video_path],
+        outputs=[feedback_output]
+    )
+
+    # Examples
+    gr.Markdown("### 📚 How to use:")
+    gr.Markdown("""
+    1. **Upload** a video of someone performing sign language
+    2. **Click Analyze** to get the translation
+    3. **Review** the detailed confidence analysis
+    4. **Provide feedback** if the prediction was wrong (this helps improve the model!)
+    """)
 
 # For Hugging Face Spaces deployment
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(
+        share=True,
+        show_error=True
+    )
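Note: the hunk context elides the middle of predict_sign, where the sampled frames become the input_ids, attention_mask, and pixel_values tensors used above. A minimal sketch of what that step presumably looks like, using the standard XCLIPProcessor call; the variable names mirror the surrounding code but are assumptions, not taken from the commit:

    # Inside predict_sign, after frames = extract_frames(video_path) (assumed, not shown in the diff):
    inputs = processor(
        text=list(id_to_label.values()),  # one text prompt per class
        videos=[frames],                  # a single clip as a list of PIL frames
        return_tensors="pt",
        padding=True,
    )
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)
    pixel_values = inputs["pixel_values"].to(device)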
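The commit writes corrections to user_feedback.csv but never reads them back. A minimal sketch of how a later retraining script could consume the file, assuming the CSV schema created above; load_corrections is a hypothetical helper, not part of the commit:

    import pandas as pd

    def load_corrections(label_to_id, feedback_file="user_feedback.csv"):
        """Return (video_path, class_id) pairs where the user corrected the model."""
        df = pd.read_csv(feedback_file)
        # Keep only rows where the user's label disagrees with the prediction
        corrections = df[df["correct_label"] != df["predicted_label"]]
        return [(row["video_path"], label_to_id[row["correct_label"]])
                for _, row in corrections.iterrows()]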