Update app.py
app.py
CHANGED
@@ -1,4 +1,4 @@
-# app.py -
+# app.py - CORRECTED VERSION (Uses MinimalClassifier from your training)
 import torch
 import torch.nn as nn
 from transformers import XCLIPProcessor, XCLIPModel
@@ -13,10 +13,11 @@ import os
 print("🚀 Loading Ugandan Sign Language Model...")

 # ============================================================================
-# MODEL SETUP -
+# MODEL SETUP - MINIMALCLASSIFIER (Matches Your Training)
 # ============================================================================

 class MinimalClassifier(nn.Module):
+    """SIMPLE classifier - matches your training notebook exactly"""
     def __init__(self, input_dim=512, num_classes=85, dropout=0.5):
         super().__init__()
         self.classifier = nn.Sequential(
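The hunk cuts off inside `nn.Sequential(`. For orientation, here is a minimal sketch of a head matching the constructor signature shown (`input_dim=512, num_classes=85, dropout=0.5`); the exact layer stack is an assumption, not the repo's code:

```python
import torch.nn as nn

class MinimalClassifier(nn.Module):
    """Sketch only: layer stack assumed, signature taken from the diff."""
    def __init__(self, input_dim=512, num_classes=85, dropout=0.5):
        super().__init__()
        # Dropout + a single linear layer over the 512-d X-CLIP video embedding
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(input_dim, num_classes),
        )

    def forward(self, x):
        return self.classifier(x)
```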
@@ -32,36 +33,24 @@ processor = XCLIPProcessor.from_pretrained("microsoft/xclip-base-patch32")
 xclip_model = XCLIPModel.from_pretrained("microsoft/xclip-base-patch32").to(device)
 xclip_model.eval()

-# Load your trained model - WITH
+# Load your trained model - WITH MINIMALCLASSIFIER
 try:
-    checkpoint = torch.load("
+    checkpoint = torch.load("finetuned_xclip_model.pth", map_location=device, weights_only=False)

     # DEBUG: Check what's in the checkpoint
     print(f"🔍 Checkpoint keys: {list(checkpoint.keys())}")

-    #
+    # Get num_classes
     if 'num_classes' in checkpoint:
         num_classes = checkpoint['num_classes']
+    elif 'id_to_label' in checkpoint:
+        num_classes = len(checkpoint['id_to_label'])
     else:
-
-        if 'id_to_label' in checkpoint:
-            num_classes = len(checkpoint['id_to_label'])
-        elif 'label_to_id' in checkpoint:
-            num_classes = len(checkpoint['label_to_id'])
-        else:
-            # Count from model weights
-            for key in checkpoint.keys():
-                if 'model_state_dict' in checkpoint:
-                    weight_key = [k for k in checkpoint['model_state_dict'].keys() if 'classifier' in k and 'weight' in k]
-                    if weight_key:
-                        num_classes = checkpoint['model_state_dict'][weight_key[0]].shape[0]
-                    break
-            else:
-                num_classes = 85  # Default fallback
+        num_classes = 85  # Default

     print(f"✅ Using num_classes: {num_classes}")

-    # Initialize
+    # Initialize with MINIMALCLASSIFIER (your actual architecture)
     model = MinimalClassifier(
         input_dim=512,
         num_classes=num_classes,
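The removed fallback scanned `checkpoint.keys()` without ever using the loop variable, and read the first matching classifier weight rather than the last. If weight-shape inference is ever needed again, a corrected sketch under the same checkpoint layout:

```python
def infer_num_classes(checkpoint, default=85):
    """Recover the class count from a checkpoint saved as in this app."""
    if 'num_classes' in checkpoint:
        return checkpoint['num_classes']
    if 'id_to_label' in checkpoint:
        return len(checkpoint['id_to_label'])
    if 'label_to_id' in checkpoint:
        return len(checkpoint['label_to_id'])
    # Fall back to the classifier's final weight matrix: shape [num_classes, in_dim]
    state_dict = checkpoint.get('model_state_dict', checkpoint)
    weight_keys = [k for k in state_dict if 'classifier' in k and k.endswith('weight')]
    if weight_keys:
        return state_dict[weight_keys[-1]].shape[0]
    return default
```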
@@ -72,14 +61,12 @@ try:
     if 'model_state_dict' in checkpoint:
         model.load_state_dict(checkpoint['model_state_dict'])
     else:
-        # If checkpoint IS the state dict
         model.load_state_dict(checkpoint)

     # Load label mappings
     if 'id_to_label' in checkpoint:
         id_to_label = checkpoint['id_to_label']
     else:
-        # Create default mapping
         id_to_label = {i: f"class_{i}" for i in range(num_classes)}
         print("⚠️ Created default label mapping")

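The `else:` branches above only fire when the checkpoint lacks label metadata. A sketch of the matching save call on the training side, so real labels survive the round trip (`model`, `id_to_label`, and `label_to_id` are the training notebook's objects, assumed here):

```python
# Hypothetical training-side save; key names match what app.py reads
torch.save({
    'model_state_dict': model.state_dict(),
    'num_classes': len(id_to_label),
    'id_to_label': id_to_label,    # e.g. {0: 'hello', 1: 'thank_you', ...} (illustrative)
    'label_to_id': label_to_id,
}, "finetuned_xclip_model.pth")
```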
@@ -91,7 +78,7 @@ try:

 except Exception as e:
     print(f"❌ Error loading model: {e}")
-    print("💡
+    print("💡 Make sure your model file uses MinimalClassifier architecture")
     exit(1)

 # ============================================================================
@@ -133,7 +120,7 @@ def extract_frames(video_path, num_frames=8):
         return [Image.new('RGB', (224, 224), (0, 0, 0)) for _ in range(num_frames)]

 def predict_sign(video_path):
-    """Predict sign from video"""
+    """Predict sign from video """
     try:
         frames = extract_frames(video_path)

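`extract_frames` itself sits outside the hunk; only its black-frame fallback is visible. A minimal sketch of a sampler consistent with that signature and fallback, using OpenCV (an assumption; the actual decoder may differ):

```python
import cv2
import numpy as np
from PIL import Image

def extract_frames(video_path, num_frames=8):
    """Sample num_frames evenly spaced frames as 224x224 RGB PIL images."""
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total <= 0:
        cap.release()
        return [Image.new('RGB', (224, 224), (0, 0, 0)) for _ in range(num_frames)]
    frames = []
    for idx in np.linspace(0, total - 1, num_frames, dtype=int):
        cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ok, frame = cap.read()
        if not ok:  # decoder hiccup: fall back to a black frame
            frame = np.zeros((224, 224, 3), dtype=np.uint8)
        frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)).resize((224, 224)))
    cap.release()
    return frames
```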
@@ -145,6 +132,7 @@ def predict_sign(video_path):
         attention_mask = text_inputs['attention_mask'].to(device)

         with torch.no_grad():
+            # Extract features using X-CLIP
             outputs = xclip_model(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
@@ -153,6 +141,7 @@ def predict_sign(video_path):
             )
             video_embeds = outputs.video_embeds

+            # Classify with MinimalClassifier (takes features as input)
             logits = model(video_embeds)
             probs = torch.softmax(logits, dim=1)
             confidence, pred_class = torch.max(probs, 1)
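`pred_class` is an id, so the UI needs `id_to_label` to name it. A small sketch of a top-3 readout using the tensors already computed above (label strings in the comment are illustrative):

```python
def top_k_predictions(probs, id_to_label, k=3):
    """Map the k highest-probability class ids to sign labels."""
    top_probs, top_ids = torch.topk(probs, k=k, dim=1)
    return [(id_to_label[i], p) for i, p in
            zip(top_ids[0].tolist(), top_probs[0].tolist())]

# e.g. [('hello', 0.62), ('thank_you', 0.21), ('please', 0.08)]  (illustrative)
```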
@@ -265,6 +254,16 @@ h1 {
     border-left: 4px solid #ff6b35 !important;
     margin-top: 20px !important;
 }
+
+/* Add to your custom_css */
+#video-upload {
+    border: 2px dashed #ff6b35 !important;
+}
+
+#video-upload:hover {
+    border-color: #e55a2b !important;
+    background: #3d3d3d !important;
+}
 """

 def predict_video_clean(video_file):
@@ -331,9 +330,12 @@ with gr.Blocks(css=custom_css, title="Ugandan Sign Language Translator") as demo
         with gr.Column(scale=1):
             gr.Markdown("### 📤 Upload Video")
             video_input = gr.Video(
-                label="",
-                sources=["upload"]
+                label="📱 Upload or Record Video",
+                sources=["upload", "webcam"],
+                elem_id="video-upload"
             )
+
+

             # Action buttons
             with gr.Row():
@@ -359,6 +361,27 @@ with gr.Blocks(css=custom_css, title="Ugandan Sign Language Translator") as demo
             )
             feedback_btn = gr.Button("📝 Submit Correction", variant="secondary")
             feedback_output = gr.Markdown()
+
+            gr.Markdown("---")
+            gr.Markdown("### 📚 Example Videos")
+
+            # Create examples from your dataset (same as your testing UI)
+            example_videos = []
+            for i in range(min(3, len(full_df))):
+                if os.path.exists(full_df.iloc[i]['video_path']):
+                    example_videos.append([full_df.iloc[i]['video_path']])
+
+            if example_videos:
+                gr.Examples(
+                    examples=example_videos,
+                    inputs=[video_input],
+                    label="Try these example videos:",
+                    # Optional: You can also add outputs if you want auto-prediction
+                    # outputs=[results_output],
+                    # fn=predict_video_clean,
+                )
+            else:
+                gr.Markdown("*No example videos available*")

             # Hidden states
             current_prediction = gr.State()
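The commented-out `fn`/`outputs` arguments hint at auto-prediction when an example is clicked. A sketch of that wired-up variant, assuming `predict_video_clean` returns whatever `results_output` renders:

```python
gr.Examples(
    examples=example_videos,
    inputs=[video_input],
    outputs=[results_output],
    fn=predict_video_clean,       # run prediction when an example is selected
    cache_examples=False,         # set True to precompute results at startup
    label="Try these example videos:",
)
```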