abbasNoway committed on
Commit
3a680d6
·
verified ·
1 Parent(s): 66bc908

create urdu_specific_embedding.py

Browse files
Files changed (1) hide show
  1. urdu_specific_embedding.py +39 -128
urdu_specific_embedding.py CHANGED
@@ -1,94 +1,55 @@
1
- # urdu_specific_embedding.py
2
- import pandas as pd
3
- import numpy as np
4
  from sentence_transformers import SentenceTransformer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  import pickle
7
  import os
8
- import torch
9
-
10
def create_urdu_optimized_model(input_csv, output_dir, top_k_emojis=80,
                                max_samples_per_emoji=300, min_samples_per_emoji=10):
    """Build and pickle an embedding-based emoji model optimized for Urdu.

    Reads a CSV with 'text' and 'emoji' columns, keeps a curated list of
    meaningful emojis that actually occur in the data, averages sentence
    embeddings of each emoji's texts, and pickles the result.

    Args:
        input_csv: Path to a CSV containing 'text' and 'emoji' columns.
        output_dir: Directory where 'urdu_optimized_model.pkl' is written.
        top_k_emojis: Maximum number of curated emojis to keep.
        max_samples_per_emoji: Cap on texts embedded per emoji
            (was a hard-coded 300; default preserves old behavior).
        min_samples_per_emoji: Emojis with this many texts or fewer are
            dropped (was a hard-coded 10; default preserves old behavior).

    Returns:
        output_dir, so the result can be passed straight to the predictor.
    """
    print("Creating Urdu-optimized embedding model...")

    # Load data
    df = pd.read_csv(input_csv)

    # Frequency table, used only to keep curated emojis that occur in the data.
    emoji_counts = df['emoji'].value_counts()

    # Manual selection of most meaningful emojis (remove flags, symbols, etc.)
    meaningful_emojis = [
        '😂', '❤', '💔', '😜', '😁', '❤️', '😍', '🌹', '🙏', '🔥',
        '😊', '😅', '😭', '😀', '💕', '💯', '😉', '😆', '😝', '😏',
        '😌', '👍', '😢', '😔', '😐', '💞', '😒', '😎', '👌', '😋',
        '😄', '😡', '🤔', '🌸', '✨', '🌷', '😕', '😇', '✌', '😃',
        '😑', '😳', '😛', '💪', '😥', '👏', '🤣', '💐', '😬', '💖',
        '🌚', '😷', '🌺', '😘', '😠', '💓', '☺', '😞', '💗', '🙌',
        '😪', '🍃', '☹️', '🥀', '😹', '💙', '🌻', '😱', '🤪', '🙃',
        '💝', '😓', '🌼', '😣', '🤦‍♂️', '🎉', '🎊', '🥰', '🤗', '😴'
    ]

    # Use our selected meaningful emojis, limited to the requested count.
    common_emojis = [e for e in meaningful_emojis if e in emoji_counts.index][:top_k_emojis]

    focused_df = df[df['emoji'].isin(common_emojis)]

    print(f"Urdu-optimized dataset: {len(focused_df)} samples, {len(common_emojis)} emojis")

    # Larger multilingual model — chosen here for better Urdu coverage.
    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

    # Strategic sampling: cap texts per emoji so very frequent emojis
    # don't dominate embedding time.
    emoji_to_texts = {}
    for emoji in common_emojis:
        emoji_texts = focused_df[focused_df['emoji'] == emoji]['text'].tolist()
        emoji_to_texts[emoji] = emoji_texts[:max_samples_per_emoji]

    print("Generating Urdu-optimized embeddings...")
    emoji_embeddings = {}

    for emoji, emoji_texts in emoji_to_texts.items():
        if len(emoji_texts) > min_samples_per_emoji:  # only emojis with sufficient data
            text_embeds = model.encode(emoji_texts, convert_to_tensor=True, batch_size=16)
            # Mean-pool the text embeddings into one prototype vector per emoji.
            emoji_embed = text_embeds.mean(dim=0).cpu().numpy()
            emoji_embeddings[emoji] = (emoji_embed, len(emoji_texts))

    # Save the optimized model
    os.makedirs(output_dir, exist_ok=True)

    model_data = {
        'emoji_embeddings': emoji_embeddings,
        'emoji_list': list(emoji_embeddings.keys()),
        'model_name': 'paraphrase-multilingual-mpnet-base-v2'
    }

    with open(os.path.join(output_dir, 'urdu_optimized_model.pkl'), 'wb') as f:
        pickle.dump(model_data, f)

    print(f"✅ Urdu-optimized model saved with {len(emoji_embeddings)} emojis")
    return output_dir
74
 
75
  class UrduOptimizedPredictor:
76
    def __init__(self, model_path):
        """Load the Urdu emoji model pickled by create_urdu_optimized_model.

        Args:
            model_path: Directory containing 'urdu_optimized_model.pkl'.
        """
        # Run the sentence encoder on GPU when one is available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.text_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
        self.text_model.to(self.device)

        # Load optimized model.
        # NOTE(review): pickle.load is unsafe on untrusted files — confirm the
        # model pickle is always produced locally by the training step.
        with open(f'{model_path}/urdu_optimized_model.pkl', 'rb') as f:
            model_data = pickle.load(f)

        # Each stored value is (mean_embedding, sample_count); keep only the
        # embedding (index 0) for prediction.
        self.emoji_embeddings = {k: v[0] for k, v in model_data['emoji_embeddings'].items()}
        self.emoji_list = model_data['emoji_list']

        print(f"✅ Loaded Urdu-optimized model with {len(self.emoji_list)} meaningful emojis")
 
 
 
 
89
 
90
- def predict_smart(self, text, top_k=3, min_confidence=0.4):
91
  """Smart prediction with confidence filtering"""
 
 
 
 
92
  # Get text embedding
93
  text_embedding = self.text_model.encode([text], convert_to_tensor=True)
94
  text_embedding_np = text_embedding.cpu().numpy()
@@ -108,54 +69,4 @@ class UrduOptimizedPredictor:
108
  top_overall = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:1]
109
  return top_overall
110
 
111
- return sorted_emojis[:top_k]
112
-
113
- def explain_prediction(self, text):
114
- """Provide explanation for predictions"""
115
- predictions = self.predict_smart(text, top_k=3, min_confidence=0.3)
116
-
117
- print(f"\n🧠 Analysis for: '{text}'")
118
- print("🎯 Top predictions:")
119
-
120
- for i, (emoji, score) in enumerate(predictions, 1):
121
- confidence = "HIGH" if score > 0.6 else "MEDIUM" if score > 0.4 else "LOW"
122
- print(f" {i}. {emoji} (score: {score:.3f}) - {confidence} confidence")
123
-
124
- return predictions
125
-
126
def final_evaluation():
    """Build the Urdu-optimized model, then explain predictions on a fixed
    set of Urdu example sentences with their expected emojis."""
    # Train/export the optimized model first.
    optimized_dir = create_urdu_optimized_model(
        "urdu_emoji_training_data_proper.csv",
        "models/urdu_optimized_model",
        top_k_emojis=80
    )

    # Load it back through the predictor.
    predictor = UrduOptimizedPredictor(optimized_dir)

    # (input text, human-readable expectation) pairs covering key emotions.
    test_cases = [
        ("میں بہت خوش ہوں", "Should show 😊😄😂"),
        ("امی نے میری پسندیدہ ڈش بنائی ہے", "Should show 😋❤️🍛"),
        ("آج کا دن بہت برا گزرا", "Should show 😞😢💔"),
        ("دل ٹوٹ گیا ہے", "Should show 💔😭😢"),
        ("آج کی بارش نے موسم کو خوشگوار بنا دیا", "Should show 🌧️🌈☔"),
        ("دوستوں کے ساتھ پارٹی کا مزہ آیا", "Should show 🎉😄👯"),
        ("محبت میں پڑ گیا ہوں", "Should show ❤️😍💕"),
        ("غصہ سے دماغ پھٹ رہا ہے", "Should show 😠💢🤬"),
        ("نیند آ رہی ہے تھک گیا ہوں", "Should show 😴💤🥱")
    ]

    banner = "=" * 60
    print("\n" + banner)
    print("FINAL URDU-OPTIMIZED PREDICTIONS")
    print(banner)

    for sentence, expectation in test_cases:
        predictor.explain_prediction(sentence)
        print(f" 💡 Expected: {expectation}")
        print()
159
-
160
if __name__ == "__main__":
    # Build the model and run the full evaluation when executed as a script.
    final_evaluation()
 
1
+ # urdu_specific_embedding.py (Updated)
2
+ import torch
 
3
  from sentence_transformers import SentenceTransformer
4
  from sklearn.metrics.pairwise import cosine_similarity
5
  import pickle
6
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  class UrduOptimizedPredictor:
9
+ def __init__(self, model_path=None):
10
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
11
  self.text_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
12
  self.text_model.to(self.device)
13
 
14
+ # Try multiple possible model file locations
15
+ possible_paths = [
16
+ "urdu_optimized_model.pkl", # Direct in root
17
+ "./urdu_optimized_model.pkl", # Current directory
18
+ "models/urdu_optimized_model/urdu_optimized_model.pkl", # Local structure
19
+ "/data/urdu_optimized_model.pkl" # HF Spaces data directory
20
+ ]
21
 
22
+ model_loaded = False
23
+ for model_file in possible_paths:
24
+ if os.path.exists(model_file):
25
+ print(f"📁 Loading model from: {model_file}")
26
+ try:
27
+ with open(model_file, 'rb') as f:
28
+ model_data = pickle.load(f)
29
+
30
+ self.emoji_embeddings = {k: v[0] for k, v in model_data['emoji_embeddings'].items()}
31
+ self.emoji_list = model_data['emoji_list']
32
+
33
+ print(f"✅ Loaded Urdu-optimized model with {len(self.emoji_list)} meaningful emojis")
34
+ model_loaded = True
35
+ break
36
+
37
+ except Exception as e:
38
+ print(f"❌ Error loading {model_file}: {e}")
39
+ continue
40
 
41
+ if not model_loaded:
42
+ print("❌ Could not load model file. Please make sure urdu_optimized_model.pkl is uploaded.")
43
+ # Create empty structures to avoid crashes
44
+ self.emoji_embeddings = {}
45
+ self.emoji_list = []
46
 
47
+ def predict_smart(self, text, top_k=3, min_confidence=0.3):
48
  """Smart prediction with confidence filtering"""
49
+ # Check if model is loaded
50
+ if not self.emoji_embeddings:
51
+ return [("❌", 0.0)] # Return error emoji if model not loaded
52
+
53
  # Get text embedding
54
  text_embedding = self.text_model.encode([text], convert_to_tensor=True)
55
  text_embedding_np = text_embedding.cpu().numpy()
 
69
  top_overall = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:1]
70
  return top_overall
71
 
72
+ return sorted_emojis[:top_k]