Spaces:
Running
Running
create urdu_specific_embedding.py
Browse files- urdu_specific_embedding.py +39 -128
urdu_specific_embedding.py
CHANGED
|
@@ -1,94 +1,55 @@
|
|
| 1 |
-
# urdu_specific_embedding.py
import os
import pickle

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
|
| 9 |
-
|
| 10 |
-
def create_urdu_optimized_model(input_csv, output_dir, top_k_emojis=80,
                                max_samples_per_emoji=300,
                                min_samples_per_emoji=10):
    """Build and pickle an Urdu-optimized emoji-embedding model.

    Reads a CSV with 'text' and 'emoji' columns, keeps a hand-curated set of
    meaningful emojis that actually occur in the data, encodes up to
    ``max_samples_per_emoji`` example texts per emoji with a multilingual
    sentence-transformer, and stores the mean text embedding per emoji.

    Args:
        input_csv: Path to the training CSV (expects 'text' and 'emoji' columns).
        output_dir: Directory where 'urdu_optimized_model.pkl' is written
            (created if missing).
        top_k_emojis: Cap on how many curated emojis to keep.
        max_samples_per_emoji: Cap on texts encoded per emoji
            (previously hard-coded to 300).
        min_samples_per_emoji: An emoji needs MORE than this many texts to be
            embedded (previously hard-coded to 10).

    Returns:
        ``output_dir``, so callers can pass it straight to the predictor.
    """
    print("Creating Urdu-optimized embedding model...")

    # Load data
    df = pd.read_csv(input_csv)

    # Frequency table is used only to drop curated emojis absent from the data.
    emoji_counts = df['emoji'].value_counts()

    # Manual selection of most meaningful emojis (flags, rare symbols excluded).
    # NOTE(review): '🤦♂️' looks like it lost the U+200D joiner of '🤦‍♂️' —
    # left as-is because training labels must match it exactly; confirm upstream.
    meaningful_emojis = [
        '😂', '❤', '💔', '😜', '😁', '❤️', '😍', '🌹', '🙏', '🔥',
        '😊', '😅', '😭', '😀', '💕', '💯', '😉', '😆', '😝', '😏',
        '😌', '👍', '😢', '😔', '😐', '💞', '😒', '😎', '👌', '😋',
        '😄', '😡', '🤔', '🌸', '✨', '🌷', '😕', '😇', '✌', '😃',
        '😑', '😳', '😛', '💪', '😥', '👏', '🤣', '💐', '😬', '💖',
        '🌚', '😷', '🌺', '😘', '😠', '💓', '☺', '😞', '💗', '🙌',
        '😪', '🍃', '☹️', '🥀', '😹', '💙', '🌻', '😱', '🤪', '🙃',
        '💝', '😓', '🌼', '😣', '🤦♂️', '🎉', '🎊', '🥰', '🤗', '😴'
    ]

    # Keep only curated emojis that actually occur in the dataset.
    common_emojis = [e for e in meaningful_emojis if e in emoji_counts.index][:top_k_emojis]

    focused_df = df[df['emoji'].isin(common_emojis)]

    print(f"Urdu-optimized dataset: {len(focused_df)} samples, {len(common_emojis)} emojis")

    # Larger multilingual model — tried for better Urdu coverage.
    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

    # NOTE(review): this takes the FIRST max_samples_per_emoji rows, not a
    # random sample — fine if the CSV is unordered, biased otherwise.
    emoji_to_texts = {}
    for emoji in common_emojis:
        emoji_texts = focused_df[focused_df['emoji'] == emoji]['text'].tolist()
        emoji_to_texts[emoji] = emoji_texts[:max_samples_per_emoji]

    print("Generating Urdu-optimized embeddings...")
    emoji_embeddings = {}

    for emoji, emoji_texts in emoji_to_texts.items():
        # Only embed emojis with sufficient data (strictly more than the floor,
        # matching the original '> 10' behavior).
        if len(emoji_texts) > min_samples_per_emoji:
            text_embeds = model.encode(emoji_texts, convert_to_tensor=True, batch_size=16)
            # Mean-pool the per-text embeddings into one prototype vector;
            # keep the sample count alongside for later weighting/diagnostics.
            emoji_embed = text_embeds.mean(dim=0).cpu().numpy()
            emoji_embeddings[emoji] = (emoji_embed, len(emoji_texts))

    # Save the optimized model
    os.makedirs(output_dir, exist_ok=True)

    model_data = {
        'emoji_embeddings': emoji_embeddings,
        'emoji_list': list(emoji_embeddings.keys()),
        'model_name': 'paraphrase-multilingual-mpnet-base-v2'
    }

    with open(f'{output_dir}/urdu_optimized_model.pkl', 'wb') as f:
        pickle.dump(model_data, f)

    print(f"✅ Urdu-optimized model saved with {len(emoji_embeddings)} emojis")
    return output_dir
|
| 74 |
|
| 75 |
class UrduOptimizedPredictor:
|
| 76 |
-
def __init__(self, model_path):
|
| 77 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 78 |
self.text_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
|
| 79 |
self.text_model.to(self.device)
|
| 80 |
|
| 81 |
-
#
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
def predict_smart(self, text, top_k=3, min_confidence=0.
|
| 91 |
"""Smart prediction with confidence filtering"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
# Get text embedding
|
| 93 |
text_embedding = self.text_model.encode([text], convert_to_tensor=True)
|
| 94 |
text_embedding_np = text_embedding.cpu().numpy()
|
|
@@ -108,54 +69,4 @@ class UrduOptimizedPredictor:
|
|
| 108 |
top_overall = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:1]
|
| 109 |
return top_overall
|
| 110 |
|
| 111 |
-
return sorted_emojis[:top_k]
|
| 112 |
-
|
| 113 |
-
def explain_prediction(self, text):
|
| 114 |
-
"""Provide explanation for predictions"""
|
| 115 |
-
predictions = self.predict_smart(text, top_k=3, min_confidence=0.3)
|
| 116 |
-
|
| 117 |
-
print(f"\n🧠 Analysis for: '{text}'")
|
| 118 |
-
print("🎯 Top predictions:")
|
| 119 |
-
|
| 120 |
-
for i, (emoji, score) in enumerate(predictions, 1):
|
| 121 |
-
confidence = "HIGH" if score > 0.6 else "MEDIUM" if score > 0.4 else "LOW"
|
| 122 |
-
print(f" {i}. {emoji} (score: {score:.3f}) - {confidence} confidence")
|
| 123 |
-
|
| 124 |
-
return predictions
|
| 125 |
-
|
| 126 |
-
def final_evaluation():
    """Build the Urdu-optimized model, then print predictions for a fixed test suite.

    End-to-end smoke run: trains/saves the embedding model from the proper
    training CSV, loads it into a predictor, and prints an explained
    prediction next to the human-expected emojis for each sample sentence.
    """
    # Create the optimized model
    optimized_dir = create_urdu_optimized_model(
        "urdu_emoji_training_data_proper.csv",
        "models/urdu_optimized_model",
        top_k_emojis=80
    )

    # Test the optimized model
    predictor = UrduOptimizedPredictor(optimized_dir)

    # (Urdu sentence, human-readable expectation) pairs covering key emotions.
    test_cases = [
        ("میں بہت خوش ہوں", "Should show 😊😄😂"),
        ("امی نے میری پسندیدہ ڈش بنائی ہے", "Should show 😋❤️🍛"),
        ("آج کا دن بہت برا گزرا", "Should show 😞😢💔"),
        ("دل ٹوٹ گیا ہے", "Should show 💔😭😢"),
        ("آج کی بارش نے موسم کو خوشگوار بنا دیا", "Should show 🌧️🌈☔"),
        ("دوستوں کے ساتھ پارٹی کا مزہ آیا", "Should show 🎉😄👯"),
        ("محبت میں پڑ گیا ہوں", "Should show ❤️😍💕"),
        ("غصہ سے دماغ پھٹ رہا ہے", "Should show 😠💢🤬"),
        ("نیند آ رہی ہے تھک گیا ہوں", "Should show 😴💤🥱"),
    ]

    banner = "=" * 60
    print("\n" + banner)
    print("FINAL URDU-OPTIMIZED PREDICTIONS")
    print(banner)

    for sentence, expected in test_cases:
        predictor.explain_prediction(sentence)
        print(f" 💡 Expected: {expected}")
        print()


if __name__ == "__main__":
    final_evaluation()
|
|
|
|
| 1 |
+
# urdu_specific_embedding.py (Updated)
|
| 2 |
+
import torch
|
|
|
|
| 3 |
from sentence_transformers import SentenceTransformer
|
| 4 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 5 |
import pickle
|
| 6 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
class UrduOptimizedPredictor:
|
| 9 |
+
def __init__(self, model_path=None):
|
| 10 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 11 |
self.text_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
|
| 12 |
self.text_model.to(self.device)
|
| 13 |
|
| 14 |
+
# Try multiple possible model file locations
|
| 15 |
+
possible_paths = [
|
| 16 |
+
"urdu_optimized_model.pkl", # Direct in root
|
| 17 |
+
"./urdu_optimized_model.pkl", # Current directory
|
| 18 |
+
"models/urdu_optimized_model/urdu_optimized_model.pkl", # Local structure
|
| 19 |
+
"/data/urdu_optimized_model.pkl" # HF Spaces data directory
|
| 20 |
+
]
|
| 21 |
|
| 22 |
+
model_loaded = False
|
| 23 |
+
for model_file in possible_paths:
|
| 24 |
+
if os.path.exists(model_file):
|
| 25 |
+
print(f"📁 Loading model from: {model_file}")
|
| 26 |
+
try:
|
| 27 |
+
with open(model_file, 'rb') as f:
|
| 28 |
+
model_data = pickle.load(f)
|
| 29 |
+
|
| 30 |
+
self.emoji_embeddings = {k: v[0] for k, v in model_data['emoji_embeddings'].items()}
|
| 31 |
+
self.emoji_list = model_data['emoji_list']
|
| 32 |
+
|
| 33 |
+
print(f"✅ Loaded Urdu-optimized model with {len(self.emoji_list)} meaningful emojis")
|
| 34 |
+
model_loaded = True
|
| 35 |
+
break
|
| 36 |
+
|
| 37 |
+
except Exception as e:
|
| 38 |
+
print(f"❌ Error loading {model_file}: {e}")
|
| 39 |
+
continue
|
| 40 |
|
| 41 |
+
if not model_loaded:
|
| 42 |
+
print("❌ Could not load model file. Please make sure urdu_optimized_model.pkl is uploaded.")
|
| 43 |
+
# Create empty structures to avoid crashes
|
| 44 |
+
self.emoji_embeddings = {}
|
| 45 |
+
self.emoji_list = []
|
| 46 |
|
| 47 |
+
def predict_smart(self, text, top_k=3, min_confidence=0.3):
|
| 48 |
"""Smart prediction with confidence filtering"""
|
| 49 |
+
# Check if model is loaded
|
| 50 |
+
if not self.emoji_embeddings:
|
| 51 |
+
return [("❌", 0.0)] # Return error emoji if model not loaded
|
| 52 |
+
|
| 53 |
# Get text embedding
|
| 54 |
text_embedding = self.text_model.encode([text], convert_to_tensor=True)
|
| 55 |
text_embedding_np = text_embedding.cpu().numpy()
|
|
|
|
| 69 |
top_overall = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:1]
|
| 70 |
return top_overall
|
| 71 |
|
| 72 |
+
return sorted_emojis[:top_k]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|