Spaces:
Sleeping
Sleeping
File size: 3,215 Bytes
3a680d6 6bc66a5 3a680d6 6bc66a5 3a680d6 6bc66a5 3a680d6 6bc66a5 3a680d6 6bc66a5 3a680d6 6bc66a5 3a680d6 6bc66a5 3a680d6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# urdu_specific_embedding.py (Updated)
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os
class UrduOptimizedPredictor:
    """Suggests emojis for (Urdu) text via multilingual sentence embeddings.

    Text is embedded with a multilingual MPNet SentenceTransformer and
    compared, by cosine similarity, against precomputed emoji embeddings
    loaded from a pickled model file; the closest emojis are returned.
    """

    def __init__(self, model_path=None):
        """Load the sentence-transformer and the precomputed emoji embeddings.

        Args:
            model_path: Unused; kept for backward compatibility. The model
                file is instead searched for in a fixed list of locations.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.text_model = SentenceTransformer(
            'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
        )
        self.text_model.to(self.device)

        # Try multiple possible model file locations (local dev vs HF Spaces).
        possible_paths = [
            "urdu_optimized_model.pkl",                              # repo root
            "./urdu_optimized_model.pkl",                            # current directory
            "models/urdu_optimized_model/urdu_optimized_model.pkl",  # local structure
            "/data/urdu_optimized_model.pkl",                        # HF Spaces data directory
        ]

        model_loaded = False
        for model_file in possible_paths:
            if not os.path.exists(model_file):
                continue
            print(f"Loading model from: {model_file}")
            try:
                # NOTE(security): pickle.load executes arbitrary code from the
                # file — only ship model files you created yourself.
                with open(model_file, 'rb') as f:
                    model_data = pickle.load(f)
                # Stored embeddings are wrapped (first axis of length 1);
                # keep row 0 so each value is a flat vector.
                self.emoji_embeddings = {
                    k: v[0] for k, v in model_data['emoji_embeddings'].items()
                }
                self.emoji_list = model_data['emoji_list']
                print(f"Loaded Urdu-optimized model with {len(self.emoji_list)} meaningful emojis")
                model_loaded = True
                break
            except Exception as e:
                # A corrupt or incompatible file shouldn't stop us from
                # trying the remaining candidate paths.
                print(f"Error loading {model_file}: {e}")
                continue

        if not model_loaded:
            print("Could not load model file. Please make sure urdu_optimized_model.pkl is uploaded.")
            # Create empty structures so later predict calls fail gracefully
            # instead of raising AttributeError.
            self.emoji_embeddings = {}
            self.emoji_list = []

    def predict_smart(self, text, top_k=3, min_confidence=0.3):
        """Return up to ``top_k`` (emoji, score) pairs for ``text``.

        Scores are cosine similarities between the text embedding and each
        emoji embedding. Pairs scoring below ``min_confidence`` are dropped;
        if none pass the threshold, the single best overall match is returned
        anyway so the caller always gets a suggestion.
        """
        if not self.emoji_embeddings:
            return [("❌", 0.0)]  # error emoji: model was never loaded

        text_embedding = self.text_model.encode([text], convert_to_tensor=True)
        text_vec = text_embedding.cpu().numpy().reshape(-1)
        # Hoist the text-vector norm out of the per-emoji loop.
        text_norm = float((text_vec * text_vec).sum()) ** 0.5

        # Cosine similarity computed directly with array ops — avoids one
        # sklearn call (and a redundant text-norm computation) per emoji.
        similarities = {}
        for emoji, emoji_embedding in self.emoji_embeddings.items():
            emoji_vec = emoji_embedding.reshape(-1)
            denom = text_norm * float((emoji_vec * emoji_vec).sum()) ** 0.5
            # Match sklearn's handling of zero-norm vectors: score 0.
            similarities[emoji] = float(emoji_vec @ text_vec) / denom if denom else 0.0

        # Filter by confidence and rank best-first.
        confident = [(e, s) for e, s in similarities.items() if s >= min_confidence]
        ranked = sorted(confident, key=lambda pair: pair[1], reverse=True)

        if not ranked:
            # No confident prediction: fall back to the single best match.
            return sorted(similarities.items(), key=lambda pair: pair[1], reverse=True)[:1]
        return ranked[:top_k]