File size: 3,215 Bytes
3a680d6
 
6bc66a5
 
 
 
 
 
3a680d6
6bc66a5
 
 
 
3a680d6
 
 
 
 
 
 
6bc66a5
3a680d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6bc66a5
3a680d6
 
 
 
 
6bc66a5
3a680d6
6bc66a5
3a680d6
 
 
 
6bc66a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a680d6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# urdu_specific_embedding.py (Updated)
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import os

class UrduOptimizedPredictor:
    """Predict emojis for (Urdu) text via multilingual sentence embeddings.

    Emoji embeddings are loaded from a pickled model file; predictions rank
    the stored emojis by cosine similarity against the encoded input text.
    """

    def __init__(self, model_path=None):
        """Load the sentence encoder and the pickled emoji-embedding model.

        Args:
            model_path: Optional explicit path to the ``.pkl`` model file.
                Previously accepted but silently ignored (bug); it is now
                tried before the default search locations.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.text_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
        self.text_model.to(self.device)

        # Candidate model file locations, searched in order.
        possible_paths = [
            "urdu_optimized_model.pkl",  # Direct in root
            "./urdu_optimized_model.pkl",  # Current directory
            "models/urdu_optimized_model/urdu_optimized_model.pkl",  # Local structure
            "/data/urdu_optimized_model.pkl"  # HF Spaces data directory
        ]
        # Bug fix: honor an explicitly supplied model_path (was ignored).
        if model_path:
            possible_paths.insert(0, model_path)

        model_loaded = False
        for model_file in possible_paths:
            if not os.path.exists(model_file):
                continue
            # Encoding fix: these status emoji were mojibake in the original
            # source ("πŸ“" / "βœ…" = UTF-8 bytes of 📁 / ✅ mis-decoded).
            print(f"📁 Loading model from: {model_file}")
            try:
                # SECURITY NOTE: pickle.load can execute arbitrary code;
                # only load model files from a trusted deployment.
                with open(model_file, 'rb') as f:
                    model_data = pickle.load(f)

                # Stored embeddings are wrapped one level deep; keep only
                # the vector (v[0]) for each emoji.
                self.emoji_embeddings = {k: v[0] for k, v in model_data['emoji_embeddings'].items()}
                self.emoji_list = model_data['emoji_list']

                print(f"✅ Loaded Urdu-optimized model with {len(self.emoji_list)} meaningful emojis")
                model_loaded = True
                break

            except Exception as e:
                # Best-effort: report and fall through to the next location.
                print(f"❌ Error loading {model_file}: {e}")
                continue

        if not model_loaded:
            print("❌ Could not load model file. Please make sure urdu_optimized_model.pkl is uploaded.")
            # Create empty structures so predict_smart degrades gracefully
            # instead of crashing.
            self.emoji_embeddings = {}
            self.emoji_list = []

    def predict_smart(self, text, top_k=3, min_confidence=0.3):
        """Return up to ``top_k`` (emoji, score) pairs for ``text``.

        Scores are cosine similarities, best first; pairs scoring below
        ``min_confidence`` are dropped, but if nothing clears the bar the
        single best overall match is returned anyway. Returns
        ``[("❌", 0.0)]`` when no model is loaded.
        """
        if not self.emoji_embeddings:
            return [("❌", 0.0)]  # Model not loaded; sentinel result

        # Encode the text once and flatten to a 1-D vector.
        text_embedding = self.text_model.encode([text], convert_to_tensor=True)
        text_vec = text_embedding.cpu().numpy().ravel()
        text_norm = float(text_vec @ text_vec) ** 0.5

        # Cosine similarity computed directly. The original called
        # sklearn's cosine_similarity once per emoji on 1xD arrays, which
        # is pure call overhead; values are identical (zero-norm vectors
        # score 0.0, matching sklearn's norm clamping).
        similarities = {}
        for emoji, emoji_embedding in self.emoji_embeddings.items():
            emb = emoji_embedding.ravel()
            denom = text_norm * (float(emb @ emb) ** 0.5)
            similarities[emoji] = float(text_vec @ emb) / denom if denom else 0.0

        # Keep confident predictions, sorted best-first.
        filtered = [(emoji, score) for emoji, score in similarities.items() if score >= min_confidence]
        sorted_emojis = sorted(filtered, key=lambda x: x[1], reverse=True)

        # Nothing confident: fall back to the single best overall match.
        if not sorted_emojis:
            return sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:1]

        return sorted_emojis[:top_k]