File size: 8,541 Bytes
ca2c89c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import nltk
import spacy
from transformers import pipeline

# Global models dictionary for persistent access
models = {
    "nlp": None,
    "sentiment_analyzer": None,
    "emotion_classifier": None,
    "summarizer": None,
    "qa_pipeline": None,
    "translation_pipeline": None,
    "text_generator": None,
    "zero_shot": None,
    "embedding_model": None
}

# Maps each required NLTK resource to the nltk_data category directory that
# nltk.data.find() expects in its path argument.
_NLTK_RESOURCE_CATEGORIES = {
    'punkt': 'tokenizers',
    'stopwords': 'corpora',
    'vader_lexicon': 'sentiment',
    'wordnet': 'corpora',
    'averaged_perceptron_tagger': 'taggers',
    'sentiwordnet': 'corpora',
}

def download_nltk_resources():
    """Ensure all required NLTK resources are present, downloading any missing ones.

    nltk.data.find() is used as a cheap local-presence check; it raises
    LookupError when the resource is not installed, which triggers a
    one-time nltk.download() for that resource.
    """
    for resource, category in _NLTK_RESOURCE_CATEGORIES.items():
        try:
            nltk.data.find(f'{category}/{resource}')
        except LookupError:
            print(f"Downloading required NLTK resource: {resource}")
            nltk.download(resource)

def load_spacy():
    """Lazily load and cache the spaCy English model.

    Returns:
        The cached spaCy Language object, or None (after printing install
        instructions) if the model package is not available.
    """
    if models["nlp"] is None:
        try:
            models["nlp"] = spacy.load("en_core_web_sm")
        # spacy.load raises OSError when the model package is not installed;
        # the original bare `except:` also swallowed KeyboardInterrupt/SystemExit.
        except OSError:
            print("SpaCy model not found. Please run: python -m spacy download en_core_web_sm")
    return models["nlp"]

def load_sentiment_analyzer():
    """Lazily load and cache the DistilBERT SST-2 sentiment-analysis pipeline.

    Returns:
        The cached pipeline, or None if loading failed.
    """
    if models["sentiment_analyzer"] is not None:
        return models["sentiment_analyzer"]
    try:
        models["sentiment_analyzer"] = pipeline(
            "sentiment-analysis",
            model="distilbert-base-uncased-finetuned-sst-2-english",
        )
    except Exception as err:
        print(f"Failed to load sentiment analyzer: {err}")
    return models["sentiment_analyzer"]

def load_emotion_classifier():
    """Lazily load and cache the RoBERTa-based emotion-classification pipeline.

    Returns:
        The cached pipeline (configured to score every emotion label),
        or None if loading failed.
    """
    if models["emotion_classifier"] is not None:
        return models["emotion_classifier"]
    try:
        # NOTE(review): return_all_scores is deprecated in newer transformers
        # in favor of top_k=None, but the two can differ in output nesting for
        # single-string inputs — confirm caller expectations before migrating.
        models["emotion_classifier"] = pipeline(
            "text-classification",
            model="cardiffnlp/twitter-roberta-base-emotion",
            return_all_scores=True,
        )
    except Exception as err:
        print(f"Failed to load emotion classifier: {err}")
    return models["emotion_classifier"]

def load_summarizer():
    """Lazily load and cache the BART-large-CNN summarization pipeline.

    Returns:
        The cached pipeline, or None if loading failed.
    """
    if models["summarizer"] is not None:
        return models["summarizer"]
    try:
        models["summarizer"] = pipeline("summarization", model="facebook/bart-large-cnn")
    except Exception as err:
        print(f"Failed to load summarizer: {err}")
    return models["summarizer"]

def load_qa_pipeline():
    """Lazily load and cache the extractive question-answering pipeline.

    Unlike the other loaders in this module, a load failure here is
    re-raised after being logged, so callers must be prepared to handle it
    (initialize_all_models catches it).

    Returns:
        The cached question-answering pipeline.

    Raises:
        Exception: whatever the transformers pipeline constructor raised.
    """
    if models["qa_pipeline"] is None:
        # `pipeline` is already imported at module top; the original re-imported
        # it locally for no reason. The pre-raise `models[...] = None` was also
        # a no-op (the slot is still None when the constructor raises).
        try:
            models["qa_pipeline"] = pipeline(
                "question-answering",
                model="deepset/roberta-base-squad2",
                tokenizer="deepset/roberta-base-squad2",
            )
        except Exception as e:
            print(f"Error loading QA pipeline: {e}")
            raise
    return models["qa_pipeline"]

def load_translation_pipeline():
    """Lazily load and cache the English-to-French translation pipeline.

    Returns:
        The cached pipeline, or None if loading failed.
    """
    if models["translation_pipeline"] is not None:
        return models["translation_pipeline"]
    try:
        models["translation_pipeline"] = pipeline(
            "translation_en_to_fr",
            model="Helsinki-NLP/opus-mt-en-fr",
        )
    except Exception as err:
        print(f"Failed to load translation model: {err}")
    return models["translation_pipeline"]

def load_translator(source_lang="auto", target_lang="en"):
    """Load a machine translation model for the given language pair.

    Not cached in `models`; a new pipeline is built on every call.

    Args:
        source_lang (str): Source language code, or 'auto' for automatic detection.
        target_lang (str): Target language code.

    Returns:
        A translation pipeline, or — if every load attempt fails — a
        callable that mimics the pipeline's output shape and carries the
        error text instead of a translation.
    """
    from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer

    # Multilingual-to-English model: used both for 'auto' source detection and
    # as the fallback when a specific language-pair model is unavailable.
    fallback_model = "Helsinki-NLP/opus-mt-mul-en"

    try:
        if source_lang == "auto":
            return pipeline("translation", model=fallback_model)
        # Helsinki-NLP publishes one opus-mt model per language direction.
        model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return pipeline("translation", model=model, tokenizer=tokenizer)
    except Exception as e:
        # The requested pair isn't available (or loading failed); fall back to
        # the general multilingual model.
        try:
            return pipeline("translation", model=fallback_model)
        except Exception as nested_e:
            # Last resort: return a stand-in whose output format matches a real
            # pipeline so callers don't crash, with the error as the "translation".
            class ErrorTranslator:
                def __call__(self, text, **kwargs):
                    return [{"translation_text": f"Error loading translation model: {str(e)}. Fallback also failed: {str(nested_e)}"}]
            return ErrorTranslator()

def load_text_generator():
    """Lazily load and cache the GPT-2 text-generation pipeline.

    Returns:
        The cached pipeline, or None if loading failed.
    """
    if models["text_generator"] is not None:
        return models["text_generator"]
    try:
        models["text_generator"] = pipeline("text-generation", model="gpt2")
    except Exception as err:
        print(f"Failed to load text generator: {err}")
    return models["text_generator"]

def load_zero_shot():
    """Lazily load and cache the BART-MNLI zero-shot classification pipeline.

    Returns:
        The cached pipeline, or None if loading failed.
    """
    if models["zero_shot"] is not None:
        return models["zero_shot"]
    try:
        models["zero_shot"] = pipeline(
            "zero-shot-classification",
            model="facebook/bart-large-mnli",
        )
    except Exception as err:
        print(f"Failed to load zero-shot classifier: {err}")
    return models["zero_shot"]

def load_embedding_model():
    """Lazily load and cache the sentence-embedding model used for semantic search.

    Returns:
        The cached SentenceTransformer, or None if loading failed.
    """
    # Direct-key check for consistency with the other loaders; the key is
    # always present in `models`, so the previous .get() was needlessly defensive.
    if models["embedding_model"] is None:
        try:
            # Imported lazily: sentence-transformers is heavy and optional.
            from sentence_transformers import SentenceTransformer
            models["embedding_model"] = SentenceTransformer('all-MiniLM-L6-v2')
        except Exception as e:
            print(f"Failed to load embedding model: {e}")
    return models["embedding_model"]

def initialize_all_models():
    """Eagerly warm up every model so later requests don't pay the load cost."""
    print("Initializing NLP models...")

    # NLTK corpora/lexicons first; quick compared to the transformer loads.
    download_nltk_resources()

    try:
        load_spacy()
        print("✓ spaCy model loaded")
    except Exception as err:
        print(f"✗ Failed to load spaCy: {err}")

    # Heavyweight transformer models — each loader caches its result in `models`.
    loaders = (
        ("Sentiment Analyzer", load_sentiment_analyzer),
        ("Emotion Classifier", load_emotion_classifier),
        ("Summarizer", load_summarizer),
        ("QA Pipeline", load_qa_pipeline),
        ("Text Generator", load_text_generator),
        ("Zero-shot Classifier", load_zero_shot),
        ("Embedding Model", load_embedding_model),
    )
    for label, load in loaders:
        try:
            load()
            print(f"✓ {label} loaded")
        except Exception as err:
            print(f"✗ Failed to load {label}: {err}")

    print("Model initialization complete!")

def get_model_status():
    """Report which models are currently loaded.

    Returns:
        dict: model name mapped to True if loaded, False otherwise.
    """
    return {name: loaded is not None for name, loaded in models.items()}

def clear_models():
    """Drop every cached model reference so the memory can be reclaimed."""
    # dict.fromkeys(models) maps every existing key back to None.
    models.update(dict.fromkeys(models))
    print("All models cleared from memory")