"""SMS category classifier.

Embeds messages with a pre-trained Sentence Transformer and classifies
them with a scikit-learn LogisticRegression. Provides helpers to train,
persist, reload, and run single-message predictions.
"""

import os

# Set the Hugging Face cache directory to a writable location.
# NOTE: this must happen BEFORE importing sentence_transformers — the
# transformers library reads the cache env var at import time, so setting
# it after the import (as the original code did) has no effect.
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface_cache"

import joblib
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Pre-trained Sentence Transformer shared by all embedding helpers below.
model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)

# Persisted-artifact paths (single source of truth for save and load).
MODEL_PATH = 'model/sms_classifier_model.pkl'
ENCODER_PATH = 'model/label_encoder.pkl'


def get_embeddings(messages):
    """Return a NumPy array of sentence embeddings for *messages*.

    ``convert_to_numpy=True`` moves the result to the CPU automatically,
    so this works even when the transformer runs on a GPU — the original
    ``.detach().numpy()`` raised TypeError for CUDA tensors.

    Args:
        messages: list of message strings to embed.

    Returns:
        numpy.ndarray of shape (len(messages), embedding_dim).
    """
    return model.encode(messages, convert_to_numpy=True)


def load_sms_data(file_path="data/sms_process_data_main.xlsx"):
    """Load and preprocess SMS data from an Excel file.

    Expects columns ``MessageText`` and ``label``.

    Args:
        file_path: path to the Excel spreadsheet.

    Returns:
        Tuple of (embeddings ndarray, integer-encoded labels,
        fitted LabelEncoder for decoding predictions later).
    """
    data = pd.read_excel(file_path)
    texts = data['MessageText'].tolist()
    labels = data['label'].tolist()
    embeddings = get_embeddings(texts)
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)
    return embeddings, encoded_labels, label_encoder


def train_sms_classifier():
    """Train a LogisticRegression on SMS embeddings and persist it.

    Prints hold-out accuracy, saves the classifier and label encoder
    under ``model/`` (creating the directory if needed — ``joblib.dump``
    fails outright when the target directory is missing), and returns
    ``(classifier, label_encoder)``.
    """
    embeddings, labels, label_encoder = load_sms_data()
    X_train, X_test, y_train, y_test = train_test_split(
        embeddings, labels, test_size=0.3, random_state=42
    )

    # Train Logistic Regression on the embedding vectors.
    lr_model = LogisticRegression()
    lr_model.fit(X_train, y_train)

    accuracy = lr_model.score(X_test, y_test)
    print(f"Model Accuracy: {accuracy * 100:.2f}%")

    # Save the trained model and label encoder.
    os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
    joblib.dump(lr_model, MODEL_PATH)
    joblib.dump(label_encoder, ENCODER_PATH)

    return lr_model, label_encoder


def load_saved_model():
    """Load and return the persisted ``(classifier, label_encoder)`` pair."""
    lr_model = joblib.load(MODEL_PATH)
    label_encoder = joblib.load(ENCODER_PATH)
    return lr_model, label_encoder


def predict_sms_category(message):
    """Predict the human-readable category label for one SMS message.

    Args:
        message: the SMS text to classify.

    Returns:
        The original (string) label, decoded via the saved LabelEncoder.
    """
    lr_model, label_encoder = load_saved_model()
    embedding = get_embeddings([message])
    prediction = lr_model.predict(embedding)
    return label_encoder.inverse_transform(prediction)[0]