Spaces:
Sleeping
Sleeping
| import joblib | |
| from sentence_transformers import SentenceTransformer | |
| import numpy as np | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.preprocessing import LabelEncoder | |
| from sklearn.model_selection import train_test_split | |
| import pandas as pd | |
| import os | |
# Directory used for downloaded Hugging Face files; /tmp is chosen because it
# is a writable location.
_HF_CACHE_DIR = "/tmp/huggingface_cache"

# Set the Hugging Face cache directory to a writable location.
# NOTE(review): this assignment runs after `sentence_transformers` has already
# been imported, and the Hugging Face libraries appear to resolve their default
# cache paths at import time, so the variable alone may be ignored. It is kept
# for backward compatibility; the directory is also passed explicitly below.
os.environ["TRANSFORMERS_CACHE"] = _HF_CACHE_DIR

# Load pre-trained Sentence Transformer model. `cache_folder` makes the
# download location explicit instead of relying on the environment variable.
model = SentenceTransformer(
    'Alibaba-NLP/gte-base-en-v1.5',
    trust_remote_code=True,
    cache_folder=_HF_CACHE_DIR,
)
| # Load and preprocess SMS data (from an Excel file) | |
def load_sms_data(file_path="data/sms_process_data_main.xlsx"):
    """Read the labelled SMS spreadsheet and convert it into training arrays.

    Args:
        file_path: Path to an Excel workbook containing a ``MessageText``
            column (the SMS body) and a ``label`` column (its category).

    Returns:
        A tuple ``(embeddings, encoded_labels, label_encoder)`` where
        ``embeddings`` is a NumPy array with one row per message,
        ``encoded_labels`` holds the integer-encoded categories, and
        ``label_encoder`` is the fitted ``LabelEncoder`` needed to map the
        integers back to the original labels.

    Raises:
        KeyError: If either required column is missing from the workbook.
    """
    data = pd.read_excel(file_path)
    # Blank cells are read as NaN, which can be neither embedded nor
    # label-encoded; drop incomplete rows so texts and labels stay aligned.
    data = data.dropna(subset=['MessageText', 'label'])
    # Excel may store digit-only messages as numbers; the encoder needs text.
    texts = data['MessageText'].astype(str).tolist()
    labels = data['label'].tolist()

    embeddings = model.encode(texts, convert_to_tensor=True)
    # Copy to host memory first: Tensor.numpy() raises for tensors on a GPU.
    embeddings = embeddings.detach().cpu().numpy()

    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)
    return embeddings, encoded_labels, label_encoder
| # Train and save the Logistic Regression model | |
def train_sms_classifier():
    """Train a logistic-regression SMS classifier and persist it to disk.

    Loads the embedded data set via ``load_sms_data()``, holds out 30% of it
    for evaluation (fixed ``random_state`` so the split is reproducible),
    fits the classifier on the remainder, prints the hold-out accuracy, and
    writes both the classifier and the label encoder under ``model/``.

    Returns:
        A tuple ``(lr_model, label_encoder)`` with the fitted classifier and
        the fitted ``LabelEncoder``.
    """
    embeddings, labels, label_encoder = load_sms_data()
    X_train, X_test, y_train, y_test = train_test_split(
        embeddings, labels, test_size=0.3, random_state=42
    )

    # Train Logistic Regression
    lr_model = LogisticRegression()
    lr_model.fit(X_train, y_train)

    accuracy = lr_model.score(X_test, y_test)
    print(f"Model Accuracy: {accuracy * 100:.2f}%")

    # Save the trained model and label encoder. Create the target directory
    # first: joblib.dump fails if it is missing, which would discard the
    # training run that just finished.
    os.makedirs('model', exist_ok=True)
    joblib.dump(lr_model, 'model/sms_classifier_model.pkl')
    joblib.dump(label_encoder, 'model/label_encoder.pkl')
    return lr_model, label_encoder
| # Load the saved model and label encoder | |
def load_saved_model():
    """Read the persisted classifier and label encoder back from disk.

    Returns:
        A tuple ``(lr_model, label_encoder)`` deserialised with joblib from
        the two pickle files under ``model/``.
    """
    # Order matters: classifier first, then encoder, matching the return tuple.
    artifact_paths = ('model/sms_classifier_model.pkl', 'model/label_encoder.pkl')
    classifier, encoder = (joblib.load(path) for path in artifact_paths)
    return classifier, encoder
| # Generate embeddings for the messages | |
def get_embeddings(messages):
    """Embed messages with the module-level sentence-transformer model.

    Args:
        messages: The text(s) to embed, passed straight to ``model.encode``.

    Returns:
        The embeddings as a NumPy array.
    """
    embeddings = model.encode(messages, convert_to_tensor=True)
    # Copy to host memory first: Tensor.numpy() raises for tensors on a GPU.
    return embeddings.detach().cpu().numpy()
| # Predict the label of an SMS message | |
def predict_sms_category(message):
    """Predict the category label of a single SMS message.

    The saved classifier and label encoder are reloaded from disk on every
    call, so a model must already have been trained and saved.

    Args:
        message: The SMS text to classify.

    Returns:
        The predicted label, mapped back to its original (pre-encoding) form
        by the label encoder.
    """
    # Load the saved model and label encoder
    lr_model, label_encoder = load_saved_model()

    # Wrap the message in a list so the classifier receives one 2-D row.
    embedding = model.encode([message], convert_to_tensor=True)
    # Copy to host memory first: Tensor.numpy() raises for tensors on a GPU.
    embedding = embedding.detach().cpu().numpy()

    prediction = lr_model.predict(embedding)
    label = label_encoder.inverse_transform(prediction)[0]
    return label