File size: 2,446 Bytes
e9a2c4c
 
 
 
 
 
 
e0ec062
 
 
e9a2c4c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import joblib
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import os
# Set the Hugging Face cache directory to a writable location
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface_cache"
# Load pre-trained Sentence Transformer model
model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)

# Load and preprocess SMS data (from an Excel file)
def load_sms_data(file_path="data/sms_process_data_main.xlsx"):
    """Load labeled SMS data from an Excel file and embed the messages.

    Args:
        file_path: Path to an Excel sheet with 'MessageText' and 'label'
            columns.

    Returns:
        Tuple of (embeddings ndarray of shape (n_messages, dim),
        integer-encoded labels ndarray, fitted LabelEncoder).
    """
    data = pd.read_excel(file_path)
    texts = data['MessageText'].tolist()
    labels = data['label'].tolist()

    embeddings = model.encode(texts, convert_to_tensor=True)
    # .cpu() before .numpy(): calling .numpy() on a CUDA tensor raises
    # TypeError, so this must work whether the model runs on CPU or GPU.
    embeddings = embeddings.detach().cpu().numpy()

    # Map string labels to contiguous integer ids for the classifier.
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)

    return embeddings, encoded_labels, label_encoder

# Train and save the Logistic Regression model
def train_sms_classifier():
    """Train a logistic-regression SMS classifier on sentence embeddings.

    Loads the data via load_sms_data(), holds out 30% for evaluation,
    prints the held-out accuracy, and persists both the model and the
    label encoder under the 'model/' directory.

    Returns:
        Tuple of (fitted LogisticRegression, fitted LabelEncoder).
    """
    embeddings, labels, label_encoder = load_sms_data()
    X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.3, random_state=42)

    # Train Logistic Regression on the embedding vectors.
    lr_model = LogisticRegression()
    lr_model.fit(X_train, y_train)

    accuracy = lr_model.score(X_test, y_test)
    print(f"Model Accuracy: {accuracy * 100:.2f}%")

    # Ensure the output directory exists — joblib.dump raises
    # FileNotFoundError if 'model/' is missing.
    os.makedirs('model', exist_ok=True)
    joblib.dump(lr_model, 'model/sms_classifier_model.pkl')
    joblib.dump(label_encoder, 'model/label_encoder.pkl')

    return lr_model, label_encoder

# Load the saved model and label encoder
def load_saved_model():
    """Restore the persisted classifier and label encoder from 'model/'.

    Returns:
        Tuple of (LogisticRegression, LabelEncoder) as saved by
        train_sms_classifier().
    """
    return (
        joblib.load('model/sms_classifier_model.pkl'),
        joblib.load('model/label_encoder.pkl'),
    )

# Generate embeddings for the messages
def get_embeddings(messages):
    """Encode a list of message strings into a NumPy embedding matrix.

    Args:
        messages: List of strings to embed.

    Returns:
        ndarray of shape (len(messages), embedding_dim).
    """
    embeddings = model.encode(messages, convert_to_tensor=True)
    # .cpu() before .numpy(): .numpy() on a CUDA tensor raises TypeError,
    # so this must work whether the model runs on CPU or GPU.
    return embeddings.detach().cpu().numpy()

# Predict the label of an SMS message
def predict_sms_category(message):
    """Predict the category label of a single SMS message.

    Args:
        message: The SMS text to classify.

    Returns:
        The original (string) label for the message, as decoded by the
        saved LabelEncoder.
    """
    # NOTE: the model and encoder are re-loaded from disk on every call;
    # callers in a hot path should use load_saved_model() once instead.
    lr_model, label_encoder = load_saved_model()

    # Reuse the shared embedding helper (handles the GPU->CPU transfer).
    embedding = get_embeddings([message])

    prediction = lr_model.predict(embedding)
    # inverse_transform maps the integer class id back to the original label.
    return label_encoder.inverse_transform(prediction)[0]