# Embedding_fastapi / model.py
# Author: Ezhil — "modified code" (commit e0ec062)
import os

# The Hugging Face cache location must be set BEFORE importing
# sentence_transformers/transformers — both read the cache env vars at
# import time, so setting it afterwards silently has no effect.
# TRANSFORMERS_CACHE is deprecated in recent transformers releases in
# favour of HF_HOME; set both for compatibility.
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface_cache"
os.environ.setdefault("HF_HOME", "/tmp/huggingface_cache")

import joblib
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the pre-trained sentence-embedding model once at import time.
# trust_remote_code=True is required because this model ships custom code;
# acceptable only because the model id is hard-coded here.
model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)
# Load and preprocess SMS data (from an Excel file)
def load_sms_data(file_path="data/sms_process_data_main.xlsx"):
    """Load the SMS dataset, embed the texts, and integer-encode the labels.

    Parameters
    ----------
    file_path : str
        Path to an Excel file with 'MessageText' and 'label' columns.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray, LabelEncoder)
        Sentence embeddings, encoded labels, and the fitted encoder
        (needed later to map predictions back to the original labels).
    """
    data = pd.read_excel(file_path)
    texts = data['MessageText'].tolist()
    labels = data['label'].tolist()
    # convert_to_numpy avoids the tensor->numpy round-trip and, unlike
    # .detach().numpy(), also works when the model runs on a GPU
    # (a CUDA tensor must be moved to CPU before .numpy() is legal).
    embeddings = model.encode(texts, convert_to_numpy=True)
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)
    return embeddings, encoded_labels, label_encoder
# Train and save the Logistic Regression model
def train_sms_classifier():
    """Train, evaluate, and persist a Logistic Regression SMS classifier.

    Uses a 70/30 train/test split (fixed seed for reproducibility), prints
    the held-out accuracy, and saves the model and label encoder under
    ``model/``.

    Returns
    -------
    tuple of (LogisticRegression, LabelEncoder)
        The fitted classifier and the fitted label encoder.
    """
    embeddings, labels, label_encoder = load_sms_data()
    X_train, X_test, y_train, y_test = train_test_split(
        embeddings, labels, test_size=0.3, random_state=42
    )
    # max_iter raised from the default 100: lbfgs frequently fails to
    # converge on dense, high-dimensional sentence embeddings.
    lr_model = LogisticRegression(max_iter=1000)
    lr_model.fit(X_train, y_train)
    accuracy = lr_model.score(X_test, y_test)
    print(f"Model Accuracy: {accuracy * 100:.2f}%")
    # joblib.dump does not create missing directories — ensure it exists.
    os.makedirs('model', exist_ok=True)
    joblib.dump(lr_model, 'model/sms_classifier_model.pkl')
    joblib.dump(label_encoder, 'model/label_encoder.pkl')
    return lr_model, label_encoder
# Load the saved model and label encoder
def load_saved_model():
    """Load the persisted classifier and label encoder from the model/ directory.

    Returns
    -------
    tuple of (LogisticRegression, LabelEncoder)
        The deserialized classifier and label encoder.
    """
    artifacts = ('model/sms_classifier_model.pkl', 'model/label_encoder.pkl')
    classifier, encoder = (joblib.load(path) for path in artifacts)
    return classifier, encoder
# Generate embeddings for the messages
def get_embeddings(messages):
    """Return sentence embeddings for *messages* as a numpy array.

    Parameters
    ----------
    messages : list of str
        The texts to embed.

    Returns
    -------
    np.ndarray
        One embedding row per input message.
    """
    # convert_to_numpy works regardless of the device the model runs on;
    # the previous .detach().numpy() path raises on CUDA tensors.
    return model.encode(messages, convert_to_numpy=True)
# Predict the label of an SMS message
def predict_sms_category(message):
    """Predict the category label of a single SMS message.

    Parameters
    ----------
    message : str
        The SMS text to classify.

    Returns
    -------
    The original (pre-encoding) label predicted for the message.

    Notes
    -----
    Reloads the persisted classifier from disk on every call; for a hot
    path (e.g. a FastAPI endpoint) load it once at startup instead.
    """
    lr_model, label_encoder = load_saved_model()
    # Encode as a one-element batch; convert_to_numpy avoids the
    # GPU-unsafe .detach().numpy() conversion.
    embedding = model.encode([message], convert_to_numpy=True)
    prediction = lr_model.predict(embedding)
    return label_encoder.inverse_transform(prediction)[0]