"""Train and persist a logistic-regression SMS classifier.

Loads labelled SMS data from an Excel file, embeds each message with a
sentence-transformer model, fits a LogisticRegression on the embeddings,
reports held-out accuracy, and pickles the fitted classifier.
"""

import os
import pickle

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Anchor all paths to this file's directory so the script works
# regardless of the current working directory (BASE_DIR was previously
# computed but never used, leaving the paths cwd-dependent).
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(BASE_DIR, "data", "sms_process_data_main.xlsx")
MODEL_PATH = os.path.join(BASE_DIR, "models", "logistic.pkl")

# Fail fast with a clear message if the dataset is missing.
print(f"Loading dataset from: {DATA_PATH}")
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset file not found at: {DATA_PATH}")

# Load dataset
df = pd.read_excel(DATA_PATH)

# Ensure the dataset has the required columns (adjust as necessary)
if not {'text', 'label'}.issubset(df.columns):
    raise ValueError("Dataset must contain 'text' and 'label' columns")

# Load Sentence Transformer model
embedding_model = SentenceTransformer('Alibaba-NLP/gte-base-en-v1.5', trust_remote_code=True)

# Generate embeddings in a single batched call: encode() accepts a list
# of sentences and batches internally, which is far faster than invoking
# it once per row via .apply().
X = embedding_model.encode(df['text'].tolist())
y = df['label']

# Train/test split (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Save the model; create the target directory if it does not exist yet,
# otherwise open() would raise FileNotFoundError.
print("Saving model and embeddings...")
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
with open(MODEL_PATH, 'wb') as f:
    pickle.dump(logistic_model, f)
print(f"Logistic model saved to {MODEL_PATH}")