import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
import preprocess

def train(dataset_path="dataset.csv", model_path="sentiment_model_best.pkl"):
    """Train a TF-IDF + LinearSVC text classifier and save the best model.

    Reads a CSV that must contain ``text`` and ``label`` columns, cleans the
    text with ``preprocess.preprocess_text``, holds out 20% of the rows for
    evaluation, grid-searches the pipeline with 3-fold cross-validation on the
    remaining rows, prints the held-out accuracy and classification report,
    and dumps the best pipeline found with joblib.

    Args:
        dataset_path: CSV file to train on.
        model_path: Destination of the serialized pipeline.

    Returns:
        None. Problems with the dataset (missing file, empty file, missing
        columns, no labelled rows) are reported on stdout and abort training
        without raising.
    """
    print("Loading dataset...")
    try:
        df = pd.read_csv(dataset_path)
    except FileNotFoundError:
        print(f"Error: {dataset_path} not found.")
        return
    except pd.errors.EmptyDataError:
        print(f"Error: {dataset_path} is empty.")
        return

    # Fail with a readable message instead of a KeyError further down.
    missing_columns = sorted({'text', 'label'} - set(df.columns))
    if missing_columns:
        print(
            f"Error: {dataset_path} is missing required column(s): "
            f"{', '.join(missing_columns)}."
        )
        return

    print("Preprocessing data...")
    # A row without a label cannot be used for supervised training, and NaN
    # targets make the classifier's fit() raise, so drop those rows up front.
    has_label = df['label'].notna()
    dropped = int((~has_label).sum())
    if dropped:
        print(f"Warning: dropping {dropped} row(s) with a missing label.")
    if not has_label.any():
        print(f"Error: {dataset_path} contains no labelled rows.")
        return

    # Missing text becomes an empty document so the preprocessor always
    # receives a string rather than NaN.
    X = df.loc[has_label, 'text'].fillna('').apply(preprocess.preprocess_text)
    y = df.loc[has_label, 'label']

    print("Splitting data...")
    split_options = {'test_size': 0.2, 'random_state': 42}
    try:
        # Stratify so the test set keeps the class proportions of the data.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=y, **split_options
        )
    except ValueError:
        # Stratification is rejected when a class is too small to appear on
        # both sides of the split; fall back to a plain random split.
        print("Warning: stratified split not possible; using a random split.")
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, **split_options
        )

    print("Setting up pipeline and grid search...")
    # Pipeline: TF-IDF -> LinearSVC (often best for text)
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC(dual='auto')),
    ])

    # Parameters to tune
    param_grid = {
        'tfidf__ngram_range': [(1, 1), (1, 2)],  # Unigrams only, or unigrams + bigrams
        'tfidf__max_df': [0.9, 1.0],
        'clf__C': [0.1, 1, 10],
    }

    # Grid search selecting on accuracy (stated explicitly rather than relying
    # on the estimator's default scorer).
    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        scoring='accuracy',
        cv=3,
        n_jobs=-1,
        verbose=1,
    )

    print("Training model...")
    grid_search.fit(X_train, y_train)

    print(f"Best Parameters: {grid_search.best_params_}")
    best_model = grid_search.best_estimator_

    print("Evaluating model...")
    y_pred = best_model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

    print("Saving model...")
    joblib.dump(best_model, model_path)
    print(f"Model saved to {model_path}")

# Run the training routine only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train()