from __future__ import annotations

import argparse
import os
import pickle
from typing import List, Tuple

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Difficulty labels accepted from an input CSV; load_data() discards rows
# whose normalized (stripped, lower-cased) label is not in this set.
ALLOWED_LABELS = {"easy", "medium", "hard"}


def _fallback_samples() -> Tuple[List[str], List[str]]:
    texts = [
        "What is the capital of France?",
        "Define photosynthesis in one sentence.",
        "Name the largest planet in the solar system.",
        "Explain how binary search works and analyze its time complexity.",
        "What is the derivative of x squared?",
        "Describe the causes and outcomes of World War I.",
        "Prove that the sum of first n natural numbers is n times n plus one by two.",
        "Compare mitosis and meiosis with key differences.",
        "Discuss transformers in NLP and the role of self-attention.",
    ]
    labels = [
        "easy",
        "easy",
        "easy",
        "medium",
        "medium",
        "medium",
        "hard",
        "hard",
        "hard",
    ]
    return texts, labels


def load_data(csv_path: str | None) -> Tuple[List[str], List[str]]:
    """Load question texts and difficulty labels for training.

    Args:
        csv_path: Path to a CSV file with ``question`` and ``label`` columns.
            When falsy (``None`` or ``""``), the built-in fallback samples
            are returned instead.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists. Question texts are
        whitespace-stripped; labels are stripped, lower-cased, and restricted
        to ``ALLOWED_LABELS``.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        ValueError: If a required column is missing, or no usable rows
            remain after cleaning.
    """
    if not csv_path:
        return _fallback_samples()

    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"Dataset file not found: {csv_path}")

    data = pd.read_csv(csv_path)
    required_columns = {"question", "label"}
    if not required_columns.issubset(data.columns):
        raise ValueError("CSV must contain columns: question,label")

    cleaned = data.dropna(subset=["question", "label"]).copy()
    # dropna() only removes truly missing cells; a whitespace-only question
    # is still a string, so normalize the text and filter blanks explicitly.
    cleaned["question"] = cleaned["question"].astype(str).str.strip()
    cleaned["label"] = cleaned["label"].astype(str).str.strip().str.lower()
    usable = cleaned["label"].isin(ALLOWED_LABELS) & cleaned["question"].ne("")
    cleaned = cleaned[usable]

    if cleaned.empty:
        raise ValueError("No valid rows found after filtering labels to easy/medium/hard.")

    return cleaned["question"].tolist(), cleaned["label"].tolist()


def build_pipeline() -> Pipeline:
    """Assemble the untrained text-classification pipeline.

    Returns:
        A two-step ``Pipeline``: a ``"tfidf"`` vectorizer over lower-cased,
        accent-stripped unigrams and bigrams, followed by a ``"classifier"``
        logistic regression using balanced class weights.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        lowercase=True,
        strip_accents="unicode",
        min_df=1,
    )
    classifier = LogisticRegression(
        max_iter=2000,
        solver="lbfgs",
        class_weight="balanced",
    )
    stages = [("tfidf", vectorizer), ("classifier", classifier)]
    return Pipeline(steps=stages)


def train_and_save(csv_path: str | None, output_path: str, test_size: float, random_state: int) -> None:
    """Train the difficulty classifier, print a hold-out report, and pickle it.

    Args:
        csv_path: CSV path forwarded to ``load_data``; a falsy value selects
            the built-in fallback samples.
        output_path: Destination file for the pickled pipeline. Missing
            parent directories are created.
        test_size: Requested hold-out fraction. It is raised when needed so
            the test split can hold at least one sample per class, and is
            capped at 0.5.
        random_state: Seed passed to the train/test split.

    Raises:
        ValueError: If the data contains fewer than two distinct labels, or
            any label has fewer than two samples (a stratified split needs
            each class in both partitions).
    """
    texts, labels = load_data(csv_path)

    # Validate up front: otherwise these cases fail later inside scikit-learn
    # with error messages that do not point back at the dataset.
    class_counts = {label: labels.count(label) for label in set(labels)}
    if len(class_counts) < 2:
        raise ValueError(
            "Training requires at least two distinct labels; "
            f"found only: {sorted(class_counts)}"
        )
    underpopulated = sorted(label for label, count in class_counts.items() if count < 2)
    if underpopulated:
        raise ValueError(
            "Each label needs at least 2 samples for a stratified split; "
            f"too few samples for: {underpopulated}"
        )

    # The stratified test split must be able to hold one sample per class.
    min_fraction_for_stratify = len(class_counts) / len(labels)
    adjusted_test_size = min(max(test_size, min_fraction_for_stratify), 0.5)

    x_train, x_test, y_train, y_test = train_test_split(
        texts,
        labels,
        test_size=adjusted_test_size,
        random_state=random_state,
        stratify=labels,
    )

    pipeline = build_pipeline()
    pipeline.fit(x_train, y_train)

    predictions = pipeline.predict(x_test)
    print("Classification report:")
    print(classification_report(y_test, predictions, labels=["easy", "medium", "hard"]))

    # dirname is "" for a bare filename; fall back to the current directory.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "wb") as model_file:
        pickle.dump(pipeline, model_file)

    print(f"Model saved to: {output_path}")


def main() -> None:
    """Parse command-line options and run training with them.

    Options: ``--csv`` (optional dataset path), ``--output`` (pickle
    destination), ``--test-size`` (hold-out fraction) and ``--random-state``
    (split seed).
    """
    cli = argparse.ArgumentParser(
        description="Train TF-IDF + Logistic Regression model for question difficulty classification."
    )
    cli.add_argument(
        "--csv",
        default=None,
        type=str,
        help="Optional CSV path with columns: question,label (labels: easy, medium, hard)",
    )
    cli.add_argument(
        "--output",
        default="models/difficulty_model.pkl",
        type=str,
        help="Output pickle file path",
    )
    cli.add_argument("--test-size", default=0.2, type=float)
    cli.add_argument("--random-state", default=42, type=int)

    options = cli.parse_args()
    train_and_save(
        csv_path=options.csv,
        output_path=options.output,
        test_size=options.test_size,
        random_state=options.random_state,
    )


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()