quiz-generator / models /train_difficulty_model.py
Pavan Kumar
Deploy Quiz Generator
3e3f813
Raw
History Blame Contribute Delete
4.58 kB
from __future__ import annotations
import argparse
import os
import pickle
from typing import List, Tuple
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
# Difficulty labels accepted from a training CSV; load_data() discards rows
# whose normalized label is not one of these.
ALLOWED_LABELS = {"easy", "medium", "hard"}
def _fallback_samples() -> Tuple[List[str], List[str]]:
texts = [
"What is the capital of France?",
"Define photosynthesis in one sentence.",
"Name the largest planet in the solar system.",
"Explain how binary search works and analyze its time complexity.",
"What is the derivative of x squared?",
"Describe the causes and outcomes of World War I.",
"Prove that the sum of first n natural numbers is n times n plus one by two.",
"Compare mitosis and meiosis with key differences.",
"Discuss transformers in NLP and the role of self-attention.",
]
labels = [
"easy",
"easy",
"easy",
"medium",
"medium",
"medium",
"hard",
"hard",
"hard",
]
return texts, labels
def load_data(csv_path: str | None) -> Tuple[List[str], List[str]]:
    """Load question texts and difficulty labels for training.

    Args:
        csv_path: Path to a CSV file with ``question`` and ``label`` columns.
            A falsy value (``None`` or ``""``) selects the built-in samples
            from ``_fallback_samples``.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists. Labels are
        whitespace-stripped, lower-cased and restricted to ``ALLOWED_LABELS``;
        rows with a missing question or label are dropped.

    Raises:
        FileNotFoundError: If ``csv_path`` is given but does not exist.
        ValueError: If a required column is missing, or if no row is left
            after filtering to the allowed labels.
    """
    if not csv_path:
        return _fallback_samples()
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"Dataset file not found: {csv_path}")

    frame = pd.read_csv(csv_path)
    missing_columns = {"question", "label"} - set(frame.columns)
    if missing_columns:
        raise ValueError("CSV must contain columns: question,label")

    rows = frame.dropna(subset=["question", "label"]).copy()
    # Normalize labels before filtering so " Easy " and "EASY" both match.
    normalized = rows["label"].astype(str).str.strip().str.lower()
    rows["label"] = normalized
    rows = rows[rows["label"].isin(ALLOWED_LABELS)]
    if rows.empty:
        raise ValueError("No valid rows found after filtering labels to easy/medium/hard.")

    return rows["question"].astype(str).tolist(), rows["label"].tolist()
def build_pipeline() -> Pipeline:
    """Create the untrained text-classification pipeline.

    Returns:
        A ``Pipeline`` with two steps: ``"tfidf"``, a ``TfidfVectorizer`` over
        unigrams and bigrams with lower-casing and unicode accent stripping,
        followed by ``"classifier"``, a class-balanced ``LogisticRegression``.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        lowercase=True,
        strip_accents="unicode",
        min_df=1,
    )
    classifier = LogisticRegression(
        max_iter=2000,
        solver="lbfgs",
        class_weight="balanced",
    )
    stages = [("tfidf", vectorizer), ("classifier", classifier)]
    return Pipeline(steps=stages)
def train_and_save(csv_path: str | None, output_path: str, test_size: float, random_state: int) -> None:
    """Train the difficulty classifier, print a held-out report, and pickle it.

    Args:
        csv_path: Dataset CSV forwarded to ``load_data``; a falsy value
            selects the built-in samples.
        output_path: Destination of the pickled pipeline. Missing parent
            directories are created.
        test_size: Requested held-out fraction. It is raised when necessary so
            the test split can hold one sample per class, and capped at 0.5.
        random_state: Seed forwarded to ``train_test_split``.
    """
    texts, labels = load_data(csv_path)

    unique_labels = sorted(set(labels))
    # The test split needs room for at least one sample of every class.
    min_fraction_for_stratify = len(unique_labels) / max(len(labels), 1)
    adjusted_test_size = min(max(test_size, min_fraction_for_stratify), 0.5)

    # scikit-learn rejects a stratified split when any class has fewer than
    # two members, so fall back to a plain shuffle split for such datasets
    # instead of failing with an opaque error.
    smallest_class = min((labels.count(label) for label in unique_labels), default=0)
    stratify_on = labels if smallest_class >= 2 else None
    if stratify_on is None:
        print("Warning: at least one label has fewer than 2 samples; using a non-stratified split.")

    x_train, x_test, y_train, y_test = train_test_split(
        texts,
        labels,
        test_size=adjusted_test_size,
        random_state=random_state,
        stratify=stratify_on,
    )

    pipeline = build_pipeline()
    pipeline.fit(x_train, y_train)

    predictions = pipeline.predict(x_test)
    print("Classification report:")
    print(classification_report(y_test, predictions, labels=["easy", "medium", "hard"]))

    # dirname is "" for a bare filename; "." keeps makedirs from failing.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    with open(output_path, "wb") as model_file:
        pickle.dump(pipeline, model_file)
    print(f"Model saved to: {output_path}")
def main() -> None:
    """Command-line entry point: parse options and run ``train_and_save``.

    Options:
        --csv: optional dataset path (default ``None``).
        --output: pickle destination (default ``models/difficulty_model.pkl``).
        --test-size: held-out fraction as a float (default ``0.2``).
        --random-state: integer seed (default ``42``).
    """
    cli = argparse.ArgumentParser(
        description="Train TF-IDF + Logistic Regression model for question difficulty classification."
    )
    cli.add_argument(
        "--csv",
        type=str,
        default=None,
        help="Optional CSV path with columns: question,label (labels: easy, medium, hard)",
    )
    cli.add_argument(
        "--output",
        type=str,
        default="models/difficulty_model.pkl",
        help="Output pickle file path",
    )
    cli.add_argument("--test-size", type=float, default=0.2)
    cli.add_argument("--random-state", type=int, default=42)

    options = cli.parse_args()
    train_and_save(
        csv_path=options.csv,
        output_path=options.output,
        test_size=options.test_size,
        random_state=options.random_state,
    )
# Run training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()