Spaces:

Pavan-Kumar23
/

quiz-generator

Sleeping

quiz-generator / models /train_difficulty_model.py

Pavan Kumar

Deploy Quiz Generator

3e3f813 4 days ago

4.58 kB

	from __future__ import annotations

	import argparse
	import os
	import pickle
	from typing import List, Tuple

	import pandas as pd
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import classification_report
	from sklearn.model_selection import train_test_split
	from sklearn.pipeline import Pipeline

	ALLOWED_LABELS = {"easy", "medium", "hard"}


	def _fallback_samples() -> Tuple[List[str], List[str]]:
	texts = [
	"What is the capital of France?",
	"Define photosynthesis in one sentence.",
	"Name the largest planet in the solar system.",
	"Explain how binary search works and analyze its time complexity.",
	"What is the derivative of x squared?",
	"Describe the causes and outcomes of World War I.",
	"Prove that the sum of first n natural numbers is n times n plus one by two.",
	"Compare mitosis and meiosis with key differences.",
	"Discuss transformers in NLP and the role of self-attention.",
	]
	labels = [
	"easy",
	"easy",
	"easy",
	"medium",
	"medium",
	"medium",
	"hard",
	"hard",
	"hard",
	]
	return texts, labels


	def load_data(csv_path: str \| None) -> Tuple[List[str], List[str]]:
	if not csv_path:
	return _fallback_samples()

	if not os.path.exists(csv_path):
	raise FileNotFoundError(f"Dataset file not found: {csv_path}")

	data = pd.read_csv(csv_path)
	required_columns = {"question", "label"}
	if not required_columns.issubset(data.columns):
	raise ValueError("CSV must contain columns: question,label")

	cleaned = data.dropna(subset=["question", "label"]).copy()
	cleaned["label"] = cleaned["label"].astype(str).str.strip().str.lower()
	cleaned = cleaned[cleaned["label"].isin(ALLOWED_LABELS)]

	if cleaned.empty:
	raise ValueError("No valid rows found after filtering labels to easy/medium/hard.")

	texts = cleaned["question"].astype(str).tolist()
	labels = cleaned["label"].tolist()
	return texts, labels


	def build_pipeline() -> Pipeline:
	return Pipeline(
	steps=[
	(
	"tfidf",
	TfidfVectorizer(
	ngram_range=(1, 2),
	lowercase=True,
	strip_accents="unicode",
	min_df=1,
	),
	),
	(
	"classifier",
	LogisticRegression(
	max_iter=2000,
	solver="lbfgs",
	class_weight="balanced",
	),
	),
	]
	)


	def train_and_save(csv_path: str \| None, output_path: str, test_size: float, random_state: int) -> None:
	texts, labels = load_data(csv_path)

	unique_labels = sorted(set(labels))
	min_fraction_for_stratify = len(unique_labels) / max(len(labels), 1)
	adjusted_test_size = max(test_size, min_fraction_for_stratify)
	adjusted_test_size = min(adjusted_test_size, 0.5)

	x_train, x_test, y_train, y_test = train_test_split(
	texts,
	labels,
	test_size=adjusted_test_size,
	random_state=random_state,
	stratify=labels,
	)

	pipeline = build_pipeline()
	pipeline.fit(x_train, y_train)

	predictions = pipeline.predict(x_test)
	print("Classification report:")
	print(classification_report(y_test, predictions, labels=["easy", "medium", "hard"]))

	os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
	with open(output_path, "wb") as model_file:
	pickle.dump(pipeline, model_file)

	print(f"Model saved to: {output_path}")


	def main() -> None:
	parser = argparse.ArgumentParser(
	description="Train TF-IDF + Logistic Regression model for question difficulty classification."
	)
	parser.add_argument(
	"--csv",
	type=str,
	default=None,
	help="Optional CSV path with columns: question,label (labels: easy, medium, hard)",
	)
	parser.add_argument(
	"--output",
	type=str,
	default="models/difficulty_model.pkl",
	help="Output pickle file path",
	)
	parser.add_argument("--test-size", type=float, default=0.2)
	parser.add_argument("--random-state", type=int, default=42)

	args = parser.parse_args()
	train_and_save(
	csv_path=args.csv,
	output_path=args.output,
	test_size=args.test_size,
	random_state=args.random_state,
	)


	if __name__ == "__main__":
	main()