Spaces:

ganeshkonapalli
/

logreg__

Running

App Files Files Community

logreg__ / train_utils.py

ganeshkonapalli

Create train_utils.py

89e4a53 verified 7 months ago

raw

history blame contribute delete

3.28 kB

	import os
	import numpy as np
	import pandas as pd
	from sklearn.metrics import classification_report
	from tqdm import tqdm
	import joblib

	from config import LABEL_COLUMNS, MODEL_SAVE_DIR


	def train_logreg_models(X, y, label_encoders, model_class):
	    """
	    Fits a separate model for every label listed in LABEL_COLUMNS.

	    Args:
	        X (array-like): Feature matrix handed unchanged to each ``fit`` call.
	        y (DataFrame): Targets; ``y[col]`` is read for each label column.
	        label_encoders (dict): Accepted for interface compatibility; this
	            function does not read it.
	        model_class: Callable invoked with no arguments to create each
	            model. The created object must provide ``fit(X, y)``.

	    Returns:
	        dict: Fitted model instances keyed by label column name.
	    """

	    def _fit_for(label):
	        # Announce progress before the (potentially slow) fit starts.
	        print(f"Training Logistic Regression model for {label}...")
	        estimator = model_class()
	        estimator.fit(X, y[label])
	        # Keep the instance itself, not whatever fit() returns.
	        return estimator

	    return {label: _fit_for(label) for label in LABEL_COLUMNS}


	def evaluate_logreg_models(models, X_val, y_val, label_encoders):
	    """
	    Scores each per-label model against the validation set.

	    Args:
	        models (dict): Trained models keyed by label column name.
	        X_val (array-like): Validation features passed to ``predict``.
	        y_val (DataFrame): Validation targets; ``y_val[col]`` is read for
	            each entry of LABEL_COLUMNS.
	        label_encoders (dict): Accepted for interface compatibility; this
	            function does not read it, so labels are reported exactly as
	            they appear in ``y_val`` and in the model output.

	    Returns:
	        tuple: ``(reports, truths, predictions)`` where ``reports`` maps each
	        label to its ``classification_report`` dict, and ``truths`` /
	        ``predictions`` are lists (one entry per label, in LABEL_COLUMNS
	        order) of the true and predicted values converted with ``tolist()``.
	    """
	    per_label_reports = {}
	    true_labels, predicted_labels = [], []

	    for label in LABEL_COLUMNS:
	        estimator = models[label]
	        expected = y_val[label]
	        predicted = estimator.predict(X_val)

	        true_labels.append(expected.tolist())
	        predicted_labels.append(predicted.tolist())

	        # zero_division=0 reports 0 instead of warning when a class has no
	        # predicted or true samples.
	        per_label_reports[label] = classification_report(
	            expected, predicted, output_dict=True, zero_division=0
	        )

	    return per_label_reports, true_labels, predicted_labels


	def summarize_metrics(metrics):
	    """
	    Flattens per-field classification reports into one summary table.

	    Args:
	        metrics (dict): Maps a field name to a report dict, such as the
	            reports returned by ``evaluate_logreg_models``. Each report must
	            contain a 'weighted avg' mapping; its 'precision', 'recall',
	            'f1-score' and 'support' entries, and the report's top-level
	            'accuracy' entry, default to 0 when absent.

	    Returns:
	        pandas.DataFrame: One row per field with the columns "Field",
	        "Precision", "Recall", "F1-Score", "Accuracy" and "Support". The
	        columns are present even when ``metrics`` is empty.

	    Raises:
	        KeyError: If a report has no 'weighted avg' entry.
	    """
	    # Fixing the column list keeps the schema stable for empty input;
	    # without it an empty ``metrics`` would yield a frame with no columns.
	    columns = ["Field", "Precision", "Recall", "F1-Score", "Accuracy", "Support"]

	    rows = []
	    for field, report in metrics.items():
	        weighted = report['weighted avg']
	        rows.append({
	            "Field": field,
	            "Precision": weighted.get('precision', 0),
	            "Recall": weighted.get('recall', 0),
	            "F1-Score": weighted.get('f1-score', 0),
	            "Accuracy": report.get('accuracy', 0),
	            "Support": weighted.get('support', 0),
	        })
	    return pd.DataFrame(rows, columns=columns)


	def save_logreg_models(models, model_name):
	    """
	    Persists the models object to ``<MODEL_SAVE_DIR>/<model_name>.pkl``.

	    Args:
	        models: Object to serialize with ``joblib.dump`` (the dict returned
	            by ``train_logreg_models`` is the expected payload).
	        model_name (str): File stem; ".pkl" is appended.

	    Returns:
	        None. The destination path is printed after a successful write.
	    """
	    model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")

	    # joblib.dump does not create missing directories, so make sure the
	    # target folder exists; this makes a first save into a fresh checkout
	    # succeed instead of failing on the missing directory.
	    parent_dir = os.path.dirname(model_path)
	    if parent_dir:
	        os.makedirs(parent_dir, exist_ok=True)

	    joblib.dump(models, model_path)
	    print(f"Saved Logistic Regression models to {model_path}")


	def load_logreg_models(model_name):
	    """
	    Reads back the object stored at ``<MODEL_SAVE_DIR>/<model_name>.pkl``.

	    Args:
	        model_name (str): File stem; ".pkl" is appended.

	    Returns:
	        The object returned by ``joblib.load`` for that file.

	    Raises:
	        FileNotFoundError: If nothing exists at the computed path.
	    """
	    pkl_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")

	    if os.path.exists(pkl_path):
	        # NOTE: joblib.load unpickles the file, which can execute arbitrary
	        # code; only load model files from a trusted source.
	        loaded = joblib.load(pkl_path)
	        print(f"Loaded Logistic Regression models from {pkl_path}")
	        return loaded

	    raise FileNotFoundError(f"Model not found at {pkl_path}")


	def predict_logreg_probabilities(models, X):
	    """
	    Collects ``predict_proba`` output from every per-label model.

	    Args:
	        models (dict): Models keyed by label column name; an entry is read
	            for each name in LABEL_COLUMNS.
	        X (array-like): Features passed unchanged to each ``predict_proba``.

	    Returns:
	        list: One ``predict_proba`` result per label, in LABEL_COLUMNS order.
	    """
	    return [models[label].predict_proba(X) for label in LABEL_COLUMNS]