# gapura-oneclick / training / train_tfidf_classifier.py
# Author: Muhammad Ridzki Nugraha
# Deploy API and config (Batch 3) — commit 07476a1 (verified)
"""
Train TF-IDF severity classifier with improved accuracy
"""
import os
import sys
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import (
RandomForestClassifier,
GradientBoostingClassifier,
VotingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from dotenv import load_dotenv
import warnings
# Silence sklearn/pandas warnings so training console output stays readable.
warnings.filterwarnings("ignore")
# Pull environment variables (e.g. GOOGLE_SHEET_ID, read below) from a .env file.
load_dotenv()
# Add the project root to sys.path so `data.sheets_service` resolves when this
# script is executed directly from the training/ directory.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from data.sheets_service import GoogleSheetsService
# Keyword lists for weak-supervision severity labelling. Matching is
# substring-based on the lowercased combined report text; each list mixes
# English and Indonesian terms. Hoisted to module level so they are built
# once instead of once per record.
CRITICAL_KEYWORDS = [
    "emergency", "darurat",
    "critical", "kritis",
    "severe", "parah",
    "injury", "cedera",
    "accident", "kecelakaan",
    "death", "kematian",
    "fire", "kebakaran",
    "explosion", "ledakan",
    "safety issue",
]
HIGH_KEYWORDS = [
    "damage", "rusak",
    "torn", "robek",
    "broken", "pecah",
    "urgent", "mendesak",
    "lost", "hilang",
    "stolen", "dicuri",
    "security", "keamanan",
    "theft", "pencurian",
]
MEDIUM_KEYWORDS = [
    "delay", "terlambat",
    "wrong", "salah",
    "error", "kesalahan",
    "fail", "gagal",
    "problem", "masalah",
    "issue", "isu",
    "complaint", "keluhan",
    "reject", "missing",
]


def _label_severity(text: str) -> str:
    """Assign a heuristic severity label to a report's combined text.

    Highest matching severity wins:
      - any critical keyword                      -> "Critical"
      - any high keyword                          -> "High"
      - any medium keyword, or text > 300 chars   -> "Medium"
      - otherwise                                 -> "Low"

    Note: the original rule chain also counted keyword occurrences and
    checked a damage-word subset, but those branches were shadowed by the
    immediately following ``count >= 1`` branches, so a single keyword hit
    per tier is the effective (and preserved) behavior.
    """
    text_lower = text.lower()
    if any(kw in text_lower for kw in CRITICAL_KEYWORDS):
        return "Critical"
    if any(kw in text_lower for kw in HIGH_KEYWORDS):
        return "High"
    if any(kw in text_lower for kw in MEDIUM_KEYWORDS) or len(text) > 300:
        return "Medium"
    return "Low"


def _prepare_dataset(records):
    """Build a weak-labelled DataFrame from raw sheet rows.

    Concatenates the Report / Root_Caused / Action_Taken fields of each
    record (skipping empty cells and the sheet placeholder "#N/A"), drops
    texts shorter than 10 characters, and labels each sample with the
    heuristic severity.

    Args:
        records: iterable of dict-like rows from the spreadsheet.

    Returns:
        pd.DataFrame with "text" and "severity" columns.
    """
    samples = []
    for report in records:
        parts = [report.get(field, "") for field in ("Report", "Root_Caused", "Action_Taken")]
        combined = " ".join(p for p in parts if p and p != "#N/A")
        if len(combined) < 10:  # too short to classify meaningfully
            continue
        samples.append({"text": combined, "severity": _label_severity(combined)})
    return pd.DataFrame(samples)


def _build_ensemble():
    """Create the soft-voting ensemble (RF, LR, SVM, GB) used as classifier.

    RF and GB carry double voting weight; class_weight="balanced" on the
    linear models compensates for the skewed severity distribution.
    """
    rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=20,
        min_samples_split=2,
        min_samples_leaf=1,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
    )
    lr = LogisticRegression(
        C=1.0, class_weight="balanced", max_iter=1000, random_state=42, n_jobs=-1
    )
    svm = SVC(
        C=1.0,
        kernel="linear",
        class_weight="balanced",
        probability=True,  # required for soft voting
        random_state=42,
    )
    gb = GradientBoostingClassifier(
        n_estimators=200, max_depth=8, learning_rate=0.1, random_state=42
    )
    return VotingClassifier(
        estimators=[("rf", rf), ("lr", lr), ("svm", svm), ("gb", gb)],
        voting="soft",
        weights=[2, 1, 1, 2],
    )


def train_tfidf_classifier():
    """Train and persist the TF-IDF severity classifier.

    Fetches incident reports from two Google Sheets tabs, weak-labels them
    with keyword heuristics, trains a TF-IDF + soft-voting ensemble, and
    saves vectorizer/classifier/label-encoder/config artifacts under
    models/nlp/severity_classifier relative to this script.

    Returns:
        float | None: held-out test accuracy, or None when GOOGLE_SHEET_ID
        is not configured.
    """
    import json  # stdlib; local import kept from the original script style

    print("Training TF-IDF Severity Classifier (Improved)...")
    sheets_service = GoogleSheetsService()
    spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
    if not spreadsheet_id:
        print("ERROR: GOOGLE_SHEET_ID not set")
        return None

    print("Fetching data from spreadsheet...")
    non_cargo = sheets_service.fetch_sheet_data(spreadsheet_id, "NON CARGO", "A1:AA500")
    cargo = sheets_service.fetch_sheet_data(spreadsheet_id, "CGO", "A1:Z500")
    all_data = non_cargo + cargo
    print(f"Fetched {len(all_data)} records")

    df = _prepare_dataset(all_data)
    print(f"\nPrepared {len(df)} samples")
    print(f"Severity distribution:\n{df['severity'].value_counts()}")

    X_train, X_test, y_train, y_test = train_test_split(
        df["text"],
        df["severity"],
        test_size=0.2,
        random_state=42,
        stratify=df["severity"],
    )

    print("\nTraining TF-IDF vectorizer...")
    vectorizer = TfidfVectorizer(
        max_features=8000,
        ngram_range=(1, 3),
        min_df=1,
        max_df=0.95,
        sublinear_tf=True,
        strip_accents="unicode",
        lowercase=True,
    )
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    classes = np.unique(y_train)

    print("\nTraining ensemble classifier...")
    ensemble = _build_ensemble()
    ensemble.fit(X_train_vec, y_train)

    y_pred = ensemble.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nTest Accuracy: {accuracy:.4f} ({accuracy * 100:.2f}%)")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    cv_scores = cross_val_score(
        ensemble, X_train_vec, y_train, cv=5, scoring="accuracy"
    )
    print(
        f"\nCross-validation accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})"
    )

    # Persist artifacts (vectorizer, classifier, label maps, metadata).
    label_map = {i: label for i, label in enumerate(classes)}
    reverse_map = {label: i for i, label in label_map.items()}
    model_dir = os.path.join(
        os.path.dirname(__file__), "..", "models", "nlp", "severity_classifier"
    )
    os.makedirs(model_dir, exist_ok=True)
    print(f"\nSaving models to {model_dir}...")
    with open(os.path.join(model_dir, "vectorizer.pkl"), "wb") as f:
        pickle.dump(vectorizer, f)
    with open(os.path.join(model_dir, "classifier.pkl"), "wb") as f:
        pickle.dump(ensemble, f)
    with open(os.path.join(model_dir, "label_encoder.pkl"), "wb") as f:
        pickle.dump({"label_map": label_map, "reverse_map": reverse_map}, f)
    with open(os.path.join(model_dir, "config.json"), "w") as f:
        json.dump(
            {
                "model_type": "ensemble_voting",
                "estimators": [
                    "random_forest",
                    "logistic_regression",
                    "svm",
                    "gradient_boosting",
                ],
                "num_features": len(vectorizer.get_feature_names_out()),
                # cast numpy string types to plain str for JSON serialization
                "classes": [str(c) for c in classes],
                "test_accuracy": round(accuracy, 4),
                "cv_accuracy": round(cv_scores.mean(), 4),
                "trained_on": pd.Timestamp.now().isoformat(),
            },
            f,
            indent=2,
        )
    print("✓ TF-IDF classifier saved successfully!")
    print("  - vectorizer.pkl")
    print("  - classifier.pkl (ensemble)")
    print("  - label_encoder.pkl")
    print("  - config.json")
    return accuracy
if __name__ == "__main__":
    # Run training and report whether the 81% accuracy target was reached.
    final_accuracy = train_tfidf_classifier()
    if final_accuracy:
        if final_accuracy >= 0.81:
            print(f"\n✅ Target accuracy achieved: {final_accuracy * 100:.2f}% >= 81%")
        else:
            print(f"\n⚠️ Target not met: {final_accuracy * 100:.2f}% < 81%")