# gapura-oneclick / training / train_tfidf_classifier.py
# Author: Muhammad Ridzki Nugraha
# Deploy API and config (Batch 3) — commit 07476a1 (verified)
"""
Train TF-IDF severity classifier with improved accuracy
"""
import os
import sys
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import (
RandomForestClassifier,
GradientBoostingClassifier,
VotingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from dotenv import load_dotenv
import warnings
# Silence sklearn/pandas warnings so training console output stays readable.
warnings.filterwarnings("ignore")
# Pull environment variables (e.g. GOOGLE_SHEET_ID, read below) from a .env file.
load_dotenv()
# Add the project root to sys.path so `data.sheets_service` resolves when this
# script is executed directly from the training/ directory.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from data.sheets_service import GoogleSheetsService
# Keyword lists for weak-supervision severity labelling. Matching is
# substring-based on the lowercased combined report text; each list mixes
# English and Indonesian terms. Hoisted to module level so they are built
# once instead of once per record.
CRITICAL_KEYWORDS = [
    "emergency", "darurat",
    "critical", "kritis",
    "severe", "parah",
    "injury", "cedera",
    "accident", "kecelakaan",
    "death", "kematian",
    "fire", "kebakaran",
    "explosion", "ledakan",
    "safety issue",
]
HIGH_KEYWORDS = [
    "damage", "rusak",
    "torn", "robek",
    "broken", "pecah",
    "urgent", "mendesak",
    "lost", "hilang",
    "stolen", "dicuri",
    "security", "keamanan",
    "theft", "pencurian",
]
MEDIUM_KEYWORDS = [
    "delay", "terlambat",
    "wrong", "salah",
    "error", "kesalahan",
    "fail", "gagal",
    "problem", "masalah",
    "issue", "isu",
    "complaint", "keluhan",
    "reject", "missing",
]


def _label_severity(text: str) -> str:
    """Assign a heuristic severity label to a report's combined text.

    Highest matching severity wins:
      - any critical keyword                      -> "Critical"
      - any high keyword                          -> "High"
      - any medium keyword, or text > 300 chars   -> "Medium"
      - otherwise                                 -> "Low"

    Note: the original rule chain also counted keyword occurrences and
    checked a damage-word subset, but those branches were shadowed by the
    immediately following ``count >= 1`` branches, so a single keyword hit
    per tier is the effective (and preserved) behavior.
    """
    text_lower = text.lower()
    if any(kw in text_lower for kw in CRITICAL_KEYWORDS):
        return "Critical"
    if any(kw in text_lower for kw in HIGH_KEYWORDS):
        return "High"
    if any(kw in text_lower for kw in MEDIUM_KEYWORDS) or len(text) > 300:
        return "Medium"
    return "Low"


def _prepare_dataset(records):
    """Build a weak-labelled DataFrame from raw sheet rows.

    Concatenates the Report / Root_Caused / Action_Taken fields of each
    record (skipping empty cells and the sheet placeholder "#N/A"), drops
    texts shorter than 10 characters, and labels each sample with the
    heuristic severity.

    Args:
        records: iterable of dict-like rows from the spreadsheet.

    Returns:
        pd.DataFrame with "text" and "severity" columns.
    """
    samples = []
    for report in records:
        parts = [report.get(field, "") for field in ("Report", "Root_Caused", "Action_Taken")]
        combined = " ".join(p for p in parts if p and p != "#N/A")
        if len(combined) < 10:  # too short to classify meaningfully
            continue
        samples.append({"text": combined, "severity": _label_severity(combined)})
    return pd.DataFrame(samples)


def _build_ensemble():
    """Create the soft-voting ensemble (RF, LR, SVM, GB) used as classifier.

    RF and GB carry double voting weight; class_weight="balanced" on the
    linear models compensates for the skewed severity distribution.
    """
    rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=20,
        min_samples_split=2,
        min_samples_leaf=1,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
    )
    lr = LogisticRegression(
        C=1.0, class_weight="balanced", max_iter=1000, random_state=42, n_jobs=-1
    )
    svm = SVC(
        C=1.0,
        kernel="linear",
        class_weight="balanced",
        probability=True,  # required for soft voting
        random_state=42,
    )
    gb = GradientBoostingClassifier(
        n_estimators=200, max_depth=8, learning_rate=0.1, random_state=42
    )
    return VotingClassifier(
        estimators=[("rf", rf), ("lr", lr), ("svm", svm), ("gb", gb)],
        voting="soft",
        weights=[2, 1, 1, 2],
    )


def train_tfidf_classifier():
    """Train and persist the TF-IDF severity classifier.

    Fetches incident reports from two Google Sheets tabs, weak-labels them
    with keyword heuristics, trains a TF-IDF + soft-voting ensemble, and
    saves vectorizer/classifier/label-encoder/config artifacts under
    models/nlp/severity_classifier relative to this script.

    Returns:
        float | None: held-out test accuracy, or None when GOOGLE_SHEET_ID
        is not configured.
    """
    import json  # stdlib; local import kept from the original script style

    print("Training TF-IDF Severity Classifier (Improved)...")
    sheets_service = GoogleSheetsService()
    spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
    if not spreadsheet_id:
        print("ERROR: GOOGLE_SHEET_ID not set")
        return None

    print("Fetching data from spreadsheet...")
    non_cargo = sheets_service.fetch_sheet_data(spreadsheet_id, "NON CARGO", "A1:AA500")
    cargo = sheets_service.fetch_sheet_data(spreadsheet_id, "CGO", "A1:Z500")
    all_data = non_cargo + cargo
    print(f"Fetched {len(all_data)} records")

    df = _prepare_dataset(all_data)
    print(f"\nPrepared {len(df)} samples")
    print(f"Severity distribution:\n{df['severity'].value_counts()}")

    X_train, X_test, y_train, y_test = train_test_split(
        df["text"],
        df["severity"],
        test_size=0.2,
        random_state=42,
        stratify=df["severity"],
    )

    print("\nTraining TF-IDF vectorizer...")
    vectorizer = TfidfVectorizer(
        max_features=8000,
        ngram_range=(1, 3),
        min_df=1,
        max_df=0.95,
        sublinear_tf=True,
        strip_accents="unicode",
        lowercase=True,
    )
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    classes = np.unique(y_train)

    print("\nTraining ensemble classifier...")
    ensemble = _build_ensemble()
    ensemble.fit(X_train_vec, y_train)

    y_pred = ensemble.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nTest Accuracy: {accuracy:.4f} ({accuracy * 100:.2f}%)")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    cv_scores = cross_val_score(
        ensemble, X_train_vec, y_train, cv=5, scoring="accuracy"
    )
    print(
        f"\nCross-validation accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})"
    )

    # Persist artifacts (vectorizer, classifier, label maps, metadata).
    label_map = {i: label for i, label in enumerate(classes)}
    reverse_map = {label: i for i, label in label_map.items()}
    model_dir = os.path.join(
        os.path.dirname(__file__), "..", "models", "nlp", "severity_classifier"
    )
    os.makedirs(model_dir, exist_ok=True)
    print(f"\nSaving models to {model_dir}...")
    with open(os.path.join(model_dir, "vectorizer.pkl"), "wb") as f:
        pickle.dump(vectorizer, f)
    with open(os.path.join(model_dir, "classifier.pkl"), "wb") as f:
        pickle.dump(ensemble, f)
    with open(os.path.join(model_dir, "label_encoder.pkl"), "wb") as f:
        pickle.dump({"label_map": label_map, "reverse_map": reverse_map}, f)
    with open(os.path.join(model_dir, "config.json"), "w") as f:
        json.dump(
            {
                "model_type": "ensemble_voting",
                "estimators": [
                    "random_forest",
                    "logistic_regression",
                    "svm",
                    "gradient_boosting",
                ],
                "num_features": len(vectorizer.get_feature_names_out()),
                # cast numpy string types to plain str for JSON serialization
                "classes": [str(c) for c in classes],
                "test_accuracy": round(accuracy, 4),
                "cv_accuracy": round(cv_scores.mean(), 4),
                "trained_on": pd.Timestamp.now().isoformat(),
            },
            f,
            indent=2,
        )
    print("✓ TF-IDF classifier saved successfully!")
    print("  - vectorizer.pkl")
    print("  - classifier.pkl (ensemble)")
    print("  - label_encoder.pkl")
    print("  - config.json")
    return accuracy
if __name__ == "__main__":
    # Run training and report whether the 81% accuracy target was reached.
    final_accuracy = train_tfidf_classifier()
    if final_accuracy:
        if final_accuracy >= 0.81:
            print(f"\n✅ Target accuracy achieved: {final_accuracy * 100:.2f}% >= 81%")
        else:
            print(f"\n⚠️ Target not met: {final_accuracy * 100:.2f}% < 81%")