Spaces:
Sleeping
Sleeping
"""
Train TF-IDF severity classifier with improved accuracy
"""
import json
import os
import pickle
import sys
import warnings

import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC

warnings.filterwarnings("ignore")
load_dotenv()

# Make the project root importable before the local import below.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from data.sheets_service import GoogleSheetsService  # noqa: E402
# --- Severity keyword heuristics (English + Indonesian) ---------------------
# Hoisted to module level: the original rebuilt these three lists inside the
# per-record loop on every iteration.
_CRITICAL_KEYWORDS = [
    "emergency", "darurat", "critical", "kritis", "severe", "parah",
    "injury", "cedera", "accident", "kecelakaan", "death", "kematian",
    "fire", "kebakaran", "explosion", "ledakan", "safety issue",
]
_HIGH_KEYWORDS = [
    "damage", "rusak", "torn", "robek", "broken", "pecah", "urgent",
    "mendesak", "lost", "hilang", "stolen", "dicuri", "security",
    "keamanan", "theft", "pencurian",
]
_MEDIUM_KEYWORDS = [
    "delay", "terlambat", "wrong", "salah", "error", "kesalahan", "fail",
    "gagal", "problem", "masalah", "issue", "isu", "complaint", "keluhan",
    "reject", "missing",
]


def _label_severity(text: str) -> str:
    """Weak-label one report's severity from keyword heuristics.

    The original branch chain contained dead branches: ``high_count >= 2 or
    (high_count >= 1 and has_damage)`` and the following ``high_count >= 1``
    both produced "High" (the damage keywords are a subset of the high list),
    and ``medium_count >= 3`` / ``medium_count >= 1`` both produced "Medium".
    The logic therefore collapses to simple ``any()`` checks with identical
    behavior.

    Args:
        text: Combined free-text fields of one report.

    Returns:
        One of "Critical", "High", "Medium", "Low".
    """
    text_lower = text.lower()
    if any(kw in text_lower for kw in _CRITICAL_KEYWORDS):
        return "Critical"
    if any(kw in text_lower for kw in _HIGH_KEYWORDS):
        return "High"
    if any(kw in text_lower for kw in _MEDIUM_KEYWORDS):
        return "Medium"
    # Long reports with no matched keywords are still assumed non-trivial.
    if len(text) > 300:
        return "Medium"
    return "Low"


def _build_dataset(records) -> pd.DataFrame:
    """Build a (text, severity) DataFrame from raw sheet records.

    Concatenates the "Report", "Root_Caused" and "Action_Taken" fields of each
    record, skipping empty / "#N/A" values, and drops records with fewer than
    10 characters of usable text.
    """
    rows = []
    for report in records:
        parts = (
            report.get(field, "")
            for field in ("Report", "Root_Caused", "Action_Taken")
        )
        combined = " ".join(p for p in parts if p and p != "#N/A")
        if len(combined) < 10:  # also covers the empty-text case
            continue
        rows.append({"text": combined, "severity": _label_severity(combined)})
    # Explicit columns so an empty result still has the expected schema.
    return pd.DataFrame(rows, columns=["text", "severity"])


def _build_ensemble() -> VotingClassifier:
    """Construct the (unfitted) soft-voting ensemble: RF, LR, SVM, GB."""
    rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=20,
        min_samples_split=2,
        min_samples_leaf=1,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
    )
    lr = LogisticRegression(
        C=1.0, class_weight="balanced", max_iter=1000, random_state=42, n_jobs=-1
    )
    svm = SVC(
        C=1.0,
        kernel="linear",
        class_weight="balanced",
        probability=True,  # required for soft voting
        random_state=42,
    )
    gb = GradientBoostingClassifier(
        n_estimators=200, max_depth=8, learning_rate=0.1, random_state=42
    )
    return VotingClassifier(
        estimators=[("rf", rf), ("lr", lr), ("svm", svm), ("gb", gb)],
        voting="soft",
        weights=[2, 1, 1, 2],  # favour the tree ensembles
    )


def _save_artifacts(model_dir, vectorizer, ensemble, classes, accuracy, cv_mean):
    """Pickle the fitted vectorizer/classifier/label maps and write config.json."""
    os.makedirs(model_dir, exist_ok=True)
    label_map = {i: label for i, label in enumerate(classes)}
    reverse_map = {label: i for i, label in label_map.items()}
    with open(os.path.join(model_dir, "vectorizer.pkl"), "wb") as f:
        pickle.dump(vectorizer, f)
    with open(os.path.join(model_dir, "classifier.pkl"), "wb") as f:
        pickle.dump(ensemble, f)
    with open(os.path.join(model_dir, "label_encoder.pkl"), "wb") as f:
        pickle.dump({"label_map": label_map, "reverse_map": reverse_map}, f)
    with open(os.path.join(model_dir, "config.json"), "w") as f:
        json.dump(
            {
                "model_type": "ensemble_voting",
                "estimators": [
                    "random_forest",
                    "logistic_regression",
                    "svm",
                    "gradient_boosting",
                ],
                "num_features": len(vectorizer.get_feature_names_out()),
                # Cast numpy str_ to plain str for clean JSON output.
                "classes": [str(c) for c in classes],
                "test_accuracy": round(accuracy, 4),
                "cv_accuracy": round(cv_mean, 4),
                "trained_on": pd.Timestamp.now().isoformat(),
            },
            f,
            indent=2,
        )


def train_tfidf_classifier():
    """Train and persist the TF-IDF + voting-ensemble severity classifier.

    Fetches incident reports from Google Sheets, weak-labels each record's
    severity with keyword heuristics, trains a soft-voting ensemble on TF-IDF
    features, and saves all artifacts under models/nlp/severity_classifier.

    Returns:
        float | None: Test-set accuracy, or None when configuration or usable
        training data is missing.
    """
    print("Training TF-IDF Severity Classifier (Improved)...")
    sheets_service = GoogleSheetsService()
    spreadsheet_id = os.getenv("GOOGLE_SHEET_ID")
    if not spreadsheet_id:
        print("ERROR: GOOGLE_SHEET_ID not set")
        return None
    print("Fetching data from spreadsheet...")
    non_cargo = sheets_service.fetch_sheet_data(spreadsheet_id, "NON CARGO", "A1:AA500")
    cargo = sheets_service.fetch_sheet_data(spreadsheet_id, "CGO", "A1:Z500")
    all_data = non_cargo + cargo
    print(f"Fetched {len(all_data)} records")

    df = _build_dataset(all_data)
    # BUGFIX: the original crashed inside train_test_split on an empty dataset.
    if df.empty:
        print("ERROR: no usable training samples found")
        return None
    print(f"\nPrepared {len(df)} samples")
    print(f"Severity distribution:\n{df['severity'].value_counts()}")

    # BUGFIX: stratified splitting raises ValueError when any class has fewer
    # than 2 samples — fall back to an unstratified split in that case.
    stratify = (
        df["severity"] if df["severity"].value_counts().min() >= 2 else None
    )
    X_train, X_test, y_train, y_test = train_test_split(
        df["text"],
        df["severity"],
        test_size=0.2,
        random_state=42,
        stratify=stratify,
    )

    print("\nTraining TF-IDF vectorizer...")
    vectorizer = TfidfVectorizer(
        max_features=8000,
        ngram_range=(1, 3),
        min_df=1,
        max_df=0.95,
        sublinear_tf=True,
        strip_accents="unicode",
        lowercase=True,
    )
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    print("\nTraining ensemble classifier...")
    ensemble = _build_ensemble()
    ensemble.fit(X_train_vec, y_train)

    y_pred = ensemble.predict(X_test_vec)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nTest Accuracy: {accuracy:.4f} ({accuracy * 100:.2f}%)")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    cv_scores = cross_val_score(
        ensemble, X_train_vec, y_train, cv=5, scoring="accuracy"
    )
    print(
        f"\nCross-validation accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})"
    )

    model_dir = os.path.join(
        os.path.dirname(__file__), "..", "models", "nlp", "severity_classifier"
    )
    print(f"\nSaving models to {model_dir}...")
    _save_artifacts(
        model_dir, vectorizer, ensemble, np.unique(y_train), accuracy, cv_scores.mean()
    )
    print("✓ TF-IDF classifier saved successfully!")
    print(" - vectorizer.pkl")
    print(" - classifier.pkl (ensemble)")
    print(" - label_encoder.pkl")
    print(" - config.json")
    return accuracy
if __name__ == "__main__":
    # Train the classifier and report whether the 81% accuracy target was hit.
    result = train_tfidf_classifier()
    if result:
        if result >= 0.81:
            print(f"\n✅ Target accuracy achieved: {result * 100:.2f}% >= 81%")
        else:
            print(f"\n⚠️ Target not met: {result * 100:.2f}% < 81%")