Spaces:

davidtran999
/

hue-portal-backend-v2

Sleeping

App Files Files Community

davidtran999 committed 10 days ago

Commit

980fef7

verified ·

1 Parent(s): f5ba315

Upload backend/chatbot/training/train_intent.py with huggingface_hub

Browse files

Files changed (1) hide show

backend/chatbot/training/train_intent.py +198 -0

backend/chatbot/training/train_intent.py ADDED Viewed

	@@ -0,0 +1,198 @@

+import argparse
+import json
+import os
+from pathlib import Path
+import sys
+import time
+from datetime import datetime
+import joblib
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+from sklearn.model_selection import train_test_split
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import Pipeline
# Resolve this script's absolute location once; all other paths derive from it.
_SCRIPT_PATH = Path(__file__).resolve()
# parents[2] is two directories above the one that contains this file.
ROOT_DIR = _SCRIPT_PATH.parents[2]

# Put the root directory at the front of the import path (once).
_root_entry = str(ROOT_DIR)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

BASE_DIR = _SCRIPT_PATH.parent
DEFAULT_DATASET = BASE_DIR / "intent_dataset.json"
GENERATED_QA_DIR = BASE_DIR / "generated_qa"
ARTIFACT_DIR = BASE_DIR / "artifacts"
LOG_DIR = ROOT_DIR / "logs" / "intent"

# Output directories are created eagerly, as a side effect of loading the module.
for _output_dir in (ARTIFACT_DIR, LOG_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)
def load_dataset(path: Path):
    """Read the seed intent dataset and flatten it into parallel lists.

    The file is parsed as UTF-8 JSON. Every entry under the top-level
    ``"intents"`` key contributes one (example, name) pair per item of its
    ``"examples"`` list. A missing ``"intents"`` or ``"examples"`` key is
    treated as empty; an entry without ``"name"`` raises ``KeyError``.

    Returns:
        A ``(texts, labels, payload)`` tuple where ``texts[i]`` is labelled
        ``labels[i]`` and ``payload`` is the whole parsed JSON document.
    """
    raw_json = path.read_text(encoding="utf-8")
    document = json.loads(raw_json)

    samples_out = []
    labels_out = []
    for entry in document.get("intents", []):
        # Look the name up first so a nameless entry fails even when it
        # carries no examples.
        label = entry["name"]
        samples = entry.get("examples", [])
        samples_out.extend(samples)
        labels_out.extend([label] * len(samples))
    return samples_out, labels_out, document
def load_generated_qa(directory: Path, default_intent: str = "search_legal"):
    """Load generated QA questions as additional intent training samples.

    Every ``*.json`` file directly inside ``directory`` is read in sorted
    order. A file is expected to hold a JSON list of objects having at least:

      - question: str
      - intent: str

    Loading is best-effort: files that cannot be read, are not valid UTF-8
    or are not valid JSON are skipped, as are files whose top-level value is
    not a list, list items that are not objects, and items whose question is
    empty after stripping whitespace.

    Args:
        directory: Folder holding the generated QA files. If it does not
            exist (or is not a directory) two empty lists are returned.
        default_intent: Label used for items whose ``intent`` is missing or
            blank.

    Returns:
        A ``(texts, labels)`` tuple of parallel lists.
    """
    texts: list[str] = []
    labels: list[str] = []
    if not directory.is_dir():
        return texts, labels

    for path in sorted(directory.glob("*.json")):
        try:
            payload = json.loads(path.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # OSError: unreadable entry (permissions, a directory named
            # *.json, ...). ValueError: covers both UnicodeDecodeError and
            # json.JSONDecodeError. Anything else is a real bug and should
            # propagate instead of being silently swallowed.
            continue
        if not isinstance(payload, list):
            continue
        for item in payload:
            if not isinstance(item, dict):
                continue
            question = str(item.get("question") or "").strip()
            if not question:
                continue
            intent = str(item.get("intent") or "").strip() or default_intent
            texts.append(question)
            labels.append(intent)
    return texts, labels
def load_combined_dataset(path: Path, generated_dir: Path):
    """Return the seed intent samples followed by the generated QA samples.

    The seed dataset at ``path`` is loaded first; questions found in
    ``generated_dir`` are then appended to the same lists. The third element
    of the result is the parsed seed dataset document.
    """
    seed_texts, seed_labels, meta = load_dataset(path)
    extra_texts, extra_labels = load_generated_qa(generated_dir)
    # In-place concatenation: the seed lists themselves are grown and returned.
    seed_texts += extra_texts
    seed_labels += extra_labels
    return seed_texts, seed_labels, meta
def build_pipelines():
    """Build the candidate intent classifiers, keyed by model name.

    Each candidate is a two-step ``Pipeline``: a word-level TF-IDF vectorizer
    (unigrams and bigrams, lowercased, tokens matched by ``\\b\\w+\\b``)
    followed by a classifier.

    Returns:
        A dict with the keys ``"multinomial_nb"`` and
        ``"logistic_regression"``, each mapping to an unfitted pipeline.
    """

    def make_vectorizer():
        # A Pipeline stores the estimator objects it is given, so every
        # pipeline needs its own vectorizer instance; sharing one would let
        # fitting one pipeline overwrite the fitted state used by the other.
        return TfidfVectorizer(
            analyzer="word",
            ngram_range=(1, 2),
            lowercase=True,
            token_pattern=r"\b\w+\b",
        )

    nb_pipeline = Pipeline([
        ("tfidf", make_vectorizer()),
        ("clf", MultinomialNB()),
    ])
    logreg_pipeline = Pipeline([
        ("tfidf", make_vectorizer()),
        ("clf", LogisticRegression(max_iter=1000, solver="lbfgs")),
    ])
    return {
        "multinomial_nb": nb_pipeline,
        "logistic_regression": logreg_pipeline,
    }
def train(dataset_path: Path, test_size: float = 0.2, random_state: int = 42):
    """Train every candidate pipeline and persist the most accurate one.

    The seed dataset at ``dataset_path`` is merged with the generated QA
    samples from ``GENERATED_QA_DIR`` and split into train/test parts,
    stratified by label. Each pipeline from ``build_pipelines()`` is fitted
    on the training part and scored on the test part; the one with the
    highest accuracy wins (on a tie the earlier candidate is kept).

    Side effects:
        * writes the winning pipeline to ``ARTIFACT_DIR / "intent_model.joblib"``
        * writes its metrics to ``ARTIFACT_DIR / "metrics.json"``
        * appends one JSON line to ``LOG_DIR / "train.log"``

    Args:
        dataset_path: Path to the seed intent dataset JSON file.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        ``(model_path, metrics_path, best_metrics)``.

    Raises:
        ValueError: If the combined dataset contains no samples.
        RuntimeError: If ``build_pipelines()`` yields no candidates.
    """
    # Function-scope import: the file-level import only brings in `datetime`.
    from datetime import timezone

    texts, labels, meta = load_combined_dataset(dataset_path, GENERATED_QA_DIR)
    if not texts:
        raise ValueError("Dataset rỗng, không thể huấn luyện")

    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=test_size, random_state=random_state, stratify=labels
    )

    # Fixed label order shared by the confusion matrix and the metrics file;
    # computed once instead of twice per candidate.
    label_names = sorted(set(labels))

    best_model = None
    best_metrics = None
    for name, pipeline in build_pipelines().items():
        start = time.perf_counter()
        pipeline.fit(X_train, y_train)
        train_duration = time.perf_counter() - start

        y_pred = pipeline.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)
        cm = confusion_matrix(y_test, y_pred, labels=label_names)

        # Aware UTC timestamp rendered in the same "...Z" form the naive
        # (and now deprecated) datetime.utcnow() call used to produce.
        timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

        metrics = {
            "model": name,
            "accuracy": acc,
            "train_duration_sec": train_duration,
            "classification_report": report,
            "confusion_matrix": cm.tolist(),
            "labels": label_names,
            "dataset_version": meta.get("version"),
            "timestamp": timestamp,
            "test_size": test_size,
            "samples": len(texts),
        }
        # Strict ">" keeps the earlier candidate when accuracies are equal.
        if best_metrics is None or acc > best_metrics["accuracy"]:
            best_model = pipeline
            best_metrics = metrics

    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if best_model is None or best_metrics is None:
        raise RuntimeError("build_pipelines() returned no candidate models")

    model_path = ARTIFACT_DIR / "intent_model.joblib"
    metrics_path = ARTIFACT_DIR / "metrics.json"
    joblib.dump(best_model, model_path)
    metrics_path.write_text(json.dumps(best_metrics, ensure_ascii=False, indent=2), encoding="utf-8")

    # One JSON object per line, appended so earlier runs are preserved.
    log_entry = {
        "event": "train_intent",
        "model": best_metrics["model"],
        "accuracy": best_metrics["accuracy"],
        "timestamp": best_metrics["timestamp"],
        "samples": best_metrics["samples"],
        "dataset_version": best_metrics["dataset_version"],
        "artifact": str(model_path.relative_to(ROOT_DIR)),
    }
    log_file = LOG_DIR / "train.log"
    with log_file.open("a", encoding="utf-8") as fh:
        fh.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
    return model_path, metrics_path, best_metrics
def parse_args(argv=None):
    """Parse the command-line options of the training script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) the arguments are taken from ``sys.argv[1:]``,
            which is argparse's own behaviour for ``parse_args(None)``.

    Returns:
        A namespace with ``dataset`` (``Path``, defaults to
        ``DEFAULT_DATASET``), ``test_size`` (``float``, default ``0.2``) and
        ``seed`` (``int``, default ``42``).
    """
    parser = argparse.ArgumentParser(description="Huấn luyện model intent cho chatbot")
    parser.add_argument("--dataset", type=Path, default=DEFAULT_DATASET, help="Đường dẫn tới intent_dataset.json")
    parser.add_argument("--test-size", type=float, default=0.2, help="Tỉ lệ dữ liệu test")
    parser.add_argument("--seed", type=int, default=42, help="Giá trị random seed")
    return parser.parse_args(argv)
def main():
    """Command-line entry point: train with the parsed options, then print a summary."""
    options = parse_args()
    model_path, metrics_path, metrics = train(
        options.dataset,
        test_size=options.test_size,
        random_state=options.seed,
    )
    summary_lines = (
        "Huấn luyện hoàn tất:",
        f"  Model: {metrics['model']}",
        f"  Accuracy: {metrics['accuracy']:.4f}",
        f"  Model artifact: {model_path}",
        f"  Metrics: {metrics_path}",
    )
    for line in summary_lines:
        print(line)


# Run the training only when the file is executed as a script.
if __name__ == "__main__":
    main()