|
|
import argparse
import json
import os
import sys
import time
from datetime import datetime, timezone
from pathlib import Path

import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
|
|
|
|
|
|
|
# Make the repository root importable so sibling packages resolve no matter
# where this script is launched from (script lives two levels below the root).
ROOT_DIR = Path(__file__).resolve().parents[2]

if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))


# Filesystem layout: data and artifacts live next to this script; logs go
# under <repo>/logs/intent.
BASE_DIR = Path(__file__).resolve().parent
DEFAULT_DATASET = BASE_DIR / "intent_dataset.json"  # seed intent examples
GENERATED_QA_DIR = BASE_DIR / "generated_qa"  # generated QA samples (optional)
ARTIFACT_DIR = BASE_DIR / "artifacts"  # trained model + metrics output
LOG_DIR = ROOT_DIR / "logs" / "intent"
# Created eagerly at import time so later writes never fail on a missing dir.
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)
LOG_DIR.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
def load_dataset(path: Path):
    """Read the seed intent dataset and flatten it into parallel lists.

    The JSON payload is expected to have an ``intents`` list whose entries
    each carry a ``name`` and an ``examples`` list of utterances.

    Returns:
        Tuple of (texts, labels, raw payload dict).
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    pairs = [
        (example, intent["name"])
        for intent in payload.get("intents", [])
        for example in intent.get("examples", [])
    ]
    texts = [text for text, _ in pairs]
    labels = [label for _, label in pairs]
    return texts, labels, payload
|
|
|
|
|
|
|
|
def load_generated_qa(directory: Path):
    """Collect generated QA questions as extra intent training samples.

    Each ``*.json`` file in *directory* should contain a list of objects
    compatible with `QAItem` from `generated_qa`, at minimum having:
    - question: str
    - intent: str

    Files that cannot be parsed, or whose top-level value is not a list,
    are skipped silently — generated data is strictly best-effort.

    Returns:
        Tuple of (questions, intents) as parallel lists.
    """
    questions: list[str] = []
    intents: list[str] = []

    if not directory.exists():
        return questions, intents

    for json_path in sorted(directory.glob("*.json")):
        try:
            records = json.loads(json_path.read_text(encoding="utf-8"))
        except Exception:
            # Unreadable or malformed file: ignore and keep going.
            continue

        if not isinstance(records, list):
            continue

        for record in records:
            if not isinstance(record, dict):
                continue
            question = str(record.get("question") or "").strip()
            if not question:
                continue
            # Missing/blank intent falls back to the default bucket.
            intent = str(record.get("intent") or "").strip() or "search_legal"
            questions.append(question)
            intents.append(intent)

    return questions, intents
|
|
|
|
|
|
|
|
def load_combined_dataset(path: Path, generated_dir: Path):
    """Merge the seed intent dataset with generated QA questions.

    Returns:
        Tuple of (texts, labels, seed dataset metadata dict).
    """
    seed_texts, seed_labels, meta = load_dataset(path)
    extra_texts, extra_labels = load_generated_qa(generated_dir)
    return seed_texts + extra_texts, seed_labels + extra_labels, meta
|
|
|
|
|
|
|
|
def build_pipelines():
    """Build the candidate intent-classification pipelines.

    Fix: each pipeline now gets its OWN TfidfVectorizer instance. The
    previous version shared a single vectorizer object between both
    pipelines, so fitting the second pipeline silently re-fitted the
    vectorizer embedded inside the already-fitted first pipeline. That
    only worked by accident because both were fit on the same data.

    Returns:
        Dict mapping model name -> unfitted sklearn Pipeline.
    """
    def make_vectorizer():
        # Word-level unigrams + bigrams; the token pattern keeps
        # single-character tokens, which the default pattern drops.
        return TfidfVectorizer(
            analyzer="word",
            ngram_range=(1, 2),
            lowercase=True,
            token_pattern=r"\b\w+\b",
        )

    return {
        "multinomial_nb": Pipeline([
            ("tfidf", make_vectorizer()),
            ("clf", MultinomialNB()),
        ]),
        "logistic_regression": Pipeline([
            ("tfidf", make_vectorizer()),
            ("clf", LogisticRegression(max_iter=1000, solver="lbfgs")),
        ]),
    }
|
|
|
|
|
|
|
|
def train(dataset_path: Path, test_size: float = 0.2, random_state: int = 42):
    """Train candidate intent classifiers and persist the best one.

    Loads the seed dataset merged with generated QA samples, fits every
    pipeline from ``build_pipelines`` on a stratified split, keeps the
    pipeline with the highest test accuracy, then writes the model
    artifact, a metrics JSON file, and an append-only JSONL log entry.

    Args:
        dataset_path: Path to ``intent_dataset.json``.
        test_size: Fraction of samples held out for evaluation.
        random_state: Seed for the train/test split.

    Returns:
        Tuple of (model artifact path, metrics file path, best metrics dict).

    Raises:
        ValueError: If the combined dataset contains no samples.
        RuntimeError: If no pipeline was trained (should be unreachable).
    """
    texts, labels, meta = load_combined_dataset(dataset_path, GENERATED_QA_DIR)
    if not texts:
        raise ValueError("Dataset rỗng, không thể huấn luyện")

    X_train, X_test, y_train, y_test = train_test_split(
        texts, labels, test_size=test_size, random_state=random_state, stratify=labels
    )

    # Hoisted out of the loop: the label ordering is loop-invariant and was
    # previously recomputed twice per candidate model.
    label_names = sorted(set(labels))

    pipelines = build_pipelines()
    best_model = None
    best_metrics = None

    for name, pipeline in pipelines.items():
        start = time.perf_counter()
        pipeline.fit(X_train, y_train)
        train_duration = time.perf_counter() - start

        y_pred = pipeline.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)
        cm = confusion_matrix(y_test, y_pred, labels=label_names)

        metrics = {
            "model": name,
            "accuracy": acc,
            "train_duration_sec": train_duration,
            "classification_report": report,
            "confusion_matrix": cm.tolist(),
            "labels": label_names,
            "dataset_version": meta.get("version"),
            # Timezone-aware timestamp; datetime.utcnow() is naive and
            # deprecated since Python 3.12. The replace() keeps the same
            # "...Z" suffix format the old code produced.
            "timestamp": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
            "test_size": test_size,
            "samples": len(texts),
        }

        if best_model is None or acc > best_metrics["accuracy"]:
            best_model = pipeline
            best_metrics = metrics

    if best_model is None:
        # Invariant guard; raise (not assert) so it survives `python -O`.
        raise RuntimeError("No pipeline was trained")

    model_path = ARTIFACT_DIR / "intent_model.joblib"
    metrics_path = ARTIFACT_DIR / "metrics.json"
    joblib.dump(best_model, model_path)
    metrics_path.write_text(
        json.dumps(best_metrics, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    # Append-only JSONL training log for auditability.
    log_entry = {
        "event": "train_intent",
        "model": best_metrics["model"],
        "accuracy": best_metrics["accuracy"],
        "timestamp": best_metrics["timestamp"],
        "samples": best_metrics["samples"],
        "dataset_version": best_metrics["dataset_version"],
        "artifact": str(model_path.relative_to(ROOT_DIR)),
    }

    log_file = LOG_DIR / "train.log"
    with log_file.open("a", encoding="utf-8") as fh:
        fh.write(json.dumps(log_entry, ensure_ascii=False) + "\n")

    return model_path, metrics_path, best_metrics
|
|
|
|
|
|
|
|
def parse_args():
    """Parse the CLI arguments for the intent-model training script."""
    parser = argparse.ArgumentParser(description="Huấn luyện model intent cho chatbot")
    parser.add_argument(
        "--dataset",
        type=Path,
        default=DEFAULT_DATASET,
        help="Đường dẫn tới intent_dataset.json",
    )
    parser.add_argument(
        "--test-size",
        type=float,
        default=0.2,
        help="Tỉ lệ dữ liệu test",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Giá trị random seed",
    )
    return parser.parse_args()
|
|
|
|
|
|
|
|
def main():
    """CLI entry point: train the intent model and print a short summary."""
    args = parse_args()
    model_path, metrics_path, metrics = train(
        args.dataset, test_size=args.test_size, random_state=args.seed
    )
    summary_lines = (
        "Huấn luyện hoàn tất:",
        f" Model: {metrics['model']}",
        f" Accuracy: {metrics['accuracy']:.4f}",
        f" Model artifact: {model_path}",
        f" Metrics: {metrics_path}",
    )
    for line in summary_lines:
        print(line)


if __name__ == "__main__":
    main()
|
|
|