Spaces:
Running
Running
| """ | |
| Xuất một bundle dữ liệu để đẩy lên Hugging Face Dataset repo. | |
| Ví dụ: | |
| python scripts/export_dataset_bundle.py --output-dir ..\\chatbot-lichsu-data --include-pdf | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| import shutil | |
| import sys | |
| from datetime import datetime, timezone | |
| ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | |
| if ROOT_DIR not in sys.path: | |
| sys.path.insert(0, ROOT_DIR) | |
| from backend.runtime_paths import APP_DATA_DIR, DB_PATH, PDF_DIR, VECTOR_DIR | |
| from data_processing.indexing import COLLECTION_NAME, get_stats | |
| def _copy_tree(src: str, dst: str) -> None: | |
| if os.path.exists(dst): | |
| shutil.rmtree(dst) | |
| shutil.copytree(src, dst) | |
| def _copy_file(src: str, dst: str) -> None: | |
| os.makedirs(os.path.dirname(dst), exist_ok=True) | |
| shutil.copy2(src, dst) | |
| def build_manifest(include_pdf: bool, include_db: bool) -> dict: | |
| stats = get_stats() | |
| return { | |
| "generated_at": datetime.now(timezone.utc).isoformat(), | |
| "app_data_dir": APP_DATA_DIR, | |
| "collection_name": COLLECTION_NAME, | |
| "total_chunks": stats.get("total_chunks", 0), | |
| "embedding_model": stats.get("embedding_model"), | |
| "includes": { | |
| "csdl_vector": True, | |
| "pdf": include_pdf, | |
| "chatbot_db": include_db, | |
| }, | |
| } | |
| def main() -> int: | |
| parser = argparse.ArgumentParser(description="Export dữ liệu runtime sang bundle để push lên HF Dataset repo.") | |
| parser.add_argument("--output-dir", required=True, help="Thư mục output cho dataset repo.") | |
| parser.add_argument("--include-pdf", action="store_true", help="Sao chép thêm thư mục pdf/.") | |
| parser.add_argument("--include-db", action="store_true", help="Sao chép thêm chatbot.db nếu muốn seed DB.") | |
| args = parser.parse_args() | |
| output_dir = os.path.abspath(args.output_dir) | |
| os.makedirs(output_dir, exist_ok=True) | |
| if not os.path.isdir(VECTOR_DIR): | |
| raise RuntimeError(f"Không tìm thấy thư mục vector runtime: {VECTOR_DIR}") | |
| _copy_tree(VECTOR_DIR, os.path.join(output_dir, "csdl_vector")) | |
| if args.include_pdf and os.path.isdir(PDF_DIR): | |
| _copy_tree(PDF_DIR, os.path.join(output_dir, "pdf")) | |
| if args.include_db and os.path.exists(DB_PATH): | |
| _copy_file(DB_PATH, os.path.join(output_dir, "chatbot.db")) | |
| manifest = build_manifest(args.include_pdf, args.include_db) | |
| with open(os.path.join(output_dir, "manifest.json"), "w", encoding="utf-8") as f: | |
| json.dump(manifest, f, ensure_ascii=False, indent=2) | |
| print(f"Đã xuất dataset bundle vào: {output_dir}") | |
| return 0 | |
| if __name__ == "__main__": | |
| raise SystemExit(main()) | |