"""
Xuất một bundle dữ liệu để đẩy lên Hugging Face Dataset repo.
Ví dụ:
python scripts/export_dataset_bundle.py --output-dir ..\\chatbot-lichsu-data --include-pdf
"""
from __future__ import annotations
import argparse
import json
import os
import shutil
import sys
from datetime import datetime, timezone
# Absolute path of the directory two levels above this file, i.e. the parent
# of the directory that contains this script.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Put that directory at the front of sys.path (once) before the project
# imports that follow; presumably the `backend` and `data_processing`
# packages live directly under it -- verify against the repository layout.
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)
from backend.runtime_paths import APP_DATA_DIR, DB_PATH, PDF_DIR, VECTOR_DIR
from data_processing.indexing import COLLECTION_NAME, get_stats
def _copy_tree(src: str, dst: str) -> None:
if os.path.exists(dst):
shutil.rmtree(dst)
shutil.copytree(src, dst)
def _copy_file(src: str, dst: str) -> None:
os.makedirs(os.path.dirname(dst), exist_ok=True)
shutil.copy2(src, dst)
def build_manifest(include_pdf: bool, include_db: bool) -> dict:
    """Assemble the metadata dictionary describing an exported bundle.

    Args:
        include_pdf: Value recorded under ``includes.pdf``.
        include_db: Value recorded under ``includes.chatbot_db``.

    Returns:
        A dict holding the UTC generation timestamp (ISO 8601), the
        application data directory, the collection name, the chunk count and
        embedding model reported by ``get_stats()``, and the ``includes``
        flags. ``csdl_vector`` is always recorded as ``True``.
    """
    # get_stats() is assumed to return a mapping (it is queried with .get);
    # a missing "total_chunks" entry is reported as 0.
    index_stats = get_stats()
    created_at = datetime.now(timezone.utc).isoformat()

    bundled_parts = {
        "csdl_vector": True,
        "pdf": include_pdf,
        "chatbot_db": include_db,
    }

    manifest = {
        "generated_at": created_at,
        "app_data_dir": APP_DATA_DIR,
        "collection_name": COLLECTION_NAME,
        "total_chunks": index_stats.get("total_chunks", 0),
        "embedding_model": index_stats.get("embedding_model"),
        "includes": bundled_parts,
    }
    return manifest
def main() -> int:
    """Export the runtime data into a bundle directory and write its manifest.

    Command-line options:
        --output-dir   (required) directory that receives the bundle.
        --include-pdf  also copy the PDF directory as ``pdf/``.
        --include-db   also copy the database file as ``chatbot.db``.

    The vector directory is always copied to ``csdl_vector/``. Optional parts
    that were requested but do not exist are skipped with a warning on
    stderr, and ``manifest.json`` records what was actually copied rather
    than what was requested.

    Returns:
        0 on success.

    Raises:
        RuntimeError: If the runtime vector directory does not exist.
    """
    parser = argparse.ArgumentParser(description="Export dữ liệu runtime sang bundle để push lên HF Dataset repo.")
    parser.add_argument("--output-dir", required=True, help="Thư mục output cho dataset repo.")
    parser.add_argument("--include-pdf", action="store_true", help="Sao chép thêm thư mục pdf/.")
    parser.add_argument("--include-db", action="store_true", help="Sao chép thêm chatbot.db nếu muốn seed DB.")
    args = parser.parse_args()

    # Validate the mandatory input before touching the filesystem, so a
    # failed run does not leave an empty output directory behind.
    if not os.path.isdir(VECTOR_DIR):
        raise RuntimeError(f"Không tìm thấy thư mục vector runtime: {VECTOR_DIR}")

    output_dir = os.path.abspath(args.output_dir)
    os.makedirs(output_dir, exist_ok=True)
    _copy_tree(VECTOR_DIR, os.path.join(output_dir, "csdl_vector"))

    # Track what is really copied: the manifest must not claim a part is
    # bundled when its source was missing and the copy was skipped.
    pdf_copied = False
    if args.include_pdf:
        if os.path.isdir(PDF_DIR):
            _copy_tree(PDF_DIR, os.path.join(output_dir, "pdf"))
            pdf_copied = True
        else:
            print(
                f"Cảnh báo: không tìm thấy thư mục PDF, bỏ qua --include-pdf: {PDF_DIR}",
                file=sys.stderr,
            )

    db_copied = False
    if args.include_db:
        if os.path.isfile(DB_PATH):
            _copy_file(DB_PATH, os.path.join(output_dir, "chatbot.db"))
            db_copied = True
        else:
            print(
                f"Cảnh báo: không tìm thấy file DB, bỏ qua --include-db: {DB_PATH}",
                file=sys.stderr,
            )

    manifest = build_manifest(pdf_copied, db_copied)
    with open(os.path.join(output_dir, "manifest.json"), "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)

    print(f"Đã xuất dataset bundle vào: {output_dir}")
    return 0
# Script entry point: run the exporter and hand main()'s return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|