Spaces:
Running
Running
"""
Download the runtime dataset from a Hugging Face Dataset repo into the
APP_DATA_DIR directory.

The dataset repo is expected to have the following structure:

    manifest.json
    csdl_vector/
    pdf/          # optional
    chatbot.db    # optional
"""
| from __future__ import annotations | |
| import json | |
| import os | |
| import shutil | |
| import sys | |
| import tempfile | |
| from pathlib import Path | |
| from huggingface_hub import hf_hub_download, snapshot_download | |
# Resolve the project root (the directory above this script's folder) and make
# sure it is on sys.path before the `backend` import that follows.
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)
| from backend.runtime_paths import ( | |
| APP_DATA_DIR, | |
| DATASET_MANIFEST_PATH, | |
| DB_PATH, | |
| PDF_DIR, | |
| VECTOR_DIR, | |
| ensure_app_dirs, | |
| ) | |
def _read_json(path: str) -> dict:
    """Load a JSON object from ``path``, falling back to an empty dict.

    An empty dict is returned when the file does not exist, when its content
    cannot be decoded as JSON, or when the top-level JSON value is not an
    object. Treating a damaged file like a missing one lets callers recover
    (for example by re-syncing) instead of failing on every run.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed JSON object, or ``{}`` in the cases described above.
    """
    if not os.path.exists(path):
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError.
        print(f"[BOOTSTRAP] Bỏ qua file JSON không hợp lệ {path}: {exc}")
        return {}
    if not isinstance(data, dict):
        # Callers use dict methods such as .get(), so reject lists/scalars here.
        print(f"[BOOTSTRAP] Bỏ qua file JSON không hợp lệ {path}: không phải object")
        return {}
    return data
def _write_json(path: str, data: dict) -> None:
    """Serialize ``data`` as pretty-printed UTF-8 JSON at ``path``.

    Missing parent directories are created. The content is first written to a
    sibling ``<path>.tmp`` file and then moved into place with ``os.replace``,
    so a failure during serialization leaves any existing file untouched.

    Args:
        path: Destination file; may be a bare filename in the current directory.
        data: JSON-serializable mapping to store.

    Raises:
        TypeError: If ``data`` contains values ``json`` cannot serialize.
        OSError: If the directory or file cannot be written.
    """
    parent = os.path.dirname(path)
    if parent:
        # os.makedirs("") raises, so only create a parent when there is one.
        os.makedirs(parent, exist_ok=True)

    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    finally:
        # Only still present when writing or replacing failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def _copy_tree(src: str, dst: str) -> None:
    """Replace ``dst`` with a fresh copy of the directory tree at ``src``.

    Whatever currently exists at ``dst`` (directory, regular file or symlink)
    is removed first, so no stale entries survive the copy. The source is
    validated before anything is deleted, so a bad ``src`` never wipes the
    existing destination.

    Args:
        src: Directory to copy from.
        dst: Directory path to create or replace.

    Raises:
        FileNotFoundError: If ``src`` does not exist.
        NotADirectoryError: If ``src`` exists but is not a directory.
    """
    if not os.path.isdir(src):
        error_type = NotADirectoryError if os.path.exists(src) else FileNotFoundError
        raise error_type(f"Source directory is not available: {src}")

    if os.path.islink(dst) or os.path.isfile(dst):
        # shutil.rmtree refuses symlinks and plain files; unlink them instead.
        os.remove(dst)
    elif os.path.isdir(dst):
        shutil.rmtree(dst)
    shutil.copytree(src, dst)
def _copy_file_if_missing(src: str, dst: str) -> None:
    """Copy ``src`` to ``dst`` only when ``dst`` does not exist yet.

    Nothing happens when ``src`` is missing or ``dst`` is already present.
    Missing parent directories of ``dst`` are created.

    Args:
        src: File to copy from.
        dst: Destination path; may be a bare filename in the current directory.
    """
    if not os.path.exists(src) or os.path.exists(dst):
        return
    parent = os.path.dirname(dst)
    if parent:
        # os.makedirs("") raises, so only create a parent when there is one.
        os.makedirs(parent, exist_ok=True)
    shutil.copy2(src, dst)
def _copy_file_always(src: str, dst: str) -> None:
    """Copy ``src`` to ``dst``, overwriting ``dst`` if it already exists.

    Nothing happens when ``src`` is missing. Missing parent directories of
    ``dst`` are created.

    Args:
        src: File to copy from.
        dst: Destination path; may be a bare filename in the current directory.
    """
    if not os.path.exists(src):
        return
    parent = os.path.dirname(dst)
    if parent:
        # os.makedirs("") raises, so only create a parent when there is one.
        os.makedirs(parent, exist_ok=True)
    shutil.copy2(src, dst)
def _vector_dir_ready() -> bool:
    """Report whether VECTOR_DIR is an existing directory with at least one entry."""
    if not os.path.isdir(VECTOR_DIR):
        return False
    # Pull at most one entry; the listing's contents themselves are irrelevant.
    first_entry = next(Path(VECTOR_DIR).iterdir(), None)
    return first_entry is not None
def _load_remote_manifest(repo_id: str, revision: str, token: str | None) -> dict:
    """Fetch and parse ``manifest.json`` from the dataset repo.

    Any exception raised while downloading or reading the manifest is printed
    and swallowed, in which case an empty dict is returned.

    Args:
        repo_id: Dataset repo identifier passed to ``hf_hub_download``.
        revision: Revision of the repo to read from.
        token: Access token forwarded to ``hf_hub_download``, or ``None``.

    Returns:
        The parsed manifest, or ``{}`` when it could not be obtained.
    """
    download_args = {
        "repo_id": repo_id,
        "repo_type": "dataset",
        "filename": "manifest.json",
        "revision": revision,
        "token": token,
    }
    try:
        return _read_json(hf_hub_download(**download_args))
    except Exception as error:
        print(f"[BOOTSTRAP] Không tải được manifest.json: {error}")
    return {}
def _should_sync(local_meta: dict, remote_manifest: dict, repo_id: str, revision: str, force: bool) -> bool:
    """Decide whether the local dataset copy has to be refreshed.

    A sync is needed when it is forced, when the vector directory is not
    ready, when no local metadata is recorded, or when the recorded repo id,
    revision or manifest differs from the requested/remote one.

    Args:
        local_meta: Metadata stored by the previous sync (may be empty).
        remote_manifest: Manifest currently published in the dataset repo.
        repo_id: Dataset repo being requested.
        revision: Revision being requested.
        force: When true, always request a sync.

    Returns:
        ``True`` if a sync should run, ``False`` if the local copy is current.
    """
    if force or not _vector_dir_ready():
        return True
    if not local_meta:
        return True
    # Checked in this order: repo id, revision, then manifest content.
    expected = {
        "repo_id": repo_id,
        "revision": revision,
        "manifest": remote_manifest,
    }
    return any(local_meta.get(key) != value for key, value in expected.items())
def bootstrap_space_data(force: bool = False) -> bool:
    """Synchronize the runtime dataset from the Hugging Face dataset repo.

    Configuration is read from the environment:
    ``HF_DATASET_REPO`` (required; the step is skipped when unset),
    ``HF_DATASET_REVISION`` (defaults to ``main``), and ``HF_TOKEN`` or
    ``HUGGINGFACE_TOKEN`` for authentication.

    When a sync is needed, the repo snapshot is downloaded to a temporary
    directory, ``csdl_vector/`` replaces VECTOR_DIR, ``pdf/`` (if present)
    replaces PDF_DIR, ``chatbot.db`` (if present) overwrites DB_PATH, and the
    repo id, revision and remote manifest are recorded at
    DATASET_MANIFEST_PATH.

    Args:
        force: Sync even when the local copy appears up to date.

    Returns:
        ``True`` if a sync was performed, ``False`` if it was skipped.

    Raises:
        RuntimeError: If the downloaded snapshot has no ``csdl_vector/`` directory.
        Exception: Errors from ``snapshot_download`` or the copy steps propagate.
    """
    ensure_app_dirs()

    repo_id = os.getenv("HF_DATASET_REPO", "").strip()
    if not repo_id:
        print("[BOOTSTRAP] HF_DATASET_REPO chưa được cấu hình. Bỏ qua bước tải dataset.")
        return False

    revision = os.getenv("HF_DATASET_REVISION", "main").strip() or "main"
    # Treat an empty token variable the same as an unset one.
    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN") or None

    local_meta = _read_json(DATASET_MANIFEST_PATH)
    remote_manifest = _load_remote_manifest(repo_id, revision, token)
    if not _should_sync(local_meta, remote_manifest, repo_id, revision, force):
        print(f"[BOOTSTRAP] Dataset đã đồng bộ sẵn tại {APP_DATA_DIR}")
        return False

    print(f"[BOOTSTRAP] Đang tải dataset từ {repo_id}@{revision} ...")
    with tempfile.TemporaryDirectory(prefix="hf_dataset_") as tmp_dir:
        snapshot_dir = snapshot_download(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
            token=token,
            local_dir=tmp_dir,
        )

        # The vector store is mandatory; refuse to continue without it.
        vector_src = os.path.join(snapshot_dir, "csdl_vector")
        if not os.path.isdir(vector_src):
            raise RuntimeError("Dataset repo không chứa thư mục csdl_vector/")
        _copy_tree(vector_src, VECTOR_DIR)

        # PDFs are optional.
        pdf_src = os.path.join(snapshot_dir, "pdf")
        if os.path.isdir(pdf_src):
            _copy_tree(pdf_src, PDF_DIR)

        # The database is optional too; only report it when it was really copied.
        db_src = os.path.join(snapshot_dir, "chatbot.db")
        if os.path.isfile(db_src):
            _copy_file_always(db_src, DB_PATH)
            print("[BOOTSTRAP] Đã tải chatbot.db mới nhất từ dataset repo")

    # Record what was synced so the next start can detect an up-to-date copy.
    meta = {
        "repo_id": repo_id,
        "revision": revision,
        "manifest": remote_manifest,
    }
    _write_json(DATASET_MANIFEST_PATH, meta)
    print(f"[BOOTSTRAP] Hoàn tất đồng bộ dataset vào {APP_DATA_DIR}")
    return True
def main() -> int:
    """Run the dataset bootstrap and translate the outcome into an exit code.

    A sync is forced when the ``HF_DATASET_FORCE_SYNC`` environment variable
    equals ``"1"``.

    Returns:
        ``0`` when the bootstrap finished without raising, ``1`` when any
        exception was raised (the error is printed).
    """
    force_sync = os.getenv("HF_DATASET_FORCE_SYNC", "0") == "1"
    try:
        bootstrap_space_data(force=force_sync)
    except Exception as error:
        print(f"[BOOTSTRAP] Lỗi đồng bộ dataset: {error}")
        return 1
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())