Spaces:
Running
Running
File size: 4,890 Bytes
b655c88 98f87e1 b655c88 98f87e1 b655c88 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 | """
Download the runtime dataset from a Hugging Face Dataset repo into APP_DATA_DIR.

The dataset repo is expected to have the following layout:
    manifest.json
    csdl_vector/
    pdf/          # optional
    chatbot.db    # optional
"""
from __future__ import annotations
import json
import os
import shutil
import sys
import tempfile
from pathlib import Path
from huggingface_hub import hf_hub_download, snapshot_download
# Project root = the directory one level above this file's own directory.
# It is put at the front of sys.path so the `backend` package can be imported
# when this file is executed directly as a script.
ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
if ROOT_DIR not in sys.path:
    sys.path.insert(0, ROOT_DIR)
from backend.runtime_paths import (
APP_DATA_DIR,
DATASET_MANIFEST_PATH,
DB_PATH,
PDF_DIR,
VECTOR_DIR,
ensure_app_dirs,
)
def _read_json(path: str) -> dict:
    """Load a JSON object from *path*, falling back to an empty dict.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON object. An empty dict is returned when the file does
        not exist, cannot be decoded as UTF-8 JSON, or holds a JSON value that
        is not an object.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (FileNotFoundError, NotADirectoryError):
        # A missing file simply means "no data yet".
        return {}
    except ValueError as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses. A damaged file is reported and treated as empty so the
        # caller can recover instead of failing on every run.
        print(f"[BOOTSTRAP] Bỏ qua file JSON không hợp lệ {path}: {exc}")
        return {}
    # Callers use dict methods on the result, so reject lists/scalars here.
    return data if isinstance(data, dict) else {}
def _write_json(path: str, data: dict) -> None:
    """Serialise *data* to *path* as indented UTF-8 JSON.

    Missing parent directories are created first. Non-ASCII characters are
    written as-is rather than escaped.

    Args:
        path: Destination file; may be a bare filename in the current directory.
        data: JSON-serialisable mapping to store.
    """
    parent = os.path.dirname(path)
    # dirname is "" for a bare filename, and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
def _copy_tree(src: str, dst: str) -> None:
    """Replace *dst* with a fresh copy of the directory tree at *src*.

    Any existing *dst* is removed before copying, so files that are no longer
    present in *src* do not linger in the destination.

    Args:
        src: Directory to copy from.
        dst: Directory to (re)create.

    Raises:
        FileNotFoundError: If *src* does not exist.
        NotADirectoryError: If *src* exists but is not a directory.
    """
    # Validate the source BEFORE deleting anything: otherwise a bad source
    # would wipe the existing destination and only then fail in copytree.
    if not os.path.exists(src):
        raise FileNotFoundError(f"Thư mục nguồn không tồn tại: {src}")
    if not os.path.isdir(src):
        raise NotADirectoryError(f"Nguồn không phải là thư mục: {src}")
    if os.path.exists(dst):
        shutil.rmtree(dst)
    shutil.copytree(src, dst)
def _copy_file_if_missing(src: str, dst: str) -> None:
    """Copy *src* to *dst* only when *src* exists and *dst* does not.

    The destination's parent directory is created when needed. An existing
    destination is never overwritten, and a missing source is a no-op.
    """
    if not os.path.exists(src) or os.path.exists(dst):
        return
    target_dir = os.path.dirname(dst)
    os.makedirs(target_dir, exist_ok=True)
    shutil.copy2(src, dst)
def _copy_file_always(src: str, dst: str) -> None:
    """Copy *src* to *dst*, overwriting *dst* if it already exists.

    The destination's parent directory is created when needed. Nothing
    happens when *src* does not exist.
    """
    if not os.path.exists(src):
        return
    target_dir = os.path.dirname(dst)
    os.makedirs(target_dir, exist_ok=True)
    shutil.copy2(src, dst)
def _vector_dir_ready() -> bool:
    """Return True when VECTOR_DIR is an existing directory with at least one entry."""
    if not os.path.isdir(VECTOR_DIR):
        return False
    # Pulling a single entry is enough to tell "empty" from "populated".
    first_entry = next(Path(VECTOR_DIR).iterdir(), None)
    return first_entry is not None
def _load_remote_manifest(repo_id: str, revision: str, token: str | None) -> dict:
    """Fetch and parse ``manifest.json`` from the dataset repo.

    Args:
        repo_id: Identifier of the Hugging Face dataset repo.
        revision: Revision of the repo to read from.
        token: Access token forwarded to the download call, or None.

    Returns:
        The parsed manifest. Any exception raised while downloading or
        reading it is logged and an empty dict is returned instead.
    """
    request = {
        "repo_id": repo_id,
        "repo_type": "dataset",
        "filename": "manifest.json",
        "revision": revision,
        "token": token,
    }
    try:
        # Parsing stays inside the try so read errors are handled like
        # download errors.
        return _read_json(hf_hub_download(**request))
    except Exception as exc:
        print(f"[BOOTSTRAP] Không tải được manifest.json: {exc}")
        return {}
def _should_sync(local_meta: dict, remote_manifest: dict, repo_id: str, revision: str, force: bool) -> bool:
    """Decide whether the local dataset copy has to be refreshed.

    A sync is needed when it is forced, when the vector directory is not
    ready, when no local metadata exists, or when the recorded repo id,
    revision or manifest differs from the requested/remote one.

    Returns:
        True if a sync should run, False if the local copy is current.
    """
    if force or not _vector_dir_ready() or not local_meta:
        return True
    expected = {
        "repo_id": repo_id,
        "revision": revision,
        "manifest": remote_manifest,
    }
    # Any mismatch between the recorded and the expected values means stale data.
    return any(local_meta.get(key) != value for key, value in expected.items())
def bootstrap_space_data(force: bool = False) -> bool:
    """Synchronise the runtime dataset from the Hugging Face dataset repo.

    Configuration is read from environment variables:
        HF_DATASET_REPO: dataset repo id; when empty the sync is skipped.
        HF_DATASET_REVISION: revision to download (defaults to "main").
        HF_TOKEN / HUGGINGFACE_TOKEN: optional access token.

    When a sync is needed, the repo snapshot is downloaded into a temporary
    directory, ``csdl_vector/`` replaces VECTOR_DIR, ``pdf/`` (if present)
    replaces PDF_DIR and ``chatbot.db`` (if present) overwrites DB_PATH.
    The repo id, revision and remote manifest are then recorded in
    DATASET_MANIFEST_PATH for the next up-to-date check.

    Args:
        force: Sync even if the local copy looks current.

    Returns:
        True if a sync was performed, False if it was skipped (repo not
        configured, or local data already matches).

    Raises:
        RuntimeError: If the downloaded snapshot has no ``csdl_vector/``
            directory. Exceptions from the download and copy steps are not
            caught here.
    """
    ensure_app_dirs()

    repo_id = os.getenv("HF_DATASET_REPO", "").strip()
    if not repo_id:
        print("[BOOTSTRAP] HF_DATASET_REPO chưa được cấu hình. Bỏ qua bước tải dataset.")
        return False

    # A blank or whitespace-only revision falls back to "main".
    revision = os.getenv("HF_DATASET_REVISION", "main").strip() or "main"
    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")

    local_meta = _read_json(DATASET_MANIFEST_PATH)
    remote_manifest = _load_remote_manifest(repo_id, revision, token)
    if not _should_sync(local_meta, remote_manifest, repo_id, revision, force):
        print(f"[BOOTSTRAP] Dataset đã đồng bộ sẵn tại {APP_DATA_DIR}")
        return False

    print(f"[BOOTSTRAP] Đang tải dataset từ {repo_id}@{revision} ...")
    # The snapshot lives in a temporary directory, so everything needed must
    # be copied out before the `with` block ends.
    with tempfile.TemporaryDirectory(prefix="hf_dataset_") as tmp_dir:
        snapshot_dir = snapshot_download(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision,
            token=token,
            local_dir=tmp_dir,
        )

        vector_src = os.path.join(snapshot_dir, "csdl_vector")
        if not os.path.isdir(vector_src):
            raise RuntimeError("Dataset repo không chứa thư mục csdl_vector/")
        _copy_tree(vector_src, VECTOR_DIR)

        pdf_src = os.path.join(snapshot_dir, "pdf")
        if os.path.isdir(pdf_src):
            _copy_tree(pdf_src, PDF_DIR)

        db_src = os.path.join(snapshot_dir, "chatbot.db")
        # Only copy (and only report success) when the snapshot actually
        # contains a chatbot.db file.
        if os.path.isfile(db_src):
            _copy_file_always(db_src, DB_PATH)
            print("[BOOTSTRAP] Đã tải chatbot.db mới nhất từ dataset repo")

    meta = {
        "repo_id": repo_id,
        "revision": revision,
        "manifest": remote_manifest,
    }
    _write_json(DATASET_MANIFEST_PATH, meta)
    print(f"[BOOTSTRAP] Hoàn tất đồng bộ dataset vào {APP_DATA_DIR}")
    return True
def main() -> int:
    """Run the dataset bootstrap as a script entry point.

    A forced sync is requested when HF_DATASET_FORCE_SYNC is exactly "1".

    Returns:
        0 when the bootstrap finished without raising, 1 when it raised
        (the error is printed).
    """
    force_sync = os.getenv("HF_DATASET_FORCE_SYNC", "0") == "1"
    try:
        bootstrap_space_data(force=force_sync)
    except Exception as exc:
        print(f"[BOOTSTRAP] Lỗi đồng bộ dataset: {exc}")
        return 1
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
|