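# NOTE: the next three lines look like web-page residue (avatar caption, commit
# message, short commit hash); kept as comments so the module stays importable.
# ZhangNy's picture
# Add Space app files
# 75db650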
"""
Index bootstrap utilities for Hugging Face Spaces.
This Space relies on a prebuilt index stored on Hugging Face Datasets:
- ChromaDB persist directory (vector store)
- SQLite doc store (parent documents)
At startup we download (once) and place the index into a writable storage dir
(prefer /data on Spaces when persistent storage is enabled).
"""
from __future__ import annotations
import json
import logging
import os
import shutil
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional, Tuple
from huggingface_hub import snapshot_download
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Hugging Face dataset repo id that ensure_index() uses when the caller passes
# no (or a blank) repo_id.
DEFAULT_INDEX_REPO_ID = "ZhangNy/radiology-index-qwen3-embedding-0.6b"
@dataclass(frozen=True)
class IndexPaths:
    """
    Immutable record of where the index artifacts live on disk.

    Attributes:
        vector_db_path: ChromaDB persist directory (vector store).
        doc_store_path: SQLite doc store file (parent documents).
        manifest_path: Location of manifest.json, or None when there is none.
        snapshot_dir: Hugging Face snapshot the artifacts were copied from, or
            None when an already-present local index was reused.
    """

    vector_db_path: Path
    doc_store_path: Path
    manifest_path: Optional[Path]
    snapshot_dir: Optional[Path]
def resolve_default_storage_dir() -> Path:
    """
    Pick the storage directory to use when the caller did not configure one.

    Candidates, highest priority first:
    1. $RAG_STORAGE_DIR, when set to a non-blank value (user override)
    2. /data/radiology_rag, when /data exists (Spaces persistent storage)
    3. ./storage (local fallback)
    """
    # Surrounding whitespace is ignored, so a blank variable counts as unset.
    override = os.environ.get("RAG_STORAGE_DIR", "").strip()
    if override:
        return Path(override)

    persistent_root = Path("/data")
    return persistent_root / "radiology_rag" if persistent_root.exists() else Path("./storage")
def _find_index_artifacts(snapshot_dir: Path) -> Tuple[Path, Path, Optional[Path]]:
"""
Find (chroma_db_dir, doc_store_db, manifest_json) inside a HF snapshot.
We support either:
- chroma_db/, doc_store.db, manifest.json
- storage/chroma_db/, storage/doc_store.db, storage/manifest.json
"""
candidates = [
(snapshot_dir / "chroma_db", snapshot_dir / "doc_store.db", snapshot_dir / "manifest.json"),
(snapshot_dir / "storage" / "chroma_db", snapshot_dir / "storage" / "doc_store.db", snapshot_dir / "storage" / "manifest.json"),
]
for chroma_dir, doc_db, manifest in candidates:
if chroma_dir.exists() and chroma_dir.is_dir() and doc_db.exists() and doc_db.is_file():
return chroma_dir, doc_db, (manifest if manifest.exists() else None)
raise FileNotFoundError(
"Could not locate index artifacts inside snapshot. "
"Expected either {chroma_db/, doc_store.db} or {storage/chroma_db/, storage/doc_store.db}."
)
def read_manifest(manifest_path: Optional[Path]) -> Optional[Dict[str, Any]]:
    """
    Load manifest.json as a dictionary, best-effort.

    Args:
        manifest_path: Path to the manifest file; may be None.

    Returns:
        The parsed manifest. An empty dict is returned when the file holds an
        empty/falsy JSON value (e.g. ``null`` or ``{}``). None is returned when
        no path is given, the file does not exist, it cannot be read or parsed,
        or its top-level JSON value is not an object.
    """
    if not manifest_path or not manifest_path.exists():
        return None
    try:
        with open(manifest_path, "r", encoding="utf-8") as f:
            data = json.load(f) or {}
    except Exception as e:
        # Deliberately broad: the manifest is optional metadata, so a broken
        # file must never prevent the caller from proceeding.
        logger.warning("Failed to read manifest.json: %s", e)
        return None
    if not isinstance(data, dict):
        # Honour the declared return type: callers expect a mapping, so a
        # list/string/number at the top level is treated as an invalid manifest.
        logger.warning(
            "Ignoring manifest.json: expected a JSON object, got %s",
            type(data).__name__,
        )
        return None
    return data
def ensure_index(
    *,
    repo_id: str = DEFAULT_INDEX_REPO_ID,
    revision: Optional[str] = None,
    target_vector_db_path: Optional[str] = None,
    target_doc_store_path: Optional[str] = None,
    storage_dir: Optional[str] = None,
    force_download: bool = False,
) -> IndexPaths:
    """
    Ensure the index exists locally at the configured storage paths.

    Args:
        repo_id: Hugging Face dataset repo holding the prebuilt index; a blank
            value falls back to DEFAULT_INDEX_REPO_ID.
        revision: Repo revision to fetch; None/empty lets the hub pick its
            default.
        target_vector_db_path: Where the ChromaDB directory should live.
            Defaults to ``<storage dir>/chroma_db``.
        target_doc_store_path: Where the SQLite doc store should live.
            Defaults to ``<storage dir>/doc_store.db``.
        storage_dir: Base storage directory (created if missing). Defaults to
            resolve_default_storage_dir().
        force_download: When True, skip the "already present" shortcut and
            replace the local copy from the snapshot.

    Returns:
        Resolved IndexPaths. ``snapshot_dir`` is None when an existing local
        index was reused.

    Raises:
        FileNotFoundError: the snapshot does not contain the index artifacts.
        OSError: the local copy could not be removed or written.
        Errors raised by snapshot_download are propagated unchanged.
    """
    # Resolve target paths
    base_dir = Path(storage_dir) if storage_dir else resolve_default_storage_dir()
    base_dir.mkdir(parents=True, exist_ok=True)
    vector_db_path = Path(target_vector_db_path) if target_vector_db_path else (base_dir / "chroma_db")
    doc_store_path = Path(target_doc_store_path) if target_doc_store_path else (base_dir / "doc_store.db")
    # The manifest always travels with the doc store.
    manifest_target = doc_store_path.parent / "manifest.json"

    # Fast path: already present
    if not force_download and vector_db_path.is_dir() and doc_store_path.is_file():
        logger.info("Index already present: vector_db=%s doc_store=%s", vector_db_path, doc_store_path)
        # Look where the copy step below writes the manifest first; base_dir is
        # kept as a fallback for layouts that stored it there.
        manifest_path = next(
            (candidate for candidate in (manifest_target, base_dir / "manifest.json") if candidate.exists()),
            None,
        )
        return IndexPaths(
            vector_db_path=vector_db_path,
            doc_store_path=doc_store_path,
            manifest_path=manifest_path,
            snapshot_dir=None,
        )

    # Download snapshot
    repo_id = (repo_id or "").strip() or DEFAULT_INDEX_REPO_ID
    logger.info(
        "Downloading index snapshot from HF dataset repo: %s (revision=%s)",
        repo_id,
        revision or "main",
    )
    snapshot_dir = Path(
        snapshot_download(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision or None,
            local_files_only=False,
        )
    )
    src_chroma_dir, src_doc_db, src_manifest = _find_index_artifacts(snapshot_dir)
    logger.info("Found index artifacts in snapshot: chroma=%s doc_store=%s", src_chroma_dir, src_doc_db)

    # Copy to writable target locations.
    # The doc store doubles as the "index complete" marker for the fast path:
    # remove any old one before touching the vector store and write the new one
    # last, so an interrupted refresh is detected and redone on the next start.
    doc_store_path.unlink(missing_ok=True)

    if vector_db_path.is_symlink() or vector_db_path.is_file():
        # rmtree cannot remove a plain file or a symlink.
        vector_db_path.unlink()
    elif vector_db_path.exists():
        # Let removal errors surface here instead of as a confusing
        # FileExistsError from copytree below.
        shutil.rmtree(vector_db_path)
    vector_db_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copytree(src_chroma_dir, vector_db_path, dirs_exist_ok=False)

    doc_store_path.parent.mkdir(parents=True, exist_ok=True)
    # Stage next to the destination, then rename, so a truncated doc store is
    # never visible under its final name.
    staged_doc_store = doc_store_path.with_name(doc_store_path.name + ".tmp")
    shutil.copy2(src_doc_db, staged_doc_store)
    staged_doc_store.replace(doc_store_path)

    manifest_path: Optional[Path] = None
    if src_manifest and src_manifest.exists():
        manifest_path = manifest_target
        try:
            shutil.copy2(src_manifest, manifest_path)
        except Exception as e:
            # The manifest is optional metadata; the index is usable without it.
            logger.warning("Failed to copy manifest.json: %s", e)
            manifest_path = None

    logger.info("Index ready: vector_db=%s doc_store=%s", vector_db_path, doc_store_path)
    return IndexPaths(
        vector_db_path=vector_db_path,
        doc_store_path=doc_store_path,
        manifest_path=manifest_path,
        snapshot_dir=snapshot_dir,
    )