# NOTE(review): scraped Hugging Face page header ("Spaces / Running on Zero")
# left over from extraction — not part of the module source.
| """ | |
| Index bootstrap utilities for Hugging Face Spaces. | |
| This Space relies on a prebuilt index stored on Hugging Face Datasets: | |
| - ChromaDB persist directory (vector store) | |
| - SQLite doc store (parent documents) | |
| At startup we download (once) and place the index into a writable storage dir | |
| (prefer /data on Spaces when persistent storage is enabled). | |
| """ | |
from __future__ import annotations

import json
import logging
import os
import shutil
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

from huggingface_hub import snapshot_download

logger = logging.getLogger(__name__)

DEFAULT_INDEX_REPO_ID = "ZhangNy/radiology-index-qwen3-embedding-0.6b"
@dataclass
class IndexPaths:
    """Resolved filesystem locations of the index artifacts.

    Attributes:
        vector_db_path: ChromaDB persist directory (vector store).
        doc_store_path: SQLite doc-store file (parent documents).
        manifest_path: manifest.json describing the index build, if present.
        snapshot_dir: HF snapshot directory the artifacts came from, or None
            when existing local artifacts were reused.
    """
    # BUG FIX: the class is constructed with keyword arguments
    # (IndexPaths(vector_db_path=..., ...)) but had no @dataclass decorator,
    # so instantiation raised TypeError. `dataclass` was already imported.
    vector_db_path: Path
    doc_store_path: Path
    manifest_path: Optional[Path]
    snapshot_dir: Optional[Path]
def resolve_default_storage_dir() -> Path:
    """
    Pick a sensible default storage directory for Spaces.

    Resolution order:
    1. $RAG_STORAGE_DIR (explicit user override)
    2. /data/radiology_rag (Spaces persistent storage, when mounted)
    3. ./storage (local fallback)
    """
    override = (os.getenv("RAG_STORAGE_DIR") or "").strip()
    if override:
        return Path(override)
    persistent_root = Path("/data")
    if persistent_root.exists():
        return persistent_root / "radiology_rag"
    return Path("./storage")
| def _find_index_artifacts(snapshot_dir: Path) -> Tuple[Path, Path, Optional[Path]]: | |
| """ | |
| Find (chroma_db_dir, doc_store_db, manifest_json) inside a HF snapshot. | |
| We support either: | |
| - chroma_db/, doc_store.db, manifest.json | |
| - storage/chroma_db/, storage/doc_store.db, storage/manifest.json | |
| """ | |
| candidates = [ | |
| (snapshot_dir / "chroma_db", snapshot_dir / "doc_store.db", snapshot_dir / "manifest.json"), | |
| (snapshot_dir / "storage" / "chroma_db", snapshot_dir / "storage" / "doc_store.db", snapshot_dir / "storage" / "manifest.json"), | |
| ] | |
| for chroma_dir, doc_db, manifest in candidates: | |
| if chroma_dir.exists() and chroma_dir.is_dir() and doc_db.exists() and doc_db.is_file(): | |
| return chroma_dir, doc_db, (manifest if manifest.exists() else None) | |
| raise FileNotFoundError( | |
| "Could not locate index artifacts inside snapshot. " | |
| "Expected either {chroma_db/, doc_store.db} or {storage/chroma_db/, storage/doc_store.db}." | |
| ) | |
def read_manifest(manifest_path: Optional[Path]) -> Optional[Dict[str, Any]]:
    """Load manifest.json as a dict; None when missing or unreadable.

    A falsy JSON document (e.g. ``null``) is normalized to an empty dict.
    """
    # Path objects are always truthy, so `not manifest_path` in the original
    # only ever caught None — checked explicitly here.
    if manifest_path is None or not manifest_path.exists():
        return None
    try:
        raw = manifest_path.read_text(encoding="utf-8")
        return json.loads(raw) or {}
    except Exception as e:
        logger.warning(f"Failed to read manifest.json: {e}")
        return None
def ensure_index(
    *,
    repo_id: str = DEFAULT_INDEX_REPO_ID,
    revision: Optional[str] = None,
    target_vector_db_path: Optional[str] = None,
    target_doc_store_path: Optional[str] = None,
    storage_dir: Optional[str] = None,
    force_download: bool = False,
) -> IndexPaths:
    """
    Ensure the index exists locally at the configured storage paths.

    When both artifacts are already present (and ``force_download`` is False),
    the existing files are reused without touching the network. Otherwise the
    HF dataset snapshot is downloaded and its artifacts are copied into
    writable storage.

    Args:
        repo_id: HF dataset repo holding the index; blank falls back to
            DEFAULT_INDEX_REPO_ID.
        revision: Optional git revision of the dataset repo (default: main).
        target_vector_db_path: Explicit ChromaDB dir; default <storage>/chroma_db.
        target_doc_store_path: Explicit SQLite file; default <storage>/doc_store.db.
        storage_dir: Base dir for the defaults; resolved via
            resolve_default_storage_dir() when omitted.
        force_download: Re-download and overwrite even if artifacts exist.

    Returns:
        Resolved IndexPaths; raises on unrecoverable errors (download failure,
        snapshot missing the expected artifacts).
    """
    # Resolve target paths.
    base_dir = Path(storage_dir) if storage_dir else resolve_default_storage_dir()
    base_dir.mkdir(parents=True, exist_ok=True)
    vector_db_path = Path(target_vector_db_path) if target_vector_db_path else (base_dir / "chroma_db")
    doc_store_path = Path(target_doc_store_path) if target_doc_store_path else (base_dir / "doc_store.db")
    # Fast path: both artifacts already present.
    if not force_download and vector_db_path.is_dir() and doc_store_path.is_file():
        logger.info(f"Index already present: vector_db={vector_db_path} doc_store={doc_store_path}")
        # FIX: look for the manifest next to the doc store — that is where the
        # download path below writes it. The original probed base_dir, which
        # diverges whenever target_doc_store_path overrides the default.
        candidate = doc_store_path.parent / "manifest.json"
        manifest_path = candidate if candidate.exists() else None
        return IndexPaths(vector_db_path=vector_db_path, doc_store_path=doc_store_path, manifest_path=manifest_path, snapshot_dir=None)
    # Download snapshot.
    repo_id = (repo_id or "").strip() or DEFAULT_INDEX_REPO_ID
    logger.info(f"Downloading index snapshot from HF dataset repo: {repo_id} (revision={revision or 'main'})")
    snapshot_dir = Path(
        snapshot_download(
            repo_id=repo_id,
            repo_type="dataset",
            revision=revision or None,
            local_files_only=False,
        )
    )
    src_chroma_dir, src_doc_db, src_manifest = _find_index_artifacts(snapshot_dir)
    logger.info(f"Found index artifacts in snapshot: chroma={src_chroma_dir} doc_store={src_doc_db}")
    # Copy to writable target locations.
    if vector_db_path.exists():
        shutil.rmtree(vector_db_path, ignore_errors=True)
    vector_db_path.parent.mkdir(parents=True, exist_ok=True)
    # FIX: rmtree(ignore_errors=True) can leave residue behind (e.g. on a
    # permission error), and copytree(dirs_exist_ok=False) would then raise
    # FileExistsError. dirs_exist_ok=True overwrites in place instead.
    shutil.copytree(src_chroma_dir, vector_db_path, dirs_exist_ok=True)
    doc_store_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(src_doc_db, doc_store_path)
    manifest_path: Optional[Path] = None
    if src_manifest and src_manifest.exists():
        manifest_path = doc_store_path.parent / "manifest.json"
        try:
            shutil.copy2(src_manifest, manifest_path)
        except Exception as e:
            # Manifest is optional metadata; failure to copy is non-fatal.
            logger.warning(f"Failed to copy manifest.json: {e}")
            manifest_path = None
    logger.info(f"Index ready: vector_db={vector_db_path} doc_store={doc_store_path}")
    return IndexPaths(vector_db_path=vector_db_path, doc_store_path=doc_store_path, manifest_path=manifest_path, snapshot_dir=snapshot_dir)