from __future__ import annotations import os import sys from pathlib import Path from dotenv import load_dotenv from huggingface_hub import hf_hub_download PROJECT_ROOT = Path(__file__).resolve().parents[1] ADK_SRC = PROJECT_ROOT / "adk-python" / "src" LOCAL_DATASET_DIR = PROJECT_ROOT / "data" / "processed" RUNTIME_DATASET_DIR = PROJECT_ROOT / "data" / "_runtime_processed" def _dataset_repo_id() -> str: return os.getenv("MEGUMIN_HF_DATASET_REPO_ID", "Junhoee/megumin-chat") def _dataset_filename() -> str: return os.getenv("MEGUMIN_HF_DATASET_FILENAME", "megumin_qa_dataset.json") def _index_filename() -> str: return os.getenv("MEGUMIN_FAISS_INDEX_FILENAME", "megumin_questions.faiss") def _qa_index_filename() -> str: return os.getenv("MEGUMIN_FAISS_QA_INDEX_FILENAME", "megumin_question_answer.faiss") def _metadata_filename() -> str: return os.getenv("MEGUMIN_FAISS_METADATA_FILENAME", "megumin_questions_meta.json") def _fact_dataset_filename() -> str: return os.getenv("MEGUMIN_HF_FACT_DATASET_FILENAME", "namuwiki_qa.json") def _fact_index_filename() -> str: return os.getenv("MEGUMIN_HF_FACT_INDEX_FILENAME", "namuwiki_questions.faiss") def _fact_qa_index_filename() -> str: return os.getenv("MEGUMIN_HF_FACT_QA_INDEX_FILENAME", "namuwiki_question_answer.faiss") def _fact_metadata_filename() -> str: return os.getenv("MEGUMIN_HF_FACT_METADATA_FILENAME", "namuwiki_questions_meta.json") def bootstrap_environment() -> None: load_dotenv(PROJECT_ROOT / ".env", override=True) if ADK_SRC.exists(): adk_src = str(ADK_SRC) if adk_src not in sys.path: sys.path.insert(0, adk_src) def resolve_dataset_dir() -> Path: RUNTIME_DATASET_DIR.mkdir(parents=True, exist_ok=True) try: hf_token = os.getenv("HF_TOKEN") or None repo_id = _dataset_repo_id() artifact_names = ( _dataset_filename(), _index_filename(), _qa_index_filename(), _metadata_filename(), _fact_dataset_filename(), _fact_index_filename(), _fact_qa_index_filename(), _fact_metadata_filename(), ) for artifact_name in artifact_names: try: hf_hub_download( repo_id=repo_id, repo_type="dataset", filename=artifact_name, token=hf_token, local_dir=str(RUNTIME_DATASET_DIR), ) except Exception: if artifact_name not in {_dataset_filename(), _fact_dataset_filename()}: continue raise return RUNTIME_DATASET_DIR except Exception: if LOCAL_DATASET_DIR.exists() and any(LOCAL_DATASET_DIR.glob("*.json")): return LOCAL_DATASET_DIR raise