| import os |
| import glob |
| import pypdf |
| from huggingface_hub import snapshot_download |
| import zipfile |
|
|
def load_config(config_path: str = "configs.yaml"):
    """Load the project configuration from a YAML file.

    Args:
        config_path: Path of the YAML file to read. Defaults to
            ``"configs.yaml"`` (relative to the current working directory),
            which is the file a zero-argument call has always read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Function-scope import: PyYAML is only needed once a config is loaded.
    import yaml

    # Explicit UTF-8: without it open() falls back to the locale encoding,
    # which mis-decodes non-ASCII values wherever that encoding is not UTF-8.
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)
|
|
def load_openai_keys():
    """Make sure ``OPENAI_API_KEY`` is present in the process environment.

    If the variable is not already set, a ``.env`` file is loaded through
    python-dotenv. The outcome is reported on stdout; nothing is returned.
    """
    # The guard must test the same variable the message talks about; the
    # previous spelling ("OPENAI_API_KEYS") never matched it.
    if not os.getenv("OPENAI_API_KEY"):
        # Function-scope import: python-dotenv is only needed when the key
        # is not already exported.
        from dotenv import load_dotenv

        load_dotenv()

    # Only claim success when the key is really there after the .env lookup.
    if os.getenv("OPENAI_API_KEY"):
        print("OPENAI_API_KEY has been set!")
    else:
        print("OPENAI_API_KEY is not set; export it or add it to a .env file.")
|
|
def prepare_chroma_dir() -> str:
    """Download the Chroma snapshot from the Hugging Face Hub and unpack it.

    Reads ``PERSISTENT_DIR``, ``DB_NAME`` and ``CHROMA_DIRNAME`` from the
    ``VectorDB`` section of the project config. The dataset repo named by the
    ``HF_RAG_DATASET_REPO`` environment variable (or the built-in default) is
    downloaded into ``PERSISTENT_DIR`` using the token in
    ``READ_HF_DATASET_TOKEN``, and ``<DB_NAME>.zip`` is extracted there when
    the snapshot contains it.

    Returns:
        ``<PERSISTENT_DIR>/<CHROMA_DIRNAME>``.
    """
    db_cfg = load_config()["VectorDB"]

    target_dir = db_cfg["PERSISTENT_DIR"]
    os.makedirs(target_dir, exist_ok=True)

    repo_id = os.getenv(
        "HF_RAG_DATASET_REPO",
        "GenAI-Project-Team/chroma-sport-mental-thoughness",
    )
    read_dataset_token = os.getenv("READ_HF_DATASET_TOKEN")

    snap_dir = snapshot_download(
        repo_id=repo_id,
        repo_type="dataset",
        token=read_dataset_token,
        local_dir=target_dir,
        local_dir_use_symlinks=False,
    )

    zip_path = os.path.join(snap_dir, f"{db_cfg['DB_NAME']}.zip")
    if os.path.isfile(zip_path):
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extractall(target_dir)

    # Bound unconditionally so the print/return below cannot hit an unbound
    # local when the snapshot ships without the zip archive.
    persist_dir = os.path.join(target_dir, db_cfg["CHROMA_DIRNAME"])

    print("Chosen persist_dir:", persist_dir)
    return persist_dir
|
|
def debug_list_collections(persist_dir: str):
    """Print the name and document count of every collection in a Chroma store.

    Args:
        persist_dir: Directory handed to ``chromadb.PersistentClient`` as
            ``path``.
    """
    from chromadb import PersistentClient

    client = PersistentClient(path=persist_dir)
    listed = client.list_collections()

    # Depending on the chromadb release, list_collections() yields either
    # Collection objects or bare collection names (str); normalise to names.
    names = [item if isinstance(item, str) else item.name for item in listed]
    print("Collections found:", names)

    for item, name in zip(listed, names):
        try:
            # A bare name has to be resolved to a Collection before counting.
            collection = client.get_collection(name) if isinstance(item, str) else item
            print(f"- {name}: {collection.count()} docs")
        except Exception as e:  # best-effort debug output: keep listing the rest
            print(f"- {name}: count unavailable ({e})")
|
|
def print_tree(start_path=".", prefix=""):
    """Recursively print a tree of the files and folders under start_path.

    Args:
        start_path: Directory whose content is listed.
        prefix: Text printed before each entry's connector; the recursion
            uses it to carry the indentation of the parent levels.

    Raises:
        OSError: If ``os.listdir`` fails on a directory.
    """
    # Box-drawing glyphs are written as escapes so that a re-encoding of this
    # file cannot corrupt them.
    tee = "\u251c\u2500\u2500 "    # connector for an entry with siblings after it
    elbow = "\u2514\u2500\u2500 "  # connector for the last entry of a directory
    pipe = "\u2502   "             # continues the parent's vertical line
    blank = "    "                 # parent was the last entry: nothing to continue

    entries = sorted(os.listdir(start_path))
    last_index = len(entries) - 1
    for i, entry in enumerate(entries):
        is_last = i == last_index
        print(prefix + (elbow if is_last else tee) + entry)

        path = os.path.join(start_path, entry)
        if os.path.isdir(path):
            print_tree(path, prefix + (blank if is_last else pipe))