Arviano's picture
fix: remove the default mounting from /data for vector database
edef54d verified
import os
import glob
import pypdf
from huggingface_hub import snapshot_download
import zipfile
def load_config():
    """Load the project settings from ``configs.yaml``.

    The path is relative, so the file is looked up in the current working
    directory of the process.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's contents.

    Raises:
        OSError: If ``configs.yaml`` cannot be opened (e.g. it is missing).
    """
    # Imported lazily so importing this module does not require PyYAML.
    import yaml

    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not UTF-8 on every system.
    with open("configs.yaml", "r", encoding="utf-8") as f:
        return yaml.safe_load(f)
def load_openai_keys():
    """Make sure ``OPENAI_API_KEYS`` is available in the process environment.

    If the variable is not already set (or is empty), python-dotenv's
    ``load_dotenv()`` is called as a fallback. The outcome is reported on
    stdout; nothing is returned and no exception is raised when the key is
    still missing.
    """
    if not os.getenv("OPENAI_API_KEYS"):
        # Imported lazily: python-dotenv is only needed for the fallback path.
        from dotenv import load_dotenv

        # After loading, the key is readable via os.getenv("OPENAI_API_KEYS").
        load_dotenv()

    # Re-check instead of assuming the fallback worked, so the message never
    # claims success while the variable is still absent.
    if os.getenv("OPENAI_API_KEYS"):
        print("OPENAI_API_KEY has been set!")
    else:
        print("Warning: OPENAI_API_KEYS is not set in the environment or .env file.")
def prepare_chroma_dir() -> str:
    """Download the Chroma snapshot and return the directory to open it from.

    Reads the ``VectorDB`` section of the config, downloads the dataset repo
    (``HF_RAG_DATASET_REPO`` or the built-in default) into ``PERSISTENT_DIR``,
    unpacks ``<DB_NAME>.zip`` into that directory when the snapshot contains
    it, and returns ``PERSISTENT_DIR/CHROMA_DIRNAME``.

    Returns:
        The path of the Chroma persistence directory.
    """
    vector_cfg = load_config()["VectorDB"]
    persistent_root = vector_cfg["PERSISTENT_DIR"]
    os.makedirs(persistent_root, exist_ok=True)

    # HF_HOME is deliberately left alone; it is no longer pointed at /data.
    dataset_repo = os.getenv(
        "HF_RAG_DATASET_REPO",
        "GenAI-Project-Team/chroma-sport-mental-thoughness",
    )
    snapshot_root = snapshot_download(
        repo_id=dataset_repo,
        repo_type="dataset",
        token=os.getenv("READ_HF_DATASET_TOKEN"),
        local_dir=persistent_root,
        local_dir_use_symlinks=False,
    )

    archive_name = "{}.zip".format(vector_cfg["DB_NAME"])
    archive_path = os.path.join(snapshot_root, archive_name)
    if os.path.isfile(archive_path):
        # Extract on every call rather than only when the target looks empty.
        with zipfile.ZipFile(archive_path, "r") as archive:
            archive.extractall(persistent_root)

    persist_dir = os.path.join(persistent_root, vector_cfg["CHROMA_DIRNAME"])
    print("Chosen persist_dir:", persist_dir)
    return persist_dir
def debug_list_collections(persist_dir: str):
    """Print the collections stored in the Chroma database at *persist_dir*.

    Prints the list of collection names first, then one line per collection
    with its document count. A failure while counting one collection is
    reported on that collection's line and does not stop the listing.
    """
    # Imported lazily so the module can be imported without chromadb.
    from chromadb import PersistentClient

    collections = PersistentClient(path=persist_dir).list_collections()
    names = [collection.name for collection in collections]
    print("Collections found:", names)

    for collection in collections:
        try:
            summary = f"- {collection.name}: {collection.count()} docs"
        except Exception as err:
            # Best-effort debug output: report the problem and keep going.
            summary = f"- {collection.name}: count unavailable ({err})"
        print(summary)
def print_tree(start_path=".", prefix=""):
    """Recursively print a tree of files/folders starting from start_path.

    Entries are listed in sorted order. Each directory's contents are printed
    beneath it, indented by one level.

    Args:
        start_path: Directory whose contents are printed.
        prefix: Indentation carried down from parent levels; callers normally
            leave this at its default.
    """
    entries = sorted(os.listdir(start_path))
    last_index = len(entries) - 1
    for i, entry in enumerate(entries):
        is_last = i == last_index
        path = os.path.join(start_path, entry)
        connector = "└── " if is_last else "├── "
        print(prefix + connector + entry)
        if os.path.isdir(path):
            # Below the last entry nothing continues, so pad with spaces;
            # otherwise keep the vertical rule running past the children.
            extension = "    " if is_last else "│   "
            print_tree(path, prefix + extension)