Spaces:
Sleeping
Sleeping
| """ | |
| RAG Ingestion Script | |
| Reads all markdown files from data/RAG, embeds them using Google's | |
| gemini-embedding-001 model (see EMBEDDING_MODEL), and stores them in a local ChromaDB collection. | |
| Run this script once (and re-run whenever the RAG data changes): | |
| python ingest.py | |
| """ | |
| import os | |
| import glob | |
| import time | |
| import chromadb | |
| from google import genai | |
| from google.genai import types | |
| from dotenv import load_dotenv | |
# Pull variables from a local .env file into the process environment.
load_dotenv()

# -- Config -------------------------------------------------------------------
# All paths are anchored to the directory containing this script.
_HERE = os.path.dirname(__file__)
RAG_DATA_DIR = os.path.join(_HERE, "data", "RAG")
CHROMA_PATH = os.path.join(_HERE, "chroma_db")
COLLECTION_NAME = "rag_knowledge"
EMBEDDING_MODEL = "models/gemini-embedding-001"

# -- Google Generative AI (new SDK) -------------------------------------------
# Fail fast when the key is missing or empty, before any client is built.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise EnvironmentError("GOOGLE_API_KEY is not set in your .env file.")

client = genai.Client(api_key=GOOGLE_API_KEY)
# -- ChromaDB (persistent local storage) --------------------------------------
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)

# Every run starts from a fresh collection, so drop any previous one first.
try:
    chroma_client.delete_collection(COLLECTION_NAME)
except Exception as exc:
    # Best effort: the usual cause is that the collection does not exist yet
    # (first run). The failure is reported instead of silently discarded so
    # that, if create_collection below then fails, the root cause is visible.
    print(
        f"No existing collection '{COLLECTION_NAME}' was deleted "
        f"({type(exc).__name__}: {exc})"
    )
else:
    print(f"ποΈ Deleted existing collection '{COLLECTION_NAME}'")

collection = chroma_client.create_collection(COLLECTION_NAME)
print(f"β Created collection '{COLLECTION_NAME}'")
def embed_text(text: str, retries=5) -> list[float]:
    """Embed a single text with the model named by EMBEDDING_MODEL.

    Calls ``client.models.embed_content`` with the ``RETRIEVAL_DOCUMENT``
    task type. A failed attempt is retried after an exponentially growing
    pause (1s, 2s, 4s, ...) until ``retries`` attempts have been made.

    Args:
        text: The text to embed.
        retries: Maximum number of attempts; must be at least 1.

    Returns:
        The values of the first embedding in the API response.

    Raises:
        ValueError: If ``retries`` is less than 1.
        Exception: Whatever the final failed attempt raised, re-raised as-is.
    """
    if retries < 1:
        # A non-positive count would skip the loop entirely and fall through
        # to an implicit ``None``, violating the declared return type.
        raise ValueError(f"retries must be at least 1, got {retries}")

    for attempt in range(retries):
        try:
            response = client.models.embed_content(
                model=EMBEDDING_MODEL,
                contents=text,
                config=types.EmbedContentConfig(task_type="RETRIEVAL_DOCUMENT"),
            )
            time.sleep(1)  # Delay to prevent hitting rate limits
            return response.embeddings[0].values
        except Exception as e:
            if attempt == retries - 1:
                # Out of attempts: a bare raise keeps the original traceback.
                raise
            wait_time = 2 ** attempt
            print(f" [!] API Error: {e}. Retrying in {wait_time}s...")
            time.sleep(wait_time)
def load_markdown_files(data_dir: "str | None" = None) -> list[dict]:
    """Recursively find and read all .md files under a data directory.

    Args:
        data_dir: Directory to scan. Defaults to ``RAG_DATA_DIR`` when omitted.

    Returns:
        One dict per non-empty markdown file, ordered by path, with keys:
        ``id`` (path relative to the data directory, using ``/`` separators),
        ``content`` (file text with surrounding whitespace stripped) and
        ``metadata`` (``source``, ``category`` and ``filename``). ``category``
        is the first directory component of the relative path, or
        ``"general"`` for files directly inside the data directory.
    """
    root = RAG_DATA_DIR if data_dir is None else data_dir
    pattern = os.path.join(root, "**", "*.md")

    documents = []
    # glob returns paths in filesystem order; sort so runs are reproducible.
    for path in sorted(glob.glob(pattern, recursive=True)):
        if not os.path.isfile(path):
            # A directory whose name ends in ".md" also matches the pattern.
            continue

        with open(path, "r", encoding="utf-8") as f:
            content = f.read().strip()
        if not content:
            # A blank document has nothing to embed or retrieve.
            print(f" [!] Skipping empty file: {path}")
            continue

        rel_path = os.path.relpath(path, root)
        source = rel_path.replace("\\", "/")
        parts = rel_path.split(os.sep)
        category = parts[0] if len(parts) > 1 else "general"
        documents.append({
            "id": source,
            "content": content,
            "metadata": {
                "source": source,
                "category": category,
                "filename": os.path.basename(path),
            },
        })
    return documents
def ingest():
    """Embed every loaded markdown document and store them in ChromaDB.

    Loads the documents with ``load_markdown_files``, embeds each one with
    ``embed_text`` (printing progress as it goes), then writes all ids,
    embeddings, texts and metadata to ``collection`` in a single ``add``
    call. When no documents are found, prints a notice and returns without
    touching the collection.
    """
    docs = load_markdown_files()
    if not docs:
        # chromadb rejects an add() with empty lists, so stop with a clear
        # message instead of failing on an opaque validation error.
        print(f"No markdown files found under '{RAG_DATA_DIR}' - nothing to ingest.")
        return

    print(f"\nπ Found {len(docs)} markdown files β embeddingβ¦\n")
    ids, embeddings, contents, metas = [], [], [], []
    for i, doc in enumerate(docs, 1):
        print(f" [{i:2}/{len(docs)}] {doc['metadata']['source']}")
        embedding = embed_text(doc["content"])
        ids.append(doc["id"])
        embeddings.append(embedding)
        contents.append(doc["content"])
        metas.append(doc["metadata"])

    # Written once, after every embedding succeeded.
    collection.add(
        ids=ids,
        embeddings=embeddings,
        documents=contents,
        metadatas=metas,
    )
    print(f"\nπ Done! {len(docs)} documents indexed into ChromaDB at '{CHROMA_PATH}'")
# Script entry point: run the ingestion only when this file is executed
# directly (python ingest.py), not when it is imported.
if __name__ == "__main__":
    ingest()