import google.generativeai as genai
import faiss
import pickle
import json
import os
import numpy as np
from dotenv import load_dotenv

# Load env to get the API key
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    print("❌ Error: GEMINI_API_KEY not found in .env")
    exit(1)

genai.configure(api_key=GEMINI_API_KEY)

# --- Configuration ---
DATA_DIR = "data"
SUMMARIES_DIR = os.path.join(DATA_DIR, "summaries")
OUTPUT_DIR = os.path.join("backend", "vector_store")

# Gemini embedding model
EMBEDDING_MODEL = "models/text-embedding-004"

os.makedirs(OUTPUT_DIR, exist_ok=True)


def load_json(filename):
    """Load a JSON file from DATA_DIR; return [] if it doesn't exist."""
    path = os.path.join(DATA_DIR, filename)
    if not os.path.exists(path):
        return []
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_summary_text(filename):
    """Load a plain-text summary from SUMMARIES_DIR; return "" if missing."""
    path = os.path.join(SUMMARIES_DIR, filename)
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    return ""


def get_embedding(text):
    """Embed a single document chunk via the Gemini API."""
    result = genai.embed_content(
        model=EMBEDDING_MODEL,
        content=text,
        task_type="retrieval_document"
    )
    return result['embedding']
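
# main() below embeds one chunk per API call. For larger corpora a batched
# variant cuts round trips. This is a minimal sketch, assuming
# genai.embed_content accepts a list of strings and returns one vector per
# input under 'embedding'; the helper is illustrative and not called by main().
def get_embeddings_batch(texts, batch_size=100):
    """Embed a list of chunks in batches (sketch, unused by default)."""
    vectors = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        result = genai.embed_content(
            model=EMBEDDING_MODEL,
            content=batch,  # list input -> batch request
            task_type="retrieval_document"
        )
        vectors.extend(result['embedding'])  # one embedding per input text
    return vectors
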
def main():
    print("🔄 Creating Cloud-Based Vector Index...")

    chunks = []
    metadata = []

    def add_chunk(text, source):
        # Skip empty or trivially short fragments
        if text and len(text) > 5:
            chunks.append(text)
            metadata.append({"source": source})

    # --- Load Data (same logic as before) ---

    # 1. Profile
    profile = load_json("profile.json")
    if isinstance(profile, dict):
        contact = profile.get("contact", {})
        c_text = (
            f"Contact Details: Name: {contact.get('name')}. "
            f"Email: {contact.get('email')}. LinkedIn: {contact.get('linkedin')}."
        )
        add_chunk(c_text, "profile_contact")
        if profile.get("summary"):
            add_chunk(f"Professional Summary: {profile.get('summary')}",
                      "profile_summary")

    # 2. Experience
    experience = load_json("experience.json")
    for exp in experience:
        text = (f"Experience: {exp.get('role')} at {exp.get('company')} "
                f"({exp.get('duration')}). {exp.get('description')}")
        add_chunk(text, "experience_entry")

    # 3. Education
    education = load_json("education.json")
    for edu in education:
        text = (f"Education: {edu.get('degree')} from {edu.get('institution')}. "
                f"{edu.get('details')}")
        add_chunk(text, "education_entry")

    # 4. Skills
    skills = load_json("skills.json")
    for s in skills:
        text = f"Skills in {s.get('category')}: {', '.join(s.get('list', []))}"
        add_chunk(text, "skills_list")

    # 5. Summaries
    summary_files = {
        "about_summary.txt": "profile_about_me",
        "projects_summary.txt": "ui_trigger_projects",
        "articles_summary.txt": "ui_trigger_articles",
        "videos_summary.txt": "ui_trigger_videos",
        "research_summary.txt": "ui_trigger_research",
        "skills_summary.txt": "ui_trigger_skills",
        "certifications_summary.txt": "ui_trigger_certifications",
    }
    for filename, tag in summary_files.items():
        text = load_summary_text(filename)
        if text:
            add_chunk(text, tag)

    # --- Generate Embeddings ---
    if not chunks:
        print("❌ Error: No chunks created.")
        return

    print(f"🧠 Encoding {len(chunks)} chunks via Gemini API...")

    # Batch processing is better, but a simple loop works for small portfolios
    # (see the batched sketch above)
    embeddings = []
    for i, chunk in enumerate(chunks):
        if i % 5 == 0:
            print(f"   Processing chunk {i}/{len(chunks)}...")
        embeddings.append(get_embedding(chunk))

    embeddings_np = np.array(embeddings, dtype="float32")

    # Create a flat L2 FAISS index sized to the embedding dimension
    index = faiss.IndexFlatL2(embeddings_np.shape[1])
    index.add(embeddings_np)

    # Persist the index and the chunk texts/metadata side by side
    faiss.write_index(index, os.path.join(OUTPUT_DIR, "faiss_index.bin"))
    with open(os.path.join(OUTPUT_DIR, "chunks_metadata.pkl"), "wb") as f:
        pickle.dump({"chunks": chunks, "metadata": metadata}, f)

    print(f"🎉 Cloud Indexing Complete! Saved to {OUTPUT_DIR}")


if __name__ == "__main__":
    main()
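
# Query-side sketch (belongs in the retrieval script, shown here for context):
# embed the user question with task_type="retrieval_query" and search the
# files written above. The variable names and k value are illustrative
# assumptions, not part of this script.
#
#   index = faiss.read_index(os.path.join(OUTPUT_DIR, "faiss_index.bin"))
#   with open(os.path.join(OUTPUT_DIR, "chunks_metadata.pkl"), "rb") as f:
#       store = pickle.load(f)
#   q = genai.embed_content(model=EMBEDDING_MODEL, content=question,
#                           task_type="retrieval_query")['embedding']
#   _, ids = index.search(np.array([q], dtype="float32"), k=3)
#   top_chunks = [store["chunks"][i] for i in ids[0]]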