import google.generativeai as genai
import faiss
import pickle
import json
import os
import numpy as np
from dotenv import load_dotenv

# Load .env to get the API key
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    print("❌ Error: GEMINI_API_KEY not found in .env")
    exit(1)

genai.configure(api_key=GEMINI_API_KEY)
# --- Configuration ---
DATA_DIR = "data"
SUMMARIES_DIR = os.path.join(DATA_DIR, "summaries")
OUTPUT_DIR = os.path.join("backend", "vector_store")

# Google's latest embedding model
EMBEDDING_MODEL = "models/text-embedding-004"

os.makedirs(OUTPUT_DIR, exist_ok=True)
def load_json(filename):
    """Load a JSON file from DATA_DIR, returning [] if it does not exist."""
    path = os.path.join(DATA_DIR, filename)
    if not os.path.exists(path):
        return []
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_summary_text(filename):
    """Read a plain-text summary from SUMMARIES_DIR, returning "" if missing."""
    path = os.path.join(SUMMARIES_DIR, filename)
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    return ""
def get_embedding(text):
    """Wraps Gemini API to get embeddings"""
    result = genai.embed_content(
        model=EMBEDDING_MODEL,
        content=text,
        task_type="retrieval_document"
    )
    return result['embedding']
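

# A minimal sketch of batched embedding, assuming genai.embed_content accepts a
# list of strings and returns a parallel list of vectors under result['embedding']
# (supported in recent google-generativeai releases; verify against your installed
# version). get_embeddings_batch is a hypothetical helper, not called by main().
def get_embeddings_batch(texts, batch_size=100):
    """Embed chunks in batches to reduce round-trips to the Gemini API."""
    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        result = genai.embed_content(
            model=EMBEDDING_MODEL,
            content=batch,
            task_type="retrieval_document"
        )
        all_embeddings.extend(result['embedding'])
    return all_embeddings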
def main():
    print("🚀 Creating Cloud-Based Vector Index...")
    chunks = []
    metadata = []

    def add_chunk(text, source):
        if text and len(text) > 5:
            chunks.append(text)
            metadata.append({"source": source})

    # --- Load Data (same logic as before) ---
    # 1. Profile
    profile = load_json("profile.json")
    if isinstance(profile, dict):
        contact = profile.get("contact", {})
        c_text = f"Contact Details: Name: {contact.get('name')}. Email: {contact.get('email')}. LinkedIn: {contact.get('linkedin')}."
        add_chunk(c_text, "profile_contact")
        if profile.get("summary"):
            add_chunk(f"Professional Summary: {profile.get('summary')}", "profile_summary")

    # 2. Experience
    experience = load_json("experience.json")
    for exp in experience:
        text = f"Experience: {exp.get('role')} at {exp.get('company')} ({exp.get('duration')}). {exp.get('description')}"
        add_chunk(text, "experience_entry")

    # 3. Education
    education = load_json("education.json")
    for edu in education:
        text = f"Education: {edu.get('degree')} from {edu.get('institution')}. {edu.get('details')}"
        add_chunk(text, "education_entry")

    # 4. Skills
    skills = load_json("skills.json")
    for s in skills:
        text = f"Skills in {s.get('category')}: {', '.join(s.get('list', []))}"
        add_chunk(text, "skills_list")

    # 5. Summaries
    summary_files = {
        "about_summary.txt": "profile_about_me",
        "projects_summary.txt": "ui_trigger_projects",
        "articles_summary.txt": "ui_trigger_articles",
        "videos_summary.txt": "ui_trigger_videos",
        "research_summary.txt": "ui_trigger_research",
        "skills_summary.txt": "ui_trigger_skills",
        "certifications_summary.txt": "ui_trigger_certifications"
    }
    for filename, tag in summary_files.items():
        text = load_summary_text(filename)
        if text:
            add_chunk(text, tag)

    # --- Generate Embeddings ---
    if not chunks:
        print("❌ Error: No chunks created.")
        return

    print(f"🧠 Encoding {len(chunks)} chunks via Gemini API...")
    # Batch processing is better (see get_embeddings_batch above for a sketch),
    # but a simple loop works for small portfolios.
    embeddings = []
    for i, chunk in enumerate(chunks):
        if i % 5 == 0:
            print(f"   Processing chunk {i}/{len(chunks)}...")
        embeddings.append(get_embedding(chunk))

    embeddings_np = np.array(embeddings).astype("float32")

    # Create the FAISS index (exact L2 search over all vectors)
    index = faiss.IndexFlatL2(embeddings_np.shape[1])
    index.add(embeddings_np)
    faiss.write_index(index, os.path.join(OUTPUT_DIR, "faiss_index.bin"))

    # Persist the raw chunks and their source tags alongside the index
    with open(os.path.join(OUTPUT_DIR, "chunks_metadata.pkl"), "wb") as f:
        pickle.dump({"chunks": chunks, "metadata": metadata}, f)

    print(f"✅ Cloud Indexing Complete! Saved to {OUTPUT_DIR}")
if __name__ == "__main__":
    main()