# portfolio-rag-api / scripts / create_vector_db.py
import google.generativeai as genai
import faiss
import pickle
import json
import os
import numpy as np
from dotenv import load_dotenv
# Load environment variables to pick up the API key
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    print("❌ Error: GEMINI_API_KEY not found in .env")
    raise SystemExit(1)

genai.configure(api_key=GEMINI_API_KEY)
# --- Configuration ---
DATA_DIR = "data"
SUMMARIES_DIR = os.path.join(DATA_DIR, "summaries")
OUTPUT_DIR = os.path.join("backend", "vector_store")
# Gemini embedding model (text-embedding-004 produces 768-dimensional vectors)
EMBEDDING_MODEL = "models/text-embedding-004"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def load_json(filename):
    path = os.path.join(DATA_DIR, filename)
    if not os.path.exists(path):
        return []
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_summary_text(filename):
    path = os.path.join(SUMMARIES_DIR, filename)
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    return ""

def get_embedding(text):
    """Embed a single text chunk via the Gemini API (document-side task type)."""
    result = genai.embed_content(
        model=EMBEDDING_MODEL,
        content=text,
        task_type="retrieval_document"
    )
    return result['embedding']
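
# Note: "retrieval_document" tunes the vectors for the indexed side of
# retrieval; at query time the complementary task_type is "retrieval_query"
# (a hedged query-side sketch appears at the end of this file).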

def main():
    print("🔄 Creating Cloud-Based Vector Index...")
    chunks = []
    metadata = []

    def add_chunk(text, source):
        # Skip empty or trivially short snippets
        if text and len(text) > 5:
            chunks.append(text)
            metadata.append({"source": source})

    # --- Load Data (Same logic as before) ---
    # 1. Profile
    profile = load_json("profile.json")
    if isinstance(profile, dict):
        contact = profile.get("contact", {})
        c_text = (
            f"Contact Details: Name: {contact.get('name')}. "
            f"Email: {contact.get('email')}. LinkedIn: {contact.get('linkedin')}."
        )
        add_chunk(c_text, "profile_contact")
        if profile.get("summary"):
            add_chunk(
                f"Professional Summary: {profile.get('summary')}", "profile_summary")

    # 2. Experience
    experience = load_json("experience.json")
    for exp in experience:
        text = f"Experience: {exp.get('role')} at {exp.get('company')} ({exp.get('duration')}). {exp.get('description')}"
        add_chunk(text, "experience_entry")

    # 3. Education
    education = load_json("education.json")
    for edu in education:
        text = f"Education: {edu.get('degree')} from {edu.get('institution')}. {edu.get('details')}"
        add_chunk(text, "education_entry")

    # 4. Skills
    skills = load_json("skills.json")
    for s in skills:
        text = f"Skills in {s.get('category')}: {', '.join(s.get('list', []))}"
        add_chunk(text, "skills_list")

    # 5. Summaries
    summary_files = {
        "about_summary.txt": "profile_about_me",
        "projects_summary.txt": "ui_trigger_projects",
        "articles_summary.txt": "ui_trigger_articles",
        "videos_summary.txt": "ui_trigger_videos",
        "research_summary.txt": "ui_trigger_research",
        "skills_summary.txt": "ui_trigger_skills",
        "certifications_summary.txt": "ui_trigger_certifications"
    }
    for filename, tag in summary_files.items():
        text = load_summary_text(filename)
        if text:
            add_chunk(text, tag)

    # --- Generate Embeddings ---
    if not chunks:
        print("❌ Error: No chunks created.")
        return

    print(f"🧠 Encoding {len(chunks)} chunks via Gemini API...")
    # Batching would cut API round-trips, but a simple per-chunk loop is fine
    # for a small portfolio (a hedged batch sketch follows the loop).
    embeddings = []
    for i, chunk in enumerate(chunks):
        if i % 5 == 0:
            print(f"   Processing chunk {i}/{len(chunks)}...")
        embeddings.append(get_embedding(chunk))
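
    # Batch alternative, as a sketch only: assuming genai.embed_content also
    # accepts a list of strings and returns one vector per input under
    # result['embedding']. Untested here; shown for larger corpora:
    #
    #   result = genai.embed_content(
    #       model=EMBEDDING_MODEL,
    #       content=chunks,                  # list[str] instead of a single str
    #       task_type="retrieval_document",
    #   )
    #   embeddings = result['embedding']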

    embeddings_np = np.array(embeddings).astype("float32")

    # Create a FAISS exact (brute-force) L2 index sized to the embedding dimension
    index = faiss.IndexFlatL2(embeddings_np.shape[1])
    index.add(embeddings_np)
    faiss.write_index(index, os.path.join(OUTPUT_DIR, "faiss_index.bin"))

    # Persist the raw chunks and their source tags alongside the index
    with open(os.path.join(OUTPUT_DIR, "chunks_metadata.pkl"), "wb") as f:
        pickle.dump({"chunks": chunks, "metadata": metadata}, f)

    print(f"🎉 Cloud Indexing Complete! Saved to {OUTPUT_DIR}")

if __name__ == "__main__":
    main()
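
# --- Illustrative only: querying the saved index from the backend ---
# A minimal sketch, assuming the OUTPUT_DIR layout written above; the search()
# helper is hypothetical and not part of this script.
#
#   index = faiss.read_index(os.path.join(OUTPUT_DIR, "faiss_index.bin"))
#   with open(os.path.join(OUTPUT_DIR, "chunks_metadata.pkl"), "rb") as f:
#       store = pickle.load(f)
#
#   def search(query, k=3):
#       q = genai.embed_content(
#           model=EMBEDDING_MODEL,
#           content=query,
#           task_type="retrieval_query",  # query-side counterpart
#       )['embedding']
#       _, ids = index.search(np.array([q], dtype="float32"), k)
#       return [(store["chunks"][i], store["metadata"][i]) for i in ids[0]]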