# portfolio-rag-api / scripts / create_vector_db.py
import google.generativeai as genai
import faiss
import pickle
import json
import os
import numpy as np
from dotenv import load_dotenv
# Load environment variables to pick up the API key
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    print("❌ Error: GEMINI_API_KEY not found in .env")
    raise SystemExit(1)

genai.configure(api_key=GEMINI_API_KEY)
# --- Configuration ---
DATA_DIR = "data"
SUMMARIES_DIR = os.path.join(DATA_DIR, "summaries")
OUTPUT_DIR = os.path.join("backend", "vector_store")
# Gemini embedding model (text-embedding-004 produces 768-dimensional vectors)
EMBEDDING_MODEL = "models/text-embedding-004"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def load_json(filename):
    path = os.path.join(DATA_DIR, filename)
    if not os.path.exists(path):
        return []
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_summary_text(filename):
    path = os.path.join(SUMMARIES_DIR, filename)
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    return ""

def get_embedding(text):
    """Embed a single text chunk via the Gemini API (document-side task type)."""
    result = genai.embed_content(
        model=EMBEDDING_MODEL,
        content=text,
        task_type="retrieval_document"
    )
    return result['embedding']
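
# Note: "retrieval_document" tunes the vectors for the indexed side of
# retrieval; at query time the complementary task_type is "retrieval_query"
# (a hedged query-side sketch appears at the end of this file).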

def main():
    print("🔄 Creating Cloud-Based Vector Index...")
    chunks = []
    metadata = []

    def add_chunk(text, source):
        # Skip empty or trivially short snippets
        if text and len(text) > 5:
            chunks.append(text)
            metadata.append({"source": source})

    # --- Load Data (Same logic as before) ---
    # 1. Profile
    profile = load_json("profile.json")
    if isinstance(profile, dict):
        contact = profile.get("contact", {})
        c_text = (
            f"Contact Details: Name: {contact.get('name')}. "
            f"Email: {contact.get('email')}. LinkedIn: {contact.get('linkedin')}."
        )
        add_chunk(c_text, "profile_contact")
        if profile.get("summary"):
            add_chunk(
                f"Professional Summary: {profile.get('summary')}", "profile_summary")

    # 2. Experience
    experience = load_json("experience.json")
    for exp in experience:
        text = f"Experience: {exp.get('role')} at {exp.get('company')} ({exp.get('duration')}). {exp.get('description')}"
        add_chunk(text, "experience_entry")

    # 3. Education
    education = load_json("education.json")
    for edu in education:
        text = f"Education: {edu.get('degree')} from {edu.get('institution')}. {edu.get('details')}"
        add_chunk(text, "education_entry")

    # 4. Skills
    skills = load_json("skills.json")
    for s in skills:
        text = f"Skills in {s.get('category')}: {', '.join(s.get('list', []))}"
        add_chunk(text, "skills_list")

    # 5. Summaries
    summary_files = {
        "about_summary.txt": "profile_about_me",
        "projects_summary.txt": "ui_trigger_projects",
        "articles_summary.txt": "ui_trigger_articles",
        "videos_summary.txt": "ui_trigger_videos",
        "research_summary.txt": "ui_trigger_research",
        "skills_summary.txt": "ui_trigger_skills",
        "certifications_summary.txt": "ui_trigger_certifications"
    }
    for filename, tag in summary_files.items():
        text = load_summary_text(filename)
        if text:
            add_chunk(text, tag)

    # --- Generate Embeddings ---
    if not chunks:
        print("❌ Error: No chunks created.")
        return

    print(f"🧠 Encoding {len(chunks)} chunks via Gemini API...")
    # Batching would cut API round-trips, but a simple per-chunk loop is fine
    # for a small portfolio (a hedged batch sketch follows the loop).
    embeddings = []
    for i, chunk in enumerate(chunks):
        if i % 5 == 0:
            print(f"   Processing chunk {i}/{len(chunks)}...")
        embeddings.append(get_embedding(chunk))
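
    # Batch alternative, as a sketch only: assuming genai.embed_content also
    # accepts a list of strings and returns one vector per input under
    # result['embedding']. Untested here; shown for larger corpora:
    #
    #   result = genai.embed_content(
    #       model=EMBEDDING_MODEL,
    #       content=chunks,                  # list[str] instead of a single str
    #       task_type="retrieval_document",
    #   )
    #   embeddings = result['embedding']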

    embeddings_np = np.array(embeddings).astype("float32")

    # Create a FAISS exact (brute-force) L2 index sized to the embedding dimension
    index = faiss.IndexFlatL2(embeddings_np.shape[1])
    index.add(embeddings_np)
    faiss.write_index(index, os.path.join(OUTPUT_DIR, "faiss_index.bin"))

    # Persist the raw chunks and their source tags alongside the index
    with open(os.path.join(OUTPUT_DIR, "chunks_metadata.pkl"), "wb") as f:
        pickle.dump({"chunks": chunks, "metadata": metadata}, f)

    print(f"🎉 Cloud Indexing Complete! Saved to {OUTPUT_DIR}")

if __name__ == "__main__":
    main()
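
# --- Illustrative only: querying the saved index from the backend ---
# A minimal sketch, assuming the OUTPUT_DIR layout written above; the search()
# helper is hypothetical and not part of this script.
#
#   index = faiss.read_index(os.path.join(OUTPUT_DIR, "faiss_index.bin"))
#   with open(os.path.join(OUTPUT_DIR, "chunks_metadata.pkl"), "rb") as f:
#       store = pickle.load(f)
#
#   def search(query, k=3):
#       q = genai.embed_content(
#           model=EMBEDDING_MODEL,
#           content=query,
#           task_type="retrieval_query",  # query-side counterpart
#       )['embedding']
#       _, ids = index.search(np.array([q], dtype="float32"), k)
#       return [(store["chunks"][i], store["metadata"][i]) for i in ids[0]]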