import json
import os
import pickle

import faiss
import numpy as np
import google.generativeai as genai
from dotenv import load_dotenv
# Load .env to get the API key
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    print("❌ Error: GEMINI_API_KEY not found in .env")
    exit(1)
genai.configure(api_key=GEMINI_API_KEY)
# --- Configuration ---
DATA_DIR = "data"
SUMMARIES_DIR = os.path.join(DATA_DIR, "summaries")
OUTPUT_DIR = os.path.join("backend", "vector_store")
# Gemini embedding model (text-embedding-004 produces 768-dimensional vectors)
EMBEDDING_MODEL = "models/text-embedding-004"
os.makedirs(OUTPUT_DIR, exist_ok=True)
def load_json(filename):
    path = os.path.join(DATA_DIR, filename)
    if not os.path.exists(path):
        return []
    # Use a context manager so the file handle is closed promptly
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def load_summary_text(filename):
    path = os.path.join(SUMMARIES_DIR, filename)
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    return ""
def get_embedding(text):
    """Wraps the Gemini API to get an embedding for one document chunk."""
    result = genai.embed_content(
        model=EMBEDDING_MODEL,
        content=text,
        task_type="retrieval_document"
    )
    return result['embedding']
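
# Note: at query time, the matching embed_content call should use
# task_type="retrieval_query" so queries and documents are embedded
# for the same retrieval task (see the sketch at the end of this file).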
def main():
    print("🚀 Creating Cloud-Based Vector Index...")
    chunks = []
    metadata = []

    def add_chunk(text, source):
        if text and len(text) > 5:
            chunks.append(text)
            metadata.append({"source": source})

    # --- Load Data (Same logic as before) ---
    # 1. Profile
    profile = load_json("profile.json")
    if isinstance(profile, dict):
        contact = profile.get("contact", {})
        c_text = (
            f"Contact Details: Name: {contact.get('name')}. "
            f"Email: {contact.get('email')}. LinkedIn: {contact.get('linkedin')}."
        )
        add_chunk(c_text, "profile_contact")
        if profile.get("summary"):
            add_chunk(
                f"Professional Summary: {profile.get('summary')}", "profile_summary")
    # 2. Experience
    experience = load_json("experience.json")
    for exp in experience:
        text = (f"Experience: {exp.get('role')} at {exp.get('company')} "
                f"({exp.get('duration')}). {exp.get('description')}")
        add_chunk(text, "experience_entry")

    # 3. Education
    education = load_json("education.json")
    for edu in education:
        text = (f"Education: {edu.get('degree')} from {edu.get('institution')}. "
                f"{edu.get('details')}")
        add_chunk(text, "education_entry")

    # 4. Skills
    skills = load_json("skills.json")
    for s in skills:
        text = f"Skills in {s.get('category')}: {', '.join(s.get('list', []))}"
        add_chunk(text, "skills_list")
    # 5. Summaries
    summary_files = {
        "about_summary.txt": "profile_about_me",
        "projects_summary.txt": "ui_trigger_projects",
        "articles_summary.txt": "ui_trigger_articles",
        "videos_summary.txt": "ui_trigger_videos",
        "research_summary.txt": "ui_trigger_research",
        "skills_summary.txt": "ui_trigger_skills",
        "certifications_summary.txt": "ui_trigger_certifications",
    }
    for filename, tag in summary_files.items():
        text = load_summary_text(filename)
        if text:
            add_chunk(text, tag)
    # --- Generate Embeddings ---
    if not chunks:
        print("❌ Error: No chunks created.")
        return

    print(f"🧠 Encoding {len(chunks)} chunks via Gemini API...")
    # Batch processing is better, but a simple loop works for small portfolios;
    # see the commented-out batched sketch below.
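    # Batched alternative, a minimal sketch: genai.embed_content also accepts a
    # list of strings and returns one embedding per item. The batch size of 100
    # is an assumption; verify it against current API limits before relying on it.
    # BATCH_SIZE = 100
    # embeddings = []
    # for start in range(0, len(chunks), BATCH_SIZE):
    #     result = genai.embed_content(
    #         model=EMBEDDING_MODEL,
    #         content=chunks[start:start + BATCH_SIZE],
    #         task_type="retrieval_document",
    #     )
    #     embeddings.extend(result['embedding'])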
    embeddings = []
    for i, chunk in enumerate(chunks):
        if i % 5 == 0:
            print(f"  Processing chunk {i}/{len(chunks)}...")
        emb = get_embedding(chunk)
        embeddings.append(emb)
    embeddings_np = np.array(embeddings).astype("float32")

    # Create FAISS index (IndexFlatL2 does exact L2 search, which is
    # plenty fast for a portfolio-sized corpus)
    index = faiss.IndexFlatL2(embeddings_np.shape[1])
    index.add(embeddings_np)
    faiss.write_index(index, os.path.join(OUTPUT_DIR, "faiss_index.bin"))
    with open(os.path.join(OUTPUT_DIR, "chunks_metadata.pkl"), "wb") as f:
        pickle.dump({"chunks": chunks, "metadata": metadata}, f)
    print(f"✅ Cloud Indexing Complete! Saved to {OUTPUT_DIR}")
if __name__ == "__main__":
    main()
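
# Query-side usage, a minimal sketch (everything below is illustrative and not
# part of this script; it assumes the files written above):
#   index = faiss.read_index(os.path.join(OUTPUT_DIR, "faiss_index.bin"))
#   with open(os.path.join(OUTPUT_DIR, "chunks_metadata.pkl"), "rb") as f:
#       store = pickle.load(f)
#   q = genai.embed_content(model=EMBEDDING_MODEL,
#                           content="What are your skills?",
#                           task_type="retrieval_query")['embedding']
#   _, ids = index.search(np.array([q], dtype="float32"), k=3)
#   hits = [store["chunks"][i] for i in ids[0]]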