Spaces:

tharu280
/

portfolio-rag-api

Sleeping

File size: 4,217 Bytes

bc620e9

import google.generativeai as genai
import faiss
import pickle
import json
import os
import numpy as np
from dotenv import load_dotenv

# Load environment variables (via python-dotenv) so the Gemini API key can be
# read with os.getenv below.
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")

if not GEMINI_API_KEY:
    print("❌ Error: GEMINI_API_KEY not found in .env")
    # Raise SystemExit directly instead of calling the bare `exit` name: that
    # name is injected by the `site` module as an interactive convenience and
    # is not guaranteed to exist (e.g. under `python -S`). Exit status stays 1.
    raise SystemExit(1)

genai.configure(api_key=GEMINI_API_KEY)

# --- Configuration ---
# Input JSON files are read from DATA_DIR, free-text summaries from
# SUMMARIES_DIR; the generated index and metadata are written to OUTPUT_DIR.
DATA_DIR = "data"
SUMMARIES_DIR = os.path.join(DATA_DIR, "summaries")
OUTPUT_DIR = os.path.join("backend", "vector_store")
# Google embedding model used for every chunk embedding
EMBEDDING_MODEL = "models/text-embedding-004"

# Make sure the output directory exists before anything is written to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)


def load_json(filename):
    """Load and parse a JSON file located in DATA_DIR.

    Args:
        filename: Name of the file, joined onto DATA_DIR.

    Returns:
        The parsed JSON value, or an empty list when the file does not exist.
    """
    path = os.path.join(DATA_DIR, filename)
    if not os.path.exists(path):
        return []
    # Use a context manager so the file handle is closed deterministically;
    # passing a bare open() to json.load leaves closing to the garbage
    # collector.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def load_summary_text(filename):
    """Read a summary file from SUMMARIES_DIR.

    Args:
        filename: Name of the file, joined onto SUMMARIES_DIR.

    Returns:
        The file's text with leading and trailing whitespace stripped, or an
        empty string when the file does not exist.
    """
    summary_path = os.path.join(SUMMARIES_DIR, filename)
    # Guard clause: a missing file yields the empty string.
    if not os.path.exists(summary_path):
        return ""
    with open(summary_path, 'r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents.strip()


def get_embedding(text):
    """Request an embedding for ``text`` from the Gemini API.

    Calls ``genai.embed_content`` with the configured EMBEDDING_MODEL and the
    ``retrieval_document`` task type.

    Args:
        text: The content to embed.

    Returns:
        The value stored under the ``'embedding'`` key of the API result.
    """
    request = {
        "model": EMBEDDING_MODEL,
        "content": text,
        "task_type": "retrieval_document",
    }
    response = genai.embed_content(**request)
    return response['embedding']


def _collect_chunks():
    """Build the text chunks and their parallel metadata from the data files.

    Returns:
        A ``(chunks, metadata)`` tuple of equal-length lists: ``chunks`` holds
        the text to embed and ``metadata`` holds one ``{"source": tag}`` dict
        per chunk, in the same order.
    """
    chunks = []
    metadata = []

    def add_chunk(text, source):
        # Ignore empty text and anything of 5 characters or fewer.
        if text and len(text) > 5:
            chunks.append(text)
            metadata.append({"source": source})

    # NOTE(review): the f-strings below interpolate `.get()` results directly,
    # so a missing field is rendered as the literal text "None" in the chunk.

    # 1. Profile
    profile = load_json("profile.json")
    # load_json returns [] for a missing file, so only a dict is processed.
    if isinstance(profile, dict):
        contact = profile.get("contact", {})
        c_text = f"Contact Details: Name: {contact.get('name')}. Email: {contact.get('email')}. LinkedIn: {contact.get('linkedin')}."
        add_chunk(c_text, "profile_contact")
        if profile.get("summary"):
            add_chunk(
                f"Professional Summary: {profile.get('summary')}", "profile_summary")

    # 2. Experience
    for exp in load_json("experience.json"):
        text = f"Experience: {exp.get('role')} at {exp.get('company')} ({exp.get('duration')}). {exp.get('description')}"
        add_chunk(text, "experience_entry")

    # 3. Education
    for edu in load_json("education.json"):
        text = f"Education: {edu.get('degree')} from {edu.get('institution')}. {edu.get('details')}"
        add_chunk(text, "education_entry")

    # 4. Skills
    for s in load_json("skills.json"):
        text = f"Skills in {s.get('category')}: {', '.join(s.get('list', []))}"
        add_chunk(text, "skills_list")

    # 5. Summaries: each file's full text becomes one chunk with its own tag.
    summary_files = {
        "about_summary.txt": "profile_about_me",
        "projects_summary.txt": "ui_trigger_projects",
        "articles_summary.txt": "ui_trigger_articles",
        "videos_summary.txt": "ui_trigger_videos",
        "research_summary.txt": "ui_trigger_research",
        "skills_summary.txt": "ui_trigger_skills",
        "certifications_summary.txt": "ui_trigger_certifications"
    }
    for filename, tag in summary_files.items():
        text = load_summary_text(filename)
        if text:
            add_chunk(text, tag)

    return chunks, metadata


def _embed_chunks(chunks):
    """Embed every chunk with get_embedding and return a float32 array."""
    # Batch processing is better, but simple loop works for small portfolios
    embeddings = []
    for i, chunk in enumerate(chunks):
        # Report progress on every fifth chunk.
        if i % 5 == 0:
            print(f"   Processing chunk {i}/{len(chunks)}...")
        embeddings.append(get_embedding(chunk))

    # Build the array as float32 directly rather than creating a default-dtype
    # array first and then copying it with astype.
    return np.asarray(embeddings, dtype="float32")


def _save_index(embeddings_np, chunks, metadata):
    """Write the FAISS index and the pickled chunks/metadata to OUTPUT_DIR."""
    # Flat L2 index sized from the second axis of the embedding array.
    index = faiss.IndexFlatL2(embeddings_np.shape[1])
    index.add(embeddings_np)

    faiss.write_index(index, os.path.join(OUTPUT_DIR, "faiss_index.bin"))
    with open(os.path.join(OUTPUT_DIR, "chunks_metadata.pkl"), "wb") as f:
        pickle.dump({"chunks": chunks, "metadata": metadata}, f)


def main():
    """Build the vector index: collect chunks, embed them, and save the result.

    Prints progress to stdout. If no chunks could be created, prints an error
    and returns without calling the embedding API or writing any files.
    """
    print("🔄 Creating Cloud-Based Vector Index...")

    # --- Load Data ---
    chunks, metadata = _collect_chunks()

    # --- Generate Embeddings ---
    if not chunks:
        print("❌ Error: No chunks created.")
        return

    print(f"🧠 Encoding {len(chunks)} chunks via Gemini API...")
    embeddings_np = _embed_chunks(chunks)

    # --- Persist index and metadata ---
    _save_index(embeddings_np, chunks, metadata)

    print(f"🎉 Cloud Indexing Complete! Saved to {OUTPUT_DIR}")


if __name__ == "__main__":
    main()