Podcast_Assistant / build_index.py
"""
Build/rebuild the FAISS index from embedding chunks.
Usage:
python build_index.py # Full rebuild
python build_index.py --check # Check index status only
"""
import os
import time
import argparse

import pandas as pd
from dotenv import load_dotenv
# Load environment
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
load_dotenv(dotenv_path=os.path.join(SCRIPT_DIR, '.env'))
FAISS_INDEX_PATH = os.path.join(SCRIPT_DIR, "faiss_index.db")
CHUNKS_PATH = os.path.join(SCRIPT_DIR, "data/episodes_embedding_chunks.csv")
EPISODES_PATH = os.path.join(SCRIPT_DIR, "data/episodes_website.json")
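# The chunks CSV is expected to contain one row per chunk, with the columns used
# below: chunk_id, episode_number, episode_slug, chapter_title, start_seconds,
# youtube_url, and embedding_text (the text that gets embedded).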
def check_index_status():
"""Check the current status of data and index."""
print("=" * 60)
print("INDEX STATUS CHECK")
print("=" * 60)
# Check data files
if os.path.exists(CHUNKS_PATH):
chunks_df = pd.read_csv(CHUNKS_PATH)
num_chunks = len(chunks_df)
num_episodes = chunks_df["episode_number"].nunique()
newest_episode = chunks_df["episode_number"].max()
print(f"\nData file: {CHUNKS_PATH}")
print(f" Chunks: {num_chunks}")
print(f" Episodes: {num_episodes}")
print(f" Newest episode: #{newest_episode}")
else:
print(f"\n❌ Data file not found: {CHUNKS_PATH}")
return
# Check FAISS index
if os.path.exists(FAISS_INDEX_PATH):
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
embeddings = OpenAIEmbeddings()
vector_store = FAISS.load_local(
FAISS_INDEX_PATH,
embeddings=embeddings,
allow_dangerous_deserialization=True
)
index_size = vector_store.index.ntotal
print(f"\nFAISS index: {FAISS_INDEX_PATH}")
print(f" Vectors: {index_size}")
if index_size == num_chunks:
print("\n✅ Index is up to date!")
else:
print(f"\n⚠️ Index out of date: {index_size} vectors vs {num_chunks} chunks")
print(" Run 'python build_index.py' to rebuild")
else:
print(f"\n❌ FAISS index not found: {FAISS_INDEX_PATH}")
print(" Run 'python build_index.py' to create")
def rebuild_index():
"""Rebuild the FAISS index from scratch."""
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
print("=" * 60)
print("REBUILDING FAISS INDEX")
print("=" * 60)
# Load embedding chunks
print(f"\nLoading chunks from: {CHUNKS_PATH}")
chunks_df = pd.read_csv(CHUNKS_PATH)
num_chunks = len(chunks_df)
num_episodes = chunks_df["episode_number"].nunique()
newest = chunks_df["episode_number"].max()
print(f" Chunks: {num_chunks}")
print(f" Episodes: {num_episodes}")
print(f" Newest episode: #{newest}")
# Convert to LangChain Documents
print("\nConverting to documents...")
documents = []
for _, row in chunks_df.iterrows():
doc = Document(
page_content=row["embedding_text"],
metadata={
"chunk_id": row["chunk_id"],
"episode_number": row["episode_number"],
"episode_slug": row["episode_slug"],
"chapter_title": row["chapter_title"],
"start_seconds": row["start_seconds"],
"youtube_url": row["youtube_url"],
}
)
documents.append(doc)
# Create embeddings and index
print("\nCalling OpenAI API for embeddings...")
print("(This may take a few minutes for many chunks)")
embeddings = OpenAIEmbeddings()
t0 = time.perf_counter()
vector_store = FAISS.from_documents(documents, embeddings)
elapsed = time.perf_counter() - t0
# Save index
vector_store.save_local(FAISS_INDEX_PATH)
print(f"\n✅ FAISS index created successfully!")
print(f" Vectors: {vector_store.index.ntotal}")
print(f" Time: {elapsed:.1f}s")
print(f" Saved to: {FAISS_INDEX_PATH}")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Build FAISS index for podcast search")
    parser.add_argument("--check", action="store_true", help="Check index status only")
    args = parser.parse_args()

    if args.check:
        check_index_status()
    else:
        rebuild_index()
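
# --- Example: querying the saved index ---------------------------------------
# Illustrative sketch only, not part of the original script. It assumes the same
# LangChain packages used above and an OPENAI_API_KEY in the environment; the
# function name `search_index` is hypothetical.
def search_index(query: str, k: int = 4):
    """Load the saved FAISS index and return the top-k matching chunks."""
    from langchain_openai import OpenAIEmbeddings
    from langchain_community.vectorstores import FAISS

    embeddings = OpenAIEmbeddings()
    vector_store = FAISS.load_local(
        FAISS_INDEX_PATH,
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )
    # Each returned Document carries the metadata stored at build time
    # (episode_number, chapter_title, start_seconds, youtube_url, ...).
    return vector_store.similarity_search(query, k=k)

# Usage (hypothetical query string):
#   for doc in search_index("question about an episode"):
#       print(doc.metadata["episode_number"], doc.metadata["chapter_title"])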