# NOTE(review): "Spaces: Sleeping" lines were hosting-page scrape residue, not source code — removed.
"""
Build/rebuild the FAISS index from embedding chunks.
Usage:
python build_index.py # Full rebuild
python build_index.py --check # Check index status only
"""
import os
import sys
import time
import argparse
import pandas as pd
from dotenv import load_dotenv

# Load environment
# Resolve everything relative to this file so the script works from any CWD.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# presumably supplies OPENAI_API_KEY for OpenAIEmbeddings — confirm .env contents
load_dotenv(dotenv_path=os.path.join(SCRIPT_DIR, '.env'))

# On-disk location of the persisted FAISS index.
FAISS_INDEX_PATH = os.path.join(SCRIPT_DIR, "faiss_index.db")
# CSV with one row per embedding chunk (columns used: chunk_id, episode_number,
# episode_slug, chapter_title, start_seconds, youtube_url, embedding_text).
CHUNKS_PATH = os.path.join(SCRIPT_DIR, "data/episodes_embedding_chunks.csv")
# NOTE(review): EPISODES_PATH is defined but not referenced in this file — verify other consumers.
EPISODES_PATH = os.path.join(SCRIPT_DIR, "data/episodes_website.json")
def check_index_status():
    """Print a status report comparing the chunk CSV against the FAISS index.

    Writes the report to stdout and returns None. Bails out early when the
    data file is missing, since there is nothing to compare against.
    """
    banner = "=" * 60
    print(banner)
    print("INDEX STATUS CHECK")
    print(banner)

    # Guard: without the chunk data the vector count is meaningless.
    if not os.path.exists(CHUNKS_PATH):
        print(f"\n❌ Data file not found: {CHUNKS_PATH}")
        return

    chunks_df = pd.read_csv(CHUNKS_PATH)
    num_chunks = len(chunks_df)
    episode_numbers = chunks_df["episode_number"]
    print(f"\nData file: {CHUNKS_PATH}")
    print(f" Chunks: {num_chunks}")
    print(f" Episodes: {episode_numbers.nunique()}")
    print(f" Newest episode: #{episode_numbers.max()}")

    if not os.path.exists(FAISS_INDEX_PATH):
        print(f"\n❌ FAISS index not found: {FAISS_INDEX_PATH}")
        print(" Run 'python build_index.py' to create")
        return

    # Imported lazily so a missing-index check stays cheap.
    from langchain_openai import OpenAIEmbeddings
    from langchain_community.vectorstores import FAISS

    vector_store = FAISS.load_local(
        FAISS_INDEX_PATH,
        embeddings=OpenAIEmbeddings(),
        allow_dangerous_deserialization=True,  # index is produced locally by this script
    )
    index_size = vector_store.index.ntotal
    print(f"\nFAISS index: {FAISS_INDEX_PATH}")
    print(f" Vectors: {index_size}")

    # Up to date means one vector per chunk row.
    if index_size == num_chunks:
        print("\n✅ Index is up to date!")
    else:
        print(f"\n⚠️ Index out of date: {index_size} vectors vs {num_chunks} chunks")
        print(" Run 'python build_index.py' to rebuild")
def rebuild_index():
    """Rebuild the FAISS index from scratch.

    Loads every row of CHUNKS_PATH, wraps each one in a LangChain Document
    (text from ``embedding_text``, remaining columns as metadata), embeds the
    documents via the OpenAI API, and saves the index to FAISS_INDEX_PATH,
    replacing whatever was there before.

    Raises:
        FileNotFoundError: if CHUNKS_PATH does not exist (via pd.read_csv).
    """
    # Imported lazily: only the rebuild path needs langchain.
    from langchain_core.documents import Document
    from langchain_openai import OpenAIEmbeddings
    from langchain_community.vectorstores import FAISS

    print("=" * 60)
    print("REBUILDING FAISS INDEX")
    print("=" * 60)

    # Load embedding chunks
    print(f"\nLoading chunks from: {CHUNKS_PATH}")
    chunks_df = pd.read_csv(CHUNKS_PATH)
    num_chunks = len(chunks_df)
    num_episodes = chunks_df["episode_number"].nunique()
    newest = chunks_df["episode_number"].max()
    print(f" Chunks: {num_chunks}")
    print(f" Episodes: {num_episodes}")
    print(f" Newest episode: #{newest}")

    # Convert to LangChain Documents. itertuples avoids the per-row Series
    # construction cost of iterrows while producing the same metadata values.
    print("\nConverting to documents...")
    documents = [
        Document(
            page_content=row.embedding_text,
            metadata={
                "chunk_id": row.chunk_id,
                "episode_number": row.episode_number,
                "episode_slug": row.episode_slug,
                "chapter_title": row.chapter_title,
                "start_seconds": row.start_seconds,
                "youtube_url": row.youtube_url,
            },
        )
        for row in chunks_df.itertuples(index=False)
    ]

    # Create embeddings and index (network I/O to the OpenAI API).
    print("\nCalling OpenAI API for embeddings...")
    print("(This may take a few minutes for many chunks)")
    embeddings = OpenAIEmbeddings()
    t0 = time.perf_counter()
    vector_store = FAISS.from_documents(documents, embeddings)
    elapsed = time.perf_counter() - t0

    # Save index
    vector_store.save_local(FAISS_INDEX_PATH)
    # Fixed: was an f-string with no placeholders (lint F541).
    print("\n✅ FAISS index created successfully!")
    print(f" Vectors: {vector_store.index.ntotal}")
    print(f" Time: {elapsed:.1f}s")
    print(f" Saved to: {FAISS_INDEX_PATH}")
if __name__ == "__main__":
    # CLI: default action rebuilds; --check only inspects current state.
    parser = argparse.ArgumentParser(description="Build FAISS index for podcast search")
    parser.add_argument("--check", action="store_true", help="Check index status only")
    cli_args = parser.parse_args()

    action = check_index_status if cli_args.check else rebuild_index
    action()