# create_faiss_from_supabase_stories.py
"""Build a local FAISS vector store from the Supabase `stories` table.

Pipeline: fetch story rows via the Supabase REST API, split each story into
chunks (body plus dedicated moral/maxim chunks), embed the chunks with a
HuggingFace model, and save a FAISS inner-product index and the documents
to OUT_DIR.
"""
import os
import time
import pickle
import requests
import numpy as np
import faiss
from typing import List, Dict, Any
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings

# --- CONFIG (read from env or hardcode for local testing) ---
SUPABASE_URL = os.environ["SUPABASE_URL"]                  # e.g. https://xxxx.supabase.co
SUPABASE_SERVICE_KEY = os.environ["SUPABASE_SERVICE_KEY"]  # service key (server-side)
OUT_DIR = os.environ.get("STORIES_VS_OUT", "./stories_vectorstore")
EMBED_MODEL_NAME = os.environ.get("EMBED_MODEL", "intfloat/e5-large-v2")

HEADERS = {
    "apikey": SUPABASE_SERVICE_KEY,
    "Authorization": f"Bearer {SUPABASE_SERVICE_KEY}",
    "Content-Type": "application/json",
}

def fetch_all_stories() -> List[Dict[str, Any]]:
    """Download story rows from the Supabase REST endpoint (up to the limit below)."""
    url = f"{SUPABASE_URL}/rest/v1/stories"
    params = {
        "select": "id,handle,title,character_names,body,moral,maxim,topic_primary,created_at,character_id",
        "limit": "10000",
    }
    r = requests.get(url, headers=HEADERS, params=params, timeout=(20, 60))
    r.raise_for_status()
    rows = r.json() or []
    print(f"📥 Downloaded {len(rows)} stories from Supabase.")
    return rows
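
# Optional variant (a sketch, not used by main()): if the table ever outgrows the
# single-request "limit" above, the same endpoint can be paged with PostgREST's
# offset/limit query params. The function name and page size here are illustrative.
def fetch_all_stories_paged(page_size: int = 1000) -> List[Dict[str, Any]]:
    url = f"{SUPABASE_URL}/rest/v1/stories"
    rows: List[Dict[str, Any]] = []
    offset = 0
    while True:
        params = {
            "select": "id,handle,title,character_names,body,moral,maxim,topic_primary,created_at,character_id",
            "limit": str(page_size),
            "offset": str(offset),
        }
        r = requests.get(url, headers=HEADERS, params=params, timeout=(20, 60))
        r.raise_for_status()
        batch = r.json() or []
        rows.extend(batch)
        if len(batch) < page_size:  # last (possibly short or empty) page reached
            break
        offset += page_size
    return rows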

def story_to_documents(story: Dict[str, Any]) -> List[Document]:
    """
    Split 'body' into chunks and add dedicated chunks for 'moral' and 'maxim'.
    Prepend a small header so names/topics are searchable semantically.
    """
    sid = story["id"]
    title = story.get("title", "") or ""
    chars = story.get("character_names") or []
    topic = story.get("topic_primary", "") or ""
    handle = story.get("handle", "") or ""
    created = story.get("created_at", "") or ""
    body = story.get("body", "") or ""
    moral = story.get("moral", "")
    maxim = story.get("maxim", "")
    char_id = story.get("character_id", "socrates") or "socrates"

    header = (
        f"Title: {title}\n"
        f"Character: {char_id}\n"
        f"Characters: {', '.join(chars) if chars else '(unspecified)'}\n"
        f"Topic: {topic}\n"
        f"Handle: {handle}\n\n"
    )

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=120,
        separators=["\n\n", "\n", ". ", "! ", "? "],
    )

    docs: List[Document] = []

    # Body chunks
    for i, chunk_text in enumerate(splitter.split_text(body)):
        if not chunk_text.strip():
            continue
        docs.append(
            Document(
                page_content=header + chunk_text.strip(),
                metadata={
                    "story_id": sid, "title": title, "character_names": chars,
                    "topic_primary": topic, "handle": handle, "created_at": created,
                    "character_id": char_id, "chunk_id": i, "kind": "body",
                },
            )
        )

    # Moral / Maxim as tiny chunks (rank well for moral questions)
    if moral:
        docs.append(
            Document(
                page_content=header + f"Moral: {moral}",
                metadata={
                    "story_id": sid, "title": title, "character_names": chars,
                    "topic_primary": topic, "handle": handle, "created_at": created,
                    "character_id": char_id, "chunk_id": -1, "kind": "moral",
                },
            )
        )
    if maxim:
        docs.append(
            Document(
                page_content=header + f"Maxim: {maxim}",
                metadata={
                    "story_id": sid, "title": title, "character_names": chars,
                    "topic_primary": topic, "handle": handle, "created_at": created,
                    "character_id": char_id, "chunk_id": -2, "kind": "maxim",
                },
            )
        )
    return docs

def normalize(v) -> np.ndarray:
    """L2-normalize a vector so that inner product behaves like cosine similarity."""
    arr = np.array(v, dtype=np.float32)
    norm = np.linalg.norm(arr)
    return arr / norm if norm > 0 else arr


def save_pickle(obj: Any, path: str) -> None:
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def embed_texts(texts: List[str], model: HuggingFaceEmbeddings) -> np.ndarray:
    """Embed texts in batches of 64 and stack the L2-normalized vectors."""
    vecs: List[np.ndarray] = []
    for i in range(0, len(texts), 64):
        batch = texts[i:i + 64]
        print(f"🧠 Embedding batch {i // 64 + 1} ({len(batch)} chunks)…")
        emb = model.embed_documents(batch)
        vecs.extend([normalize(v) for v in emb])
        time.sleep(0.2)  # small pause between batches
    # Empty-input fallback: the 384 is only a placeholder shape (main() exits before
    # using it); intfloat/e5-large-v2 actually produces 1024-dim vectors.
    return np.vstack(vecs) if vecs else np.zeros((0, 384), dtype=np.float32)

def main():
    os.makedirs(OUT_DIR, exist_ok=True)

    stories = fetch_all_stories()
    if not stories:
        print("❌ No stories found. Exiting.")
        return

    # Build Documents
    all_docs: List[Document] = []
    for s in stories:
        all_docs.extend(story_to_documents(s))
    print(f"🧩 Built {len(all_docs)} story chunks (body/moral/maxim).")

    texts = [d.page_content for d in all_docs]
    metadatas = [d.metadata for d in all_docs]

    # Embeddings
    print(f"🧠 Loading embedding model: {EMBED_MODEL_NAME}")
    embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
    vectors = embed_texts(texts, embedder)
    if vectors.shape[0] == 0:
        print("❌ No vectors embedded. Exiting.")
        return

    # FAISS (Inner Product on normalized vectors)
    dim = vectors.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(vectors.astype("float32"))
    print(f"📦 FAISS index built with {index.ntotal} vectors (dim={dim}).")

    # Save locally
    faiss_path = os.path.join(OUT_DIR, "faiss.index")
    docs_path = os.path.join(OUT_DIR, "documents.pkl")
    faiss.write_index(index, faiss_path)
    save_pickle({"documents": all_docs, "metadatas": metadatas}, docs_path)
    print("✅ Stories vector DB saved.")

if __name__ == "__main__":
    main()