# create_faiss_from_supabase_stories.py
import os
import time
import pickle
import requests
import numpy as np
import faiss
from typing import List, Dict, Any
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings  # newer releases also ship this class in langchain_huggingface
# --- CONFIG (read from env or hardcode for local testing) ---
SUPABASE_URL = os.environ["SUPABASE_URL"] # e.g. https://xxxx.supabase.co
SUPABASE_SERVICE_KEY = os.environ["SUPABASE_SERVICE_KEY"] # service key (server-side)
OUT_DIR = os.environ.get("STORIES_VS_OUT", "./stories_vectorstore")
EMBED_MODEL_NAME = os.environ.get("EMBED_MODEL", "intfloat/e5-large-v2")
HEADERS = {
"apikey": SUPABASE_SERVICE_KEY,
"Authorization": f"Bearer {SUPABASE_SERVICE_KEY}",
"Content-Type": "application/json",
}
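
# For local testing the config can be exported up front; the values below are
# placeholders, not real credentials:
#
#   export SUPABASE_URL="https://xxxx.supabase.co"
#   export SUPABASE_SERVICE_KEY="<service-role-key>"
#   export STORIES_VS_OUT="./stories_vectorstore"
#   export EMBED_MODEL="intfloat/e5-large-v2"
#   python create_faiss_from_supabase_stories.py
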
def fetch_all_stories() -> List[Dict[str, Any]]:
url = f"{SUPABASE_URL}/rest/v1/stories"
params = {
"select": "id,handle,title,character_names,body,moral,maxim,topic_primary,created_at,character_id",
"limit": "10000",
}
r = requests.get(url, headers=HEADERS, params=params, timeout=(20, 60))
r.raise_for_status()
rows = r.json() or []
print(f"πŸ“₯ Downloaded {len(rows)} stories from Supabase.")
return rows
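
# A paginated variant, sketched here as a fallback in case the table outgrows a
# single response. It assumes the same SUPABASE_URL/HEADERS and relies only on
# PostgREST's standard limit/offset/order query params; main() does not call it.
def fetch_all_stories_paginated(page_size: int = 1000) -> List[Dict[str, Any]]:
    url = f"{SUPABASE_URL}/rest/v1/stories"
    rows: List[Dict[str, Any]] = []
    offset = 0
    while True:
        params = {
            "select": "id,handle,title,character_names,body,moral,maxim,topic_primary,created_at,character_id",
            "order": "created_at.asc",  # stable ordering keeps pages consistent
            "limit": str(page_size),
            "offset": str(offset),
        }
        r = requests.get(url, headers=HEADERS, params=params, timeout=(20, 60))
        r.raise_for_status()
        page = r.json() or []
        rows.extend(page)
        if len(page) < page_size:  # a short page means we reached the end
            break
        offset += page_size
    print(f"📥 Downloaded {len(rows)} stories from Supabase (paginated).")
    return rows
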
def story_to_documents(story: Dict[str, Any]) -> List[Document]:
"""
Split 'body' into chunks and add dedicated chunks for 'moral' and 'maxim'.
Prepend a small header so names/topics are searchable semantically.
"""
sid = story["id"]
title = story.get("title", "") or ""
chars = story.get("character_names") or []
topic = story.get("topic_primary", "") or ""
handle = story.get("handle", "") or ""
created = story.get("created_at", "") or ""
body = story.get("body", "") or ""
    moral = story.get("moral", "") or ""
    maxim = story.get("maxim", "") or ""
char_id = story.get("character_id", "socrates") or "socrates"
header = (
f"Title: {title}\n"
f"Character: {char_id}\n"
f"Characters: {', '.join(chars) if chars else '(unspecified)'}\n"
f"Topic: {topic}\n"
f"Handle: {handle}\n\n"
)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200, chunk_overlap=120,
        # No "" fallback separator: a run of text longer than chunk_size that
        # contains none of these separators may be kept as one oversized chunk.
        separators=["\n\n", "\n", ". ", "! ", "? "]
    )
docs: List[Document] = []
# Body chunks
    for i, chunk_text in enumerate(splitter.split_text(body)):
if not chunk_text.strip():
continue
docs.append(
Document(
page_content=header + chunk_text.strip(),
metadata={
"story_id": sid, "title": title, "character_names": chars,
"topic_primary": topic, "handle": handle, "created_at": created,
"character_id": char_id, "chunk_id": i, "kind": "body"
},
)
)
    # Moral / maxim as dedicated tiny chunks (they rank well for questions about a story's lesson)
if moral:
docs.append(
Document(
page_content=header + f"Moral: {moral}",
metadata={
"story_id": sid, "title": title, "character_names": chars,
"topic_primary": topic, "handle": handle, "created_at": created,
"character_id": char_id, "chunk_id": -1, "kind": "moral"
},
)
)
if maxim:
docs.append(
Document(
page_content=header + f"Maxim: {maxim}",
metadata={
"story_id": sid, "title": title, "character_names": chars,
"topic_primary": topic, "handle": handle, "created_at": created,
"character_id": char_id, "chunk_id": -2, "kind": "maxim"
},
)
)
return docs
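
# Illustrative example of the layout this produces (toy values, not real data):
#
#   story = {"id": 1, "title": "The Ring", "body": "…", "moral": "Know thyself."}
#   docs = story_to_documents(story)
#   # docs[0].page_content  -> header block + first body chunk, metadata["kind"] == "body"
#   # docs[-1].page_content -> header block + "Moral: Know thyself.", metadata["kind"] == "moral"
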

def normalize(v) -> np.ndarray:
    """L2-normalize a vector so inner-product search behaves like cosine similarity."""
    arr = np.array(v, dtype=np.float32)
    norm = np.linalg.norm(arr)
    return arr / norm if norm > 0 else arr

def save_pickle(obj: Any, path: str) -> None:
with open(path, "wb") as f:
pickle.dump(obj, f)

def embed_texts(texts: List[str], model: HuggingFaceEmbeddings) -> np.ndarray:
    """Embed texts in batches and L2-normalize every vector."""
    batch_size = 64
    vecs: List[np.ndarray] = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        print(f"🧠 Embedding batch {i // batch_size + 1} ({len(batch)} chunks)…")
        emb = model.embed_documents(batch)
        vecs.extend([normalize(v) for v in emb])
        time.sleep(0.2)  # brief pause between batches; optional for a local model
    # Empty input yields a zero-row array; main() exits when nothing was embedded,
    # so no embedding dimension needs to be assumed here.
    return np.vstack(vecs) if vecs else np.zeros((0, 0), dtype=np.float32)
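
# Note: e5-family models are trained with "query: " / "passage: " prefixes,
# which this script does not add. A minimal sketch of how prefixing could be
# bolted on (an assumption about usage, not part of the current pipeline):
#
#   passages = [f"passage: {t}" for t in texts]
#   vectors = embed_texts(passages, embedder)
#   query_vec = normalize(embedder.embed_query("query: what does courage require?"))
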
def main():
os.makedirs(OUT_DIR, exist_ok=True)
stories = fetch_all_stories()
if not stories:
print("❌ No stories found. Exiting.")
return
# Build Documents
all_docs: List[Document] = []
for s in stories:
all_docs.extend(story_to_documents(s))
print(f"🧩 Built {len(all_docs)} story chunks (body/moral/maxim).")
texts = [d.page_content for d in all_docs]
metadatas = [d.metadata for d in all_docs]
# Embeddings
print(f"πŸ”§ Loading embedding model: {EMBED_MODEL_NAME}")
embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
vectors = embed_texts(texts, embedder)
if vectors.shape[0] == 0:
print("❌ No vectors embedded. Exiting.")
return
# FAISS (Inner Product on normalized vectors)
dim = vectors.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(vectors.astype("float32"))
print(f"πŸ“¦ FAISS index built with {index.ntotal} vectors (dim={dim}).")
# Save locally
faiss_path = os.path.join(OUT_DIR, "faiss.index")
docs_path = os.path.join(OUT_DIR, "documents.pkl")
faiss.write_index(index, faiss_path)
save_pickle({"documents": all_docs, "metadatas": metadatas}, docs_path)
print("βœ… Stories vector DB saved.")

if __name__ == "__main__":
main()
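
# A minimal retrieval-side sketch, shown commented out so the script's behavior
# is unchanged. It assumes the OUT_DIR layout written above and that queries are
# embedded and normalized with the same model; names here are illustrative.
#
#   def search_stories(query: str, k: int = 5):
#       index = faiss.read_index(os.path.join(OUT_DIR, "faiss.index"))
#       with open(os.path.join(OUT_DIR, "documents.pkl"), "rb") as f:
#           docs = pickle.load(f)["documents"]
#       embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
#       q = normalize(embedder.embed_query(query)).reshape(1, -1).astype("float32")
#       scores, ids = index.search(q, k)  # inner product == cosine on unit vectors
#       return [(float(s), docs[i]) for s, i in zip(scores[0], ids[0]) if i != -1]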