# create_faiss_from_supabase_stories.py
import os
import time
import pickle
import requests
import numpy as np
import faiss
from typing import List, Dict, Any
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings  # newer releases also ship this class in langchain_huggingface
# --- CONFIG (read from env or hardcode for local testing) ---
SUPABASE_URL = os.environ["SUPABASE_URL"] # e.g. https://xxxx.supabase.co
SUPABASE_SERVICE_KEY = os.environ["SUPABASE_SERVICE_KEY"] # service key (server-side)
OUT_DIR = os.environ.get("STORIES_VS_OUT", "./stories_vectorstore")
EMBED_MODEL_NAME = os.environ.get("EMBED_MODEL", "intfloat/e5-large-v2")
HEADERS = {
"apikey": SUPABASE_SERVICE_KEY,
"Authorization": f"Bearer {SUPABASE_SERVICE_KEY}",
"Content-Type": "application/json",
}
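
# For local testing the config can be exported up front; the values below are
# placeholders, not real credentials:
#
#   export SUPABASE_URL="https://xxxx.supabase.co"
#   export SUPABASE_SERVICE_KEY="<service-role-key>"
#   export STORIES_VS_OUT="./stories_vectorstore"
#   export EMBED_MODEL="intfloat/e5-large-v2"
#   python create_faiss_from_supabase_stories.py
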
def fetch_all_stories() -> List[Dict[str, Any]]:
url = f"{SUPABASE_URL}/rest/v1/stories"
params = {
"select": "id,handle,title,character_names,body,moral,maxim,topic_primary,created_at,character_id",
"limit": "10000",
}
r = requests.get(url, headers=HEADERS, params=params, timeout=(20, 60))
r.raise_for_status()
rows = r.json() or []
print(f"πŸ“₯ Downloaded {len(rows)} stories from Supabase.")
return rows
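
# A paginated variant, sketched here as a fallback in case the table outgrows a
# single response. It assumes the same SUPABASE_URL/HEADERS and relies only on
# PostgREST's standard limit/offset/order query params; main() does not call it.
def fetch_all_stories_paginated(page_size: int = 1000) -> List[Dict[str, Any]]:
    url = f"{SUPABASE_URL}/rest/v1/stories"
    rows: List[Dict[str, Any]] = []
    offset = 0
    while True:
        params = {
            "select": "id,handle,title,character_names,body,moral,maxim,topic_primary,created_at,character_id",
            "order": "created_at.asc",  # stable ordering keeps pages consistent
            "limit": str(page_size),
            "offset": str(offset),
        }
        r = requests.get(url, headers=HEADERS, params=params, timeout=(20, 60))
        r.raise_for_status()
        page = r.json() or []
        rows.extend(page)
        if len(page) < page_size:  # a short page means we reached the end
            break
        offset += page_size
    print(f"📥 Downloaded {len(rows)} stories from Supabase (paginated).")
    return rows
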
def story_to_documents(story: Dict[str, Any]) -> List[Document]:
"""
Split 'body' into chunks and add dedicated chunks for 'moral' and 'maxim'.
Prepend a small header so names/topics are searchable semantically.
"""
sid = story["id"]
title = story.get("title", "") or ""
chars = story.get("character_names") or []
topic = story.get("topic_primary", "") or ""
handle = story.get("handle", "") or ""
created = story.get("created_at", "") or ""
body = story.get("body", "") or ""
    moral = story.get("moral", "") or ""
    maxim = story.get("maxim", "") or ""
char_id = story.get("character_id", "socrates") or "socrates"
header = (
f"Title: {title}\n"
f"Character: {char_id}\n"
f"Characters: {', '.join(chars) if chars else '(unspecified)'}\n"
f"Topic: {topic}\n"
f"Handle: {handle}\n\n"
)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200, chunk_overlap=120,
        # No "" fallback separator: a run of text longer than chunk_size that
        # contains none of these separators may be kept as one oversized chunk.
        separators=["\n\n", "\n", ". ", "! ", "? "]
    )
docs: List[Document] = []
# Body chunks
    for i, chunk_text in enumerate(splitter.split_text(body)):
if not chunk_text.strip():
continue
docs.append(
Document(
page_content=header + chunk_text.strip(),
metadata={
"story_id": sid, "title": title, "character_names": chars,
"topic_primary": topic, "handle": handle, "created_at": created,
"character_id": char_id, "chunk_id": i, "kind": "body"
},
)
)
    # Moral / maxim as dedicated tiny chunks (they rank well for questions about a story's lesson)
if moral:
docs.append(
Document(
page_content=header + f"Moral: {moral}",
metadata={
"story_id": sid, "title": title, "character_names": chars,
"topic_primary": topic, "handle": handle, "created_at": created,
"character_id": char_id, "chunk_id": -1, "kind": "moral"
},
)
)
if maxim:
docs.append(
Document(
page_content=header + f"Maxim: {maxim}",
metadata={
"story_id": sid, "title": title, "character_names": chars,
"topic_primary": topic, "handle": handle, "created_at": created,
"character_id": char_id, "chunk_id": -2, "kind": "maxim"
},
)
)
return docs
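
# Illustrative example of the layout this produces (toy values, not real data):
#
#   story = {"id": 1, "title": "The Ring", "body": "…", "moral": "Know thyself."}
#   docs = story_to_documents(story)
#   # docs[0].page_content  -> header block + first body chunk, metadata["kind"] == "body"
#   # docs[-1].page_content -> header block + "Moral: Know thyself.", metadata["kind"] == "moral"
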

def normalize(v) -> np.ndarray:
    """L2-normalize a vector so inner-product search behaves like cosine similarity."""
    arr = np.array(v, dtype=np.float32)
    norm = np.linalg.norm(arr)
    return arr / norm if norm > 0 else arr

def save_pickle(obj: Any, path: str) -> None:
with open(path, "wb") as f:
pickle.dump(obj, f)

def embed_texts(texts: List[str], model: HuggingFaceEmbeddings) -> np.ndarray:
    """Embed texts in batches and L2-normalize every vector."""
    batch_size = 64
    vecs: List[np.ndarray] = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        print(f"🧠 Embedding batch {i // batch_size + 1} ({len(batch)} chunks)…")
        emb = model.embed_documents(batch)
        vecs.extend([normalize(v) for v in emb])
        time.sleep(0.2)  # brief pause between batches; optional for a local model
    # Empty input yields a zero-row array; main() exits when nothing was embedded,
    # so no embedding dimension needs to be assumed here.
    return np.vstack(vecs) if vecs else np.zeros((0, 0), dtype=np.float32)
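
# Note: e5-family models are trained with "query: " / "passage: " prefixes,
# which this script does not add. A minimal sketch of how prefixing could be
# bolted on (an assumption about usage, not part of the current pipeline):
#
#   passages = [f"passage: {t}" for t in texts]
#   vectors = embed_texts(passages, embedder)
#   query_vec = normalize(embedder.embed_query("query: what does courage require?"))
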
def main():
os.makedirs(OUT_DIR, exist_ok=True)
stories = fetch_all_stories()
if not stories:
print("❌ No stories found. Exiting.")
return
# Build Documents
all_docs: List[Document] = []
for s in stories:
all_docs.extend(story_to_documents(s))
print(f"🧩 Built {len(all_docs)} story chunks (body/moral/maxim).")
texts = [d.page_content for d in all_docs]
metadatas = [d.metadata for d in all_docs]
# Embeddings
print(f"πŸ”§ Loading embedding model: {EMBED_MODEL_NAME}")
embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
vectors = embed_texts(texts, embedder)
if vectors.shape[0] == 0:
print("❌ No vectors embedded. Exiting.")
return
# FAISS (Inner Product on normalized vectors)
dim = vectors.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(vectors.astype("float32"))
print(f"πŸ“¦ FAISS index built with {index.ntotal} vectors (dim={dim}).")
# Save locally
faiss_path = os.path.join(OUT_DIR, "faiss.index")
docs_path = os.path.join(OUT_DIR, "documents.pkl")
faiss.write_index(index, faiss_path)
save_pickle({"documents": all_docs, "metadatas": metadatas}, docs_path)
print("βœ… Stories vector DB saved.")

if __name__ == "__main__":
main()
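
# A minimal retrieval-side sketch, shown commented out so the script's behavior
# is unchanged. It assumes the OUT_DIR layout written above and that queries are
# embedded and normalized with the same model; names here are illustrative.
#
#   def search_stories(query: str, k: int = 5):
#       index = faiss.read_index(os.path.join(OUT_DIR, "faiss.index"))
#       with open(os.path.join(OUT_DIR, "documents.pkl"), "rb") as f:
#           docs = pickle.load(f)["documents"]
#       embedder = HuggingFaceEmbeddings(model_name=EMBED_MODEL_NAME)
#       q = normalize(embedder.embed_query(query)).reshape(1, -1).astype("float32")
#       scores, ids = index.search(q, k)  # inner product == cosine on unit vectors
#       return [(float(s), docs[i]) for s, i in zip(scores[0], ids[0]) if i != -1]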