Spaces:

devjhawar
/

KLypse

Sleeping

DEVJHAWAR11

Deploy Klypse backend

54bef2f 7 months ago

13.9 kB

	# app/storage/vector_store.py

	from langchain_community.vectorstores import FAISS
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from app.services.embeddings import get_embeddings
	from app.config import config
	import os
	import re

	# ---- CLEAN TRANSCRIPT UTILS ----

	# ...existing code...
	import logging
	from typing import Any, Dict, List, Optional, Sequence

	# Module-level logger named after this module; VectorStore uses it for error and debug reporting.
	logger = logging.getLogger(__name__)


	class VectorStore:
	    """
	    Generic wrapper around an underlying vector DB client.

	    - Call `add_embeddings` to persist vectors.
	    - Call `search` to retrieve nearest neighbors.

	    Search results are normalized to dicts with the keys 'id', 'score',
	    'text' and 'meta', and are deduplicated (order preserved).
	    Adapt client initialization to your project's real client in __init__.
	    """

	    def __init__(self, client: Optional[Any] = None, namespace: Optional[str] = None):
	        """
	        If `client` is provided, this wrapper will delegate to it.
	        Otherwise you must set `self._client` later to an object exposing compatible methods.

	        `namespace` is forwarded to client calls that accept a `namespace` keyword.
	        """
	        self._client = client
	        self.namespace = namespace

	    # -- Helper: dedupe results preserving order --------------------------------
	    @staticmethod
	    def _dedupe_results(results: Sequence[Dict], key_fields: Optional[Sequence[str]] = None, top_k: Optional[int] = None) -> List[Dict]:
	        """
	        Deduplicate a sequence of results, keeping the first occurrence of each key.

	        Key selection for dict results:
	          - if `key_fields` is given: the tuple of those fields' values;
	          - otherwise result['id'], else result['meta']['chunk_id'], else result['text'].
	        Non-dict results are keyed by their `str()` form.

	        A result for which no key can be derived is always kept: there is nothing
	        to compare it by, so treating all such results as duplicates of each other
	        would silently drop data. Results with unhashable keys are kept as well.

	        Returns at most `top_k` items when `top_k` is a positive number.
	        """
	        def _is_missing(value):
	            # None and the empty string carry no identity; 0 is a valid id.
	            return value is None or (isinstance(value, str) and not value)

	        def _key_for(result):
	            if not isinstance(result, dict):
	                return str(result)
	            if key_fields:
	                parts = tuple(result.get(name) for name in key_fields)
	                return None if all(part is None for part in parts) else parts
	            candidate = result.get("id")
	            if _is_missing(candidate):
	                meta = result.get("meta")
	                candidate = meta.get("chunk_id") if isinstance(meta, dict) else None
	            if _is_missing(candidate):
	                candidate = result.get("text")
	            return None if _is_missing(candidate) else candidate

	        seen = set()
	        unique = []
	        for result in results:
	            key = _key_for(result)
	            if key is not None:
	                try:
	                    if key in seen:
	                        continue
	                    seen.add(key)
	                except TypeError:
	                    # Unhashable key (e.g. a list-valued field): it cannot be
	                    # tracked in the set, so keep the result rather than crash.
	                    pass
	            unique.append(result)
	            if top_k and len(unique) >= top_k:
	                break
	        return unique

	    # -- Add embeddings ----------------------------------------------------------
	    def add_embeddings(self, ids: Sequence[str], vectors: Sequence[Sequence[float]], metadatas: Optional[Sequence[Dict]] = None):
	        """
	        Persist embeddings into the underlying client.

	        Expects:
	          ids: list of string ids (eg. chunk ids)
	          vectors: list of numeric vectors aligned with ids
	          metadatas: optional list of metadata dicts aligned with ids

	        The first method the client exposes is used, in this order:
	        `upsert` (keyword arguments, with namespace), `add` (positional),
	        `persist` (keyword arguments).

	        Raises:
	          RuntimeError: no client is configured, or the client exposes none of
	            the supported methods.
	          Exception: whatever the client raises is logged and re-raised.
	        """
	        if self._client is None:
	            raise RuntimeError("VectorStore client not configured")

	        client = self._client
	        try:
	            if hasattr(client, "upsert"):
	                # chroma-like / vectordb clients
	                client.upsert(ids=ids, embeddings=vectors, metadatas=metadatas, namespace=self.namespace)
	                return
	            if hasattr(client, "add"):
	                # faiss/persisted-store wrappers
	                client.add(ids, vectors, metadatas)
	                return
	            if hasattr(client, "persist"):
	                # Fallback: generic attribute name
	                client.persist(ids=ids, vectors=vectors, metadatas=metadatas)
	                return
	        except Exception:
	            logger.exception("Failed to add embeddings to vector store")
	            raise

	        raise RuntimeError("Underlying client does not expose a supported add/upsert API")

	    # -- Search / similarity retrieval -------------------------------------------
	    def search(self, query_vector: Sequence[float], top_k: int = 10, filter: Optional[Dict] = None) -> List[Dict]:
	        """
	        Search the vector DB. Returns a list of result dicts:
	          [{"id": <id>, "score": <score>, "text": <text>, "meta": {...}}, ...]

	        The client is probed for `query`, then `search`, then
	        `get_nearest_neighbors`. Results are normalized, deduplicated
	        (by id/chunk_id/text) preserving order, and capped to `top_k`.

	        Raises:
	          RuntimeError: no client is configured, or the client exposes none of
	            the supported methods.
	          Exception: whatever the client raises is logged and re-raised.
	        """
	        if self._client is None:
	            raise RuntimeError("VectorStore client not configured")

	        client = self._client
	        try:
	            if callable(getattr(client, "query", None)):
	                # chroma-like: client.query(query_embeddings=[query_vector], n_results=top_k, where=filter)
	                # Only the call itself is guarded, so that a TypeError raised
	                # elsewhere can never trigger a second query.
	                try:
	                    resp = client.query(query_embeddings=[query_vector], n_results=top_k, where=filter, namespace=self.namespace)
	                except TypeError:
	                    # fallback signature
	                    resp = client.query(query_vector, top_k)
	            elif callable(getattr(client, "search", None)):
	                # FAISS/other wrappers: client.search(query_vector, top_k, filter=...)
	                resp = client.search(query_vector, top_k, filter=filter, namespace=self.namespace)
	            elif hasattr(client, "get_nearest_neighbors"):
	                resp = client.get_nearest_neighbors(query_vector, top_k)
	            else:
	                raise RuntimeError("Underlying client does not expose a supported search/query API")
	            raw_results = self._normalize_query_response(resp)
	        except Exception:
	            logger.exception("Vector store search failed")
	            raise

	        # Defensive: the normalizer returns a list, but tolerate other iterables.
	        if not isinstance(raw_results, list):
	            logger.debug("Normalizing single search response to list")
	            raw_results = list(raw_results) if raw_results is not None else []

	        # Deduplicate results preserving order and cap to top_k
	        return self._dedupe_results(raw_results, top_k=top_k)

	    # -- Response normalization -------------------------------------------------
	    @staticmethod
	    def _normalize_query_response(resp: Any) -> List[Dict]:
	        """
	        Convert common response formats into a list of dicts with keys:
	          'id', 'score', 'text', 'meta'

	        Recognized shapes:
	          - dict with an 'ids' list (chroma-like column format, optionally nested
	            one level per query; only the first query's results are used);
	          - dict with a 'results' list of record dicts;
	          - list/tuple of record dicts, of (id, score[, text[, meta]]) sequences,
	            or of arbitrary items (wrapped using their `str()` form).
	        Anything else is wrapped as a single text-only result. `None` yields [].

	        A score or distance of 0 is preserved (only None counts as absent).
	        A failed normalization attempt never leaks partially built output.
	        """
	        if resp is None:
	            return []

	        def _score_of(record):
	            # Compare against None explicitly: 0.0 is a valid score/distance.
	            score = record.get("score")
	            return record.get("distance") if score is None else score

	        def _unnest(column):
	            # chroma returns lists of lists when multiple queries are provided
	            if column and isinstance(column[0], list):
	                return column[0]
	            return column

	        def _at(column, index, default):
	            # Bounds-checked lookup: the columns may be absent or shorter than ids.
	            if column and index < len(column) and column[index] is not None:
	                return column[index]
	            return default

	        if isinstance(resp, dict):
	            try:
	                # chroma-like: resp['ids'], resp['distances'], resp['metadatas'], resp['documents']
	                if isinstance(resp.get("ids"), list):
	                    ids = _unnest(resp["ids"])
	                    docs = _unnest(resp.get("documents"))
	                    metas = _unnest(resp.get("metadatas"))
	                    dists = _unnest(resp.get("distances"))
	                    return [
	                        {
	                            "id": idv,
	                            "score": _at(dists, i, None),
	                            "text": _at(docs, i, None),
	                            "meta": _at(metas, i, {}),
	                        }
	                        for i, idv in enumerate(ids)
	                    ]

	                # If resp contains 'results' key that is a list
	                if isinstance(resp.get("results"), list):
	                    return [
	                        {
	                            "id": r.get("id"),
	                            "score": _score_of(r),
	                            "text": r.get("document") or r.get("text") or r.get("content"),
	                            "meta": r.get("metadata") or r.get("meta") or {},
	                        }
	                        for r in resp["results"]
	                    ]
	            except Exception:
	                logger.debug("Chroma-like normalization failed, trying other formats", exc_info=True)

	        elif isinstance(resp, (list, tuple)):
	            try:
	                normalized = []
	                for item in resp:
	                    if isinstance(item, dict):
	                        normalized.append(
	                            {
	                                "id": item.get("id"),
	                                "score": _score_of(item),
	                                "text": item.get("text") or item.get("document") or item.get("content"),
	                                "meta": item.get("meta") or item.get("metadata") or {},
	                            }
	                        )
	                    elif isinstance(item, (list, tuple)) and len(item) >= 2:
	                        # (id, score), (id, score, text) or (id, score, text, meta)
	                        normalized.append(
	                            {
	                                "id": item[0],
	                                "score": item[1],
	                                "text": item[2] if len(item) > 2 else None,
	                                "meta": item[3] if len(item) > 3 else {},
	                            }
	                        )
	                    else:
	                        normalized.append({"id": None, "score": None, "text": str(item), "meta": {}})
	                return normalized
	            except Exception:
	                logger.debug("Iterable normalization failed", exc_info=True)

	        # Last resort: wrap the resp as single result with text representation
	        try:
	            return [{"id": None, "score": None, "text": str(resp), "meta": {}}]
	        except Exception:
	            return []
	# ...existing code...

	def remove_double_words(text):
	    """
	    Collapse runs of a consecutively repeated word into a single occurrence.

	    Matching is case-insensitive and whole-word; the first occurrence's casing
	    is kept ("The the dog" -> "The dog"). Runs of any length are collapsed in
	    one pass ("the the the" -> "the"), which a pair-only pattern cannot do
	    because re.sub never rescans text it has already consumed.
	    """
	    # (?:\s+\1\b)+ consumes every further repetition of the captured word.
	    return re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', text, flags=re.IGNORECASE)

	def clean_transcript(text):
	    """
	    Flatten a multi-line transcript into one space-separated string.

	    Each line is stripped; blank lines are dropped; a line equal to the
	    previously kept line (before or after repeated-word removal) is dropped.
	    Surviving lines have consecutive repeated words removed.
	    """
	    kept = []
	    last_kept = None

	    for raw_line in text.split('\n'):
	        stripped = raw_line.strip()
	        # Keep only non-blank lines that differ from the last kept line.
	        if stripped and stripped != last_kept:
	            candidate = remove_double_words(stripped)
	            # The line may only become a duplicate once its repeats are removed.
	            if candidate != last_kept:
	                kept.append(candidate)
	                last_kept = candidate

	    return ' '.join(kept)

	# ---- VECTORSTORE FUNCTIONS ----

	# Object returned by get_embeddings(); created once at import time and passed to
	# every FAISS call in this module.
	_embeddings = get_embeddings()
	# FAISS directory derived from the Chroma path setting by swapping the substring
	# "chroma" for "faiss". NOTE: str.replace rewrites every occurrence, and leaves the
	# path unchanged when "chroma" does not appear in it.
	FAISS_INDEX_PATH = config.CHROMA_DB_PATH.replace("chroma", "faiss")
	# Import-time side effect: make sure the index directory exists.
	os.makedirs(FAISS_INDEX_PATH, exist_ok=True)

	# Module-level cache for the shared index; filled lazily by get_vectorstore()
	# and reset to None by clear_vectorstore().
	_vectorstore = None

	def get_vectorstore():
	    """
	    Return the shared FAISS vectorstore, building it on first use.

	    When FAISS_INDEX_PATH contains "index.faiss", the index is loaded from
	    disk; if loading fails, or no index file exists, a fresh index seeded
	    with the single text "initialization" is created instead. The result is
	    cached in the module-level `_vectorstore`.

	    NOTE(review): allow_dangerous_deserialization=True lets load_local
	    unpickle the stored index; this is only safe while the files under
	    FAISS_INDEX_PATH are trusted.
	    """
	    global _vectorstore

	    # Already built: reuse the cached instance.
	    if _vectorstore is not None:
	        return _vectorstore

	    faiss_file = os.path.join(FAISS_INDEX_PATH, "index.faiss")

	    # Nothing on disk yet: start from a seeded in-memory index.
	    if not os.path.exists(faiss_file):
	        _vectorstore = FAISS.from_texts(["initialization"], _embeddings)
	        print(f"✓ Created new FAISS index at {FAISS_INDEX_PATH}")
	        return _vectorstore

	    try:
	        _vectorstore = FAISS.load_local(
	            FAISS_INDEX_PATH,
	            _embeddings,
	            allow_dangerous_deserialization=True
	        )
	        print(f"✓ Loaded existing FAISS index from {FAISS_INDEX_PATH}")
	    except Exception as load_error:
	        # Unreadable index: report it and fall back to a fresh one.
	        print(f"⚠ Could not load existing index: {load_error}")
	        _vectorstore = FAISS.from_texts(["initialization"], _embeddings)

	    return _vectorstore

	def add_to_vectorstore(texts):
	    """
	    Add texts to the shared FAISS vectorstore and save the index to disk.

	    `texts` may be a single string (treated as one document rather than being
	    split into characters) or any iterable of strings. Empty input (None, "",
	    an empty collection or an exhausted iterator) is a no-op: nothing is
	    embedded and the index on disk is left untouched.
	    """
	    if not texts:
	        return

	    # A bare str is itself an iterable of characters; wrap it so it is indexed whole.
	    texts = [texts] if isinstance(texts, str) else list(texts)
	    if not texts:
	        return

	    vectorstore = get_vectorstore()
	    vectorstore.add_texts(texts)
	    vectorstore.save_local(FAISS_INDEX_PATH)
	    print(f"✓ Added {len(texts)} texts to FAISS and saved to disk")

	def clear_vectorstore():
	    """
	    Delete the persisted shared index and drop the in-memory instance.

	    Removes "index.faiss" and "index.pkl" from FAISS_INDEX_PATH when they
	    exist, then resets the module-level `_vectorstore` to None so that the
	    next get_vectorstore() call builds a new index.
	    """
	    global _vectorstore

	    for filename in ("index.faiss", "index.pkl"):
	        target = os.path.join(FAISS_INDEX_PATH, filename)
	        if os.path.exists(target):
	            os.remove(target)

	    _vectorstore = None
	    print("✓ Cleared FAISS vectorstore")

	def load_vectorstore_for_video(video_id: str):
	    """
	    Load the FAISS vectorstore saved under ./data/faiss/<video_id>/.

	    Raises:
	        FileNotFoundError: the directory for `video_id` does not exist.

	    NOTE(review): allow_dangerous_deserialization=True lets load_local
	    unpickle the stored index; this is only safe while the files under
	    ./data/faiss/ are trusted.
	    """
	    index_dir = f"./data/faiss/{video_id}/"

	    if os.path.exists(index_dir):
	        return FAISS.load_local(
	            index_dir,
	            _embeddings,
	            allow_dangerous_deserialization=True
	        )

	    raise FileNotFoundError(f"No vectorstore found for video ID: {video_id}")

	def create_vectorstore_for_video(video_id: str, transcript: str, chunk_size: int = 1000, chunk_overlap: int = 200):
	    """
	    Build a FAISS vectorstore from a video transcript and save it to disk.

	    The transcript is cleaned with clean_transcript(), split into overlapping
	    chunks, embedded, and saved under ./data/faiss/<video_id>/.

	    Args:
	        video_id: identifier used for the on-disk directory name.
	        transcript: raw transcript text.
	        chunk_size: maximum characters per chunk (default 1000).
	        chunk_overlap: characters shared between neighbouring chunks (default 200).

	    Returns:
	        The newly created vectorstore.

	    Raises:
	        ValueError: the transcript yields no chunks after cleaning. Without
	            this check the failure would surface as an obscure error inside
	            FAISS.from_texts, which cannot build an index from zero texts.
	    """
	    # Clean the transcript before processing
	    transcript = clean_transcript(transcript)

	    # Split transcript into chunks
	    text_splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	        length_function=len
	    )
	    chunks = text_splitter.split_text(transcript)

	    if not chunks:
	        raise ValueError(f"Transcript for video {video_id} is empty after cleaning; nothing to index")

	    # Create vectorstore from chunks
	    vectorstore = FAISS.from_texts(
	        texts=chunks,
	        embedding=_embeddings
	    )

	    # Save to disk
	    path = f"./data/faiss/{video_id}/"
	    os.makedirs(path, exist_ok=True)
	    vectorstore.save_local(path)

	    print(f"✓ Created and saved vectorstore for video {video_id} with {len(chunks)} chunks (cleaned)")
	    return vectorstore