Spaces:
Sleeping
Sleeping
| import os | |
| from typing import Optional, List | |
| import shutil | |
| from zipfile import ZipFile | |
| from langchain_community.vectorstores import FAISS | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_core.embeddings import Embeddings | |
| from document_processor import DocumentProcessor | |
class VectorStoreManager:
    """Manage a FAISS vectorstore: creation, deletion, similarity search,
    source listing, text export, incremental updates, and zip download.

    NOTE(review): the persistence root is hard-coded to ``"database"`` rather
    than derived from ``self.path`` (which is only the *source-document*
    directory). That existing behavior is preserved, but the root is now
    centralized in ``_store_dir`` instead of being repeated per method.
    """

    # Root folder where per-database FAISS indexes are persisted.
    _DATABASE_ROOT = "database"
    # Scratch folder used for text exports and zip downloads.
    _TEMP_DIR = "temp"

    def __init__(self, path: str, name: str, embeddings: Embeddings):
        """
        Args:
            path: directory containing the source documents (usually the
                "database" directory where the databases are stored).
            name: name of the vectorstore (usually the name of the database
                that contains the documents).
            embeddings: embeddings model used to build and query the store.
        """
        self.path = path
        self.name = name
        self.embeddings = embeddings
        # Lazily populated; methods call load_vectorstore() on first use.
        self.vectorstore: Optional[FAISS] = None

    def _store_dir(self) -> str:
        """Return the on-disk folder where this vectorstore is persisted."""
        return os.path.join(self._DATABASE_ROOT, self.name)

    def _splitter(self) -> RecursiveCharacterTextSplitter:
        """Return the chunking strategy shared by create/add operations."""
        return RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=200, length_function=len
        )

    def create_vectorstore(self) -> bool:
        """Build a FAISS index from the documents under ``self.path`` and
        persist it to disk.

        Returns:
            True on success (exceptions from processing/embedding propagate).
        """
        documents = DocumentProcessor(self.path).files_to_texts()
        texts = self._splitter().split_documents(documents)
        self.vectorstore = FAISS.from_documents(
            documents=texts, embedding=self.embeddings
        )
        self.vectorstore.save_local(folder_path=self._store_dir())
        return True

    def delete_vectorstore(self) -> bool:
        """Delete the persisted index folder.

        Returns:
            True if the folder was removed, False if it did not exist.
        """
        try:
            shutil.rmtree(self._store_dir())
        except FileNotFoundError:
            return False
        return True

    def search_similarity(self, query: str, fuente: Optional[str] = None) -> str:
        """Search the vectorstore for documents similar to ``query``.

        Loads the persisted vectorstore on first use. When ``fuente`` is
        given, results are filtered to documents whose ``source`` metadata
        matches it.

        Args:
            query: free-text query.
            fuente: optional source filter.

        Returns:
            The string representation of a list of dicts with ``content``,
            ``title`` and ``source`` keys for the top-5 matches.
        """
        if not self.vectorstore:
            self.vectorstore = self.load_vectorstore()
        if fuente:
            docs = self.vectorstore.similarity_search(
                query=query, k=5, filter={"source": fuente}
            )
        else:
            docs = self.vectorstore.similarity_search(query=query, k=5)
        resultados = [
            {
                "content": doc.page_content,
                "title": doc.metadata.get("title", None),
                "source": doc.metadata.get("source", None),
            }
            for doc in docs
        ]
        return str(resultados)

    def list_sources(self) -> List[str]:
        """Return the distinct ``source`` metadata values stored in the index.

        NOTE(review): reads the private ``docstore._dict`` mapping — there is
        no public FAISS API for enumerating stored documents.
        """
        if not self.vectorstore:
            self.vectorstore = self.load_vectorstore()
        docstore_dict = self.vectorstore.docstore._dict
        # Order is unspecified (set-based), matching the original behavior.
        return list({
            document.metadata.get("source", None)
            for document in docstore_dict.values()
        })

    def extract_texts_by_source(self, source: str) -> List[str]:
        """Return the page contents of every chunk whose ``source`` metadata
        equals ``source``."""
        if not self.vectorstore:
            self.vectorstore = self.load_vectorstore()
        docstore_dict = self.vectorstore.docstore._dict
        return [
            document.page_content
            for document in docstore_dict.values()
            if document.metadata.get("source", None) == source
        ]

    def save_text_to_file_temp(self, source: str) -> bool:
        """Dump all chunk texts for ``source`` into ``temp/<source>.txt``.

        The ``temp`` folder is wiped and recreated on every call. Path
        separators in ``source`` are replaced so the file lands directly
        inside ``temp``.

        Returns:
            True on success, False on any filesystem error.
        """
        texts = self.extract_texts_by_source(source)
        safe_name = source.replace("\\", "_").replace("/", "_")
        file_path = os.path.join(self._TEMP_DIR, safe_name + ".txt")
        try:
            if os.path.exists(self._TEMP_DIR):
                shutil.rmtree(self._TEMP_DIR)
            os.makedirs(self._TEMP_DIR)
            with open(file_path, "w", encoding="utf-8") as file:
                for text in texts:
                    file.write(text)
                    file.write("\n")
            return True
        except OSError:
            # Best-effort contract: filesystem failures report False.
            # Narrowed from bare Exception so programming errors surface.
            return False

    def load_vectorstore(self) -> FAISS:
        """Load the persisted FAISS index for this vectorstore from disk.

        ``allow_dangerous_deserialization`` is required because FAISS
        persistence uses pickle; only load indexes this app created itself.
        """
        return FAISS.load_local(
            folder_path=self._store_dir(),
            embeddings=self.embeddings,
            allow_dangerous_deserialization=True,
        )

    def add_files_vectorstore(self) -> Optional[FAISS]:
        """Ingest any documents found in the ``docs`` folder into the
        existing index and persist it.

        Returns:
            The updated vectorstore, or None when the ``docs`` folder was
            missing (it is created for next time) or contained no documents.
        """
        docs_folder = "docs"
        if not os.path.exists(docs_folder):
            os.makedirs(docs_folder)
            return None
        documents = DocumentProcessor(docs_folder).files_to_texts()
        if not documents:
            return None
        texts = self._splitter().split_documents(documents)
        self.vectorstore = self.load_vectorstore()
        self.vectorstore.add_documents(documents=texts)
        self.vectorstore.save_local(folder_path=self._store_dir())
        return self.vectorstore

    def download_vectorstore(self) -> str:
        """Zip the persisted vectorstore folder into ``temp/vectorstore.zip``.

        Returns:
            The path of the created zip file.
        """
        # Fix: ensure the temp folder exists before writing into it
        # (previously raised FileNotFoundError on a fresh run).
        os.makedirs(self._TEMP_DIR, exist_ok=True)
        zip_path = os.path.join(self._TEMP_DIR, "vectorstore.zip")
        # `archive` instead of `zip` to avoid shadowing the builtin.
        with ZipFile(zip_path, "w") as archive:
            for root, _dirs, files in os.walk(self._store_dir()):
                for file in files:
                    # Entries keep their full "database/<name>/..." paths,
                    # matching the original archive layout.
                    archive.write(os.path.join(root, file))
        return zip_path