"""
Vector Database integration (using ChromaDB)
"""
from typing import List, Dict, Optional, Any

import chromadb
from chromadb.config import Settings
from loguru import logger
from pathlib import Path
class VectorStore:
    """Vector store backed by a persistent ChromaDB collection.

    Wraps collection creation, batched document insertion, and
    similarity search over embedded document chunks.
    """

    # Single source of truth for the collection metadata, so __init__ and
    # reset_collection() cannot drift apart.
    _COLLECTION_METADATA = {"description": "Financial and Economics research papers"}

    def __init__(
        self,
        persist_directory: str = "./data/chroma_db",
        collection_name: str = "financial_papers"
    ):
        """
        Args:
            persist_directory: Path where ChromaDB persists its data.
            collection_name: Name of the collection to create or reuse.
        """
        self.persist_directory = Path(persist_directory)
        self.collection_name = collection_name

        # Make sure the storage directory exists before the client touches it.
        self.persist_directory.mkdir(parents=True, exist_ok=True)

        # Initialize the persistent ChromaDB client.
        logger.info(f"Initializing ChromaDB at {persist_directory}")
        self.client = chromadb.PersistentClient(
            path=str(self.persist_directory)
        )

        # Create the collection on first use; reuse it on later runs.
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            metadata=self._COLLECTION_METADATA
        )
        logger.info(f"Collection '{collection_name}' ready. Current count: {self.collection.count()}")

    def add_documents(
        self,
        chunks: List[Dict[str, Any]],
        embeddings: List[List[float]]
    ) -> None:
        """
        Add document chunks and their embeddings to the vector store.

        Args:
            chunks: Chunk dicts; each must carry 'text', 'metadata',
                'source_filename', 'source_filepath', 'chunk_id',
                'total_chunks' and 'page_count'.
            embeddings: One embedding vector per chunk, same order as chunks.

        Raises:
            ValueError: If chunks and embeddings differ in length.
        """
        if len(chunks) != len(embeddings):
            raise ValueError("Number of chunks and embeddings must match")

        logger.info(f"Adding {len(chunks)} documents to vector store...")

        # Convert to the parallel-list format ChromaDB expects.
        # IDs combine filename and chunk index so they stay unique per file.
        ids = [f"{chunk['source_filename']}_{chunk['chunk_id']}" for chunk in chunks]
        documents = [chunk['text'] for chunk in chunks]
        # ChromaDB metadata values must be scalars, hence the str() coercions.
        metadatas = [
            {
                'source_filename': chunk['source_filename'],
                'source_filepath': chunk['source_filepath'],
                'chunk_id': str(chunk['chunk_id']),
                'total_chunks': str(chunk['total_chunks']),
                'title': chunk['metadata'].get('title', ''),
                'author': chunk['metadata'].get('author', ''),
                'page_count': str(chunk['page_count'])
            }
            for chunk in chunks
        ]

        # Insert in batches to keep individual requests small.
        batch_size = 100
        total_batches = (len(chunks) + batch_size - 1) // batch_size
        for i in range(0, len(chunks), batch_size):
            batch_end = min(i + batch_size, len(chunks))
            self.collection.add(
                ids=ids[i:batch_end],
                embeddings=embeddings[i:batch_end],
                documents=documents[i:batch_end],
                metadatas=metadatas[i:batch_end]
            )
            logger.info(f"Added batch {i // batch_size + 1}/{total_batches}")

        logger.info(f"Successfully added {len(chunks)} documents. Total in collection: {self.collection.count()}")

    @staticmethod
    def _unwrap_query_results(results: Dict[str, Any]) -> Dict[str, Any]:
        """Flatten ChromaDB's per-query nested lists for a single-query result."""
        return {
            'documents': results['documents'][0] if results['documents'] else [],
            'metadatas': results['metadatas'][0] if results['metadatas'] else [],
            'distances': results['distances'][0] if results['distances'] else [],
            'ids': results['ids'][0] if results['ids'] else []
        }

    def search(
        self,
        query_embedding: List[float],
        top_k: int = 5,
        filter_metadata: Optional[Dict[str, str]] = None
    ) -> Dict[str, Any]:
        """
        Run a vector similarity search.

        Args:
            query_embedding: Embedding vector of the query.
            top_k: Number of results to return.
            filter_metadata: Optional metadata filter (ChromaDB 'where' clause).

        Returns:
            Dict with 'documents', 'metadatas', 'distances' and 'ids' lists.
        """
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k,
            where=filter_metadata
        )
        return self._unwrap_query_results(results)

    def search_by_text(
        self,
        query_text: str,
        top_k: int = 5,
        filter_metadata: Optional[Dict[str, str]] = None
    ) -> Dict[str, Any]:
        """
        Search by raw text (ChromaDB embeds the query automatically).

        Args:
            query_text: Query text to search for.
            top_k: Number of results to return.
            filter_metadata: Optional metadata filter (ChromaDB 'where' clause).

        Returns:
            Dict with 'documents', 'metadatas', 'distances' and 'ids' lists.
        """
        results = self.collection.query(
            query_texts=[query_text],
            n_results=top_k,
            where=filter_metadata
        )
        return self._unwrap_query_results(results)

    def get_collection_stats(self) -> Dict[str, Any]:
        """Return basic statistics about the collection."""
        # NOTE: the previous version also called collection.peek(limit=1)
        # into an unused local; that dead DB call has been removed.
        count = self.collection.count()
        return {
            'collection_name': self.collection_name,
            'total_documents': count,
            'persist_directory': str(self.persist_directory),
            'has_data': count > 0
        }

    def delete_collection(self) -> None:
        """Delete the collection (warning: removes all stored data)."""
        logger.warning(f"Deleting collection '{self.collection_name}'")
        self.client.delete_collection(name=self.collection_name)
        logger.info("Collection deleted")

    def reset_collection(self) -> None:
        """Reset the collection: delete it, then recreate it empty."""
        self.delete_collection()
        self.collection = self.client.get_or_create_collection(
            name=self.collection_name,
            metadata=self._COLLECTION_METADATA
        )
        logger.info("Collection reset")