# rag_system.py — RAG (retrieval-augmented generation) system for lazzloe.com.
# Provenance: uploaded by mmcc007 via huggingface_hub (commit 748113b, verified).
import openai
import faiss
import numpy as np
import logging
from sklearn.metrics.pairwise import cosine_similarity
# Configure logging
logger = logging.getLogger(__name__)
class RAGSystem:
    """Retrieval-augmented generation helper.

    Splits scraped page text into chunks, embeds them with the OpenAI
    embeddings API, stores the vectors in a FAISS L2 index, and answers
    queries with the nearest-neighbour chunks (content + source URL).
    """

    def __init__(self, model_name="text-embedding-ada-002"):
        """Create the OpenAI client and an empty (unbuilt) index.

        Args:
            model_name: OpenAI embedding model name, used for both the
                document chunks and the queries — they must match for the
                L2 distances to be meaningful.
        """
        self.client = openai.OpenAI()
        self.model_name = model_name
        self.index = None       # faiss.IndexFlatL2, built by store_embeddings_in_faiss
        self.faiss_data = []    # row-aligned with index: {'embedding', 'content', 'url'}
        logger.info("RAGSystem initialized with model: %s", model_name)

    def split_into_chunks(self, page_data, max_chunk_size=500):
        """Split page paragraphs into chunks of at most max_chunk_size chars.

        Args:
            page_data: iterable of {'url': str, 'paragraphs': [str, ...]}
                (assumed shape from usage — TODO confirm against the scraper).
            max_chunk_size: hard cap on chunk length, in characters.

        Returns:
            list of {'content': str, 'url': str} dicts, in page order.
        """
        logger.info("Splitting data into chunks with max size: %s", max_chunk_size)
        chunks = []
        for page in page_data:
            url = page['url']
            for paragraph in page['paragraphs']:
                if len(paragraph) <= max_chunk_size:
                    chunks.append({'content': paragraph, 'url': url})
                else:
                    # Break long paragraphs into fixed-size character slices.
                    for start in range(0, len(paragraph), max_chunk_size):
                        chunks.append({
                            'content': paragraph[start:start + max_chunk_size],
                            'url': url,
                        })
        logger.debug("Created %d chunks", len(chunks))
        return chunks

    def compute_embeddings(self, text_chunks):
        """Embed all chunk contents in one batched API call.

        Args:
            text_chunks: list of {'content': str, ...} dicts.

        Returns:
            list of embedding vectors (list[float]), parallel to text_chunks.
        """
        logger.info("Computing embeddings for %d chunks", len(text_chunks))
        texts = [chunk['content'] for chunk in text_chunks]
        response = self.client.embeddings.create(model=self.model_name, input=texts)
        embeddings = [result.embedding for result in response.data]
        logger.debug("Computed %d embeddings", len(embeddings))
        return embeddings

    def store_embeddings_in_faiss(self, embeddings, text_chunks):
        """(Re)build the FAISS index and the row-aligned metadata list.

        Fixes two defects of the original implementation: (1) faiss_data
        was appended to while the index was rebuilt from scratch, so a
        second ingestion left stale metadata misaligned with index rows —
        both are now reset together; (2) an empty embeddings list crashed
        on embeddings[0]. Vectors are also added in a single batched call
        rather than one add() per row.

        Args:
            embeddings: list of embedding vectors, parallel to text_chunks.
            text_chunks: list of {'content': str, 'url': str} dicts.
        """
        logger.info("Storing embeddings in FAISS index")
        if not embeddings:
            # Nothing to index; leave a consistent empty state.
            self.index = None
            self.faiss_data = []
            logger.debug("No embeddings supplied; index cleared")
            return
        matrix = np.array(embeddings, dtype='float32')
        self.index = faiss.IndexFlatL2(matrix.shape[1])
        self.index.add(matrix)
        # Rebuild metadata in lockstep with index row order.
        self.faiss_data = [
            {
                'embedding': matrix[i],
                'content': text_chunks[i]['content'],
                'url': text_chunks[i]['url'],
            }
            for i in range(len(embeddings))
        ]
        logger.debug("Stored %d embeddings in FAISS index", len(embeddings))

    def process_content(self, website_data):
        """End-to-end ingestion pipeline: chunk -> embed -> index.

        Args:
            website_data: same shape as split_into_chunks' page_data.
        """
        logger.info("Processing website content")
        text_chunks = self.split_into_chunks(website_data)
        embeddings = self.compute_embeddings(text_chunks)
        self.store_embeddings_in_faiss(embeddings, text_chunks)
        logger.info("Content processing completed")

    def process_user_query(self, query, top_k=5):
        """Return up to top_k indexed chunks most similar to the query.

        Args:
            query: user query string.
            top_k: number of neighbours to retrieve (default 5, matching
                the original hard-coded behaviour).

        Returns:
            list of {'embedding', 'content', 'url'} dicts, nearest first.

        Raises:
            RuntimeError: if no content has been indexed yet (previously
                this surfaced as an AttributeError on a None index).
        """
        logger.info("Processing user query: %s", query)
        if self.index is None:
            raise RuntimeError("No content indexed; call process_content() first.")
        response = self.client.embeddings.create(model=self.model_name, input=[query])
        query_vec = np.array(response.data[0].embedding, dtype='float32').reshape(1, -1)
        distances, indices = self.index.search(query_vec, top_k)
        # FAISS pads indices with -1 when fewer than top_k vectors exist;
        # the original indexed faiss_data[-1] and silently returned the
        # wrong chunk. Filter the padding out instead.
        similar_chunks = [self.faiss_data[i] for i in indices[0] if i >= 0]
        logger.debug("Retrieved %d similar chunks for the query", len(similar_chunks))
        return similar_chunks