import logging

import numpy as np

import faiss
import openai
# NOTE(review): cosine_similarity is unused in this module; kept in case other
# code does `from this_module import cosine_similarity` — confirm before removing.
from sklearn.metrics.pairwise import cosine_similarity  # noqa: F401

# Module-level logger (configured by the application, not here).
logger = logging.getLogger(__name__)


class RAGSystem:
    """Minimal retrieval pipeline: chunk pages, embed them via the OpenAI
    embeddings API, store the vectors in a flat (exact, L2) FAISS index, and
    answer queries with nearest-neighbor lookup.

    Expected page shape (from `split_into_chunks`):
        {'url': str, 'paragraphs': list[str]}
    """

    def __init__(self, model_name="text-embedding-ada-002"):
        """Create an OpenAI client and an empty (unbuilt) index.

        Args:
            model_name: OpenAI embedding model to use for both documents
                and queries (they must match for distances to be meaningful).
        """
        self.client = openai.OpenAI()
        self.model_name = model_name
        self.index = None      # faiss.IndexFlatL2, built lazily in store_embeddings_in_faiss
        self.faiss_data = []   # metadata parallel to the index: faiss_data[i] describes vector i
        logger.info("RAGSystem initialized with model: %s", model_name)

    def split_into_chunks(self, page_data, max_chunk_size=500):
        """Split pages into retrieval chunks of at most `max_chunk_size` chars.

        Paragraphs short enough become one chunk each; longer paragraphs are
        sliced into fixed-size windows (no overlap). Each chunk keeps its
        source URL so results can be attributed.

        Args:
            page_data: iterable of {'url': str, 'paragraphs': list[str]}.
            max_chunk_size: upper bound on chunk length in characters.

        Returns:
            list of {'content': str, 'url': str}.
        """
        logger.info("Splitting data into chunks with max size: %s", max_chunk_size)
        chunks = []
        for page in page_data:
            url = page['url']
            for paragraph in page['paragraphs']:
                if len(paragraph) <= max_chunk_size:
                    chunks.append({'content': paragraph, 'url': url})
                else:
                    # Break long paragraphs into fixed-size windows.
                    for i in range(0, len(paragraph), max_chunk_size):
                        chunks.append({'content': paragraph[i:i + max_chunk_size], 'url': url})
        logger.debug("Created %d chunks", len(chunks))
        return chunks

    def compute_embeddings(self, text_chunks):
        """Embed every chunk's text in one batched API call.

        Args:
            text_chunks: list of {'content': str, ...}.

        Returns:
            list of embedding vectors (list[float]), aligned with text_chunks.
        """
        logger.info("Computing embeddings for %d chunks", len(text_chunks))
        if not text_chunks:
            # Avoid a pointless (and invalid) API call with an empty input list.
            return []
        texts = [chunk['content'] for chunk in text_chunks]
        response = self.client.embeddings.create(model=self.model_name, input=texts)
        embeddings = [result.embedding for result in response.data]
        logger.debug("Computed %d embeddings", len(embeddings))
        return embeddings

    def store_embeddings_in_faiss(self, embeddings, text_chunks):
        """(Re)build the FAISS index from `embeddings` and record chunk metadata.

        Replaces any existing index AND its metadata, keeping the two aligned.
        (Previously metadata was appended across rebuilds, so a second call to
        process_content() silently misaligned search results with their text.)

        Args:
            embeddings: non-empty list of equal-length float vectors.
            text_chunks: list of {'content': str, 'url': str}, parallel to
                `embeddings`.

        Raises:
            ValueError: if `embeddings` is empty (dimension would be undefined).
        """
        logger.info("Storing embeddings in FAISS index")
        if not embeddings:
            raise ValueError("Cannot build a FAISS index from zero embeddings")
        matrix = np.asarray(embeddings, dtype='float32')
        self.index = faiss.IndexFlatL2(matrix.shape[1])
        # Single batched add instead of one add() call per vector.
        self.index.add(matrix)
        # Reset metadata so faiss_data[i] always describes index vector i.
        self.faiss_data = [
            {
                'embedding': matrix[idx],
                'content': text_chunks[idx]['content'],
                'url': text_chunks[idx]['url'],
            }
            for idx in range(len(embeddings))
        ]
        logger.debug("Stored %d embeddings in FAISS index", len(embeddings))

    def process_content(self, website_data):
        """Full ingestion pipeline: chunk -> embed -> index.

        Args:
            website_data: iterable of {'url': str, 'paragraphs': list[str]}.
        """
        logger.info("Processing website content")
        text_chunks = self.split_into_chunks(website_data)
        embeddings = self.compute_embeddings(text_chunks)
        self.store_embeddings_in_faiss(embeddings, text_chunks)
        logger.info("Content processing completed")

    def process_user_query(self, query, top_k=5):
        """Embed `query` and return its nearest indexed chunks.

        Args:
            query: user query text.
            top_k: maximum number of chunks to return (default 5, matching the
                previous hard-coded behavior).

        Returns:
            list of {'embedding', 'content', 'url'} dicts, nearest first.
            May contain fewer than `top_k` items if the index is small.

        Raises:
            RuntimeError: if called before any content has been indexed.
        """
        logger.info("Processing user query: %s", query)
        if self.index is None:
            raise RuntimeError("No FAISS index available; call process_content() first")
        response = self.client.embeddings.create(model=self.model_name, input=[query])
        query_vector = np.array(response.data[0].embedding, dtype='float32').reshape(1, -1)
        # Clamp k to the index size: asking FAISS for more neighbors than it
        # holds pads `indices` with -1, which would otherwise index
        # faiss_data[-1] and silently return the wrong chunk.
        k = min(top_k, self.index.ntotal)
        distances, indices = self.index.search(query_vector, k)
        similar_chunks = [self.faiss_data[i] for i in indices[0] if i >= 0]
        logger.debug("Retrieved %d similar chunks for the query", len(similar_chunks))
        return similar_chunks