import openai
import faiss
import numpy as np
import logging
from sklearn.metrics.pairwise import cosine_similarity
# Configure logging
logger = logging.getLogger(__name__)


class RAGSystem:
    """Minimal retrieval-augmented-generation helper.

    Splits scraped page text into chunks, embeds the chunks with the
    OpenAI embeddings API, stores the vectors in a FAISS L2 index, and
    retrieves the chunks nearest to a query embedding.
    """

    def __init__(self, model_name="text-embedding-ada-002"):
        # The client reads credentials from the environment (OPENAI_API_KEY).
        self.client = openai.OpenAI()
        self.model_name = model_name
        self.index = None      # FAISS index; built by store_embeddings_in_faiss()
        self.faiss_data = []   # chunk metadata, positionally aligned with index rows
        logger.info("RAGSystem initialized with model: %s", model_name)

    def split_into_chunks(self, page_data, max_chunk_size=500):
        """Flatten pages into {'content', 'url'} chunks.

        Args:
            page_data: iterable of dicts with keys 'url' (str) and
                'paragraphs' (list of str).
            max_chunk_size: maximum chunk length in characters; longer
                paragraphs are hard-split every max_chunk_size characters.

        Returns:
            List of {'content': str, 'url': str} dicts, in page order.
        """
        logger.info("Splitting data into chunks with max size: %d", max_chunk_size)
        chunks = []
        for page in page_data:
            url = page['url']
            for paragraph in page['paragraphs']:
                if not paragraph:
                    # Skip empty paragraphs: the embeddings API rejects
                    # empty-string input, and an empty chunk is useless.
                    continue
                # One stepped pass covers both the short and the long case.
                for i in range(0, len(paragraph), max_chunk_size):
                    chunks.append({'content': paragraph[i:i + max_chunk_size],
                                   'url': url})
        logger.debug("Created %d chunks", len(chunks))
        return chunks

    def compute_embeddings(self, text_chunks):
        """Return one embedding vector (list of floats) per chunk, in order."""
        if not text_chunks:
            # Guard: an empty `input` list is an invalid API request.
            return []
        logger.info("Computing embeddings for %d chunks", len(text_chunks))
        texts = [chunk['content'] for chunk in text_chunks]
        response = self.client.embeddings.create(model=self.model_name, input=texts)
        embeddings = [result.embedding for result in response.data]
        logger.debug("Computed %d embeddings", len(embeddings))
        return embeddings

    def store_embeddings_in_faiss(self, embeddings, text_chunks):
        """(Re)build the FAISS index from `embeddings` and record chunk metadata.

        Fixes two defects of the original version: `faiss_data` is now
        cleared when the index is rebuilt (previously stale entries stayed
        behind, misaligning list positions with FAISS row ids), and the
        vectors are added in a single batched `add()` call instead of one
        call per vector.
        """
        if not embeddings:
            logger.warning("No embeddings provided; FAISS index left unchanged")
            return
        logger.info("Storing embeddings in FAISS index")
        dimension = len(embeddings[0])
        self.index = faiss.IndexFlatL2(dimension)
        # Reset metadata so list positions stay aligned with index row ids.
        self.faiss_data = []
        matrix = np.asarray(embeddings, dtype='float32')
        self.index.add(matrix)  # batched add: one call for all vectors
        for np_embedding, chunk in zip(matrix, text_chunks):
            self.faiss_data.append({
                'embedding': np_embedding,
                'content': chunk['content'],
                'url': chunk['url'],
            })
        logger.debug("Stored %d embeddings in FAISS index", len(embeddings))

    def process_content(self, website_data):
        """End-to-end ingestion: chunk, embed, and index `website_data`."""
        logger.info("Processing website content")
        text_chunks = self.split_into_chunks(website_data)
        embeddings = self.compute_embeddings(text_chunks)
        self.store_embeddings_in_faiss(embeddings, text_chunks)
        logger.info("Content processing completed")

    def process_user_query(self, query, top_k=5):
        """Embed `query` and return metadata of the `top_k` nearest chunks.

        Args:
            query: user query string.
            top_k: maximum number of chunks to return (default 5, matching
                the previous hard-coded behavior).

        Returns:
            List of {'embedding', 'content', 'url'} dicts, nearest first.

        Raises:
            RuntimeError: if no content has been indexed yet (previously
                this surfaced as an opaque AttributeError on None).
        """
        if self.index is None or not self.faiss_data:
            raise RuntimeError("No index available; call process_content() first")
        logger.info("Processing user query: %s", query)
        response = self.client.embeddings.create(model=self.model_name, input=[query])
        query_embedding = response.data[0].embedding
        np_query_embedding = np.array(query_embedding, dtype='float32').reshape(1, -1)
        # Never request more neighbors than the index holds: FAISS pads the
        # result with -1 ids, and faiss_data[-1] would silently return the
        # wrong chunk.
        k = min(top_k, self.index.ntotal)
        _distances, indices = self.index.search(np_query_embedding, k)
        similar_chunks = [self.faiss_data[i] for i in indices[0] if i >= 0]
        logger.debug("Retrieved %d similar chunks for the query", len(similar_chunks))
        return similar_chunks