File size: 3,215 Bytes
748113b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import openai
import faiss
import numpy as np
import logging
from sklearn.metrics.pairwise import cosine_similarity

# Configure logging
logger = logging.getLogger(__name__)

class RAGSystem:
    """Retrieval-augmented generation helper.

    Splits scraped page text into chunks, embeds the chunks via the OpenAI
    embeddings API, stores them in a FAISS L2 index, and retrieves the
    chunks most similar to a user query.
    """

    def __init__(self, model_name="text-embedding-ada-002"):
        """Create an OpenAI client and an empty (unbuilt) index.

        Args:
            model_name: OpenAI embedding model used for both documents and
                queries.
        """
        self.client = openai.OpenAI()
        self.model_name = model_name
        self.index = None       # FAISS index; built by store_embeddings_in_faiss()
        self.faiss_data = []    # chunk metadata, parallel to the index rows
        logger.info("RAGSystem initialized with model: %s", model_name)

    def split_into_chunks(self, page_data, max_chunk_size=500):
        """Split pages into chunks of at most ``max_chunk_size`` characters.

        Args:
            page_data: iterable of dicts with 'url' and 'paragraphs' keys.
            max_chunk_size: maximum chunk length in characters.

        Returns:
            list of ``{'content': str, 'url': str}`` dicts.
        """
        logger.info("Splitting data into chunks with max size: %d", max_chunk_size)
        chunks = []
        for page in page_data:
            url = page['url']
            for paragraph in page['paragraphs']:
                # Skip empty/whitespace-only paragraphs: they would produce
                # useless (and billable) embedding requests downstream.
                if not paragraph.strip():
                    continue
                if len(paragraph) <= max_chunk_size:
                    chunks.append({'content': paragraph, 'url': url})
                else:
                    # Break long paragraphs into fixed-size slices.
                    for i in range(0, len(paragraph), max_chunk_size):
                        chunks.append({'content': paragraph[i:i + max_chunk_size], 'url': url})
        logger.debug("Created %d chunks", len(chunks))
        return chunks

    def compute_embeddings(self, text_chunks):
        """Embed every chunk in a single batched API call.

        Args:
            text_chunks: list of ``{'content': str, ...}`` dicts.

        Returns:
            list of embedding vectors, parallel to ``text_chunks``.
        """
        logger.info("Computing embeddings for %d chunks", len(text_chunks))
        texts = [chunk['content'] for chunk in text_chunks]
        response = self.client.embeddings.create(model=self.model_name, input=texts)
        embeddings = [result.embedding for result in response.data]
        logger.debug("Computed %d embeddings", len(embeddings))
        return embeddings

    def store_embeddings_in_faiss(self, embeddings, text_chunks):
        """(Re)build the FAISS index and remember chunk metadata.

        Args:
            embeddings: list of embedding vectors (all the same length).
            text_chunks: chunk dicts parallel to ``embeddings``.

        Raises:
            ValueError: if ``embeddings`` is empty (the index dimension
                would be undefined).
        """
        logger.info("Storing embeddings in FAISS index")
        if not embeddings:
            raise ValueError("Cannot build a FAISS index from zero embeddings")
        matrix = np.array(embeddings, dtype='float32')
        self.index = faiss.IndexFlatL2(matrix.shape[1])
        # One batched add instead of one add() call per vector.
        self.index.add(matrix)
        # Rebuild metadata from scratch so stale entries from a previous
        # process_content() run cannot be returned for new index rows.
        self.faiss_data = [
            {'embedding': matrix[idx], 'content': chunk['content'], 'url': chunk['url']}
            for idx, chunk in enumerate(text_chunks)
        ]
        logger.debug("Stored %d embeddings in FAISS index", len(embeddings))

    def process_content(self, website_data):
        """End-to-end ingestion: chunk, embed, and index ``website_data``."""
        logger.info("Processing website content")
        text_chunks = self.split_into_chunks(website_data)
        embeddings = self.compute_embeddings(text_chunks)
        self.store_embeddings_in_faiss(embeddings, text_chunks)
        logger.info("Content processing completed")

    def process_user_query(self, query, top_k=5):
        """Return up to ``top_k`` stored chunks most similar to ``query``.

        Args:
            query: user query string.
            top_k: maximum number of chunks to return (default 5, matching
                the previous hard-coded behavior).

        Returns:
            list of ``{'embedding', 'content', 'url'}`` dicts, most
            similar first.

        Raises:
            RuntimeError: if called before process_content() built the index.
        """
        logger.info("Processing user query: %s", query)
        if self.index is None:
            raise RuntimeError("No index available - call process_content() first")
        response = self.client.embeddings.create(model=self.model_name, input=[query])
        query_embedding = response.data[0].embedding
        np_query_embedding = np.array(query_embedding, dtype='float32').reshape(1, -1)

        distances, indices = self.index.search(np_query_embedding, top_k)
        # FAISS pads the result with -1 when fewer than top_k vectors are
        # stored; a raw faiss_data[-1] lookup would silently return the
        # wrong (last) chunk, so drop the padding entries.
        similar_chunks = [self.faiss_data[i] for i in indices[0] if i >= 0]
        logger.debug("Retrieved %d similar chunks for the query", len(similar_chunks))
        return similar_chunks