vichudo committed on
Commit
6a4bd6f
·
1 Parent(s): 8abf329
.gitignore CHANGED
@@ -10,6 +10,16 @@ pdfs/
10
  !requirements.txt
11
  !docker-compose.yml
12
 
 
 
 
 
 
 
 
 
 
 
13
  # Python
14
  __pycache__/
15
  *.py[cod]
 
10
  !requirements.txt
11
  !docker-compose.yml
12
 
13
+ # Allow src directory structure
14
+ !src/
15
+ !src/**
16
+ !src/embeddings/
17
+ !src/embeddings/**
18
+ !src/models/
19
+ !src/models/**
20
+ !src/agents/
21
+ !src/agents/**
22
+
23
  # Python
24
  __pycache__/
25
  *.py[cod]
src/data/__init__.py ADDED
File without changes
src/data/document_processor.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import faiss
4
+ import numpy as np
5
+ from typing import List, Dict, Any, Optional, Tuple
6
+ from tqdm import tqdm
7
+
8
+ from src.utils.config import DATA_DIR, EMBEDDINGS_DIR
9
+ from src.embeddings.embedder import TextEmbedder
10
+
11
class DocumentProcessor:
    """
    Embeds pre-chunked documents, builds a FAISS index, and persists the results.

    This class does not load or split source documents itself; it receives
    already-built chunk dictionaries. On disk it uses:

    * ``<data_dir>/doc_chunks.pkl`` - pickled list of chunk dictionaries
    * ``<embeddings_dir>/embeddings.pkl`` - pickled list of embedding vectors
    * ``<embeddings_dir>/faiss_index.index`` - serialized FAISS index
    """

    def __init__(self, data_dir: str = DATA_DIR, embeddings_dir: str = EMBEDDINGS_DIR):
        """
        Initialize the document processor.

        Creates both directories if they do not exist yet and instantiates a
        ``TextEmbedder`` with its default settings.

        Args:
            data_dir: Directory where the pickled document chunks are stored
            embeddings_dir: Directory for storing embeddings and indexes
        """
        self.data_dir = data_dir
        self.embeddings_dir = embeddings_dir
        self.embedder = TextEmbedder()

        # Create directories if they don't exist
        os.makedirs(data_dir, exist_ok=True)
        os.makedirs(embeddings_dir, exist_ok=True)

    def process_documents(self, doc_chunks: List[Dict[str, Any]], save: bool = True) -> Tuple[List[Dict[str, Any]], List[List[float]]]:
        """
        Generate one embedding per document chunk and optionally persist both.

        This method does NOT build the FAISS index; pass the returned
        embeddings to ``create_faiss_index`` for that.

        Args:
            doc_chunks: List of document chunks to process; each dictionary
                must carry its text under the ``"chunk"`` key
            save: Whether to save the chunks and embeddings to disk

        Returns:
            Tuple containing the (unmodified) document chunks and their embeddings

        Raises:
            KeyError: If a chunk dictionary has no ``"chunk"`` entry
            ValueError: If the embedder returns a different number of vectors
                than there are chunks
        """
        print(f"Processing {len(doc_chunks)} document chunks...")

        # Extract text chunks for embedding
        texts = [chunk["chunk"] for chunk in doc_chunks]

        # Generate embeddings
        print("Generating embeddings...")
        embeddings = self.embedder.get_embeddings_for_texts(texts)

        # Chunks and vectors are matched purely by position, so a length
        # mismatch would silently attach vectors to the wrong chunks.
        if len(embeddings) != len(doc_chunks):
            raise ValueError(
                f"Embedder returned {len(embeddings)} embeddings for {len(doc_chunks)} document chunks"
            )

        # Save the results if requested
        if save:
            self._save_processed_data(doc_chunks, embeddings)

        return doc_chunks, embeddings

    def create_faiss_index(self, embeddings: List[List[float]], save: bool = True) -> faiss.Index:
        """
        Create a flat L2 FAISS index from the document embeddings.

        Args:
            embeddings: List of embedding vectors, all of the same length
            save: Whether to write the index to
                ``<embeddings_dir>/faiss_index.index``

        Returns:
            FAISS index containing every supplied vector

        Raises:
            ValueError: If ``embeddings`` is empty or does not form a
                two-dimensional (n_vectors, dimension) array
        """
        print("Creating FAISS index...")

        # Convert embeddings to numpy array
        embedding_array = np.array(embeddings, dtype='float32')

        # An empty list yields a 1-D array, so reading shape[1] would fail with
        # an opaque IndexError; report the real problem instead.
        if embedding_array.ndim != 2 or 0 in embedding_array.shape:
            raise ValueError(
                "Cannot create FAISS index: expected a non-empty list of equal-length "
                f"embedding vectors, got array of shape {embedding_array.shape}"
            )

        # Get dimensions
        vector_dimension = embedding_array.shape[1]

        # Create the index
        index = faiss.IndexFlatL2(vector_dimension)
        index.add(embedding_array)

        print(f"Created FAISS index with {index.ntotal} vectors of dimension {vector_dimension}")

        # Save the index if requested
        if save:
            index_path = os.path.join(self.embeddings_dir, "faiss_index.index")
            faiss.write_index(index, index_path)
            print(f"FAISS index saved to {index_path}")

        return index

    def _save_processed_data(self, doc_chunks: List[Dict[str, Any]], embeddings: List[List[float]]) -> None:
        """
        Pickle the processed document chunks and embeddings to disk.

        Existing files at the target paths are overwritten.

        Args:
            doc_chunks: List of document chunks
            embeddings: List of embedding vectors
        """
        # Save document chunks
        chunks_path = os.path.join(self.data_dir, "doc_chunks.pkl")
        with open(chunks_path, "wb") as f:
            pickle.dump(doc_chunks, f)
        print(f"Document chunks saved to {chunks_path}")

        # Save embeddings
        embeddings_path = os.path.join(self.embeddings_dir, "embeddings.pkl")
        with open(embeddings_path, "wb") as f:
            pickle.dump(embeddings, f)
        print(f"Embeddings saved to {embeddings_path}")

    def load_processed_data(self) -> Tuple[List[Dict[str, Any]], List[List[float]], faiss.Index]:
        """
        Load processed document chunks, embeddings, and FAISS index from disk.

        Returns:
            Tuple containing document chunks, embeddings, and FAISS index

        Raises:
            FileNotFoundError: If either pickle file has not been written yet
        """
        # SECURITY NOTE: pickle.load executes arbitrary code embedded in the
        # file. Only load files that this application wrote itself; never point
        # data_dir / embeddings_dir at untrusted content.

        # Load document chunks
        chunks_path = os.path.join(self.data_dir, "doc_chunks.pkl")
        with open(chunks_path, "rb") as f:
            doc_chunks = pickle.load(f)
        print(f"Document chunks loaded from {chunks_path}")

        # Load embeddings
        embeddings_path = os.path.join(self.embeddings_dir, "embeddings.pkl")
        with open(embeddings_path, "rb") as f:
            embeddings = pickle.load(f)
        print(f"Embeddings loaded from {embeddings_path}")

        # Load FAISS index
        index_path = os.path.join(self.embeddings_dir, "faiss_index.index")
        index = faiss.read_index(index_path)
        print(f"FAISS index loaded from {index_path}")

        return doc_chunks, embeddings, index
src/embeddings/embedder.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import numpy as np
3
+ from tqdm import tqdm
4
+ from openai import OpenAI
5
+ from typing import List, Dict, Any, Optional
6
+
7
+ from src.utils.config import EMBEDDING_MODEL, EMBEDDING_BATCH_SIZE, OPENAI_API_KEY
8
+
9
class TextEmbedder:
    """
    Class for generating embeddings for document chunks using OpenAI's embeddings API.

    API failures are handled on a best-effort basis: the error is printed and
    an all-zero vector is returned in place of the missing embedding, so
    callers always receive one vector per input. ``embedding_dim`` holds the
    width used for those placeholder vectors.
    """

    # Output widths of known OpenAI embedding models (see the OpenAI
    # embeddings guide); used only to size zero-vector placeholders before any
    # real response has been seen.
    _KNOWN_DIMENSIONS = {
        "text-embedding-3-small": 1536,
        "text-embedding-3-large": 3072,
        "text-embedding-ada-002": 1536,
    }
    _FALLBACK_DIMENSION = 1536

    def __init__(self, model: str = EMBEDDING_MODEL, batch_size: int = EMBEDDING_BATCH_SIZE):
        """
        Initialize the TextEmbedder with the specified embedding model and batch size.

        Args:
            model: The OpenAI embedding model to use
            batch_size: Number of chunks to embed per API call
        """
        self.model = model
        self.batch_size = batch_size
        self.client = OpenAI(api_key=OPENAI_API_KEY)
        # Best guess for the model's output width; replaced by the real width
        # as soon as the first successful response arrives.
        self.embedding_dim = self._KNOWN_DIMENSIONS.get(model, self._FALLBACK_DIMENSION)

    def _request_embeddings(self, inputs: List[str]) -> List[List[float]]:
        """Call the embeddings API once and return the vectors in response order."""
        response = self.client.embeddings.create(
            input=inputs,
            model=self.model
        )
        vectors = [item.embedding for item in response.data]
        if vectors:
            # Remember the real width so placeholders match genuine embeddings.
            self.embedding_dim = len(vectors[0])
        return vectors

    def get_embedding_for_text(self, text: str) -> List[float]:
        """
        Generate embedding for a single text.

        Args:
            text: The text to embed

        Returns:
            The embedding vector, or an all-zero vector of length
            ``embedding_dim`` if the request fails
        """
        try:
            return self._request_embeddings([text])[0]
        except Exception as e:
            print(f"Error generating embedding: {e}")
            return [0.0] * self.embedding_dim

    def get_embeddings_for_texts(self, texts: List[str]) -> List[List[float]]:
        """
        Compute embeddings for a list of texts using batched API calls.

        A batch that fails (or returns the wrong number of vectors) is
        reported on stdout and replaced by all-zero vectors, so the result
        always has exactly one vector per input text, in input order.

        Args:
            texts: List of text chunks to embed

        Returns:
            List of embedding vectors
        """
        # Failed entries are kept as None until the loop ends: the true vector
        # width may only become known from a later, successful batch.
        collected: List[Optional[List[float]]] = []
        for i in tqdm(range(0, len(texts), self.batch_size), desc="Embedding chunks"):
            batch = texts[i:i + self.batch_size]
            try:
                vectors = self._request_embeddings(batch)
                # A short or long response would shift every later embedding
                # onto the wrong text; treat it like a failed batch.
                if len(vectors) != len(batch):
                    raise ValueError(
                        f"expected {len(batch)} embeddings, got {len(vectors)}"
                    )
                collected.extend(vectors)
            except Exception as e:
                print(f"Error embedding batch starting at index {i}: {e}")
                collected.extend([None] * len(batch))
            # Brief pause to avoid rate limits
            time.sleep(0.2)

        # Fill the gaps with zero vectors of the (now best-known) width; each
        # placeholder is a distinct list so callers may mutate them safely.
        return [
            vector if vector is not None else [0.0] * self.embedding_dim
            for vector in collected
        ]

    def get_query_embedding(self, query: str) -> np.ndarray:
        """
        Generate embedding for a query string and return as numpy array.

        Args:
            query: The query text to embed

        Returns:
            float32 numpy array of shape (1, dimension); all zeros if the
            request fails, in which case any similarity search run with it is
            not meaningful
        """
        try:
            vector = self._request_embeddings([query])[0]
            return np.array(vector, dtype='float32').reshape(1, -1)
        except Exception as e:
            print(f"Error creating embedding for query: {e}")
            return np.zeros((1, self.embedding_dim), dtype='float32')