Rabbitt-AI committed on
Commit
7c7b13f
·
verified ·
1 Parent(s): a6777ad

Update app.py

Browse files

pagerank_score error fixed

Files changed (1) hide show
  1. app.py +35 -63
app.py CHANGED
@@ -9,21 +9,20 @@ import asyncio
9
  import networkx as nx
10
  from mistralai import Mistral
11
  from annoy import AnnoyIndex
12
- from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
13
- from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
14
- from sklearn.preprocessing import normalize
15
  from rank_bm25 import BM25Okapi
16
  from gensim.models import Word2Vec
17
  from typing import List, Optional, Tuple
18
  import gradio as gr
19
 
20
-
21
  logger = logging.getLogger(__name__)
 
 
22
  api_key = os.getenv("MISTRAL_API_KEY")
23
  client = Mistral(api_key=api_key)
24
 
25
-
26
- def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10):
27
  embeddings = []
28
  for text in text_list:
29
  retries = 0
@@ -32,7 +31,7 @@ def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=1
32
  try:
33
  token_count = len(text.split())
34
  if token_count > 16384:
35
- print("Warning: Text chunk exceeds the token limit. Truncating the text.")
36
  text = " ".join(text.split()[:16384])
37
  response = client.embeddings.create(model="mistral-embed", inputs=[text])
38
  embeddings.extend([embedding.embedding for embedding in response.data])
@@ -40,15 +39,14 @@ def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=1
40
  break
41
  except Exception as e:
42
  retries += 1
43
- print(f"Rate limit exceeded, retrying in {delay} seconds... (Attempt {retries}/{max_retries})")
44
  time.sleep(delay)
45
- delay *= 2
46
  if retries == max_retries:
47
- print("Max retries reached. Skipping this chunk.")
48
  break
49
  return embeddings
50
 
51
-
52
  def store_embeddings_in_vector_db(
53
  pdf_path: str,
54
  vector_db_path: str,
@@ -92,7 +90,6 @@ def store_embeddings_in_vector_db(
92
  annoy_index.save(annoy_index_path)
93
  logging.info(f"Annoy index built with {len(all_embeddings)} items and saved to {annoy_index_path}.")
94
 
95
-
96
  def split_text_into_chunks(text: str, chunk_size: int = 2048, overlap: int = 200) -> List[str]:
97
  tokens = text.split()
98
  chunks = []
@@ -112,10 +109,9 @@ class MistralRAGChatbot:
112
  self.word2vec_model = self.train_word2vec(self.texts)
113
  self.reranking_methods = {
114
  'advanced_fusion': self.advanced_fusion_retrieval
115
- }
116
  logging.info("MistralRAGChatbot initialized successfully.")
117
 
118
-
119
  def load_vector_db(self, vector_db_path: str) -> Tuple[np.ndarray, List[str]]:
120
  with open(vector_db_path, "rb") as f:
121
  data = dill.load(f)
@@ -125,12 +121,13 @@ class MistralRAGChatbot:
125
  return embeddings, texts
126
 
127
  def load_annoy_index(self, annoy_index_path: str, embedding_dim: int) -> AnnoyIndex:
 
 
128
  annoy_index = AnnoyIndex(embedding_dim, 'angular')
129
  annoy_index.load(annoy_index_path)
130
  logging.info(f"Loaded Annoy index from {annoy_index_path}.")
131
  return annoy_index
132
 
133
-
134
  def train_word2vec(self, texts: List[str]) -> Word2Vec:
135
  tokenized_texts = [text.split() for text in texts]
136
  model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)
@@ -143,7 +140,7 @@ class MistralRAGChatbot:
143
  return np.array(response.data[0].embedding)
144
  except Exception as e:
145
  logging.error(f"Error fetching embedding: {e}")
146
- return np.zeros((1024,))
147
 
148
  def advanced_fusion_retrieval(self, user_query: str, docs: List[dict]) -> List[dict]:
149
  query_embedding = self.create_embeddings([user_query])[0]
@@ -166,22 +163,20 @@ class MistralRAGChatbot:
166
  combined_scores[idx] = (
167
  0.5 * vector_scores.get(idx, 0) +
168
  0.3 * bm25_scores.get(idx, 0) +
169
- 0.2 * pagerank_scores[idx]
170
  )
171
-
172
  min_score = min(combined_scores.values())
173
  max_score = max(combined_scores.values())
174
-
175
-
176
  normalized_scores = {idx: (score - min_score) / (max_score - min_score) for idx, score in combined_scores.items()}
177
-
178
-
179
  sorted_indices = sorted(combined_scores, key=combined_scores.get, reverse=True)
180
 
181
  return [{'text': self.texts[i], 'method': 'advanced_fusion', 'score': normalized_scores[i], 'index': i} for i in sorted_indices[:5]]
182
 
183
  def create_embeddings(self, text_list: List[str]) -> np.ndarray:
184
- expected_dim = 1024
185
  embeddings = []
186
  for text in text_list:
187
  word_vectors = [self.word2vec_model.wv[token] for token in text.split() if token in self.word2vec_model.wv]
@@ -203,9 +198,9 @@ class MistralRAGChatbot:
203
  selected_reranking_methods: Optional[List[str]] = None
204
  ) -> Tuple[str, List[str], List[dict]]:
205
  if not selected_retrieval_methods:
206
- selected_retrieval_methods = ['annoy', 'tfidf', 'bm25', 'word2vec', 'euclidean', 'jaccard']
207
  if not selected_reranking_methods:
208
- selected_reranking_methods = ['reciprocal_rank_fusion', 'weighted_score_fusion', 'advanced_fusion']
209
  query_embedding = await self.get_text_embedding(user_query)
210
  retrieved_docs = self.retrieve_documents(user_query, query_embedding, top_k, selected_retrieval_methods)
211
  reranked_docs = self.rerank_documents(user_query, retrieved_docs, selected_reranking_methods)
@@ -244,11 +239,10 @@ class MistralRAGChatbot:
244
  def retrieve_with_annoy(self, user_query: str, query_embedding: np.ndarray, top_k: int) -> Tuple[List[int], List[float]]:
245
  n_results = min(top_k, len(self.texts))
246
  indices, distances = self.annoy_index.get_nns_by_vector(query_embedding, n_results, include_distances=True)
247
- scores = [1.0 - (dist / max(distances)) for dist in distances]
248
  logging.debug(f"Annoy retrieval returned {len(indices)} documents.")
249
  return indices, scores
250
 
251
-
252
  def retrieve_with_bm25(self, user_query: str, query_embedding: np.ndarray, top_k: int) -> Tuple[List[int], List[float]]:
253
  tokenized_query = user_query.split()
254
  scores = self.bm25.get_scores(tokenized_query)
@@ -256,21 +250,6 @@ class MistralRAGChatbot:
256
  logging.debug(f"BM25 retrieval returned {len(indices)} documents.")
257
  return indices, scores[indices].tolist()
258
 
259
- def retrieve_with_word2vec(self, user_query: str, query_embedding: np.ndarray, top_k: int) -> Tuple[List[int], List[float]]:
260
- query_tokens = user_query.split()
261
- query_vec = np.mean([self.word2vec_model.wv[token] for token in query_tokens if token in self.word2vec_model.wv], axis=0)
262
- expected_dim = query_vec.shape[0]
263
- doc_vectors = []
264
- for doc in self.texts:
265
- word_vectors = [self.word2vec_model.wv[token] for token in doc.split() if token in self.word2vec_model.wv]
266
- avg_vector = np.mean(word_vectors, axis=0) if word_vectors else np.zeros(expected_dim)
267
- doc_vectors.append(avg_vector)
268
- doc_vectors = np.array(doc_vectors)
269
- similarities = cosine_similarity([query_vec], doc_vectors).flatten()
270
- indices = np.argsort(-similarities)[:top_k]
271
- return indices, similarities[indices].tolist()
272
-
273
-
274
  def rerank_documents(
275
  self,
276
  user_query: str,
@@ -286,7 +265,6 @@ class MistralRAGChatbot:
286
 
287
  return reranked_docs
288
 
289
-
290
  def build_prompt(self, context: str, user_query: str, response_style: str) -> str:
291
  styles = {
292
  "detailed": "Provide a comprehensive and detailed answer based on the provided context.",
@@ -323,18 +301,16 @@ class MistralRAGChatbot:
323
  common_terms = query_terms.intersection(context_terms)
324
  return len(common_terms) > len(query_terms) * 0.2
325
 
326
-
327
  def create_vector_db_and_annoy_index(pdf_path, vector_db_path, annoy_index_path):
328
  store_embeddings_in_vector_db(pdf_path, vector_db_path, annoy_index_path)
329
  print("Vector database and Annoy index creation completed.")
330
 
331
-
332
  def chatbot_interface(file, user_query, response_style):
333
  vector_db_path = "vector_db.pkl"
334
  annoy_index_path = "vector_index.ann"
335
  chunk_size = 2048
336
  overlap = 200
337
- store_embeddings_in_vector_db(file.name, 'vector_db.pkl', 'vector_index.ann', chunk_size, overlap)
338
 
339
  chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
340
 
@@ -356,26 +332,22 @@ def chatbot_interface(file, user_query, response_style):
356
  formatted_response += f"Retrieval Method: {doc_info['method']}\n"
357
  if 'score' in doc_info:
358
  formatted_response += f"Precision Score: {doc_info['score']:.4f}\n"
359
- for key, value in doc_info.items():
360
- if key.endswith('_score') and key != 'score':
361
- formatted_response += f"{key.replace('_', ' ').title()}: {value:.4f}\n"
362
 
363
  return formatted_response
364
- iface = gr.Blocks(theme="Rabbitt-AI/ChanceRAG")
 
365
  with iface:
366
- gr.Image("images/chanceRAG_logo.jpg", label="Image", show_label = False)
367
  gr.Interface(
368
- fn=chatbot_interface,
369
- theme="Rabbitt-AI/ChanceRAG",
370
-
371
- inputs=[
372
- gr.File(label="Upload a File"),
373
- gr.Textbox(lines=5, label="User Query"),
374
- gr.Dropdown(
375
- ["Detailed", "Concise", "Creative", "Technical"], label="Retreival Style"
376
- ),
377
- ],
378
- outputs=gr.Textbox(label="ChanceRAG Response"),
379
- )
380
 
381
  iface.launch(share=True)
 
9
  import networkx as nx
10
  from mistralai import Mistral
11
  from annoy import AnnoyIndex
12
+ from sklearn.feature_extraction.text import TfidfVectorizer
13
+ from sklearn.metrics.pairwise import cosine_similarity
 
14
  from rank_bm25 import BM25Okapi
15
  from gensim.models import Word2Vec
16
  from typing import List, Optional, Tuple
17
  import gradio as gr
18
 
 
19
  logger = logging.getLogger(__name__)
20
+ logging.basicConfig(level=logging.INFO)
21
+
22
  api_key = os.getenv("MISTRAL_API_KEY")
23
  client = Mistral(api_key=api_key)
24
 
25
+ def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10, max_delay=60):
 
26
  embeddings = []
27
  for text in text_list:
28
  retries = 0
 
31
  try:
32
  token_count = len(text.split())
33
  if token_count > 16384:
34
+ logger.warning("Text chunk exceeds the token limit. Truncating the text.")
35
  text = " ".join(text.split()[:16384])
36
  response = client.embeddings.create(model="mistral-embed", inputs=[text])
37
  embeddings.extend([embedding.embedding for embedding in response.data])
 
39
  break
40
  except Exception as e:
41
  retries += 1
42
+ logger.warning(f"Rate limit exceeded, retrying in {delay} seconds... (Attempt {retries}/{max_retries})")
43
  time.sleep(delay)
44
+ delay = min(delay * 2, max_delay)
45
  if retries == max_retries:
46
+ logger.error("Max retries reached. Skipping this chunk.")
47
  break
48
  return embeddings
49
 
 
50
  def store_embeddings_in_vector_db(
51
  pdf_path: str,
52
  vector_db_path: str,
 
90
  annoy_index.save(annoy_index_path)
91
  logging.info(f"Annoy index built with {len(all_embeddings)} items and saved to {annoy_index_path}.")
92
 
 
93
  def split_text_into_chunks(text: str, chunk_size: int = 2048, overlap: int = 200) -> List[str]:
94
  tokens = text.split()
95
  chunks = []
 
109
  self.word2vec_model = self.train_word2vec(self.texts)
110
  self.reranking_methods = {
111
  'advanced_fusion': self.advanced_fusion_retrieval
112
+ }
113
  logging.info("MistralRAGChatbot initialized successfully.")
114
 
 
115
  def load_vector_db(self, vector_db_path: str) -> Tuple[np.ndarray, List[str]]:
116
  with open(vector_db_path, "rb") as f:
117
  data = dill.load(f)
 
121
  return embeddings, texts
122
 
123
def load_annoy_index(self, annoy_index_path: str, embedding_dim: int) -> AnnoyIndex:
    """Load a previously saved Annoy index from disk.

    Args:
        annoy_index_path: Path of the serialized index file.
        embedding_dim: Dimensionality the index was built with; must match,
            otherwise Annoy's load will misbehave.

    Returns:
        The loaded AnnoyIndex (angular metric, matching how it was built).

    Raises:
        FileNotFoundError: If no index file exists at ``annoy_index_path``.
    """
    # Fail fast with a clear error instead of Annoy's opaque load failure.
    if not os.path.exists(annoy_index_path):
        raise FileNotFoundError(f"Annoy index file {annoy_index_path} not found.")
    index = AnnoyIndex(embedding_dim, 'angular')
    index.load(annoy_index_path)
    logging.info(f"Loaded Annoy index from {annoy_index_path}.")
    return index
130
 
 
131
  def train_word2vec(self, texts: List[str]) -> Word2Vec:
132
  tokenized_texts = [text.split() for text in texts]
133
  model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)
 
140
  return np.array(response.data[0].embedding)
141
  except Exception as e:
142
  logging.error(f"Error fetching embedding: {e}")
143
+ return np.zeros((1024,))
144
 
145
  def advanced_fusion_retrieval(self, user_query: str, docs: List[dict]) -> List[dict]:
146
  query_embedding = self.create_embeddings([user_query])[0]
 
163
  combined_scores[idx] = (
164
  0.5 * vector_scores.get(idx, 0) +
165
  0.3 * bm25_scores.get(idx, 0) +
166
+ 0.2 * pagerank_scores[idx] if idx < len(pagerank_scores) else 0
167
  )
168
+
169
  min_score = min(combined_scores.values())
170
  max_score = max(combined_scores.values())
171
+
 
172
  normalized_scores = {idx: (score - min_score) / (max_score - min_score) for idx, score in combined_scores.items()}
173
+
 
174
  sorted_indices = sorted(combined_scores, key=combined_scores.get, reverse=True)
175
 
176
  return [{'text': self.texts[i], 'method': 'advanced_fusion', 'score': normalized_scores[i], 'index': i} for i in sorted_indices[:5]]
177
 
178
  def create_embeddings(self, text_list: List[str]) -> np.ndarray:
179
+ expected_dim = 1024
180
  embeddings = []
181
  for text in text_list:
182
  word_vectors = [self.word2vec_model.wv[token] for token in text.split() if token in self.word2vec_model.wv]
 
198
  selected_reranking_methods: Optional[List[str]] = None
199
  ) -> Tuple[str, List[str], List[dict]]:
200
  if not selected_retrieval_methods:
201
+ selected_retrieval_methods = ['annoy', 'bm25']
202
  if not selected_reranking_methods:
203
+ selected_reranking_methods = ['advanced_fusion']
204
  query_embedding = await self.get_text_embedding(user_query)
205
  retrieved_docs = self.retrieve_documents(user_query, query_embedding, top_k, selected_retrieval_methods)
206
  reranked_docs = self.rerank_documents(user_query, retrieved_docs, selected_reranking_methods)
 
239
def retrieve_with_annoy(self, user_query: str, query_embedding: np.ndarray, top_k: int) -> Tuple[List[int], List[float]]:
    """Retrieve the nearest documents to *query_embedding* from the Annoy index.

    Args:
        user_query: Raw query text (unused here; kept so all retrievers share
            one signature).
        query_embedding: Query vector; must match the index dimensionality.
        top_k: Maximum number of documents to return (capped at corpus size).

    Returns:
        Tuple of (document indices, scores). Scores are relative similarities
        in [0, 1] derived from angular distances — higher is better.
    """
    n_results = min(top_k, len(self.texts))
    indices, distances = self.annoy_index.get_nns_by_vector(query_embedding, n_results, include_distances=True)
    # Convert distances to relative similarity scores. Guard against division
    # by zero when every returned distance is 0 (query identical to the hits);
    # in that case all hits are perfect matches and score 1.0.
    max_dist = max(distances) if distances else 0.0
    if max_dist > 0:
        scores = [1.0 - (dist / max_dist) for dist in distances]
    else:
        scores = [1.0 for _ in distances]
    logging.debug(f"Annoy retrieval returned {len(indices)} documents.")
    return indices, scores
245
 
 
246
  def retrieve_with_bm25(self, user_query: str, query_embedding: np.ndarray, top_k: int) -> Tuple[List[int], List[float]]:
247
  tokenized_query = user_query.split()
248
  scores = self.bm25.get_scores(tokenized_query)
 
250
  logging.debug(f"BM25 retrieval returned {len(indices)} documents.")
251
  return indices, scores[indices].tolist()
252
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
  def rerank_documents(
254
  self,
255
  user_query: str,
 
265
 
266
  return reranked_docs
267
 
 
268
  def build_prompt(self, context: str, user_query: str, response_style: str) -> str:
269
  styles = {
270
  "detailed": "Provide a comprehensive and detailed answer based on the provided context.",
 
301
  common_terms = query_terms.intersection(context_terms)
302
  return len(common_terms) > len(query_terms) * 0.2
303
 
 
304
def create_vector_db_and_annoy_index(pdf_path, vector_db_path, annoy_index_path):
    """Build and persist the embeddings vector DB and Annoy index for a PDF.

    Thin convenience wrapper around ``store_embeddings_in_vector_db`` that
    reports completion on stdout.
    """
    store_embeddings_in_vector_db(pdf_path, vector_db_path, annoy_index_path)
    print("Vector database and Annoy index creation completed.")
307
 
 
308
  def chatbot_interface(file, user_query, response_style):
309
  vector_db_path = "vector_db.pkl"
310
  annoy_index_path = "vector_index.ann"
311
  chunk_size = 2048
312
  overlap = 200
313
+ store_embeddings_in_vector_db(file.name, vector_db_path, annoy_index_path, chunk_size, overlap)
314
 
315
  chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
316
 
 
332
  formatted_response += f"Retrieval Method: {doc_info['method']}\n"
333
  if 'score' in doc_info:
334
  formatted_response += f"Precision Score: {doc_info['score']:.4f}\n"
 
 
 
335
 
336
  return formatted_response
337
+
338
# Assemble the Gradio UI: a logo banner above the chat interface.
iface = gr.Blocks()
with iface:
    gr.Image("images/chanceRAG_logo.jpg", label="Image", show_label=False)
    gr.Interface(
        fn=chatbot_interface,
        inputs=[
            gr.File(label="Upload a File"),
            gr.Textbox(lines=5, label="User Query"),
            # NOTE(review): values are capitalized while the style keys in
            # build_prompt appear lowercase ("detailed", ...) — confirm the
            # handler normalizes case before lookup.
            gr.Dropdown(["Detailed", "Concise", "Creative", "Technical"], label="Response Style"),
        ],
        outputs=gr.Textbox(label="ChanceRAG Response"),
    )

# share=True exposes a public Gradio link in addition to the local server.
iface.launch(share=True)