Spaces:
Running
Running
Update app.py
Browse files · pagerank_score error fixed
app.py
CHANGED
|
@@ -9,21 +9,20 @@ import asyncio
|
|
| 9 |
import networkx as nx
|
| 10 |
from mistralai import Mistral
|
| 11 |
from annoy import AnnoyIndex
|
| 12 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 13 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 14 |
-
from sklearn.preprocessing import normalize
|
| 15 |
from rank_bm25 import BM25Okapi
|
| 16 |
from gensim.models import Word2Vec
|
| 17 |
from typing import List, Optional, Tuple
|
| 18 |
import gradio as gr
|
| 19 |
|
| 20 |
-
|
| 21 |
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
| 22 |
api_key = os.getenv("MISTRAL_API_KEY")
|
| 23 |
client = Mistral(api_key=api_key)
|
| 24 |
|
| 25 |
-
|
| 26 |
-
def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10):
|
| 27 |
embeddings = []
|
| 28 |
for text in text_list:
|
| 29 |
retries = 0
|
|
@@ -32,7 +31,7 @@ def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=1
|
|
| 32 |
try:
|
| 33 |
token_count = len(text.split())
|
| 34 |
if token_count > 16384:
|
| 35 |
-
|
| 36 |
text = " ".join(text.split()[:16384])
|
| 37 |
response = client.embeddings.create(model="mistral-embed", inputs=[text])
|
| 38 |
embeddings.extend([embedding.embedding for embedding in response.data])
|
|
@@ -40,15 +39,14 @@ def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=1
|
|
| 40 |
break
|
| 41 |
except Exception as e:
|
| 42 |
retries += 1
|
| 43 |
-
|
| 44 |
time.sleep(delay)
|
| 45 |
-
delay
|
| 46 |
if retries == max_retries:
|
| 47 |
-
|
| 48 |
break
|
| 49 |
return embeddings
|
| 50 |
|
| 51 |
-
|
| 52 |
def store_embeddings_in_vector_db(
|
| 53 |
pdf_path: str,
|
| 54 |
vector_db_path: str,
|
|
@@ -92,7 +90,6 @@ def store_embeddings_in_vector_db(
|
|
| 92 |
annoy_index.save(annoy_index_path)
|
| 93 |
logging.info(f"Annoy index built with {len(all_embeddings)} items and saved to {annoy_index_path}.")
|
| 94 |
|
| 95 |
-
|
| 96 |
def split_text_into_chunks(text: str, chunk_size: int = 2048, overlap: int = 200) -> List[str]:
|
| 97 |
tokens = text.split()
|
| 98 |
chunks = []
|
|
@@ -112,10 +109,9 @@ class MistralRAGChatbot:
|
|
| 112 |
self.word2vec_model = self.train_word2vec(self.texts)
|
| 113 |
self.reranking_methods = {
|
| 114 |
'advanced_fusion': self.advanced_fusion_retrieval
|
| 115 |
-
|
| 116 |
logging.info("MistralRAGChatbot initialized successfully.")
|
| 117 |
|
| 118 |
-
|
| 119 |
def load_vector_db(self, vector_db_path: str) -> Tuple[np.ndarray, List[str]]:
|
| 120 |
with open(vector_db_path, "rb") as f:
|
| 121 |
data = dill.load(f)
|
|
@@ -125,12 +121,13 @@ class MistralRAGChatbot:
|
|
| 125 |
return embeddings, texts
|
| 126 |
|
| 127 |
def load_annoy_index(self, annoy_index_path: str, embedding_dim: int) -> AnnoyIndex:
|
|
|
|
|
|
|
| 128 |
annoy_index = AnnoyIndex(embedding_dim, 'angular')
|
| 129 |
annoy_index.load(annoy_index_path)
|
| 130 |
logging.info(f"Loaded Annoy index from {annoy_index_path}.")
|
| 131 |
return annoy_index
|
| 132 |
|
| 133 |
-
|
| 134 |
def train_word2vec(self, texts: List[str]) -> Word2Vec:
|
| 135 |
tokenized_texts = [text.split() for text in texts]
|
| 136 |
model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)
|
|
@@ -143,7 +140,7 @@ class MistralRAGChatbot:
|
|
| 143 |
return np.array(response.data[0].embedding)
|
| 144 |
except Exception as e:
|
| 145 |
logging.error(f"Error fetching embedding: {e}")
|
| 146 |
-
return np.zeros((1024,))
|
| 147 |
|
| 148 |
def advanced_fusion_retrieval(self, user_query: str, docs: List[dict]) -> List[dict]:
|
| 149 |
query_embedding = self.create_embeddings([user_query])[0]
|
|
@@ -166,22 +163,20 @@ class MistralRAGChatbot:
|
|
| 166 |
combined_scores[idx] = (
|
| 167 |
0.5 * vector_scores.get(idx, 0) +
|
| 168 |
0.3 * bm25_scores.get(idx, 0) +
|
| 169 |
-
0.2 * pagerank_scores[idx]
|
| 170 |
)
|
| 171 |
-
|
| 172 |
min_score = min(combined_scores.values())
|
| 173 |
max_score = max(combined_scores.values())
|
| 174 |
-
|
| 175 |
-
|
| 176 |
normalized_scores = {idx: (score - min_score) / (max_score - min_score) for idx, score in combined_scores.items()}
|
| 177 |
-
|
| 178 |
-
|
| 179 |
sorted_indices = sorted(combined_scores, key=combined_scores.get, reverse=True)
|
| 180 |
|
| 181 |
return [{'text': self.texts[i], 'method': 'advanced_fusion', 'score': normalized_scores[i], 'index': i} for i in sorted_indices[:5]]
|
| 182 |
|
| 183 |
def create_embeddings(self, text_list: List[str]) -> np.ndarray:
|
| 184 |
-
expected_dim = 1024
|
| 185 |
embeddings = []
|
| 186 |
for text in text_list:
|
| 187 |
word_vectors = [self.word2vec_model.wv[token] for token in text.split() if token in self.word2vec_model.wv]
|
|
@@ -203,9 +198,9 @@ class MistralRAGChatbot:
|
|
| 203 |
selected_reranking_methods: Optional[List[str]] = None
|
| 204 |
) -> Tuple[str, List[str], List[dict]]:
|
| 205 |
if not selected_retrieval_methods:
|
| 206 |
-
selected_retrieval_methods = ['annoy', '
|
| 207 |
if not selected_reranking_methods:
|
| 208 |
-
selected_reranking_methods = ['
|
| 209 |
query_embedding = await self.get_text_embedding(user_query)
|
| 210 |
retrieved_docs = self.retrieve_documents(user_query, query_embedding, top_k, selected_retrieval_methods)
|
| 211 |
reranked_docs = self.rerank_documents(user_query, retrieved_docs, selected_reranking_methods)
|
|
@@ -244,11 +239,10 @@ class MistralRAGChatbot:
|
|
| 244 |
def retrieve_with_annoy(self, user_query: str, query_embedding: np.ndarray, top_k: int) -> Tuple[List[int], List[float]]:
|
| 245 |
n_results = min(top_k, len(self.texts))
|
| 246 |
indices, distances = self.annoy_index.get_nns_by_vector(query_embedding, n_results, include_distances=True)
|
| 247 |
-
scores = [1.0 - (dist / max(distances)) for dist in distances]
|
| 248 |
logging.debug(f"Annoy retrieval returned {len(indices)} documents.")
|
| 249 |
return indices, scores
|
| 250 |
|
| 251 |
-
|
| 252 |
def retrieve_with_bm25(self, user_query: str, query_embedding: np.ndarray, top_k: int) -> Tuple[List[int], List[float]]:
|
| 253 |
tokenized_query = user_query.split()
|
| 254 |
scores = self.bm25.get_scores(tokenized_query)
|
|
@@ -256,21 +250,6 @@ class MistralRAGChatbot:
|
|
| 256 |
logging.debug(f"BM25 retrieval returned {len(indices)} documents.")
|
| 257 |
return indices, scores[indices].tolist()
|
| 258 |
|
| 259 |
-
def retrieve_with_word2vec(self, user_query: str, query_embedding: np.ndarray, top_k: int) -> Tuple[List[int], List[float]]:
|
| 260 |
-
query_tokens = user_query.split()
|
| 261 |
-
query_vec = np.mean([self.word2vec_model.wv[token] for token in query_tokens if token in self.word2vec_model.wv], axis=0)
|
| 262 |
-
expected_dim = query_vec.shape[0]
|
| 263 |
-
doc_vectors = []
|
| 264 |
-
for doc in self.texts:
|
| 265 |
-
word_vectors = [self.word2vec_model.wv[token] for token in doc.split() if token in self.word2vec_model.wv]
|
| 266 |
-
avg_vector = np.mean(word_vectors, axis=0) if word_vectors else np.zeros(expected_dim)
|
| 267 |
-
doc_vectors.append(avg_vector)
|
| 268 |
-
doc_vectors = np.array(doc_vectors)
|
| 269 |
-
similarities = cosine_similarity([query_vec], doc_vectors).flatten()
|
| 270 |
-
indices = np.argsort(-similarities)[:top_k]
|
| 271 |
-
return indices, similarities[indices].tolist()
|
| 272 |
-
|
| 273 |
-
|
| 274 |
def rerank_documents(
|
| 275 |
self,
|
| 276 |
user_query: str,
|
|
@@ -286,7 +265,6 @@ class MistralRAGChatbot:
|
|
| 286 |
|
| 287 |
return reranked_docs
|
| 288 |
|
| 289 |
-
|
| 290 |
def build_prompt(self, context: str, user_query: str, response_style: str) -> str:
|
| 291 |
styles = {
|
| 292 |
"detailed": "Provide a comprehensive and detailed answer based on the provided context.",
|
|
@@ -323,18 +301,16 @@ class MistralRAGChatbot:
|
|
| 323 |
common_terms = query_terms.intersection(context_terms)
|
| 324 |
return len(common_terms) > len(query_terms) * 0.2
|
| 325 |
|
| 326 |
-
|
| 327 |
def create_vector_db_and_annoy_index(pdf_path, vector_db_path, annoy_index_path):
|
| 328 |
store_embeddings_in_vector_db(pdf_path, vector_db_path, annoy_index_path)
|
| 329 |
print("Vector database and Annoy index creation completed.")
|
| 330 |
|
| 331 |
-
|
| 332 |
def chatbot_interface(file, user_query, response_style):
|
| 333 |
vector_db_path = "vector_db.pkl"
|
| 334 |
annoy_index_path = "vector_index.ann"
|
| 335 |
chunk_size = 2048
|
| 336 |
overlap = 200
|
| 337 |
-
store_embeddings_in_vector_db(file.name,
|
| 338 |
|
| 339 |
chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
|
| 340 |
|
|
@@ -356,26 +332,22 @@ def chatbot_interface(file, user_query, response_style):
|
|
| 356 |
formatted_response += f"Retrieval Method: {doc_info['method']}\n"
|
| 357 |
if 'score' in doc_info:
|
| 358 |
formatted_response += f"Precision Score: {doc_info['score']:.4f}\n"
|
| 359 |
-
for key, value in doc_info.items():
|
| 360 |
-
if key.endswith('_score') and key != 'score':
|
| 361 |
-
formatted_response += f"{key.replace('_', ' ').title()}: {value:.4f}\n"
|
| 362 |
|
| 363 |
return formatted_response
|
| 364 |
-
|
|
|
|
| 365 |
with iface:
|
| 366 |
-
gr.Image("images/chanceRAG_logo.jpg", label="Image", show_label
|
| 367 |
gr.Interface(
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
),
|
| 377 |
-
|
| 378 |
-
outputs=gr.Textbox(label="ChanceRAG Response"),
|
| 379 |
-
)
|
| 380 |
|
| 381 |
iface.launch(share=True)
|
|
|
|
| 9 |
import networkx as nx
|
| 10 |
from mistralai import Mistral
|
| 11 |
from annoy import AnnoyIndex
|
| 12 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 13 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
| 14 |
from rank_bm25 import BM25Okapi
|
| 15 |
from gensim.models import Word2Vec
|
| 16 |
from typing import List, Optional, Tuple
|
| 17 |
import gradio as gr
|
| 18 |
|
|
|
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
+
logging.basicConfig(level=logging.INFO)
|
| 21 |
+
|
| 22 |
api_key = os.getenv("MISTRAL_API_KEY")
|
| 23 |
client = Mistral(api_key=api_key)
|
| 24 |
|
| 25 |
+
def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10, max_delay=60):
|
|
|
|
| 26 |
embeddings = []
|
| 27 |
for text in text_list:
|
| 28 |
retries = 0
|
|
|
|
| 31 |
try:
|
| 32 |
token_count = len(text.split())
|
| 33 |
if token_count > 16384:
|
| 34 |
+
logger.warning("Text chunk exceeds the token limit. Truncating the text.")
|
| 35 |
text = " ".join(text.split()[:16384])
|
| 36 |
response = client.embeddings.create(model="mistral-embed", inputs=[text])
|
| 37 |
embeddings.extend([embedding.embedding for embedding in response.data])
|
|
|
|
| 39 |
break
|
| 40 |
except Exception as e:
|
| 41 |
retries += 1
|
| 42 |
+
logger.warning(f"Rate limit exceeded, retrying in {delay} seconds... (Attempt {retries}/{max_retries})")
|
| 43 |
time.sleep(delay)
|
| 44 |
+
delay = min(delay * 2, max_delay)
|
| 45 |
if retries == max_retries:
|
| 46 |
+
logger.error("Max retries reached. Skipping this chunk.")
|
| 47 |
break
|
| 48 |
return embeddings
|
| 49 |
|
|
|
|
| 50 |
def store_embeddings_in_vector_db(
|
| 51 |
pdf_path: str,
|
| 52 |
vector_db_path: str,
|
|
|
|
| 90 |
annoy_index.save(annoy_index_path)
|
| 91 |
logging.info(f"Annoy index built with {len(all_embeddings)} items and saved to {annoy_index_path}.")
|
| 92 |
|
|
|
|
| 93 |
def split_text_into_chunks(text: str, chunk_size: int = 2048, overlap: int = 200) -> List[str]:
|
| 94 |
tokens = text.split()
|
| 95 |
chunks = []
|
|
|
|
| 109 |
self.word2vec_model = self.train_word2vec(self.texts)
|
| 110 |
self.reranking_methods = {
|
| 111 |
'advanced_fusion': self.advanced_fusion_retrieval
|
| 112 |
+
}
|
| 113 |
logging.info("MistralRAGChatbot initialized successfully.")
|
| 114 |
|
|
|
|
| 115 |
def load_vector_db(self, vector_db_path: str) -> Tuple[np.ndarray, List[str]]:
|
| 116 |
with open(vector_db_path, "rb") as f:
|
| 117 |
data = dill.load(f)
|
|
|
|
| 121 |
return embeddings, texts
|
| 122 |
|
| 123 |
def load_annoy_index(self, annoy_index_path: str, embedding_dim: int) -> "AnnoyIndex":
    """Load a previously saved Annoy index from disk.

    Args:
        annoy_index_path: Path of the index file to load.
        embedding_dim: Vector dimensionality passed to ``AnnoyIndex``.

    Returns:
        An ``AnnoyIndex`` created with the ``'angular'`` metric and loaded
        from ``annoy_index_path``.

    Raises:
        FileNotFoundError: If ``annoy_index_path`` is not an existing
            regular file.
    """
    # isfile() rather than exists(): a directory at this path would pass an
    # exists() check and then be handed to AnnoyIndex.load().
    if not os.path.isfile(annoy_index_path):
        raise FileNotFoundError(f"Annoy index file {annoy_index_path} not found.")
    annoy_index = AnnoyIndex(embedding_dim, 'angular')
    annoy_index.load(annoy_index_path)
    logging.info(f"Loaded Annoy index from {annoy_index_path}.")
    return annoy_index
|
| 130 |
|
|
|
|
| 131 |
def train_word2vec(self, texts: List[str]) -> Word2Vec:
|
| 132 |
tokenized_texts = [text.split() for text in texts]
|
| 133 |
model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)
|
|
|
|
| 140 |
return np.array(response.data[0].embedding)
|
| 141 |
except Exception as e:
|
| 142 |
logging.error(f"Error fetching embedding: {e}")
|
| 143 |
+
return np.zeros((1024,))
|
| 144 |
|
| 145 |
def advanced_fusion_retrieval(self, user_query: str, docs: List[dict]) -> List[dict]:
|
| 146 |
query_embedding = self.create_embeddings([user_query])[0]
|
|
|
|
| 163 |
combined_scores[idx] = (
|
| 164 |
0.5 * vector_scores.get(idx, 0) +
|
| 165 |
0.3 * bm25_scores.get(idx, 0) +
|
| 166 |
+
0.2 * pagerank_scores[idx] if idx < len(pagerank_scores) else 0
|
| 167 |
)
|
| 168 |
+
|
| 169 |
min_score = min(combined_scores.values())
|
| 170 |
max_score = max(combined_scores.values())
|
| 171 |
+
|
|
|
|
| 172 |
normalized_scores = {idx: (score - min_score) / (max_score - min_score) for idx, score in combined_scores.items()}
|
| 173 |
+
|
|
|
|
| 174 |
sorted_indices = sorted(combined_scores, key=combined_scores.get, reverse=True)
|
| 175 |
|
| 176 |
return [{'text': self.texts[i], 'method': 'advanced_fusion', 'score': normalized_scores[i], 'index': i} for i in sorted_indices[:5]]
|
| 177 |
|
| 178 |
def create_embeddings(self, text_list: List[str]) -> np.ndarray:
|
| 179 |
+
expected_dim = 1024
|
| 180 |
embeddings = []
|
| 181 |
for text in text_list:
|
| 182 |
word_vectors = [self.word2vec_model.wv[token] for token in text.split() if token in self.word2vec_model.wv]
|
|
|
|
| 198 |
selected_reranking_methods: Optional[List[str]] = None
|
| 199 |
) -> Tuple[str, List[str], List[dict]]:
|
| 200 |
if not selected_retrieval_methods:
|
| 201 |
+
selected_retrieval_methods = ['annoy', 'bm25']
|
| 202 |
if not selected_reranking_methods:
|
| 203 |
+
selected_reranking_methods = ['advanced_fusion']
|
| 204 |
query_embedding = await self.get_text_embedding(user_query)
|
| 205 |
retrieved_docs = self.retrieve_documents(user_query, query_embedding, top_k, selected_retrieval_methods)
|
| 206 |
reranked_docs = self.rerank_documents(user_query, retrieved_docs, selected_reranking_methods)
|
|
|
|
| 239 |
def retrieve_with_annoy(self, user_query: str, query_embedding: np.ndarray, top_k: int) -> Tuple[List[int], List[float]]:
    """Return the nearest neighbours of ``query_embedding`` from the Annoy index.

    Args:
        user_query: Not used by this method.
        query_embedding: Query vector handed to ``get_nns_by_vector``.
        top_k: Maximum number of neighbours; capped at ``len(self.texts)``.

    Returns:
        ``(indices, scores)``. Each score is ``1 - distance / max_distance``
        over the returned neighbours, so the farthest neighbour scores 0.0.
        If every returned distance is 0, all scores are 1.0. If nothing is
        returned, both lists are empty.
    """
    n_results = min(top_k, len(self.texts))
    indices, distances = self.annoy_index.get_nns_by_vector(query_embedding, n_results, include_distances=True)
    # Take the maximum once rather than once per element.
    max_distance = max(distances, default=0.0)
    if max_distance > 0:
        scores = [1.0 - (dist / max_distance) for dist in distances]
    else:
        # All neighbours are at distance 0 (or there are none); dividing by
        # the maximum would raise ZeroDivisionError.
        scores = [1.0] * len(distances)
    logging.debug(f"Annoy retrieval returned {len(indices)} documents.")
    return indices, scores
|
| 245 |
|
|
|
|
| 246 |
def retrieve_with_bm25(self, user_query: str, query_embedding: np.ndarray, top_k: int) -> Tuple[List[int], List[float]]:
|
| 247 |
tokenized_query = user_query.split()
|
| 248 |
scores = self.bm25.get_scores(tokenized_query)
|
|
|
|
| 250 |
logging.debug(f"BM25 retrieval returned {len(indices)} documents.")
|
| 251 |
return indices, scores[indices].tolist()
|
| 252 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
def rerank_documents(
|
| 254 |
self,
|
| 255 |
user_query: str,
|
|
|
|
| 265 |
|
| 266 |
return reranked_docs
|
| 267 |
|
|
|
|
| 268 |
def build_prompt(self, context: str, user_query: str, response_style: str) -> str:
|
| 269 |
styles = {
|
| 270 |
"detailed": "Provide a comprehensive and detailed answer based on the provided context.",
|
|
|
|
| 301 |
common_terms = query_terms.intersection(context_terms)
|
| 302 |
return len(common_terms) > len(query_terms) * 0.2
|
| 303 |
|
|
|
|
| 304 |
def create_vector_db_and_annoy_index(pdf_path, vector_db_path, annoy_index_path):
    """Build the vector database and Annoy index, then report completion.

    Forwards all three arguments unchanged, in order, to
    ``store_embeddings_in_vector_db`` and prints a confirmation message once
    that call returns.
    """
    build_args = (pdf_path, vector_db_path, annoy_index_path)
    store_embeddings_in_vector_db(*build_args)
    completion_message = "Vector database and Annoy index creation completed."
    print(completion_message)
|
| 307 |
|
|
|
|
| 308 |
def chatbot_interface(file, user_query, response_style):
|
| 309 |
vector_db_path = "vector_db.pkl"
|
| 310 |
annoy_index_path = "vector_index.ann"
|
| 311 |
chunk_size = 2048
|
| 312 |
overlap = 200
|
| 313 |
+
store_embeddings_in_vector_db(file.name, vector_db_path, annoy_index_path, chunk_size, overlap)
|
| 314 |
|
| 315 |
chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
|
| 316 |
|
|
|
|
| 332 |
formatted_response += f"Retrieval Method: {doc_info['method']}\n"
|
| 333 |
if 'score' in doc_info:
|
| 334 |
formatted_response += f"Precision Score: {doc_info['score']:.4f}\n"
|
|
|
|
|
|
|
|
|
|
| 335 |
|
| 336 |
return formatted_response
|
| 337 |
+
|
| 338 |
+
# Gradio UI: a logo image followed by a form that routes its inputs to
# chatbot_interface and shows the returned text.
with gr.Blocks() as iface:
    gr.Image("images/chanceRAG_logo.jpg", label="Image", show_label=False)
    gr.Interface(
        fn=chatbot_interface,
        inputs=[
            gr.File(label="Upload a File"),
            gr.Textbox(lines=5, label="User Query"),
            # NOTE(review): these choices are capitalised, while the styles
            # mapping in build_prompt shows a lowercase "detailed" key.
            # Confirm the selected style is case-normalised before lookup.
            gr.Dropdown(
                choices=["Detailed", "Concise", "Creative", "Technical"],
                label="Response Style",
            ),
        ],
        outputs=gr.Textbox(label="ChanceRAG Response"),
    )

# share=True asks Gradio to create a public share link as well as the local server.
iface.launch(share=True)
|