Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,7 +15,7 @@ from sklearn.preprocessing import normalize
|
|
| 15 |
from rank_bm25 import BM25Okapi
|
| 16 |
from gensim.models import Word2Vec
|
| 17 |
from typing import List, Optional, Tuple
|
| 18 |
-
import
|
| 19 |
|
| 20 |
|
| 21 |
logger = logging.getLogger(__name__)
|
|
@@ -152,7 +152,7 @@ class MistralRAGChatbot:
|
|
| 152 |
return np.array(response.data[0].embedding)
|
| 153 |
except Exception as e:
|
| 154 |
logging.error(f"Error fetching embedding: {e}")
|
| 155 |
-
return np.zeros((1024,))
|
| 156 |
|
| 157 |
def advanced_fusion_retrieval(self, user_query: str, docs: List[dict]) -> List[dict]:
|
| 158 |
query_embedding = self.create_embeddings([user_query])[0]
|
|
@@ -183,7 +183,7 @@ class MistralRAGChatbot:
|
|
| 183 |
return [{'text': self.texts[i], 'method': 'advanced_fusion', 'score': combined_scores[i], 'index': i} for i in sorted_indices[:5]]
|
| 184 |
|
| 185 |
def create_embeddings(self, text_list: List[str]) -> np.ndarray:
|
| 186 |
-
expected_dim = 1024
|
| 187 |
embeddings = []
|
| 188 |
for text in text_list:
|
| 189 |
word_vectors = [self.word2vec_model.wv[token] for token in text.split() if token in self.word2vec_model.wv]
|
|
@@ -195,7 +195,6 @@ class MistralRAGChatbot:
|
|
| 195 |
embeddings.append(avg_embedding)
|
| 196 |
return np.array(embeddings, dtype=np.float32)
|
| 197 |
|
| 198 |
-
|
| 199 |
async def generate_response_with_rag(
|
| 200 |
self,
|
| 201 |
user_query: str,
|
|
@@ -225,7 +224,6 @@ class MistralRAGChatbot:
|
|
| 225 |
response = "An error occurred while generating the response."
|
| 226 |
return response, [doc['text'] for doc in reranked_docs[:5]], reranked_docs[:5]
|
| 227 |
|
| 228 |
-
|
| 229 |
def retrieve_documents(
|
| 230 |
self,
|
| 231 |
user_query: str,
|
|
@@ -248,7 +246,7 @@ class MistralRAGChatbot:
|
|
| 248 |
def retrieve_with_annoy(self, user_query: str, query_embedding: np.ndarray, top_k: int) -> Tuple[List[int], List[float]]:
    """Look up the nearest neighbours of a query in the Annoy index.

    Args:
        user_query: Raw query text. Not used by this method itself.
        query_embedding: Query vector handed to
            ``self.annoy_index.get_nns_by_vector``.
        top_k: Maximum number of neighbours requested; capped at
            ``len(self.texts)``.

    Returns:
        A ``(indices, scores)`` tuple. Each score is
        ``1.0 - distance / max(distances)``, so the farthest returned
        neighbour scores 0.0. When the largest distance is zero every
        result gets a score of 1.0, and when the index returns nothing
        the score list is empty.
    """
    n_results = min(top_k, len(self.texts))
    indices, distances = self.annoy_index.get_nns_by_vector(query_embedding, n_results, include_distances=True)

    # Compute the normalizer once. ``default`` avoids a ValueError from
    # ``max()`` when the index returned no neighbours.
    max_distance = max(distances, default=0.0)
    if max_distance != 0:
        scores = [1.0 - (dist / max_distance) for dist in distances]
    else:
        # A zero maximum would divide by zero (e.g. a single exact match);
        # treat those results as perfect matches instead.
        scores = [1.0] * len(distances)

    logging.debug(f"Annoy retrieval returned {len(indices)} documents.")
    return indices, scores
|
| 254 |
|
|
@@ -315,9 +313,9 @@ class MistralRAGChatbot:
|
|
| 315 |
return reranked_docs
|
| 316 |
|
| 317 |
def reciprocal_rank_fusion(self, user_query: str, docs: List[dict]) -> List[dict]:
|
| 318 |
-
k = 60
|
| 319 |
method_ranks = {}
|
| 320 |
-
fused_scores = {}
|
| 321 |
for doc in docs:
|
| 322 |
method = doc['method']
|
| 323 |
if method not in method_ranks:
|
|
@@ -328,9 +326,9 @@ class MistralRAGChatbot:
|
|
| 328 |
idx = doc['index']
|
| 329 |
if idx not in fused_scores:
|
| 330 |
fused_scores[idx] = sum(1 / (k + rank) for method_rank in method_ranks.values() for i, rank in method_rank.items() if i == idx)
|
| 331 |
-
reranked_docs = sorted(docs, key=lambda x: fused_scores.get(x['index'], 0), reverse=True)
|
| 332 |
for doc in reranked_docs:
|
| 333 |
-
doc['rrf_score'] = fused_scores.get(doc['index'], 0)
|
| 334 |
return reranked_docs
|
| 335 |
|
| 336 |
def weighted_score_fusion(self, user_query: str, docs: List[dict]) -> List[dict]:
|
|
@@ -405,17 +403,9 @@ def create_vector_db_and_annoy_index(pdf_path, vector_db_path, annoy_index_path)
|
|
| 405 |
print("Vector database and Annoy index creation completed.")
|
| 406 |
|
| 407 |
|
| 408 |
-
|
| 409 |
-
import gradio as gr
|
| 410 |
-
|
| 411 |
def chatbot_interface(file, user_query, response_style, selected_retrieval_methods, selected_reranking_methods, chunk_size, overlap):
|
| 412 |
vector_db_path = "vector_db.pkl"
|
| 413 |
annoy_index_path = "vector_index.ann"
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
|
| 420 |
store_embeddings_in_vector_db(file.name, 'vector_db.pkl', 'vector_index.ann', chunk_size, overlap)
|
| 421 |
|
|
@@ -451,7 +441,7 @@ iface = gr.Interface(
|
|
| 451 |
gr.File(label="Upload a PDF"),
|
| 452 |
gr.Textbox(lines=5, label="User Query"),
|
| 453 |
gr.Dropdown(["Detailed", "Concise", "Creative", "Technical"], label="Response Style"),
|
| 454 |
-
gr.Dropdown(["annoy", "tfidf", "bm25", "euclidean", "jaccard"], label="Retrieval Methods", multiselect=True),
|
| 455 |
gr.Dropdown(["advanced_fusion", "reciprocal_rank_fusion", "weighted_score_fusion", "semantic_similarity"], label="Reranking Methods"),
|
| 456 |
gr.Slider(minimum=1024, maximum=2048, step=128, value=2048, label="Chunk Size"),
|
| 457 |
gr.Slider(minimum=100, maximum=300, step=100, value=200, label="Overlap")
|
|
|
|
| 15 |
from rank_bm25 import BM25Okapi
|
| 16 |
from gensim.models import Word2Vec
|
| 17 |
from typing import List, Optional, Tuple
|
| 18 |
+
import gradio as gr
|
| 19 |
|
| 20 |
|
| 21 |
logger = logging.getLogger(__name__)
|
|
|
|
| 152 |
return np.array(response.data[0].embedding)
|
| 153 |
except Exception as e:
|
| 154 |
logging.error(f"Error fetching embedding: {e}")
|
| 155 |
+
return np.zeros((1024,))
|
| 156 |
|
| 157 |
def advanced_fusion_retrieval(self, user_query: str, docs: List[dict]) -> List[dict]:
|
| 158 |
query_embedding = self.create_embeddings([user_query])[0]
|
|
|
|
| 183 |
return [{'text': self.texts[i], 'method': 'advanced_fusion', 'score': combined_scores[i], 'index': i} for i in sorted_indices[:5]]
|
| 184 |
|
| 185 |
def create_embeddings(self, text_list: List[str]) -> np.ndarray:
|
| 186 |
+
expected_dim = 1024
|
| 187 |
embeddings = []
|
| 188 |
for text in text_list:
|
| 189 |
word_vectors = [self.word2vec_model.wv[token] for token in text.split() if token in self.word2vec_model.wv]
|
|
|
|
| 195 |
embeddings.append(avg_embedding)
|
| 196 |
return np.array(embeddings, dtype=np.float32)
|
| 197 |
|
|
|
|
| 198 |
async def generate_response_with_rag(
|
| 199 |
self,
|
| 200 |
user_query: str,
|
|
|
|
| 224 |
response = "An error occurred while generating the response."
|
| 225 |
return response, [doc['text'] for doc in reranked_docs[:5]], reranked_docs[:5]
|
| 226 |
|
|
|
|
| 227 |
def retrieve_documents(
|
| 228 |
self,
|
| 229 |
user_query: str,
|
|
|
|
| 246 |
def retrieve_with_annoy(self, user_query: str, query_embedding: np.ndarray, top_k: int) -> Tuple[List[int], List[float]]:
    """Look up the nearest neighbours of a query in the Annoy index.

    Args:
        user_query: Raw query text. Not used by this method itself.
        query_embedding: Query vector handed to
            ``self.annoy_index.get_nns_by_vector``.
        top_k: Maximum number of neighbours requested; capped at
            ``len(self.texts)``.

    Returns:
        A ``(indices, scores)`` tuple. Each score is
        ``1.0 - distance / max(distances)``, so the farthest returned
        neighbour scores 0.0. When the largest distance is zero every
        result gets a score of 1.0, and when the index returns nothing
        the score list is empty.
    """
    n_results = min(top_k, len(self.texts))
    indices, distances = self.annoy_index.get_nns_by_vector(query_embedding, n_results, include_distances=True)

    # Compute the normalizer once. ``default`` avoids a ValueError from
    # ``max()`` when the index returned no neighbours.
    max_distance = max(distances, default=0.0)
    if max_distance != 0:
        scores = [1.0 - (dist / max_distance) for dist in distances]
    else:
        # A zero maximum would divide by zero (e.g. a single exact match);
        # treat those results as perfect matches instead.
        scores = [1.0] * len(distances)

    logging.debug(f"Annoy retrieval returned {len(indices)} documents.")
    return indices, scores
|
| 252 |
|
|
|
|
| 313 |
return reranked_docs
|
| 314 |
|
| 315 |
def reciprocal_rank_fusion(self, user_query: str, docs: List[dict]) -> List[dict]:
|
| 316 |
+
k = 60
|
| 317 |
method_ranks = {}
|
| 318 |
+
fused_scores = {}
|
| 319 |
for doc in docs:
|
| 320 |
method = doc['method']
|
| 321 |
if method not in method_ranks:
|
|
|
|
| 326 |
idx = doc['index']
|
| 327 |
if idx not in fused_scores:
|
| 328 |
fused_scores[idx] = sum(1 / (k + rank) for method_rank in method_ranks.values() for i, rank in method_rank.items() if i == idx)
|
| 329 |
+
reranked_docs = sorted(docs, key=lambda x: fused_scores.get(x['index'], 0), reverse=True)
|
| 330 |
for doc in reranked_docs:
|
| 331 |
+
doc['rrf_score'] = fused_scores.get(doc['index'], 0)
|
| 332 |
return reranked_docs
|
| 333 |
|
| 334 |
def weighted_score_fusion(self, user_query: str, docs: List[dict]) -> List[dict]:
|
|
|
|
| 403 |
print("Vector database and Annoy index creation completed.")
|
| 404 |
|
| 405 |
|
|
|
|
|
|
|
|
|
|
| 406 |
def chatbot_interface(file, user_query, response_style, selected_retrieval_methods, selected_reranking_methods, chunk_size, overlap):
|
| 407 |
vector_db_path = "vector_db.pkl"
|
| 408 |
annoy_index_path = "vector_index.ann"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
|
| 410 |
store_embeddings_in_vector_db(file.name, 'vector_db.pkl', 'vector_index.ann', chunk_size, overlap)
|
| 411 |
|
|
|
|
| 441 |
gr.File(label="Upload a PDF"),
|
| 442 |
gr.Textbox(lines=5, label="User Query"),
|
| 443 |
gr.Dropdown(["Detailed", "Concise", "Creative", "Technical"], label="Response Style"),
|
| 444 |
+
gr.Dropdown(["annoy", "tfidf", "bm25", "euclidean", "jaccard"], label="Retrieval Methods", multiselect=True),
|
| 445 |
gr.Dropdown(["advanced_fusion", "reciprocal_rank_fusion", "weighted_score_fusion", "semantic_similarity"], label="Reranking Methods"),
|
| 446 |
gr.Slider(minimum=1024, maximum=2048, step=128, value=2048, label="Chunk Size"),
|
| 447 |
gr.Slider(minimum=100, maximum=300, step=100, value=200, label="Overlap")
|