Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,13 +1,12 @@
|
|
| 1 |
-
# Cell 2: Import necessary libraries
|
| 2 |
import time
|
| 3 |
-
import fitz
|
| 4 |
import numpy as np
|
| 5 |
import pickle
|
| 6 |
import os
|
| 7 |
import dill
|
| 8 |
import logging
|
| 9 |
import asyncio
|
| 10 |
-
import networkx as nx
|
| 11 |
from mistralai import Mistral
|
| 12 |
from annoy import AnnoyIndex
|
| 13 |
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
|
|
@@ -18,12 +17,12 @@ from gensim.models import Word2Vec
|
|
| 18 |
from typing import List, Optional, Tuple
|
| 19 |
import tempfile
|
| 20 |
|
| 21 |
-
|
| 22 |
logger = logging.getLogger(__name__)

# Credentials must never live in source control. The Mistral API key is read
# from the MISTRAL_API_KEY environment variable (on Hugging Face Spaces,
# configure it as a Space secret). Any key that was previously committed here
# is public and should be revoked.
api_key = os.environ.get("MISTRAL_API_KEY")
if not api_key:
    # Warn instead of raising so that importing the module keeps working;
    # API calls will fail until the key is configured.
    logger.warning(
        "MISTRAL_API_KEY is not set; requests to the Mistral API will fail "
        "until it is configured."
    )
client = Mistral(api_key=api_key)
|
| 25 |
|
| 26 |
-
|
| 27 |
def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10):
|
| 28 |
embeddings = []
|
| 29 |
for text in text_list:
|
|
@@ -49,7 +48,7 @@ def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=1
|
|
| 49 |
break
|
| 50 |
return embeddings
|
| 51 |
|
| 52 |
-
|
| 53 |
def store_embeddings_in_vector_db(
|
| 54 |
pdf_path: str,
|
| 55 |
vector_db_path: str,
|
|
@@ -93,7 +92,7 @@ def store_embeddings_in_vector_db(
|
|
| 93 |
annoy_index.save(annoy_index_path)
|
| 94 |
logging.info(f"Annoy index built with {len(all_embeddings)} items and saved to {annoy_index_path}.")
|
| 95 |
|
| 96 |
-
|
| 97 |
def split_text_into_chunks(text: str, chunk_size: int = 2048, overlap: int = 200) -> List[str]:
|
| 98 |
tokens = text.split()
|
| 99 |
chunks = []
|
|
@@ -400,100 +399,12 @@ class MistralRAGChatbot:
|
|
| 400 |
common_terms = query_terms.intersection(context_terms)
|
| 401 |
return len(common_terms) > len(query_terms) * 0.2
|
| 402 |
|
| 403 |
-
|
| 404 |
def create_vector_db_and_annoy_index(pdf_path, vector_db_path, annoy_index_path):
    """Run the embedding pipeline for one PDF and report completion.

    Thin wrapper: forwards the three paths, in order, to
    ``store_embeddings_in_vector_db`` and then prints a confirmation line.
    """
    store_embeddings_in_vector_db(
        pdf_path,
        vector_db_path,
        annoy_index_path,
    )
    print("Vector database and Annoy index creation completed.")
|
| 407 |
|
| 408 |
-
|
| 409 |
-
# Replace 'example.pdf' with your PDF file path.
|
| 410 |
-
# It will create 'vector_db.pkl' and 'vector_index.ann'
|
| 411 |
-
# create_vector_db_and_annoy_index('med.pdf', 'vector_db.pkl', 'vector_index.ann')
|
| 412 |
-
|
| 413 |
-
# # Cell 10: Query the chatbot with user input
|
| 414 |
-
# async def query_chatbot():
|
| 415 |
-
# vector_db_path = "vector_db.pkl"
|
| 416 |
-
# annoy_index_path = "vector_index.ann"
|
| 417 |
-
# chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
|
| 418 |
-
|
| 419 |
-
# user_query = input("Please enter your query: ")
|
| 420 |
-
# response_style = input("Please choose response style (Detailed, Concise, Creative, Technical): ").strip().lower()
|
| 421 |
-
# selected_retrieval_methods = input("Please choose retrieval methods (comma-separated: annoy, tfidf, bm25, euclidean, jaccard): ")
|
| 422 |
-
# selected_reranking_methods = input("Please choose reranking methods (comma-separated: advanced_fusion, reciprocal_rank_fusion, weighted_score_fusion, semantic_similarity): ")
|
| 423 |
-
|
| 424 |
-
# selected_retrieval_methods_list = [method.strip() for method in selected_retrieval_methods.split(',') if method.strip()]
|
| 425 |
-
# selected_reranking_methods_list = [method.strip() for method in selected_reranking_methods.split(',') if method.strip()]
|
| 426 |
-
|
| 427 |
-
# response, retrieved_docs, source_info = await chatbot.generate_response_with_rag(
|
| 428 |
-
# user_query=user_query,
|
| 429 |
-
# response_style=response_style,
|
| 430 |
-
# selected_retrieval_methods=selected_retrieval_methods_list,
|
| 431 |
-
# selected_reranking_methods=selected_reranking_methods_list
|
| 432 |
-
# )
|
| 433 |
-
|
| 434 |
-
# print("\nResponse:")
|
| 435 |
-
# print(response)
|
| 436 |
-
# print("\nRetrieved and Reranked Documents:")
|
| 437 |
-
# for idx, doc_info in enumerate(source_info, start=1):
|
| 438 |
-
# print(f"\nDocument {idx}:")
|
| 439 |
-
# print(f"Content Preview: {doc_info['text'][:200]}...")
|
| 440 |
-
# print(f"Original Retrieval Method: {doc_info['method']}")
|
| 441 |
-
# if 'score' in doc_info:
|
| 442 |
-
# print(f"Original Score: {doc_info['score']:.4f}")
|
| 443 |
-
# for key, value in doc_info.items():
|
| 444 |
-
# if key.endswith('_score') and key != 'score':
|
| 445 |
-
# print(f"{key.replace('_', ' ').title()}: {value:.4f}")
|
| 446 |
-
|
| 447 |
-
# import asyncio
|
| 448 |
-
|
| 449 |
-
# async def query_chatbot2():
|
| 450 |
-
# vector_db_path = "vector_db.pkl"
|
| 451 |
-
# annoy_index_path = "vector_index.ann"
|
| 452 |
-
# chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
|
| 453 |
-
|
| 454 |
-
# user_query = "what is the name of the patient"
|
| 455 |
-
# response_style = "Concise"
|
| 456 |
-
# selected_retrieval_methods_list = ["tfidf", "bm25"]
|
| 457 |
-
# selected_reranking_methods_list = ["reciprocal_rank_fusion"]
|
| 458 |
-
|
| 459 |
-
# try:
|
| 460 |
-
# response, retrieved_docs, source_info = await chatbot.generate_response_with_rag(
|
| 461 |
-
# user_query=user_query,
|
| 462 |
-
# response_style=response_style,
|
| 463 |
-
# selected_retrieval_methods=selected_retrieval_methods_list,
|
| 464 |
-
# selected_reranking_methods=selected_reranking_methods_list
|
| 465 |
-
# )
|
| 466 |
-
|
| 467 |
-
# print("\n--- Response ---")
|
| 468 |
-
# print(response)
|
| 469 |
-
|
| 470 |
-
# print("\n--- Retrieved and Reranked Documents ---")
|
| 471 |
-
# for idx, doc_info in enumerate(source_info, start=1):
|
| 472 |
-
# print(f"\nDocument {idx}:")
|
| 473 |
-
# print(f"Content Preview: {doc_info['text'][:150]}...") # Show a preview of the document content
|
| 474 |
-
# print(f"Original Retrieval Method: {doc_info['method']}")
|
| 475 |
-
# if 'score' in doc_info:
|
| 476 |
-
# print(f"Original Score: {doc_info['score']:.4f}")
|
| 477 |
-
|
| 478 |
-
# # Display scores from specific reranking methods
|
| 479 |
-
# if 'rrf_score' in doc_info:
|
| 480 |
-
# print(f"Reciprocal Rank Fusion Score (RRF): {doc_info['rrf_score']:.4f}")
|
| 481 |
-
# if 'wsf_score' in doc_info:
|
| 482 |
-
# print(f"Weighted Score Fusion (WSF) Score: {doc_info['wsf_score']:.4f}")
|
| 483 |
-
# if 'semantic_score' in doc_info:
|
| 484 |
-
# print(f"Semantic Similarity Score: {doc_info['semantic_score']:.4f}")
|
| 485 |
-
# if 'pagerank_score' in doc_info:
|
| 486 |
-
# print(f"PageRank Score: {doc_info['pagerank_score']:.4f}")
|
| 487 |
-
# if 'advanced_fusion_score' in doc_info:
|
| 488 |
-
# print(f"Advanced Fusion Score: {doc_info['advanced_fusion_score']:.4f}")
|
| 489 |
-
|
| 490 |
-
# except Exception as e:
|
| 491 |
-
# logging.error(f"Error generating response: {e}")
|
| 492 |
-
# print("\nResponse:")
|
| 493 |
-
# print("An error occurred while generating the response.")
|
| 494 |
-
|
| 495 |
-
# # Call the function in a Jupyter notebook environment
|
| 496 |
-
# await query_chatbot()
|
| 497 |
|
| 498 |
import gradio as gr
|
| 499 |
|
|
@@ -505,7 +416,7 @@ def chatbot_interface(user_query, response_style, selected_retrieval_methods, se
|
|
| 505 |
|
| 506 |
|
| 507 |
|
| 508 |
-
|
| 509 |
store_embeddings_in_vector_db(file.name, 'vector_db.pkl', 'vector_index.ann', chunk_size, overlap)
|
| 510 |
|
| 511 |
chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
|
|
@@ -520,14 +431,14 @@ def chatbot_interface(user_query, response_style, selected_retrieval_methods, se
|
|
| 520 |
selected_reranking_methods=selected_reranking_methods_list
|
| 521 |
))
|
| 522 |
|
| 523 |
-
formatted_response = f"
|
| 524 |
-
formatted_response += "
|
| 525 |
for idx, doc_info in enumerate(source_info, start=1):
|
| 526 |
-
formatted_response += f"\
|
| 527 |
formatted_response += f"Content Preview: {doc_info['text'][:200]}...\n"
|
| 528 |
-
formatted_response += f"
|
| 529 |
if 'score' in doc_info:
|
| 530 |
-
formatted_response += f"
|
| 531 |
for key, value in doc_info.items():
|
| 532 |
if key.endswith('_score') and key != 'score':
|
| 533 |
formatted_response += f"{key.replace('_', ' ').title()}: {value:.4f}\n"
|
|
@@ -536,17 +447,17 @@ def chatbot_interface(user_query, response_style, selected_retrieval_methods, se
|
|
| 536 |
iface = gr.Interface(
|
| 537 |
fn=chatbot_interface,
|
| 538 |
theme='IndusCloud9/RabbittLlama',
|
| 539 |
-
inputs=[
|
|
|
|
| 540 |
gr.Textbox(lines=5, label="User Query"),
|
| 541 |
gr.Dropdown(["Detailed", "Concise", "Creative", "Technical"], label="Response Style"),
|
| 542 |
gr.Dropdown(["annoy", "tfidf", "bm25", "euclidean", "jaccard"], label="Retrieval Methods", multiselect=True), # This line is changed
|
| 543 |
gr.Dropdown(["advanced_fusion", "reciprocal_rank_fusion", "weighted_score_fusion", "semantic_similarity"], label="Reranking Methods"),
|
| 544 |
-
gr.File(label="Upload a PDF"),
|
| 545 |
gr.Slider(minimum=1024, maximum=2048, step=128, value=2048, label="Chunk Size"),
|
| 546 |
gr.Slider(minimum=100, maximum=300, step=100, value=200, label="Overlap")
|
| 547 |
],
|
| 548 |
-
outputs=gr.Textbox(label="
|
| 549 |
-
title="
|
| 550 |
)
|
| 551 |
|
| 552 |
iface.launch(share=True)
|
|
|
|
|
|
|
| 1 |
import time
|
| 2 |
+
import fitz
|
| 3 |
import numpy as np
|
| 4 |
import pickle
|
| 5 |
import os
|
| 6 |
import dill
|
| 7 |
import logging
|
| 8 |
import asyncio
|
| 9 |
+
import networkx as nx
|
| 10 |
from mistralai import Mistral
|
| 11 |
from annoy import AnnoyIndex
|
| 12 |
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
|
|
|
|
| 17 |
from typing import List, Optional, Tuple
|
| 18 |
import tempfile
|
| 19 |
|
| 20 |
+
|
| 21 |
logger = logging.getLogger(__name__)

# Credentials must never live in source control. The Mistral API key is read
# from the MISTRAL_API_KEY environment variable (on Hugging Face Spaces,
# configure it as a Space secret). Any key that was previously committed here
# is public and should be revoked.
api_key = os.environ.get("MISTRAL_API_KEY")
if not api_key:
    # Warn instead of raising so that importing the module keeps working;
    # API calls will fail until the key is configured.
    logger.warning(
        "MISTRAL_API_KEY is not set; requests to the Mistral API will fail "
        "until it is configured."
    )
client = Mistral(api_key=api_key)
|
| 24 |
|
| 25 |
+
|
| 26 |
def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10):
|
| 27 |
embeddings = []
|
| 28 |
for text in text_list:
|
|
|
|
| 48 |
break
|
| 49 |
return embeddings
|
| 50 |
|
| 51 |
+
|
| 52 |
def store_embeddings_in_vector_db(
|
| 53 |
pdf_path: str,
|
| 54 |
vector_db_path: str,
|
|
|
|
| 92 |
annoy_index.save(annoy_index_path)
|
| 93 |
logging.info(f"Annoy index built with {len(all_embeddings)} items and saved to {annoy_index_path}.")
|
| 94 |
|
| 95 |
+
|
| 96 |
def split_text_into_chunks(text: str, chunk_size: int = 2048, overlap: int = 200) -> List[str]:
|
| 97 |
tokens = text.split()
|
| 98 |
chunks = []
|
|
|
|
| 399 |
common_terms = query_terms.intersection(context_terms)
|
| 400 |
return len(common_terms) > len(query_terms) * 0.2
|
| 401 |
|
| 402 |
+
|
| 403 |
def create_vector_db_and_annoy_index(pdf_path, vector_db_path, annoy_index_path):
    """Run the embedding pipeline for one PDF and report completion.

    Thin wrapper: forwards the three paths, in order, to
    ``store_embeddings_in_vector_db`` and then prints a confirmation line.
    """
    store_embeddings_in_vector_db(
        pdf_path,
        vector_db_path,
        annoy_index_path,
    )
    print("Vector database and Annoy index creation completed.")
|
| 406 |
|
| 407 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
|
| 409 |
import gradio as gr
|
| 410 |
|
|
|
|
| 416 |
|
| 417 |
|
| 418 |
|
| 419 |
+
|
| 420 |
store_embeddings_in_vector_db(file.name, 'vector_db.pkl', 'vector_index.ann', chunk_size, overlap)
|
| 421 |
|
| 422 |
chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
|
|
|
|
| 431 |
selected_reranking_methods=selected_reranking_methods_list
|
| 432 |
))
|
| 433 |
|
| 434 |
+
formatted_response = f"Response:\n{response}\n\n"
|
| 435 |
+
formatted_response += "Retrieved and Reranked Documents:\n"
|
| 436 |
for idx, doc_info in enumerate(source_info, start=1):
|
| 437 |
+
formatted_response += f"\nDocument {idx}:\n"
|
| 438 |
formatted_response += f"Content Preview: {doc_info['text'][:200]}...\n"
|
| 439 |
+
formatted_response += f"Retrieval Method: {doc_info['method']}\n"
|
| 440 |
if 'score' in doc_info:
|
| 441 |
+
formatted_response += f"Precision Score: {doc_info['score']:.4f}\n"
|
| 442 |
for key, value in doc_info.items():
|
| 443 |
if key.endswith('_score') and key != 'score':
|
| 444 |
formatted_response += f"{key.replace('_', ' ').title()}: {value:.4f}\n"
|
|
|
|
| 447 |
# Input components for the Gradio UI. Gradio passes their values to
# `chatbot_interface` positionally, so the order of this list must match the
# order of that function's parameters.
interface_inputs = [
    gr.File(label="Upload a PDF"),
    gr.Textbox(lines=5, label="User Query"),
    gr.Dropdown(
        ["Detailed", "Concise", "Creative", "Technical"],
        label="Response Style",
    ),
    # Several retrieval methods may be selected at once.
    gr.Dropdown(
        ["annoy", "tfidf", "bm25", "euclidean", "jaccard"],
        label="Retrieval Methods",
        multiselect=True,
    ),
    gr.Dropdown(
        ["advanced_fusion", "reciprocal_rank_fusion", "weighted_score_fusion", "semantic_similarity"],
        label="Reranking Methods",
    ),
    gr.Slider(minimum=1024, maximum=2048, step=128, value=2048, label="Chunk Size"),
    gr.Slider(minimum=100, maximum=300, step=100, value=200, label="Overlap"),
]

# Single-function UI: one text box shows the formatted RAG answer.
iface = gr.Interface(
    fn=chatbot_interface,
    theme='IndusCloud9/RabbittLlama',
    inputs=interface_inputs,
    outputs=gr.Textbox(label="RAG Response"),
    title="Chance RAG",
)

iface.launch(share=True)
|