Spaces:

Rabbitt-AI
/

ChanceRAG

Running

App Files Files Community

Rabbitt-AI committed on Sep 9, 2024

Commit

d52e613

verified ·

1 Parent(s): c6ab696

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -107

app.py CHANGED Viewed

@@ -1,13 +1,12 @@
-# Cell 2: Import necessary libraries
 import time
-import fitz  # PyMuPDF
 import numpy as np
 import pickle
 import os
 import dill
 import logging
 import asyncio
-import networkx as nx  # Import networkx here
 from mistralai import Mistral
 from annoy import AnnoyIndex
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
@@ -18,12 +17,12 @@ from gensim.models import Word2Vec
 from typing import List, Optional, Tuple
 import tempfile
-# Cell 3: Set up logging and Mistral client
 logger = logging.getLogger(__name__)
 # SECURITY: API keys must not be committed to source control. The key is read
 # from the MISTRAL_API_KEY environment variable (for example a Space secret).
 # Any key that was previously hard-coded here should be treated as leaked and
 # revoked.
 api_key = os.environ.get("MISTRAL_API_KEY")
 if not api_key:
     # Keep the module importable; warn so the misconfiguration is visible.
     logger.warning("MISTRAL_API_KEY is not set; requests to the Mistral API are expected to fail.")
 client = Mistral(api_key=api_key)
-# Cell 4: Function to get embeddings with rate limiting
 def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10):
     embeddings = []
     for text in text_list:
@@ -49,7 +48,7 @@ def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=1
                     break
     return embeddings
-# Cell 5: Function to store embeddings in a vector database
 def store_embeddings_in_vector_db(
     pdf_path: str,
     vector_db_path: str,
@@ -93,7 +92,7 @@ def store_embeddings_in_vector_db(
     annoy_index.save(annoy_index_path)
     logging.info(f"Annoy index built with {len(all_embeddings)} items and saved to {annoy_index_path}.")
-# Cell 6: Helper functions for text processing
 def split_text_into_chunks(text: str, chunk_size: int = 2048, overlap: int = 200) -> List[str]:
     tokens = text.split()
     chunks = []
@@ -400,100 +399,12 @@ class MistralRAGChatbot:
         common_terms = query_terms.intersection(context_terms)
         return len(common_terms) > len(query_terms) * 0.2
-# Cell 8: Store embeddings in vector DB and Annoy index
 def create_vector_db_and_annoy_index(pdf_path, vector_db_path, annoy_index_path):
     store_embeddings_in_vector_db(pdf_path, vector_db_path, annoy_index_path)
     print("Vector database and Annoy index creation completed.")
-# Cell 9: Run the store embeddings function (example)
-# Replace 'example.pdf' with your PDF file path.
-# It will create 'vector_db.pkl' and 'vector_index.ann'
-# create_vector_db_and_annoy_index('med.pdf', 'vector_db.pkl', 'vector_index.ann')
-# # Cell 10: Query the chatbot with user input
-# async def query_chatbot():
-#     vector_db_path = "vector_db.pkl"
-#     annoy_index_path = "vector_index.ann"
-#     chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
-#     user_query = input("Please enter your query: ")
-#     response_style = input("Please choose response style (Detailed, Concise, Creative, Technical): ").strip().lower()
-#     selected_retrieval_methods = input("Please choose retrieval methods (comma-separated: annoy, tfidf, bm25, euclidean, jaccard): ")
-#     selected_reranking_methods = input("Please choose reranking methods (comma-separated: advanced_fusion, reciprocal_rank_fusion, weighted_score_fusion, semantic_similarity): ")
-#     selected_retrieval_methods_list = [method.strip() for method in selected_retrieval_methods.split(',') if method.strip()]
-#     selected_reranking_methods_list = [method.strip() for method in selected_reranking_methods.split(',') if method.strip()]
-#     response, retrieved_docs, source_info = await chatbot.generate_response_with_rag(
-#         user_query=user_query,
-#         response_style=response_style,
-#         selected_retrieval_methods=selected_retrieval_methods_list,
-#         selected_reranking_methods=selected_reranking_methods_list
-#     )
-#     print("\nResponse:")
-#     print(response)
-#     print("\nRetrieved and Reranked Documents:")
-#     for idx, doc_info in enumerate(source_info, start=1):
-#         print(f"\nDocument {idx}:")
-#         print(f"Content Preview: {doc_info['text'][:200]}...")
-#         print(f"Original Retrieval Method: {doc_info['method']}")
-#         if 'score' in doc_info:
-#             print(f"Original Score: {doc_info['score']:.4f}")
-#         for key, value in doc_info.items():
-#             if key.endswith('_score') and key != 'score':
-#                 print(f"{key.replace('_', ' ').title()}: {value:.4f}")
-# import asyncio
-# async def query_chatbot2():
-#     vector_db_path = "vector_db.pkl"
-#     annoy_index_path = "vector_index.ann"
-#     chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
-#     user_query = "what is the name of the patient"
-#     response_style = "Concise"
-#     selected_retrieval_methods_list = ["tfidf", "bm25"]
-#     selected_reranking_methods_list = ["reciprocal_rank_fusion"]
-#     try:
-#         response, retrieved_docs, source_info = await chatbot.generate_response_with_rag(
-#             user_query=user_query,
-#             response_style=response_style,
-#             selected_retrieval_methods=selected_retrieval_methods_list,
-#             selected_reranking_methods=selected_reranking_methods_list
-#         )
-#         print("\n--- Response ---")
-#         print(response)
-#         print("\n--- Retrieved and Reranked Documents ---")
-#         for idx, doc_info in enumerate(source_info, start=1):
-#             print(f"\nDocument {idx}:")
-#             print(f"Content Preview: {doc_info['text'][:150]}...")  # Show a preview of the document content
-#             print(f"Original Retrieval Method: {doc_info['method']}")
-#             if 'score' in doc_info:
-#                 print(f"Original Score: {doc_info['score']:.4f}")
-#             # Display scores from specific reranking methods
-#             if 'rrf_score' in doc_info:
-#                 print(f"Reciprocal Rank Fusion Score (RRF): {doc_info['rrf_score']:.4f}")
-#             if 'wsf_score' in doc_info:
-#                 print(f"Weighted Score Fusion (WSF) Score: {doc_info['wsf_score']:.4f}")
-#             if 'semantic_score' in doc_info:
-#                 print(f"Semantic Similarity Score: {doc_info['semantic_score']:.4f}")
-#             if 'pagerank_score' in doc_info:
-#                 print(f"PageRank Score: {doc_info['pagerank_score']:.4f}")
-#             if 'advanced_fusion_score' in doc_info:
-#                 print(f"Advanced Fusion Score: {doc_info['advanced_fusion_score']:.4f}")
-#     except Exception as e:
-#         logging.error(f"Error generating response: {e}")
-#         print("\nResponse:")
-#         print("An error occurred while generating the response.")
-# # Call the function in a Jupyter notebook environment
-# await query_chatbot()
 import gradio as gr
@@ -505,7 +416,7 @@ def chatbot_interface(user_query, response_style, selected_retrieval_methods, se
-    #Load the documents and create embeddings with the provided chunk_size and overlap
     store_embeddings_in_vector_db(file.name, 'vector_db.pkl', 'vector_index.ann', chunk_size, overlap)
     chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
@@ -520,14 +431,14 @@ def chatbot_interface(user_query, response_style, selected_retrieval_methods, se
         selected_reranking_methods=selected_reranking_methods_list
     ))
-    formatted_response = f"**Response:**\n{response}\n\n"
-    formatted_response += "**Retrieved and Reranked Documents:**\n"
     for idx, doc_info in enumerate(source_info, start=1):
-        formatted_response += f"\n**Document {idx}:**\n"
         formatted_response += f"Content Preview: {doc_info['text'][:200]}...\n"
-        formatted_response += f"Original Retrieval Method: {doc_info['method']}\n"
         if 'score' in doc_info:
-            formatted_response += f"Original Score: {doc_info['score']:.4f}\n"
         for key, value in doc_info.items():
             if key.endswith('_score') and key != 'score':
                 formatted_response += f"{key.replace('_', ' ').title()}: {value:.4f}\n"
@@ -536,17 +447,17 @@ def chatbot_interface(user_query, response_style, selected_retrieval_methods, se
 iface = gr.Interface(
     fn=chatbot_interface,
     theme='IndusCloud9/RabbittLlama',
-    inputs=[
         gr.Textbox(lines=5, label="User Query"),
         gr.Dropdown(["Detailed", "Concise", "Creative", "Technical"], label="Response Style"),
         gr.Dropdown(["annoy", "tfidf", "bm25", "euclidean", "jaccard"], label="Retrieval Methods", multiselect=True), # This line is changed
         gr.Dropdown(["advanced_fusion", "reciprocal_rank_fusion", "weighted_score_fusion", "semantic_similarity"], label="Reranking Methods"),
-        gr.File(label="Upload a PDF"),
         gr.Slider(minimum=1024, maximum=2048, step=128, value=2048, label="Chunk Size"),
         gr.Slider(minimum=100, maximum=300, step=100, value=200, label="Overlap")
     ],
-    outputs=gr.Textbox(label="Chatbot Response"),
-    title="Chat with Document"
 )
 iface.launch(share=True)

 import time
+import fitz
 import numpy as np
 import pickle
 import os
 import dill
 import logging
 import asyncio
+import networkx as nx
 from mistralai import Mistral
 from annoy import AnnoyIndex
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from typing import List, Optional, Tuple
 import tempfile
 logger = logging.getLogger(__name__)
 # SECURITY: API keys must not be committed to source control. The key is read
 # from the MISTRAL_API_KEY environment variable (for example a Space secret).
 # Any key that was previously hard-coded here should be treated as leaked and
 # revoked.
 api_key = os.environ.get("MISTRAL_API_KEY")
 if not api_key:
     # Keep the module importable; warn so the misconfiguration is visible.
     logger.warning("MISTRAL_API_KEY is not set; requests to the Mistral API are expected to fail.")
 client = Mistral(api_key=api_key)
 def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10):
     embeddings = []
     for text in text_list:
                     break
     return embeddings
 def store_embeddings_in_vector_db(
     pdf_path: str,
     vector_db_path: str,
     annoy_index.save(annoy_index_path)
     logging.info(f"Annoy index built with {len(all_embeddings)} items and saved to {annoy_index_path}.")
 def split_text_into_chunks(text: str, chunk_size: int = 2048, overlap: int = 200) -> List[str]:
     tokens = text.split()
     chunks = []
         common_terms = query_terms.intersection(context_terms)
         return len(common_terms) > len(query_terms) * 0.2
 def create_vector_db_and_annoy_index(pdf_path, vector_db_path, annoy_index_path):
     store_embeddings_in_vector_db(pdf_path, vector_db_path, annoy_index_path)
     print("Vector database and Annoy index creation completed.")
 import gradio as gr
     store_embeddings_in_vector_db(file.name, 'vector_db.pkl', 'vector_index.ann', chunk_size, overlap)
     chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
         selected_reranking_methods=selected_reranking_methods_list
     ))
+    formatted_response = f"Response:\n{response}\n\n"
+    formatted_response += "Retrieved and Reranked Documents:\n"
     for idx, doc_info in enumerate(source_info, start=1):
+        formatted_response += f"\nDocument {idx}:\n"
         formatted_response += f"Content Preview: {doc_info['text'][:200]}...\n"
+        formatted_response += f"Retrieval Method: {doc_info['method']}\n"
         if 'score' in doc_info:
+            formatted_response += f"Precision Score: {doc_info['score']:.4f}\n"
         for key, value in doc_info.items():
             if key.endswith('_score') and key != 'score':
                 formatted_response += f"{key.replace('_', ' ').title()}: {value:.4f}\n"
 iface = gr.Interface(
     fn=chatbot_interface,
     theme='IndusCloud9/RabbittLlama',
+    inputs=[
+        gr.File(label="Upload a PDF"),
         gr.Textbox(lines=5, label="User Query"),
         gr.Dropdown(["Detailed", "Concise", "Creative", "Technical"], label="Response Style"),
         gr.Dropdown(["annoy", "tfidf", "bm25", "euclidean", "jaccard"], label="Retrieval Methods", multiselect=True), # This line is changed
         gr.Dropdown(["advanced_fusion", "reciprocal_rank_fusion", "weighted_score_fusion", "semantic_similarity"], label="Reranking Methods"),
         gr.Slider(minimum=1024, maximum=2048, step=128, value=2048, label="Chunk Size"),
         gr.Slider(minimum=100, maximum=300, step=100, value=200, label="Overlap")
     ],
+    outputs=gr.Textbox(label="RAG Response"),
+    title="Chance RAG"
 )
 iface.launch(share=True)