Rabbitt-AI committed on
Commit
d52e613
·
verified ·
1 Parent(s): c6ab696

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -107
app.py CHANGED
@@ -1,13 +1,12 @@
1
- # Cell 2: Import necessary libraries
2
  import time
3
- import fitz # PyMuPDF
4
  import numpy as np
5
  import pickle
6
  import os
7
  import dill
8
  import logging
9
  import asyncio
10
- import networkx as nx # Import networkx here
11
  from mistralai import Mistral
12
  from annoy import AnnoyIndex
13
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
@@ -18,12 +17,12 @@ from gensim.models import Word2Vec
18
  from typing import List, Optional, Tuple
19
  import tempfile
20
 
21
- # Cell 3: Set up logging and Mistral client
22
# Module-level logger and Mistral API client.
logger = logging.getLogger(__name__)

# SECURITY: an API key was hardcoded here and committed to source control — it
# should be rotated. The environment variable takes precedence; the literal
# fallback only preserves the previous behavior when the variable is unset.
api_key = os.environ.get("MISTRAL_API_KEY", "VHTGVu2YH2WxcTbfpkK00wAidHU12Stn")
client = Mistral(api_key=api_key)
25
 
26
- # Cell 4: Function to get embeddings with rate limiting
27
  def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10):
28
  embeddings = []
29
  for text in text_list:
@@ -49,7 +48,7 @@ def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=1
49
  break
50
  return embeddings
51
 
52
- # Cell 5: Function to store embeddings in a vector database
53
  def store_embeddings_in_vector_db(
54
  pdf_path: str,
55
  vector_db_path: str,
@@ -93,7 +92,7 @@ def store_embeddings_in_vector_db(
93
  annoy_index.save(annoy_index_path)
94
  logging.info(f"Annoy index built with {len(all_embeddings)} items and saved to {annoy_index_path}.")
95
 
96
- # Cell 6: Helper functions for text processing
97
  def split_text_into_chunks(text: str, chunk_size: int = 2048, overlap: int = 200) -> List[str]:
98
  tokens = text.split()
99
  chunks = []
@@ -400,100 +399,12 @@ class MistralRAGChatbot:
400
  common_terms = query_terms.intersection(context_terms)
401
  return len(common_terms) > len(query_terms) * 0.2
402
 
403
- # Cell 8: Store embeddings in vector DB and Annoy index
404
def create_vector_db_and_annoy_index(pdf_path, vector_db_path, annoy_index_path):
    """Build the pickled vector database and Annoy index for one PDF.

    Delegates the embedding and indexing work to
    ``store_embeddings_in_vector_db`` and reports completion on stdout.
    """
    store_embeddings_in_vector_db(pdf_path, vector_db_path, annoy_index_path)
    print("Vector database and Annoy index creation completed.")
407
 
408
- # Cell 9: Run the store embeddings function (example)
409
- # Replace 'example.pdf' with your PDF file path.
410
- # It will create 'vector_db.pkl' and 'vector_index.ann'
411
- # create_vector_db_and_annoy_index('med.pdf', 'vector_db.pkl', 'vector_index.ann')
412
-
413
- # # Cell 10: Query the chatbot with user input
414
- # async def query_chatbot():
415
- # vector_db_path = "vector_db.pkl"
416
- # annoy_index_path = "vector_index.ann"
417
- # chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
418
-
419
- # user_query = input("Please enter your query: ")
420
- # response_style = input("Please choose response style (Detailed, Concise, Creative, Technical): ").strip().lower()
421
- # selected_retrieval_methods = input("Please choose retrieval methods (comma-separated: annoy, tfidf, bm25, euclidean, jaccard): ")
422
- # selected_reranking_methods = input("Please choose reranking methods (comma-separated: advanced_fusion, reciprocal_rank_fusion, weighted_score_fusion, semantic_similarity): ")
423
-
424
- # selected_retrieval_methods_list = [method.strip() for method in selected_retrieval_methods.split(',') if method.strip()]
425
- # selected_reranking_methods_list = [method.strip() for method in selected_reranking_methods.split(',') if method.strip()]
426
-
427
- # response, retrieved_docs, source_info = await chatbot.generate_response_with_rag(
428
- # user_query=user_query,
429
- # response_style=response_style,
430
- # selected_retrieval_methods=selected_retrieval_methods_list,
431
- # selected_reranking_methods=selected_reranking_methods_list
432
- # )
433
-
434
- # print("\nResponse:")
435
- # print(response)
436
- # print("\nRetrieved and Reranked Documents:")
437
- # for idx, doc_info in enumerate(source_info, start=1):
438
- # print(f"\nDocument {idx}:")
439
- # print(f"Content Preview: {doc_info['text'][:200]}...")
440
- # print(f"Original Retrieval Method: {doc_info['method']}")
441
- # if 'score' in doc_info:
442
- # print(f"Original Score: {doc_info['score']:.4f}")
443
- # for key, value in doc_info.items():
444
- # if key.endswith('_score') and key != 'score':
445
- # print(f"{key.replace('_', ' ').title()}: {value:.4f}")
446
-
447
- # import asyncio
448
-
449
- # async def query_chatbot2():
450
- # vector_db_path = "vector_db.pkl"
451
- # annoy_index_path = "vector_index.ann"
452
- # chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
453
-
454
- # user_query = "what is the name of the patient"
455
- # response_style = "Concise"
456
- # selected_retrieval_methods_list = ["tfidf", "bm25"]
457
- # selected_reranking_methods_list = ["reciprocal_rank_fusion"]
458
-
459
- # try:
460
- # response, retrieved_docs, source_info = await chatbot.generate_response_with_rag(
461
- # user_query=user_query,
462
- # response_style=response_style,
463
- # selected_retrieval_methods=selected_retrieval_methods_list,
464
- # selected_reranking_methods=selected_reranking_methods_list
465
- # )
466
-
467
- # print("\n--- Response ---")
468
- # print(response)
469
-
470
- # print("\n--- Retrieved and Reranked Documents ---")
471
- # for idx, doc_info in enumerate(source_info, start=1):
472
- # print(f"\nDocument {idx}:")
473
- # print(f"Content Preview: {doc_info['text'][:150]}...") # Show a preview of the document content
474
- # print(f"Original Retrieval Method: {doc_info['method']}")
475
- # if 'score' in doc_info:
476
- # print(f"Original Score: {doc_info['score']:.4f}")
477
-
478
- # # Display scores from specific reranking methods
479
- # if 'rrf_score' in doc_info:
480
- # print(f"Reciprocal Rank Fusion Score (RRF): {doc_info['rrf_score']:.4f}")
481
- # if 'wsf_score' in doc_info:
482
- # print(f"Weighted Score Fusion (WSF) Score: {doc_info['wsf_score']:.4f}")
483
- # if 'semantic_score' in doc_info:
484
- # print(f"Semantic Similarity Score: {doc_info['semantic_score']:.4f}")
485
- # if 'pagerank_score' in doc_info:
486
- # print(f"PageRank Score: {doc_info['pagerank_score']:.4f}")
487
- # if 'advanced_fusion_score' in doc_info:
488
- # print(f"Advanced Fusion Score: {doc_info['advanced_fusion_score']:.4f}")
489
-
490
- # except Exception as e:
491
- # logging.error(f"Error generating response: {e}")
492
- # print("\nResponse:")
493
- # print("An error occurred while generating the response.")
494
-
495
- # # Call the function in a Jupyter notebook environment
496
- # await query_chatbot()
497
 
498
  import gradio as gr
499
 
@@ -505,7 +416,7 @@ def chatbot_interface(user_query, response_style, selected_retrieval_methods, se
505
 
506
 
507
 
508
- #Load the documents and create embeddings with the provided chunk_size and overlap
509
  store_embeddings_in_vector_db(file.name, 'vector_db.pkl', 'vector_index.ann', chunk_size, overlap)
510
 
511
  chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
@@ -520,14 +431,14 @@ def chatbot_interface(user_query, response_style, selected_retrieval_methods, se
520
  selected_reranking_methods=selected_reranking_methods_list
521
  ))
522
 
523
- formatted_response = f"**Response:**\n{response}\n\n"
524
- formatted_response += "**Retrieved and Reranked Documents:**\n"
525
  for idx, doc_info in enumerate(source_info, start=1):
526
- formatted_response += f"\n**Document {idx}:**\n"
527
  formatted_response += f"Content Preview: {doc_info['text'][:200]}...\n"
528
- formatted_response += f"Original Retrieval Method: {doc_info['method']}\n"
529
  if 'score' in doc_info:
530
- formatted_response += f"Original Score: {doc_info['score']:.4f}\n"
531
  for key, value in doc_info.items():
532
  if key.endswith('_score') and key != 'score':
533
  formatted_response += f"{key.replace('_', ' ').title()}: {value:.4f}\n"
@@ -536,17 +447,17 @@ def chatbot_interface(user_query, response_style, selected_retrieval_methods, se
536
  iface = gr.Interface(
537
  fn=chatbot_interface,
538
  theme='IndusCloud9/RabbittLlama',
539
- inputs=[
 
540
  gr.Textbox(lines=5, label="User Query"),
541
  gr.Dropdown(["Detailed", "Concise", "Creative", "Technical"], label="Response Style"),
542
  gr.Dropdown(["annoy", "tfidf", "bm25", "euclidean", "jaccard"], label="Retrieval Methods", multiselect=True), # This line is changed
543
  gr.Dropdown(["advanced_fusion", "reciprocal_rank_fusion", "weighted_score_fusion", "semantic_similarity"], label="Reranking Methods"),
544
- gr.File(label="Upload a PDF"),
545
  gr.Slider(minimum=1024, maximum=2048, step=128, value=2048, label="Chunk Size"),
546
  gr.Slider(minimum=100, maximum=300, step=100, value=200, label="Overlap")
547
  ],
548
- outputs=gr.Textbox(label="Chatbot Response"),
549
- title="Chat with Document"
550
  )
551
 
552
  iface.launch(share=True)
 
 
1
  import time
2
+ import fitz
3
  import numpy as np
4
  import pickle
5
  import os
6
  import dill
7
  import logging
8
  import asyncio
9
+ import networkx as nx
10
  from mistralai import Mistral
11
  from annoy import AnnoyIndex
12
  from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 
17
  from typing import List, Optional, Tuple
18
  import tempfile
19
 
20
+
21
# Module-level logger and Mistral API client.
logger = logging.getLogger(__name__)

# SECURITY: an API key was hardcoded here and committed to source control — it
# should be rotated. The environment variable takes precedence; the literal
# fallback only preserves the previous behavior when the variable is unset.
api_key = os.environ.get("MISTRAL_API_KEY", "VHTGVu2YH2WxcTbfpkK00wAidHU12Stn")
client = Mistral(api_key=api_key)
24
 
25
+
26
  def get_text_embedding_with_rate_limit(text_list, initial_delay=2, max_retries=10):
27
  embeddings = []
28
  for text in text_list:
 
48
  break
49
  return embeddings
50
 
51
+
52
  def store_embeddings_in_vector_db(
53
  pdf_path: str,
54
  vector_db_path: str,
 
92
  annoy_index.save(annoy_index_path)
93
  logging.info(f"Annoy index built with {len(all_embeddings)} items and saved to {annoy_index_path}.")
94
 
95
+
96
  def split_text_into_chunks(text: str, chunk_size: int = 2048, overlap: int = 200) -> List[str]:
97
  tokens = text.split()
98
  chunks = []
 
399
  common_terms = query_terms.intersection(context_terms)
400
  return len(common_terms) > len(query_terms) * 0.2
401
 
402
+
403
def create_vector_db_and_annoy_index(pdf_path, vector_db_path, annoy_index_path):
    """Create the on-disk retrieval artifacts for a PDF.

    Runs ``store_embeddings_in_vector_db`` to write the pickle vector DB and
    the Annoy index, then prints a completion message.
    """
    store_embeddings_in_vector_db(pdf_path, vector_db_path, annoy_index_path)
    print("Vector database and Annoy index creation completed.")
406
 
407
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
408
 
409
  import gradio as gr
410
 
 
416
 
417
 
418
 
419
+
420
  store_embeddings_in_vector_db(file.name, 'vector_db.pkl', 'vector_index.ann', chunk_size, overlap)
421
 
422
  chatbot = MistralRAGChatbot(vector_db_path, annoy_index_path)
 
431
  selected_reranking_methods=selected_reranking_methods_list
432
  ))
433
 
434
+ formatted_response = f"Response:\n{response}\n\n"
435
+ formatted_response += "Retrieved and Reranked Documents:\n"
436
  for idx, doc_info in enumerate(source_info, start=1):
437
+ formatted_response += f"\nDocument {idx}:\n"
438
  formatted_response += f"Content Preview: {doc_info['text'][:200]}...\n"
439
+ formatted_response += f"Retrieval Method: {doc_info['method']}\n"
440
  if 'score' in doc_info:
441
+ formatted_response += f"Precision Score: {doc_info['score']:.4f}\n"
442
  for key, value in doc_info.items():
443
  if key.endswith('_score') and key != 'score':
444
  formatted_response += f"{key.replace('_', ' ').title()}: {value:.4f}\n"
 
447
# Gradio front end: collects the PDF, query, and retrieval/reranking options,
# then hands them to chatbot_interface and shows the formatted answer.
iface = gr.Interface(
    fn=chatbot_interface,
    theme='IndusCloud9/RabbittLlama',
    inputs=[
        gr.File(label="Upload a PDF"),
        gr.Textbox(lines=5, label="User Query"),
        gr.Dropdown(["Detailed", "Concise", "Creative", "Technical"], label="Response Style"),
        # Retrieval methods allow multiple selections; reranking is single-choice.
        gr.Dropdown(["annoy", "tfidf", "bm25", "euclidean", "jaccard"], label="Retrieval Methods", multiselect=True),
        gr.Dropdown(["advanced_fusion", "reciprocal_rank_fusion", "weighted_score_fusion", "semantic_similarity"], label="Reranking Methods"),
        gr.Slider(minimum=1024, maximum=2048, step=128, value=2048, label="Chunk Size"),
        gr.Slider(minimum=100, maximum=300, step=100, value=200, label="Overlap"),
    ],
    outputs=gr.Textbox(label="RAG Response"),
    title="Chance RAG",
)

# share=True exposes a public Gradio URL in addition to the local server.
iface.launch(share=True)