Spaces:

gourisankar85
/

realtime-rag-pipeline

Sleeping

App Files Files Community

Gourisankar Padihary committed on Dec 20, 2024

Commit

f7c2fa3

1 Parent(s): b1b2c27

Compute RMSE and AUCROC

Browse files

Files changed (6) hide show

data/load_dataset.py +1 -1
generator/compute_metrics.py +32 -7
generator/compute_rmse_auc_roc_metrics.py +101 -0
generator/extract_attributes.py +10 -3
generator/initialize_llm.py +5 -0
main.py +11 -21

data/load_dataset.py CHANGED Viewed

@@ -5,5 +5,5 @@ def load_data():
     logging.info("Loading dataset")
     dataset = load_dataset("rungalileo/ragbench", 'covidqa', split="test")
     logging.info("Dataset loaded successfully")
-    logging.info(dataset)
     return dataset

     logging.info("Loading dataset")
     dataset = load_dataset("rungalileo/ragbench", 'covidqa', split="test")
     logging.info("Dataset loaded successfully")
+    logging.info(f"Number of documents found: {dataset.num_rows}")
     return dataset

generator/compute_metrics.py CHANGED Viewed

@@ -1,3 +1,6 @@
 def compute_metrics(attributes, total_sentences):
     # Extract relevant information from attributes
     all_relevant_sentence_keys = attributes.get("all_relevant_sentence_keys", [])
@@ -8,17 +11,39 @@ def compute_metrics(attributes, total_sentences):
     context_relevance = len(all_relevant_sentence_keys) / total_sentences if total_sentences else 0
     # Compute Context Utilization
-    context_utilization = len(all_utilized_sentence_keys) / len(sentence_support_information) if sentence_support_information else 0
-    # Compute Completeness
-    completeness = all(info.get("fully_supported", False) for info in sentence_support_information)
     # Compute Adherence
-    adherence = attributes.get("overall_supported", False)
     return {
         "Context Relevance": context_relevance,
         "Context Utilization": context_utilization,
-        "Completeness": completeness,
         "Adherence": adherence
-    }

+import json
+import logging
 def compute_metrics(attributes, total_sentences):
     # Extract relevant information from attributes
     all_relevant_sentence_keys = attributes.get("all_relevant_sentence_keys", [])
     context_relevance = len(all_relevant_sentence_keys) / total_sentences if total_sentences else 0
     # Compute Context Utilization
+    context_utilization = len(all_utilized_sentence_keys) / total_sentences if total_sentences else 0
+    # Compute Completeness score
+    Ri = set(all_relevant_sentence_keys)
+    Ui = set(all_utilized_sentence_keys)
+    completeness_score = len(Ri & Ui) / len(Ri) if len(Ri) else 0
     # Compute Adherence
+    adherence = all(info.get("fully_supported", False) for info in sentence_support_information)
     return {
         "Context Relevance": context_relevance,
         "Context Utilization": context_utilization,
+        "Completeness Score": completeness_score,
         "Adherence": adherence
+    }
+def get_metrics(attributes, total_sentences):
+    if attributes.content:
+        result_content = attributes.content  # Access the content attribute
+        # Extract the JSON part from the result_content
+        json_start = result_content.find("{")
+        json_end = result_content.rfind("}") + 1
+        json_str = result_content[json_start:json_end]
+        try:
+            result_json = json.loads(json_str)
+            print(json.dumps(result_json, indent=2))
+            # Compute metrics using the extracted attributes
+            metrics = compute_metrics(result_json, total_sentences)
+            print(metrics)
+            return metrics
+        except json.JSONDecodeError as e:
+            logging.error(f"JSONDecodeError: {e}")

generator/compute_rmse_auc_roc_metrics.py ADDED Viewed

	@@ -0,0 +1,101 @@

+from sklearn.metrics import roc_auc_score, root_mean_squared_error
+from generator.compute_metrics import get_metrics
+from generator.extract_attributes import extract_attributes
+from generator.generate_response import generate_response
+from retriever.retrieve_documents import retrieve_top_k_documents
+def compute_rmse_auc_roc_metrics(llm, dataset, vector_store):
+    # Lists accumulating ground-truth labels and predicted scores for the AUC-ROC computation
+    all_ground_truth_relevance = []
+    all_predicted_relevance = []
+    all_ground_truth_utilization = []
+    all_predicted_utilization = []
+    all_ground_truth_adherence = []
+    all_predicted_adherence = []
+    # To store RMSE scores for each question
+    relevance_scores = []
+    utilization_scores = []
+    adherence_scores = []
+    for i, sample in enumerate(dataset):
+        print(sample)
+        sample_question = sample['question']
+        # Extract ground truth metrics from dataset
+        ground_truth_relevance = dataset[i]['relevance_score']
+        ground_truth_utilization = dataset[i]['utilization_score']
+        ground_truth_completeness = dataset[i]['completeness_score']
+        # Step 1: Retrieve relevant documents
+        relevant_docs = retrieve_top_k_documents(vector_store, sample_question, top_k=5)
+        # Step 2: Generate a response using LLM
+        response, source_docs = generate_response(llm, vector_store, sample_question, relevant_docs)
+        # Step 3: Extract attributes
+        attributes, total_sentences = extract_attributes(sample_question, source_docs, response)
+        # Step 4: Parse the extracted attributes and compute the metrics with get_metrics
+        metrics = get_metrics(attributes, total_sentences)
+        # Extract predicted metrics (ensure these are continuous if possible)
+        predicted_relevance = metrics['Context Relevance']
+        predicted_utilization = metrics['Context Utilization']
+        predicted_completeness = metrics['Completeness Score']
+        # === Handle Continuous Inputs for RMSE ===
+        relevance_rmse = root_mean_squared_error([ground_truth_relevance], [predicted_relevance])
+        utilization_rmse = root_mean_squared_error([ground_truth_utilization], [predicted_utilization])
+        #adherence_rmse = mean_squared_error([ground_truth_adherence], [predicted_adherence], squared=False)
+        # === Handle Binary Conversion for AUC-ROC ===
+        binary_ground_truth_relevance = 1 if ground_truth_relevance > 0.5 else 0
+        binary_predicted_relevance = 1 if predicted_relevance > 0.5 else 0
+        binary_ground_truth_utilization = 1 if ground_truth_utilization > 0.5 else 0
+        binary_predicted_utilization = 1 if predicted_utilization > 0.5 else 0
+        #binary_ground_truth_adherence = 1 if ground_truth_adherence > 0.5 else 0
+        #binary_predicted_adherence = 1 if predicted_adherence > 0.5 else 0
+        # === Accumulate data for overall AUC-ROC computation ===
+        all_ground_truth_relevance.append(binary_ground_truth_relevance)
+        all_predicted_relevance.append(predicted_relevance)  # Use probability-based predictions
+        all_ground_truth_utilization.append(binary_ground_truth_utilization)
+        all_predicted_utilization.append(predicted_utilization)
+        #all_ground_truth_adherence.append(binary_ground_truth_adherence)
+        #all_predicted_adherence.append(predicted_adherence)
+        # Store RMSE scores for each question
+        relevance_scores.append(relevance_rmse)
+        utilization_scores.append(utilization_rmse)
+        #adherence_scores.append(adherence_rmse)
+        if i == 9:  # Stop after processing the first 10 rows
+          break
+    # === Compute AUC-ROC for the Entire Dataset ===
+    try:
+        print(f"All Ground Truth Relevance: {all_ground_truth_relevance}")
+        print(f"All Predicted Relevance: {all_predicted_relevance}")
+        relevance_auc = roc_auc_score(all_ground_truth_relevance, all_predicted_relevance)
+    except ValueError:
+        relevance_auc = None
+    try:
+        print(f"All Ground Truth Utilization: {all_ground_truth_utilization}")
+        print(f"All Predicted Utilization: {all_predicted_utilization}")
+        utilization_auc = roc_auc_score(all_ground_truth_utilization, all_predicted_utilization)
+    except ValueError:
+        utilization_auc = None
+    print(f"Relevance RMSE (per question): {relevance_scores}")
+    print(f"Utilization RMSE (per question): {utilization_scores}")
+    #print(f"Adherence RMSE (per question): {adherence_scores}")
+    print(f"\nOverall Relevance AUC-ROC: {relevance_auc}")
+    print(f"Overall Utilization AUC-ROC: {utilization_auc}")

generator/extract_attributes.py CHANGED Viewed

@@ -1,9 +1,9 @@
 from generator.create_prompt import create_prompt
-from generator.initialize_llm import initialize_llm
 from generator.document_utils import Document, apply_sentence_keys_documents, apply_sentence_keys_response
 # Initialize the LLM
-llm = initialize_llm()
 # Function to extract attributes
 def extract_attributes(question, relevant_docs, response):
@@ -12,9 +12,16 @@ def extract_attributes(question, relevant_docs, response):
     formatted_documents = apply_sentence_keys_documents(relevant_docs)
     formatted_responses = apply_sentence_keys_response(response)
     # Calculate the total number of sentences from formatted_documents
     total_sentences = sum(len(doc) for doc in formatted_documents)
     attribute_prompt = create_prompt(formatted_documents, question, formatted_responses)
     # Instead of using BaseMessage, pass the formatted prompt directly to invoke

 from generator.create_prompt import create_prompt
+from generator.initialize_llm import initialize_validation_llm
 from generator.document_utils import Document, apply_sentence_keys_documents, apply_sentence_keys_response
 # Initialize the LLM
+llm = initialize_validation_llm()
 # Function to extract attributes
 def extract_attributes(question, relevant_docs, response):
     formatted_documents = apply_sentence_keys_documents(relevant_docs)
     formatted_responses = apply_sentence_keys_response(response)
+    #print(f"Formatted documents : {formatted_documents}")
+    # Print the number of sentences in each document
+    for i, doc in enumerate(formatted_documents):
+        num_sentences = len(doc)
+        print(f"Document {i} has {num_sentences} sentences.")
     # Calculate the total number of sentences from formatted_documents
     total_sentences = sum(len(doc) for doc in formatted_documents)
+    print(f"Total number of sentences {total_sentences}")
     attribute_prompt = create_prompt(formatted_documents, question, formatted_responses)
     # Instead of using BaseMessage, pass the formatted prompt directly to invoke

generator/initialize_llm.py CHANGED Viewed

@@ -4,4 +4,9 @@ from langchain_groq import ChatGroq
 def initialize_llm():
     os.environ["GROQ_API_KEY"] = "your_groq_api_key"
     llm = ChatGroq(model="llama3-8b-8192", temperature=0.7)
     return llm

 def initialize_llm():
     os.environ["GROQ_API_KEY"] = "your_groq_api_key"
     llm = ChatGroq(model="llama3-8b-8192", temperature=0.7)
+    return llm
+def initialize_validation_llm():
+    os.environ["GROQ_API_KEY"] = "your_groq_api_key"
+    llm = ChatGroq(model="llama3-70b-8192", temperature=0.7)
     return llm

main.py CHANGED Viewed

@@ -1,12 +1,13 @@
-import logging, json
 from data.load_dataset import load_data
 from retriever.chunk_documents import chunk_documents
 from retriever.embed_documents import embed_documents
 from retriever.retrieve_documents import retrieve_top_k_documents
 from generator.initialize_llm import initialize_llm
 from generator.generate_response import generate_response
 from generator.extract_attributes import extract_attributes
-from generator.compute_metrics import compute_metrics
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -27,7 +28,8 @@ def main():
     logging.info("Documents embedded")
     # Sample question
-    sample_question = dataset[0]['question']
     logging.info(f"Sample question: {sample_question}")
     # Retrieve relevant documents
@@ -52,23 +54,11 @@ def main():
     # Evaluation: Extract attributes from the response and source documents
     attributes, total_sentences = extract_attributes(sample_question, source_docs, response)
-    # Only proceed if the content is not empty
-    if attributes.content:
-        result_content = attributes.content  # Access the content attribute
-        # Extract the JSON part from the result_content
-        json_start = result_content.find("{")
-        json_end = result_content.rfind("}") + 1
-        json_str = result_content[json_start:json_end]
-        try:
-            result_json = json.loads(json_str)
-            print(json.dumps(result_json, indent=2))
-            # Compute metrics using the extracted attributes
-            metrics = compute_metrics(result_json, total_sentences)
-            print(metrics)
-        except json.JSONDecodeError as e:
-            logging.error(f"JSONDecodeError: {e}")
 if __name__ == "__main__":
     main()

+import logging
 from data.load_dataset import load_data
+from generator import compute_rmse_auc_roc_metrics
 from retriever.chunk_documents import chunk_documents
 from retriever.embed_documents import embed_documents
 from retriever.retrieve_documents import retrieve_top_k_documents
 from generator.initialize_llm import initialize_llm
 from generator.generate_response import generate_response
 from generator.extract_attributes import extract_attributes
+from generator.compute_metrics import get_metrics
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
     logging.info("Documents embedded")
     # Sample question
+    row_num = 1
+    sample_question = dataset[row_num]['question']
     logging.info(f"Sample question: {sample_question}")
     # Retrieve relevant documents
     # Evaluation: Extract attributes from the response and source documents
     attributes, total_sentences = extract_attributes(sample_question, source_docs, response)
+    # Parse the extracted attributes and compute the metrics with get_metrics
+    metrics = get_metrics(attributes, total_sentences)
+    # Compute RMSE and AUC-ROC for the entire dataset
+    #compute_rmse_auc_roc_metrics(llm, dataset, vector_store)
 if __name__ == "__main__":
     main()