badminton001 commited on
Commit
f15ad29
·
verified ·
1 Parent(s): 29812d2

Update evaluation/evaluate_books.py

Browse files
Files changed (1) hide show
  1. evaluation/evaluate_books.py +349 -352
evaluation/evaluate_books.py CHANGED
@@ -1,352 +1,349 @@
1
- #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- import os
5
- import sys
6
- import json
7
- import time
8
- import matplotlib.pyplot as plt
9
- import numpy as np # Import numpy for better array handling
10
- from typing import List, Tuple, Dict, Any, Callable
11
- from sklearn.metrics import precision_score, recall_score, f1_score
12
-
13
- # Set project root
14
- this_dir = os.path.dirname(__file__)
15
- project_root = os.path.abspath(os.path.join(this_dir, ".."))
16
- sys.path.append(project_root)
17
-
18
- # Assuming these imports are correct and available in your project structure
19
- from retrieval.retrieve_books_50000 import get_recommendations as book_recs, book_records
20
- # Import the updated query parser (renamed to user_query_parser)
21
- from utils.query_parser import parse_user_query
22
-
23
-
24
- # ---------- 1. Load Evaluation Data ----------
25
- def load_eval_data(test_file: str, gt_file: str) -> Tuple[List[str], List[List[int]]]:
26
- """
27
- Loads evaluation queries and ground truth data for books.
28
-
29
- Args:
30
- test_file (str): Filename for the test queries JSON.
31
- gt_file (str): Filename for the ground truth JSON.
32
-
33
- Returns:
34
- Tuple[List[str], List[List[int]]]: A tuple containing a list of queries
35
- and a list of corresponding ground truth indices.
36
- """
37
- base = os.path.join(project_root, "evaluation")
38
-
39
- # Read and parse test queries after removing comments
40
- test_path = os.path.join(base, test_file)
41
- try:
42
- with open(test_path, 'r', encoding='utf-8') as f:
43
- lines = f.readlines()
44
- # Remove lines that start with '//' as comments
45
- content = ''.join([l for l in lines if not l.strip().startswith('//')])
46
- queries_raw = json.loads(content)
47
- except FileNotFoundError:
48
- print(f"Error: Test queries file not found at {test_path}")
49
- return [], []
50
- except json.JSONDecodeError:
51
- print(f"Error: Could not decode JSON from {test_path}")
52
- return [], []
53
-
54
- # Read and parse Ground-truth after removing comments
55
- gt_path = os.path.join(base, gt_file)
56
- try:
57
- with open(gt_path, 'r', encoding='utf-8') as f:
58
- gt_lines = f.readlines()
59
- # Remove lines that start with '//' as comments
60
- gt_content = ''.join([l for l in gt_lines if not l.strip().startswith('//')])
61
- gt_map = json.loads(gt_content)
62
- except FileNotFoundError:
63
- print(f"Error: Ground truth file not found at {gt_path}")
64
- return [], []
65
- except json.JSONDecodeError:
66
- print(f"Error: Could not decode JSON from {gt_path}")
67
- return [], []
68
-
69
- # Build query_id -> query text map
70
- id_to_query = {item['query_id']: item['query'] for item in queries_raw}
71
- # Build query_id -> [ground truths] map
72
- id_to_gt = {int(qid): vals for qid, vals in gt_map.items()}
73
-
74
- # Align and return queries and truths
75
- queries, truths = [], []
76
- for qid, qtext in id_to_query.items():
77
- if qid in id_to_gt:
78
- queries.append(qtext)
79
- truths.append(id_to_gt[qid])
80
- print(f"✅ Loaded {len(queries)} queries with ground truths.")
81
- return queries, truths
82
-
83
-
84
- # ---------- 2. Retrieval Function Factory ----------
85
- def retrieval_func_factory(params: Dict[str, Any]) -> Callable[[str], List[Tuple[int, float]]]:
86
- """
87
- Creates a retrieval function based on specified parameters for books.
88
-
89
- Args:
90
- params (Dict[str, Any]): A dictionary containing 'top_k' and 'method' for retrieval.
91
-
92
- Returns:
93
- Callable[[str], List[Tuple[int, float]]]: A function that takes a query string
94
- and returns a list of (index, score) tuples.
95
- """
96
-
97
- def fn(query: str) -> List[Tuple[int, float]]:
98
- # Parse the query to extract tags using the user_query_parser
99
- parsed_tags = parse_user_query(query)
100
-
101
- # Call the book recommendation function with parsed_query_tags
102
- results = book_recs(query, top_k=params['top_k'], method=params['method'], parsed_query_tags=parsed_tags)
103
-
104
- # Create an index map from book title to its original index in book_records
105
- # This assumes book titles are unique enough for mapping. If not, a unique identifier
106
- # (like source_key or a dedicated ID) should be used.
107
- # Ensure book_records is correctly populated and contains 'title' and 'source_key' or similar unique ID
108
- index_map = {item.get('title'): idx for idx, item in enumerate(book_records) if item.get('title')}
109
- # As a fallback or if titles are not unique, consider using 'source_key'
110
- # index_map = {item.get('source_key'): idx for idx, item in enumerate(book_records) if item.get('source_key')}
111
-
112
-
113
- retrieved_items_with_indices = []
114
- for r in results:
115
- # Try to get the original index using the title (or other unique ID)
116
- # Use .get() with a default to avoid KeyError if title not in map
117
- original_idx = index_map.get(r.get('title'))
118
- if original_idx is not None and 'score' in r:
119
- retrieved_items_with_indices.append((original_idx, r['score']))
120
- # If original_idx is None, it means the recommended item's title wasn't found in the index map.
121
- # This could indicate an issue with how index_map is created or how results are structured.
122
-
123
- return retrieved_items_with_indices
124
-
125
- return fn
126
-
127
-
128
- # ---------- 3. Accuracy Evaluation ----------
129
- def evaluate_accuracy(retrieval_func: Callable[[str], List[Tuple[int, float]]], queries: List[str],
130
- truths: List[List[int]]) -> float:
131
- """
132
- Evaluates the accuracy (Top-1 Hit Rate) of the retrieval function for books.
133
-
134
- Args:
135
- retrieval_func (Callable): The retrieval function to evaluate.
136
- queries (List[str]): List of test queries.
137
- truths (List[List[int]]): List of ground truth indices for each query.
138
-
139
- Returns:
140
- float: The Top-1 accuracy score.
141
- """
142
- correct = 0
143
- for q, gt in zip(queries, truths):
144
- results = retrieval_func(q)
145
- # Check if the top-1 result is in the ground truth
146
- if results and results[0][0] in gt:
147
- correct += 1
148
- return correct / len(queries) if queries else 0.0
149
-
150
-
151
- # ---------- 4. Timing ----------
152
- def measure_response_time(retrieval_func: Callable[[str], Any], queries: List[str]) -> float:
153
- """
154
- Measures the average response time per query for the retrieval function.
155
-
156
- Args:
157
- retrieval_func (Callable): The retrieval function to measure.
158
- queries (List[str]): List of test queries.
159
-
160
- Returns:
161
- float: Average response time in seconds per query.
162
- """
163
- start = time.time()
164
- for q in queries:
165
- retrieval_func(q)
166
- end = time.time()
167
- return (end - start) / len(queries) if queries else 0.0
168
-
169
-
170
- # ---------- 5. Visualization ----------
171
- def plot_optimization_report(metrics_data: Dict[str, Dict[str, List[float]]],
172
- param_grid: Dict[str, List[Any]],
173
- save_path_prefix: str = 'optimization_report_books'):
174
- """
175
- Plots the optimization report for retrieval metrics for books, separating plots by metric
176
- and grouping lines by retrieval method.
177
-
178
- Args:
179
- metrics_data (Dict[str, Dict[str, List[float]]]): Structured dictionary
180
- {metric_name: {method_name: [list_of_values_for_top_ks]}}
181
- param_grid (Dict[str, List[Any]]): The original parameter grid used for evaluation.
182
- save_path_prefix (str): Prefix for saving the plot images (e.g., 'optimization_report_books_accuracy.png').
183
- """
184
- top_k_values = sorted(param_grid['top_k'])
185
- methods = param_grid['method']
186
-
187
- for metric_name, method_metrics in metrics_data.items():
188
- plt.figure(figsize=(10, 6))
189
- # Use a consistent color cycle for different methods
190
- colors = plt.cm.get_cmap('viridis', len(methods))
191
-
192
- for i, method in enumerate(methods):
193
- values = method_metrics.get(method, [])
194
- if values:
195
- plt.plot(top_k_values, values, label=f'{method} Method',
196
- marker='o', linestyle='-', linewidth=2, color=colors(i))
197
- # Add text labels for values on the plot
198
- for x, y in zip(top_k_values, values):
199
- plt.text(x, y, f'{y:.3f}', ha='center', va='bottom', fontsize=8)
200
-
201
-
202
- plt.xlabel('Top-k Value')
203
- plt.ylabel(metric_name.replace('_', ' ').title())
204
- plt.title(f'Book Retrieval Optimization - {metric_name.replace("_", " ").title()} by Top-k and Method')
205
- plt.xticks(top_k_values)
206
- plt.legend(title='Retrieval Method')
207
- plt.grid(True, linestyle='--', alpha=0.7)
208
- plt.tight_layout()
209
- save_file = f"{save_path_prefix}_{metric_name}.png"
210
- plt.savefig(save_file)
211
- print(f"✅ Plot saved to {save_file}")
212
- plt.close()
213
-
214
-
215
- # ---------- 6. Top-k and Binary Metrics ----------
216
- def compute_topk_metrics(retrieval_func: Callable, queries: List[str], truths: List[List[int]],
217
- k_values: List[int] = [1, 3, 5]):
218
- """
219
- Computes Top-k Hit Rates, and average Precision, Recall, and F1-score for books.
220
-
221
- Args:
222
- retrieval_func (Callable): The retrieval function to evaluate.
223
- queries (List[str]): List of test queries.
224
- truths (List[List[int]]): List of ground truth indices for each query.
225
- k_values (List[int]): List of k values for Top-k Hit Rate calculation.
226
- """
227
- hit_rates = {k: 0 for k in k_values}
228
- precisions, recalls, f1s = [], [], []
229
- total_queries = len(queries)
230
-
231
- # Total size of the book corpus, needed for binary metrics
232
- corpus_size = len(book_records)
233
-
234
- if total_queries == 0:
235
- print("No queries to evaluate.")
236
- return
237
-
238
- for q_idx, (q, gt) in enumerate(zip(queries, truths)):
239
- retrieved = retrieval_func(q)
240
- # Extract only the indices of retrieved items
241
- retrieved_ids = [idx for idx, _ in retrieved]
242
-
243
- # Calculate Top-k Hit Rate
244
- for k in k_values:
245
- if any(pred_id in gt for pred_id in retrieved_ids[:k]):
246
- hit_rates[k] += 1
247
-
248
- # Prepare for Precision, Recall, F1-score (binary classification for each item in corpus)
249
- # y_true: A binary list where 1 means the item is a ground truth, 0 otherwise
250
- y_true = [1 if i in gt else 0 for i in range(corpus_size)]
251
-
252
- # y_pred: A binary list where 1 means the item was retrieved, 0 otherwise
253
- # Only consider items that were actually retrieved by the system
254
- y_pred = [0] * corpus_size
255
- for idx in retrieved_ids:
256
- if idx < corpus_size: # Ensure index is within bounds
257
- y_pred[idx] = 1
258
-
259
- # Compute metrics for the current query and append
260
- precisions.append(precision_score(y_true, y_pred, zero_division=0))
261
- recalls.append(recall_score(y_true, y_pred, zero_division=0))
262
- f1s.append(f1_score(y_true, y_pred, zero_division=0))
263
-
264
- print("\n--- Book Retrieval Metrics ---")
265
- for k in k_values:
266
- print(f"Top@{k} Hit Rate: {hit_rates[k] / total_queries:.4f}")
267
-
268
- print(f"Avg Precision: {sum(precisions) / total_queries:.4f}")
269
- print(f"Avg Recall: {sum(recalls) / total_queries:.4f}")
270
- print(f"Avg F1: {sum(f1s) / total_queries:.4f}")
271
-
272
-
273
- # ---------- 7. Main Execution ----------
274
- if __name__ == '__main__':
275
- # Load book specific evaluation data
276
- # Make sure these files exist in your 'evaluation' directory
277
- queries_books, truths_books = load_eval_data('test/test_queries_books_100.json', 'test/ground_truth_books_100.json')
278
-
279
- if not queries_books:
280
- print("Exiting evaluation due to no queries loaded.")
281
- sys.exit(1)
282
-
283
- print(f"✅ {len(queries_books)} book queries loaded.\n")
284
-
285
- # Define parameter grid for optimization
286
- param_grid = {
287
- 'top_k': [1, 3, 5, 10], # Added 10 for more granular evaluation
288
- 'method': ['tfidf', 'sbert']
289
- }
290
-
291
- best_score, best_params = -1.0, {}
292
-
293
- # Store metrics in a more structured way for easier plotting
294
- metrics_for_plotting = {
295
- 'accuracy': {method: [] for method in param_grid['method']},
296
- 'response_time': {method: [] for method in param_grid['method']}
297
- }
298
-
299
- from itertools import product
300
-
301
- print("--- Starting Book Retrieval Evaluation ---")
302
- # Sort top_k values to ensure consistent plotting order
303
- sorted_top_k = sorted(param_grid['top_k'])
304
-
305
- # Temporary storage to build sorted lists for plotting
306
- temp_metrics_by_method_topk = {
307
- method: {k: {'accuracy': 0, 'response_time': 0} for k in sorted_top_k}
308
- for method in param_grid['method']
309
- }
310
-
311
- for combo in product(sorted_top_k, param_grid['method']):
312
- params = {
313
- 'top_k': combo[0],
314
- 'method': combo[1]
315
- }
316
-
317
- # Create retrieval function for current parameters
318
- func = retrieval_func_factory(params)
319
-
320
- # Evaluate accuracy and response time
321
- score = evaluate_accuracy(func, queries_books, truths_books)
322
- avg_time = measure_response_time(func, queries_books)
323
-
324
- # Store data for plotting
325
- temp_metrics_by_method_topk[params['method']][params['top_k']]['accuracy'] = score
326
- temp_metrics_by_method_topk[params['method']][params['top_k']]['response_time'] = avg_time
327
-
328
- print(f"Params {params} -> Acc: {score:.4f}, Time: {avg_time:.4f}s")
329
-
330
- # Track the best parameters based on accuracy (still global best acc)
331
- if score > best_score:
332
- best_score, best_params = score, params
333
-
334
- # Populate the metrics_for_plotting dictionary after all evaluations are done
335
- # This ensures the lists are in the correct order based on sorted_top_k
336
- for method in param_grid['method']:
337
- for k in sorted_top_k:
338
- metrics_for_plotting['accuracy'][method].append(temp_metrics_by_method_topk[method][k]['accuracy'])
339
- metrics_for_plotting['response_time'][method].append(temp_metrics_by_method_topk[method][k]['response_time'])
340
-
341
-
342
- print(f"\n✨ Best Params for Books: {best_params}, Accuracy: {best_score:.4f}")
343
-
344
- # Plot the optimization report using the improved function
345
- plot_optimization_report(metrics_for_plotting, param_grid,
346
- save_path_prefix='optimization_report_books')
347
-
348
- # Compute and print Top-k and binary metrics for the best performing model
349
- print("\n--- Detailed Metrics for Best Book Retrieval Model ---")
350
- compute_topk_metrics(retrieval_func_factory(best_params), queries_books, truths_books)
351
-
352
- print("\nBook evaluation complete.")
 
1
+ import os
2
+ import sys
3
+ import json
4
+ import time
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np # Import numpy for better array handling
7
+ from typing import List, Tuple, Dict, Any, Callable
8
+ from sklearn.metrics import precision_score, recall_score, f1_score
9
+
10
+ # Set project root
11
+ this_dir = os.path.dirname(__file__)
12
+ project_root = os.path.abspath(os.path.join(this_dir, ".."))
13
+ sys.path.append(project_root)
14
+
15
+ # Assuming these imports are correct and available in your project structure
16
+ from retrieval.retrieve_books_50000 import get_recommendations as book_recs, book_records
17
+ # Import the updated query parser (renamed to user_query_parser)
18
+ from utils.query_parser import parse_user_query
19
+
20
+
21
+ # ---------- 1. Load Evaluation Data ----------
22
+ def load_eval_data(test_file: str, gt_file: str) -> Tuple[List[str], List[List[int]]]:
23
+ """
24
+ Loads evaluation queries and ground truth data for books.
25
+
26
+ Args:
27
+ test_file (str): Filename for the test queries JSON.
28
+ gt_file (str): Filename for the ground truth JSON.
29
+
30
+ Returns:
31
+ Tuple[List[str], List[List[int]]]: A tuple containing a list of queries
32
+ and a list of corresponding ground truth indices.
33
+ """
34
+ base = os.path.join(project_root, "evaluation")
35
+
36
+ # Read and parse test queries after removing comments
37
+ test_path = os.path.join(base, test_file)
38
+ try:
39
+ with open(test_path, 'r', encoding='utf-8') as f:
40
+ lines = f.readlines()
41
+ # Remove lines that start with '//' as comments
42
+ content = ''.join([l for l in lines if not l.strip().startswith('//')])
43
+ queries_raw = json.loads(content)
44
+ except FileNotFoundError:
45
+ print(f"Error: Test queries file not found at {test_path}")
46
+ return [], []
47
+ except json.JSONDecodeError:
48
+ print(f"Error: Could not decode JSON from {test_path}")
49
+ return [], []
50
+
51
+ # Read and parse Ground-truth after removing comments
52
+ gt_path = os.path.join(base, gt_file)
53
+ try:
54
+ with open(gt_path, 'r', encoding='utf-8') as f:
55
+ gt_lines = f.readlines()
56
+ # Remove lines that start with '//' as comments
57
+ gt_content = ''.join([l for l in gt_lines if not l.strip().startswith('//')])
58
+ gt_map = json.loads(gt_content)
59
+ except FileNotFoundError:
60
+ print(f"Error: Ground truth file not found at {gt_path}")
61
+ return [], []
62
+ except json.JSONDecodeError:
63
+ print(f"Error: Could not decode JSON from {gt_path}")
64
+ return [], []
65
+
66
+ # Build query_id -> query text map
67
+ id_to_query = {item['query_id']: item['query'] for item in queries_raw}
68
+ # Build query_id -> [ground truths] map
69
+ id_to_gt = {int(qid): vals for qid, vals in gt_map.items()}
70
+
71
+ # Align and return queries and truths
72
+ queries, truths = [], []
73
+ for qid, qtext in id_to_query.items():
74
+ if qid in id_to_gt:
75
+ queries.append(qtext)
76
+ truths.append(id_to_gt[qid])
77
+ print(f"✅ Loaded {len(queries)} queries with ground truths.")
78
+ return queries, truths
79
+
80
+
81
+ # ---------- 2. Retrieval Function Factory ----------
82
+ def retrieval_func_factory(params: Dict[str, Any]) -> Callable[[str], List[Tuple[int, float]]]:
83
+ """
84
+ Creates a retrieval function based on specified parameters for books.
85
+
86
+ Args:
87
+ params (Dict[str, Any]): A dictionary containing 'top_k' and 'method' for retrieval.
88
+
89
+ Returns:
90
+ Callable[[str], List[Tuple[int, float]]]: A function that takes a query string
91
+ and returns a list of (index, score) tuples.
92
+ """
93
+
94
+ def fn(query: str) -> List[Tuple[int, float]]:
95
+ # Parse the query to extract tags using the user_query_parser
96
+ parsed_tags = parse_user_query(query)
97
+
98
+ # Call the book recommendation function with parsed_query_tags
99
+ results = book_recs(query, top_k=params['top_k'], method=params['method'], parsed_query_tags=parsed_tags)
100
+
101
+ # Create an index map from book title to its original index in book_records
102
+ # This assumes book titles are unique enough for mapping. If not, a unique identifier
103
+ # (like source_key or a dedicated ID) should be used.
104
+ # Ensure book_records is correctly populated and contains 'title' and 'source_key' or similar unique ID
105
+ index_map = {item.get('title'): idx for idx, item in enumerate(book_records) if item.get('title')}
106
+ # As a fallback or if titles are not unique, consider using 'source_key'
107
+ # index_map = {item.get('source_key'): idx for idx, item in enumerate(book_records) if item.get('source_key')}
108
+
109
+
110
+ retrieved_items_with_indices = []
111
+ for r in results:
112
+ # Try to get the original index using the title (or other unique ID)
113
+ # Use .get() with a default to avoid KeyError if title not in map
114
+ original_idx = index_map.get(r.get('title'))
115
+ if original_idx is not None and 'score' in r:
116
+ retrieved_items_with_indices.append((original_idx, r['score']))
117
+ # If original_idx is None, it means the recommended item's title wasn't found in the index map.
118
+ # This could indicate an issue with how index_map is created or how results are structured.
119
+
120
+ return retrieved_items_with_indices
121
+
122
+ return fn
123
+
124
+
125
+ # ---------- 3. Accuracy Evaluation ----------
126
+ def evaluate_accuracy(retrieval_func: Callable[[str], List[Tuple[int, float]]], queries: List[str],
127
+ truths: List[List[int]]) -> float:
128
+ """
129
+ Evaluates the accuracy (Top-1 Hit Rate) of the retrieval function for books.
130
+
131
+ Args:
132
+ retrieval_func (Callable): The retrieval function to evaluate.
133
+ queries (List[str]): List of test queries.
134
+ truths (List[List[int]]): List of ground truth indices for each query.
135
+
136
+ Returns:
137
+ float: The Top-1 accuracy score.
138
+ """
139
+ correct = 0
140
+ for q, gt in zip(queries, truths):
141
+ results = retrieval_func(q)
142
+ # Check if the top-1 result is in the ground truth
143
+ if results and results[0][0] in gt:
144
+ correct += 1
145
+ return correct / len(queries) if queries else 0.0
146
+
147
+
148
+ # ---------- 4. Timing ----------
149
+ def measure_response_time(retrieval_func: Callable[[str], Any], queries: List[str]) -> float:
150
+ """
151
+ Measures the average response time per query for the retrieval function.
152
+
153
+ Args:
154
+ retrieval_func (Callable): The retrieval function to measure.
155
+ queries (List[str]): List of test queries.
156
+
157
+ Returns:
158
+ float: Average response time in seconds per query.
159
+ """
160
+ start = time.time()
161
+ for q in queries:
162
+ retrieval_func(q)
163
+ end = time.time()
164
+ return (end - start) / len(queries) if queries else 0.0
165
+
166
+
167
+ # ---------- 5. Visualization ----------
168
def plot_optimization_report(metrics_data: Dict[str, Dict[str, List[float]]],
                             param_grid: Dict[str, List[Any]],
                             save_path_prefix: str = 'optimization_report_books'):
    """
    Plots the optimization report for retrieval metrics for books, one figure
    per metric, with one line per retrieval method.

    Args:
        metrics_data (Dict[str, Dict[str, List[float]]]): Structured dictionary
            {metric_name: {method_name: [values ordered by sorted top_k]}}
        param_grid (Dict[str, List[Any]]): The parameter grid used for
            evaluation ('top_k' and 'method' keys).
        save_path_prefix (str): Prefix for the saved plot images
            (e.g. 'optimization_report_books_accuracy.png').
    """
    top_k_values = sorted(param_grid['top_k'])
    methods = param_grid['method']

    # Sample one distinct viridis color per method by indexing the colormap
    # directly. This replaces plt.cm.get_cmap('viridis', n), which was
    # deprecated in Matplotlib 3.7 and removed in 3.9; the colors are also
    # computed once instead of once per metric figure.
    colors = plt.cm.viridis(np.linspace(0.0, 1.0, max(len(methods), 1)))

    for metric_name, method_metrics in metrics_data.items():
        plt.figure(figsize=(10, 6))

        for i, method in enumerate(methods):
            values = method_metrics.get(method, [])
            if values:
                plt.plot(top_k_values, values, label=f'{method} Method',
                         marker='o', linestyle='-', linewidth=2, color=colors[i])
                # Annotate every data point with its value for easy reading.
                for x, y in zip(top_k_values, values):
                    plt.text(x, y, f'{y:.3f}', ha='center', va='bottom', fontsize=8)

        plt.xlabel('Top-k Value')
        plt.ylabel(metric_name.replace('_', ' ').title())
        plt.title(f'Book Retrieval Optimization - {metric_name.replace("_", " ").title()} by Top-k and Method')
        plt.xticks(top_k_values)
        plt.legend(title='Retrieval Method')
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        save_file = f"{save_path_prefix}_{metric_name}.png"
        plt.savefig(save_file)
        print(f"✅ Plot saved to {save_file}")
        plt.close()
210
+
211
+
212
+ # ---------- 6. Top-k and Binary Metrics ----------
213
def compute_topk_metrics(retrieval_func: Callable, queries: List[str], truths: List[List[int]],
                         k_values: List[int] = [1, 3, 5], corpus_size: int = None):
    """
    Computes and prints Top-k hit rates and average Precision/Recall/F1.

    Precision, recall and F1 treat each query as a binary classification over
    the corpus (retrieved vs. not, relevant vs. not). They are computed
    directly from set intersections, which is mathematically equivalent to —
    but far cheaper than — materialising two corpus-sized 0/1 vectors per
    query and handing them to sklearn (the corpus holds tens of thousands
    of books). As before, 0/0 cases yield 0.0.

    Args:
        retrieval_func (Callable): The retrieval function to evaluate; maps a
            query string to a list of (corpus_index, score) tuples.
        queries (List[str]): List of test queries.
        truths (List[List[int]]): List of ground truth indices per query.
        k_values (List[int]): k values for the Top-k hit-rate calculation
            (read-only; the default list is never mutated).
        corpus_size (int, optional): Total number of items in the corpus.
            Defaults to len(book_records) when omitted (backward compatible).
    """
    total_queries = len(queries)
    if total_queries == 0:
        print("No queries to evaluate.")
        return

    if corpus_size is None:
        # Total size of the book corpus, needed for the binary metrics.
        corpus_size = len(book_records)

    hit_rates = {k: 0 for k in k_values}
    precisions, recalls, f1s = [], [], []

    for q, gt in zip(queries, truths):
        retrieved = retrieval_func(q)
        # Extract only the indices of retrieved items
        retrieved_ids = [idx for idx, _ in retrieved]

        # Top-k hit rate: a hit when any of the first k results is relevant.
        for k in k_values:
            if any(pred_id in gt for pred_id in retrieved_ids[:k]):
                hit_rates[k] += 1

        # Binary metrics via set arithmetic. Indices at or beyond corpus_size
        # are ignored, matching the old bounds check on the 0/1 vectors.
        predicted = {idx for idx in retrieved_ids if idx < corpus_size}
        relevant = {g for g in gt if g < corpus_size}
        true_positives = len(predicted & relevant)

        precision = true_positives / len(predicted) if predicted else 0.0
        recall = true_positives / len(relevant) if relevant else 0.0
        denom = precision + recall
        f1 = (2 * precision * recall / denom) if denom else 0.0

        precisions.append(precision)
        recalls.append(recall)
        f1s.append(f1)

    print("\n--- Book Retrieval Metrics ---")
    for k in k_values:
        print(f"Top@{k} Hit Rate: {hit_rates[k] / total_queries:.4f}")

    print(f"Avg Precision: {sum(precisions) / total_queries:.4f}")
    print(f"Avg Recall: {sum(recalls) / total_queries:.4f}")
    print(f"Avg F1: {sum(f1s) / total_queries:.4f}")
268
+
269
+
270
+ # ---------- 7. Main Execution ----------
271
if __name__ == '__main__':
    # Load book-specific evaluation data.
    # Both files must exist under the project's 'evaluation' directory.
    queries_books, truths_books = load_eval_data('test/test_queries_books_100.json', 'test/ground_truth_books_100.json')

    if not queries_books:
        print("Exiting evaluation due to no queries loaded.")
        sys.exit(1)

    print(f" {len(queries_books)} book queries loaded.\n")

    # Parameter grid: every (top_k, method) combination is evaluated below.
    param_grid = {
        'top_k': [1, 3, 5, 10],  # Added 10 for more granular evaluation
        'method': ['tfidf', 'sbert']
    }

    best_score, best_params = -1.0, {}

    # Shape expected by plot_optimization_report:
    # {metric: {method: [values ordered by sorted top_k]}}
    metrics_for_plotting = {
        'accuracy': {method: [] for method in param_grid['method']},
        'response_time': {method: [] for method in param_grid['method']}
    }

    from itertools import product

    print("--- Starting Book Retrieval Evaluation ---")
    # Sort top_k values so plot x-axes and result lists share one order.
    sorted_top_k = sorted(param_grid['top_k'])

    # Temporary storage keyed by (method, top_k); flattened into ordered
    # lists for plotting once the grid search completes.
    temp_metrics_by_method_topk = {
        method: {k: {'accuracy': 0, 'response_time': 0} for k in sorted_top_k}
        for method in param_grid['method']
    }

    for combo in product(sorted_top_k, param_grid['method']):
        params = {
            'top_k': combo[0],
            'method': combo[1]
        }

        # Build a retrieval function bound to this parameter combination.
        func = retrieval_func_factory(params)

        # Evaluate Top-1 accuracy and average per-query latency.
        score = evaluate_accuracy(func, queries_books, truths_books)
        avg_time = measure_response_time(func, queries_books)

        # Record results for later plotting.
        temp_metrics_by_method_topk[params['method']][params['top_k']]['accuracy'] = score
        temp_metrics_by_method_topk[params['method']][params['top_k']]['response_time'] = avg_time

        print(f"Params {params} -> Acc: {score:.4f}, Time: {avg_time:.4f}s")

        # Track the globally best parameters by accuracy.
        if score > best_score:
            best_score, best_params = score, params

    # Flatten the per-(method, top_k) results into lists ordered by
    # sorted_top_k, which is the order plot_optimization_report expects.
    for method in param_grid['method']:
        for k in sorted_top_k:
            metrics_for_plotting['accuracy'][method].append(temp_metrics_by_method_topk[method][k]['accuracy'])
            metrics_for_plotting['response_time'][method].append(temp_metrics_by_method_topk[method][k]['response_time'])


    print(f"\n✨ Best Params for Books: {best_params}, Accuracy: {best_score:.4f}")

    # Plot one optimization-report figure per collected metric.
    plot_optimization_report(metrics_for_plotting, param_grid,
                             save_path_prefix='optimization_report_books')

    # Detailed Top-k and binary metrics for the best configuration found.
    print("\n--- Detailed Metrics for Best Book Retrieval Model ---")
    compute_topk_metrics(retrieval_func_factory(best_params), queries_books, truths_books)

    print("\nBook evaluation complete.")