{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "1408eea5", "metadata": {}, "outputs": [], "source": [
"import json\n",
"\n",
"# Path to the sub-claim verifier results; adjust here if the file moves.\n",
"RESULTS_PATH = '/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json'\n",
"\n",
"with open(RESULTS_PATH, 'r', encoding='utf-8') as f:\n",
"    data_item = json.load(f)\n",
"\n",
"# Collect the attribution accuracy score of every record.\n",
"data = [item['attribution']['accuracy'] for item in data_item]"
] }, { "cell_type": "code", "execution_count": null, "id": "c706e713", "metadata": {}, "outputs": [], "source": [
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from scipy import stats\n",
"\n",
"# Convert to a pandas Series for convenience\n",
"s = pd.Series(data)\n",
"\n",
"# --- 1. Basic statistics ---\n",
"summary = s.describe()\n",
"print(\"Basic statistics:\")\n",
"print(summary)\n",
"\n",
"# Extra metrics\n",
"print(\"\\nAdditional info:\")\n",
"print(f\"Variance: {s.var():.2f}\")\n",
"print(f\"Skewness: {s.skew():.2f}\")\n",
"print(f\"Kurtosis: {s.kurt():.2f}\")\n",
"print(f\"Mode: {s.mode().tolist()}\")\n",
"\n",
"# --- 2. 
Visualization ---\n",
"plt.figure(figsize=(8, 5))\n",
"sns.histplot(s, bins=10, kde=True, color='skyblue', edgecolor='black')\n",
"plt.title(\"Distribution curve of data\")\n",
"plt.xlabel(\"Value\")\n",
"plt.ylabel(\"Frequency\")\n",
"plt.show()"
] }, { "cell_type": "code", "execution_count": null, "id": "860aff4b", "metadata": {}, "outputs": [], "source": [
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"s = pd.Series(data)\n",
"\n",
"# Compute IQR boundaries (Tukey's rule: fences at 1.5 * IQR beyond the quartiles)\n",
"Q1 = s.quantile(0.25)\n",
"Q3 = s.quantile(0.75)\n",
"IQR = Q3 - Q1\n",
"\n",
"lower_lim = Q1 - 1.5 * IQR\n",
"upper_lim = Q3 + 1.5 * IQR\n",
"\n",
"cleaned = s[(s >= lower_lim) & (s <= upper_lim)]\n",
"\n",
"print(\"Cleaned data:\")\n",
"print(len(cleaned.tolist()))\n",
"\n",
"sns.boxplot(x=s, color=\"lightblue\")\n",
"plt.title(\"Before cleaning\")\n",
"plt.show()\n",
"\n",
"sns.boxplot(x=cleaned, color=\"lightgreen\")\n",
"plt.title(\"After IQR cleaning\")\n",
"plt.show()"
] }, { "cell_type": "code", "execution_count": null, "id": "4b1f16b3", "metadata": {}, "outputs": [], "source": [
"import numpy as np\n",
"from scipy import stats\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Keep points whose absolute z-score is below the threshold.\n",
"z_scores = np.abs(stats.zscore(s))\n",
"threshold = 3  # commonly used threshold\n",
"cleaned_z = s[z_scores < threshold]\n",
"\n",
"print(\"Cleaned data (Z-score method):\")\n",
"print(len(cleaned_z.tolist()))\n",
"\n",
"sns.boxplot(x=s, color=\"lightblue\")\n",
"plt.title(\"Before cleaning\")\n",
"plt.show()\n",
"\n",
"sns.boxplot(x=cleaned_z, color=\"lightgreen\")\n",
"plt.title(\"After Z-score cleaning\")\n",
"plt.show()"
] }, { "cell_type": "code", "execution_count": null, "id": "f97f821e", "metadata": {}, 
"outputs": [], "source": [
"import json\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"def analyze_doclens_results(file_path):\n",
"    \"\"\"\n",
"    Loads, parses, and analyzes the DOCLENS evaluation results from a JSON file.\n",
"\n",
"    Args:\n",
"        file_path (str): The path to the JSON results file.\n",
"\n",
"    Returns:\n",
"        pandas.DataFrame: A DataFrame with the aggregated mean scores,\n",
"        or None if the file could not be read.\n",
"    \"\"\"\n",
"    try:\n",
"        with open(file_path, 'r', encoding='utf-8') as f:\n",
"            data = json.load(f)\n",
"    except FileNotFoundError:\n",
"        print(f\"Error: The file '{file_path}' was not found.\")\n",
"        return None\n",
"    except json.JSONDecodeError:\n",
"        print(f\"Error: The file '{file_path}' is not a valid JSON file.\")\n",
"        return None\n",
"\n",
"    # Flatten the nested records: keep only id, version and the accuracy scores.\n",
"    # .get with a {} / 0 default tolerates records missing a metric.\n",
"    parsed_data = [\n",
"        {\n",
"            'id': record.get('id'),\n",
"            'version': record.get('version'),\n",
"            'completeness': record.get('completeness', {}).get('accuracy', 0),\n",
"            'conciseness': record.get('conciseness', {}).get('accuracy', 0),\n",
"            'attribution': record.get('attribution', {}).get('accuracy', 0),\n",
"        }\n",
"        for record in data\n",
"    ]\n",
"    df = pd.DataFrame(parsed_data)\n",
"\n",
"    # Order versions so 'easy', 'intermediate', 'hard' plot in that order.\n",
"    version_order = ['easy', 'intermediate', 'hard']\n",
"    df['version'] = pd.Categorical(df['version'], categories=version_order, ordered=True)\n",
"\n",
"    # observed=False (today's default, made explicit to silence the pandas\n",
"    # FutureWarning on categorical groupby) keeps all declared categories.\n",
"    agg_results = (\n",
"        df.groupby('version', observed=False)[['completeness', 'conciseness', 'attribution']]\n",
"        .mean()\n",
"        .reset_index()\n",
"    )\n",
"\n",
"    print(\"--- Aggregated Mean Scores ---\")\n",
"    print(agg_results.to_string(index=False))\n",
"\n",
"    return agg_results\n",
"\n",
"def visualize_results(df):\n",
"    \"\"\"\n",
"    Generates and saves bar charts to visualize the aggregated results.\n",
"    \"\"\"\n",
"    if df is None or df.empty:\n",
"        print(\"Cannot visualize results. DataFrame is empty.\")\n",
"        return\n",
"\n",
"    sns.set_style(\"whitegrid\")\n",
"    fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)\n",
"    fig.suptitle('Average Evaluation Metrics Across Summary Versions', fontsize=16)\n",
"\n",
"    # One panel per metric. hue='version' + legend=False keeps the per-version\n",
"    # colors without the deprecated palette-without-hue usage (seaborn >= 0.13).\n",
"    panels = [\n",
"        ('completeness', 'Completeness (Claim Recall)', 'Blues_d'),\n",
"        ('conciseness', 'Conciseness (Claim Precision)', 'Greens_d'),\n",
"        ('attribution', 'Attribution', 'Oranges_d'),\n",
"    ]\n",
"    for ax, (metric, title, palette) in zip(axes, panels):\n",
"        sns.barplot(ax=ax, x='version', y=metric, hue='version', data=df,\n",
"                    palette=palette, legend=False)\n",
"        ax.set_title(title)\n",
"        ax.set_xlabel('Summary Version')\n",
"        ax.set_ylabel('')\n",
"    axes[0].set_ylabel('Average Accuracy (%)')\n",
"\n",
"    # Improve layout and save the figure\n",
"    plt.tight_layout(rect=[0, 0, 1, 0.96])\n",
"    plt.savefig(\"doclens_evaluation_summary.png\", dpi=300)\n",
"    print(\"\\nChart saved as 'doclens_evaluation_summary.png'\")\n",
"    plt.show()\n",
"\n",
"\n",
"# --- Main Execution ---\n",
"# Replace with the actual path to your results file.\n",
"results_file = '/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json'\n",
"aggregated_data = analyze_doclens_results(results_file)\n",
"\n",
"if aggregated_data is not None:\n",
"    visualize_results(aggregated_data)"
] }, {
"cell_type": "markdown", "id": "b5afb981", "metadata": {}, "source": [ "## Eliminate dataset" ] }, { "cell_type": "code", "execution_count": 18, "id": "b29bcf30", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Rejected 15 items due to low attribution.\n", "Rejected 9 additional items due to incorrect completeness trend.\n", "\n", "--- Filtering Summary ---\n", "Total unique items analyzed: 100\n", "Items kept (High Quality): 76\n", "Items rejected (Low Quality): 24\n", "Saved data to '/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B_clean.json'\n", "Saved data to '/home/mshahidul/readctrl/results/dataset_quality_check/rejected_dataset.json'\n" ] } ], "source": [
"import json\n",
"import pandas as pd\n",
"\n",
"def filter_low_quality_data(file_path, attribution_threshold=80.0, completeness_trend_check=True):\n",
"    \"\"\"\n",
"    Loads DOCLENS results, filters out low-quality data, and returns clean/rejected data.\n",
"\n",
"    Args:\n",
"        file_path (str): Path to the JSON results file.\n",
"        attribution_threshold (float): An item is rejected when the attribution\n",
"            accuracy of ANY of its versions falls below this value.\n",
"        completeness_trend_check (bool): Also reject items whose 'easy' summary\n",
"            is more complete than the 'hard' one (wrong trend).\n",
"\n",
"    Returns:\n",
"        tuple: (clean records list, rejected records list), or (None, None)\n",
"        if the file could not be read.\n",
"    \"\"\"\n",
"    try:\n",
"        with open(file_path, 'r', encoding='utf-8') as f:\n",
"            data = json.load(f)\n",
"    except (FileNotFoundError, json.JSONDecodeError) as e:\n",
"        print(f\"Error loading file: {e}\")\n",
"        return None, None\n",
"\n",
"    # Flatten the nested JSON to numeric accuracy scores per (id, version),\n",
"    # so the pivot below operates on numeric columns.\n",
"    parsed_data = [\n",
"        {\n",
"            'id': record.get('id'),\n",
"            'version': record.get('version'),\n",
"            'completeness': record.get('completeness', {}).get('accuracy', 0),\n",
"            'conciseness': record.get('conciseness', {}).get('accuracy', 0),\n",
"            'attribution': record.get('attribution', {}).get('accuracy', 0),\n",
"        }\n",
"        for record in data\n",
"    ]\n",
"    df = pd.DataFrame(parsed_data)\n",
"\n",
"    all_ids = set(df['id'].unique())\n",
"    rejected_ids = set()\n",
"\n",
"    # Pivot so each id becomes one row with '<metric>_<version>' columns,\n",
"    # which makes cross-version comparisons one vectorized expression.\n",
"    pivot_df = df.pivot_table(\n",
"        index='id',\n",
"        columns='version',\n",
"        values=['completeness', 'conciseness', 'attribution']\n",
"    )\n",
"    pivot_df.columns = ['_'.join(map(str, col)).strip() for col in pivot_df.columns.values]\n",
"\n",
"    # --- Filter 1: Low attribution in any version ---\n",
"    low_attribution_mask = (pivot_df['attribution_easy'] < attribution_threshold) | \\\n",
"                           (pivot_df['attribution_intermediate'] < attribution_threshold) | \\\n",
"                           (pivot_df['attribution_hard'] < attribution_threshold)\n",
"    rejected_attribution_ids = pivot_df[low_attribution_mask].index\n",
"    rejected_ids.update(rejected_attribution_ids)\n",
"    print(f\"Rejected {len(rejected_attribution_ids)} items due to low attribution.\")\n",
"\n",
"    # --- Filter 2: Incorrect completeness trend ---\n",
"    if completeness_trend_check:\n",
"        bad_trend_mask = pivot_df['completeness_easy'] > pivot_df['completeness_hard']\n",
"        rejected_trend_ids = pivot_df[bad_trend_mask].index\n",
"        # Count only ids not already rejected by the attribution filter.\n",
"        newly_rejected_count = len(rejected_trend_ids.difference(rejected_ids))\n",
"        rejected_ids.update(rejected_trend_ids)\n",
"        print(f\"Rejected {newly_rejected_count} additional items due to incorrect completeness trend.\")\n",
"\n",
"    # --- Separate the data ---\n",
"    clean_ids = all_ids - rejected_ids\n",
"\n",
"    # Filter the ORIGINAL records (not the flattened ones) to keep full structure.\n",
"    original_df = pd.DataFrame(data)\n",
"    clean_data = original_df[original_df['id'].isin(clean_ids)].to_dict('records')\n",
"    rejected_data = original_df[original_df['id'].isin(rejected_ids)].to_dict('records')\n",
"\n",
"    print(\"\\n--- Filtering Summary ---\")\n",
"    print(f\"Total unique items analyzed: {len(all_ids)}\")\n",
"    print(f\"Items kept (High Quality): {len(clean_ids)}\")\n",
"    print(f\"Items rejected (Low Quality): {len(rejected_ids)}\")\n",
"\n",
"    return clean_data, rejected_data\n",
"\n",
"def save_json(data, file_path):\n",
"    \"\"\"Saves data to a JSON file.\"\"\"\n",
"    with open(file_path, 'w', encoding='utf-8') as f:\n",
"        json.dump(data, f, indent=4, ensure_ascii=False)\n",
"    print(f\"Saved data to '{file_path}'\")\n",
"\n",
"\n",
"# --- Main Execution ---\n",
"# Replace with your file paths and desired thresholds\n",
"RESULTS_FILE = '/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json'\n",
"ATTRIBUTION_THRESHOLD = 80.0\n",
"\n",
"clean_dataset, rejected_dataset = filter_low_quality_data(\n",
"    RESULTS_FILE,\n",
"    attribution_threshold=ATTRIBUTION_THRESHOLD\n",
")\n",
"\n",
"if clean_dataset is not None:\n",
"    save_json(clean_dataset, '/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B_clean.json')\n",
"    save_json(rejected_dataset, '/home/mshahidul/readctrl/results/dataset_quality_check/rejected_dataset.json')"
] } ], "metadata": { "kernelspec": { "display_name": "unsloth", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.11" } }, "nbformat": 4, "nbformat_minor": 5 }