{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1408eea5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "with open('/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json', 'r') as f:\n",
    "    data_item = json.load(f)\n",
    "data = []\n",
    "for item in data_item:\n",
    "    attribution=item['attribution']['accuracy']\n",
    "    data.append(attribution)"
   ]
  },
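  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a0b1c2d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick sanity check (not part of the original notebook): confirm how many\n",
    "# attribution accuracy values were collected and preview the first few.\n",
    "print(f\"Collected {len(data)} attribution accuracy values\")\n",
    "print(data[:5])"
   ]
  },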
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c706e713",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from scipy import stats\n",
    "\n",
    "# Example data list\n",
    "# data = [12, 15, 14, 18, 19, 17, 21]\n",
    "\n",
    "# Convert to a pandas Series for convenience\n",
    "s = pd.Series(data)\n",
    "\n",
    "# --- 1. Basic statistics ---\n",
    "summary = s.describe()\n",
    "print(\"Basic statistics:\")\n",
    "print(summary)\n",
    "\n",
    "# Extra metrics\n",
    "print(\"\\nAdditional info:\")\n",
    "print(f\"Variance: {s.var():.2f}\")\n",
    "print(f\"Skewness: {s.skew():.2f}\")\n",
    "print(f\"Kurtosis: {s.kurt():.2f}\")\n",
    "print(f\"Mode: {s.mode().tolist()}\")\n",
    "\n",
    "# --- 2. Visualization ---\n",
    "plt.figure(figsize=(8, 5))\n",
    "sns.histplot(s, bins=10, kde=True, color='skyblue', edgecolor='black')\n",
    "plt.title(\"Distribution curve of data\")\n",
    "plt.xlabel(\"Value\")\n",
    "plt.ylabel(\"Frequency\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "860aff4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "s = pd.Series(data)  # sample data with an outlier\n",
    "\n",
    "# Compute IQR boundaries\n",
    "Q1 = s.quantile(0.25)\n",
    "Q3 = s.quantile(0.75)\n",
    "IQR = Q3 - Q1\n",
    "\n",
    "lower_lim = Q1 - 1.5 * IQR\n",
    "upper_lim = Q3 + 1.5 * IQR\n",
    "\n",
    "cleaned = s[(s >= lower_lim) & (s <= upper_lim)]\n",
    "\n",
    "print(\"Cleaned data:\")\n",
    "print(len(cleaned.tolist()))\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "sns.boxplot(x=s, color=\"lightblue\")\n",
    "plt.title(\"Before cleaning\")\n",
    "plt.show()\n",
    "\n",
    "sns.boxplot(x=cleaned, color=\"lightgreen\")\n",
    "plt.title(\"After IQR cleaning\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b1f16b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from scipy import stats\n",
    "\n",
    "z_scores = np.abs(stats.zscore(s))\n",
    "threshold = 3  # commonly used threshold\n",
    "cleaned_z = s[z_scores < threshold]\n",
    "print(len(cleaned_z.tolist()))\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "print(\"Cleaned data (Z-score method):\")\n",
    "sns.boxplot(x=s, color=\"lightblue\")\n",
    "plt.title(\"Before cleaning\")\n",
    "plt.show()\n",
    "\n",
    "sns.boxplot(x=cleaned_z, color=\"lightgreen\")\n",
    "plt.title(\"After Z-score cleaning\")\n",
    "plt.show()"
   ]
  },
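  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b1c2d3e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A minimal comparison sketch (assumes `s`, `cleaned`, and `cleaned_z` from the\n",
    "# two cells above are still in memory): how many points each outlier rule keeps.\n",
    "print(f\"Original points:      {len(s)}\")\n",
    "print(f\"Kept by IQR rule:     {len(cleaned)}\")\n",
    "print(f\"Kept by Z-score rule: {len(cleaned_z)}\")"
   ]
  },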
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f97f821e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "def analyze_doclens_results(file_path):\n",
    "    \"\"\"\n",
    "    Loads, parses, and analyzes the DOCLENS evaluation results from a JSON file.\n",
    "\n",
    "    Args:\n",
    "        file_path (str): The path to the JSON results file.\n",
    "\n",
    "    Returns:\n",
    "        pandas.DataFrame: A DataFrame with the aggregated mean scores.\n",
    "    \"\"\"\n",
    "    # Load the entire JSON file\n",
    "    try:\n",
    "        with open(file_path, 'r', encoding='utf-8') as f:\n",
    "            data = json.load(f)\n",
    "    except FileNotFoundError:\n",
    "        print(f\"Error: The file '{file_path}' was not found.\")\n",
    "        return None\n",
    "    except json.JSONDecodeError:\n",
    "        print(f\"Error: The file '{file_path}' is not a valid JSON file.\")\n",
    "        return None\n",
    "\n",
    "    # Parse the nested data into a flat list of dictionaries\n",
    "    parsed_data = []\n",
    "    for record in data:\n",
    "        record_id = record.get(\"id\")\n",
    "        version = record.get(\"version\")\n",
    "        \n",
    "        # Extract accuracy scores safely\n",
    "        completeness_acc = record.get(\"completeness\", {}).get(\"accuracy\", 0)\n",
    "        conciseness_acc = record.get(\"conciseness\", {}).get(\"accuracy\", 0)\n",
    "        attribution_acc = record.get(\"attribution\", {}).get(\"accuracy\", 0)\n",
    "\n",
    "        parsed_data.append({\n",
    "            \"id\": record_id,\n",
    "            \"version\": version,\n",
    "            \"completeness\": completeness_acc,\n",
    "            \"conciseness\": conciseness_acc,\n",
    "            \"attribution\": attribution_acc\n",
    "        })\n",
    "\n",
    "    # Create a pandas DataFrame\n",
    "    df = pd.DataFrame(parsed_data)\n",
    "\n",
    "    # Calculate the mean scores for each version\n",
    "    # The order is specified to ensure 'easy', 'intermediate', 'hard' are plotted correctly\n",
    "    version_order = ['easy', 'intermediate', 'hard']\n",
    "    df['version'] = pd.Categorical(df['version'], categories=version_order, ordered=True)\n",
    "    \n",
    "    agg_results = df.groupby('version')[['completeness', 'conciseness', 'attribution']].mean().reset_index()\n",
    "\n",
    "    print(\"--- Aggregated Mean Scores ---\")\n",
    "    print(agg_results.to_string(index=False))\n",
    "    \n",
    "    return agg_results\n",
    "\n",
    "def visualize_results(df):\n",
    "    \"\"\"\n",
    "    Generates and saves bar charts to visualize the aggregated results.\n",
    "    \"\"\"\n",
    "    if df is None or df.empty:\n",
    "        print(\"Cannot visualize results. DataFrame is empty.\")\n",
    "        return\n",
    "\n",
    "    sns.set_style(\"whitegrid\")\n",
    "    fig, axes = plt.subplots(1, 3, figsize=(18, 6), sharey=True)\n",
    "    fig.suptitle('Average Evaluation Metrics Across Summary Versions', fontsize=16)\n",
    "\n",
    "    # Plot Completeness\n",
    "    sns.barplot(ax=axes[0], x='version', y='completeness', data=df, palette='Blues_d')\n",
    "    axes[0].set_title('Completeness (Claim Recall)')\n",
    "    axes[0].set_xlabel('Summary Version')\n",
    "    axes[0].set_ylabel('Average Accuracy (%)')\n",
    "\n",
    "    # Plot Conciseness\n",
    "    sns.barplot(ax=axes[1], x='version', y='conciseness', data=df, palette='Greens_d')\n",
    "    axes[1].set_title('Conciseness (Claim Precision)')\n",
    "    axes[1].set_xlabel('Summary Version')\n",
    "    axes[1].set_ylabel('')\n",
    "\n",
    "    # Plot Attribution\n",
    "    sns.barplot(ax=axes[2], x='version', y='attribution', data=df, palette='Oranges_d')\n",
    "    axes[2].set_title('Attribution')\n",
    "    axes[2].set_xlabel('Summary Version')\n",
    "    axes[2].set_ylabel('')\n",
    "    \n",
    "    # Improve layout and save the figure\n",
    "    plt.tight_layout(rect=[0, 0, 1, 0.96])\n",
    "    plt.savefig(\"doclens_evaluation_summary.png\", dpi=300)\n",
    "    print(\"\\nChart saved as 'doclens_evaluation_summary.png'\")\n",
    "    plt.show()\n",
    "\n",
    "\n",
    "# --- Main Execution ---\n",
    "# Replace 'your_results_file.json' with the actual path to your file\n",
    "results_file = '/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json' \n",
    "aggregated_data = analyze_doclens_results(results_file)\n",
    "\n",
    "if aggregated_data is not None:\n",
    "    visualize_results(aggregated_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5afb981",
   "metadata": {},
   "source": [
    "## Eliminate dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "b29bcf30",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Rejected 15 items due to low attribution.\n",
      "Rejected 9 additional items due to incorrect completeness trend.\n",
      "\n",
      "--- Filtering Summary ---\n",
      "Total unique items analyzed: 100\n",
      "Items kept (High Quality): 76\n",
      "Items rejected (Low Quality): 24\n",
      "Saved data to '/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B_clean.json'\n",
      "Saved data to '/home/mshahidul/readctrl/results/dataset_quality_check/rejected_dataset.json'\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import pandas as pd\n",
    "\n",
    "def filter_low_quality_data(file_path, attribution_threshold=80.0, completeness_trend_check=True):\n",
    "    \"\"\"\n",
    "    Loads DOCLENS results, filters out low-quality data, and returns clean/rejected data.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        with open(file_path, 'r', encoding='utf-8') as f:\n",
    "            data = json.load(f)\n",
    "    except (FileNotFoundError, json.JSONDecodeError) as e:\n",
    "        print(f\"Error loading file: {e}\")\n",
    "        return None, None\n",
    "\n",
    "    # --- FIX: Parse the nested JSON to extract numeric accuracy scores ---\n",
    "    # Create a flat list of dictionaries instead of a list of nested objects\n",
    "    parsed_data = []\n",
    "    for record in data:\n",
    "        parsed_data.append({\n",
    "            \"id\": record.get(\"id\"),\n",
    "            \"version\": record.get(\"version\"),\n",
    "            \"completeness\": record.get(\"completeness\", {}).get(\"accuracy\", 0),\n",
    "            \"conciseness\": record.get(\"conciseness\", {}).get(\"accuracy\", 0),\n",
    "            \"attribution\": record.get(\"attribution\", {}).get(\"accuracy\", 0)\n",
    "        })\n",
    "\n",
    "    # Create DataFrame from the *parsed* data\n",
    "    df = pd.DataFrame(parsed_data)\n",
    "    # --------------------------------------------------------------------\n",
    "    \n",
    "    all_ids = set(df['id'].unique())\n",
    "    rejected_ids = set()\n",
    "\n",
    "    # --- Pivot data for easier comparison across versions ---\n",
    "    # This part now works correctly because the columns are numeric\n",
    "    pivot_df = df.pivot_table(\n",
    "        index='id',\n",
    "        columns='version',\n",
    "        values=['completeness', 'conciseness', 'attribution']\n",
    "    )\n",
    "    pivot_df.columns = ['_'.join(map(str, col)).strip() for col in pivot_df.columns.values]\n",
    "    \n",
    "    # --- Filter 1: Low Attribution ---\n",
    "    low_attribution_mask = (pivot_df['attribution_easy'] < attribution_threshold) | \\\n",
    "                           (pivot_df['attribution_intermediate'] < attribution_threshold) | \\\n",
    "                           (pivot_df['attribution_hard'] < attribution_threshold)\n",
    "    rejected_attribution_ids = pivot_df[low_attribution_mask].index\n",
    "    rejected_ids.update(rejected_attribution_ids)\n",
    "    print(f\"Rejected {len(rejected_attribution_ids)} items due to low attribution.\")\n",
    "\n",
    "    # --- Filter 2: Incorrect Completeness Trend ---\n",
    "    if completeness_trend_check:\n",
    "        bad_trend_mask = pivot_df['completeness_easy'] > pivot_df['completeness_hard']\n",
    "        rejected_trend_ids = pivot_df[bad_trend_mask].index\n",
    "        newly_rejected_count = len(rejected_trend_ids.difference(rejected_ids))\n",
    "        rejected_ids.update(rejected_trend_ids)\n",
    "        print(f\"Rejected {newly_rejected_count} additional items due to incorrect completeness trend.\")\n",
    "\n",
    "    # --- Separate the data ---\n",
    "    clean_ids = all_ids - rejected_ids\n",
    "    \n",
    "    # We need to filter the original 'data' list, not the parsed one, to keep the full structure\n",
    "    original_df = pd.DataFrame(data)\n",
    "    clean_data = original_df[original_df['id'].isin(clean_ids)].to_dict('records')\n",
    "    rejected_data = original_df[original_df['id'].isin(rejected_ids)].to_dict('records')\n",
    "    \n",
    "    print(\"\\n--- Filtering Summary ---\")\n",
    "    print(f\"Total unique items analyzed: {len(all_ids)}\")\n",
    "    print(f\"Items kept (High Quality): {len(clean_ids)}\")\n",
    "    print(f\"Items rejected (Low Quality): {len(rejected_ids)}\")\n",
    "    \n",
    "    return clean_data, rejected_data\n",
    "\n",
    "def save_json(data, file_path):\n",
    "    \"\"\"Saves data to a JSON file.\"\"\"\n",
    "    with open(file_path, 'w', encoding='utf-8') as f:\n",
    "        json.dump(data, f, indent=4, ensure_ascii=False)\n",
    "    print(f\"Saved data to '{file_path}'\")\n",
    "\n",
    "\n",
    "# --- Main Execution ---\n",
    "# Replace with your file paths and desired thresholds\n",
    "RESULTS_FILE = '/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json' # Make sure this points to your file\n",
    "# CLEAN_FILE_PATH = '/home/mshahidul/readctrl/results/dataset_quality_check/high_quality_dataset.json'\n",
    "# REJECTED_FILE_PATH = '/home/mshahidul/readctrl/results/dataset_quality_check/rejected_dataset.json'\n",
    "ATTRIBUTION_THRESHOLD = 80.0\n",
    "\n",
    "clean_dataset, rejected_dataset = filter_low_quality_data(\n",
    "    RESULTS_FILE,\n",
    "    attribution_threshold=ATTRIBUTION_THRESHOLD\n",
    ")\n",
    "\n",
    "if clean_dataset is not None:\n",
    "    save_json(clean_dataset, '/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B_clean.json')\n",
    "    save_json(rejected_dataset, '/home/mshahidul/readctrl/results/dataset_quality_check/rejected_dataset.json')"
   ]
  }
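,
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c2d3e4f5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A minimal sanity-check sketch (not part of the original pipeline): re-load the\n",
    "# saved clean file and report how many records it holds. This counts id/version\n",
    "# records, not unique items, so it is expected to be larger than the\n",
    "# 'Items kept' figure in the filtering summary above.\n",
    "with open('/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B_clean.json', 'r', encoding='utf-8') as f:\n",
    "    reloaded = json.load(f)\n",
    "print(f\"Records in clean file: {len(reloaded)}\")"
   ]
  }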
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "unsloth",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}