File size: 13,461 Bytes
1db7196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "30a7b117",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "# Shard files to combine; their names carry the ranges 0_100, 100_200 and 200_300\n",
    "file_paths = [\n",
    "    '/home/mshahidul/readctrl/data/reasoning/refined_evaluated_support_0_100_qwen3-32B.json',\n",
    "    '/home/mshahidul/readctrl/data/reasoning/refined_evaluated_support_100_200_qwen3-32B.json',\n",
    "    '/home/mshahidul/readctrl/data/reasoning/refined_evaluated_support_200_300_qwen3-32B.json'\n",
    "]\n",
    "\n",
    "merged_data = []\n",
    "\n",
    "# Fold every shard that exists on disk into one flat list\n",
    "for file_path in file_paths:\n",
    "    if not os.path.exists(file_path):\n",
    "        print(f\"Warning: File not found: {file_path}\")\n",
    "        continue\n",
    "\n",
    "    with open(file_path, 'r', encoding='utf-8') as f:\n",
    "        data = json.load(f)\n",
    "\n",
    "    # A top-level list is flattened into the result; any other payload is kept as a single record\n",
    "    merged_data += data if isinstance(data, list) else [data]\n",
    "    print(f\"Successfully loaded: {file_path}\")\n",
    "\n",
    "# Write the combined records to one file\n",
    "output_path = '/home/mshahidul/readctrl/data/reasoning/refined_evaluated_support_merged_0_300_qwen3-32B.json'\n",
    "with open(output_path, 'w', encoding='utf-8') as f:\n",
    "    json.dump(merged_data, f, indent=4)\n",
    "\n",
    "print(f\"\\nTotal records merged: {len(merged_data)}\")\n",
    "print(f\"Merged file saved to: {output_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27ab3270",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Define file paths\n",
    "readability_path = '/home/mshahidul/readctrl/data/classified_readability/classified_multiclinsum_test_en.json'\n",
    "reasoning_path = '/home/mshahidul/readctrl/data/reasoning/refined_evaluated_support_merged_0_300_qwen3-32B.json'\n",
    "output_path = '/home/mshahidul/readctrl/data/reasoning/merged_readability_reasoning_en_final.json'\n",
    "\n",
    "# 1. Load the readability data.\n",
    "#    UTF-8 is given explicitly so decoding does not depend on the platform's default encoding.\n",
    "with open(readability_path, 'r', encoding='utf-8') as f:\n",
    "    readability_data = json.load(f)\n",
    "\n",
    "# Build an id -> readability_score dictionary so each lookup below is a single dict access\n",
    "readability_lookup = {item['id']: item['readability_score'] for item in readability_data}\n",
    "\n",
    "# 2. Load the reasoning data\n",
    "with open(reasoning_path, 'r', encoding='utf-8') as f:\n",
    "    reasoning_data = json.load(f)\n",
    "\n",
    "# 3. Attach a readability_score to every reasoning entry\n",
    "merged_count = 0\n",
    "for entry in reasoning_data:\n",
    "    entry_id = entry.get('id')\n",
    "    if entry_id in readability_lookup:\n",
    "        entry['readability_score'] = readability_lookup[entry_id]\n",
    "        merged_count += 1\n",
    "    else:\n",
    "        # No matching id in the readability file: keep the key but store None\n",
    "        entry['readability_score'] = None\n",
    "\n",
    "# 4. Save the merged result\n",
    "with open(output_path, 'w', encoding='utf-8') as f:\n",
    "    json.dump(reasoning_data, f, indent=4)\n",
    "\n",
    "print(f\"Successfully merged {merged_count} records. Saved to {output_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "2ef2e0b6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Threshold set to: 90.0%\n",
      "Successfully saved 192 records to: /home/mshahidul/readctrl/data/final_result/processed_threshold_results.json\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "# Configuration\n",
    "input_file = '/home/mshahidul/readctrl/data/reasoning/merged_readability_reasoning_en_final.json'\n",
    "output_dir = '/home/mshahidul/readctrl/data/final_result'\n",
    "output_filename = 'processed_threshold_results.json'\n",
    "\n",
    "# Set your threshold here (e.g., 0.90 for 90%, 0.85 for 85%)\n",
    "SUPPORT_THRESHOLD = 0.90\n",
    "\n",
    "def process_with_threshold(threshold):\n",
    "    \"\"\"Keep records whose supported-subclaim ratio reaches the threshold and save them.\n",
    "\n",
    "    Reads the JSON list at input_file. For every record with a non-empty\n",
    "    'subclaim_evaluations' list, computes the fraction of entries whose\n",
    "    'support_label' is 'supported'. Records at or above the threshold are\n",
    "    copied, given 'difficulty' and 'support_percentage' fields, stripped of\n",
    "    'subclaim_evaluations', and written to output_dir/output_filename.\n",
    "\n",
    "    Args:\n",
    "        threshold: Minimum supported fraction (e.g. 0.90) a record needs to be kept.\n",
    "\n",
    "    Returns:\n",
    "        None. If input_file does not exist, an error is printed and the\n",
    "        function returns without writing a result file.\n",
    "    \"\"\"\n",
    "    # exist_ok makes this a no-op when the folder is already there\n",
    "    os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "    # Load the source data\n",
    "    try:\n",
    "        with open(input_file, 'r', encoding='utf-8') as f:\n",
    "            data = json.load(f)\n",
    "    except FileNotFoundError:\n",
    "        print(f\"Error: Source file not found at {input_file}\")\n",
    "        return\n",
    "\n",
    "    final_output = []\n",
    "\n",
    "    for item in data:\n",
    "        evals = item.get('subclaim_evaluations', [])\n",
    "\n",
    "        if not evals:\n",
    "            continue  # Skip items with no subclaims to evaluate\n",
    "\n",
    "        # Fraction of subclaims labelled as supported\n",
    "        supported_count = sum(1 for sub in evals if sub.get('support_label') == 'supported')\n",
    "        support_ratio = supported_count / len(evals)\n",
    "\n",
    "        if support_ratio < threshold:\n",
    "            continue  # Below the threshold: drop the record\n",
    "\n",
    "        clean_item = item.copy()\n",
    "\n",
    "        # A stored null (None) score would raise TypeError in the comparisons\n",
    "        # below, so it falls back to 0 just like a missing key.\n",
    "        score = clean_item.get('readability_score')\n",
    "        if score is None:\n",
    "            score = 0\n",
    "\n",
    "        # Map readability_score to difficulty\n",
    "        if score >= 4:\n",
    "            clean_item['difficulty'] = 'easy'\n",
    "        elif score == 3:\n",
    "            clean_item['difficulty'] = 'medium'\n",
    "        else:\n",
    "            clean_item['difficulty'] = 'hard'\n",
    "\n",
    "        # Add metadata about the support ratio for transparency\n",
    "        clean_item['support_percentage'] = round(support_ratio * 100, 2)\n",
    "\n",
    "        # The per-subclaim details are not part of the saved record\n",
    "        clean_item.pop('subclaim_evaluations', None)\n",
    "\n",
    "        final_output.append(clean_item)\n",
    "\n",
    "    # Save to a single JSON file\n",
    "    target_path = os.path.join(output_dir, output_filename)\n",
    "    with open(target_path, 'w', encoding='utf-8') as out_f:\n",
    "        json.dump(final_output, out_f, indent=4, ensure_ascii=False)\n",
    "\n",
    "    print(f\"Threshold set to: {threshold * 100}%\")\n",
    "    print(f\"Successfully saved {len(final_output)} records to: {target_path}\")\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    process_with_threshold(SUPPORT_THRESHOLD)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "295a4a2a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Success! Merged data saved to: /home/mshahidul/readctrl/data/factual_testing/merged_evaluated_support_0_300.json\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "# Input shards, combined in the order listed\n",
    "file_paths = [\n",
    "    '/home/mshahidul/readctrl/data/factual_testing/evaluated_support_0_100_qwen3-32B.json',\n",
    "    '/home/mshahidul/readctrl/data/factual_testing/evaluated_support_100_200_qwen3-32B.json',\n",
    "    '/home/mshahidul/readctrl/data/factual_testing/evaluated_support_200_300_qwen3-32B.json'\n",
    "]\n",
    "\n",
    "merged_data = []\n",
    "\n",
    "# Read each shard that is present and collect its records\n",
    "for file_path in file_paths:\n",
    "    if not os.path.exists(file_path):\n",
    "        print(f\"Warning: File not found - {file_path}\")\n",
    "        continue\n",
    "\n",
    "    with open(file_path, 'r', encoding='utf-8') as f:\n",
    "        data = json.load(f)\n",
    "\n",
    "    if isinstance(data, list):\n",
    "        # A JSON list contributes all of its elements\n",
    "        merged_data.extend(data)\n",
    "    else:\n",
    "        # Any other top-level value is stored as one element\n",
    "        merged_data.append(data)\n",
    "\n",
    "# Destination for the combined data\n",
    "output_file = '/home/mshahidul/readctrl/data/factual_testing/merged_evaluated_support_0_300.json'\n",
    "\n",
    "with open(output_file, 'w', encoding='utf-8') as f:\n",
    "    json.dump(merged_data, f, indent=4)\n",
    "\n",
    "print(f\"Success! Merged data saved to: {output_file}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "e7ba1534",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Updating scores for 100 documents...\n",
      "Successfully updated scores for 100 documents.\n",
      "File saved to: /home/mshahidul/readctrl/data/reasoning/updated_scores/refined_v2_full_evaluation_200_300_qwen3-32B.json\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import argparse\n",
    "import os\n",
    "\n",
    "def _supported_fraction(items, key):\n",
    "    \"\"\"Return the fraction of items whose `key` field equals 'supported'.\n",
    "\n",
    "    An empty or missing list yields 0.0.\n",
    "    \"\"\"\n",
    "    if not items:\n",
    "        return 0.0\n",
    "    supported = sum(1 for item in items if item.get(key) == 'supported')\n",
    "    return supported / len(items)\n",
    "\n",
    "def calculate_scores(data):\n",
    "    \"\"\"\n",
    "    Recalculates factual_attribution and completeness scores based on\n",
    "    the updated labels in attribution_details and completeness_details.\n",
    "\n",
    "    Args:\n",
    "        data: Iterable of document dicts; each one is modified in place.\n",
    "\n",
    "    Returns:\n",
    "        A (data, updated_count) tuple, where updated_count is the number of\n",
    "        documents processed.\n",
    "    \"\"\"\n",
    "    updated_count = 0\n",
    "\n",
    "    for doc in data:\n",
    "        # Create the 'scores' dict when a document has none, instead of\n",
    "        # failing with KeyError on the assignments below.\n",
    "        scores = doc.setdefault('scores', {})\n",
    "\n",
    "        # 1. Factual attribution: share of attribution_details labelled 'supported'\n",
    "        scores['factual_attribution'] = _supported_fraction(\n",
    "            doc.get('attribution_details', []), 'label'\n",
    "        )\n",
    "\n",
    "        # 2. Completeness: share of completeness_details whose\n",
    "        #    present_in_summary equals 'supported'\n",
    "        # NOTE(review): confirm 'supported' is the value stored in present_in_summary\n",
    "        scores['completeness'] = _supported_fraction(\n",
    "            doc.get('completeness_details', []), 'present_in_summary'\n",
    "        )\n",
    "\n",
    "        updated_count += 1\n",
    "\n",
    "    return data, updated_count\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    # Paths are fixed for notebook use; the result keeps the input's file name\n",
    "    # and is written under output_path.\n",
    "    input_file = '/home/mshahidul/readctrl/data/reasoning/refined_v2_full_evaluation_200_300_qwen3-32B.json'\n",
    "    output_path = \"/home/mshahidul/readctrl/data/reasoning/updated_scores\"\n",
    "    output_file = os.path.join(output_path, os.path.basename(input_file))\n",
    "\n",
    "    # open(..., 'w') does not create parent directories, so make sure it exists\n",
    "    os.makedirs(output_path, exist_ok=True)\n",
    "\n",
    "    # Load data\n",
    "    with open(input_file, 'r', encoding='utf-8') as f:\n",
    "        data = json.load(f)\n",
    "\n",
    "    print(f\"Updating scores for {len(data)} documents...\")\n",
    "\n",
    "    # Process\n",
    "    updated_data, count = calculate_scores(data)\n",
    "\n",
    "    # ensure_ascii=False emits non-ASCII characters verbatim, so the file is\n",
    "    # opened as UTF-8 explicitly rather than with the platform default.\n",
    "    with open(output_file, 'w', encoding='utf-8') as f:\n",
    "        json.dump(updated_data, f, indent=2, ensure_ascii=False)\n",
    "\n",
    "    print(f\"Successfully updated scores for {count} documents.\")\n",
    "    print(f\"File saved to: {output_file}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "612109dc",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dict_keys(['index', 'id', 'fulltext', 'fulltext_subclaims', 'summary', 'summary_subclaims', 'diff_label_texts', 'diff_label_subclaims', 'readability_score'])\n",
      "dict_keys(['low_health_literacy', 'intermediate_health_literacy', 'proficient_health_literacy'])\n",
      "dict_keys(['low_health_literacy', 'intermediate_health_literacy', 'proficient_health_literacy'])\n"
     ]
    }
   ],
   "source": [
    "# /home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json\n",
    "import json\n",
    "# Explicit UTF-8 so decoding does not depend on the platform's default encoding\n",
    "with open('/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_syn_data_with_gs_summary_en.json', 'r', encoding='utf-8') as f:\n",
    "    anno_data = json.load(f)\n",
    "# Inspect the first record: its top-level keys, then the keys of its two nested label dicts\n",
    "print(anno_data[0].keys())\n",
    "print(anno_data[0]['diff_label_texts'].keys())\n",
    "print(anno_data[0]['diff_label_subclaims'].keys())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "un",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}