{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "7087d09c", "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "file_path_attrib = \"/home/mshahidul/readctrl/results/dataset_quality_check/resonability_check_100_gpt5_attribution.json\"\n", "with open(file_path_attrib, 'r') as f:\n", " readability_attribution = json.load(f)\n", "# print keys of the first element\n", "# readability_attribution[0].keys() #dict_keys(['id', 'difficulty_level', 'response'])\n", "# readability_attribution[0]['response'].keys() #dict_keys(['evaluation_table', 'attribution_score', 'overall_explanation'])\n" ] }, { "cell_type": "code", "execution_count": null, "id": "0525f35d", "metadata": {}, "outputs": [], "source": [ "dict1={}\n", "for item in readability_attribution:\n", " for item2 in item['response']['evaluation_table']:\n", " if item2['evaluation'] != 'misleading / hallucinated':\n", " dict1[(item['id'], item['difficulty_level'],item2['id'])]=1" ] }, { "cell_type": "code", "execution_count": null, "id": "cb017c17", "metadata": {}, "outputs": [], "source": [ "dict1" ] }, { "cell_type": "code", "execution_count": null, "id": "1895fa25", "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "file_path_verifier = \"/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json\"\n", "with open(file_path_verifier, 'r') as f_verifier:\n", " subclaim_verifier_results = json.load(f_verifier)\n", "# keys\n", "# subclaim_verifier_results[0].keys() #dict_keys(['id', 'version', 'completeness', 'conciseness', 'attribution'])\n", "# subclaim_verifier_results[0]['attribution'].keys() #dict_keys(['metric', 'version', 'input_text', 'results', 'total', 'correct', 'accuracy'])" ] }, { "cell_type": "code", "execution_count": null, "id": "2d7f2c2e", "metadata": {}, "outputs": [], "source": [ "full_info=[]\n", "for item in subclaim_verifier_results:\n", " success=0\n", " for item2 in item['attribution']['results']:\n", " if item2['result']==\"1\":\n", " success+=1\n", " elif item2['result']==\"0\":\n", " print((item['id'], item['version'], item2['subclaim']['id']))\n", " print(dict1.get((item['id'], item['version'], item2['subclaim']['id'])))\n", " success+=dict1.get((item['id'], item['version'], item2['subclaim']['id']),0)\n", " full_info.append({'id':item['id'], 'version':item['version'], 'success':success, 'accuracy':success/len(item['attribution']['results'])})\n", " " ] }, { "cell_type": "code", "execution_count": null, "id": "6b797d72", "metadata": {}, "outputs": [], "source": [ "accuracy_list=sum([entry['accuracy'] for entry in full_info if entry['version']=='easy'])/len([entry['accuracy'] for entry in full_info if entry['version']=='easy'])\n", "print(\"Easy version accuracy:\", accuracy_list)\n", "accuracy_list=sum([entry['accuracy'] for entry in full_info if entry['version']=='intermediate'])/len([entry['accuracy'] for entry in full_info if entry['version']=='intermediate'])\n", "print(\"Intermediate version accuracy:\", accuracy_list)\n", "accuracy_list=sum([entry['accuracy'] for entry in full_info if entry['version']=='hard'])/len([entry['accuracy'] for entry in full_info if entry['version']=='hard'])\n", "print(\"Hard version accuracy:\", accuracy_list)" ] }, { "cell_type": "code", "execution_count": null, "id": "d819721b", "metadata": {}, "outputs": [], "source": [ "# /home/mshahidul/readctrl/data/model_validity_check/subclaims_validity_check_v1.json read\n", "import json\n", "with open(\"/home/mshahidul/readctrl/data/model_validity_check/subclaims_validity_check_v1.json\", \"r\") as f:\n", " res = json.load(f)\n", "acc=0\n", "incorrect_cases=0\n", "for item in res:\n", " acc+=item['overall_accuracy']\n", " incorrect_cases+=len(item['incorrect_or_unsafe_subclaims'])\n", "print(\"Overall accuracy:\", acc/len(res))\n", "print(\"Total incorrect or unsafe subclaims:\", incorrect_cases/len(res))" ] }, { "cell_type": "code", "execution_count": null, "id": "7fc2ac40", "metadata": {}, "outputs": [], "source": [ "# /home/mshahidul/readctrl/data/model_validity_check/subclaims_support_validity_check(attr)_v1(cal_v1).json read\n", "import json\n", "with open(\"/home/mshahidul/readctrl/data/model_validity_check/subclaims_support_validity_check(attr)_v1(cal_v1).json\", \"r\") as f:\n", " res = json.load(f)\n", "acc=0\n", "for item in res:\n", " acc+=item['accuracy']\n", "print(\"Overall support validity accuracy:\", acc/len(res))\n" ] }, { "cell_type": "code", "execution_count": null, "id": "5257bad8", "metadata": {}, "outputs": [], "source": [ "import json\n", "with open(\"/home/mshahidul/readctrl/data/concise_complete_attr_testing/evaluated_metrics_0_480_nemotron-3-nano-30b-a3b_v2.json\", \"r\") as f:\n", " res = json.load(f)\n", "acc=0\n", "for item in res:\n", " acc+=item['accuracy']\n", "print(\"Overall support validity accuracy:\", acc/len(res))" ] }, { "cell_type": "code", "execution_count": null, "id": "03e9f9e4", "metadata": {}, "outputs": [], "source": [ "# /home/mshahidul/readctrl/data/concise_complete_attr_testing/evaluated_metrics_0_240_mistral31_24B_v2.json\n", "import json\n", "# Overall correctness accuracy: 0.895 --> mistral31_24B\n", "# Overall correctness accuracy: 0.898 --> qwen3_32B\n", "with open(\"/home/mshahidul/readctrl/data/concise_complete_attr_testing/evaluated_metrics_0_480_nemotron-3-nano-30b-a3b_v2.json\", \"r\") as f:\n", " res = json.load(f)\n", "# print(res[0])\n", "acc=0\n", "for item in res:\n", " if item['correctness']==True:\n", " acc+=1\n", "print(\"Overall correctness accuracy:\", acc/len(res))" ] }, { "cell_type": "code", "execution_count": null, "id": "ebb4a213", "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import cohen_kappa_score, confusion_matrix\n", "import pandas as pd\n", "with open(\"/home/mshahidul/readctrl/data/concise_complete_attr_testing/evaluated_metrics_0_480_nemotron-3-nano-30b-a3b_v2.json\", \"r\") as f:\n", " res = json.load(f)\n", "# 1. Define your model outputs\n", "# Ensure the order of elements matches for both lists\n", "# gpt5_labels = [\"Supported\", \"Not Supported\", \"Supported\", \"Supported\", \"Not Supported\"]\n", "# qwen_labels = [\"Supported\", \"Supported\", \"Supported\", \"Not Supported\", \"Not Supported\"]\n", "gpt5_labels=[x['label_gt'] for x in res]\n", "qwen_labels=[x['label_gen'] for x in res]\n", "# 2. Map strings to integers for calculation\n", "mapping = {\"supported\": 1, \"not_supported\": 0}\n", "y_gpt5 = [mapping[label] for label in gpt5_labels]\n", "y_qwen = [mapping[label] for label in qwen_labels]\n", "\n", "# 3. Calculate Cohen's Kappa\n", "kappa = cohen_kappa_score(y_gpt5, y_qwen)\n", "\n", "print(f\"Cohen's Kappa: {kappa:.4f}\")\n", "\n", "# 4. (Optional) Visualize the disagreement with a Confusion Matrix\n", "cm = confusion_matrix(y_gpt5, y_qwen)\n", "cm_df = pd.DataFrame(cm, index=['Actual Not-Sup', 'Actual Sup'], \n", " columns=['Pred Not-Sup', 'Pred Sup'])\n", "print(\"\\nConfusion Matrix:\")\n", "print(cm_df)" ] }, { "cell_type": "code", "execution_count": null, "id": "d3ef3549", "metadata": {}, "outputs": [], "source": [ "with open(\"/home/mshahidul/readctrl/data/extracting_subclaim/extracted_subclaims_full_data.json\", \"r\") as f:\n", " full_text = json.load(f)\n", "full_text_info=[]\n", "for entry in full_text[:5]:\n", " for label in [\"easy\", \"intermediate\", \"hard\"]:\n", " full_text_info.append(entry['fulltext'])" ] }, { "cell_type": "code", "execution_count": null, "id": "90ad1af2", "metadata": {}, "outputs": [], "source": [ "len(full_text_info)" ] }, { "cell_type": "code", "execution_count": null, "id": "4ebd5e67", "metadata": {}, "outputs": [], "source": [ "# /home/mshahidul/readctrl/data/model_validity_check/subclaims_support_validity_check_gt_gpt5(1-5).json\n", "with open(\"/home/mshahidul/readctrl/data/model_validity_check/subclaims_support_validity_check_gt_gpt5(1-5).json\", \"r\") as f:\n", " res = json.load(f)\n", "full_data=[]\n", "for index, item in enumerate(res):\n", " full_data.append({\n", " \"index\": index,\n", " \"full_text\": full_text_info[index],\n", " \"dat\": item\n", " })" ] }, { "cell_type": "code", "execution_count": null, "id": "5f7a19ff", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "deba1c76", "metadata": {}, "outputs": [], "source": [ "with open(\"/home/mshahidul/readctrl/data/model_validity_check/subclaims_support_validity_check_gt_gpt5(1-5).json\", \"w\") as f:\n", " json.dump(full_data, f, indent=2, ensure_ascii=False)" ] }, { "cell_type": "code", "execution_count": null, "id": "5812366d", "metadata": {}, "outputs": [], "source": [ "python -m wikiextractor.WikiExtractor /home/mshahidul/readctrl/data/wiki-text/simplewiki-latest-pages-articles.xml --json -o /home/mshahidul/readctrl/data/wiki-text/wiki" ] }, { "cell_type": "code", "execution_count": 11, "id": "02b7dd74", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Generating train split: 100%|██████████| 1841155/1841155 [00:25<00:00, 71022.97 examples/s] \n" ] } ], "source": [ "from datasets import load_dataset\n", "\n", "ds = load_dataset(\"wikimedia/wikipedia\", \"20231101.es\")" ] }, { "cell_type": "code", "execution_count": 12, "id": "7aaaa96e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1841155" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" }, { "name": "stdout", "output_type": "stream", "text": [ "The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.\n" ] } ], "source": [ "len(ds['train'])" ] }, { "cell_type": "code", "execution_count": null, "id": "2c8a80f0", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['id', 'url', 'title', 'text'],\n", " num_rows: 6407814\n", " })\n", "})" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_target_documents=[item['text'] for item in ds['test'].select(range(5))]" ] }, { "cell_type": "code", "execution_count": null, "id": "9049e532", "metadata": {}, "outputs": [], "source": [ "from rank_bm25 import BM25Okapi\n", "\n", "# 1. Your collection of documents\n", "# corpus = [\n", "# \"The capital of France is Paris.\",\n", "# \"Python is a popular programming language.\",\n", "# \"The deep learning model was trained on a large dataset.\",\n", "# \"Paris is known for the Eiffel Tower.\"\n", "# ]\n", "corpus = [item['text'] for item in ds['train'].select(range(100))]\n", "tokenized_corpus=[]\n", "for item in ds['train'].select(range(100)):\n", " dd=item['text'].lower().replace(\"\\n\",\" \").strip().split(\" \")\n", " tokenized_corpus.append(dd)\n", " \n", "# 2. Tokenize the corpus (split into words)\n", "# tokenized_corpus = [doc.lower().split(\" \") for doc in corpus]\n", "bm25 = BM25Okapi(tokenized_corpus)\n", "\n", "# 3. Define a query\n", "query = \"What is the capital of France?\"\n", "tokenized_query = query.lower().split(\" \")\n", "\n", "# 4. Get the best results\n", "top_n = bm25.get_top_n(tokenized_query, corpus, n=1)\n", "print(f\"Top Result: {top_n[0]}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "4627abce", "metadata": {}, "outputs": [], "source": [ "from sentence_transformers import SentenceTransformer, util\n", "\n", "model = SentenceTransformer('all-MiniLM-L6-v2')\n", "wiki_embeddings = model.encode(wiki_list, convert_to_tensor=True)\n", "\n", "# For a given document D\n", "d_embedding = model.encode(document_d, convert_to_tensor=True)\n", "hits = util.semantic_search(d_embedding, wiki_embeddings, top_k=5)\n", "# Filter hits by length and select the best match" ] }, { "cell_type": "code", "execution_count": 5, "id": "c885f4e6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dict_keys(['index', 'fulltext', 'diff_label_texts'])\n" ] } ], "source": [ "# /home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_v1.json\n", "import json\n", "with open(\"/home/mshahidul/readctrl/data/synthetic_dataset_diff_labels/syn_data_diff_labels_en_v1.json\", \"r\") as f:\n", " res = json.load(f)\n", "print(res[0].keys())" ] }, { "cell_type": "code", "execution_count": 6, "id": "b1aac332", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['low_health_literacy', 'intermediate_health_literacy', 'proficient_health_literacy'])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res[0]['diff_label_texts'].keys()" ] }, { "cell_type": "code", "execution_count": 7, "id": "f799f34a", "metadata": {}, "outputs": [], "source": [ "my_target_documents = []\n", "for item in res:\n", " for key,value in item['diff_label_texts'].items():\n", " my_target_documents.append({\n", " \"index\": item['index'],\n", " \"label\": key,\n", " \"diff_label_texts\": value # Example: pick one of the diff label texts\n", " })" ] }, { "cell_type": "code", "execution_count": 8, "id": "4bf14cda", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'index': 0,\n", " 'label': 'low_health_literacy',\n", " 'diff_label_texts': 'You are a 20‑year‑old woman with a long‑term kidney problem that makes you lose protein in your urine. It first showed up when you had big blood clots in the veins of your brain and in your lungs. You took blood thinners and steroid pills. Later you took another medicine to calm the immune system and prevent flare‑ups. Tests for a built‑in clotting problem were normal. You had several flare‑ups, but steroid pills kept them under control until 2017. After that, you stayed well. The blood thinners and the immune‑calming medicine were stopped. About a year later, you had sudden, very bad belly pain. You threw up after eating. Your legs became puffy. Tests showed your kidney problem had come back. A scan showed a new clot in a big artery that feeds your intestines. Not enough blood reached your bowel. In surgery, most of your small intestine was found dead. The damage could not be fixed. You died 48 hours later.'}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_target_documents[0]" ] } ], "metadata": { "kernelspec": { "display_name": "un", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.14" } }, "nbformat": 4, "nbformat_minor": 5 }