"""Judge whether subclaims omitted from readability-controlled medical summaries
are *reasonable* omissions, using an OpenAI model as the evaluator.

Reads the synthetic summaries and the subclaim-verifier results (aligned by
index), prompts the model once per (case, readability version), and
checkpoints the accumulated judgments to JSON so an interrupted run can be
resumed.
"""

import json
import os

import tqdm
from openai import OpenAI

# One judgment is appended to the results list per (case, version) pair.
READABILITY_VERSIONS = ["easy", "intermediate", "hard"]
NUM_CASES = 100


def return_promptst(reference_summary, generated_summary, subclaims_json, difficulty_level):
    """Build the evaluator prompt for one (case, readability version) pair.

    Args:
        reference_summary: Reference summary text for the case.
        generated_summary: The generated summary at the given readability level.
        subclaims_json: Stringified subclaims-and-results structure from the
            subclaim verifier (entries with result 0 are the omitted ones).
        difficulty_level: One of "easy", "intermediate", "hard".

    Returns:
        The full prompt string, instructing the model to emit strict JSON.
    """
    prompt = f'''
**SYSTEM / ROLE INSTRUCTION:**
You are a **medical readability evaluator**. Your task is to judge whether omitted subclaims (those with `"result": 0`) from a generated summary are *reasonably omitted* based on the intended **readability level**: *easy*, *intermediate*, or *hard*. You evaluate this from the standpoint of clarity, faithfulness, and readability goals.

---

### **READABILITY GUIDELINES**

| Level | Target Audience | Content Expectation | Technical Detail Allowed |
| :--------------- | :--------------------------------------- | :-------------------------------------------------------------- | :--------------------------------------------------------------- |
| **Easy** | General public | Focus on main events, outcomes, and diagnoses in plain Spanish. | Minimal — avoid measurements, anatomy, and test results. |
| **Intermediate** | Educated lay readers or medical students | Include key findings and procedures in simplified form. | Moderate — basic terms and causes allowed. |
| **Hard** | Medical professionals | Retain most technical information and precision. | High — measurements, anatomy, and test interpretations expected. |

---

### **INPUT FIELDS**

**Reference summary:**
{reference_summary}

**Generated summary ({difficulty_level}):**
{generated_summary}

**Subclaims and results:**
{subclaims_json}

---

### **TASK INSTRUCTIONS**

1. Focus on subclaims with `"result": 0` (not supported by the generated summary).
2. For each omitted subclaim:
   * Decide whether omission is **reasonable** given the readability level.
   * Label as: `"yes"`, `"no"`, or `"borderline"`.
   * Write a brief justification (1–2 sentences).
3. After individual evaluations, assign a **reasonableness score (0–5)** using this scale:
   * **5** = All omissions appropriate for target readability.
   * **4** = Minor omissions could improve completeness.
   * **3** = Some omissions reduce understanding or medical clarity.
   * **2** = Many important omissions harm faithfulness.
   * **1** = Major omissions misrepresent case.
   * **0** = Summary fails to reflect key medical information.
4. End with an **overall explanation (3–5 sentences)** describing:
   * The main reasoning behind the score.
   * Whether the summary fits its intended readability level.
   * Suggestions for improvement if needed.

---

### **OUTPUT FORMAT (strict JSON)**

```json
{{
  "evaluation_table": [
    {{
      "id": ,
      "subclaim": "",
      "reasonable_omission": "",
      "explanation": ""
    }}
  ],
  "reasonableness_score": <0-5>,
  "overall_explanation": ""
}}
```
'''
    return prompt


# Load the OpenAI API key from a local credentials file.
file_path = "/home/mshahidul/api_new.json"
with open(file_path, "r") as file:
    api_keys = json.load(file)
openai_api_key = api_keys.get("openai")
client = OpenAI(api_key=openai_api_key)


def openai_return(prompt):
    """Send *prompt* to the chat model and return its reply parsed as JSON.

    The model frequently wraps its JSON in ```json fences, so those are
    stripped before parsing. Raises json.JSONDecodeError if the cleaned
    reply is still not valid JSON (caught by the caller's per-item handler).
    """
    response = client.chat.completions.create(
        model="gpt-5-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    cleaned_response = (
        response.choices[0].message.content.strip()
        .replace("```json", "")
        .replace("```", "")
    )
    return json.loads(cleaned_response)


# Input data: synthetic summaries and subclaim-verifier output, aligned by index.
# dict_keys(['id', 'full_text', 'ref_summary', 'readability_versions'])
file_path = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"
with open(file_path, 'r') as f:
    synthetic_data = json.load(f)

file_path_qwen3_32B = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"
with open(file_path_qwen3_32B, 'r') as f:
    qwen3_32B_results = json.load(f)

res = []
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/resonability_check_100_gpt5.json"
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
    print(f"Resuming from {len(res)} entries")

# BUG FIX: one entry is appended per (case, version), i.e. 3 entries per case,
# so the resume point is len(res) // 3 cases. The original range(len(res), 100)
# treated the entry count as a case index and silently skipped unprocessed
# cases after a resume.
start_case = len(res) // len(READABILITY_VERSIONS)
for ind in tqdm.tqdm(range(start_case, NUM_CASES)):
    print(f"Processing index: {ind}")
    for version in READABILITY_VERSIONS:
        # f-strings coerce the underlying values to str, matching what the
        # prompt template expects (the verifier results are a list/dict).
        ref_summary = (f"{synthetic_data[ind]['ref_summary']['text']}")
        generated_summary = (f"{synthetic_data[ind]['readability_versions'][version]['text']}")
        subclaims_results = (f"{qwen3_32B_results[ind]['completeness']['results']}")
        try:
            prompt = return_promptst(ref_summary, generated_summary, subclaims_results, version)
            res.append({
                "id": synthetic_data[ind]['id'],
                "difficulty_level": version,
                "prompt": openai_return(prompt),
            })
            # Periodic checkpoint so a crash loses at most a couple of entries.
            if len(res) % 2 == 0:
                print(f"Completed {len(res)} out of {NUM_CASES * len(READABILITY_VERSIONS)}")
                with open(save_path, 'w') as outfile:
                    json.dump(res, outfile, indent=2)
        except Exception as e:
            # Best-effort: log and continue so one bad case / malformed model
            # reply does not abort the whole run.
            print(f"Error at {ind} {version}: {e}")

# Final write captures any entries appended since the last checkpoint.
with open(save_path, 'w') as outfile:
    json.dump(res, outfile, indent=2)