import os, json
def return_promptst(reference_summary, generated_summary, subclaims_json, difficulty_level):
    """Build the readability-evaluation prompt for judging omitted subclaims.

    Args:
        reference_summary: The reference (gold) summary text.
        generated_summary: The model-generated summary at the given level.
        subclaims_json: Subclaim verification results, embedded verbatim.
        difficulty_level: One of "easy", "intermediate", or "hard".

    Returns:
        The fully formatted prompt string instructing the model to emit
        strict JSON with an evaluation table, a 0-5 score, and an overall
        explanation.
    """
    # Literal braces in the OUTPUT FORMAT section are doubled ({{ }}) so the
    # f-string leaves single braces in the final prompt.
    # Fix: the JSON snippet previously read `"result": 0"` (stray trailing
    # quote) in two places, which showed the model malformed JSON.
    prompt = f'''
**SYSTEM / ROLE INSTRUCTION:**
You are a **medical readability evaluator**.
Your task is to judge whether omitted subclaims (those with `"result": 0`) from a generated summary are *reasonably omitted* based on the intended **readability level**: *easy*, *intermediate*, or *hard*.
You evaluate this from the standpoint of clarity, faithfulness, and readability goals.
---
### **READABILITY GUIDELINES**
| Level | Target Audience | Content Expectation | Technical Detail Allowed |
| :--------------- | :--------------------------------------- | :-------------------------------------------------------------- | :--------------------------------------------------------------- |
| **Easy** | General public | Focus on main events, outcomes, and diagnoses in plain Spanish. | Minimal — avoid measurements, anatomy, and test results. |
| **Intermediate** | Educated lay readers or medical students | Include key findings and procedures in simplified form. | Moderate — basic terms and causes allowed. |
| **Hard** | Medical professionals | Retain most technical information and precision. | High — measurements, anatomy, and test interpretations expected. |
---
### **INPUT FIELDS**
**Reference summary:**
{reference_summary}
**Generated summary ({difficulty_level}):**
{generated_summary}
**Subclaims and results:**
{subclaims_json}
---
### **TASK INSTRUCTIONS**
1. Focus on subclaims with `"result": 0` (not supported by the generated summary).
2. For each omitted subclaim:
* Decide whether omission is **reasonable** given the readability level.
* Label as: `"yes"`, `"no"`, or `"borderline"`.
* Write a brief justification (1–2 sentences).
3. After individual evaluations, assign a **reasonableness score (0–5)** using this scale:
* **5** = All omissions appropriate for target readability.
* **4** = Minor omissions could improve completeness.
* **3** = Some omissions reduce understanding or medical clarity.
* **2** = Many important omissions harm faithfulness.
* **1** = Major omissions misrepresent case.
* **0** = Summary fails to reflect key medical information.
4. End with an **overall explanation (3–5 sentences)** describing:
* The main reasoning behind the score.
* Whether the summary fits its intended readability level.
* Suggestions for improvement if needed.
---
### **OUTPUT FORMAT (strict JSON)**
```json
{{
"evaluation_table": [
{{
"id": <subclaim_id>,
"subclaim": "<text>",
"reasonable_omission": "<yes | no | borderline>",
"explanation": "<short reason>"
}}
],
"reasonableness_score": <0-5>,
"overall_explanation": "<concise paragraph>"
}}
```
'''
    return prompt
from openai import OpenAI

# Load the OpenAI API key from a local credentials file.
# NOTE(review): assumes the JSON file contains an "openai" key; if it is
# missing, openai_api_key is None and the first API call will fail — confirm.
file_path = "/home/mshahidul/api_new.json"
with open(file_path, "r") as file:
    api_keys = json.load(file)
openai_api_key = api_keys.get("openai")
# Module-level client reused by openai_return() below.
client = OpenAI(api_key=openai_api_key)
def openai_return(prompt):
    """Send *prompt* to the chat model and return its reply parsed as JSON.

    The model is asked for strict JSON but may wrap it in Markdown code
    fences; those fences are stripped before parsing.

    Raises:
        json.JSONDecodeError: if the de-fenced reply is not valid JSON.
    """
    completion = client.chat.completions.create(
        model="gpt-5-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    reply = completion.choices[0].message.content.strip()
    # Remove Markdown fencing ("```json" first so the bare "```" pass
    # catches the closing fence).
    for fence in ("```json", "```"):
        reply = reply.replace(fence, "")
    return json.loads(reply)
import json  # NOTE(review): redundant — json is already imported at the top of the file

# Synthetic evaluation cases. Each entry is a dict with keys
# ['id', 'full_text', 'ref_summary', 'readability_versions'] (see note below).
file_path = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"
with open(file_path, 'r') as f:
    synthetic_data = json.load(f)

# Subclaim-verifier output produced by Qwen3-32B.
# NOTE(review): assumed to be index-aligned with synthetic_data — confirm.
file_path_qwen3_32B = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"
with open(file_path_qwen3_32B, 'r') as f:
    qwen3_32B_results = json.load(f)
# dict_keys(['id', 'full_text', 'ref_summary', 'readability_versions'])
# print(f"Full text: {synthetic_data[0]['full_text']}")

# Accumulated evaluation entries; reloaded from disk so an interrupted run
# can be resumed instead of restarting from scratch.
res = []
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/resonability_check_100_gpt5.json"
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
    print(f"Resuming from {len(res)} entries")
import tqdm

# Each case index appends THREE entries to res (easy/intermediate/hard), so
# the first unprocessed case after a resume is len(res) // 3. The previous
# code resumed from len(res), which skipped up to two-thirds of the
# remaining cases.
# NOTE(review): if a prior run crashed mid-case (partial triple saved), the
# resumed case is reprocessed and its saved versions duplicated — confirm
# whether downstream consumers dedupe on (id, difficulty_level).
for ind in tqdm.tqdm(range(len(res) // 3, 100)):
    print(f"Processing index: {ind}")
    for version in ["easy", "intermediate", "hard"]:
        ref_summary = synthetic_data[ind]['ref_summary']['text']
        generated_summary = synthetic_data[ind]['readability_versions'][version]['text']
        # NOTE(review): this embeds the Python repr of the results list, not
        # strict JSON — confirm the evaluator prompt tolerates it
        # (json.dumps would be stricter).
        subclaims_results = f"{qwen3_32B_results[ind]['completeness']['results']}"
        try:
            prompt = return_promptst(ref_summary, generated_summary, subclaims_results, version)
            res.append({
                "id": synthetic_data[ind]['id'],
                "difficulty_level": version,
                # NOTE(review): despite the key name, this holds the parsed
                # model *response*, not the prompt text. Left unchanged so
                # previously saved files stay consistent.
                "prompt": openai_return(prompt)
            })
            # Checkpoint every second entry so a crash loses little work
            # (100 cases x 3 versions = 300 entries total).
            if len(res) % 2 == 0:
                print(f"Completed {len(res)} out of 300")
                with open(save_path, 'w') as outfile:
                    json.dump(res, outfile, indent=2)
        except Exception as e:
            # Best-effort batch: log the failure and continue with the
            # next version rather than aborting the whole run.
            print(f"Error at {ind} {version}: {e}")

# Final save of everything accumulated, including entries added after the
# last even-count checkpoint.
with open(save_path, 'w') as outfile:
    json.dump(res, outfile, indent=2)