# File size: 4,819 Bytes

# 1db7196

import os, json
def return_prompts(reference_summary, generated_summary, subclaims_json, difficulty_level):
    """Build the evaluator prompt for one generated summary.

    The prompt asks the model to judge, for every subclaim, whether its
    inclusion in or omission from the generated summary is reasonable at the
    given readability level, and to reply with structured JSON.

    Args:
        reference_summary: Text of the reference summary.
        generated_summary: Text of the summary being evaluated.
        subclaims_json: Subclaims together with their support results. A
            string is embedded verbatim; any other object is serialized with
            ``json.dumps`` so the model sees real JSON instead of a Python
            ``repr`` (objects that are not JSON-serializable fall back to
            ``str()``).
        difficulty_level: Target readability level, interpolated as-is.

    Returns:
        The fully formatted prompt string.
    """
    if isinstance(subclaims_json, str):
        subclaims_text = subclaims_json
    else:
        try:
            subclaims_text = json.dumps(subclaims_json, ensure_ascii=False, indent=2)
        except (TypeError, ValueError):
            # Not JSON-serializable: keep the historical plain-str() rendering.
            subclaims_text = str(subclaims_json)

    return f'''
You are a **medical summarization quality evaluator**.
Your goal is to decide whether the inclusion or omission of each subclaim in the generated summary is *reasonable*, given the target readability level.

---

### **Input**

```
Readability Level: {difficulty_level}

Reference Summary:
{reference_summary}

Generated Summary:
{generated_summary}

Subclaims with Support Results:
{subclaims_text}
```

---

### **Task**

For each subclaim:

1. Read `result`:

   * `1` = the subclaim is supported or clearly mentioned in the generated summary.
   * `0` = the subclaim is missing or not supported.

2. Based on readability level and medical relevance, decide whether this inclusion/omission is **reasonable**, **partially reasonable**, or **unreasonable**.

3. Provide a short justification (1–2 sentences) explaining your reasoning.

---

### **Output Format**

Return structured JSON:

```json
{{
  "readability_level": "<easy/intermediate/hard>",
  "evaluations": [
    {{
      "subclaim_id": <id>,
      "subclaim_text": "<text>",
      "result": <0 or 1>,
      "reasonableness": "<reasonable | partially_reasonable | unreasonable>",
      "justification": "<short explanation>"
    }},
    ...
  ]
}}
```

---

### **Evaluation Guidelines**

| Readability Level | Reasonable Omission                                          | Unreasonable Omission                             |
| ----------------- | ------------------------------------------------------------ | ------------------------------------------------- |
| **Easy**          | Technical, anatomical, quantitative, or procedural details.  | Key clinical findings, diagnoses, or outcomes.    |
| **Intermediate**  | Minor imaging details or measurements.                       | Any main diagnostic finding or cause–effect link. |
| **Hard**          | Very few omissions acceptable; mostly stylistic compression. | Any missing clinical or diagnostic information.   |

'''

from openai import OpenAI

# JSON file the API credentials are read from.
file_path = "/home/mshahidul/api_new.json"
with open(file_path, "r") as file:
    api_keys = json.loads(file.read())

# `.get` yields None when there is no "openai" entry; the value is passed to
# the client unchanged.
openai_api_key = api_keys.get("openai")
client = OpenAI(api_key=openai_api_key)
def _fenced_body(text):
    """Return the contents of the Markdown code fence in *text*, or None if there is no fence."""
    opening = text.find("```")
    if opening == -1:
        return None
    body = text[opening + 3:]
    closing = body.rfind("```")
    if closing != -1:
        body = body[:closing]
    body = body.lstrip()
    # Drop an optional language tag (```json / ```JSON) following the opening fence.
    if body[:4].lower() == "json":
        body = body[4:]
    return body.strip()


def openai_return(prompt):
    """Send *prompt* to the chat model and return its reply parsed as JSON.

    The reply is parsed directly when it is already valid JSON. Only if that
    fails is a surrounding Markdown code fence unwrapped, so backticks that
    appear inside JSON string values are never altered.

    Args:
        prompt: The user message to send.

    Returns:
        The decoded JSON value from the model's reply.

    Raises:
        ValueError: If the reply carries no text content.
        json.JSONDecodeError: (a ``ValueError`` subclass) if the reply is not
            valid JSON even after unwrapping a code fence.
    """
    response = client.chat.completions.create(
        model="gpt-5",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    content = response.choices[0].message.content
    if content is None:
        # The API allows a reply without text; fail with a clear message
        # instead of an AttributeError on None.
        raise ValueError("Model reply contained no text content")

    text = content.strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        fenced = _fenced_body(text)
        if fenced is None:
            raise
        return json.loads(fenced)

import json
# Input dataset whose records are evaluated by this script.
file_path = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"

# JSON interchange text is UTF-8; pass the encoding explicitly so decoding
# does not depend on the machine's locale settings.
with open(file_path, 'r', encoding="utf-8") as f:
    synthetic_data = json.load(f)

# Subclaim verification results that are fed into the evaluation prompt.
file_path_qwen3_32B = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"

with open(file_path_qwen3_32B, 'r', encoding="utf-8") as f:
    qwen3_32B_results = json.load(f)

import tqdm

# Keys of each synthetic_data record:
# dict_keys(['id', 'full_text', 'ref_summary', 'readability_versions'])
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/syn_data_resonability_check_20_gpt5.json"


def _save_results(results, path):
    """Write *results* to *path* as JSON without leaving a half-written file.

    The data is written to a sibling temporary file and then moved into place
    with ``os.replace``, so an interruption mid-write cannot corrupt the
    checkpoint that a later run resumes from.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, 'w') as outfile:
        json.dump(results, outfile, indent=2)
    os.replace(tmp_path, path)


# Resume support: reload earlier results and remember which
# (id, difficulty_level) pairs are already done.
res = []
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
exist_check_ids = {(item['id'], item['difficulty_level']) for item in res}
print(f"Resuming from {len(res)} entries")

versions = ["easy", "intermediate", "hard"]
# Process at most the first 20 records, but never index past either input.
num_items = min(20, len(synthetic_data), len(qwen3_32B_results))
total_expected = num_items * len(versions)

for ind in tqdm.tqdm(range(num_items)):
    print(f"Processing index: {ind}")
    record = synthetic_data[ind]
    for version in versions:
        if (record['id'], version) in exist_check_ids:
            print(f"Skipping {record['id']} {version}")
            continue
        ref_summary = str(record['ref_summary']['text'])
        generated_summary = str(record['readability_versions'][version]['text'])
        # NOTE(review): the same 'completeness' results are sent for all three
        # readability versions, and they are paired with synthetic_data by
        # list position rather than by 'id' - confirm both are intended.
        # Serialized as real JSON (not a Python repr) to match the prompt's
        # "Subclaims with Support Results" section.
        subclaims_results = json.dumps(
            qwen3_32B_results[ind]['completeness']['results'],
            ensure_ascii=False,
            indent=2,
        )
        try:
            prompt = return_prompts(ref_summary, generated_summary, subclaims_results, version)
            evaluation = openai_return(prompt)
            res.append({
                "id": record['id'],
                "difficulty_level": version,
                "reasonableness": evaluation,
            })
            print(f"Completed {len(res)} out of {total_expected}")
            # Checkpoint after every success so an interrupted run loses no
            # completed API result.
            _save_results(res, save_path)
        except Exception as e:
            # Best-effort batch: report the failure and continue; a rerun
            # retries anything missing from the checkpoint.
            print(f"Error at {ind} {version}: {e}")

_save_results(res, save_path)