# File size: 5,672 Bytes
#
# c7a6fe6

def revised_results(reference_summary, generated_summary, list_of_missing_subclaims, difficulty_level):
    """Build the LLM prompt that asks for a summary revision restoring missing subclaims.

    Args:
        reference_summary: Reference summary text, presented to the model as the
            factual source.
        generated_summary: Current summary that should be revised.
        list_of_missing_subclaims: Subclaims to add back; interpolated using its
            default string form (a Python list renders as ``['a', 'b']``).
        difficulty_level: Readability label; inserted both in the heading of the
            generated summary and as the target readability level.

    Returns:
        The complete prompt as a single string. It instructs the model to answer
        with a JSON object containing ``revised_summary`` and ``explanation``.
    """
    # Plain (non-f) template: the named fields are filled by ``str.format`` below,
    # and the doubled braces render as the literal braces of the JSON example.
    template = '''
### **SYSTEM / ROLE INSTRUCTION**

You are a **medical text rewriting assistant** that improves summaries while maintaining the intended readability level (*easy / intermediate / hard*).
You will receive:

* The **original reference summary** (the factual source)
* The **current generated summary**
* A list of **important missing subclaims** to be reintroduced
* The **target readability level**

Your task:
Revise the generated summary so that it **adds the missing information** naturally, while keeping:

* The same **tone, vocabulary, and sentence simplicity** of the given readability level.
* Logical **flow and coherence**.
* No extra, invented information beyond what’s in the reference summary.

---

### **INPUT FIELDS**

**Reference summary:**
{reference_summary}

**Current generated summary ({difficulty_level}):**
{generated_summary}

**Missing important subclaims to add back:**
{list_of_missing_subclaims}

**Target readability level:**
{difficulty_level}


---

### **TASK INSTRUCTIONS**

1. Integrate the missing subclaims **smoothly** into the generated summary.
2. Do **not** add any new facts beyond those listed.
3. Maintain the **same readability level**:

   * **Easy:** conversational, short sentences, no jargon.
   * **Intermediate:** light medical terms, brief explanations.
   * **Hard:** concise clinical tone with correct terminology.
4. Keep the summary approximately the same length; avoid redundancy.
5. Ensure the resulting text remains **fluent, coherent, and faithful** to the reference summary.

---

### **OUTPUT FORMAT**

```json
{{
  "revised_summary": "<the new version of the summary, rewritten with the added subclaims>",
  "explanation": "<brief note explaining how the missing subclaims were added while preserving readability>"
}}
```

'''
    fields = {
        "reference_summary": reference_summary,
        "generated_summary": generated_summary,
        "list_of_missing_subclaims": list_of_missing_subclaims,
        "difficulty_level": difficulty_level,
    }
    return template.format(**fields)
from openai import OpenAI
import json
# Load API credentials from a local JSON file and build the shared OpenAI client
# that `openai_return` uses for every request.
# NOTE: `file_path` is re-bound further down to point at the dataset file.
file_path = "/home/mshahidul/api_new.json"
with open(file_path, "r") as file:
    api_keys = json.load(file)

# `.get` does not raise when the "openai" entry is missing; the value is then None.
# NOTE(review): confirm how OpenAI() behaves when api_key is None.
openai_api_key = api_keys.get("openai")

client = OpenAI(api_key=openai_api_key)
def openai_return(prompt, model="gpt-5-mini", api_client=None):
    """Send ``prompt`` to a chat-completions model and return the reply parsed as JSON.

    Args:
        prompt: Text sent as the single user message.
        model: Chat model name. Defaults to "gpt-5-mini", the value that was
            previously hard-coded.
        api_client: Object exposing ``chat.completions.create(...)``. When
            ``None`` (the default) the module-level ``client`` is used.

    Returns:
        The decoded JSON value from the model's reply.

    Raises:
        ValueError: If the reply carries no text content.
        json.JSONDecodeError: If no JSON document can be recovered from the
            reply (a subclass of ``ValueError``).
    """
    active_client = client if api_client is None else api_client
    response = active_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )

    content = response.choices[0].message.content
    if content is None:
        # Fail with a clear message instead of AttributeError on None.strip().
        raise ValueError("Model reply has no text content; nothing to parse as JSON.")

    text = content.strip()
    # Remove only the Markdown fence that wraps the whole reply. Stripping every
    # "```" occurrence would also delete backticks inside JSON string values.
    if text.startswith("```"):
        text = text[3:]
        if text[:4].lower() == "json":
            text = text[4:]
    if text.rstrip().endswith("```"):
        text = text.rstrip()[:-3]
    text = text.strip()

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        # The model may surround the JSON with prose; retry on the outermost
        # brace-delimited span before giving up.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end < start:
            raise
        return json.loads(text[start:end + 1])
import json
# Dataset whose records are later read for 'id', 'ref_summary' and
# 'readability_versions'. The "es" in the file name suggests Spanish text
# (unconfirmed), so the encoding is pinned rather than left to the locale default.
file_path = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"

# JSON is UTF-8 by specification; open() without `encoding` uses the platform
# locale, which can mis-decode or reject non-ASCII text on some systems.
with open(file_path, 'r', encoding="utf-8") as f:
    synthetic_data = json.load(f)

# Completeness/reasonableness report, consumed below to find the omitted
# subclaims that must be restored.
with open("/home/mshahidul/readctrl/results/dataset_quality_check/resonability_check_100_gpt5_completeness.json", 'r', encoding="utf-8") as f:
    readability_reasoning = json.load(f)
# Record layout noted from an earlier manual inspection (not re-validated here):
#   readability_reasoning[0].keys()           -> ['id', 'difficulty_level', 'prompt']
#   readability_reasoning[0]['prompt'].keys() -> ['evaluation_table', 'reasonableness_score', 'overall_explanation']
# Map (id, difficulty_level) -> list of subclaims whose 'reasonable_omission'
# verdict is "no", i.e. the subclaims that have to be added back to the summary.
# Keys with no such subclaim are intentionally absent from the mapping.
reason_info = {}
for item in readability_reasoning:
    # Named `item_key` rather than `id` so the builtin `id` is not shadowed
    # at module level.
    item_key = (item['id'], item['difficulty_level'])
    for row in item['prompt']['evaluation_table']:
        if row['reasonable_omission'] == "no":
            reason_info.setdefault(item_key, []).append(row['subclaim'])

# Subclaim-verifier output produced with Qwen3-32B (per the file name).
# NOTE(review): nothing in the visible part of this script reads
# `qwen3_32B_results` afterwards — confirm whether this load is still needed.
file_path_qwen3_32B = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"

# Pin UTF-8 (the JSON standard encoding) instead of relying on the locale default.
with open(file_path_qwen3_32B, 'r', encoding="utf-8") as f:
    qwen3_32B_results = json.load(f)

# Earlier inspection note kept from the original author: records expose the keys
# ['id', 'full_text', 'ref_summary', 'readability_versions']. The accompanying
# debug print indexed synthetic_data[0]['full_text'], so this presumably
# describes `synthetic_data` — TODO confirm.
import os
import tqdm

# The output file doubles as a checkpoint: entries already stored there are
# loaded and their (id, difficulty_level) pairs are skipped on this run.
res = []
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/results_revised_100_gpt5.json"
if os.path.exists(save_path):
    with open(save_path, 'r', encoding="utf-8") as f:
        res = json.load(f)
existing_check = {(entry['id'], entry['difficulty_level']) for entry in res}
print(f"Resuming from {len(res)} entries")


def _save_results():
    """Write ``res`` to ``save_path`` through a temporary file.

    The data is dumped next to the target and then moved over it with
    ``os.replace``, so an interruption during the dump cannot leave a
    truncated checkpoint that would break the next resume.
    """
    tmp_path = save_path + ".tmp"
    with open(tmp_path, 'w') as outfile:
        json.dump(res, outfile, indent=2)
    os.replace(tmp_path, save_path)


# Collect the work list up front: only (record, version) pairs that have
# subclaims to restore and are not already in the checkpoint. Iterating the
# dataset itself (instead of a fixed range of 100) avoids IndexError on shorter
# data, and gives an accurate total for the progress messages.
pending = []
for ind, item in enumerate(synthetic_data):
    for version in ("easy", "intermediate", "hard"):
        key = (item['id'], version)
        if key in existing_check or key not in reason_info:
            continue
        pending.append((ind, item, version))
total = len(res) + len(pending)

try:
    for ind, item, version in tqdm.tqdm(pending):
        reference_summary = f"{item['ref_summary']['text']}"
        generated_summary = f"{item['readability_versions'][version]['text']}"
        subclaims_results = reason_info[(item['id'], version)]
        prompt = revised_results(reference_summary, generated_summary, subclaims_results, version)
        # Best-effort per item: a failed request or checkpoint is reported and
        # the run moves on to the next pair, as before.
        try:
            ans = openai_return(prompt)
            res.append({
                "id": item['id'],
                "difficulty_level": version,
                "prompt": prompt,
                "response": ans
            })

            # Checkpoint after every second stored result.
            if len(res) % 2 == 0:
                print(f"Completed {len(res)} out of {total}")
                _save_results()
        except Exception as e:
            print(f"Error at index {ind}, version {version}: {e}")
finally:
    # Runs on normal completion and on Ctrl-C or an unexpected error alike, so
    # results gathered since the last checkpoint are not lost.
    _save_results()