# File size: 6,316 Bytes

# c7a6fe6

def _format_missing_subclaims(missing_subclaims):
    """Render the missing subclaims as prompt-ready text.

    A list or tuple becomes one ``- `` bullet per subclaim, so the model reads
    plain sentences instead of a Python ``repr`` (brackets, quotes, escapes).
    An empty list or tuple becomes ``(none)``. Any other value, including an
    already formatted string, is converted with ``str()`` and left as is.
    """
    if isinstance(missing_subclaims, (list, tuple)):
        if not missing_subclaims:
            return "(none)"
        return "\n".join(f"- {str(claim).strip()}" for claim in missing_subclaims)
    return str(missing_subclaims)


def inference_prompt_revise_summary(fulltext, ref_summary, generated_summary, version, missing_subclaims):
    """Build the prompt asking a model to revise a generated summary.

    The prompt embeds the full text, the reference summary, the summary to
    revise and the subclaims to add back, describes the three readability
    styles, and requests a JSON object with a single ``revised_summary`` key.

    Args:
        fulltext: Source document text, included for context.
        ref_summary: Reference summary, included for comparison only.
        generated_summary: Summary text the model is asked to revise.
        version: Readability level name inserted into the instructions
            (the prompt describes ``easy``, ``intermediate`` and ``hard``).
        missing_subclaims: Subclaims to integrate. A list or tuple is rendered
            as a bullet list; a string is inserted unchanged.

    Returns:
        The complete prompt as a string.
    """
    subclaims_text = _format_missing_subclaims(missing_subclaims)
    prompt = f"""
You are a medical summarization model specialized in readability-controlled text revision.

Your task is to improve the **Generated Summary** by adding back the key missing clinical information listed under **Missing Subclaims**, while keeping the readability style defined for the level **{version}**.

Do not copy the reference summary. Keep coherence, brevity, and correctness.

---

### INPUT

**Full Text (for context):**
{fulltext}

**Reference Summary (for comparison only):**
{ref_summary}

**Generated Summary (to revise):**
{generated_summary}

**Missing Subclaims (to integrate naturally):**
{subclaims_text}

---

### READABILITY STYLES

- **easy (FH 70–100, grade 5–7):**
  - Short sentences, familiar vocabulary, concrete ideas.
  - Avoid subordinate clauses and medical jargon.
  - Tone: explanatory, simple, and friendly.

- **intermediate (FH 50–69, grade 8–12):**
  - Moderate sentence complexity and domain vocabulary.
  - Clear and structured explanation.

- **hard (FH 0–49, university/professional):**
  - Use specialized terminology, formal and dense phrasing.
  - Include:
    - precise domain vocabulary;
    - causal or analytical connectors (por consiguiente, sin embargo, dado que…);
    - one definition, one process description, and one implication statement if possible;
    - optional subordinate clauses for academic rhythm.

---

### OUTPUT
Return the result in the following JSON format:

{{
  "revised_summary": "<your revised summary text here>"
}}

Ensure the text is coherent, medically accurate, and matches the **{version}** readability level.
"""
    return prompt


from openai import OpenAI
import json
# API keys are read from a JSON file; only the "openai" entry is used here.
file_path = "/home/mshahidul/api_new.json"
with open(file_path, "r") as file:
    api_keys = json.loads(file.read())

# .get() yields None when the "openai" entry is absent; the value is handed
# to the client constructor unchanged.
openai_api_key = api_keys.get("openai")
client = OpenAI(api_key=openai_api_key)
def openai_return(prompt):
    """Send ``prompt`` to the chat model and return its reply, parsed as JSON when possible.

    The request goes through the module-level ``client`` with a generic
    system message followed by ``prompt`` as the user message.

    Args:
        prompt: Text of the user message.

    Returns:
        The decoded JSON value when the reply, with Markdown code-fence
        markers removed, is valid JSON; otherwise the cleaned reply text.

    Raises:
        ValueError: If the reply carries no text content.
        Exception: Anything raised by the API call itself propagates unchanged.
    """
    response = client.chat.completions.create(
        model="gpt-5",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    content = response.choices[0].message.content
    if content is None:
        # Fail with an explicit message instead of an AttributeError on None.
        raise ValueError("Model reply contains no text content")

    # Drop the ```json / ``` fence markers, then trim the whitespace that
    # removing them leaves behind so the fallback text is clean as well.
    cleaned_response = content.strip().replace("```json", "").replace("```", "").strip()
    try:
        return json.loads(cleaned_response)
    except json.JSONDecodeError:
        # Not valid JSON: hand back the cleaned text so the caller can still store it.
        return cleaned_response
import json
file_path = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"

# The JSON inputs are read explicitly as UTF-8 so that decoding does not
# depend on the locale default encoding of the machine running the script.
with open(file_path, 'r', encoding='utf-8') as f:
    synthetic_data = json.load(f)



with open("/home/mshahidul/readctrl/results/dataset_quality_check/completeness_resonability_check_100_qwen3-32B_v3.json", 'r', encoding='utf-8') as f:
    readability_reasoning = json.load(f)

import json, ast

def _parse_reasonableness(raw):
    """Normalize a ``reasonableness`` field to a dict.

    A dict is returned as is. A string is decoded as JSON first and, failing
    that, as a Python literal. Anything that does not end up as a dict
    (unparseable text, a bare string or list, ``None``) is wrapped as
    ``{"reasonableness": "unknown", "justification": raw}`` so callers can
    always use ``.get`` on the result.
    """
    if isinstance(raw, dict):
        return raw
    parsed = None
    if isinstance(raw, str):
        try:
            parsed = json.loads(raw)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            try:
                parsed = ast.literal_eval(raw)
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
                parsed = None
    if isinstance(parsed, dict):
        return parsed
    # Not JSON or a dict literal: treat as plain text with an unknown verdict.
    return {"reasonableness": "unknown", "justification": raw}


# Maps (item id, difficulty level) -> list of subclaims whose verdict is
# neither "reasonable" nor "unknown".
reason_info = {}

for item in readability_reasoning:
    item_id = item['id']  # named item_id so the builtin id() is not shadowed
    difficulty_level = item['version']
    for _data in item['completeness']['results']:
        verdict = _parse_reasonableness(_data['reasonableness'])

        # Skip subclaims judged "reasonable" and those with no usable verdict.
        if verdict.get('reasonableness') in ("reasonable", "unknown"):
            continue

        # Collect the remaining subclaims under their (id, level) key.
        reason_info.setdefault((item_id, difficulty_level), []).append(_data['subclaim'])



file_path_qwen3_32B = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"

# Read explicitly as UTF-8 so decoding does not depend on the locale default.
with open(file_path_qwen3_32B, 'r', encoding='utf-8') as f:
    qwen3_32B_results = json.load(f)

# def inference_prompt_revise_summary(fulltext, ref_summary, generated_summary, version, missing_subclaims):
import os
# Read explicitly as UTF-8 so decoding does not depend on the locale default.
with open("/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_es.json", "r", encoding="utf-8") as f_train:
    multiclinsum_gs_train_es = json.load(f_train)

# Index the full text and the summary of every record by its id.
dat_full_text = {item['id']: item['fulltext'] for item in multiclinsum_gs_train_es}
dat_summary = {item['id']: item['summary'] for item in multiclinsum_gs_train_es}

# Start from previously saved results, if any, so finished entries are kept.
res = []
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/results_revised_100_gpt5_v3.json"
if os.path.exists(save_path):
    with open(save_path, 'r', encoding="utf-8") as f:
        res = json.load(f)

# (id, difficulty_level) pairs that already have a saved result.
existing_check = {(entry['id'], entry['difficulty_level']) for entry in res}
print(f"Resuming from {len(res)} entries")
import tqdm
def _save_results():
    """Write every collected result to ``save_path`` as indented JSON."""
    with open(save_path, 'w', encoding="utf-8") as outfile:
        json.dump(res, outfile, indent=2)


try:
    # Only the first 10 synthetic items are processed. Slicing, rather than
    # indexing a fixed range, avoids an IndexError on a shorter dataset.
    for ind, sample in enumerate(tqdm.tqdm(synthetic_data[:10])):
        sample_id = sample['id']
        for version in ["easy", "intermediate", "hard"]:
            key = (sample_id, version)
            # Already revised in an earlier run (or earlier in this one).
            if key in existing_check:
                continue
            # Nothing to add back for this summary.
            missing_subclaims = reason_info.get(key)
            if not missing_subclaims:
                continue

            reference_summary = f"{sample['ref_summary']['text']}"
            generated_summary = f"{sample['readability_versions'][version]['text']}"
            prompt = inference_prompt_revise_summary(dat_full_text[sample_id], reference_summary, generated_summary, version, missing_subclaims)
            try:
                ans = openai_return(prompt)
                res.append({
                    "id": sample_id,
                    "difficulty_level": version,
                    "prompt": prompt,
                    "response": ans
                })
                # Guard against a repeated id later in the same run.
                existing_check.add(key)

                # Checkpoint after every second result.
                if len(res) % 2 == 0:
                    _save_results()
                    print(f"Completed {len(res)} entries (checkpoint saved)")
            except Exception as e:
                # Best effort: report the failure and move on to the next
                # summary; the pair stays unsaved and is retried on the next run.
                print(f"Error at index {ind}, version {version}: {e}")
finally:
    # Always persist what was collected, including when the loop is
    # interrupted or fails outside the per-request handler.
    _save_results()