import os
import json
import tqdm
from openai import OpenAI
# =====================================================
# 1️⃣ Setup: Load API key, initialize client
# =====================================================
# NOTE(review): hard-coded absolute path to the local credentials file —
# the JSON is expected to hold at least an "openai" key.
api_file = "/home/mshahidul/api_new.json"
with open(api_file, "r") as f:
    api_keys = json.load(f)
# Module-level client shared by all API calls below.
openai_api_key = api_keys["openai"]
client = OpenAI(api_key=openai_api_key)
# =====================================================
# 2️⃣ OpenAI call helper
# =====================================================
def openai_return(prompt, model="gpt-5"):
    """Send *prompt* to the chat-completions API and parse the reply as JSON.

    Parameters
    ----------
    prompt : str
        Full user prompt to send (a system message is prepended).
    model : str
        Model name passed to the API (default ``"gpt-5"``).

    Returns
    -------
    dict | list | str
        The parsed JSON payload, or the raw fence-stripped text when the
        reply is not valid JSON.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    content = response.choices[0].message.content.strip()
    # Strip a Markdown code fence only at the *edges* of the reply.  The old
    # blanket replace("```json", "").replace("```", "") removed backtick runs
    # anywhere in the text, which could corrupt JSON string values that
    # legitimately contain fences.
    cleaned = content
    if cleaned.startswith("```"):
        # Drop the opening fence line ("```" or "```json").
        cleaned = cleaned.split("\n", 1)[1] if "\n" in cleaned else ""
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    cleaned = cleaned.strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        print("⚠️ JSON parse failed — storing raw text.")
        return cleaned
# =====================================================
# 3️⃣ Multi‑subclaim attribution prompt builder
# =====================================================
def return_prompts_attribution_multi(reference_full_text, generated_summary, subclaims_json, difficulty_level):
    """Build the attribution-evaluation prompt for one summary.

    Parameters
    ----------
    reference_full_text : str
        The source/reference document the summary was generated from.
    generated_summary : str
        The readability-controlled summary under evaluation.
    subclaims_json : str
        JSON-serialized list of subclaims, each with a ``"result"`` flag
        (1 = supported, 0 = unsupported).
    difficulty_level : str
        One of "easy" / "intermediate" / "hard".

    Returns
    -------
    str
        The fully interpolated prompt text.
    """
    # NOTE: fixed the unbalanced quotes in the `"result": 0"` / `"result": 1"`
    # references — the stray trailing quote garbled the JSON examples shown
    # to the model.
    return f"""
### **SYSTEM / ROLE INSTRUCTION**
You are a **medical factuality and attribution evaluator**.
You will analyze all subclaims found in a generated summary, each labeled with a `"result"` flag:
- `1` = supported by the reference
- `0` = unsupported by the reference
Your main task is to **evaluate only the unsupported subclaims (`"result": 0`)**, judging whether each is a *reasonable addition* given the specified readability level (*easy / intermediate / hard*).
The presence of supported items (`"result": 1`) helps you understand the full context of what is confirmed versus speculative,
but you will not rate those. Their inclusion enriches the training data diversity and realism.
---
### **READABILITY & ATTRIBUTION GUIDELINES**
| Level | Audience | Linguistic & Stylistic Profile | Allowable Additions |
| :-- | :-- | :-- | :-- |
| **Easy (FH 70–100)** | General public | Short, simple, concrete sentences | General explanations only; no new factual claims |
| **Intermediate (FH 50–69)** | Educated layperson | Moderate complexity and precision | Clarifying causal links aligned with the text |
| **Hard (FH 0–49)** | Professionals | Formal, technical, multi‑clause detail | Must strictly reflect source evidence |
---
### **Input**
Readability Level: {difficulty_level}
Reference Full Text:
{reference_full_text}
Generated Summary:
{generated_summary}
All Subclaims with Support Results:
{subclaims_json}
---
### **TASK INSTRUCTIONS**
For **each subclaim where** `"result": 0`, classify it as:
- `"reasonable"` – legitimate simplification aligned with readability needs
- `"partially_reasonable"` – harmless addition or neutral paraphrase
- `"unreasonable"` – misleading, speculative, or factually unsupported
Support your judgment with a 1–2 sentence justification per item.
Do **not** modify or comment on subclaims where `"result": 1`.
---
### **Output JSON Format**
```json
{{
  "evaluations": [
    {{
      "subclaim_id": <id>,
      "subclaim": "<verbatim_subclaim>",
      "result": <0 or 1>,
      "reasonableness": "<reasonable | partially_reasonable | unreasonable | not_applicable>",
      "justification": "<short justification for result=0; for result=1, just write 'supported, no evaluation required'>"
    }},
    ...
  ]
}}
"""
# =====================================================
# 4️⃣ Driver: evaluate subclaim attribution per entry/level
# =====================================================
# Input/output locations (hard-coded for this experiment run).
file_synth = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"
file_qwen_results = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/syn_attribution_resonability_check_100_gpt5_train_v2.json"

with open(file_synth, 'r') as f:
    synthetic_data = json.load(f)
with open(file_qwen_results, 'r') as f:
    qwen3_32B_results = json.load(f)

# Resume support: reload any previously saved output so we can skip
# already-processed (id, level) pairs.
res = []
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
    print(f"🔁 Resuming from {len(res)} entries")
# Build the skip-set unconditionally: defining it only on the resume branch
# raises NameError on a fresh run.
existing = set((e["id"], e["difficulty_level"]) for e in res)


def _save_results():
    """Persist the accumulated results to save_path (pretty-printed JSON)."""
    with open(save_path, 'w') as f:
        json.dump(res, f, indent=2, ensure_ascii=False)


# NOTE(review): only the first 30 of the synthetic entries are evaluated —
# confirm this cap is intentional.
for ind in tqdm.tqdm(range(0, 30)):
    entry = synthetic_data[ind]
    subclaims_results = qwen3_32B_results[ind]['attribution']['results']
    subclaims_json = json.dumps(subclaims_results, indent=2, ensure_ascii=False)
    for level in ["easy", "intermediate", "hard"]:
        if (entry["id"], level) in existing:
            print(f"⏭️ Skipping {entry['id']} ({level})")
            continue
        prompt = return_prompts_attribution_multi(
            entry["full_text"],
            entry["readability_versions"][level]["text"],
            subclaims_json,
            level,
        )
        try:
            response = openai_return(prompt)
            res.append({
                "id": entry["id"],
                "difficulty_level": level,
                "response": response,
            })
            existing.add((entry["id"], level))
            # Periodic checkpoint every 2 new entries.
            if len(res) % 2 == 0:
                _save_results()
                print(f"💾 Saved after {len(res)} entries")
        except Exception as e:
            # Best-effort: log and continue with the next (entry, level).
            print(f"❌ Error at index {ind}, level {level}: {e}")

# Final save: the periodic checkpoint fires only on even counts, so a
# trailing odd entry would otherwise be lost.
_save_results()
print(f"✅ Done — {len(res)} total entries saved to {save_path}")