File size: 5,956 Bytes
def return_prompts_attribution(reference_full_text, generated_summary, subclaims_json, difficulty_level):
    """Build the evaluator prompt for judging unsupported subclaims.

    Parameters
    ----------
    reference_full_text : str
        The original source document the summary was generated from.
    generated_summary : str
        The readability-controlled summary under evaluation.
    subclaims_json : str
        Serialized subclaims with per-claim support results; unsupported
        claims are marked with "result": 0.
    difficulty_level : str
        Readability level of the summary: "easy", "intermediate" or "hard".

    Returns
    -------
    str
        A prompt instructing the model to classify each unsupported subclaim,
        assign a 0-5 attribution score, and reply in strict JSON.
    """
    # NOTE: the doubled braces ({{ }}) in the OUTPUT FORMAT section are literal
    # braces in the JSON template, not f-string substitutions.
    return f'''
### **SYSTEM / ROLE INSTRUCTION**
You are a **medical factuality and attribution evaluator**.
You will assess whether **unsupported subclaims** in a generated summary (those with `"result": 0`) are *reasonable additions* based on the readability level (*easy / intermediate / hard*).
The goal is to determine whether these **extra pieces of information** are acceptable simplifications or *hallucinations* that reduce factual faithfulness.
---
### **READABILITY & ATTRIBUTION GUIDELINES**
| Level            | Audience                         | Content Goal                                                           | Allowable Additions                                                                 |
| :--------------- | :------------------------------- | :--------------------------------------------------------------------- | :--------------------------------------------------------------------------------- |
| **Easy**         | General public                   | Simplify and clarify events                                            | Allow general background info or lay explanations, but not new facts or diagnoses. |
| **Intermediate** | Educated layperson / med student | Add brief clarifications or causal context if consistent with the text | Allow inferred, non-contradictory context; avoid adding unconfirmed data.          |
| **Hard**         | Medical professional             | Maintain factual precision                                             | No additions; everything must be supported by source text.                         |
---
### **INPUT FIELDS**
**Reference full text:**
{reference_full_text}
**Generated summary ({difficulty_level}):**
{generated_summary}
**Subclaims and results:**
{subclaims_json}
---
### **TASK INSTRUCTIONS**
1. Focus only on subclaims with `"result": 0` (not supported by the input text).
2. For each unsupported subclaim:
   * Judge whether adding it is **reasonable** for the given readability level.
   * Choose one of: `"reasonable addition"`, `"unnecessary but harmless"`, `"misleading / hallucinated"`.
   * Provide a **1–2 sentence justification** explaining your reasoning.
3. After all evaluations, assign a **numerical attribution score (0–5)**:
   * **5** = All additions are reasonable or harmless simplifications.
   * **4** = Mostly reasonable; minor harmless additions.
   * **3** = Some misleading or unjustified additions.
   * **2** = Many factual inaccuracies.
   * **1** = Serious hallucinations; distorts source meaning.
   * **0** = Highly unfaithful; mostly invented content.
4. End with an **overall explanation (3–5 sentences)** summarizing your reasoning and suggestions.
---
### **OUTPUT FORMAT (strict JSON)**
```json
{{
"evaluation_table": [
{{
"id": <subclaim_id>,
"subclaim": "<text>",
"evaluation": "<reasonable addition | unnecessary but harmless | misleading / hallucinated>",
"explanation": "<short justification>"
}}
],
"attribution_score": <0-5>,
"overall_explanation": "<concise summary of your judgment>"
}}
```
'''
from openai import OpenAI
import json

# Read the OpenAI API key from the local credentials file and build the
# shared chat client used by openai_return() below.
file_path = "/home/mshahidul/api_new.json"
with open(file_path, "r") as key_file:
    credentials = json.load(key_file)

client = OpenAI(api_key=credentials.get("openai"))
def _parse_fenced_json(raw_text):
    """Parse a model reply as JSON, tolerating a Markdown code fence wrapper.

    Only a *leading* ```json / ``` fence and a *trailing* ``` fence are
    removed; unlike a blanket str.replace, this cannot corrupt backticks that
    legitimately appear inside the JSON payload.

    Raises json.JSONDecodeError if the cleaned text is not valid JSON.
    """
    text = raw_text.strip()
    if text.startswith("```json"):
        text = text[len("```json"):]
    elif text.startswith("```"):
        text = text[len("```"):]
    if text.endswith("```"):
        text = text[:-len("```")]
    return json.loads(text.strip())


def openai_return(prompt):
    """Send *prompt* to the chat model and return its reply parsed as JSON.

    Uses the module-level `client`; callers (the main loop) catch parse and
    API failures via a broad try/except.
    """
    response = client.chat.completions.create(
        model="gpt-5-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return _parse_fenced_json(response.choices[0].message.content)
import json
import os

# Source documents: each entry has keys
# 'id', 'full_text', 'ref_summary', 'readability_versions'.
file_path = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"
with open(file_path, 'r') as f:
    synthetic_data = json.load(f)

# Per-document subclaim verification results produced by Qwen3-32B; indexed
# in parallel with synthetic_data.
file_path_qwen3_32B = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"
with open(file_path_qwen3_32B, 'r') as f:
    qwen3_32B_results = json.load(f)

# Resume support: reload any partial results written by a previous run.
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/resonability_check_100_gpt5_attribution.json"
res = []
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
    print(f"Resuming from {len(res)} entries")
# Built unconditionally so a fresh run (no save file yet) still defines the
# skip set instead of raising NameError in the main loop.
existing_check = set((entry['id'], entry['difficulty_level']) for entry in res)
import tqdm

# Skip set of already-completed (id, difficulty_level) pairs, rebuilt here
# from the accumulated results so the loop is self-contained and safe even
# when no partial-results file existed.
existing_check = {(entry['id'], entry['difficulty_level']) for entry in res}

# Iterate ALL 100 document indices. Do NOT start at len(res): res holds up to
# three entries (easy/intermediate/hard) per document, so len(res) is an entry
# count, not a document index — using it as a start index silently skips
# unprocessed documents when resuming. The skip set handles resumption.
for ind in tqdm.tqdm(range(100)):
    for version in ["easy", "intermediate", "hard"]:
        if (synthetic_data[ind]['id'], version) in existing_check:
            print(f"Skipping {synthetic_data[ind]['id']}, {version}")
            continue
        ref_full_text_summary = synthetic_data[ind]['full_text']
        generated_summary = synthetic_data[ind]['readability_versions'][version]['text']
        # Stringify the raw results structure for embedding in the prompt.
        subclaims_results = f"{qwen3_32B_results[ind]['attribution']['results']}"
        prompt = return_prompts_attribution(ref_full_text_summary, generated_summary, subclaims_results, version)
        try:
            ans = openai_return(prompt)
            res.append({
                "id": synthetic_data[ind]['id'],
                "difficulty_level": version,
                "response": ans
            })
            # Checkpoint every other entry (3 versions x 100 docs = 300 total).
            if len(res) % 2 == 0:
                print(f"Completed {len(res)} out of 300")
                with open(save_path, 'w') as outfile:
                    json.dump(res, outfile, indent=2)
        except Exception as e:
            # Best-effort batch job: log and continue so one bad item does not
            # abort the run; progress is persisted immediately below.
            print(f"Error at index {ind}, version {version}: {e}")
            with open(save_path, 'w') as outfile:
                json.dump(res, outfile, indent=2)

# Final flush so entries added since the last even-count checkpoint are saved.
with open(save_path, 'w') as outfile:
    json.dump(res, outfile, indent=2)