File size: 5,956 Bytes
def return_prompts_attribution(reference_full_text, generated_summary, subclaims_json, difficulty_level):
    """Build the evaluator prompt for judging unsupported subclaims.

    Parameters
    ----------
    reference_full_text : str
        The original source document the summary was generated from.
    generated_summary : str
        The readability-controlled summary under evaluation.
    subclaims_json : str
        Serialized subclaims with per-claim support results; unsupported
        claims are marked with "result": 0.
    difficulty_level : str
        Readability level of the summary: "easy", "intermediate" or "hard".

    Returns
    -------
    str
        A prompt instructing the model to classify each unsupported subclaim,
        assign a 0-5 attribution score, and reply in strict JSON.
    """
    # NOTE: the doubled braces ({{ }}) in the OUTPUT FORMAT section are literal
    # braces in the JSON template, not f-string substitutions.
    return f'''
### **SYSTEM / ROLE INSTRUCTION**
You are a **medical factuality and attribution evaluator**.
You will assess whether **unsupported subclaims** in a generated summary (those with `"result": 0`) are *reasonable additions* based on the readability level (*easy / intermediate / hard*).
The goal is to determine whether these **extra pieces of information** are acceptable simplifications or *hallucinations* that reduce factual faithfulness.
---
### **READABILITY & ATTRIBUTION GUIDELINES**
| Level            | Audience                         | Content Goal                                                           | Allowable Additions                                                                 |
| :--------------- | :------------------------------- | :--------------------------------------------------------------------- | :--------------------------------------------------------------------------------- |
| **Easy**         | General public                   | Simplify and clarify events                                            | Allow general background info or lay explanations, but not new facts or diagnoses. |
| **Intermediate** | Educated layperson / med student | Add brief clarifications or causal context if consistent with the text | Allow inferred, non-contradictory context; avoid adding unconfirmed data.          |
| **Hard**         | Medical professional             | Maintain factual precision                                             | No additions; everything must be supported by source text.                         |
---
### **INPUT FIELDS**
**Reference full text:**
{reference_full_text}
**Generated summary ({difficulty_level}):**
{generated_summary}
**Subclaims and results:**
{subclaims_json}
---
### **TASK INSTRUCTIONS**
1. Focus only on subclaims with `"result": 0` (not supported by the input text).
2. For each unsupported subclaim:
   * Judge whether adding it is **reasonable** for the given readability level.
   * Choose one of: `"reasonable addition"`, `"unnecessary but harmless"`, `"misleading / hallucinated"`.
   * Provide a **1–2 sentence justification** explaining your reasoning.
3. After all evaluations, assign a **numerical attribution score (0–5)**:
   * **5** = All additions are reasonable or harmless simplifications.
   * **4** = Mostly reasonable; minor harmless additions.
   * **3** = Some misleading or unjustified additions.
   * **2** = Many factual inaccuracies.
   * **1** = Serious hallucinations; distorts source meaning.
   * **0** = Highly unfaithful; mostly invented content.
4. End with an **overall explanation (3–5 sentences)** summarizing your reasoning and suggestions.
---
### **OUTPUT FORMAT (strict JSON)**
```json
{{
"evaluation_table": [
{{
"id": <subclaim_id>,
"subclaim": "<text>",
"evaluation": "<reasonable addition | unnecessary but harmless | misleading / hallucinated>",
"explanation": "<short justification>"
}}
],
"attribution_score": <0-5>,
"overall_explanation": "<concise summary of your judgment>"
}}
```
'''
from openai import OpenAI
import json

# Read the OpenAI API key from the local credentials file and build the
# shared chat client used by openai_return() below.
file_path = "/home/mshahidul/api_new.json"
with open(file_path, "r") as key_file:
    credentials = json.load(key_file)

client = OpenAI(api_key=credentials.get("openai"))
def _parse_fenced_json(raw_text):
    """Parse a model reply as JSON, tolerating a Markdown code fence wrapper.

    Only a *leading* ```json / ``` fence and a *trailing* ``` fence are
    removed; unlike a blanket str.replace, this cannot corrupt backticks that
    legitimately appear inside the JSON payload.

    Raises json.JSONDecodeError if the cleaned text is not valid JSON.
    """
    text = raw_text.strip()
    if text.startswith("```json"):
        text = text[len("```json"):]
    elif text.startswith("```"):
        text = text[len("```"):]
    if text.endswith("```"):
        text = text[:-len("```")]
    return json.loads(text.strip())


def openai_return(prompt):
    """Send *prompt* to the chat model and return its reply parsed as JSON.

    Uses the module-level `client`; callers (the main loop) catch parse and
    API failures via a broad try/except.
    """
    response = client.chat.completions.create(
        model="gpt-5-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return _parse_fenced_json(response.choices[0].message.content)
import json
import os

# Source documents: each entry has keys
# 'id', 'full_text', 'ref_summary', 'readability_versions'.
file_path = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"
with open(file_path, 'r') as f:
    synthetic_data = json.load(f)

# Per-document subclaim verification results produced by Qwen3-32B; indexed
# in parallel with synthetic_data.
file_path_qwen3_32B = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"
with open(file_path_qwen3_32B, 'r') as f:
    qwen3_32B_results = json.load(f)

# Resume support: reload any partial results written by a previous run.
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/resonability_check_100_gpt5_attribution.json"
res = []
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
    print(f"Resuming from {len(res)} entries")
# Built unconditionally so a fresh run (no save file yet) still defines the
# skip set instead of raising NameError in the main loop.
existing_check = set((entry['id'], entry['difficulty_level']) for entry in res)
import tqdm

# Skip set of already-completed (id, difficulty_level) pairs, rebuilt here
# from the accumulated results so the loop is self-contained and safe even
# when no partial-results file existed.
existing_check = {(entry['id'], entry['difficulty_level']) for entry in res}

# Iterate ALL 100 document indices. Do NOT start at len(res): res holds up to
# three entries (easy/intermediate/hard) per document, so len(res) is an entry
# count, not a document index — using it as a start index silently skips
# unprocessed documents when resuming. The skip set handles resumption.
for ind in tqdm.tqdm(range(100)):
    for version in ["easy", "intermediate", "hard"]:
        if (synthetic_data[ind]['id'], version) in existing_check:
            print(f"Skipping {synthetic_data[ind]['id']}, {version}")
            continue
        ref_full_text_summary = synthetic_data[ind]['full_text']
        generated_summary = synthetic_data[ind]['readability_versions'][version]['text']
        # Stringify the raw results structure for embedding in the prompt.
        subclaims_results = f"{qwen3_32B_results[ind]['attribution']['results']}"
        prompt = return_prompts_attribution(ref_full_text_summary, generated_summary, subclaims_results, version)
        try:
            ans = openai_return(prompt)
            res.append({
                "id": synthetic_data[ind]['id'],
                "difficulty_level": version,
                "response": ans
            })
            # Checkpoint every other entry (3 versions x 100 docs = 300 total).
            if len(res) % 2 == 0:
                print(f"Completed {len(res)} out of 300")
                with open(save_path, 'w') as outfile:
                    json.dump(res, outfile, indent=2)
        except Exception as e:
            # Best-effort batch job: log and continue so one bad item does not
            # abort the run; progress is persisted immediately below.
            print(f"Error at index {ind}, version {version}: {e}")
            with open(save_path, 'w') as outfile:
                json.dump(res, outfile, indent=2)

# Final flush so entries added since the last even-count checkpoint are saved.
with open(save_path, 'w') as outfile:
    json.dump(res, outfile, indent=2)