import os
import json

import tqdm
from openai import OpenAI

# =====================================================
# 1️⃣ Setup: API-key location and lazily-created client
# =====================================================
API_FILE = "/home/mshahidul/api_new.json"

# Created on first use so importing this module has no side effects
# (no key-file read, no network client) — see _get_client().
_client = None


def _get_client():
    """Return the module-wide OpenAI client, creating it on first call.

    Reads the API key from API_FILE (JSON with an "openai" entry).
    """
    global _client
    if _client is None:
        with open(API_FILE, "r", encoding="utf-8") as f:
            api_keys = json.load(f)
        _client = OpenAI(api_key=api_keys["openai"])
    return _client


# =====================================================
# 2️⃣ OpenAI call helper
# =====================================================
def openai_return(prompt, model="gpt-5"):
    """Send *prompt* to the chat-completions API and parse the reply as JSON.

    Markdown code fences (```json ... ```) are stripped before parsing.
    Returns the parsed object on success; if the reply is not valid JSON,
    prints a warning and returns the cleaned raw text instead (best-effort:
    the caller stores whatever comes back).
    """
    response = _get_client().chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    content = response.choices[0].message.content.strip()
    cleaned = content.replace("```json", "").replace("```", "").strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        print("⚠️ JSON parse failed — storing raw text.")
        return cleaned


# =====================================================
# 3️⃣ Multi‑subclaim attribution prompt builder
# =====================================================
def return_prompts_attribution_multi(reference_full_text, generated_summary,
                                     subclaims_json, difficulty_level):
    """Build the multi-subclaim attribution-evaluation prompt.

    Args:
        reference_full_text: full source document the summary was written from.
        generated_summary: the readability-controlled summary to evaluate.
        subclaims_json: JSON string of subclaims, each with a "result" flag
            (1 = supported by the reference, 0 = unsupported).
        difficulty_level: one of "easy" / "intermediate" / "hard".

    Returns:
        The complete prompt string to send to the evaluator model.
    """
    # NOTE(fix): the original prompt wrote `"result": 0"` / `"result": 1"`
    # with a stray trailing quote inside the backticks; corrected below.
    return f"""
### **SYSTEM / ROLE INSTRUCTION**
You are a **medical factuality and attribution evaluator**.
You will analyze all subclaims found in a generated summary, each labeled with a `"result"` flag:

- `1` = supported by the reference
- `0` = unsupported by the reference

Your main task is to **evaluate only the unsupported subclaims (`"result": 0`)**, judging whether each is a *reasonable addition* given the specified readability level (*easy / intermediate / hard*).
The presence of supported items (`"result": 1`) helps you understand the full context of what is confirmed versus speculative, but you will not rate those.
Their inclusion enriches the training data diversity and realism.

---

### **READABILITY & ATTRIBUTION GUIDELINES**

| Level | Audience | Linguistic & Stylistic Profile | Allowable Additions |
| :-- | :-- | :-- | :-- |
| **Easy (FH 70–100)** | General public | Short, simple, concrete sentences | General explanations only; no new factual claims |
| **Intermediate (FH 50–69)** | Educated layperson | Moderate complexity and precision | Clarifying causal links aligned with the text |
| **Hard (FH 0–49)** | Professionals | Formal, technical, multi‑clause detail | Must strictly reflect source evidence |

---

### **Input**

Readability Level: {difficulty_level}

Reference Full Text:
{reference_full_text}

Generated Summary:
{generated_summary}

All Subclaims with Support Results:
{subclaims_json}

---

### **TASK INSTRUCTIONS**

For **each subclaim where** `"result": 0`, classify it as:

- `"reasonable"` – legitimate simplification aligned with readability needs
- `"partially_reasonable"` – harmless addition or neutral paraphrase
- `"unreasonable"` – misleading, speculative, or factually unsupported

Support your judgment with a 1–2 sentence justification per item.
Do **not** modify or comment on subclaims where `"result": 1`.

---

### **Output JSON Format**
```json
{{
  "evaluations": [
    {{
      "subclaim_id": ,
      "subclaim": "",
      "result": <0 or 1>,
      "reasonableness": "",
      "justification": ""
    }},
    ...
  ]
}}
```
"""


# =====================================================
# 4️⃣ Script: evaluate attribution reasonableness
# =====================================================
file_synth = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"
file_qwen_results = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/syn_attribution_resonability_check_100_gpt5_train_v2.json"

# Number of synthetic entries processed per run (first N of the dataset).
NUM_ENTRIES = 30
LEVELS = ("easy", "intermediate", "hard")


def _save(res):
    """Write the accumulated result list to save_path (pretty-printed JSON)."""
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(res, f, indent=2, ensure_ascii=False)


def main():
    """Run GPT reasonableness evaluation over the first NUM_ENTRIES entries.

    Resumes from a previous partial run (entries already in save_path are
    skipped), checkpoints every two new entries, and always saves at the end.
    """
    with open(file_synth, "r", encoding="utf-8") as f:
        synthetic_data = json.load(f)
    with open(file_qwen_results, "r", encoding="utf-8") as f:
        qwen3_32B_results = json.load(f)

    res = []
    if os.path.exists(save_path):
        with open(save_path, "r", encoding="utf-8") as f:
            res = json.load(f)
        print(f"🔁 Resuming from {len(res)} entries")

    # (id, level) pairs finished in a previous run — skip them below.
    existing = {(e["id"], e["difficulty_level"]) for e in res}

    for ind in tqdm.tqdm(range(NUM_ENTRIES)):
        entry = synthetic_data[ind]
        # Qwen verifier output is index-aligned with the synthetic data.
        subclaims_results = qwen3_32B_results[ind]["attribution"]["results"]
        subclaims_json = json.dumps(subclaims_results, indent=2, ensure_ascii=False)

        for level in LEVELS:
            if (entry["id"], level) in existing:
                print(f"⏭️ Skipping {entry['id']} ({level})")
                continue

            prompt = return_prompts_attribution_multi(
                entry["full_text"],
                entry["readability_versions"][level]["text"],
                subclaims_json,
                level,
            )
            try:
                response = openai_return(prompt)
                res.append({
                    "id": entry["id"],
                    "difficulty_level": level,
                    "response": response,
                })
                # Periodic checkpoint so a crash loses at most one entry.
                if len(res) % 2 == 0:
                    _save(res)
                    print(f"💾 Saved after {len(res)} entries")
            except Exception as e:  # best-effort: keep going on API errors
                print(f"❌ Error at index {ind}, level {level}: {e}")

    # BUG FIX: the original only saved when len(res) was even, so a final
    # odd-length batch was never persisted. Always save once at the end.
    _save(res)
    print(f"✅ Done — {len(res)} entries saved")


if __name__ == "__main__":
    main()