| | import os |
| | import json |
| | import tqdm |
| | from openai import OpenAI |
| |
|
| | |
| | |
| | |
| |
|
# Credentials live outside the repo in a local JSON file; expected to
# contain at least an "openai" key — TODO confirm file schema.
api_file = "/home/mshahidul/api_new.json"
with open(api_file, "r") as f:
    api_keys = json.load(f)
openai_api_key = api_keys["openai"]

# Module-level OpenAI client shared by every request in this script.
client = OpenAI(api_key=openai_api_key)
| |
|
| |
|
| | |
| | |
| | |
| |
|
def openai_return(prompt, model="gpt-5"):
    """Send a prompt to GPT and parse JSON."""
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    raw_text = completion.choices[0].message.content.strip()
    # Models often wrap JSON in markdown fences; strip them before parsing.
    cleaned_text = raw_text.replace("```json", "").replace("```", "").strip()
    try:
        parsed = json.loads(cleaned_text)
    except json.JSONDecodeError:
        # Best-effort fallback: hand back the raw (fence-stripped) text.
        print("⚠️ JSON parse failed — storing raw text.")
        return cleaned_text
    return parsed
| |
|
| |
|
| | |
| | |
| | |
| |
|
def return_prompts_attribution_multi(reference_full_text, generated_summary, subclaims_json, difficulty_level):
    """Build the GPT evaluation prompt for attribution reasonableness.

    Args:
        reference_full_text: Source document the summary was generated from.
        generated_summary: Summary text at the given readability level.
        subclaims_json: JSON string of subclaims, each with a 0/1 "result" flag.
        difficulty_level: One of "easy" / "intermediate" / "hard".

    Returns:
        The full prompt string instructing the model to rate each
        unsupported subclaim ("result": 0) as reasonable /
        partially_reasonable / unreasonable.

    Note: fixes malformed inline-code markers from the earlier version
    (`"result": 0"` -> `"result": 0`), which garbled the instruction text.
    """
    return f"""
### **SYSTEM / ROLE INSTRUCTION**

You are a **medical factuality and attribution evaluator**.
You will analyze all subclaims found in a generated summary, each labeled with a `"result"` flag:
- `1` = supported by the reference
- `0` = unsupported by the reference

Your main task is to **evaluate only the unsupported subclaims (`"result": 0`)**, judging whether each is a *reasonable addition* given the specified readability level (*easy / intermediate / hard*).

The presence of supported items (`"result": 1`) helps you understand the full context of what is confirmed versus speculative,
but you will not rate those. Their inclusion enriches the training data diversity and realism.

---

### **READABILITY & ATTRIBUTION GUIDELINES**

| Level | Audience | Linguistic & Stylistic Profile | Allowable Additions |
| :-- | :-- | :-- | :-- |
| **Easy (FH 70–100)** | General public | Short, simple, concrete sentences | General explanations only; no new factual claims |
| **Intermediate (FH 50–69)** | Educated layperson | Moderate complexity and precision | Clarifying causal links aligned with the text |
| **Hard (FH 0–49)** | Professionals | Formal, technical, multi‑clause detail | Must strictly reflect source evidence |

---

### **Input**
Readability Level: {difficulty_level}

Reference Full Text:
{reference_full_text}

Generated Summary:
{generated_summary}

All Subclaims with Support Results:
{subclaims_json}

---

### **TASK INSTRUCTIONS**

For **each subclaim where** `"result": 0`, classify it as:

- `"reasonable"` – legitimate simplification aligned with readability needs
- `"partially_reasonable"` – harmless addition or neutral paraphrase
- `"unreasonable"` – misleading, speculative, or factually unsupported

Support your judgment with a 1–2 sentence justification per item.

Do **not** modify or comment on subclaims where `"result": 1`.

---

### **Output JSON Format**

```json
{{
  "evaluations": [
    {{
      "subclaim_id": <id>,
      "subclaim": "<verbatim_subclaim>",
      "result": <0 or 1>,
      "reasonableness": "<reasonable | partially_reasonable | unreasonable | not_applicable>",
      "justification": "<short justification for result=0; for result=1, just write 'supported, no evaluation required'>"
    }},
    ...
  ]
}}
```
"""
# Input/output locations for this evaluation run.
file_synth = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"
file_qwen_results = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/syn_attribution_resonability_check_100_gpt5_train_v2.json"

# Load the synthetic entries and the matching Qwen3-32B verifier output.
with open(file_synth, 'r') as f:
    synthetic_data = json.load(f)
with open(file_qwen_results, 'r') as f:
    qwen3_32B_results = json.load(f)

# Resume support: pick up previously saved evaluations if they exist.
res = []
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
    print(f"🔁 Resuming from {len(res)} entries")

# (id, level) pairs already evaluated, so the loop can skip them.
existing = {(e["id"], e["difficulty_level"]) for e in res}
| |
|
# Evaluate each entry at all three readability levels, skipping pairs
# already present in the resume file. Capped at 30 entries, but guarded
# against shorter datasets to avoid an IndexError.
for ind in tqdm.tqdm(range(min(30, len(synthetic_data)))):
    entry = synthetic_data[ind]
    # Qwen verifier results are aligned with synthetic_data by index.
    subclaims_results = qwen3_32B_results[ind]['attribution']['results']
    subclaims_json = json.dumps(subclaims_results, indent=2, ensure_ascii=False)
    for level in ["easy", "intermediate", "hard"]:
        if (entry["id"], level) in existing:
            print(f"⏭️ Skipping {entry['id']} ({level})")
            continue

        ref_full_text = entry["full_text"]
        generated_summary = entry["readability_versions"][level]["text"]

        prompt = return_prompts_attribution_multi(
            ref_full_text,
            generated_summary,
            subclaims_json,
            level
        )

        try:
            response = openai_return(prompt)
            res.append({
                "id": entry["id"],
                "difficulty_level": level,
                "response": response
            })

            # Periodic checkpoint so progress survives a crash mid-run.
            if len(res) % 2 == 0:
                with open(save_path, 'w') as f:
                    json.dump(res, f, indent=2, ensure_ascii=False)
                print(f"💾 Saved after {len(res)} entries")

        except Exception as e:
            # Best-effort: log and continue so one bad entry doesn't kill the run.
            print(f"❌ Error at index {ind}, level {level}: {e}")

# Bug fix: the periodic checkpoint only fires on even counts, so a run
# ending on an odd count would silently drop the last entry. Always
# persist once more after the loop completes.
with open(save_path, 'w') as f:
    json.dump(res, f, indent=2, ensure_ascii=False)
print(f"💾 Final save: {len(res)} entries")
| |
|
| |
|
| |
|