| | import json |
| | from pathlib import Path |
| |
|
| | |
# Input: synthetic subclaim records (each with a passage and labelled
# subclaims) produced by the extraction step — presumably the first 200
# examples, per the filename; confirm against the generating pipeline.
DATA_PATH = Path(
    "/home/mshahidul/readctrl/data/extracting_subclaim/synthetic_subclaims_first200.json"
)
# Output: conversation-formatted dataset consumed by SFT training for the
# support-check task.
OUTPUT_PATH = Path(
    "/home/mshahidul/readctrl/data/finetuning_data/dataset_for_sft_support_check_list_new.json"
)
| |
|
| |
|
def training_prompt(medical_text, subclaims, labels):
    """Build one SFT conversation record for the support-check task.

    The user turn embeds the passage plus a numbered list of subclaims
    inside an adjudication prompt; the assistant turn is the JSON-encoded
    list of gold labels. Returns ``{"conversations": (user, assistant)}``.
    """
    # Number the subclaims 1..N, one per line, for the prompt.
    enumerated = "\n".join(
        f"{position + 1}. {text}" for position, text in enumerate(subclaims)
    )

    adjudication_prompt = f"""
You are an expert medical adjudicator. Determine if the 'Medical Passage' contains the core factual information of each 'Subclaim', even if the passage uses simpler language or layperson terms.
Rules:
- Label 'supported' if the essential meaning is present.
- Label 'not_supported' only if the information is missing or contradicted.
Output: JSON array of strings ['supported', 'not_supported', ...]

Medical text:
{medical_text}

Subclaims:
{enumerated}
"""

    # Assistant target is the raw label list serialized as JSON.
    return {
        "conversations": (
            {"from": "user", "content": adjudication_prompt},
            {"from": "assistant", "content": json.dumps(labels, ensure_ascii=False)},
        )
    }
| |
|
| |
|
def load_conversation_dataset(data_path=DATA_PATH):
    """Load subclaim records and format them as SFT conversations.

    Reads the JSON file at *data_path*, and for every record keeps the
    generated passage plus its non-empty subclaim texts (a missing label
    defaults to "not_supported"). Records with no passage or no usable
    subclaims are dropped. Returns a list of training_prompt() dicts.
    """
    raw_data = json.loads(Path(data_path).read_text(encoding="utf-8"))

    dataset = []
    for record in raw_data:
        generated = record.get("generated", {})
        passage = generated.get("passage", "")

        claims, verdicts = [], []
        for entry in generated.get("subclaims", []):
            text = entry.get("claim_text", "").strip()
            if not text:
                continue  # skip blank/whitespace-only subclaims
            claims.append(text)
            verdicts.append(entry.get("label", "not_supported"))

        # Only keep records that have both a passage and at least one claim.
        if passage and claims:
            dataset.append(training_prompt(passage, claims, verdicts))

    return dataset
| |
|
| |
|
| | |
| | dataset_for_sft = load_conversation_dataset() |
| |
|
| | with OUTPUT_PATH.open("w", encoding="utf-8") as f: |
| | json.dump(dataset_for_sft, f, ensure_ascii=False, indent=2) |
| |
|
| | print(len(dataset_for_sft)) |
| | print(dataset_for_sft[0]) |