| import argparse |
| import json |
| import os |
| import time |
| from pathlib import Path |
| from typing import List |
|
|
| import tqdm |
| from openai import OpenAI |
|
|
|
|
| |
| |
| |
def extraction_prompt(medical_text: str) -> str:
    """Build the user prompt that asks the model for atomic subclaims.

    Args:
        medical_text: Text to embed under the ``Medical Text:`` heading.

    Returns:
        The instruction prompt with ``medical_text`` inserted verbatim.
    """
    # The template contains exactly one replacement field; every other
    # character is passed through to the model unchanged.
    template = """
You are an expert medical annotator. Your task is to extract granular, factual subclaims from medical text.
A subclaim is the smallest standalone factual unit that can be independently verified.

Instructions:
1. Read the provided medical text.
2. Break it into clear, objective, atomic subclaims.
3. Each subclaim must come directly from the text. Do not infer or add information.
4. Keep subclaims short, non-overlapping, and de-duplicated.
5. Preserve numbers, units, and dates exactly as written.
6. If the text is empty, return an empty JSON list.
7. Return ONLY a valid JSON list of strings (no extra text).

Medical Text:
{medical_text}

Return your output in JSON list format:
[
"subclaim 1",
"subclaim 2"
]
"""
    return template.format(medical_text=medical_text)
|
|
|
|
def _load_openai_client(api_file: str = "/home/mshahidul/api_new.json") -> OpenAI:
    """Create an OpenAI client from a JSON credentials file.

    Args:
        api_file: Path to a JSON file holding an object with an ``"openai"``
            entry whose value is the API key. Defaults to the path this
            script has always used, so existing no-argument calls behave
            the same.

    Returns:
        An ``OpenAI`` client constructed with the key read from the file.

    Raises:
        OSError: If ``api_file`` cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the JSON object has no ``"openai"`` entry.
    """
    # Decode explicitly as UTF-8 rather than relying on the locale default.
    with open(api_file, "r", encoding="utf-8") as f:
        api_keys = json.load(f)
    return OpenAI(api_key=api_keys["openai"])
|
|
|
|
def _parse_json_list(text: str) -> List[str]:
    """Extract and decode the JSON list embedded in a model reply.

    Markdown code fences are removed, then everything from the first ``[``
    to the last ``]`` is decoded as JSON.

    Args:
        text: Raw reply text, possibly wrapped in code fences or prose.

    Returns:
        The decoded list.

    Raises:
        ValueError: If no bracketed span is present, if the span is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``), or if
            the decoded value is not a list.
    """
    unfenced = text.replace("```json", "").replace("```", "").strip()
    open_pos = unfenced.find("[")
    close_pos = unfenced.rfind("]")
    # A missing "[" or a last "]" that precedes the first "[" means there is
    # no candidate list to decode.
    if open_pos < 0 or close_pos < open_pos:
        raise ValueError("No JSON list found")
    candidate = unfenced[open_pos:close_pos + 1]
    decoded = json.loads(candidate)
    if isinstance(decoded, list):
        return decoded
    raise ValueError("Parsed JSON is not a list")
|
|
|
|
def infer_subclaims(
    medical_text: str,
    client: OpenAI,
    model: str = "gpt-5-mini",
    retries: int = 1,
) -> List[str]:
    """Ask the chat model to split ``medical_text`` into subclaims.

    Args:
        medical_text: Text to decompose. Empty or whitespace-only text
            short-circuits to an empty list without calling the API.
        client: Client whose ``chat.completions.create`` is invoked.
        model: Model name passed to the API.
        retries: Number of additional attempts after a failed one.

    Returns:
        The parsed list of subclaims. If every attempt fails, a
        single-element list ``["ERROR: <message>"]`` describing the last
        exception is returned instead of raising.
    """
    if not medical_text or not medical_text.strip():
        return []

    prompt = extraction_prompt(medical_text)
    conversation = [
        {"role": "system", "content": "Return only a valid JSON list of strings."},
        {"role": "user", "content": prompt},
    ]

    attempts_left = retries
    while True:
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=conversation,
            )
            reply = completion.choices[0].message.content.strip()
            return _parse_json_list(reply)
        except Exception as err:
            # Any failure (API error, malformed reply, parse error) is
            # retried after a short pause until the budget is used up.
            if attempts_left <= 0:
                return [f"ERROR: {str(err)}"]
            time.sleep(1.5)
            attempts_left -= 1
|
|
|
|
| |
| |
| |
# Prefix of the single-element list that infer_subclaims returns once its
# retries are exhausted.
_ERROR_PREFIX = "ERROR: "

# Number of input items handled between intermediate checkpoint writes.
_CHECKPOINT_EVERY = 10


def _item_key(obj: dict) -> str:
    """Return the resume key for an input item or a stored entry.

    Preference order: ``index``, then ``id``, then ``doc_id`` joined with
    ``label``, then whichever of ``doc_id`` / ``label`` is truthy, and
    finally the empty string.
    """
    for field in ("index", "id"):
        value = obj.get(field)
        if value is not None:
            return str(value)
    doc_id = obj.get("doc_id")
    label = obj.get("label")
    if doc_id is not None and label is not None:
        return f"{doc_id}_{label}"
    return str(doc_id or label or "")


def _is_usable_result(subclaims) -> bool:
    """Tell whether a cached value is a finished, successful extraction.

    Anything that is not a list is unusable. A one-element list holding an
    ``"ERROR: ..."`` string is the failure marker produced by
    ``infer_subclaims`` and is also unusable, so the item is retried on the
    next run instead of keeping the error forever.
    """
    if not isinstance(subclaims, list):
        return False
    is_error_marker = (
        len(subclaims) == 1
        and isinstance(subclaims[0], str)
        and subclaims[0].startswith(_ERROR_PREFIX)
    )
    return not is_error_marker


def _cached_or_infer(existing_entry, field: str, text, client, model: str, retries: int):
    """Reuse ``existing_entry[field]`` when usable, otherwise call the model."""
    cached = existing_entry.get(field) if existing_entry else None
    if _is_usable_result(cached):
        return cached
    return infer_subclaims(text, client, model=model, retries=retries)


def _diff_label_subclaims(existing_entry, diff_label_texts, client, model: str):
    """Extract subclaims for ``diff_label_texts``, reusing cached results.

    When ``diff_label_texts`` is a dict, the result is a dict with one list
    of subclaims per label; only labels without a usable cached list are
    sent to the model. Otherwise the value is treated as a single text and
    the result is one list.
    """
    if not isinstance(diff_label_texts, dict):
        return _cached_or_infer(
            existing_entry, "diff_label_subclaims", diff_label_texts, client, model, retries=1
        )

    cached = existing_entry.get("diff_label_subclaims") if existing_entry else None
    # Start from a copy of the cached mapping; a cached value of any other
    # type cannot hold per-label results and is discarded.
    per_label = dict(cached) if isinstance(cached, dict) else {}
    for label, text in diff_label_texts.items():
        if not _is_usable_result(per_label.get(label)):
            per_label[label] = infer_subclaims(text, client, model=model, retries=1)
    return per_label


def _build_entry(item: dict, f_sub, s_sub, diff_label_texts, diff_label_subclaims) -> dict:
    """Assemble the output record for one input item.

    ``index`` and ``id`` are carried over when the input has them so that
    ``_item_key`` yields the same key for the stored entry as for the input
    item; without them a resumed run could not match stored entries.
    """
    entry = {field: item[field] for field in ("index", "id") if item.get(field) is not None}
    entry.update(
        {
            "doc_id": item.get("doc_id"),
            "label": item.get("label"),
            "fulltext": item.get("fulltext", ""),
            "fulltext_subclaims": f_sub,
            "summary": item.get("summary", ""),
            "summary_subclaims": s_sub,
            "diff_label_texts": diff_label_texts,
            "diff_label_subclaims": diff_label_subclaims,
        }
    )
    return entry


def _save_results(processed_data: dict, output_file: str) -> None:
    """Write all entries to ``output_file`` as a JSON list.

    The data is written to a sibling temporary file and then moved over the
    target with ``os.replace``, so an interruption during the write does not
    leave a truncated resume file behind.
    """
    tmp_file = f"{output_file}.tmp"
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(list(processed_data.values()), f, indent=4, ensure_ascii=False)
    os.replace(tmp_file, output_file)


def main() -> None:
    """Extract subclaims for every item of the input file, resuming if possible.

    Reads the input JSON list, reuses usable results from an existing output
    file, calls the model for everything else, and writes the combined
    results to ``<save_folder>/extracted_subclaims_<input name>.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_file",
        type=str,
        default="/home/mshahidul/readctrl/data/annotators_validate_data_(20_80)/combine/verified_combined_0-80.json",
    )
    parser.add_argument(
        "--save_folder",
        type=str,
        default="/home/mshahidul/readctrl/data/extracting_subclaim",
    )
    parser.add_argument("--model", type=str, default="gpt-5-mini")
    args = parser.parse_args()

    input_file = args.input_file
    save_folder = args.save_folder
    file_name = os.path.basename(input_file).split(".json")[0]
    output_file = os.path.join(save_folder, f"extracted_subclaims_{file_name}.json")

    Path(save_folder).mkdir(parents=True, exist_ok=True)
    client = _load_openai_client()

    with open(input_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Load results of an earlier run, if any, so finished work is not redone.
    previous = []
    if os.path.exists(output_file):
        with open(output_file, "r", encoding="utf-8") as f:
            previous = json.load(f)

    processed_data = {_item_key(entry): entry for entry in previous}

    for position, item in enumerate(tqdm.tqdm(data), start=1):
        item_id = _item_key(item)
        existing_entry = processed_data.get(item_id)

        f_sub = _cached_or_infer(
            existing_entry, "fulltext_subclaims", item.get("fulltext", ""),
            client, args.model, retries=2,
        )
        s_sub = _cached_or_infer(
            existing_entry, "summary_subclaims", item.get("summary", ""),
            client, args.model, retries=1,
        )

        diff_label_texts = item.get("diff_label_texts", "")
        diff_label_subclaims = _diff_label_subclaims(
            existing_entry, diff_label_texts, client, args.model
        )

        processed_data[item_id] = _build_entry(
            item, f_sub, s_sub, diff_label_texts, diff_label_subclaims
        )

        # Checkpoint on the loop position: the size of processed_data does
        # not grow for items that were already present from a previous run.
        if position % _CHECKPOINT_EVERY == 0:
            _save_results(processed_data, output_file)

    _save_results(processed_data, output_file)

    print(f"Extraction completed. File saved at: {output_file}")


if __name__ == "__main__":
    main()
|
|