| import os |
| import json |
| import tqdm |
| import argparse |
| from openai import OpenAI |
|
|
| |
| |
| |
| |
# Model identifier passed as `model=` on every chat-completion request.
MODEL_NAME = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims_BF16_merged"

# Base URL of the OpenAI-compatible endpoint the client talks to.
API_URL = "http://localhost:8015/v1"

# NOTE(review): presumably a placeholder because the local server does not
# validate API keys -- confirm against the server configuration.
API_KEY = "EMPTY"

# Single shared client, reused by every inference call in this module.
client = OpenAI(api_key=API_KEY, base_url=API_URL)
|
|
| |
| |
| |
def extraction_prompt(medical_text: str) -> str:
    """Build the subclaim-extraction prompt for one piece of medical text.

    The prompt is the fixed instruction block, then ``medical_text`` on its
    own line under the "Medical Text:" heading, then the fixed description
    of the expected output format.

    Args:
        medical_text: Text to embed in the prompt.

    Returns:
        The complete prompt string.
    """
    # Fixed text placed before the document.
    instructions = """
You are an expert medical annotator. Your task is to extract granular, factual subclaims from medical text.
A subclaim is the smallest standalone factual unit that can be independently verified.
Instructions:
1. Read the provided medical text.
2. Break it into clear, objective, atomic subclaims.
3. Each subclaim must come directly from the text.
4. Do not add, guess, or infer information.
5. Each subclaim should be short, specific, and verifiable.
6. Return ONLY a Python-style list of strings.
Medical Text:
"""
    # Fixed text placed after the document.
    output_format = """
Return your output in JSON list format, like:
[
"subclaim 1",
"subclaim 2",
...
]
"""
    # An f-string (rather than `+`) keeps the original formatting semantics
    # for whatever object is passed in as `medical_text`.
    return f"{instructions}{medical_text}{output_format}"
|
|
| |
| |
| |
def infer_subclaims(medical_text: str, temperature: float = 0.2) -> "str | None":
    """Send the extraction prompt to the vLLM server and return the reply text.

    The prompt is built with ``extraction_prompt`` and sent as a single user
    message through the module-level ``client``.

    Args:
        medical_text: Text to extract subclaims from.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The reply text, keeping only what follows the last ``</think>``
        marker (if any) and with surrounding whitespace stripped. Returns
        ``None`` when the request fails or the reply has no text content;
        callers must be prepared for that.
    """
    final_prompt = extraction_prompt(medical_text)

    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "user", "content": final_prompt}
            ],
            max_tokens=1000,
            temperature=temperature,
            top_p=0.9,
            frequency_penalty=0.0,
            presence_penalty=0.0,
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Deliberately best-effort: a failed request must not abort a long
        # batch run, so report it and signal failure with None.
        print(f"Error during API call: {e}")
        return None

    if content is None:
        # A reply without text content is reported explicitly instead of
        # surfacing as an AttributeError from `.strip()`.
        print("Error during API call: response contained no message content")
        return None

    # Presumably the model may emit a reasoning section terminated by
    # `</think>`; only the text after the last marker is returned.
    return content.strip().split("</think>")[-1].strip()
|
|
| |
| |
| |
if __name__ == "__main__":
    # Batch driver: read a JSON list of items, run subclaim extraction on
    # every item whose "id" is not already present in the output file, and
    # (when saving is enabled) checkpoint the accumulated results to disk.
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True,
                        help="Path to the input JSON file containing medical texts.")
    args = parser.parse_args()

    INPUT_FILE = args.input_file
    file_name = os.path.basename(INPUT_FILE).split(".json")[0]

    SAVE_FOLDER = "/home/mshahidul/readctrl/data/extracting_subclaim"
    os.makedirs(SAVE_FOLDER, exist_ok=True)
    OUTPUT_FILE = os.path.join(SAVE_FOLDER, f"extracted_subclaims_{file_name}.json")

    # Explicit UTF-8: results are dumped with ensure_ascii=False, so the
    # files must not depend on the platform's default encoding.
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Resume support: reload earlier results so finished ids are skipped.
    result = []
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "r", encoding="utf-8") as f:
            try:
                result = json.load(f)
            except json.JSONDecodeError:
                # An unreadable (e.g. empty) output file means starting over.
                result = []

    existing_ids = {item["id"] for item in result}

    print(f"Starting inference on {len(data)} items using vLLM server...")

    # NOTE(review): saving is disabled here, so a run writes nothing to
    # disk. Set to True to persist results -- confirm the intended default.
    save = False

    def _write_results():
        """Persist `result` to OUTPUT_FILE, or do nothing when saving is off."""
        if not save:
            # Do not even open the file: opening with "w" truncates it and
            # would wipe the previously saved results used for resuming.
            return
        # Write to a temp file and swap it in, so an interrupted write
        # cannot leave a half-written checkpoint behind.
        tmp_path = OUTPUT_FILE + ".tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=4, ensure_ascii=False)
        os.replace(tmp_path, OUTPUT_FILE)

    for item in tqdm.tqdm(data):
        if item["id"] in existing_ids:
            continue

        medical_text = item.get("fulltext", "")

        # `extracted` is None when the API call failed; the entry is still
        # recorded, so that id is treated as done on a later resume.
        extracted = infer_subclaims(medical_text)

        result.append({
            "id": item["id"],
            "medical_text": medical_text,
            "subclaims": extracted,
            "summary": item.get("summary", "")
        })

        # Checkpoint whenever the accumulated result count is a multiple of 20.
        if len(result) % 20 == 0:
            _write_results()

    # Final write covers the items added since the last checkpoint.
    _write_results()

    if save:
        print(f"Extraction completed. Saved to {OUTPUT_FILE}")
    else:
        print(f"Extraction completed. Saving is disabled (save=False); nothing was written to {OUTPUT_FILE}")