import argparse
import json
import os
from typing import Optional

import tqdm
from openai import OpenAI
| |
|
| | |
| | |
| | |
| | |
# Merged Qwen3-32B checkpoint fine-tuned for subclaim extraction, served
# locally via a vLLM OpenAI-compatible endpoint (see API_URL below).
MODEL_NAME = "/home/mshahidul/readctrl_model/full_model/qwen3-32B_subclaims_BF16_merged"
API_URL = "http://localhost:8015/v1"
# vLLM does not enforce authentication; the SDK still requires a non-empty key.
API_KEY = "EMPTY"

# Single module-level client reused by every request in this run.
client = OpenAI(base_url=API_URL, api_key=API_KEY)
| |
|
| | |
| | |
| | |
def extraction_prompt(medical_text: str) -> str:
    """Build the instruction prompt asking the model to split *medical_text*
    into atomic, verifiable subclaims returned as a JSON list of strings."""
    # The template text is fixed; only the medical text is interpolated.
    return f"""
You are an expert medical annotator. Your task is to extract granular, factual subclaims from medical text.
A subclaim is the smallest standalone factual unit that can be independently verified.
Instructions:
1. Read the provided medical text.
2. Break it into clear, objective, atomic subclaims.
3. Each subclaim must come directly from the text.
4. Do not add, guess, or infer information.
5. Each subclaim should be short, specific, and verifiable.
6. Return ONLY a Python-style list of strings.
Medical Text:
{medical_text}
Return your output in JSON list format, like:
[
    "subclaim 1",
    "subclaim 2",
    ...
]
"""
| |
|
| | |
| | |
| | |
def infer_subclaims(medical_text: str, temperature: float = 0.2) -> Optional[str]:
    """Ask the vLLM server to extract subclaims from *medical_text*.

    Args:
        medical_text: Raw medical text to decompose into subclaims.
        temperature: Sampling temperature for generation (default 0.2).

    Returns:
        The model's generated text with any leading ``<think>...</think>``
        reasoning block removed, or ``None`` if the API call fails.
        (Original annotation said ``-> str``, but the error path has always
        returned ``None``; the annotation now matches the behavior.)
    """
    final_prompt = extraction_prompt(medical_text)

    # Keep the try body limited to the network call — the only statement
    # here that is expected to raise.
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "user", "content": final_prompt}
            ],
            max_tokens=1000,
            temperature=temperature,
            top_p=0.9,
            frequency_penalty=0.0,
            presence_penalty=0.0,
        )
    except Exception as e:
        # Best-effort batch processing: log and signal failure with None
        # rather than aborting the whole run.
        print(f"Error during API call: {e}")
        return None

    res = response.choices[0].message.content.strip()
    # Qwen3 may emit chain-of-thought wrapped in <think>...</think>;
    # keep only the text after the final closing tag.
    return res.split("</think>")[-1].strip()
| |
|
| | |
| | |
| | |
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file", type=str, required=True,
                        help="Path to the input JSON file containing medical texts.")
    args = parser.parse_args()

    INPUT_FILE = args.input_file
    # Output name is derived from the input file's base name (without ".json").
    file_name = os.path.basename(INPUT_FILE).split(".json")[0]

    SAVE_FOLDER = "/home/mshahidul/readctrl/data/extracting_subclaim"
    os.makedirs(SAVE_FOLDER, exist_ok=True)
    OUTPUT_FILE = os.path.join(SAVE_FOLDER, f"extracted_subclaims_{file_name}.json")

    with open(INPUT_FILE, "r") as f:
        data = json.load(f)

    # Resume support: reload previously saved results (tolerating a corrupt
    # or partially written checkpoint) and skip ids already processed.
    result = []
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, "r") as f:
            try:
                result = json.load(f)
            except json.JSONDecodeError:
                result = []

    existing_ids = {item["id"] for item in result}

    print(f"Starting inference on {len(data)} items using vLLM server...")

    for item in tqdm.tqdm(data):
        if item["id"] in existing_ids:
            continue

        medical_text = item.get("fulltext", "")
        extracted = infer_subclaims(medical_text)

        result.append({
            "id": item["id"],
            "medical_text": medical_text,
            "subclaims": extracted,
            "summary": item.get("summary", ""),
        })

        # BUGFIX: the original gated every json.dump on a `save` flag that was
        # initialized False and never set True, so nothing was ever written —
        # and each open(..., "w") truncated any existing checkpoint file.
        # Checkpoint unconditionally every 20 accumulated results so a crash
        # loses at most 20 items.
        if len(result) % 20 == 0:
            with open(OUTPUT_FILE, "w") as f:
                json.dump(result, f, indent=4, ensure_ascii=False)

    # Final save of everything accumulated (resumed + new) this run.
    with open(OUTPUT_FILE, "w") as f:
        json.dump(result, f, indent=4, ensure_ascii=False)

    print(f"Extraction completed. Saved to {OUTPUT_FILE}")