"""Build a subclaim-support fine-tuning dataset by querying GPT-5 over
translated clinical texts, with per-run cost tracking and crash-safe
checkpointing to disk."""

import json
import os

import tqdm
from openai import OpenAI

# Prompt template with placeholder for INPUT_TEXT
with open("/home/mshahidul/readctrl/prompts/syn_dataset_subclaims_support_check_v3.txt", "r") as f:
    prompt_template = f.read()

# Source data: translated clinical texts
data_path = "/home/mshahidul/readctrl/data/translated_data/multiclinsum_gs_train_en2bn_gemma_merged.json"
with open(data_path, "r") as f:
    input_items = json.load(f)

api_file = "/home/mshahidul/api_new.json"
with open(api_file, "r") as f:
    api_keys = json.load(f)
openai_api_key = api_keys["openai"]

client = OpenAI(api_key=openai_api_key)

# USD cost per token (GPT-5 list pricing: $1.25 / 1M input, $10 / 1M output)
INPUT_COST_PER_TOKEN = 1.25 / 1_000_000
OUTPUT_COST_PER_TOKEN = 10 / 1_000_000


def openai_return(prompt, model="gpt-5"):
    """Send a prompt to GPT and parse JSON and return usage.

    Returns:
        (parsed, usage): ``parsed`` is the JSON-decoded model output, or the
        raw cleaned text if decoding fails; ``usage`` is a dict with
        ``prompt_tokens`` / ``completion_tokens`` / ``total_tokens`` counts,
        or None when the API response carries no usage object.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    content = response.choices[0].message.content.strip()
    # Strip markdown code fences the model sometimes wraps JSON in.
    cleaned = content.replace("```json", "").replace("```", "").strip()

    usage = None
    if getattr(response, "usage", None) is not None:
        # `or 0` guards against the SDK reporting None for a count.
        usage = {
            "prompt_tokens": getattr(response.usage, "prompt_tokens", 0) or 0,
            "completion_tokens": getattr(response.usage, "completion_tokens", 0) or 0,
            "total_tokens": getattr(response.usage, "total_tokens", 0) or 0,
        }

    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        print("⚠️ JSON parse failed — storing raw text.")
        parsed = cleaned
    return parsed, usage


save_path = "/home/mshahidul/readctrl/data/finetuning_data/finetune_dataset_subclaim_support_bn.json"
res = []
if os.path.exists(save_path):
    with open(save_path, "r") as f:
        res = json.load(f)

total_prompt_tokens = 0
total_completion_tokens = 0

# BUG FIX: previously the loop always restarted from item 0 even though `res`
# was reloaded from a partial save file, so resuming duplicated every
# already-processed record. Skip the items completed in a prior run.
# (Assumes `res` entries correspond 1:1, in order, to `input_items` —
# which holds because records are appended sequentially below.)
start = len(res)
for i, item in enumerate(
    tqdm.tqdm(input_items[start:], initial=start, total=len(input_items)),
    start=start,
):
    input_text = item.get("translated_fulltext")
    if not input_text:
        # BUG FIX: item.get(...) may return None, which would crash
        # str.replace below — skip records without a translation.
        print(f"⚠️ Skipping item {item.get('id')}: missing translated_fulltext.")
        continue

    # Fill the INPUT_TEXT placeholder in the prompt template
    prompt = prompt_template.replace("{{INPUT_TEXT}}", input_text)
    sample, usage = openai_return(prompt, model="gpt-5")

    # Keep track of which source record this sample came from
    res.append(
        {
            "id": item.get("id"),
            "input_text": input_text,
            "output": sample,
        }
    )

    prompt_tokens = 0
    completion_tokens = 0
    if usage is not None:
        prompt_tokens = usage.get("prompt_tokens", 0) or 0
        completion_tokens = usage.get("completion_tokens", 0) or 0
    total_prompt_tokens += prompt_tokens
    total_completion_tokens += completion_tokens

    input_cost = prompt_tokens * INPUT_COST_PER_TOKEN
    output_cost = completion_tokens * OUTPUT_COST_PER_TOKEN
    total_cost = input_cost + output_cost
    print(
        f"Run {i+1}: prompt_tokens={prompt_tokens}, "
        f"completion_tokens={completion_tokens}, "
        f"input_cost=${input_cost:.6f}, "
        f"output_cost=${output_cost:.6f}, "
        f"total_cost=${total_cost:.6f}"
    )

    # Checkpoint every other sample so a crash loses at most one record.
    if len(res) % 2 == 0:
        with open(save_path, "w") as f:
            json.dump(res, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(res)} samples so far.")

# Final save — catches an odd-count tail missed by the periodic checkpoint.
with open(save_path, "w") as f:
    json.dump(res, f, indent=2, ensure_ascii=False)

overall_input_cost = total_prompt_tokens * INPUT_COST_PER_TOKEN
overall_output_cost = total_completion_tokens * OUTPUT_COST_PER_TOKEN
overall_total_cost = overall_input_cost + overall_output_cost
print(
    f"Total prompt_tokens={total_prompt_tokens}, "
    f"total completion_tokens={total_completion_tokens}, "
    f"overall_input_cost=${overall_input_cost:.6f}, "
    f"overall_output_cost=${overall_output_cost:.6f}, "
    f"overall_total_cost=${overall_total_cost:.6f}"
)