# File size: 3,955 Bytes (extraction artifact, kept as a comment)
from openai import OpenAI
import json, os


def _read_json(path):
    """Load and return the JSON content of *path*."""
    with open(path, "r") as fh:
        return json.load(fh)


# Prompt template containing an {{INPUT_TEXT}} placeholder.
with open("/home/mshahidul/readctrl/prompts/syn_dataset_subclaims_support_check_v3.txt", "r") as f:
    prompt_template = f.read()

# Source data: translated clinical texts.
data_path = "/home/mshahidul/readctrl/data/translated_data/multiclinsum_gs_train_en2bn_gemma_merged.json"
input_items = _read_json(data_path)

# OpenAI credentials live in a local JSON file keyed by provider name.
api_file = "/home/mshahidul/api_new.json"
api_keys = _read_json(api_file)
openai_api_key = api_keys["openai"]
client = OpenAI(api_key=openai_api_key)

# USD cost per token (GPT-5 list pricing at time of writing).
INPUT_COST_PER_TOKEN = 1.25 / 1_000_000
OUTPUT_COST_PER_TOKEN = 10 / 1_000_000
def openai_return(prompt, model="gpt-5"):
    """Send *prompt* to the chat-completions API; return ``(parsed, usage)``.

    Parameters
    ----------
    prompt : str
        Fully rendered user prompt.
    model : str
        Chat model name (default ``"gpt-5"``).

    Returns
    -------
    parsed : dict | list | str
        The model output parsed as JSON, or the cleaned raw text when the
        output is not valid JSON.
    usage : dict | None
        ``{"prompt_tokens", "completion_tokens", "total_tokens"}`` when the
        API reports usage, else ``None``.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    # content can be None (e.g. refusals) — guard before string operations.
    content = (response.choices[0].message.content or "").strip()

    # Strip only a *surrounding* markdown code fence. A blanket
    # replace("```", "") would corrupt backticks appearing inside JSON
    # string values.
    if content.startswith("```"):
        # Drop the opening fence line (``` or ```json).
        content = content.split("\n", 1)[1] if "\n" in content else ""
        stripped = content.rstrip()
        if stripped.endswith("```"):
            content = stripped[:-3]
    cleaned = content.strip()

    usage = None
    if getattr(response, "usage", None) is not None:
        usage = {
            "prompt_tokens": getattr(response.usage, "prompt_tokens", 0) or 0,
            "completion_tokens": getattr(response.usage, "completion_tokens", 0) or 0,
            "total_tokens": getattr(response.usage, "total_tokens", 0) or 0,
        }

    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        print("⚠️ JSON parse failed — storing raw text.")
        parsed = cleaned
    return parsed, usage
import tqdm

save_path = "/home/mshahidul/readctrl/data/finetuning_data/finetune_dataset_subclaim_support_bn.json"

# Resume support: reload previous results so a restart does not lose work.
res = []
if os.path.exists(save_path):
    with open(save_path, "r") as f:
        res = json.load(f)

# Ids already processed in a previous run. Without this, a restart would
# re-query every item, appending duplicates and re-spending API cost.
done_ids = {r.get("id") for r in res if r.get("id") is not None}

total_prompt_tokens = 0
total_completion_tokens = 0

for i, item in enumerate(tqdm.tqdm(input_items)):
    item_id = item.get("id")
    if item_id is not None and item_id in done_ids:
        continue

    input_text = item.get("translated_fulltext")
    if not input_text:
        # A missing/empty source text would crash prompt construction
        # (str.replace on None) — skip it with a warning instead.
        print(f"⚠️ Skipping item {item_id}: no translated_fulltext.")
        continue

    # Fill the INPUT_TEXT placeholder in the prompt template.
    prompt = prompt_template.replace("{{INPUT_TEXT}}", input_text)
    sample, usage = openai_return(prompt, model="gpt-5")

    # Keep track of which source record this sample came from.
    res.append(
        {
            "id": item_id,
            "input_text": input_text,
            "output": sample,
        }
    )

    prompt_tokens = 0
    completion_tokens = 0
    if usage is not None:
        prompt_tokens = usage.get("prompt_tokens", 0) or 0
        completion_tokens = usage.get("completion_tokens", 0) or 0
    total_prompt_tokens += prompt_tokens
    total_completion_tokens += completion_tokens

    input_cost = prompt_tokens * INPUT_COST_PER_TOKEN
    output_cost = completion_tokens * OUTPUT_COST_PER_TOKEN
    total_cost = input_cost + output_cost
    print(
        f"Run {i+1}: prompt_tokens={prompt_tokens}, "
        f"completion_tokens={completion_tokens}, "
        f"input_cost=${input_cost:.6f}, "
        f"output_cost=${output_cost:.6f}, "
        f"total_cost=${total_cost:.6f}"
    )

    # Checkpoint every second sample so progress survives interruption.
    if len(res) % 2 == 0:
        with open(save_path, "w") as f:
            json.dump(res, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(res)} samples so far.")

# Final save and overall cost summary.
with open(save_path, "w") as f:
    json.dump(res, f, indent=2, ensure_ascii=False)

overall_input_cost = total_prompt_tokens * INPUT_COST_PER_TOKEN
overall_output_cost = total_completion_tokens * OUTPUT_COST_PER_TOKEN
overall_total_cost = overall_input_cost + overall_output_cost
print(
    f"Total prompt_tokens={total_prompt_tokens}, "
    f"total completion_tokens={total_completion_tokens}, "
    f"overall_input_cost=${overall_input_cost:.6f}, "
    f"overall_output_cost=${overall_output_cost:.6f}, "
    f"overall_total_cost=${overall_total_cost:.6f}"
)