readctrl / code /old /synthetic_data_generation.py
shahidul034's picture
Add files using upload-large-folder tool
c7a6fe6 verified
import os
import json
from openai import OpenAI
import tqdm
# Initialize the OpenAI client. The API key is read from a local JSON file
# (field "openai_api_key"), not from the OPENAI_API_KEY environment variable.
# A context manager is used so the key file is closed right after reading.
with open('/home/mshahidul/api.json', 'r') as _api_key_file:
    client = OpenAI(api_key=json.load(_api_key_file)['openai_api_key'])
# System prompts keyed by readability band ("B1", "B2", "B3"); the text comes
# from Appendix B of the proposal. Each value is sent verbatim as the system
# message, so any whitespace inside the triple-quoted strings is part of the
# prompt.
PROMPTS = {
# B1: elementary school level (ages 5–11, FKGL 1.0–6.0).
"B1": """You are a summarization assistant trained to rewrite medical case reports' expert summaries
for readers at an elementary school level (ages 5–11, FKGL 1.0–6.0).
Your job is to generate summaries that are:
* Kind and empathetic
* Clear, simple, and understandable for readers without medical background
* Accurate and faithful to the source
General Instructions:
- Assume the reader is an elementary school student with no medical knowledge.
- Avoid medical jargon. If it must appear, explain it in very simple terms.
- Use short sentences and everyday words.
- Reassure the reader when findings are normal; explain gently if something is abnormal.
- Do not overwhelm with detail; focus on main ideas.
- Never use emojis.
- Do not explain pronunciation.
""",
# B2: middle or high school level (ages 11–17, FKGL 6.0–12.0).
"B2": """You are a summarization assistant trained to rewrite medical case reports' expert summaries for readers at a middle or high school level (ages 11–17, FKGL 6.0–12.0).
Your job is to generate summaries that are:
* Kind and empathetic
* Clear and understandable for readers with only general school-level science
* Accurate and faithful to the source
General Instructions:
- Assume the reader is a secondary school student with limited medical knowledge.
- Avoid unnecessary jargon. If a medical term is included, provide a brief, clear explanation.
- Write in a style appropriate for middle/high school reading comprehension.
- Present abnormal findings with calm, explanatory language, including possible next steps.
- Keep the tone warm, patient, and caring.
- Never use emojis.
- Do not explain pronunciation.
""",
# B3: college or higher education level (ages 17+, FKGL 12.0+).
"B3": """You are a summarization assistant trained to rewrite medical case reports' expert summaries
for readers at a college or higher education level (ages 17+, FKGL 12.0+).
Your job is to generate summaries that are:
* Kind and empathetic
* Clear and precise, while remaining faithful to the source
* Appropriate for readers with advanced literacy but no formal medical training
General Instructions:
- Assume the reader is a college-level reader with no medical specialization.
- Medical terms can be used if they are commonly understood or explained briefly.
- Provide a more detailed and structured summary than for younger readers.
- Clearly distinguish between normal and abnormal findings, and outline potential implications or next steps.
- Maintain an empathetic and respectful tone at all times.
- Never use emojis.
- Do not explain pronunciation.
"""
}
def generate_synthetic_summary(article, gold_summary, band, model="gpt-5-mini"):
    """Generate a synthetic summary of an article for one readability band.

    The article and its gold summary are sent as the user message; the system
    message is the prompt registered for ``band`` in ``PROMPTS``.

    Args:
        article: Full text of the source article.
        gold_summary: Reference summary of the article.
        band: Readability band key; must be a key of ``PROMPTS``
            ("B1", "B2" or "B3").
        model: Name of the chat model to call. Defaults to "gpt-5-mini".

    Returns:
        The generated summary text with surrounding whitespace stripped.

    Raises:
        KeyError: If ``band`` is not a key of ``PROMPTS``.
        RuntimeError: If the response contains no text content.
    """
    # Look the band up first so an unknown band fails before any API call.
    system_prompt = PROMPTS[band]
    prompt = (
        f"Article:\n{article}\n"
        f"Gold Summary:\n{gold_summary}\n"
        "Task:\n"
        f"Please generate a summary at readability band {band}.\n"
    )
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        temperature=1.0,
    )
    choice = response.choices[0]
    content = choice.message.content
    if content is None:
        # The API may return a message without text (e.g. a refusal or a
        # truncated generation); fail with a clear message instead of an
        # AttributeError on None.strip().
        raise RuntimeError(
            f"Model returned no text content for band {band!r} "
            f"(finish_reason={choice.finish_reason!r})"
        )
    return content.strip()
def _write_results(results, output_path):
    """Write ``results`` to ``output_path`` as indented UTF-8 JSON."""
    # ensure_ascii=False emits raw non-ASCII characters, so the file encoding
    # is pinned to UTF-8 rather than left to the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)


def build_synthetic_dataset(input_path, output_path, max_samples=None):
    """Generate B1/B2/B3 synthetic summaries for the articles in a JSON file.

    The input file is a JSON array of objects with ``"fulltext"`` and
    ``"summary"`` keys. If ``output_path`` already exists, its records are
    loaded first and articles already present there are skipped, so an
    interrupted run can be resumed. Progress is written to ``output_path``
    whenever the number of records is a multiple of 5, and once more at the end.

    Args:
        input_path: Path to the input JSON file.
        output_path: Path of the JSON file to create or extend. Each record
            has the keys ``"article"``, ``"gold_summary"`` and
            ``"synthetic_summary"`` (a dict keyed by band).
        max_samples: Stop once the output holds this many records (records
            loaded from an existing output file count). ``None`` or ``0``
            means no limit.

    Returns:
        None. The results are written to ``output_path``.
    """
    results = []
    if os.path.exists(output_path):
        with open(output_path, "r", encoding="utf-8") as f:
            results = json.load(f)

    # Track processed articles in a set so the duplicate check is O(1) per
    # item instead of rebuilding a list on every iteration.
    # Assumes article texts are strings (hashable).
    seen_articles = {record["article"] for record in results}

    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for item in tqdm.tqdm(data):
        if max_samples and len(results) >= max_samples:
            break
        article, gold = item["fulltext"], item["summary"]
        if article in seen_articles:
            continue

        synthetic_by_band = {}
        for band in ("B1", "B2", "B3"):
            synthetic_by_band[band] = generate_synthetic_summary(article, gold, band)

        results.append({
            "article": article,
            "gold_summary": gold,
            "synthetic_summary": synthetic_by_band,
        })
        seen_articles.add(article)

        # Periodic checkpoint so a crash loses at most a few samples.
        if len(results) % 5 == 0:
            print(f"Processed {len(results)} samples, saving progress...")
            _write_results(results, output_path)

    _write_results(results, output_path)
# Example usage: run the generator for one language, stopping once the output
# file holds 100 samples.
lang = "es"  # Change to desired language
path = f"/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_{lang}.json"
build_synthetic_dataset(
    input_path=path,
    output_path=f"/home/mshahidul/readctrl/generating_data/{lang}_synthetic.json",
    max_samples=100,
)