import os
import json

from openai import OpenAI
import tqdm

# Location of the JSON file holding the API key under "openai_api_key".
API_KEY_PATH = '/home/mshahidul/api.json'


def _load_api_key(path=API_KEY_PATH):
    """Return the value stored under "openai_api_key" in the JSON file at *path*."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)['openai_api_key']


# Initialize client (the API key is read from the JSON file above, not from
# the OPENAI_API_KEY environment variable).
client = OpenAI(api_key=_load_api_key())

# System prompts (from Appendix B in your proposal)
# NOTE(review): the prompt text is reproduced exactly as found in this file;
# if the prompts were meant to contain line breaks between the bullet items,
# those were not present here -- confirm against the proposal.
PROMPTS = {
    "B1": (
        "You are a summarization assistant trained to rewrite medical case "
        "reports' expert summaries for readers at an elementary school level "
        "(ages 5–11, FKGL 1.0–6.0). "
        "Your job is to generate summaries that are: "
        "* Kind and empathetic "
        "* Clear, simple, and understandable for readers without medical "
        "background "
        "* Accurate and faithful to the source "
        "General Instructions: "
        "- Assume the reader is an elementary school student with no medical "
        "knowledge. "
        "- Avoid medical jargon. If it must appear, explain it in very simple "
        "terms. "
        "- Use short sentences and everyday words. "
        "- Reassure the reader when findings are normal; explain gently if "
        "something is abnormal. "
        "- Do not overwhelm with detail; focus on main ideas. "
        "- Never use emojis. "
        "- Do not explain pronunciation. "
    ),
    "B2": (
        "You are a summarization assistant trained to rewrite medical case "
        "reports' expert summaries for readers at a middle or high school "
        "level (ages 11–17, FKGL 6.0–12.0). "
        "Your job is to generate summaries that are: "
        "* Kind and empathetic "
        "* Clear and understandable for readers with only general "
        "school-level science "
        "* Accurate and faithful to the source "
        "General Instructions: "
        "- Assume the reader is a secondary school student with limited "
        "medical knowledge. "
        "- Avoid unnecessary jargon. If a medical term is included, provide a "
        "brief, clear explanation. "
        "- Write in a style appropriate for middle/high school reading "
        "comprehension. "
        "- Present abnormal findings with calm, explanatory language, "
        "including possible next steps. "
        "- Keep the tone warm, patient, and caring. "
        "- Never use emojis. "
        "- Do not explain pronunciation. \n"
    ),
    "B3": (
        "You are a summarization assistant trained to rewrite medical case "
        "reports' expert summaries for readers at a college or higher "
        "education level (ages 17+, FKGL 12.0+). "
        "Your job is to generate summaries that are: "
        "* Kind and empathetic "
        "* Clear and precise, while remaining faithful to the source "
        "* Appropriate for readers with advanced literacy but no formal "
        "medical training "
        "General Instructions: "
        "- Assume the reader is a college-level reader with no medical "
        "specialization. "
        "- Medical terms can be used if they are commonly understood or "
        "explained briefly. "
        "- Provide a more detailed and structured summary than for younger "
        "readers. "
        "- Clearly distinguish between normal and abnormal findings, and "
        "outline potential implications or next steps. "
        "- Maintain an empathetic and respectful tone at all times. "
        "- Never use emojis. "
        "- Do not explain pronunciation. "
    ),
}

# Readability bands generated for every article, in output order.
BANDS = ("B1", "B2", "B3")


def generate_synthetic_summary(article, gold_summary, band):
    """Call GPT-5-mini to generate a synthetic summary for a readability band.

    Args:
        article: Full text of the source article.
        gold_summary: Reference summary of the article.
        band: Key into ``PROMPTS`` ("B1", "B2" or "B3") selecting the system
            prompt; it is also named in the user message.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.

    Raises:
        KeyError: If *band* is not a key of ``PROMPTS``.
    """
    # NOTE(review): the user-prompt layout is reproduced exactly as found in
    # this file (fields separated by single spaces); confirm whether line
    # breaks between the fields were intended.
    prompt = (
        f"Article: {article} "
        f"Gold Summary: {gold_summary} "
        f"Task: Please generate a summary at readability band {band}. \n"
    )
    response = client.chat.completions.create(
        model="gpt-5-mini",
        messages=[
            {"role": "system", "content": PROMPTS[band]},
            {"role": "user", "content": prompt},
        ],
        temperature=1.0,
    )
    return response.choices[0].message.content.strip()


def _save_results(results, output_path):
    """Write *results* to *output_path* as UTF-8 JSON via a temp file.

    The data is written to ``<output_path>.tmp`` and then moved over the
    target with ``os.replace`` so that an interruption during the write does
    not leave a truncated checkpoint behind.
    """
    tmp_path = f"{output_path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
    os.replace(tmp_path, output_path)


def build_synthetic_dataset(input_path, output_path, max_samples=None):
    """Generate a synthetic dataset from a JSON file of articles.

    The input file is parsed as a single JSON document and iterated; each
    item must provide ``"fulltext"`` and ``"summary"``. For every article not
    already present in the output, one summary per band in ``BANDS`` is
    generated and a record ``{"article", "gold_summary", "synthetic_summary"}``
    is appended. If *output_path* already exists its records are loaded
    first, so an interrupted run resumes where it stopped.

    Args:
        input_path: Path to the input JSON file.
        output_path: Path of the JSON file to create or extend.
        max_samples: Stop once the output holds this many records, counting
            records loaded from an existing output file. ``None`` or ``0``
            means no limit.
    """
    results = []
    if os.path.exists(output_path):
        with open(output_path, "r", encoding="utf-8") as f:
            results = json.load(f)

    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Set of already-processed articles: O(1) duplicate checks instead of
    # rebuilding a list of all stored articles for every input item.
    seen_articles = {record["article"] for record in results}

    for item in tqdm.tqdm(data):
        if max_samples and len(results) >= max_samples:
            break

        article, gold = item["fulltext"], item["summary"]
        if article in seen_articles:
            continue

        synthetic = {
            band: generate_synthetic_summary(article, gold, band)
            for band in BANDS
        }
        results.append({
            "article": article,
            "gold_summary": gold,
            "synthetic_summary": synthetic,
        })
        seen_articles.add(article)

        # Checkpoint every five records so a crash loses little work.
        if len(results) % 5 == 0:
            print(f"Processed {len(results)} samples, saving progress...")
            _save_results(results, output_path)

    _save_results(results, output_path)


if __name__ == "__main__":
    # Example usage:
    lang = "es"  # Change to desired language
    path = f"/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_{lang}.json"
    build_synthetic_dataset(
        path,
        f"/home/mshahidul/readctrl/generating_data/{lang}_synthetic.json",
        max_samples=100,
    )