| import os |
| import json |
| from openai import OpenAI |
| import tqdm |
| |
# Load the OpenAI API key from the local credentials file. The file is read
# inside a ``with`` block so its handle is closed right away instead of being
# left for the garbage collector.
with open('/home/mshahidul/api.json', 'r') as _api_file:
    _api_config = json.load(_api_file)

# Shared client used for every chat-completion request in this script.
client = OpenAI(api_key=_api_config['openai_api_key'])
|
|
| |
# System prompts keyed by readability band. Each value is passed as the
# "system" message of the chat request for that band.
PROMPTS = {}

# B1: elementary school level (ages 5-11).
PROMPTS["B1"] = """You are a summarization assistant trained to rewrite medical case reports' expert summaries
for readers at an elementary school level (ages 5–11, FKGL 1.0–6.0).

Your job is to generate summaries that are:
* Kind and empathetic
* Clear, simple, and understandable for readers without medical background
* Accurate and faithful to the source

General Instructions:
- Assume the reader is an elementary school student with no medical knowledge.
- Avoid medical jargon. If it must appear, explain it in very simple terms.
- Use short sentences and everyday words.
- Reassure the reader when findings are normal; explain gently if something is abnormal.
- Do not overwhelm with detail; focus on main ideas.
- Never use emojis.
- Do not explain pronunciation.
"""

# B2: middle or high school level (ages 11-17).
PROMPTS["B2"] = """You are a summarization assistant trained to rewrite medical case reports' expert summaries for readers at a middle or high school level (ages 11–17, FKGL 6.0–12.0).

Your job is to generate summaries that are:
* Kind and empathetic
* Clear and understandable for readers with only general school-level science
* Accurate and faithful to the source

General Instructions:
- Assume the reader is a secondary school student with limited medical knowledge.
- Avoid unnecessary jargon. If a medical term is included, provide a brief, clear explanation.
- Write in a style appropriate for middle/high school reading comprehension.
- Present abnormal findings with calm, explanatory language, including possible next steps.
- Keep the tone warm, patient, and caring.
- Never use emojis.
- Do not explain pronunciation.
"""

# B3: college or higher education level (ages 17+).
PROMPTS["B3"] = """You are a summarization assistant trained to rewrite medical case reports' expert summaries
for readers at a college or higher education level (ages 17+, FKGL 12.0+).

Your job is to generate summaries that are:
* Kind and empathetic
* Clear and precise, while remaining faithful to the source
* Appropriate for readers with advanced literacy but no formal medical training

General Instructions:
- Assume the reader is a college-level reader with no medical specialization.
- Medical terms can be used if they are commonly understood or explained briefly.
- Provide a more detailed and structured summary than for younger readers.
- Clearly distinguish between normal and abnormal findings, and outline potential implications or next steps.
- Maintain an empathetic and respectful tone at all times.
- Never use emojis.
- Do not explain pronunciation.
"""
|
|
def generate_synthetic_summary(article, gold_summary, band):
    """Generate a summary of an article at the requested readability band.

    Sends the article and its gold summary to the ``gpt-5-mini`` chat model,
    using the ``PROMPTS`` entry for ``band`` as the system message.

    Args:
        article: Full text of the source article.
        gold_summary: Reference summary of the article.
        band: Key into ``PROMPTS`` (the script uses "B1", "B2" and "B3").

    Returns:
        The text of the first completion choice, with leading and trailing
        whitespace stripped.

    Raises:
        KeyError: If ``band`` is not a key of ``PROMPTS``.
        RuntimeError: If the reply carries no text content.
    """
    # Look the system prompt up first so an unknown band fails immediately.
    system_prompt = PROMPTS[band]

    prompt = f"""Article:
{article}

Gold Summary:
{gold_summary}

Task:
Please generate a summary at readability band {band}.
"""

    response = client.chat.completions.create(
        model="gpt-5-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        temperature=1.0,
    )

    # ``message.content`` is optional in the API response; without this check
    # a missing reply surfaces as an opaque AttributeError on ``.strip()``.
    content = response.choices[0].message.content
    if content is None:
        raise RuntimeError(
            f"Model returned no text content for readability band {band!r}"
        )
    return content.strip()
|
|
def _write_results(results, output_path):
    """Write ``results`` to ``output_path`` as JSON via a temp file and rename."""
    # Dump to a sibling temp file first, then swap it in, so an interruption
    # mid-write cannot truncate the progress file that a later run resumes from.
    tmp_path = f"{output_path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
    os.replace(tmp_path, output_path)


def build_synthetic_dataset(input_path, output_path, max_samples=None):
    """Generate B1/B2/B3 synthetic summaries for each article in a JSON file.

    The input is a single JSON document (read with ``json.load``, not JSONL)
    that iterates as records with ``"fulltext"`` and ``"summary"`` keys. If
    ``output_path`` already exists, its records are loaded first and any
    article already present there is skipped, so an interrupted run can be
    resumed. Progress is written to ``output_path`` every time the number of
    stored records reaches a multiple of 5, and once more at the end.

    Args:
        input_path: Path to the input JSON file.
        output_path: Path of the JSON file to create or extend.
        max_samples: Stop once this many records (including previously saved
            ones) are stored. ``None`` means no limit; ``0`` means generate
            nothing new.

    Each stored record has the keys ``"article"``, ``"gold_summary"`` and
    ``"synthetic_summary"`` (a dict mapping band name to generated text).
    """
    results = []
    if os.path.exists(output_path):
        with open(output_path, "r", encoding="utf-8") as f:
            results = json.load(f)

    # Output is written with ensure_ascii=False, so pin UTF-8 on every file
    # rather than relying on the platform's default encoding.
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Track stored articles in a set so the duplicate check is O(1) per item
    # instead of rebuilding a list of all articles on every iteration.
    seen_articles = {record["article"] for record in results}

    for item in tqdm.tqdm(data):
        # ``is not None`` so that max_samples=0 is honoured as a real limit.
        if max_samples is not None and len(results) >= max_samples:
            break
        article, gold = item["fulltext"], item["summary"]
        if article in seen_articles:
            continue

        summaries = {
            band: generate_synthetic_summary(article, gold, band)
            for band in ("B1", "B2", "B3")
        }
        results.append({
            "article": article,
            "gold_summary": gold,
            "synthetic_summary": summaries,
        })
        seen_articles.add(article)

        # Checkpoint periodically so a failure loses at most a few records.
        if len(results) % 5 == 0:
            print(f"Processed {len(results)} samples, saving progress...")
            _write_results(results, output_path)

    _write_results(results, output_path)
|
|
| |
# Script driver: build the synthetic dataset for one language, capped at 100
# stored samples.
lang = "es"
path = f"/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_{lang}.json"
build_synthetic_dataset(
    input_path=path,
    output_path=f"/home/mshahidul/readctrl/generating_data/{lang}_synthetic.json",
    max_samples=100,
)
|
|