readctrl / code /old /synthetic_data_generation.py

Add files using upload-large-folder tool

c7a6fe6 verified 28 days ago

4.89 kB

	import os
	import json
	from openai import OpenAI
	import tqdm
	# Initialize client (ensure you have OPENAI_API_KEY in env vars)
	client = OpenAI(api_key=json.load(open('/home/mshahidul/api.json', 'r'))['openai_api_key'])

	# System prompts (from Appendix B in your proposal)
	PROMPTS = {
	"B1": """You are a summarization assistant trained to rewrite medical case reports' expert summaries
	for readers at an elementary school level (ages 5–11, FKGL 1.0–6.0).

	Your job is to generate summaries that are:
	* Kind and empathetic
	* Clear, simple, and understandable for readers without medical background
	* Accurate and faithful to the source

	General Instructions:
	- Assume the reader is an elementary school student with no medical knowledge.
	- Avoid medical jargon. If it must appear, explain it in very simple terms.
	- Use short sentences and everyday words.
	- Reassure the reader when findings are normal; explain gently if something is abnormal.
	- Do not overwhelm with detail; focus on main ideas.
	- Never use emojis.
	- Do not explain pronunciation.
	""",
	"B2": """You are a summarization assistant trained to rewrite medical case reports' expert summaries for readers at a middle or high school level (ages 11–17, FKGL 6.0–12.0).

	Your job is to generate summaries that are:
	* Kind and empathetic
	* Clear and understandable for readers with only general school-level science
	* Accurate and faithful to the source

	General Instructions:
	- Assume the reader is a secondary school student with limited medical knowledge.
	- Avoid unnecessary jargon. If a medical term is included, provide a brief, clear explanation.
	- Write in a style appropriate for middle/high school reading comprehension.
	- Present abnormal findings with calm, explanatory language, including possible next steps.
	- Keep the tone warm, patient, and caring.
	- Never use emojis.
	- Do not explain pronunciation.
	""",
	"B3": """You are a summarization assistant trained to rewrite medical case reports' expert summaries
	for readers at a college or higher education level (ages 17+, FKGL 12.0+).

	Your job is to generate summaries that are:
	* Kind and empathetic
	* Clear and precise, while remaining faithful to the source
	* Appropriate for readers with advanced literacy but no formal medical training

	General Instructions:
	- Assume the reader is a college-level reader with no medical specialization.
	- Medical terms can be used if they are commonly understood or explained briefly.
	- Provide a more detailed and structured summary than for younger readers.
	- Clearly distinguish between normal and abnormal findings, and outline potential implications or next steps.
	- Maintain an empathetic and respectful tone at all times.
	- Never use emojis.
	- Do not explain pronunciation.
	"""
	}

	def generate_synthetic_summary(article, gold_summary, band):
	"""Call GPT-5-mini to generate a synthetic summary for a given readability band"""
	prompt = f"""Article:
	{article}

	Gold Summary:
	{gold_summary}

	Task:
	Please generate a summary at readability band {band}.
	"""

	response = client.chat.completions.create(
	model="gpt-5-mini",
	messages=[
	{"role": "system", "content": PROMPTS[band]},
	{"role": "user", "content": prompt}
	],
	temperature=1.0
	)

	return response.choices[0].message.content.strip()

	def build_synthetic_dataset(input_path, output_path, max_samples=None):
	"""Generate synthetic dataset from a JSONL file with {article, gold_summary}"""
	results = []
	if os.path.exists(output_path):
	results = json.load(open(output_path, 'r'))
	with open(input_path, "r") as f:
	data = json.load(f)
	for item in tqdm.tqdm(data):
	if max_samples and len(results) >= max_samples:
	break
	article, gold = item["fulltext"], item["summary"]
	if article in [r['article'] for r in results]:
	continue
	temp={}
	for band in ["B1", "B2", "B3"]:
	synthetic = generate_synthetic_summary(article, gold, band)
	temp[band] = synthetic
	results.append({
	"article": article,
	"gold_summary": gold,
	"synthetic_summary": temp
	})
	if len(results)%5==0:
	print(f"Processed {len(results)} samples, saving progress...")
	with open(output_path, "w") as f:
	json.dump(results, f, ensure_ascii=False, indent=4)

	with open(output_path, "w") as f:
	json.dump(results, f, ensure_ascii=False, indent=4)

	# Example usage:
	lang = "es" # Change to desired language
	path=f"/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_{lang}.json"
	build_synthetic_dataset(path, f"/home/mshahidul/readctrl/generating_data/{lang}_synthetic.json", max_samples=100)