def revised_results(reference_summary: str, generated_summary: str, list_of_missing_subclaims: list, difficulty_level: str) -> str:
    """Build the LLM prompt asking for a revised summary.

    The prompt instructs the model to reinsert the missing subclaims into
    *generated_summary* while preserving the *difficulty_level* readability
    style, and to answer as a JSON object with keys "revised_summary" and
    "explanation" (parsed downstream by openai_return).

    Note: *list_of_missing_subclaims* is interpolated with its Python repr
    (e.g. ['claim a', 'claim b']) — the model sees the raw list literal.
    """
    return f'''
### **SYSTEM / ROLE INSTRUCTION**

You are a **medical text rewriting assistant** that improves summaries while maintaining the intended readability level (*easy / intermediate / hard*).
You will receive:

* The **original reference summary** (the factual source)
* The **current generated summary**
* A list of **important missing subclaims** to be reintroduced
* The **target readability level**

Your task:
Revise the generated summary so that it **adds the missing information** naturally, while keeping:

* The same **tone, vocabulary, and sentence simplicity** of the given readability level.
* Logical **flow and coherence**.
* No extra, invented information beyond what’s in the reference summary.

---

### **INPUT FIELDS**

**Reference summary:**
{reference_summary}

**Current generated summary ({difficulty_level}):**
{generated_summary}

**Missing important subclaims to add back:**
{list_of_missing_subclaims}

**Target readability level:**
{difficulty_level}


---

### **TASK INSTRUCTIONS**

1. Integrate the missing subclaims **smoothly** into the generated summary.
2. Do **not** add any new facts beyond those listed.
3. Maintain the **same readability level**:

* **Easy:** conversational, short sentences, no jargon.
* **Intermediate:** light medical terms, brief explanations.
* **Hard:** concise clinical tone with correct terminology.
4. Keep the summary approximately the same length; avoid redundancy.
5. Ensure the resulting text remains **fluent, coherent, and faithful** to the reference summary.

---

### **OUTPUT FORMAT**

```json
{{
"revised_summary": "<the new version of the summary, rewritten with the added subclaims>",
"explanation": "<brief note explaining how the missing subclaims were added while preserving readability>"
}}
```

'''
from openai import OpenAI
import json

# Credentials file: a JSON object mapping provider names to API keys,
# e.g. {"openai": "sk-..."}.
file_path = "/home/mshahidul/api_new.json"
with open(file_path, "r") as file:
    api_keys = json.load(file)


# .get() returns None if the "openai" entry is missing; the client would
# then fail on its first request rather than here.
openai_api_key = api_keys.get("openai")


# Shared OpenAI client used by openai_return() below.
client = OpenAI(api_key=openai_api_key)
def openai_return(prompt):
    """Send *prompt* to the chat model and parse the reply as JSON.

    Any markdown code fences (```json ... ```) wrapped around the reply are
    stripped before parsing.  Raises json.JSONDecodeError when the cleaned
    reply is not valid JSON (callers catch this in their retry loop).
    """
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-5-mini",
        messages=chat_messages,
    )
    raw_text = completion.choices[0].message.content.strip()
    # Remove the fences in this order so "```json" is not left as "json".
    for fence in ("```json", "```"):
        raw_text = raw_text.replace(fence, "")
    return json.loads(raw_text)
import json  # NOTE(review): duplicate of the import above; harmless but redundant.

# Synthetic dataset: items each carrying a reference summary plus
# easy/intermediate/hard readability versions (see the indexing in the main loop).
file_path = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"


with open(file_path, 'r') as f:
    synthetic_data = json.load(f)


# GPT-5 completeness-check output: per (id, difficulty_level), an
# 'evaluation_table' marking whether each omitted subclaim was a
# reasonable omission ("yes"/"no").
with open("/home/mshahidul/readctrl/results/dataset_quality_check/resonability_check_100_gpt5_completeness.json", 'r') as f:
    readability_reasoning = json.load(f)
| |
| |
# Map (item id, difficulty level) -> list of subclaims whose omission was
# judged NOT reasonable, i.e. the facts that must be added back into the
# generated summary.
reason_info = {}
for item in readability_reasoning:
    # Renamed from `id` to avoid shadowing the builtin.
    item_id = item['id']
    difficulty_level = item['difficulty_level']
    # NOTE(review): the evaluation table is stored under the 'prompt' key
    # of each record in this results file.
    for row in item['prompt']['evaluation_table']:
        if row['reasonable_omission'] == "no":
            # setdefault replaces the manual "if key not in dict" init pattern.
            reason_info.setdefault((item_id, difficulty_level), []).append(row['subclaim'])
|
|
# Qwen3-32B subclaim-verifier results. Loaded here but not referenced again
# in this script — presumably kept for a later comparison step; TODO confirm.
file_path_qwen3_32B = "/home/mshahidul/readctrl/results/dataset_quality_check/subclaim_verifier_results_100_qwen3-32B.json"


with open(file_path_qwen3_32B, 'r') as f:
    qwen3_32B_results = json.load(f)


import os

# Accumulated revision records; periodically flushed to save_path.
res = []
save_path = "/home/mshahidul/readctrl/results/dataset_quality_check/results_revised_100_gpt5.json"
# BUG FIX: existing_check must exist even on a fresh run (no results file
# yet); previously it was only assigned inside the branch below, so the
# membership test in the main loop raised NameError on first execution.
existing_check = set()
if os.path.exists(save_path):
    with open(save_path, 'r') as f:
        res = json.load(f)
    existing_check = {(entry['id'], entry['difficulty_level']) for entry in res}
    print(f"Resuming from {len(res)} entries")
import tqdm

# Revise every (item, difficulty) pair that has unreasonably-omitted
# subclaims, skipping pairs already present in the saved results.
for ind in tqdm.tqdm(range(100)):
    item = synthetic_data[ind]
    for version in ["easy", "intermediate", "hard"]:
        key = (item['id'], version)
        # Hoisted ahead of the summary lookups: resume-skip, then
        # nothing-to-fix skip (no missing subclaims recorded for this pair).
        if key in existing_check:
            continue
        if key not in reason_info:
            continue
        reference_summary = item['ref_summary']['text']
        generated_summary = item['readability_versions'][version]['text']
        prompt = revised_results(reference_summary, generated_summary, reason_info[key], version)
        try:
            ans = openai_return(prompt)
            res.append({
                "id": item['id'],
                "difficulty_level": version,
                "prompt": prompt,
                "response": ans
            })

            # Checkpoint every second new entry so an interrupted run can resume.
            if len(res) % 2 == 0:
                print(f"Completed {len(res)} out of 300")
                with open(save_path, 'w') as outfile:
                    json.dump(res, outfile, indent=2)
        except Exception as e:
            # Best-effort batch: log the failure and move on to the next pair.
            print(f"Error at index {ind}, version {version}: {e}")


# Final flush so entries added since the last checkpoint are persisted.
with open(save_path, 'w') as outfile:
    json.dump(res, outfile, indent=2)
|
|
|
|