"""Revise readability-controlled summaries by re-adding flagged subclaims.

Pipeline (run via ``main()``):

1. Load the synthetic summaries, the per-subclaim "reasonableness" verdicts
   and the gold full texts.
2. For every ``(id, readability version)`` collect the subclaims whose
   verdict is neither "reasonable" nor "unknown".
3. Ask the chat model to revise the generated summary so that it integrates
   those subclaims, and checkpoint the answers to ``SAVE_PATH`` so that an
   interrupted run can be resumed.
"""

import ast
import json
import os

API_KEYS_PATH = "/home/mshahidul/api_new.json"
SYNTHETIC_DATA_PATH = "/home/mshahidul/readctrl/data/training_data_subclaim_verifier/synthetic_data_es_subclaims_100.json"
READABILITY_REASONING_PATH = "/home/mshahidul/readctrl/results/dataset_quality_check/completeness_resonability_check_100_qwen3-32B_v3.json"
GOLD_TRAIN_PATH = "/home/mshahidul/readctrl/data/testing_data_gs/multiclinsum_gs_train_es.json"
SAVE_PATH = "/home/mshahidul/readctrl/results/dataset_quality_check/results_revised_100_gpt5_v3.json"

READABILITY_VERSIONS = ("easy", "intermediate", "hard")
# Only the first MAX_ITEMS synthetic entries are processed per run.
MAX_ITEMS = 10

# Verdict labels for which a subclaim is NOT sent back for integration.
_SKIPPED_VERDICTS = ("reasonable", "unknown")
# Everything json.loads / ast.literal_eval may raise on malformed text.
_PARSE_ERRORS = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

# Created lazily by _get_client() so that importing this module needs neither
# the API key file nor the openai package.
client = None


def inference_prompt_revise_summary(fulltext, ref_summary, generated_summary, version, missing_subclaims):
    """Build the revision prompt for one generated summary.

    Args:
        fulltext: Source document, given to the model as context.
        ref_summary: Reference summary (the prompt tells the model not to
            copy it).
        generated_summary: The summary the model has to revise.
        version: Readability level name; interpolated twice into the prompt.
        missing_subclaims: Subclaims to integrate; interpolated with ``str()``
            formatting, so a list appears in its Python repr form.

    Returns:
        The complete prompt string.
    """
    # The prompt text is reproduced verbatim (including its single-line
    # layout); adjacent literals are concatenated only to keep lines short.
    prompt = (
        " You are a medical summarization model specialized in "
        "readability-controlled text revision. "
        "Your task is to improve the **Generated Summary** by adding back the "
        "key missing clinical information listed under **Missing Subclaims**, "
        "while keeping the readability style defined for the level "
        f"**{version}**. "
        "Do not copy the reference summary. "
        "Keep coherence, brevity, and correctness. "
        "--- "
        "### INPUT "
        f"**Full Text (for context):** {fulltext} "
        f"**Reference Summary (for comparison only):** {ref_summary} "
        f"**Generated Summary (to revise):** {generated_summary} "
        f"**Missing Subclaims (to integrate naturally):** {missing_subclaims} "
        "--- "
        "### READABILITY STYLES "
        "- **easy (FH 70–100, grade 5–7):** "
        "- Short sentences, familiar vocabulary, concrete ideas. "
        "- Avoid subordinate clauses and medical jargon. "
        "- Tone: explanatory, simple, and friendly. "
        "- **intermediate (FH 50–69, grade 8–12):** "
        "- Moderate sentence complexity and domain vocabulary. "
        "- Clear and structured explanation. "
        "- **hard (FH 0–49, university/professional):** "
        "- Use specialized terminology, formal and dense phrasing. "
        "- Include: "
        "- precise domain vocabulary; "
        "- causal or analytical connectors (por consiguiente, sin embargo, dado que…); "
        "- one definition, one process description, and one implication statement if possible; "
        "- optional subordinate clauses for academic rhythm. "
        "--- "
        "### OUTPUT "
        "Return the result in the following JSON format: "
        '{ "revised_summary": "" } '
        "Ensure the text is coherent, medically accurate, and matches the "
        f"**{version}** readability level. \n"
    )
    return prompt


def _load_json(path):
    """Read and decode one UTF-8 JSON file."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


def _save_results(results, path):
    """Write ``results`` to ``path`` without ever leaving a truncated file.

    The data goes to a sibling temp file first and is then swapped in with
    ``os.replace``, so a crash mid-write cannot corrupt the resume checkpoint.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as outfile:
        json.dump(results, outfile, indent=2)
    os.replace(tmp_path, path)


def _get_client():
    """Return the shared OpenAI client, creating it on first use."""
    global client
    if client is None:
        from openai import OpenAI  # third-party; imported only when needed

        api_keys = _load_json(API_KEYS_PATH)
        client = OpenAI(api_key=api_keys.get("openai"))
    return client


def openai_return(prompt):
    """Send ``prompt`` to the chat model and return its answer.

    Returns:
        The decoded JSON value when the reply (with Markdown code fences
        removed) is valid JSON, otherwise the fence-stripped reply text.
    """
    response = _get_client().chat.completions.create(
        model="gpt-5",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    cleaned_response = (
        response.choices[0].message.content.strip().replace("```json", "").replace("```", "")
    )
    try:
        return json.loads(cleaned_response)
    except ValueError:
        # Not JSON: hand the raw text back so the caller can still store it.
        return cleaned_response


def parse_reasonableness(raw):
    """Normalise one stored verdict into a dict.

    A dict is returned unchanged. A string is parsed as JSON first and as a
    Python literal second. Anything that does not yield a dict becomes
    ``{"reasonableness": "unknown", "justification": raw}``, so callers can
    always use ``.get`` on the result.
    """
    if isinstance(raw, dict):
        return raw
    if isinstance(raw, str):
        for parser in (json.loads, ast.literal_eval):
            try:
                parsed = parser(raw)
            except _PARSE_ERRORS:
                continue
            if isinstance(parsed, dict):
                return parsed
    return {"reasonableness": "unknown", "justification": raw}


def collect_unreasonable_subclaims(readability_reasoning):
    """Group the subclaims to re-integrate by ``(id, version)``.

    Args:
        readability_reasoning: Iterable of records with the keys ``id``,
            ``version`` and ``completeness`` (whose ``results`` entries carry
            ``subclaim`` and ``reasonableness``).

    Returns:
        Dict mapping ``(id, version)`` to the list of subclaims whose verdict
        is neither "reasonable" nor "unknown". Keys without such subclaims
        are absent.
    """
    reason_info = {}
    for item in readability_reasoning:
        key = (item['id'], item['version'])
        for result in item['completeness']['results']:
            verdict = parse_reasonableness(result['reasonableness'])
            if verdict.get('reasonableness') in _SKIPPED_VERDICTS:
                continue
            reason_info.setdefault(key, []).append(result['subclaim'])
    return reason_info


def needs_revision(key, reason_info, existing_check):
    """Tell whether ``key`` still has to be sent to the model.

    True when ``key`` is not already in ``existing_check`` and ``reason_info``
    holds at least one subclaim for it.
    """
    return key not in existing_check and bool(reason_info.get(key))


def main():
    """Run the revision pipeline, resuming from ``SAVE_PATH`` if it exists."""
    import tqdm  # third-party; only needed when the pipeline actually runs

    # Fail fast on a missing key file / SDK instead of once per request.
    _get_client()

    synthetic_data = _load_json(SYNTHETIC_DATA_PATH)
    reason_info = collect_unreasonable_subclaims(_load_json(READABILITY_REASONING_PATH))
    dat_full_text = {item['id']: item['fulltext'] for item in _load_json(GOLD_TRAIN_PATH)}

    res = _load_json(SAVE_PATH) if os.path.exists(SAVE_PATH) else []
    # Always defined, also on a fresh run where no checkpoint exists yet.
    existing_check = {(entry['id'], entry['difficulty_level']) for entry in res}
    print(f"Resuming from {len(res)} entries")

    items = synthetic_data[:MAX_ITEMS]
    # Entries already stored plus the requests this run will attempt.
    total = len(res) + sum(
        1
        for item in items
        for version in READABILITY_VERSIONS
        if needs_revision((item['id'], version), reason_info, existing_check)
    )

    for ind, item in enumerate(tqdm.tqdm(items)):
        item_id = item['id']
        for version in READABILITY_VERSIONS:
            key = (item_id, version)
            if not needs_revision(key, reason_info, existing_check):
                continue
            missing_subclaims = reason_info[key]
            reference_summary = f"{item['ref_summary']['text']}"
            generated_summary = f"{item['readability_versions'][version]['text']}"
            prompt = inference_prompt_revise_summary(
                dat_full_text[item_id],
                reference_summary,
                generated_summary,
                version,
                missing_subclaims,
            )
            try:
                ans = openai_return(prompt)
                res.append({
                    "id": item_id,
                    "difficulty_level": version,
                    "prompt": prompt,
                    "response": ans,
                })
                # Checkpoint every answer: each one costs an API call.
                _save_results(res, SAVE_PATH)
                if len(res) % 2 == 0:
                    print(f"Completed {len(res)} out of {total}")
            except Exception as e:
                # Best effort: report the failure and move on to the next pair.
                print(f"Error at index {ind}, version {version}: {e}")

    _save_results(res, SAVE_PATH)


if __name__ == "__main__":
    main()