def readability_training_prompt_with_human(full_text, gold_summary, generated_text, human_score):
    """Build a conversation-style training example for readability evaluation.

    The user turn contains the evaluation instructions plus the texts to judge;
    the assistant turn contains the human-assigned label as the strict-JSON
    answer the instructions demand.

    Args:
        full_text: Original medical text (context only).
        gold_summary: Human-written summary (context only).
        generated_text: Model-generated text whose readability is being rated.
        human_score: Integer from 1 to 5 (human-evaluated readability label).

    Returns:
        dict: ``{'conversations': [user_turn, assistant_turn]}`` where each turn
        is ``{'from': ..., 'content': ...}`` — a JSON-serializable record
        suitable for supervised fine-tuning.

    Raises:
        ValueError: If ``human_score`` is not an integer in 1..5.
    """
    # Enforce the documented contract so bad labels fail loudly at data-build
    # time instead of silently corrupting the training set.
    if not isinstance(human_score, int) or not 1 <= human_score <= 5:
        raise ValueError(f"human_score must be an integer from 1 to 5, got {human_score!r}")

    # NOTE: the output-format template deliberately shows a *placeholder*, not
    # the gold label — interpolating {human_score} here would leak the answer
    # into the model's input during training.
    system_prompt = f"""
You are a medical readability evaluator.
Your task is to assess the readability of the GENERATED TEXT for a general audience.

You are given:
- FULL TEXT: {full_text}
- GOLD SUMMARY: {gold_summary}
- GENERATED TEXT: {generated_text}

Use the FULL TEXT and GOLD SUMMARY only as context.
Evaluate ONLY the GENERATED TEXT.

Rate readability on a scale from 1 to 5:
1 = Very easy (child-friendly, minimal medical language)
2 = Easy
3 = Medium
4 = Hard
5 = Very hard (requires medical knowledge)

Do NOT evaluate factual correctness.
Do NOT compare writing quality.
Focus ONLY on readability.

### Output Format (STRICT JSON)
Return a valid JSON object with the following fields:
{{
  "readability_score": <integer from 1 to 5>
}}
Do NOT include any text outside the JSON.
"""

    # The assistant target mirrors the strict-JSON format the prompt requests,
    # so the model is trained to emit exactly what the instructions demand.
    assistant_answer = f'{{"readability_score": {human_score}}}'

    return {
        'conversations': [
            {'from': "user", 'content': system_prompt},
            {'from': "assistant", 'content': assistant_answer},
        ]
    }