def readability_training_prompt_with_human(full_text, gold_summary, generated_text, human_score):
    """
    Create a conversation-style training example for readability evaluation.

    Parameters
    ----------
    full_text : str
        Original medical text (context only).
    gold_summary : str
        Human-written summary (context only).
    generated_text : str
        Model-generated text whose readability is being rated.
    human_score : int
        Human-evaluated readability score from 1 (very easy) to 5 (very hard).

    Returns
    -------
    dict
        ``{'conversations': [user_turn, assistant_turn]}`` where each turn is a
        ``{'from': ..., 'content': ...}`` dict suitable for training an LLM.
    """
    # NOTE: the human score must NOT appear inside the prompt — it is the
    # label the model is trained to produce. The JSON template therefore shows
    # a placeholder, and the example is valid JSON (no trailing comma).
    system_prompt = f"""
You are a medical readability evaluator.

Your task is to assess the readability of the GENERATED TEXT for a general audience.

You are given:
- FULL TEXT: {full_text}
- GOLD SUMMARY: {gold_summary}
- GENERATED TEXT: {generated_text}

Use the FULL TEXT and GOLD SUMMARY only as context. Evaluate ONLY the GENERATED TEXT.

Rate readability on a scale from 1 to 5:
1 = Very easy (child-friendly, minimal medical language)
2 = Easy
3 = Medium
4 = Hard
5 = Very hard (requires medical knowledge)

Do NOT evaluate factual correctness.
Do NOT compare writing quality.
Focus ONLY on readability.

### Output Format (STRICT JSON)
Return a valid JSON object with the following fields:

{{
    "readability_score": <integer from 1 to 5>
}}

Do NOT include any text outside the JSON.
"""

    # The assistant turn is the training target: strict JSON matching the
    # format the prompt demands, so the model learns to emit exactly that.
    assistant_reply = f'{{"readability_score": {human_score}}}'

    conversation = {}
    conversation['conversations'] = [
        {'from': "user", 'content': system_prompt},
        {'from': "assistant", 'content': assistant_reply},
    ]

    return conversation