def readability_training_prompt_with_human(full_text, gold_summary, generated_text, human_score):
    """Build a conversation-style training example for readability evaluation.

    The user turn contains the evaluation instructions plus the texts to judge;
    the assistant turn contains the human-assigned label as the strict-JSON
    answer the instructions demand.

    Args:
        full_text: Original medical text (context only).
        gold_summary: Human-written summary (context only).
        generated_text: Model-generated text whose readability is being rated.
        human_score: Integer from 1 to 5 (human-evaluated readability label).

    Returns:
        dict: ``{'conversations': [user_turn, assistant_turn]}`` where each turn
        is ``{'from': ..., 'content': ...}`` — a JSON-serializable record
        suitable for supervised fine-tuning.

    Raises:
        ValueError: If ``human_score`` is not an integer in 1..5.
    """
    # Enforce the documented contract so bad labels fail loudly at data-build
    # time instead of silently corrupting the training set.
    if not isinstance(human_score, int) or not 1 <= human_score <= 5:
        raise ValueError(f"human_score must be an integer from 1 to 5, got {human_score!r}")

    # NOTE: the output-format template deliberately shows a *placeholder*, not
    # the gold label — interpolating {human_score} here would leak the answer
    # into the model's input during training.
    system_prompt = f"""
You are a medical readability evaluator.
Your task is to assess the readability of the GENERATED TEXT for a general audience.

You are given:
- FULL TEXT: {full_text}
- GOLD SUMMARY: {gold_summary}
- GENERATED TEXT: {generated_text}

Use the FULL TEXT and GOLD SUMMARY only as context.
Evaluate ONLY the GENERATED TEXT.

Rate readability on a scale from 1 to 5:
1 = Very easy (child-friendly, minimal medical language)
2 = Easy
3 = Medium
4 = Hard
5 = Very hard (requires medical knowledge)

Do NOT evaluate factual correctness.
Do NOT compare writing quality.
Focus ONLY on readability.

### Output Format (STRICT JSON)
Return a valid JSON object with the following fields:
{{
  "readability_score": <integer from 1 to 5>
}}
Do NOT include any text outside the JSON.
"""

    # The assistant target mirrors the strict-JSON format the prompt requests,
    # so the model is trained to emit exactly what the instructions demand.
    assistant_answer = f'{{"readability_score": {human_score}}}'

    return {
        'conversations': [
            {'from': "user", 'content': system_prompt},
            {'from': "assistant", 'content': assistant_answer},
        ]
    }