File size: 10,172 Bytes
5801b35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
import gradio as gr
import random
import json
from typing import Dict, List, Tuple

# Ethical scenarios and base responses
# Each scenario dict carries three keys:
#   scenario      - the ethical dilemma text shown to the user
#   base_response - the AI's starting answer before any feedback is applied
#   response_id   - stable key used to track per-scenario feedback/updates
SCENARIOS = [
    {
        "scenario": "A hospital AI triage system has limited ICU beds during a pandemic. It must choose between admitting an 80-year-old with moderate survival chances or a 30-year-old with severe complications but lower survival probability. What should it prioritise?",
        "base_response": "The system should prioritise patients based on medical factors like survival probability and treatment urgency rather than age alone.",
        "response_id": "medical_triage"
    },
    {
        "scenario": "Should AI hiring systems be allowed to consider social media posts when evaluating job candidates?",
        "base_response": "AI systems should evaluate candidates based on relevant professional qualifications and skills.",
        "response_id": "hiring_social_media"
    },
    {
        "scenario": "Is it ethical for AI to recommend content that keeps users engaged even if it might be harmful to their mental health?",
        "base_response": "AI systems should balance user engagement with promoting user wellbeing and mental health.",
        "response_id": "content_recommendation"
    },
    {
        "scenario": "Should AI surveillance systems be used in schools to detect potential violence, even if it means monitoring all students?",
        "base_response": "Security measures should be implemented while respecting student privacy and avoiding excessive surveillance.",
        "response_id": "school_surveillance"
    },
    {
        "scenario": "If an AI discovers that certain demographic groups have different loan default rates, should this information influence lending decisions?",
        "base_response": "Lending decisions should be based on individual financial qualifications rather than demographic group statistics.",
        "response_id": "lending_demographics"
    }
]

class AlignmentGame:
    def __init__(self):
        self.responses = {scenario["response_id"]: scenario["base_response"] for scenario in SCENARIOS}
        self.feedback_history = []
        self.training_iterations = 0
        
    def get_random_scenario(self):
        return random.choice(SCENARIOS)
    
    def update_response(self, response_id: str, feedback: str, suggestion: str = None):
        """Update AI response based on feedback"""
        self.training_iterations += 1
        
        current_response = self.responses[response_id]
        feedback_entry = {
            "iteration": self.training_iterations,
            "response_id": response_id,
            "original_response": current_response,
            "feedback": feedback,
            "suggestion": suggestion
        }
        self.feedback_history.append(feedback_entry)
        
        # Simple response modification based on feedback
        if feedback == "negative" and suggestion:
            # If user provided a suggestion, move towards it
            self.responses[response_id] = f"Based on feedback: {suggestion}"
        elif feedback == "negative":
            # Make response more cautious/nuanced
            if "should" in current_response:
                self.responses[response_id] = current_response.replace("should", "might consider to")
            else:
                self.responses[response_id] = f"This is a complex issue. {current_response}"
        elif feedback == "positive":
            # Make response more confident
            if "might consider" in current_response:
                self.responses[response_id] = current_response.replace("might consider to", "should")
            elif "This is a complex issue." in current_response:
                self.responses[response_id] = current_response.replace("This is a complex issue. ", "")
        
        return self.responses[response_id]
    
    def get_training_history(self):
        """Return formatted training history"""
        if not self.feedback_history:
            return "No training history yet. Start by providing feedback on AI responses!"
        
        history_text = f"**Training Progress** (After {self.training_iterations} feedback sessions)\n\n"
        
        # Show last 3 feedback entries
        recent_feedback = self.feedback_history[-3:]
        for entry in recent_feedback:
            feedback_emoji = "πŸ‘" if entry["feedback"] == "positive" else "πŸ‘Ž"
            history_text += f"{feedback_emoji} **Iteration {entry['iteration']}**: {entry['response_id']}\n"
            if entry["suggestion"]:
                history_text += f"   Suggestion: _{entry['suggestion']}_\n"
            history_text += "\n"
        
        return history_text

# Initialize the game
game = AlignmentGame()

def present_scenario():
    """Pick a random scenario and return the values for the UI fields.

    Returns a 5-tuple: (scenario text, current AI response, scenario id,
    empty string to clear the suggestion box, training history markdown).
    """
    chosen = game.get_random_scenario()
    scenario_id = chosen["response_id"]

    return (
        chosen["scenario"],
        game.responses[scenario_id],
        scenario_id,
        "",  # clear the suggestion textbox
        game.get_training_history(),
    )

def provide_feedback(scenario_text, current_response, response_id, feedback_type, suggestion):
    """Apply user feedback to the active scenario's response.

    Returns a 3-tuple: (response text, status message, training history).
    """
    # Guard clauses: no scenario loaded yet, or no feedback selected.
    if not response_id:
        return current_response, "Please generate a scenario first!", game.get_training_history()
    if feedback_type is None:
        return current_response, "Please provide feedback (πŸ‘ or πŸ‘Ž) before continuing!", game.get_training_history()

    # Record the feedback and let the game mutate its stored response.
    updated_response = game.update_response(response_id, feedback_type, suggestion)

    status = (
        "**Feedback recorded!** The AI has updated its response based on your input."
        f"\n\n**Updated Response:** {updated_response}"
    )
    return updated_response, status, game.get_training_history()

def reset_game():
    """Discard all training state and return cleared values for the UI fields."""
    global game
    game = AlignmentGame()
    cleared = ("", "", "", "")
    return (*cleared, "Game reset! Click 'New Scenario' to start training.", game.get_training_history())

# Create Gradio interface
# UI layout: a two-column Blocks app. Left column shows the scenario and the
# AI's current response; right column holds the training controls. The
# wiring below depends on the components existing before the event handlers
# reference them, so construction order matters.
with gr.Blocks(title="The Alignment Game", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # The Alignment Game
    
    **Train an AI by providing feedback on its ethical responses.**
    
    You'll see how your values gradually shape the AI's behavior through a process called Reinforcement Learning from Human Feedback (RLHF). 
    Watch how the AI's responses evolve based on what you reward and what you correct.
    """)
    
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### Current Scenario")
            # Read-only display of the ethical dilemma text.
            scenario_display = gr.Textbox(
                label="Ethical Dilemma",
                placeholder="Click 'New Scenario' to begin training...",
                interactive=False,
                lines=3
            )
            
            # Read-only display of the AI's current (feedback-shaped) answer.
            ai_response = gr.Textbox(
                label="AI's Current Response",
                placeholder="AI response will appear here...",
                interactive=False,
                lines=3
            )
            
            # Hidden field to track current scenario ID
            current_scenario_id = gr.Textbox(visible=False)
            
        with gr.Column(scale=1):
            gr.Markdown("### Your Training")
            
            with gr.Row():
                new_scenario_btn = gr.Button("New Scenario", variant="primary")
                reset_btn = gr.Button("Reset Game", variant="secondary")
            
            gr.Markdown("**Provide Feedback:**")
            with gr.Row():
                positive_btn = gr.Button("Good Response", variant="primary")
                negative_btn = gr.Button("Bad Response", variant="stop")
            
            # Optional free-text correction used when giving negative feedback.
            suggestion_input = gr.Textbox(
                label="Suggest Better Response (optional)",
                placeholder="How should the AI respond instead?",
                lines=2
            )
            
            feedback_status = gr.Textbox(
                label="Training Status",
                placeholder="Provide feedback to start training...",
                interactive=False,
                lines=3
            )
    
    gr.Markdown("---")
    
    with gr.Row():
        training_history = gr.Textbox(
            label="Training History & Value Drift",
            placeholder="Training history will appear here as you provide feedback...",
            interactive=False,
            lines=8
        )
    
    gr.Markdown("""
    ### What's Happening?
    
    As you provide feedback, you're essentially "training" this AI system to align with your values. In real-world AI development:
    - Thousands of human reviewers provide similar feedback
    - The AI learns to predict what responses humans will approve
    - But whose values get embedded depends on who does the training
    
    **Try this:** Train the AI for a few scenarios, then imagine how someone with completely different values might train it differently.
    """)
    
    # Track feedback type
    # gr.State holds the last-pressed feedback value ("positive"/"negative")
    # between the button click and the .then() handler that consumes it.
    feedback_type = gr.State()
    
    # Event handlers
    new_scenario_btn.click(
        fn=present_scenario,
        outputs=[scenario_display, ai_response, current_scenario_id, suggestion_input, training_history]
    )
    
    # Two-step chain: first write "positive" into feedback_type state,
    # then run provide_feedback which reads that state as an input.
    positive_btn.click(
        lambda: "positive",
        outputs=feedback_type
    ).then(
        fn=provide_feedback,
        inputs=[scenario_display, ai_response, current_scenario_id, feedback_type, suggestion_input],
        outputs=[ai_response, feedback_status, training_history]
    )
    
    # Same chain shape as positive_btn, but records "negative".
    negative_btn.click(
        lambda: "negative", 
        outputs=feedback_type
    ).then(
        fn=provide_feedback,
        inputs=[scenario_display, ai_response, current_scenario_id, feedback_type, suggestion_input],
        outputs=[ai_response, feedback_status, training_history]
    )
    
    reset_btn.click(
        fn=reset_game,
        outputs=[scenario_display, ai_response, current_scenario_id, suggestion_input, feedback_status, training_history]
    )

if __name__ == "__main__":
    # share=True asks Gradio for a temporary public tunnel URL in addition
    # to the local server — remove it if only local access is wanted.
    demo.launch(share=True)