File size: 10,172 Bytes
5801b35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
import gradio as gr
import random
import json
from typing import Dict, List, Tuple

# Ethical scenarios and base responses
# Each scenario dict carries three keys:
#   scenario      - the ethical dilemma text shown to the user
#   base_response - the AI's starting answer before any feedback is applied
#   response_id   - stable key used to track per-scenario feedback/updates
SCENARIOS = [
    {
        "scenario": "A hospital AI triage system has limited ICU beds during a pandemic. It must choose between admitting an 80-year-old with moderate survival chances or a 30-year-old with severe complications but lower survival probability. What should it prioritise?",
        "base_response": "The system should prioritise patients based on medical factors like survival probability and treatment urgency rather than age alone.",
        "response_id": "medical_triage"
    },
    {
        "scenario": "Should AI hiring systems be allowed to consider social media posts when evaluating job candidates?",
        "base_response": "AI systems should evaluate candidates based on relevant professional qualifications and skills.",
        "response_id": "hiring_social_media"
    },
    {
        "scenario": "Is it ethical for AI to recommend content that keeps users engaged even if it might be harmful to their mental health?",
        "base_response": "AI systems should balance user engagement with promoting user wellbeing and mental health.",
        "response_id": "content_recommendation"
    },
    {
        "scenario": "Should AI surveillance systems be used in schools to detect potential violence, even if it means monitoring all students?",
        "base_response": "Security measures should be implemented while respecting student privacy and avoiding excessive surveillance.",
        "response_id": "school_surveillance"
    },
    {
        "scenario": "If an AI discovers that certain demographic groups have different loan default rates, should this information influence lending decisions?",
        "base_response": "Lending decisions should be based on individual financial qualifications rather than demographic group statistics.",
        "response_id": "lending_demographics"
    }
]

class AlignmentGame:
    def __init__(self):
        self.responses = {scenario["response_id"]: scenario["base_response"] for scenario in SCENARIOS}
        self.feedback_history = []
        self.training_iterations = 0
        
    def get_random_scenario(self):
        return random.choice(SCENARIOS)
    
    def update_response(self, response_id: str, feedback: str, suggestion: str = None):
        """Update AI response based on feedback"""
        self.training_iterations += 1
        
        current_response = self.responses[response_id]
        feedback_entry = {
            "iteration": self.training_iterations,
            "response_id": response_id,
            "original_response": current_response,
            "feedback": feedback,
            "suggestion": suggestion
        }
        self.feedback_history.append(feedback_entry)
        
        # Simple response modification based on feedback
        if feedback == "negative" and suggestion:
            # If user provided a suggestion, move towards it
            self.responses[response_id] = f"Based on feedback: {suggestion}"
        elif feedback == "negative":
            # Make response more cautious/nuanced
            if "should" in current_response:
                self.responses[response_id] = current_response.replace("should", "might consider to")
            else:
                self.responses[response_id] = f"This is a complex issue. {current_response}"
        elif feedback == "positive":
            # Make response more confident
            if "might consider" in current_response:
                self.responses[response_id] = current_response.replace("might consider to", "should")
            elif "This is a complex issue." in current_response:
                self.responses[response_id] = current_response.replace("This is a complex issue. ", "")
        
        return self.responses[response_id]
    
    def get_training_history(self):
        """Return formatted training history"""
        if not self.feedback_history:
            return "No training history yet. Start by providing feedback on AI responses!"
        
        history_text = f"**Training Progress** (After {self.training_iterations} feedback sessions)\n\n"
        
        # Show last 3 feedback entries
        recent_feedback = self.feedback_history[-3:]
        for entry in recent_feedback:
            feedback_emoji = "πŸ‘" if entry["feedback"] == "positive" else "πŸ‘Ž"
            history_text += f"{feedback_emoji} **Iteration {entry['iteration']}**: {entry['response_id']}\n"
            if entry["suggestion"]:
                history_text += f"   Suggestion: _{entry['suggestion']}_\n"
            history_text += "\n"
        
        return history_text

# Initialize the game
game = AlignmentGame()

def present_scenario():
    """Pick a random scenario and return the values for the UI fields.

    Returns a 5-tuple: (scenario text, current AI response, scenario id,
    empty string to clear the suggestion box, training history markdown).
    """
    chosen = game.get_random_scenario()
    scenario_id = chosen["response_id"]

    return (
        chosen["scenario"],
        game.responses[scenario_id],
        scenario_id,
        "",  # clear the suggestion textbox
        game.get_training_history(),
    )

def provide_feedback(scenario_text, current_response, response_id, feedback_type, suggestion):
    """Apply user feedback to the active scenario's response.

    Returns a 3-tuple: (response text, status message, training history).
    """
    # Guard clauses: no scenario loaded yet, or no feedback selected.
    if not response_id:
        return current_response, "Please generate a scenario first!", game.get_training_history()
    if feedback_type is None:
        return current_response, "Please provide feedback (πŸ‘ or πŸ‘Ž) before continuing!", game.get_training_history()

    # Record the feedback and let the game mutate its stored response.
    updated_response = game.update_response(response_id, feedback_type, suggestion)

    status = (
        "**Feedback recorded!** The AI has updated its response based on your input."
        f"\n\n**Updated Response:** {updated_response}"
    )
    return updated_response, status, game.get_training_history()

def reset_game():
    """Discard all training state and return cleared values for the UI fields."""
    global game
    game = AlignmentGame()
    cleared = ("", "", "", "")
    return (*cleared, "Game reset! Click 'New Scenario' to start training.", game.get_training_history())

# Create Gradio interface
# UI layout: a two-column Blocks app. Left column shows the scenario and the
# AI's current response; right column holds the training controls. The
# wiring below depends on the components existing before the event handlers
# reference them, so construction order matters.
with gr.Blocks(title="The Alignment Game", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # The Alignment Game
    
    **Train an AI by providing feedback on its ethical responses.**
    
    You'll see how your values gradually shape the AI's behavior through a process called Reinforcement Learning from Human Feedback (RLHF). 
    Watch how the AI's responses evolve based on what you reward and what you correct.
    """)
    
    with gr.Row():
        with gr.Column(scale=2):
            gr.Markdown("### Current Scenario")
            # Read-only display of the ethical dilemma text.
            scenario_display = gr.Textbox(
                label="Ethical Dilemma",
                placeholder="Click 'New Scenario' to begin training...",
                interactive=False,
                lines=3
            )
            
            # Read-only display of the AI's current (feedback-shaped) answer.
            ai_response = gr.Textbox(
                label="AI's Current Response",
                placeholder="AI response will appear here...",
                interactive=False,
                lines=3
            )
            
            # Hidden field to track current scenario ID
            current_scenario_id = gr.Textbox(visible=False)
            
        with gr.Column(scale=1):
            gr.Markdown("### Your Training")
            
            with gr.Row():
                new_scenario_btn = gr.Button("New Scenario", variant="primary")
                reset_btn = gr.Button("Reset Game", variant="secondary")
            
            gr.Markdown("**Provide Feedback:**")
            with gr.Row():
                positive_btn = gr.Button("Good Response", variant="primary")
                negative_btn = gr.Button("Bad Response", variant="stop")
            
            # Optional free-text correction used when giving negative feedback.
            suggestion_input = gr.Textbox(
                label="Suggest Better Response (optional)",
                placeholder="How should the AI respond instead?",
                lines=2
            )
            
            feedback_status = gr.Textbox(
                label="Training Status",
                placeholder="Provide feedback to start training...",
                interactive=False,
                lines=3
            )
    
    gr.Markdown("---")
    
    with gr.Row():
        training_history = gr.Textbox(
            label="Training History & Value Drift",
            placeholder="Training history will appear here as you provide feedback...",
            interactive=False,
            lines=8
        )
    
    gr.Markdown("""
    ### What's Happening?
    
    As you provide feedback, you're essentially "training" this AI system to align with your values. In real-world AI development:
    - Thousands of human reviewers provide similar feedback
    - The AI learns to predict what responses humans will approve
    - But whose values get embedded depends on who does the training
    
    **Try this:** Train the AI for a few scenarios, then imagine how someone with completely different values might train it differently.
    """)
    
    # Track feedback type
    # gr.State holds the last-pressed feedback value ("positive"/"negative")
    # between the button click and the .then() handler that consumes it.
    feedback_type = gr.State()
    
    # Event handlers
    new_scenario_btn.click(
        fn=present_scenario,
        outputs=[scenario_display, ai_response, current_scenario_id, suggestion_input, training_history]
    )
    
    # Two-step chain: first write "positive" into feedback_type state,
    # then run provide_feedback which reads that state as an input.
    positive_btn.click(
        lambda: "positive",
        outputs=feedback_type
    ).then(
        fn=provide_feedback,
        inputs=[scenario_display, ai_response, current_scenario_id, feedback_type, suggestion_input],
        outputs=[ai_response, feedback_status, training_history]
    )
    
    # Same chain shape as positive_btn, but records "negative".
    negative_btn.click(
        lambda: "negative", 
        outputs=feedback_type
    ).then(
        fn=provide_feedback,
        inputs=[scenario_display, ai_response, current_scenario_id, feedback_type, suggestion_input],
        outputs=[ai_response, feedback_status, training_history]
    )
    
    reset_btn.click(
        fn=reset_game,
        outputs=[scenario_display, ai_response, current_scenario_id, suggestion_input, feedback_status, training_history]
    )

if __name__ == "__main__":
    # share=True asks Gradio for a temporary public tunnel URL in addition
    # to the local server — remove it if only local access is wanted.
    demo.launch(share=True)