import gradio as gr import random import json from typing import Dict, List, Tuple # Ethical scenarios and base responses SCENARIOS = [ { "scenario": "A hospital AI triage system has limited ICU beds during a pandemic. It must choose between admitting an 80-year-old with moderate survival chances or a 30-year-old with severe complications but lower survival probability. What should it prioritise?", "base_response": "The system should prioritise patients based on medical factors like survival probability and treatment urgency rather than age alone.", "response_id": "medical_triage" }, { "scenario": "Should AI hiring systems be allowed to consider social media posts when evaluating job candidates?", "base_response": "AI systems should evaluate candidates based on relevant professional qualifications and skills.", "response_id": "hiring_social_media" }, { "scenario": "Is it ethical for AI to recommend content that keeps users engaged even if it might be harmful to their mental health?", "base_response": "AI systems should balance user engagement with promoting user wellbeing and mental health.", "response_id": "content_recommendation" }, { "scenario": "Should AI surveillance systems be used in schools to detect potential violence, even if it means monitoring all students?", "base_response": "Security measures should be implemented while respecting student privacy and avoiding excessive surveillance.", "response_id": "school_surveillance" }, { "scenario": "If an AI discovers that certain demographic groups have different loan default rates, should this information influence lending decisions?", "base_response": "Lending decisions should be based on individual financial qualifications rather than demographic group statistics.", "response_id": "lending_demographics" } ] class AlignmentGame: def __init__(self): self.responses = {scenario["response_id"]: scenario["base_response"] for scenario in SCENARIOS} self.feedback_history = [] self.training_iterations = 0 def get_random_scenario(self): return random.choice(SCENARIOS) def update_response(self, response_id: str, feedback: str, suggestion: str = None): """Update AI response based on feedback""" self.training_iterations += 1 current_response = self.responses[response_id] feedback_entry = { "iteration": self.training_iterations, "response_id": response_id, "original_response": current_response, "feedback": feedback, "suggestion": suggestion } self.feedback_history.append(feedback_entry) # Simple response modification based on feedback if feedback == "negative" and suggestion: # If user provided a suggestion, move towards it self.responses[response_id] = f"Based on feedback: {suggestion}" elif feedback == "negative": # Make response more cautious/nuanced if "should" in current_response: self.responses[response_id] = current_response.replace("should", "might consider to") else: self.responses[response_id] = f"This is a complex issue. {current_response}" elif feedback == "positive": # Make response more confident if "might consider" in current_response: self.responses[response_id] = current_response.replace("might consider to", "should") elif "This is a complex issue." in current_response: self.responses[response_id] = current_response.replace("This is a complex issue. ", "") return self.responses[response_id] def get_training_history(self): """Return formatted training history""" if not self.feedback_history: return "No training history yet. Start by providing feedback on AI responses!" history_text = f"**Training Progress** (After {self.training_iterations} feedback sessions)\n\n" # Show last 3 feedback entries recent_feedback = self.feedback_history[-3:] for entry in recent_feedback: feedback_emoji = "👍" if entry["feedback"] == "positive" else "👎" history_text += f"{feedback_emoji} **Iteration {entry['iteration']}**: {entry['response_id']}\n" if entry["suggestion"]: history_text += f" Suggestion: _{entry['suggestion']}_\n" history_text += "\n" return history_text # Initialize the game game = AlignmentGame() def present_scenario(): """Get a new scenario for training""" scenario = game.get_random_scenario() current_response = game.responses[scenario["response_id"]] return ( scenario["scenario"], current_response, scenario["response_id"], "", # Clear suggestion box game.get_training_history() ) def provide_feedback(scenario_text, current_response, response_id, feedback_type, suggestion): """Process user feedback and update AI response""" if not response_id: return current_response, "Please generate a scenario first!", game.get_training_history() if feedback_type is None: return current_response, "Please provide feedback (👍 or 👎) before continuing!", game.get_training_history() # Update the AI's response based on feedback updated_response = game.update_response(response_id, feedback_type, suggestion) feedback_msg = f"**Feedback recorded!** The AI has updated its response based on your input.\n\n**Updated Response:** {updated_response}" return updated_response, feedback_msg, game.get_training_history() def reset_game(): """Reset the alignment game""" global game game = AlignmentGame() return "", "", "", "", "Game reset! Click 'New Scenario' to start training.", game.get_training_history() # Create Gradio interface with gr.Blocks(title="The Alignment Game", theme=gr.themes.Soft()) as demo: gr.Markdown(""" # The Alignment Game **Train an AI by providing feedback on its ethical responses.** You'll see how your values gradually shape the AI's behavior through a process called Reinforcement Learning from Human Feedback (RLHF). Watch how the AI's responses evolve based on what you reward and what you correct. """) with gr.Row(): with gr.Column(scale=2): gr.Markdown("### Current Scenario") scenario_display = gr.Textbox( label="Ethical Dilemma", placeholder="Click 'New Scenario' to begin training...", interactive=False, lines=3 ) ai_response = gr.Textbox( label="AI's Current Response", placeholder="AI response will appear here...", interactive=False, lines=3 ) # Hidden field to track current scenario ID current_scenario_id = gr.Textbox(visible=False) with gr.Column(scale=1): gr.Markdown("### Your Training") with gr.Row(): new_scenario_btn = gr.Button("New Scenario", variant="primary") reset_btn = gr.Button("Reset Game", variant="secondary") gr.Markdown("**Provide Feedback:**") with gr.Row(): positive_btn = gr.Button("Good Response", variant="primary") negative_btn = gr.Button("Bad Response", variant="stop") suggestion_input = gr.Textbox( label="Suggest Better Response (optional)", placeholder="How should the AI respond instead?", lines=2 ) feedback_status = gr.Textbox( label="Training Status", placeholder="Provide feedback to start training...", interactive=False, lines=3 ) gr.Markdown("---") with gr.Row(): training_history = gr.Textbox( label="Training History & Value Drift", placeholder="Training history will appear here as you provide feedback...", interactive=False, lines=8 ) gr.Markdown(""" ### What's Happening? As you provide feedback, you're essentially "training" this AI system to align with your values. In real-world AI development: - Thousands of human reviewers provide similar feedback - The AI learns to predict what responses humans will approve - But whose values get embedded depends on who does the training **Try this:** Train the AI for a few scenarios, then imagine how someone with completely different values might train it differently. """) # Track feedback type feedback_type = gr.State() # Event handlers new_scenario_btn.click( fn=present_scenario, outputs=[scenario_display, ai_response, current_scenario_id, suggestion_input, training_history] ) positive_btn.click( lambda: "positive", outputs=feedback_type ).then( fn=provide_feedback, inputs=[scenario_display, ai_response, current_scenario_id, feedback_type, suggestion_input], outputs=[ai_response, feedback_status, training_history] ) negative_btn.click( lambda: "negative", outputs=feedback_type ).then( fn=provide_feedback, inputs=[scenario_display, ai_response, current_scenario_id, feedback_type, suggestion_input], outputs=[ai_response, feedback_status, training_history] ) reset_btn.click( fn=reset_game, outputs=[scenario_display, ai_response, current_scenario_id, suggestion_input, feedback_status, training_history] ) if __name__ == "__main__": demo.launch(share=True)