"""Gradio demo that simulates training a language model to play Solitaire.

Nothing in this module updates model weights: the "training" and "game play"
callbacks return simulated values so the UI can be exercised end to end.
"""

import json
import random
from typing import Any, Dict, List, Tuple

import gradio as gr
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


class SolitaireEnvironment:
    """Simplified Solitaire state: a deck, four foundations and seven tableau piles.

    Cards are plain integer ranks (1-13); suits are not tracked.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Shuffle a fresh 52-card deck and deal a new game."""
        # Ranks 1-13 repeated four times (one run per suit).
        self.deck = list(range(1, 14)) * 4
        random.shuffle(self.deck)
        self.foundation = [[], [], [], []]  # Four foundation piles
        self.tableau = [[] for _ in range(7)]  # Seven tableau piles
        self.deal_cards()

    def deal_cards(self):
        """Deal ``i + 1`` cards to tableau pile ``i``, removing them from the deck."""
        for i in range(7):
            self.tableau[i] = self.deck[:i + 1]
            self.deck = self.deck[i + 1:]

    def get_valid_moves(self):
        """Return up to five candidate move descriptions.

        This is a demonstration stub: every non-empty tableau pile offers its
        top card to the foundation without any legality check.
        """
        moves = [
            f"Move {pile[-1]} to foundation"
            for pile in self.tableau
            if pile
        ]
        # TODO: tableau-to-tableau moves are not implemented yet.
        return moves[:5]  # Limit to 5 moves for simplicity


class SolitaireRLTrainer:
    """Holds the environment and tokenizer used by the (simulated) RL loop."""

    def __init__(self):
        self.env = SolitaireEnvironment()
        self.model_name = "mistralai/Mistral-7B-v0.1"
        # Only the tokenizer is loaded here; no model weights are instantiated.
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        if self.tokenizer.pad_token is None:
            # Fall back to EOS as the padding token when none is configured.
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def get_game_state(self):
        """Return a snapshot of the environment as a plain dict."""
        return {
            "tableau": self.env.tableau,
            "foundation": self.env.foundation,
            "remaining_deck": len(self.env.deck),
        }

    def train_step(self, state_description: str, action: str, reward: float):
        """Report a training step; no weights are updated in this demo."""
        # In a real implementation, this would update the model weights.
        return f"Training step completed. Reward: {reward}"

    def get_reward(self, action: str):
        """Return 1.0 for actions mentioning the foundation, else 0.0."""
        if "foundation" in action:
            return 1.0
        return 0.0


class MistralSolitaireAgent:
    """Thin wrapper pairing a trainer with a move history."""

    def __init__(self):
        self.trainer = SolitaireRLTrainer()
        self.game_history = []

    def take_action(self, action: str):
        """Simulate an action and return its reward.

        Actions containing "move" (case-insensitive) get a random reward in
        [0, 1]; anything else gets 0.0 so a value is always returned.
        """
        if "move" in action.lower():
            return random.uniform(0, 1)
        return 0.0


def train_mistral_solitaire(num_episodes: int, learning_rate: float):
    """Train Mistral model to play Solitaire using reinforcement learning.

    The loop is simulated: each episode's reward is ``episode * 0.1``.

    Args:
        num_episodes: Number of episodes to simulate. Coerced to ``int``
            because UI sliders may deliver a float.
        learning_rate: Accepted for interface compatibility; currently unused.

    Returns:
        A list of dicts with ``episode``, ``reward`` and ``progress`` (percent).
    """
    num_episodes = int(num_episodes)
    # NOTE: constructing the agent loads the tokenizer (via SolitaireRLTrainer);
    # the simulated loop below does not use the agent yet.
    agent = MistralSolitaireAgent()
    progress = []
    for episode in range(num_episodes):
        # Simulate training progress
        current_reward = episode * 0.1
        progress.append({
            "episode": episode,
            "reward": current_reward,
            "progress": (episode + 1) / num_episodes * 100,
        })
    return progress


def play_solitaire_game(state_description: str, action: str):
    """Execute a move in the Solitaire game.

    Args:
        state_description: Textual game state from the UI (not parsed here).
        action: Free-text action; its wording determines the simulated reward.

    Returns:
        A dict with ``action_taken``, ``reward``, ``new_state`` and ``is_valid``.
    """
    # In a real implementation, this would modify the actual game state.
    # The random state below is a placeholder and is not part of the result.
    game_state = {
        "tableau": [
            [random.randint(1, 13) for _ in range(random.randint(1, 5))]
            for _ in range(7)
        ],
    }

    # Calculate reward based on action quality
    if "foundation" in action:
        reward = 0.8
    elif "tableau" in action:
        reward = 0.5
    else:
        reward = 0.2

    return {
        "action_taken": action,
        "reward": reward,
        "new_state": f"Game state after {action}",
        "is_valid": True,
    }


def format_game_state(state: Dict) -> str:
    """Format the current Solitaire game state for display.

    Only the last three cards of each tableau pile are shown; empty piles are
    rendered as "Empty". A missing "tableau" key yields just the headings.
    """
    parts = ["## Current Solitaire Game State\n\n", "### Tableau Piles\n"]
    for number, pile in enumerate(state.get("tableau", []), start=1):
        pile_str = " | ".join(str(card) for card in pile[-3:]) if pile else "Empty"
        parts.append(f"- Pile {number}: {pile_str}\n")
    parts.append("\n")
    return "".join(parts)


def create_solitaire_ui():
    """Create the main Gradio interface for the Solitaire RL project.

    Returns:
        The assembled ``gr.Blocks`` app (not yet launched).
    """
    with gr.Blocks() as demo:
        gr.Markdown("# 🎮 Mistral 3B Solitaire RL Trainer")
        gr.Markdown("Train Mistral 3B to play Solitaire using Reinforcement Learning")

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### 🏗️ Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)")

        with gr.Tab("Training Interface"):
            with gr.Row():
                episodes = gr.Slider(
                    label="Number of Training Episodes",
                    minimum=10,
                    maximum=1000,
                    value=100,
                    step=10,
                    info="More episodes = better training but longer wait",
                )
                learning_rate = gr.Slider(
                    label="Learning Rate",
                    minimum=0.001,
                    maximum=0.1,
                    value=0.01,
                    step=0.001,
                )
            train_btn = gr.Button("Start Training", variant="primary")
            training_output = gr.JSON(label="Training Progress")
            train_btn.click(
                fn=train_mistral_solitaire,
                inputs=[episodes, learning_rate],
                outputs=[training_output],
                api_visibility="public",
            )

        with gr.Tab("Game Play"):
            with gr.Row():
                game_state = gr.Textbox(
                    label="Current Game State",
                    value="A♠ 2♠ 3♠ | K♥ | Q♦ | J♣",
                    lines=3,
                )
            with gr.Row():
                action_input = gr.Textbox(
                    label="Action to Take",
                    placeholder="e.g., Move A♠ to foundation, Draw from deck",
                )
            play_btn = gr.Button("Execute Move", variant="secondary")
            game_result = gr.JSON(label="Game Result")
            play_btn.click(
                fn=play_solitaire_game,
                inputs=[game_state, action_input],
                outputs=[game_result],
                api_visibility="public",
            )

        with gr.Tab("Analysis"):
            with gr.Row():
                move_history = gr.Textbox(
                    label="Move History",
                    lines=4,
                )
            with gr.Accordion("Advanced Options", open=False):
                exploration_rate = gr.Slider(
                    label="Exploration Rate",
                    minimum=0.01,
                    maximum=1.0,
                    value=0.1,
                    step=0.01,
                    info="Higher exploration = more experimentation",
                )

        gr.Markdown("---\n*This demo simulates training a language model to play Solitaire*")

    return demo


if __name__ == "__main__":
    demo = create_solitaire_ui()
    demo.launch(
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="indigo",
            neutral_hue="slate",
            font=gr.themes.GoogleFont("Inter"),
            text_size="lg",
            spacing_size="lg",
            radius_size="md",
        ).set(
            button_primary_background_fill="*primary_600",
            button_primary_background_fill_hover="*primary_700",
        ),
        footer_links=[
            {
                "label": "Built with anycoder",
                "url": "https://huggingface.co/spaces/akhaliq/anycoder",
            },
        ],
    )