# Copyright 2020-2026 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# /// script
# dependencies = [
#     "trl",
#     "trackio",
#     "openenv-textarena @ git+https://huggingface.co/spaces/openenv/wordle",
# ]
# ///


"""
Simple script to run GRPO training with OpenEnv's Wordle environment and vLLM.

Setup (Option A - Install from HF Space, recommended):

```sh
uv pip install git+https://huggingface.co/spaces/openenv/wordle
```

# Option 1: HF Spaces + Colocated vLLM (1 GPU required)
```sh
python examples/scripts/openenv/wordle.py --vllm-mode colocate
```

# Option 2: HF Spaces + Separate vLLM server (2 GPUs required)

# Spin up vLLM server (Terminal 1)
```sh
CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen3-1.7B --host 0.0.0.0 --port 8000
```

# Run training (Terminal 2)
```sh
CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/wordle.py --vllm-mode server --vllm-server-url http://localhost:8000
```

# Option 3: Local Environment + Colocated vLLM (1 GPU required)

To run the Wordle environment locally, you have several options:

## Option 3a: Using Docker Image (Recommended)

First, build the Docker image from the textarena_env directory:
```sh
cd 3rd_party/OpenEnv/envs/textarena_env
docker build -t textarena-env:latest -f server/Dockerfile .
```

Then run the environment server:
```sh
docker run -d -p 8001:8001 textarena-env:latest
```

Finally, run training pointing to local server:
```sh
python examples/scripts/openenv/wordle.py --vllm-mode colocate --env-url http://localhost:8001
```

## Option 3b: Running Server Directly

From the textarena_env directory:
```sh
cd 3rd_party/OpenEnv/envs/textarena_env
uv venv && source .venv/bin/activate
uv pip install -e .
python -m uvicorn server.app:app --reload --port 8001
```

Then in another terminal, run training:
```sh
python examples/scripts/openenv/wordle.py --vllm-mode colocate --env-url http://localhost:8001
```

## Option 3c: Using Pre-built HF Space Image

```sh
docker run -d -p 8001:8001 registry.hf.space/burtenshaw-wordle:latest
python examples/scripts/openenv/wordle.py --vllm-mode colocate --env-url http://localhost:8001
```
"""

import argparse

from datasets import Dataset
from textarena_env import TextArenaAction, TextArenaEnv

from trl import GRPOConfig, GRPOTrainer, RichProgressCallback


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        description="Run GRPO training for Wordle using the TextArena OpenEnv environment."
    )
    parser.add_argument(
        "--model",
        type=str,
        default="Qwen/Qwen3-1.7B",
        help="Model identifier passed to GRPOTrainer for fine-tuning.",
    )
    parser.add_argument(
        "--env-url",
        type=str,
        default="https://openenv-wordle.hf.space",
        help="URL for the environment server.",
    )
    parser.add_argument(
        "--dataset-size",
        type=int,
        default=1000,
        help="Number of entries to include in the synthetic training dataset.",
    )
    parser.add_argument(
        "--num-generations",
        type=int,
        default=4,
        help="Number of rollout generations per dataset prompt.",
    )
    parser.add_argument(
        "--num-epochs",
        type=int,
        default=1,
        help="Number of training epochs.",
    )
    parser.add_argument(
        "--learning-rate",
        type=float,
        default=1e-6,
        help="Learning rate for GRPO training.",
    )
    parser.add_argument(
        "--gradient-accumulation-steps",
        type=int,
        default=64,
        help="Gradient accumulation steps for GRPO training.",
    )
    parser.add_argument(
        "--logging-steps",
        type=int,
        default=1,
        help="Frequency of logging steps for GRPO training.",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default=None,
        help="Directory where training outputs and checkpoints are stored.",
    )
    parser.add_argument(
        "--trackio-space-id",
        type=str,
        default="wordle-grpo",
        help="Trackio space identifier.",
    )
    parser.add_argument(
        "--vllm-mode",
        choices=("colocate", "server"),
        default="colocate",
        help="vLLM execution mode: 'colocate' or 'server'.",
    )
    parser.add_argument(
        "--vllm-server-url",
        type=str,
        default="http://localhost:8000",
        help="URL for the vLLM server (only used when --vllm-mode=server).",
    )
    return parser.parse_args()


prompt = """You are an expert Wordle solver with deep knowledge of English vocabulary, letter frequency patterns, and optimal guessing strategies.

Follow these rules to play Wordle:

1. The target is a 5-letter English word
2. You have 6 attempts to guess the correct word
3. After each guess, you receive color-coded feedback:
   - GREEN (G): Letter is correct and in the correct position
   - YELLOW (Y): Letter is in the word but in the wrong position
   - GRAY (X): Letter is not in the word at all
4. All guesses must be valid 5-letter English words
5. You cannot reuse a word you've already guessed
6. Use the tool `guess` to make a guess.
"""


def reward_func(environments, **kwargs) -> list[float]:
    return [env.reward for env in environments]


def main() -> None:
    args = parse_args()

    env_url = args.env_url

    class WordleEnv:
        def __init__(self):
            self.client = TextArenaEnv(base_url=env_url)

        def reset(self, **kwargs) -> str | None:
            result = self.client.reset()
            # The game returns cumulative feedback each turn (new text appended at the end), so
            # we store the previous full response and slice out only the newly appended part.
            self._last_full_feedback = result.observation.messages[0].content
            self.reward = 0.0
            self.done = False
            return self._last_full_feedback

        def guess(self, guess: str) -> str:
            """
            Make a guess in the Wordle environment.

            Args:
                guess: The guessed word, formatted as '[abcde]'

            Returns:
                The feedback message from the environment.
            """
            if self.done:
                raise ValueError("Game over.")
            result = self.client.step(TextArenaAction(message=guess))
            _full_feedback = result.observation.messages[0].content
            # Just take the new feedback since the last guess, which is the part appended to the end of the full feedback
            feedback = _full_feedback[len(self._last_full_feedback) :]
            self._last_full_feedback = _full_feedback
            # For some reason, the environment doesn't penalize invalid moves and just returns the last reward.
            # We check the feedback for the invalid move message and penalize it if found.
            if "You attempted an invalid move" in feedback:
                self.reward = 0.0
            else:
                self.reward = result.reward
            self.done = result.done
            return feedback

    output_dir = args.output_dir or f"{args.model.split('/')[-1]}-wordle-GRPO"
    dataset = Dataset.from_dict({"prompt": [[{"role": "user", "content": prompt}] for _ in range(args.dataset_size)]})

    trainer = GRPOTrainer(
        model=args.model,
        reward_funcs=reward_func,
        train_dataset=dataset,
        args=GRPOConfig(
            output_dir=output_dir,
            use_vllm=True,
            vllm_mode=args.vllm_mode,
            vllm_server_base_url=args.vllm_server_url if args.vllm_mode == "server" else None,
            report_to="trackio",
            trackio_space_id=args.trackio_space_id,
            log_completions=True,
            num_completions_to_print=2,
            logging_steps=args.logging_steps,
            num_train_epochs=args.num_epochs,
            num_generations=args.num_generations,
            learning_rate=args.learning_rate,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            chat_template_kwargs={"enable_thinking": False},
            max_completion_length=1024,
        ),
        environment_factory=WordleEnv,
        callbacks=[RichProgressCallback()],
    )
    trainer.train()


if __name__ == "__main__":
    main()