Buckets:
| import{s as ct,o as mt,n as dt}from"../chunks/scheduler.7b731bd4.js";import{S as pt,i as ut,e as s,s as o,c as d,h as gt,a as l,d as n,b as r,f as ce,g as p,j as c,k as ee,l as _,m as a,n as u,t as g,o as f,p as h}from"../chunks/index.cc268345.js";import{C as ft,H as te,E as ht}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.f0d99f98.js";import{D as it}from"../chunks/Docstring.03f7b462.js";import{C as me}from"../chunks/CodeBlock.169a125f.js";import{E as _t}from"../chunks/ExampleCodeBlock.415f9452.js";/** vt: fragment factory used as the default slot of the example block (it is passed as $$slots.default to the _t component further down in this file). It prepares a "p" element whose text is "Example:", a node obtained from o(), and one instance of the me component (imported from CodeBlock) carrying the AsyncGRPOTrainer usage snippet. The parameter de is accepted but never read inside this function. Returns an object exposing the hooks c, l, m, p, i, o and d. NOTE(review): the single-letter helpers (s, o, d, l, c, r, p, a, u, g, f, h, n) are minified aliases imported from ../chunks/index and look like the Svelte runtime helpers - confirm against that chunk before relying on their semantics. The code prop appears to be the base64 form of the URL-encoded snippet and highlighted its pre-rendered HTML - TODO confirm with the CodeBlock component. */function vt(de){let m,C="Example:",x,y,b;return y=new me({props:{code:"ZnJvbSUyMHRybC5leHBlcmltZW50YWwuYXN5bmNfZ3JwbyUyMGltcG9ydCUyMEFzeW5jR1JQT1RyYWluZXIlMEFmcm9tJTIwdHJsLnJld2FyZHMlMjBpbXBvcnQlMjBhY2N1cmFjeV9yZXdhcmQlMEFmcm9tJTIwZGF0YXNldHMlMjBpbXBvcnQlMjBsb2FkX2RhdGFzZXQlMEElMEFkYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMnRybC1saWIlMkZEZWVwTWF0aC0xMDNLJTIyJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiklMEElMEF0cmFpbmVyJTIwJTNEJTIwQXN5bmNHUlBPVHJhaW5lciglMEElMjAlMjAlMjAlMjBtb2RlbCUzRCUyMlF3ZW4lMkZRd2VuMi41LTAuNUItSW5zdHJ1Y3QlMjIlMkMlMEElMjAlMjAlMjAlMjByZXdhcmRfZnVuY3MlM0RhY2N1cmFjeV9yZXdhcmQlMkMlMEElMjAlMjAlMjAlMjB0cmFpbl9kYXRhc2V0JTNEZGF0YXNldCUyQyUwQSklMEF0cmFpbmVyLnRyYWluKCk=",highlighted:`<span class="hljs-keyword">from</span> trl.experimental.async_grpo <span class="hljs-keyword">import</span> AsyncGRPOTrainer | |
| <span class="hljs-keyword">from</span> trl.rewards <span class="hljs-keyword">import</span> accuracy_reward | |
| <span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| dataset = load_dataset(<span class="hljs-string">"trl-lib/DeepMath-103K"</span>, split=<span class="hljs-string">"train"</span>) | |
| trainer = AsyncGRPOTrainer( | |
| model=<span class="hljs-string">"Qwen/Qwen2.5-0.5B-Instruct"</span>, | |
| reward_funcs=accuracy_reward, | |
| train_dataset=dataset, | |
| ) | |
| trainer.train()`,wrap:!1}}),{/* c: builds m (a "p" element whose textContent is C), x (from o()) and the child component fragment from scratch */c(){m=s("p"),m.textContent=C,x=o(),d(y.$$.fragment)},/* l: obtains the same nodes from the existing nodes i instead of creating them; the text is rewritten only when c(m) differs from the expected "svelte-11lpom8" marker - presumably the hydration path, verify */l(i){m=l(i,"P",{"data-svelte-h":!0}),c(m)!=="svelte-11lpom8"&&(m.textContent=C),x=r(i),p(y.$$.fragment,i)},/* m: hands m, x and the child component to a()/u() with arguments (i, w) - presumably target and anchor - then sets the flag b to true */m(i,w){a(i,m,w),a(i,x,w),u(y,i,w),b=!0},/* p: the update hook is dt, imported from the scheduler chunk (presumably a no-op - confirm) */p:dt,/* i: runs g on the child fragment only while b is false, then sets b to true */i(i){b||(g(y.$$.fragment,i),b=!0)},/* o: runs f on the child fragment and resets b to false */o(i){f(y.$$.fragment,i),b=!1},/* d: when i is truthy passes m and x to n(); h is always called on the child component */d(i){i&&(n(m),n(x)),h(y,i)}}}function yt(de){let m,C,x,y,b,i,w,pe,T,ne,We="This trainer requires <code>vllm>=0.17.1</code> and <code>transformers>=5.2.0</code>. For distributed training, only FSDP2 is supported (DeepSpeed ZeRO is not).",Ie,ae,ze="Currently, <code>vllm</code> and <code>transformers</code> have conflicting dependency constraints. To work around this, install vLLM first and then force-install transformers:",Oe,P,ue,G,ge,J,Ze='<code>AsyncGRPOTrainer</code> implements the same <a href="grpo_trainer">GRPO</a> algorithm but decouples rollout generation from training. A background worker continuously streams completions from a vLLM server while the training loop consumes them, so generation and gradient updates overlap instead of alternating. The API mirrors <a href="/docs/trl/pr_5607/en/gspo_token#trl.GRPOTrainer">GRPOTrainer</a> — for full details on the GRPO method itself (advantage computation, KL estimation, loss formulation, reward functions, etc.), see the <a href="grpo_trainer">GRPO Trainer</a> documentation. Not all features from <a href="/docs/trl/pr_5607/en/gspo_token#trl.GRPOTrainer">GRPOTrainer</a> are available; refer to <code>AsyncGRPOConfig</code> for the supported parameters.',fe,L,De='This trainer was contributed by <a href="https://huggingface.co/qgallouedec" rel="nofollow">Quentin Gallouédec</a> and <a href="https://huggingface.co/aminediroHF" rel="nofollow">Amine Dirhoussi</a>.',he,A,_e,U,Xe='In the standard <a href="/docs/trl/pr_5607/en/gspo_token#trl.GRPOTrainer">GRPOTrainer</a>, generation and training are sequential: generate a batch, compute the loss, update weights, repeat. 
Even in <a href="grpo_trainer#speed-up-training-with-vllm">vLLM colocate mode</a>, where generation runs on the same GPUs, one phase must finish before the other begins.',ve,F,Qe="<code>AsyncGRPOTrainer</code> separates these two concerns:",ye,j,Se="<li><strong>Rollout worker</strong> (background thread) — sends prompts to a vLLM server, scores completions with reward functions, computes advantages, and pushes ready-to-train samples into a queue.</li> <li><strong>Training loop</strong> (main process) — pulls samples from the queue, computes the clipped surrogate loss, and updates the model weights.</li>",be,I,Ye="After every <code>weight_sync_steps</code> training steps, the updated weights are transferred to the vLLM server via NCCL so that subsequent generations reflect the latest policy.",we,O,Ke="Because generation and training run concurrently, the training samples may have been generated by a slightly older version of the model. The <code>max_staleness</code> parameter controls how many weight updates a sample can lag behind before being discarded.",Te,E,et="The number of concurrent requests sent to the vLLM server is controlled by <code>max_inflight_tasks</code>. By default it is set automatically to <code>max_staleness × per_device_train_batch_size × gradient_accumulation_steps × num_processes</code> — the maximum number of samples the trainer can consume before they become stale. Generating more than this is wasteful since the excess samples will be discarded.",Me,B,xe,H,Ne,V,tt="The vLLM server and the trainer must run on <strong>separate GPUs</strong>. Use <code>CUDA_VISIBLE_DEVICES</code> to partition your GPUs. For example, with 2 GPUs, you can run the vLLM server on GPU 0 and the trainer on GPU 1 as follows:",ke,q,$e,k,nt="<p>Set <code>--max-model-len</code> to the maximum total sequence length (prompt + completion) you expect. A lower value reduces GPU memory usage on the server, freeing more memory for the KV cache and increasing throughput. 
A good starting point is the prompt length plus <code>max_completion_length</code> from your config.</p>",Re,W,Ce,z,Pe,Z,at="This trainer is intentionally kept minimal and is not meant to grow into a general-purpose solution. If you need a feature that is not supported, we recommend cloning the repository and adapting the trainer to your needs directly. New features will only be considered when there is significant community demand.",Ge,D,Je,v,X,Ee,oe,ot="Configuration class for the <code>AsyncGRPOTrainer</code>.",Be,re,rt=`This class includes only the parameters that are specific to asynchronous GRPO training. For a full list of | |
| training arguments, please refer to the <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments" rel="nofollow">TrainingArguments</a> documentation. Note that default values | |
| in this class may differ from those in <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments" rel="nofollow">TrainingArguments</a>.`,He,Q,st='<p>These parameters have default values different from <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments" rel="nofollow">TrainingArguments</a>:</p> <ul><li><code>logging_steps</code>: Defaults to <code>10</code> instead of <code>500</code>.</li> <li><code>gradient_checkpointing</code>: Defaults to <code>True</code> instead of <code>False</code>.</li> <li><code>bf16</code>: Defaults to <code>True</code> if <code>fp16</code> is not set, instead of <code>False</code>.</li> <li><code>learning_rate</code>: Defaults to <code>1e-6</code> instead of <code>5e-5</code>.</li></ul>',Le,S,Ae,M,Y,Ve,se,lt=`Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the | |
| paper <a href="https://huggingface.co/papers/2402.03300" rel="nofollow">DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language | |
| Models</a>. This trainer is the asynchronous version of GRPO, where | |
| generation is offloaded to an external vLLM server that runs asynchronously alongside training, decoupling rollout | |
| from the gradient update loop.`,qe,$,Ue,K,Fe,ie,je;return b=new ft({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),w=new te({props:{title:"Asynchronous GRPO",local:"asynchronous-grpo",headingTag:"h1"}}),P=new me({props:{code:"cGlwJTIwaW5zdGFsbCUyMCd2bGxtJTNFJTNEMC4xNy4xJyUwQXBpcCUyMGluc3RhbGwlMjAndHJhbnNmb3JtZXJzJTNFJTNENS4yLjAnJTIwLS1uby1kZXBz",highlighted:`pip install <span class="hljs-string">'vllm>=0.17.1'</span> | |
| pip install <span class="hljs-string">'transformers>=5.2.0'</span> --no-deps`,wrap:!1}}),G=new te({props:{title:"Overview",local:"overview",headingTag:"h2"}}),A=new te({props:{title:"How it differs from GRPOTrainer",local:"how-it-differs-from-grpotrainer",headingTag:"h2"}}),B=new te({props:{title:"Quick start",local:"quick-start",headingTag:"h2"}}),H=new me({props:{code:"JTIzJTIwdHJhaW5fYXN5bmNfZ3Jwby5weSUwQWZyb20lMjBkYXRhc2V0cyUyMGltcG9ydCUyMGxvYWRfZGF0YXNldCUwQWZyb20lMjB0cmwuZXhwZXJpbWVudGFsLmFzeW5jX2dycG8lMjBpbXBvcnQlMjBBc3luY0dSUE9UcmFpbmVyJTBBZnJvbSUyMHRybC5yZXdhcmRzJTIwaW1wb3J0JTIwYWNjdXJhY3lfcmV3YXJkJTBBJTBBZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJ0cmwtbGliJTJGRGVlcE1hdGgtMTAzSyUyMiUyQyUyMHNwbGl0JTNEJTIydHJhaW4lMjIpJTBBJTBBdHJhaW5lciUyMCUzRCUyMEFzeW5jR1JQT1RyYWluZXIoJTBBJTIwJTIwJTIwJTIwbW9kZWwlM0QlMjJRd2VuJTJGUXdlbjMtNEIlMjIlMkMlMEElMjAlMjAlMjAlMjByZXdhcmRfZnVuY3MlM0RhY2N1cmFjeV9yZXdhcmQlMkMlMEElMjAlMjAlMjAlMjB0cmFpbl9kYXRhc2V0JTNEZGF0YXNldCUyQyUwQSklMEF0cmFpbmVyLnRyYWluKCk=",highlighted:`<span class="hljs-comment"># train_async_grpo.py</span> | |
| <span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-keyword">from</span> trl.experimental.async_grpo <span class="hljs-keyword">import</span> AsyncGRPOTrainer | |
| <span class="hljs-keyword">from</span> trl.rewards <span class="hljs-keyword">import</span> accuracy_reward | |
| dataset = load_dataset(<span class="hljs-string">"trl-lib/DeepMath-103K"</span>, split=<span class="hljs-string">"train"</span>) | |
| trainer = AsyncGRPOTrainer( | |
| model=<span class="hljs-string">"Qwen/Qwen3-4B"</span>, | |
| reward_funcs=accuracy_reward, | |
| train_dataset=dataset, | |
| ) | |
| trainer.train()`,wrap:!1}}),q=new me({props:{code:"JTIzJTIwVGVybWluYWwlMjAxJTNBJTIwdkxMTSUyMHNlcnZlciUyMG9uJTIwR1BVJTIwMCUyMChkZXYlMjBtb2RlJTIwJTJCJTIwTkNDTCUyMHdlaWdodCUyMHRyYW5zZmVyJTIwYXJlJTIwcmVxdWlyZWQpJTBBQ1VEQV9WSVNJQkxFX0RFVklDRVMlM0QwJTIwVkxMTV9TRVJWRVJfREVWX01PREUlM0QxJTIwdmxsbSUyMHNlcnZlJTIwUXdlbiUyRlF3ZW4zLTRCJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1tYXgtbW9kZWwtbGVuJTIwNDA5NiUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tbG9ncHJvYnMtbW9kZSUyMHByb2Nlc3NlZF9sb2dwcm9icyUyMCU1QyUwQSUyMCUyMCUyMCUyMC0td2VpZ2h0LXRyYW5zZmVyLWNvbmZpZyUyMCclN0IlMjJiYWNrZW5kJTIyJTNBJTIybmNjbCUyMiU3RCc=",highlighted:`<span class="hljs-comment"># Terminal 1: vLLM server on GPU 0 (dev mode + NCCL weight transfer are required)</span> | |
| CUDA_VISIBLE_DEVICES=0 VLLM_SERVER_DEV_MODE=1 vllm serve Qwen/Qwen3-4B \\ | |
| --max-model-len 4096 \\ | |
| --logprobs-mode processed_logprobs \\ | |
| --weight-transfer-config <span class="hljs-string">'{"backend":"nccl"}'</span>`,wrap:!1}}),W=new me({props:{code:"JTIzJTIwVGVybWluYWwlMjAyJTNBJTIwdHJhaW5pbmclMjBvbiUyMEdQVSUyMDElMEFDVURBX1ZJU0lCTEVfREVWSUNFUyUzRDElMjBhY2NlbGVyYXRlJTIwbGF1bmNoJTIwdHJhaW5fYXN5bmNfZ3Jwby5weQ==",highlighted:`<span class="hljs-comment"># Terminal 2: training on GPU 1</span> | |
| CUDA_VISIBLE_DEVICES=1 accelerate launch train_async_grpo.py`,wrap:!1}}),z=new te({props:{title:"Design philosophy",local:"design-philosophy",headingTag:"h2"}}),D=new te({props:{title:"AsyncGRPOConfig",local:"trl.experimental.async_grpo.AsyncGRPOConfig",headingTag:"h2"}}),X=new it({props:{name:"class trl.experimental.async_grpo.AsyncGRPOConfig",anchor:"trl.experimental.async_grpo.AsyncGRPOConfig",parameters:[{name:"output_dir",val:": str | None = None"},{name:"per_device_train_batch_size",val:": int = 8"},{name:"num_train_epochs",val:": float = 3.0"},{name:"max_steps",val:": int = -1"},{name:"learning_rate",val:": float = 1e-06"},{name:"lr_scheduler_type",val:": transformers.trainer_utils.SchedulerType | str = 'linear'"},{name:"lr_scheduler_kwargs",val:": dict | str | None = None"},{name:"warmup_steps",val:": float = 0"},{name:"optim",val:": transformers.training_args.OptimizerNames | str = 'adamw_torch_fused'"},{name:"optim_args",val:": str | None = None"},{name:"weight_decay",val:": float = 0.0"},{name:"adam_beta1",val:": float = 0.9"},{name:"adam_beta2",val:": float = 0.999"},{name:"adam_epsilon",val:": float = 1e-08"},{name:"optim_target_modules",val:": None | str | list[str] = None"},{name:"gradient_accumulation_steps",val:": int = 1"},{name:"average_tokens_across_devices",val:": bool = True"},{name:"max_grad_norm",val:": float = 1.0"},{name:"label_smoothing_factor",val:": float = 0.0"},{name:"bf16",val:": bool | None = None"},{name:"fp16",val:": bool = False"},{name:"bf16_full_eval",val:": bool = False"},{name:"fp16_full_eval",val:": bool = False"},{name:"tf32",val:": bool | None = None"},{name:"gradient_checkpointing",val:": bool = True"},{name:"gradient_checkpointing_kwargs",val:": dict[str, typing.Any] | str | None = None"},{name:"torch_compile",val:": bool = False"},{name:"torch_compile_backend",val:": str | None = None"},{name:"torch_compile_mode",val:": str | None = None"},{name:"use_liger_kernel",val:": bool = 
False"},{name:"liger_kernel_config",val:": dict[str, bool] | None = None"},{name:"use_cache",val:": bool = False"},{name:"neftune_noise_alpha",val:": float | None = None"},{name:"torch_empty_cache_steps",val:": int | None = None"},{name:"auto_find_batch_size",val:": bool = False"},{name:"logging_strategy",val:": transformers.trainer_utils.IntervalStrategy | str = 'steps'"},{name:"logging_steps",val:": float = 1"},{name:"logging_first_step",val:": bool = False"},{name:"log_on_each_node",val:": bool = True"},{name:"logging_nan_inf_filter",val:": bool = True"},{name:"include_num_input_tokens_seen",val:": str | bool = 'no'"},{name:"log_level",val:": str = 'passive'"},{name:"log_level_replica",val:": str = 'warning'"},{name:"disable_tqdm",val:": bool | None = None"},{name:"report_to",val:": None | str | list[str] = 'none'"},{name:"run_name",val:": str | None = None"},{name:"project",val:": str = 'huggingface'"},{name:"trackio_space_id",val:": str | None = 'trackio'"},{name:"eval_strategy",val:": transformers.trainer_utils.IntervalStrategy | str = 'no'"},{name:"eval_steps",val:": float | None = None"},{name:"eval_delay",val:": float = 0"},{name:"per_device_eval_batch_size",val:": int = 8"},{name:"prediction_loss_only",val:": bool = False"},{name:"eval_on_start",val:": bool = False"},{name:"eval_do_concat_batches",val:": bool = True"},{name:"eval_use_gather_object",val:": bool = False"},{name:"eval_accumulation_steps",val:": int | None = None"},{name:"include_for_metrics",val:": list = <factory>"},{name:"batch_eval_metrics",val:": bool = False"},{name:"save_only_model",val:": bool = False"},{name:"save_strategy",val:": transformers.trainer_utils.SaveStrategy | str = 'steps'"},{name:"save_steps",val:": float = 500"},{name:"save_on_each_node",val:": bool = False"},{name:"save_total_limit",val:": int | None = None"},{name:"enable_jit_checkpoint",val:": bool = False"},{name:"push_to_hub",val:": bool = False"},{name:"hub_token",val:": str | None = 
None"},{name:"hub_private_repo",val:": bool | None = None"},{name:"hub_model_id",val:": str | None = None"},{name:"hub_strategy",val:": transformers.trainer_utils.HubStrategy | str = 'every_save'"},{name:"hub_always_push",val:": bool = False"},{name:"hub_revision",val:": str | None = None"},{name:"load_best_model_at_end",val:": bool = False"},{name:"metric_for_best_model",val:": str | None = None"},{name:"greater_is_better",val:": bool | None = None"},{name:"ignore_data_skip",val:": bool = False"},{name:"restore_callback_states_from_checkpoint",val:": bool = False"},{name:"full_determinism",val:": bool = False"},{name:"seed",val:": int = 42"},{name:"data_seed",val:": int | None = None"},{name:"use_cpu",val:": bool = False"},{name:"accelerator_config",val:": dict | str | None = None"},{name:"parallelism_config",val:": accelerate.parallelism_config.ParallelismConfig | None = None"},{name:"dataloader_drop_last",val:": bool = False"},{name:"dataloader_num_workers",val:": int = 0"},{name:"dataloader_pin_memory",val:": bool = True"},{name:"dataloader_persistent_workers",val:": bool = False"},{name:"dataloader_prefetch_factor",val:": int | None = None"},{name:"remove_unused_columns",val:": bool = True"},{name:"label_names",val:": list[str] | None = None"},{name:"train_sampling_strategy",val:": str = 'random'"},{name:"length_column_name",val:": str = 'length'"},{name:"ddp_find_unused_parameters",val:": bool | None = None"},{name:"ddp_bucket_cap_mb",val:": int | None = None"},{name:"ddp_broadcast_buffers",val:": bool | None = None"},{name:"ddp_backend",val:": str | None = None"},{name:"ddp_timeout",val:": int = 1800"},{name:"fsdp",val:": list[transformers.trainer_utils.FSDPOption] | str | None = None"},{name:"fsdp_config",val:": dict[str, typing.Any] | str | None = None"},{name:"deepspeed",val:": dict | str | None = None"},{name:"debug",val:": str | list[transformers.debug_utils.DebugOption] = ''"},{name:"skip_memory_metrics",val:": bool = True"},{name:"do_train",val:": 
bool = False"},{name:"do_eval",val:": bool = False"},{name:"do_predict",val:": bool = False"},{name:"resume_from_checkpoint",val:": str | None = None"},{name:"warmup_ratio",val:": float | None = None"},{name:"logging_dir",val:": str | None = None"},{name:"local_rank",val:": int = -1"},{name:"num_generations",val:": int = 8"},{name:"max_completion_length",val:": int = 2048"},{name:"temperature",val:": float = 1.0"},{name:"chat_template_kwargs",val:": dict | None = None"},{name:"max_tool_calling_iterations",val:": int | None = None"},{name:"vllm_server_base_url",val:": str = 'http://localhost:8000'"},{name:"vllm_server_timeout",val:": float = 240.0"},{name:"request_timeout",val:": int = 600"},{name:"epsilon",val:": float = 0.2"},{name:"epsilon_high",val:": float = 0.2"},{name:"max_inflight_tasks",val:": int = -1"},{name:"max_staleness",val:": int = 4"},{name:"queue_maxsize",val:": int = 1024"},{name:"weight_sync_steps",val:": int = 1"},{name:"log_completions",val:": bool = False"},{name:"num_completions_to_print",val:": int = 3"}],source:"https://github.com/huggingface/trl/blob/vr_5607/trl/experimental/async_grpo/async_grpo_config.py#L21",parameterGroups:[{title:"Parameters that control generation",parametersDescription:[{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.num_generations",description:`<strong>num_generations</strong> (<code>int</code>, <em>optional</em>, defaults to <code>8</code>) — | |
| Number of generations per prompt to sample.`,name:"num_generations"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.max_completion_length",description:`<strong>max_completion_length</strong> (<code>int</code>, <em>optional</em>, defaults to <code>2048</code>) — | |
| Maximum number of tokens to generate per completion.`,name:"max_completion_length"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.temperature",description:`<strong>temperature</strong> (<code>float</code>, <em>optional</em>, defaults to <code>1.0</code>) — | |
| Temperature for sampling. The higher the temperature, the more random the completions.`,name:"temperature"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.chat_template_kwargs",description:`<strong>chat_template_kwargs</strong> (<code>dict[str, Any]</code>, <em>optional</em>) — | |
| Additional keyword arguments to pass to the <code>apply_chat_template</code> function when generating completions.`,name:"chat_template_kwargs"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.max_tool_calling_iterations",description:`<strong>max_tool_calling_iterations</strong> (<code>int</code>, <em>optional</em>) — | |
| Maximum number of tool-calling turns when training an agent. If <code>None</code>, there is no limit and generation | |
| stops when the model generates a response turn with no tool calls or when the total response length reaches | |
| <code>max_completion_length</code>.`,name:"max_tool_calling_iterations"}]},{title:"Parameters that control the vLLM server",parametersDescription:[{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.vllm_server_base_url",description:`<strong>vllm_server_base_url</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"http://localhost:8000"</code>) — | |
| Base URL of the vLLM server used for generation (e.g., <code>"http://localhost:8000"</code>).`,name:"vllm_server_base_url"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.vllm_server_timeout",description:`<strong>vllm_server_timeout</strong> (<code>float</code>, <em>optional</em>, defaults to <code>240.0</code>) — | |
| Total timeout duration in seconds to wait for the vLLM server to be ready.`,name:"vllm_server_timeout"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.request_timeout",description:`<strong>request_timeout</strong> (<code>int</code>, <em>optional</em>, defaults to <code>600</code>) — | |
| Timeout in seconds for individual HTTP requests to the vLLM server.`,name:"request_timeout"}]},{title:"Parameters that control the training",parametersDescription:[{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.epsilon",description:`<strong>epsilon</strong> (<code>float</code>, <em>optional</em>, defaults to <code>0.2</code>) — | |
| Lower-bound epsilon value for clipping.`,name:"epsilon"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.epsilon_high",description:`<strong>epsilon_high</strong> (<code>float</code>, <em>optional</em>, defaults to <code>0.2</code>) — | |
| Upper-bound epsilon value for clipping.`,name:"epsilon_high"}]},{title:"Parameters that control the async rollout pipeline",parametersDescription:[{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.max_inflight_tasks",description:`<strong>max_inflight_tasks</strong> (<code>int</code>, <em>optional</em>, defaults to <code>-1</code>) — | |
| Maximum number of concurrent generation tasks sent to the vLLM server. Defaults to <code>-1</code> (auto), which | |
| sets it to <code>max_staleness * per_device_train_batch_size * gradient_accumulation_steps * num_processes</code>. | |
| If using tool-use environments, you may want to set this manually based on how many parallel environments | |
| you can run.`,name:"max_inflight_tasks"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.max_staleness",description:`<strong>max_staleness</strong> (<code>int</code>, <em>optional</em>, defaults to <code>4</code>) — | |
| Maximum number of weight update steps a rollout sample can lag behind the current model version before | |
| being discarded.`,name:"max_staleness"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.queue_maxsize",description:`<strong>queue_maxsize</strong> (<code>int</code>, <em>optional</em>, defaults to <code>1024</code>) — | |
| Maximum number of rollout samples to buffer in the rollout queue.`,name:"queue_maxsize"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.weight_sync_steps",description:`<strong>weight_sync_steps</strong> (<code>int</code>, <em>optional</em>, defaults to <code>1</code>) — | |
| Number of training steps between weight synchronizations to the vLLM server.`,name:"weight_sync_steps"}]},{title:"Parameters that control the logging",parametersDescription:[{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.log_completions",description:`<strong>log_completions</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) — | |
| Whether to log a sample of (prompt, completion) pairs every <code>logging_steps</code> steps.`,name:"log_completions"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.num_completions_to_print",description:`<strong>num_completions_to_print</strong> (<code>int</code>, <em>optional</em>, defaults to <code>3</code>) — | |
| Number of completions to print when <code>log_completions=True</code>.`,name:"num_completions_to_print"}]}]}}),S=new te({props:{title:"AsyncGRPOTrainer",local:"trl.experimental.async_grpo.AsyncGRPOTrainer",headingTag:"h2"}}),Y=new it({props:{name:"class trl.experimental.async_grpo.AsyncGRPOTrainer",anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer",parameters:[{name:"model",val:": str"},{name:"reward_funcs",val:": collections.abc.Callable[..., list[float]] | list[collections.abc.Callable[..., list[float]]]"},{name:"args",val:": trl.experimental.async_grpo.async_grpo_config.AsyncGRPOConfig | None = None"},{name:"train_dataset",val:": datasets.arrow_dataset.Dataset | datasets.iterable_dataset.IterableDataset | None = None"},{name:"processing_class",val:": transformers.tokenization_utils_base.PreTrainedTokenizerBase | None = None"},{name:"callbacks",val:": list[transformers.trainer_callback.TrainerCallback] | None = None"},{name:"optimizers",val:": tuple = (None, None)"},{name:"tools",val:": list[collections.abc.Callable] | None = None"},{name:"environment_factory",val:": collections.abc.Callable[[], trl.experimental.async_grpo.async_grpo_trainer._SupportsReset] | None = None"},{name:"rollout_worker",val:": trl.experimental.async_grpo.async_grpo_trainer.RolloutWorkerProtocol | None = None"}],parametersDescription:[{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.model",description:`<strong>model</strong> (<code>str</code>) — | |
| Model to be trained. Must be a string, being the <em>model id</em> of a pretrained model hosted inside a model | |
| repo on huggingface.co, or a path to a <em>directory</em> containing model weights saved using | |
| <a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.save_pretrained" rel="nofollow">save_pretrained</a>, e.g., <code>'./my_model_directory/'</code>. The model is loaded | |
| using <a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForCausalLM.from_pretrained" rel="nofollow">from_pretrained</a>. The model name is also used to identify the | |
| model on the vLLM server used for generation.`,name:"model"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.reward_funcs",description:`<strong>reward_funcs</strong> (<code>RewardFunc | list[RewardFunc]</code>) — | |
| Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward | |
| functions with the prompts and completions and sum the rewards. Can be either:</p> | |
| <ul> | |
| <li>A single reward function: The function is provided with the prompts and the generated completions, plus | |
| any additional columns in the dataset. It should return a list of rewards. Reward functions can be either | |
| synchronous or asynchronous and can also return <code>None</code> when the reward is not applicable to those | |
| samples. This is useful for multi-task training where different reward functions apply to different types | |
| of samples. When a reward function returns <code>None</code> for a sample, that reward function is excluded from the | |
| reward calculation for that sample. For more details, see <a href="#using-a-custom-reward-function">Using a custom reward | |
| function</a>.</li> | |
| <li>A list of reward functions, where each item is a reward function as described above. Rewards from all | |
| functions are summed.</li> | |
| </ul>`,name:"reward_funcs"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.args",description:`<strong>args</strong> (<code>AsyncGRPOConfig</code>, <em>optional</em>) — | |
| Configuration for this trainer. If <code>None</code>, a default configuration is used.`,name:"args"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.train_dataset",description:`<strong>train_dataset</strong> (<a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset" rel="nofollow">Dataset</a> or <a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset" rel="nofollow">IterableDataset</a>) — | |
| Dataset to use for training. It must include a column <code>"prompt"</code>. Any additional columns in the dataset are | |
| ignored. The format of the samples can be either:</p> | |
| <ul> | |
| <li><a href="dataset_formats#standard">Standard</a>: Each sample contains plain text.</li> | |
| <li><a href="dataset_formats#conversational">Conversational</a>: Each sample contains structured messages (e.g., role | |
| and content).</li> | |
| </ul>`,name:"train_dataset"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.processing_class",description:`<strong>processing_class</strong> (<a href="https://huggingface.co/docs/transformers/main/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase" rel="nofollow">PreTrainedTokenizerBase</a>, <em>optional</em>) — | |
| Processing class used to process the data. The padding side must be set to <code>"left"</code>. If <code>None</code>, the | |
| processing class is loaded from the model’s name with <a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained" rel="nofollow">from_pretrained</a>. A | |
| padding token, <code>tokenizer.pad_token</code>, must be set. If the processing class has not set a padding token, | |
| <code>tokenizer.eos_token</code> will be used as the default.`,name:"processing_class"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.callbacks",description:`<strong>callbacks</strong> (list of <a href="https://huggingface.co/docs/transformers/main/en/main_classes/callback#transformers.TrainerCallback" rel="nofollow">TrainerCallback</a>, <em>optional</em>) — | |
| List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed | |
| in <a href="https://huggingface.co/docs/transformers/main_classes/callback" rel="nofollow">here</a>.</p> | |
| <p>If you want to remove one of the default callbacks used, use the <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.remove_callback" rel="nofollow">remove_callback</a> | |
| method.`,name:"callbacks"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.optimizers",description:`<strong>optimizers</strong> (<code>tuple[torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None]</code>, <em>optional</em>, defaults to <code>(None, None)</code>) — | |
| A tuple containing the optimizer and the scheduler to use. Will default to an instance of <code>AdamW</code> on your | |
| model and a scheduler given by <a href="https://huggingface.co/docs/transformers/main/en/main_classes/optimizer_schedules#transformers.get_linear_schedule_with_warmup" rel="nofollow">get_linear_schedule_with_warmup</a> controlled by <code>args</code>.`,name:"optimizers"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.tools",description:`<strong>tools</strong> (list of <code>Callable</code>, <em>optional</em>) — | |
| A list of callable tool functions (sync or async) that the model can invoke during generation. Each tool | |
| should be a standard Python function with properly type-hinted arguments and return values, and a | |
| Google-style docstring describing its purpose, arguments, and return value. For more details, see: | |
| <a href="https://huggingface.co/docs/transformers/en/chat_extras#passing-tools" rel="nofollow">https://huggingface.co/docs/transformers/en/chat_extras#passing-tools</a>. The model uses the function’s name, | |
| type hints, and docstring to determine how to call it. Ensure that the model’s chat template supports tool | |
| use and that it has been fine-tuned for tool calling.`,name:"tools"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.environment_factory",description:`<strong>environment_factory</strong> (<code>EnvironmentFactory</code>, <em>optional</em>) — | |
| A callable that creates and returns an environment instance. The environment class should define methods | |
| that can be invoked as tools during generation. Each method should comply with the same requirements as the | |
| <code>tools</code> described above. If <code>environment_factory</code> is provided, an instance of the environment is created | |
| for each generation in the batch, allowing for parallel and independent interactions. The environment must | |
| also implement a callable <code>reset</code> method that can be used to reset state between generations. The <code>reset</code> | |
| method should return either <code>None</code> or a string: when it returns a string, that string is appended to the | |
| last user message before generation. This feature is experimental and may change or be removed at any time | |
| without prior notice.`,name:"environment_factory"}],source:"https://github.com/huggingface/trl/blob/vr_5607/trl/experimental/async_grpo/async_grpo_trainer.py#L169"}}),$=new _t({props:{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.example",$$slots:{default:[vt]},$$scope:{ctx:de}}}),K=new ht({props:{source:"https://github.com/huggingface/trl/blob/main/docs/source/async_grpo_trainer.md"}}),{c(){m=s("meta"),C=o(),x=s("p"),y=o(),d(b.$$.fragment),i=o(),d(w.$$.fragment),pe=o(),T=s("blockquote"),ne=s("p"),ne.innerHTML=We,Ie=o(),ae=s("p"),ae.innerHTML=ze,Oe=o(),d(P.$$.fragment),ue=o(),d(G.$$.fragment),ge=o(),J=s("p"),J.innerHTML=Ze,fe=o(),L=s("p"),L.innerHTML=De,he=o(),d(A.$$.fragment),_e=o(),U=s("p"),U.innerHTML=Xe,ve=o(),F=s("p"),F.innerHTML=Qe,ye=o(),j=s("ul"),j.innerHTML=Se,be=o(),I=s("p"),I.innerHTML=Ye,we=o(),O=s("p"),O.innerHTML=Ke,Te=o(),E=s("p"),E.innerHTML=et,Me=o(),d(B.$$.fragment),xe=o(),d(H.$$.fragment),Ne=o(),V=s("p"),V.innerHTML=tt,ke=o(),d(q.$$.fragment),$e=o(),k=s("blockquote"),k.innerHTML=nt,Re=o(),d(W.$$.fragment),Ce=o(),d(z.$$.fragment),Pe=o(),Z=s("p"),Z.textContent=at,Ge=o(),d(D.$$.fragment),Je=o(),v=s("div"),d(X.$$.fragment),Ee=o(),oe=s("p"),oe.innerHTML=ot,Be=o(),re=s("p"),re.innerHTML=rt,He=o(),Q=s("blockquote"),Q.innerHTML=st,Le=o(),d(S.$$.fragment),Ae=o(),M=s("div"),d(Y.$$.fragment),Ve=o(),se=s("p"),se.innerHTML=lt,qe=o(),d($.$$.fragment),Ue=o(),d(K.$$.fragment),Fe=o(),ie=s("p"),this.h()},l(e){const t=gt("svelte-u9bgzb",document.head);m=l(t,"META",{name:!0,content:!0}),t.forEach(n),C=r(e),x=l(e,"P",{}),ce(x).forEach(n),y=r(e),p(b.$$.fragment,e),i=r(e),p(w.$$.fragment,e),pe=r(e),T=l(e,"BLOCKQUOTE",{class:!0});var 
N=ce(T);ne=l(N,"P",{"data-svelte-h":!0}),c(ne)!=="svelte-170a2mh"&&(ne.innerHTML=We),Ie=r(N),ae=l(N,"P",{"data-svelte-h":!0}),c(ae)!=="svelte-2fiwe3"&&(ae.innerHTML=ze),Oe=r(N),p(P.$$.fragment,N),N.forEach(n),ue=r(e),p(G.$$.fragment,e),ge=r(e),J=l(e,"P",{"data-svelte-h":!0}),c(J)!=="svelte-7c3d8k"&&(J.innerHTML=Ze),fe=r(e),L=l(e,"P",{"data-svelte-h":!0}),c(L)!=="svelte-7ij08"&&(L.innerHTML=De),he=r(e),p(A.$$.fragment,e),_e=r(e),U=l(e,"P",{"data-svelte-h":!0}),c(U)!=="svelte-ha6h7"&&(U.innerHTML=Xe),ve=r(e),F=l(e,"P",{"data-svelte-h":!0}),c(F)!=="svelte-1ifo93o"&&(F.innerHTML=Qe),ye=r(e),j=l(e,"UL",{"data-svelte-h":!0}),c(j)!=="svelte-1hgksz5"&&(j.innerHTML=Se),be=r(e),I=l(e,"P",{"data-svelte-h":!0}),c(I)!=="svelte-13ayx4q"&&(I.innerHTML=Ye),we=r(e),O=l(e,"P",{"data-svelte-h":!0}),c(O)!=="svelte-1c5wt0o"&&(O.innerHTML=Ke),Te=r(e),E=l(e,"P",{"data-svelte-h":!0}),c(E)!=="svelte-18ibama"&&(E.innerHTML=et),Me=r(e),p(B.$$.fragment,e),xe=r(e),p(H.$$.fragment,e),Ne=r(e),V=l(e,"P",{"data-svelte-h":!0}),c(V)!=="svelte-7nixay"&&(V.innerHTML=tt),ke=r(e),p(q.$$.fragment,e),$e=r(e),k=l(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),c(k)!=="svelte-1ms05z4"&&(k.innerHTML=nt),Re=r(e),p(W.$$.fragment,e),Ce=r(e),p(z.$$.fragment,e),Pe=r(e),Z=l(e,"P",{"data-svelte-h":!0}),c(Z)!=="svelte-ho94qx"&&(Z.textContent=at),Ge=r(e),p(D.$$.fragment,e),Je=r(e),v=l(e,"DIV",{class:!0});var R=ce(v);p(X.$$.fragment,R),Ee=r(R),oe=l(R,"P",{"data-svelte-h":!0}),c(oe)!=="svelte-ntjsem"&&(oe.innerHTML=ot),Be=r(R),re=l(R,"P",{"data-svelte-h":!0}),c(re)!=="svelte-1vp3ijk"&&(re.innerHTML=rt),He=r(R),Q=l(R,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),c(Q)!=="svelte-17fyuhe"&&(Q.innerHTML=st),R.forEach(n),Le=r(e),p(S.$$.fragment,e),Ae=r(e),M=l(e,"DIV",{class:!0});var 
le=ce(M);p(Y.$$.fragment,le),Ve=r(le),se=l(le,"P",{"data-svelte-h":!0}),c(se)!=="svelte-cdgyfq"&&(se.innerHTML=lt),qe=r(le),p($.$$.fragment,le),le.forEach(n),Ue=r(e),p(K.$$.fragment,e),Fe=r(e),ie=l(e,"P",{}),ce(ie).forEach(n),this.h()},h(){ee(m,"name","hf:doc:metadata"),ee(m,"content",bt),ee(T,"class","important"),ee(k,"class","tip"),ee(Q,"class","note"),ee(v,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),ee(M,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8")},m(e,t){_(document.head,m),a(e,C,t),a(e,x,t),a(e,y,t),u(b,e,t),a(e,i,t),u(w,e,t),a(e,pe,t),a(e,T,t),_(T,ne),_(T,Ie),_(T,ae),_(T,Oe),u(P,T,null),a(e,ue,t),u(G,e,t),a(e,ge,t),a(e,J,t),a(e,fe,t),a(e,L,t),a(e,he,t),u(A,e,t),a(e,_e,t),a(e,U,t),a(e,ve,t),a(e,F,t),a(e,ye,t),a(e,j,t),a(e,be,t),a(e,I,t),a(e,we,t),a(e,O,t),a(e,Te,t),a(e,E,t),a(e,Me,t),u(B,e,t),a(e,xe,t),u(H,e,t),a(e,Ne,t),a(e,V,t),a(e,ke,t),u(q,e,t),a(e,$e,t),a(e,k,t),a(e,Re,t),u(W,e,t),a(e,Ce,t),u(z,e,t),a(e,Pe,t),a(e,Z,t),a(e,Ge,t),u(D,e,t),a(e,Je,t),a(e,v,t),u(X,v,null),_(v,Ee),_(v,oe),_(v,Be),_(v,re),_(v,He),_(v,Q),a(e,Le,t),u(S,e,t),a(e,Ae,t),a(e,M,t),u(Y,M,null),_(M,Ve),_(M,se),_(M,qe),u($,M,null),a(e,Ue,t),u(K,e,t),a(e,Fe,t),a(e,ie,t),je=!0},p(e,[t]){const 
N={};t&2&&(N.$$scope={dirty:t,ctx:e}),$.$set(N)},i(e){je||(g(b.$$.fragment,e),g(w.$$.fragment,e),g(P.$$.fragment,e),g(G.$$.fragment,e),g(A.$$.fragment,e),g(B.$$.fragment,e),g(H.$$.fragment,e),g(q.$$.fragment,e),g(W.$$.fragment,e),g(z.$$.fragment,e),g(D.$$.fragment,e),g(X.$$.fragment,e),g(S.$$.fragment,e),g(Y.$$.fragment,e),g($.$$.fragment,e),g(K.$$.fragment,e),je=!0)},o(e){f(b.$$.fragment,e),f(w.$$.fragment,e),f(P.$$.fragment,e),f(G.$$.fragment,e),f(A.$$.fragment,e),f(B.$$.fragment,e),f(H.$$.fragment,e),f(q.$$.fragment,e),f(W.$$.fragment,e),f(z.$$.fragment,e),f(D.$$.fragment,e),f(X.$$.fragment,e),f(S.$$.fragment,e),f(Y.$$.fragment,e),f($.$$.fragment,e),f(K.$$.fragment,e),je=!1},d(e){e&&(n(C),n(x),n(y),n(i),n(pe),n(T),n(ue),n(ge),n(J),n(fe),n(L),n(he),n(_e),n(U),n(ve),n(F),n(ye),n(j),n(be),n(I),n(we),n(O),n(Te),n(E),n(Me),n(xe),n(Ne),n(V),n(ke),n($e),n(k),n(Re),n(Ce),n(Pe),n(Z),n(Ge),n(Je),n(v),n(Le),n(Ae),n(M),n(Ue),n(Fe),n(ie)),n(m),h(b,e),h(w,e),h(P),h(G,e),h(A,e),h(B,e),h(H,e),h(q,e),h(W,e),h(z,e),h(D,e),h(X),h(S,e),h(Y),h($),h(K,e)}}}const bt='{"title":"Asynchronous GRPO","local":"asynchronous-grpo","sections":[{"title":"Overview","local":"overview","sections":[],"depth":2},{"title":"How it differs from GRPOTrainer","local":"how-it-differs-from-grpotrainer","sections":[],"depth":2},{"title":"Quick start","local":"quick-start","sections":[],"depth":2},{"title":"Design philosophy","local":"design-philosophy","sections":[],"depth":2},{"title":"AsyncGRPOConfig","local":"trl.experimental.async_grpo.AsyncGRPOConfig","sections":[],"depth":2},{"title":"AsyncGRPOTrainer","local":"trl.experimental.async_grpo.AsyncGRPOTrainer","sections":[],"depth":2}],"depth":1}';function wt(de){return mt(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Rt extends pt{constructor(m){super(),ut(this,m,wt,yt,ct,{})}}export{Rt as component}; | |
Xet Storage Details
- Size: 39.2 kB
- Xet hash: 3e48272496e347e059b203395e6b48293b8fc9eb45bae221b4cc6ad454ac01cb
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.