Buckets:

hf-doc-build
/

doc-dev

hf-doc-build/doc-dev / trl /pr_5607 /en /_app /immutable /nodes /2.5090de2a.js

HuggingFaceDocBuilder's picture

HuggingFaceDocBuilder

about 1 month ago

39.2 kB

	import{s as ct,o as mt,n as dt}from"../chunks/scheduler.7b731bd4.js";import{S as pt,i as ut,e as s,s as o,c as d,h as gt,a as l,d as n,b as r,f as ce,g as p,j as c,k as ee,l as _,m as a,n as u,t as g,o as f,p as h}from"../chunks/index.cc268345.js";import{C as ft,H as te,E as ht}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.f0d99f98.js";import{D as it}from"../chunks/Docstring.03f7b462.js";import{C as me}from"../chunks/CodeBlock.169a125f.js";import{E as _t}from"../chunks/ExampleCodeBlock.415f9452.js";function vt(de){let m,C="Example:",x,y,b;return y=new me({props:{code:"ZnJvbSUyMHRybC5leHBlcmltZW50YWwuYXN5bmNfZ3JwbyUyMGltcG9ydCUyMEFzeW5jR1JQT1RyYWluZXIlMEFmcm9tJTIwdHJsLnJld2FyZHMlMjBpbXBvcnQlMjBhY2N1cmFjeV9yZXdhcmQlMEFmcm9tJTIwZGF0YXNldHMlMjBpbXBvcnQlMjBsb2FkX2RhdGFzZXQlMEElMEFkYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMnRybC1saWIlMkZEZWVwTWF0aC0xMDNLJTIyJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiklMEElMEF0cmFpbmVyJTIwJTNEJTIwQXN5bmNHUlBPVHJhaW5lciglMEElMjAlMjAlMjAlMjBtb2RlbCUzRCUyMlF3ZW4lMkZRd2VuMi41LTAuNUItSW5zdHJ1Y3QlMjIlMkMlMEElMjAlMjAlMjAlMjByZXdhcmRfZnVuY3MlM0RhY2N1cmFjeV9yZXdhcmQlMkMlMEElMjAlMjAlMjAlMjB0cmFpbl9kYXRhc2V0JTNEZGF0YXNldCUyQyUwQSklMEF0cmFpbmVyLnRyYWluKCk=",highlighted:`<span class="hljs-keyword">from</span> trl.experimental.async_grpo <span class="hljs-keyword">import</span> AsyncGRPOTrainer
	<span class="hljs-keyword">from</span> trl.rewards <span class="hljs-keyword">import</span> accuracy_reward
	<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset

	dataset = load_dataset(<span class="hljs-string">"trl-lib/DeepMath-103K"</span>, split=<span class="hljs-string">"train"</span>)

	trainer = AsyncGRPOTrainer(
	model=<span class="hljs-string">"Qwen/Qwen2.5-0.5B-Instruct"</span>,
	reward_funcs=accuracy_reward,
	train_dataset=dataset,
	)
	trainer.train()`,wrap:!1}}),{c(){m=s("p"),m.textContent=C,x=o(),d(y.$$.fragment)},l(i){m=l(i,"P",{"data-svelte-h":!0}),c(m)!=="svelte-11lpom8"&&(m.textContent=C),x=r(i),p(y.$$.fragment,i)},m(i,w){a(i,m,w),a(i,x,w),u(y,i,w),b=!0},p:dt,i(i){b\|\|(g(y.$$.fragment,i),b=!0)},o(i){f(y.$$.fragment,i),b=!1},d(i){i&&(n(m),n(x)),h(y,i)}}}function yt(de){let m,C,x,y,b,i,w,pe,T,ne,We="This trainer requires <code>vllm>=0.17.1</code> and <code>transformers>=5.2.0</code>. For distributed training, only FSDP2 is supported (DeepSpeed ZeRO is not).",Ie,ae,ze="Currently, <code>vllm</code> and <code>transformers</code> have conflicting dependency constraints. To work around this, install vLLM first and then force-install transformers:",Oe,P,ue,G,ge,J,Ze='<code>AsyncGRPOTrainer</code> implements the same <a href="grpo_trainer">GRPO</a> algorithm but decouples rollout generation from training. A background worker continuously streams completions from a vLLM server while the training loop consumes them, so generation and gradient updates overlap instead of alternating. The API mirrors <a href="/docs/trl/pr_5607/en/gspo_token#trl.GRPOTrainer">GRPOTrainer</a> — for full details on the GRPO method itself (advantage computation, KL estimation, loss formulation, reward functions, etc.), see the <a href="grpo_trainer">GRPO Trainer</a> documentation. Not all features from <a href="/docs/trl/pr_5607/en/gspo_token#trl.GRPOTrainer">GRPOTrainer</a> are available; refer to <code>AsyncGRPOConfig</code> for the supported parameters.',fe,L,De='This trainer was contributed by <a href="https://huggingface.co/qgallouedec" rel="nofollow">Quentin Gallouédec</a> and <a href="https://huggingface.co/aminediroHF" rel="nofollow">Amine Dirhoussi</a>.',he,A,_e,U,Xe='In the standard <a href="/docs/trl/pr_5607/en/gspo_token#trl.GRPOTrainer">GRPOTrainer</a>, generation and training are sequential: generate a batch, compute the loss, update weights, repeat. Even in <a href="grpo_trainer#speed-up-training-with-vllm">vLLM colocate mode</a>, where generation runs on the same GPUs, one phase must finish before the other begins.',ve,F,Qe="<code>AsyncGRPOTrainer</code> separates these two concerns:",ye,j,Se="<li><strong>Rollout worker</strong> (background thread) — sends prompts to a vLLM server, scores completions with reward functions, computes advantages, and pushes ready-to-train samples into a queue.</li> <li><strong>Training loop</strong> (main process) — pulls samples from the queue, computes the clipped surrogate loss, and updates the model weights.</li>",be,I,Ye="After every <code>weight_sync_steps</code> training steps, the updated weights are transferred to the vLLM server via NCCL so that subsequent generations reflect the latest policy.",we,O,Ke="Because generation and training run concurrently, the training samples may have been generated by a slightly older version of the model. The <code>max_staleness</code> parameter controls how many weight updates a sample can lag behind before being discarded.",Te,E,et="The number of concurrent requests sent to the vLLM server is controlled by <code>max_inflight_tasks</code>. By default it is set automatically to <code>max_staleness × per_device_train_batch_size × gradient_accumulation_steps × num_processes</code> — the maximum number of samples the trainer can consume before they become stale. Generating more than this is wasteful since the excess samples will be discarded.",Me,B,xe,H,Ne,V,tt="The vLLM server and the trainer must run on <strong>separate GPUs</strong>. Use <code>CUDA_VISIBLE_DEVICES</code> to partition your GPUs. For example, with 2 GPUs, you can run the vLLM server on GPU 0 and the trainer on GPU 1 as follows:",ke,q,$e,k,nt="<p>Set <code>--max-model-len</code> to the maximum total sequence length (prompt + completion) you expect. A lower value reduces GPU memory usage on the server, freeing more memory for the KV cache and increasing throughput. A good starting point is the prompt length plus <code>max_completion_length</code> from your config.</p>",Re,W,Ce,z,Pe,Z,at="This trainer is intentionally kept minimal and is not meant to grow into a general-purpose solution. If you need a feature that is not supported, we recommend cloning the repository and adapting the trainer to your needs directly. New features will only be considered when there is significant community demand.",Ge,D,Je,v,X,Ee,oe,ot="Configuration class for the <code>AsyncGRPOTrainer</code>.",Be,re,rt=`This class includes only the parameters that are specific to asynchronous GRPO training. For a full list of
	training arguments, please refer to the <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments" rel="nofollow">TrainingArguments</a> documentation. Note that default values
	in this class may differ from those in <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments" rel="nofollow">TrainingArguments</a>.`,He,Q,st='<p>These parameters have default values different from <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments" rel="nofollow">TrainingArguments</a>:</p> <ul><li><code>logging_steps</code>: Defaults to <code>10</code> instead of <code>500</code>.</li> <li><code>gradient_checkpointing</code>: Defaults to <code>True</code> instead of <code>False</code>.</li> <li><code>bf16</code>: Defaults to <code>True</code> if <code>fp16</code> is not set, instead of <code>False</code>.</li> <li><code>learning_rate</code>: Defaults to <code>1e-6</code> instead of <code>5e-5</code>.</li></ul>',Le,S,Ae,M,Y,Ve,se,lt=`Trainer for the Group Relative Policy Optimization (GRPO) method. This algorithm was initially proposed in the
	paper <a href="https://huggingface.co/papers/2402.03300" rel="nofollow">DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language
	Models</a>. This trainer is the asynchronous version of GRPO, where
	generation is offloaded to an external vLLM server that runs asynchronously alongside training, decoupling rollout
	from the gradient update loop.`,qe,$,Ue,K,Fe,ie,je;return b=new ft({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),w=new te({props:{title:"Asynchronous GRPO",local:"asynchronous-grpo",headingTag:"h1"}}),P=new me({props:{code:"cGlwJTIwaW5zdGFsbCUyMCd2bGxtJTNFJTNEMC4xNy4xJyUwQXBpcCUyMGluc3RhbGwlMjAndHJhbnNmb3JtZXJzJTNFJTNENS4yLjAnJTIwLS1uby1kZXBz",highlighted:`pip install <span class="hljs-string">'vllm>=0.17.1'</span>
	pip install <span class="hljs-string">'transformers>=5.2.0'</span> --no-deps`,wrap:!1}}),G=new te({props:{title:"Overview",local:"overview",headingTag:"h2"}}),A=new te({props:{title:"How it differs from GRPOTrainer",local:"how-it-differs-from-grpotrainer",headingTag:"h2"}}),B=new te({props:{title:"Quick start",local:"quick-start",headingTag:"h2"}}),H=new me({props:{code:"JTIzJTIwdHJhaW5fYXN5bmNfZ3Jwby5weSUwQWZyb20lMjBkYXRhc2V0cyUyMGltcG9ydCUyMGxvYWRfZGF0YXNldCUwQWZyb20lMjB0cmwuZXhwZXJpbWVudGFsLmFzeW5jX2dycG8lMjBpbXBvcnQlMjBBc3luY0dSUE9UcmFpbmVyJTBBZnJvbSUyMHRybC5yZXdhcmRzJTIwaW1wb3J0JTIwYWNjdXJhY3lfcmV3YXJkJTBBJTBBZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJ0cmwtbGliJTJGRGVlcE1hdGgtMTAzSyUyMiUyQyUyMHNwbGl0JTNEJTIydHJhaW4lMjIpJTBBJTBBdHJhaW5lciUyMCUzRCUyMEFzeW5jR1JQT1RyYWluZXIoJTBBJTIwJTIwJTIwJTIwbW9kZWwlM0QlMjJRd2VuJTJGUXdlbjMtNEIlMjIlMkMlMEElMjAlMjAlMjAlMjByZXdhcmRfZnVuY3MlM0RhY2N1cmFjeV9yZXdhcmQlMkMlMEElMjAlMjAlMjAlMjB0cmFpbl9kYXRhc2V0JTNEZGF0YXNldCUyQyUwQSklMEF0cmFpbmVyLnRyYWluKCk=",highlighted:`<span class="hljs-comment"># train_async_grpo.py</span>
	<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
	<span class="hljs-keyword">from</span> trl.experimental.async_grpo <span class="hljs-keyword">import</span> AsyncGRPOTrainer
	<span class="hljs-keyword">from</span> trl.rewards <span class="hljs-keyword">import</span> accuracy_reward

	dataset = load_dataset(<span class="hljs-string">"trl-lib/DeepMath-103K"</span>, split=<span class="hljs-string">"train"</span>)

	trainer = AsyncGRPOTrainer(
	model=<span class="hljs-string">"Qwen/Qwen3-4B"</span>,
	reward_funcs=accuracy_reward,
	train_dataset=dataset,
	)
	trainer.train()`,wrap:!1}}),q=new me({props:{code:"JTIzJTIwVGVybWluYWwlMjAxJTNBJTIwdkxMTSUyMHNlcnZlciUyMG9uJTIwR1BVJTIwMCUyMChkZXYlMjBtb2RlJTIwJTJCJTIwTkNDTCUyMHdlaWdodCUyMHRyYW5zZmVyJTIwYXJlJTIwcmVxdWlyZWQpJTBBQ1VEQV9WSVNJQkxFX0RFVklDRVMlM0QwJTIwVkxMTV9TRVJWRVJfREVWX01PREUlM0QxJTIwdmxsbSUyMHNlcnZlJTIwUXdlbiUyRlF3ZW4zLTRCJTIwJTVDJTBBJTIwJTIwJTIwJTIwLS1tYXgtbW9kZWwtbGVuJTIwNDA5NiUyMCU1QyUwQSUyMCUyMCUyMCUyMC0tbG9ncHJvYnMtbW9kZSUyMHByb2Nlc3NlZF9sb2dwcm9icyUyMCU1QyUwQSUyMCUyMCUyMCUyMC0td2VpZ2h0LXRyYW5zZmVyLWNvbmZpZyUyMCclN0IlMjJiYWNrZW5kJTIyJTNBJTIybmNjbCUyMiU3RCc=",highlighted:`<span class="hljs-comment"># Terminal 1: vLLM server on GPU 0 (dev mode + NCCL weight transfer are required)</span>
	CUDA_VISIBLE_DEVICES=0 VLLM_SERVER_DEV_MODE=1 vllm serve Qwen/Qwen3-4B \\
	--max-model-len 4096 \\
	--logprobs-mode processed_logprobs \\
	--weight-transfer-config <span class="hljs-string">'{"backend":"nccl"}'</span>`,wrap:!1}}),W=new me({props:{code:"JTIzJTIwVGVybWluYWwlMjAyJTNBJTIwdHJhaW5pbmclMjBvbiUyMEdQVSUyMDElMEFDVURBX1ZJU0lCTEVfREVWSUNFUyUzRDElMjBhY2NlbGVyYXRlJTIwbGF1bmNoJTIwdHJhaW5fYXN5bmNfZ3Jwby5weQ==",highlighted:`<span class="hljs-comment"># Terminal 2: training on GPU 1</span>
	CUDA_VISIBLE_DEVICES=1 accelerate launch train_async_grpo.py`,wrap:!1}}),z=new te({props:{title:"Design philosophy",local:"design-philosophy",headingTag:"h2"}}),D=new te({props:{title:"AsyncGRPOConfig",local:"trl.experimental.async_grpo.AsyncGRPOConfig",headingTag:"h2"}}),X=new it({props:{name:"class trl.experimental.async_grpo.AsyncGRPOConfig",anchor:"trl.experimental.async_grpo.AsyncGRPOConfig",parameters:[{name:"output_dir",val:": str \| None = None"},{name:"per_device_train_batch_size",val:": int = 8"},{name:"num_train_epochs",val:": float = 3.0"},{name:"max_steps",val:": int = -1"},{name:"learning_rate",val:": float = 1e-06"},{name:"lr_scheduler_type",val:": transformers.trainer_utils.SchedulerType \| str = 'linear'"},{name:"lr_scheduler_kwargs",val:": dict \| str \| None = None"},{name:"warmup_steps",val:": float = 0"},{name:"optim",val:": transformers.training_args.OptimizerNames \| str = 'adamw_torch_fused'"},{name:"optim_args",val:": str \| None = None"},{name:"weight_decay",val:": float = 0.0"},{name:"adam_beta1",val:": float = 0.9"},{name:"adam_beta2",val:": float = 0.999"},{name:"adam_epsilon",val:": float = 1e-08"},{name:"optim_target_modules",val:": None \| str \| list[str] = None"},{name:"gradient_accumulation_steps",val:": int = 1"},{name:"average_tokens_across_devices",val:": bool = True"},{name:"max_grad_norm",val:": float = 1.0"},{name:"label_smoothing_factor",val:": float = 0.0"},{name:"bf16",val:": bool \| None = None"},{name:"fp16",val:": bool = False"},{name:"bf16_full_eval",val:": bool = False"},{name:"fp16_full_eval",val:": bool = False"},{name:"tf32",val:": bool \| None = None"},{name:"gradient_checkpointing",val:": bool = True"},{name:"gradient_checkpointing_kwargs",val:": dict[str, typing.Any] \| str \| None = None"},{name:"torch_compile",val:": bool = False"},{name:"torch_compile_backend",val:": str \| None = None"},{name:"torch_compile_mode",val:": str \| None = None"},{name:"use_liger_kernel",val:": bool = False"},{name:"liger_kernel_config",val:": dict[str, bool] \| None = None"},{name:"use_cache",val:": bool = False"},{name:"neftune_noise_alpha",val:": float \| None = None"},{name:"torch_empty_cache_steps",val:": int \| None = None"},{name:"auto_find_batch_size",val:": bool = False"},{name:"logging_strategy",val:": transformers.trainer_utils.IntervalStrategy \| str = 'steps'"},{name:"logging_steps",val:": float = 1"},{name:"logging_first_step",val:": bool = False"},{name:"log_on_each_node",val:": bool = True"},{name:"logging_nan_inf_filter",val:": bool = True"},{name:"include_num_input_tokens_seen",val:": str \| bool = 'no'"},{name:"log_level",val:": str = 'passive'"},{name:"log_level_replica",val:": str = 'warning'"},{name:"disable_tqdm",val:": bool \| None = None"},{name:"report_to",val:": None \| str \| list[str] = 'none'"},{name:"run_name",val:": str \| None = None"},{name:"project",val:": str = 'huggingface'"},{name:"trackio_space_id",val:": str \| None = 'trackio'"},{name:"eval_strategy",val:": transformers.trainer_utils.IntervalStrategy \| str = 'no'"},{name:"eval_steps",val:": float \| None = None"},{name:"eval_delay",val:": float = 0"},{name:"per_device_eval_batch_size",val:": int = 8"},{name:"prediction_loss_only",val:": bool = False"},{name:"eval_on_start",val:": bool = False"},{name:"eval_do_concat_batches",val:": bool = True"},{name:"eval_use_gather_object",val:": bool = False"},{name:"eval_accumulation_steps",val:": int \| None = None"},{name:"include_for_metrics",val:": list = <factory>"},{name:"batch_eval_metrics",val:": bool = False"},{name:"save_only_model",val:": bool = False"},{name:"save_strategy",val:": transformers.trainer_utils.SaveStrategy \| str = 'steps'"},{name:"save_steps",val:": float = 500"},{name:"save_on_each_node",val:": bool = False"},{name:"save_total_limit",val:": int \| None = None"},{name:"enable_jit_checkpoint",val:": bool = False"},{name:"push_to_hub",val:": bool = False"},{name:"hub_token",val:": str \| None = None"},{name:"hub_private_repo",val:": bool \| None = None"},{name:"hub_model_id",val:": str \| None = None"},{name:"hub_strategy",val:": transformers.trainer_utils.HubStrategy \| str = 'every_save'"},{name:"hub_always_push",val:": bool = False"},{name:"hub_revision",val:": str \| None = None"},{name:"load_best_model_at_end",val:": bool = False"},{name:"metric_for_best_model",val:": str \| None = None"},{name:"greater_is_better",val:": bool \| None = None"},{name:"ignore_data_skip",val:": bool = False"},{name:"restore_callback_states_from_checkpoint",val:": bool = False"},{name:"full_determinism",val:": bool = False"},{name:"seed",val:": int = 42"},{name:"data_seed",val:": int \| None = None"},{name:"use_cpu",val:": bool = False"},{name:"accelerator_config",val:": dict \| str \| None = None"},{name:"parallelism_config",val:": accelerate.parallelism_config.ParallelismConfig \| None = None"},{name:"dataloader_drop_last",val:": bool = False"},{name:"dataloader_num_workers",val:": int = 0"},{name:"dataloader_pin_memory",val:": bool = True"},{name:"dataloader_persistent_workers",val:": bool = False"},{name:"dataloader_prefetch_factor",val:": int \| None = None"},{name:"remove_unused_columns",val:": bool = True"},{name:"label_names",val:": list[str] \| None = None"},{name:"train_sampling_strategy",val:": str = 'random'"},{name:"length_column_name",val:": str = 'length'"},{name:"ddp_find_unused_parameters",val:": bool \| None = None"},{name:"ddp_bucket_cap_mb",val:": int \| None = None"},{name:"ddp_broadcast_buffers",val:": bool \| None = None"},{name:"ddp_backend",val:": str \| None = None"},{name:"ddp_timeout",val:": int = 1800"},{name:"fsdp",val:": list[transformers.trainer_utils.FSDPOption] \| str \| None = None"},{name:"fsdp_config",val:": dict[str, typing.Any] \| str \| None = None"},{name:"deepspeed",val:": dict \| str \| None = None"},{name:"debug",val:": str \| list[transformers.debug_utils.DebugOption] = ''"},{name:"skip_memory_metrics",val:": bool = True"},{name:"do_train",val:": bool = False"},{name:"do_eval",val:": bool = False"},{name:"do_predict",val:": bool = False"},{name:"resume_from_checkpoint",val:": str \| None = None"},{name:"warmup_ratio",val:": float \| None = None"},{name:"logging_dir",val:": str \| None = None"},{name:"local_rank",val:": int = -1"},{name:"num_generations",val:": int = 8"},{name:"max_completion_length",val:": int = 2048"},{name:"temperature",val:": float = 1.0"},{name:"chat_template_kwargs",val:": dict \| None = None"},{name:"max_tool_calling_iterations",val:": int \| None = None"},{name:"vllm_server_base_url",val:": str = 'http://localhost:8000'"},{name:"vllm_server_timeout",val:": float = 240.0"},{name:"request_timeout",val:": int = 600"},{name:"epsilon",val:": float = 0.2"},{name:"epsilon_high",val:": float = 0.2"},{name:"max_inflight_tasks",val:": int = -1"},{name:"max_staleness",val:": int = 4"},{name:"queue_maxsize",val:": int = 1024"},{name:"weight_sync_steps",val:": int = 1"},{name:"log_completions",val:": bool = False"},{name:"num_completions_to_print",val:": int = 3"}],source:"https://github.com/huggingface/trl/blob/vr_5607/trl/experimental/async_grpo/async_grpo_config.py#L21",parameterGroups:[{title:"Parameters that control generation",parametersDescription:[{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.num_generations",description:`<strong>num_generations</strong> (<code>int</code>, <em>optional</em>, defaults to <code>8</code>) —
	Number of generations per prompt to sample.`,name:"num_generations"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.max_completion_length",description:`<strong>max_completion_length</strong> (<code>int</code>, <em>optional</em>, defaults to <code>2048</code>) —
	Maximum number of tokens to generate per completion.`,name:"max_completion_length"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.temperature",description:`<strong>temperature</strong> (<code>float</code>, <em>optional</em>, defaults to <code>1.0</code>) —
	Temperature for sampling. The higher the temperature, the more random the completions.`,name:"temperature"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.chat_template_kwargs",description:`<strong>chat_template_kwargs</strong> (<code>dict[str, Any]</code>, <em>optional</em>) —
	Additional keyword arguments to pass to the <code>apply_chat_template</code> function when generating completions.`,name:"chat_template_kwargs"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.max_tool_calling_iterations",description:`<strong>max_tool_calling_iterations</strong> (<code>int</code>, <em>optional</em>) —
	Maximum number of tool-calling turns when training an agent. If <code>None</code>, there is no limit and generation
	stops when the model generates a response turn with no tool calls or when the total response length reaches
	<code>max_completion_length</code>.`,name:"max_tool_calling_iterations"}]},{title:"Parameters that control the vLLM server",parametersDescription:[{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.vllm_server_base_url",description:`<strong>vllm_server_base_url</strong> (<code>str</code>, <em>optional</em>, defaults to <code>"http --//localhost:8000"</code>):
	Base URL of the vLLM server used for generation (e.g., <code>"http://localhost:8000"</code>).`,name:"vllm_server_base_url"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.vllm_server_timeout",description:`<strong>vllm_server_timeout</strong> (<code>float</code>, <em>optional</em>, defaults to <code>240.0</code>) —
	Total timeout duration in seconds to wait for the vLLM server to be ready.`,name:"vllm_server_timeout"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.request_timeout",description:`<strong>request_timeout</strong> (<code>int</code>, <em>optional</em>, defaults to <code>600</code>) —
	Timeout in seconds for individual HTTP requests to the vLLM server.`,name:"request_timeout"}]},{title:"Parameters that control the training",parametersDescription:[{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.epsilon",description:`<strong>epsilon</strong> (<code>float</code>, <em>optional</em>, defaults to <code>0.2</code>) —
	Lower-bound epsilon value for clipping.`,name:"epsilon"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.epsilon_high",description:`<strong>epsilon_high</strong> (<code>float</code>, <em>optional</em>, defaults to <code>0.2</code>) —
	Upper-bound epsilon value for clipping.`,name:"epsilon_high"}]},{title:"Parameters that control the async rollout pipeline",parametersDescription:[{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.max_inflight_tasks",description:`<strong>max_inflight_tasks</strong> (<code>int</code>, <em>optional</em>, defaults to <code>-1</code>) —
	Maximum number of concurrent generation tasks sent to the vLLM server. Defaults to <code>-1</code> (auto), which
	sets it to <code>max_staleness * per_device_train_batch_size * gradient_accumulation_steps * num_processes</code>.
	If using tool-use environments, you may want to set this manually based on how many parallel environments
	you can run.`,name:"max_inflight_tasks"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.max_staleness",description:`<strong>max_staleness</strong> (<code>int</code>, <em>optional</em>, defaults to <code>4</code>) —
	Maximum number of weight update steps a rollout sample can lag behind the current model version before
	being discarded.`,name:"max_staleness"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.queue_maxsize",description:`<strong>queue_maxsize</strong> (<code>int</code>, <em>optional</em>, defaults to <code>1024</code>) —
	Maximum number of rollout samples to buffer in the rollout queue.`,name:"queue_maxsize"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.weight_sync_steps",description:`<strong>weight_sync_steps</strong> (<code>int</code>, <em>optional</em>, defaults to <code>1</code>) —
	Number of training steps between weight synchronizations to the vLLM server.`,name:"weight_sync_steps"}]},{title:"Parameters that control the logging",parametersDescription:[{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.log_completions",description:`<strong>log_completions</strong> (<code>bool</code>, <em>optional</em>, defaults to <code>False</code>) —
	Whether to log a sample of (prompt, completion) pairs every <code>logging_steps</code> steps.`,name:"log_completions"},{anchor:"trl.experimental.async_grpo.AsyncGRPOConfig.num_completions_to_print",description:`<strong>num_completions_to_print</strong> (<code>int</code>, <em>optional</em>, defaults to <code>3</code>) —
	Number of completions to print when <code>log_completions=True</code>.`,name:"num_completions_to_print"}]}]}}),S=new te({props:{title:"AsyncGRPOTrainer",local:"trl.experimental.async_grpo.AsyncGRPOTrainer",headingTag:"h2"}}),Y=new it({props:{name:"class trl.experimental.async_grpo.AsyncGRPOTrainer",anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer",parameters:[{name:"model",val:": str"},{name:"reward_funcs",val:": collections.abc.Callable[..., list[float]] \| list[collections.abc.Callable[..., list[float]]]"},{name:"args",val:": trl.experimental.async_grpo.async_grpo_config.AsyncGRPOConfig \| None = None"},{name:"train_dataset",val:": datasets.arrow_dataset.Dataset \| datasets.iterable_dataset.IterableDataset \| None = None"},{name:"processing_class",val:": transformers.tokenization_utils_base.PreTrainedTokenizerBase \| None = None"},{name:"callbacks",val:": list[transformers.trainer_callback.TrainerCallback] \| None = None"},{name:"optimizers",val:": tuple = (None, None)"},{name:"tools",val:": list[collections.abc.Callable] \| None = None"},{name:"environment_factory",val:": collections.abc.Callable[[], trl.experimental.async_grpo.async_grpo_trainer._SupportsReset] \| None = None"},{name:"rollout_worker",val:": trl.experimental.async_grpo.async_grpo_trainer.RolloutWorkerProtocol \| None = None"}],parametersDescription:[{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.model",description:`<strong>model</strong> (<code>str</code>) —
	Model to be trained. Must be a string, being the <em>model id</em> of a pretrained model hosted inside a model
	repo on huggingface.co, or a path to a <em>directory</em> containing model weights saved using
	<a href="https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.save_pretrained" rel="nofollow">save_pretrained</a>, e.g., <code>'./my_model_directory/'</code>. The model is loaded
	using <a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForCausalLM.from_pretrained" rel="nofollow">from_pretrained</a>. The model name is also used to identify the
	model on the vLLM server used for generation.`,name:"model"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.reward_funcs",description:`<strong>reward_funcs</strong> (<code>RewardFunc \| list[RewardFunc]</code>) —
	Reward functions to be used for computing the rewards. To compute the rewards, we call all the reward
	functions with the prompts and completions and sum the rewards. Can be either:</p>
	<ul>
	<li>A single reward function: The function is provided with the prompts and the generated completions, plus
	any additional columns in the dataset. It should return a list of rewards. Reward functions can be either
	synchronous or asynchronous and can also return <code>None</code> when the reward is not applicable to those
	samples. This is useful for multi-task training where different reward functions apply to different types
	of samples. When a reward function returns <code>None</code> for a sample, that reward function is excluded from the
	reward calculation for that sample. For more details, see <a href="#using-a-custom-reward-function">Using a custom reward
	function</a>.</li>
	<li>A list of reward functions, where each item is a reward function as described above. Rewards from all
	functions are summed.</li>
	</ul>`,name:"reward_funcs"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.args",description:`<strong>args</strong> (<code>AsyncGRPOConfig</code>, <em>optional</em>) —
	Configuration for this trainer. If <code>None</code>, a default configuration is used.`,name:"args"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.train_dataset",description:`<strong>train_dataset</strong> (<a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset" rel="nofollow">Dataset</a> or <a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset" rel="nofollow">IterableDataset</a>) —
	Dataset to use for training. It must include a column <code>"prompt"</code>. Any additional columns in the dataset are
	ignored. The format of the samples can be either:</p>
	<ul>
	<li><a href="dataset_formats#standard">Standard</a>: Each sample contains plain text.</li>
	<li><a href="dataset_formats#conversational">Conversational</a>: Each sample contains structured messages (e.g., role
	and content).</li>
	</ul>`,name:"train_dataset"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.processing_class",description:`<strong>processing_class</strong> (<a href="https://huggingface.co/docs/transformers/main/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase" rel="nofollow">PreTrainedTokenizerBase</a>, <em>optional</em>) —
	Processing class used to process the data. The padding side must be set to <code>"left"</code>. If <code>None</code>, the
	processing class is loaded from the model’s name with <a href="https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained" rel="nofollow">from_pretrained</a>. A
	padding token, <code>tokenizer.pad_token</code>, must be set. If the processing class has not set a padding token,
	<code>tokenizer.eos_token</code> will be used as the default.`,name:"processing_class"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.callbacks",description:`<strong>callbacks</strong> (list of <a href="https://huggingface.co/docs/transformers/main/en/main_classes/callback#transformers.TrainerCallback" rel="nofollow">TrainerCallback</a>, <em>optional</em>) —
	List of callbacks to customize the training loop. Will add those to the list of default callbacks detailed
	in <a href="https://huggingface.co/docs/transformers/main_classes/callback" rel="nofollow">here</a>.</p>
	<p>If you want to remove one of the default callbacks used, use the <a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.remove_callback" rel="nofollow">remove_callback</a>
	method.`,name:"callbacks"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.optimizers",description:`<strong>optimizers</strong> (<code>tuple[torch.optim.Optimizer \| None, torch.optim.lr_scheduler.LambdaLR \| None]</code>, <em>optional</em>, defaults to <code>(None, None)</code>) —
	A tuple containing the optimizer and the scheduler to use. Will default to an instance of <code>AdamW</code> on your
	model and a scheduler given by <a href="https://huggingface.co/docs/transformers/main/en/main_classes/optimizer_schedules#transformers.get_linear_schedule_with_warmup" rel="nofollow">get_linear_schedule_with_warmup</a> controlled by <code>args</code>.`,name:"optimizers"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.tools",description:`<strong>tools</strong> (list of <code>Callable</code>, <em>optional</em>) —
	A list of callable tool functions (sync or async) that the model can invoke during generation. Each tool
	should be a standard Python function with properly type-hinted arguments and return values, and a
	Google-style docstring describing its purpose, arguments, and return value. For more details, see:
	<a href="https://huggingface.co/docs/transformers/en/chat_extras#passing-tools" rel="nofollow">https://huggingface.co/docs/transformers/en/chat_extras#passing-tools</a>. The model uses the function’s name,
	type hints, and docstring to determine how to call it. Ensure that the model’s chat template supports tool
	use and that it has been fine-tuned for tool calling.`,name:"tools"},{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.environment_factory",description:`<strong>environment_factory</strong> (<code>EnvironmentFactory</code>, <em>optional</em>) —
	A callable that creates and returns an environment instance. The environment class should define methods
	that can be invoked as tools during generation. Each method should comply with the same requirements as the
	<code>tools</code> described above. If <code>environment_factory</code> is provided, an instance of the environment is created
	for each generation in the batch, allowing for parallel and independent interactions. The environment must
	also implement a callable <code>reset</code> method that can be used to reset state between generations. The <code>reset</code>
	method should return either <code>None</code> or a string: when it returns a string, that string is appended to the
	last user message before generation. This feature is experimental and may change or be removed at any time
	without prior notice.`,name:"environment_factory"}],source:"https://github.com/huggingface/trl/blob/vr_5607/trl/experimental/async_grpo/async_grpo_trainer.py#L169"}}),$=new _t({props:{anchor:"trl.experimental.async_grpo.AsyncGRPOTrainer.example",$$slots:{default:[vt]},$$scope:{ctx:de}}}),K=new ht({props:{source:"https://github.com/huggingface/trl/blob/main/docs/source/async_grpo_trainer.md"}}),{c(){m=s("meta"),C=o(),x=s("p"),y=o(),d(b.$$.fragment),i=o(),d(w.$$.fragment),pe=o(),T=s("blockquote"),ne=s("p"),ne.innerHTML=We,Ie=o(),ae=s("p"),ae.innerHTML=ze,Oe=o(),d(P.$$.fragment),ue=o(),d(G.$$.fragment),ge=o(),J=s("p"),J.innerHTML=Ze,fe=o(),L=s("p"),L.innerHTML=De,he=o(),d(A.$$.fragment),_e=o(),U=s("p"),U.innerHTML=Xe,ve=o(),F=s("p"),F.innerHTML=Qe,ye=o(),j=s("ul"),j.innerHTML=Se,be=o(),I=s("p"),I.innerHTML=Ye,we=o(),O=s("p"),O.innerHTML=Ke,Te=o(),E=s("p"),E.innerHTML=et,Me=o(),d(B.$$.fragment),xe=o(),d(H.$$.fragment),Ne=o(),V=s("p"),V.innerHTML=tt,ke=o(),d(q.$$.fragment),$e=o(),k=s("blockquote"),k.innerHTML=nt,Re=o(),d(W.$$.fragment),Ce=o(),d(z.$$.fragment),Pe=o(),Z=s("p"),Z.textContent=at,Ge=o(),d(D.$$.fragment),Je=o(),v=s("div"),d(X.$$.fragment),Ee=o(),oe=s("p"),oe.innerHTML=ot,Be=o(),re=s("p"),re.innerHTML=rt,He=o(),Q=s("blockquote"),Q.innerHTML=st,Le=o(),d(S.$$.fragment),Ae=o(),M=s("div"),d(Y.$$.fragment),Ve=o(),se=s("p"),se.innerHTML=lt,qe=o(),d($.$$.fragment),Ue=o(),d(K.$$.fragment),Fe=o(),ie=s("p"),this.h()},l(e){const t=gt("svelte-u9bgzb",document.head);m=l(t,"META",{name:!0,content:!0}),t.forEach(n),C=r(e),x=l(e,"P",{}),ce(x).forEach(n),y=r(e),p(b.$$.fragment,e),i=r(e),p(w.$$.fragment,e),pe=r(e),T=l(e,"BLOCKQUOTE",{class:!0});var N=ce(T);ne=l(N,"P",{"data-svelte-h":!0}),c(ne)!=="svelte-170a2mh"&&(ne.innerHTML=We),Ie=r(N),ae=l(N,"P",{"data-svelte-h":!0}),c(ae)!=="svelte-2fiwe3"&&(ae.innerHTML=ze),Oe=r(N),p(P.$$.fragment,N),N.forEach(n),ue=r(e),p(G.$$.fragment,e),ge=r(e),J=l(e,"P",{"data-svelte-h":!0}),c(J)!=="svelte-7c3d8k"&&(J.innerHTML=Ze),fe=r(e),L=l(e,"P",{"data-svelte-h":!0}),c(L)!=="svelte-7ij08"&&(L.innerHTML=De),he=r(e),p(A.$$.fragment,e),_e=r(e),U=l(e,"P",{"data-svelte-h":!0}),c(U)!=="svelte-ha6h7"&&(U.innerHTML=Xe),ve=r(e),F=l(e,"P",{"data-svelte-h":!0}),c(F)!=="svelte-1ifo93o"&&(F.innerHTML=Qe),ye=r(e),j=l(e,"UL",{"data-svelte-h":!0}),c(j)!=="svelte-1hgksz5"&&(j.innerHTML=Se),be=r(e),I=l(e,"P",{"data-svelte-h":!0}),c(I)!=="svelte-13ayx4q"&&(I.innerHTML=Ye),we=r(e),O=l(e,"P",{"data-svelte-h":!0}),c(O)!=="svelte-1c5wt0o"&&(O.innerHTML=Ke),Te=r(e),E=l(e,"P",{"data-svelte-h":!0}),c(E)!=="svelte-18ibama"&&(E.innerHTML=et),Me=r(e),p(B.$$.fragment,e),xe=r(e),p(H.$$.fragment,e),Ne=r(e),V=l(e,"P",{"data-svelte-h":!0}),c(V)!=="svelte-7nixay"&&(V.innerHTML=tt),ke=r(e),p(q.$$.fragment,e),$e=r(e),k=l(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),c(k)!=="svelte-1ms05z4"&&(k.innerHTML=nt),Re=r(e),p(W.$$.fragment,e),Ce=r(e),p(z.$$.fragment,e),Pe=r(e),Z=l(e,"P",{"data-svelte-h":!0}),c(Z)!=="svelte-ho94qx"&&(Z.textContent=at),Ge=r(e),p(D.$$.fragment,e),Je=r(e),v=l(e,"DIV",{class:!0});var R=ce(v);p(X.$$.fragment,R),Ee=r(R),oe=l(R,"P",{"data-svelte-h":!0}),c(oe)!=="svelte-ntjsem"&&(oe.innerHTML=ot),Be=r(R),re=l(R,"P",{"data-svelte-h":!0}),c(re)!=="svelte-1vp3ijk"&&(re.innerHTML=rt),He=r(R),Q=l(R,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),c(Q)!=="svelte-17fyuhe"&&(Q.innerHTML=st),R.forEach(n),Le=r(e),p(S.$$.fragment,e),Ae=r(e),M=l(e,"DIV",{class:!0});var le=ce(M);p(Y.$$.fragment,le),Ve=r(le),se=l(le,"P",{"data-svelte-h":!0}),c(se)!=="svelte-cdgyfq"&&(se.innerHTML=lt),qe=r(le),p($.$$.fragment,le),le.forEach(n),Ue=r(e),p(K.$$.fragment,e),Fe=r(e),ie=l(e,"P",{}),ce(ie).forEach(n),this.h()},h(){ee(m,"name","hf:doc:metadata"),ee(m,"content",bt),ee(T,"class","important"),ee(k,"class","tip"),ee(Q,"class","note"),ee(v,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"),ee(M,"class","docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8")},m(e,t){_(document.head,m),a(e,C,t),a(e,x,t),a(e,y,t),u(b,e,t),a(e,i,t),u(w,e,t),a(e,pe,t),a(e,T,t),_(T,ne),_(T,Ie),_(T,ae),_(T,Oe),u(P,T,null),a(e,ue,t),u(G,e,t),a(e,ge,t),a(e,J,t),a(e,fe,t),a(e,L,t),a(e,he,t),u(A,e,t),a(e,_e,t),a(e,U,t),a(e,ve,t),a(e,F,t),a(e,ye,t),a(e,j,t),a(e,be,t),a(e,I,t),a(e,we,t),a(e,O,t),a(e,Te,t),a(e,E,t),a(e,Me,t),u(B,e,t),a(e,xe,t),u(H,e,t),a(e,Ne,t),a(e,V,t),a(e,ke,t),u(q,e,t),a(e,$e,t),a(e,k,t),a(e,Re,t),u(W,e,t),a(e,Ce,t),u(z,e,t),a(e,Pe,t),a(e,Z,t),a(e,Ge,t),u(D,e,t),a(e,Je,t),a(e,v,t),u(X,v,null),_(v,Ee),_(v,oe),_(v,Be),_(v,re),_(v,He),_(v,Q),a(e,Le,t),u(S,e,t),a(e,Ae,t),a(e,M,t),u(Y,M,null),_(M,Ve),_(M,se),_(M,qe),u($,M,null),a(e,Ue,t),u(K,e,t),a(e,Fe,t),a(e,ie,t),je=!0},p(e,[t]){const N={};t&2&&(N.$$scope={dirty:t,ctx:e}),$.$set(N)},i(e){je\|\|(g(b.$$.fragment,e),g(w.$$.fragment,e),g(P.$$.fragment,e),g(G.$$.fragment,e),g(A.$$.fragment,e),g(B.$$.fragment,e),g(H.$$.fragment,e),g(q.$$.fragment,e),g(W.$$.fragment,e),g(z.$$.fragment,e),g(D.$$.fragment,e),g(X.$$.fragment,e),g(S.$$.fragment,e),g(Y.$$.fragment,e),g($.$$.fragment,e),g(K.$$.fragment,e),je=!0)},o(e){f(b.$$.fragment,e),f(w.$$.fragment,e),f(P.$$.fragment,e),f(G.$$.fragment,e),f(A.$$.fragment,e),f(B.$$.fragment,e),f(H.$$.fragment,e),f(q.$$.fragment,e),f(W.$$.fragment,e),f(z.$$.fragment,e),f(D.$$.fragment,e),f(X.$$.fragment,e),f(S.$$.fragment,e),f(Y.$$.fragment,e),f($.$$.fragment,e),f(K.$$.fragment,e),je=!1},d(e){e&&(n(C),n(x),n(y),n(i),n(pe),n(T),n(ue),n(ge),n(J),n(fe),n(L),n(he),n(_e),n(U),n(ve),n(F),n(ye),n(j),n(be),n(I),n(we),n(O),n(Te),n(E),n(Me),n(xe),n(Ne),n(V),n(ke),n($e),n(k),n(Re),n(Ce),n(Pe),n(Z),n(Ge),n(Je),n(v),n(Le),n(Ae),n(M),n(Ue),n(Fe),n(ie)),n(m),h(b,e),h(w,e),h(P),h(G,e),h(A,e),h(B,e),h(H,e),h(q,e),h(W,e),h(z,e),h(D,e),h(X),h(S,e),h(Y),h($),h(K,e)}}}const bt='{"title":"Asynchronous GRPO","local":"asynchronous-grpo","sections":[{"title":"Overview","local":"overview","sections":[],"depth":2},{"title":"How it differs from GRPOTrainer","local":"how-it-differs-from-grpotrainer","sections":[],"depth":2},{"title":"Quick start","local":"quick-start","sections":[],"depth":2},{"title":"Design philosophy","local":"design-philosophy","sections":[],"depth":2},{"title":"AsyncGRPOConfig","local":"trl.experimental.async_grpo.AsyncGRPOConfig","sections":[],"depth":2},{"title":"AsyncGRPOTrainer","local":"trl.experimental.async_grpo.AsyncGRPOTrainer","sections":[],"depth":2}],"depth":1}';function wt(de){return mt(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Rt extends pt{constructor(m){super(),ut(this,m,wt,yt,ct,{})}}export{Rt as component};

Xet Storage Details

Size:: 39.2 kB
Xet hash:: 3e48272496e347e059b203395e6b48293b8fc9eb45bae221b4cc6ad454ac01cb

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.