Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / trl /main /en /learning_tools.html

rtrm

about 1 month ago

download

raw

44.8 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Learning Tools (Experimental 🧪)&quot;,&quot;local&quot;:&quot;learning-tools-experimental-&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Learning to Use a Calculator&quot;,&quot;local&quot;:&quot;learning-to-use-a-calculator&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Experiment results&quot;,&quot;local&quot;:&quot;experiment-results&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;(Early Experiments 🧪): learning to use a wiki tool for question answering&quot;,&quot;local&quot;:&quot;early-experiments--learning-to-use-a-wiki-tool-for-question-answering&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Building a search index&quot;,&quot;local&quot;:&quot;building-a-search-index&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Experiment settings&quot;,&quot;local&quot;:&quot;experiment-settings&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Result and Discussion&quot;,&quot;local&quot;:&quot;result-and-discussion&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;(Early Experiments 🧪): solving math puzzles with python interpreter&quot;,&quot;local&quot;:&quot;early-experiments--solving-math-puzzles-with-python-interpreter&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
	<link href="/docs/trl/main/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
	<link rel="modulepreload" href="/docs/trl/main/en/_app/immutable/entry/start.183b226a.js">
	<link rel="modulepreload" href="/docs/trl/main/en/_app/immutable/chunks/scheduler.85c25b89.js">
	<link rel="modulepreload" href="/docs/trl/main/en/_app/immutable/chunks/singletons.98fe034d.js">
	<link rel="modulepreload" href="/docs/trl/main/en/_app/immutable/chunks/paths.eb9df337.js">
	<link rel="modulepreload" href="/docs/trl/main/en/_app/immutable/entry/app.9853b7f5.js">
	<link rel="modulepreload" href="/docs/trl/main/en/_app/immutable/chunks/index.c142fe32.js">
	<link rel="modulepreload" href="/docs/trl/main/en/_app/immutable/nodes/0.5efac18d.js">
	<link rel="modulepreload" href="/docs/trl/main/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/trl/main/en/_app/immutable/nodes/19.18f457e9.js">
	<link rel="modulepreload" href="/docs/trl/main/en/_app/immutable/chunks/Tip.993c623e.js">
	<link rel="modulepreload" href="/docs/trl/main/en/_app/immutable/chunks/CodeBlock.a5e95a57.js">
	<link rel="modulepreload" href="/docs/trl/main/en/_app/immutable/chunks/EditOnGithub.a592e7aa.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Learning Tools (Experimental 🧪)","local":"learning-tools-experimental-","sections":[{"title":"Learning to Use a Calculator","local":"learning-to-use-a-calculator","sections":[],"depth":2},{"title":"Experiment results","local":"experiment-results","sections":[],"depth":2},{"title":"(Early Experiments 🧪): learning to use a wiki tool for question answering","local":"early-experiments--learning-to-use-a-wiki-tool-for-question-answering","sections":[{"title":"Building a search index","local":"building-a-search-index","sections":[],"depth":3},{"title":"Experiment settings","local":"experiment-settings","sections":[],"depth":3},{"title":"Result and Discussion","local":"result-and-discussion","sections":[],"depth":3}],"depth":2},{"title":"(Early Experiments 🧪): solving math puzzles with python interpreter","local":"early-experiments--solving-math-puzzles-with-python-interpreter","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="learning-tools-experimental-" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#learning-tools-experimental-"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 
11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Learning Tools (Experimental 🧪)</span></h1> <p data-svelte-h="svelte-o7hf6y">Using Large Language Models (LLMs) with tools has been a popular topic recently with awesome works such as <a href="https://huggingface.co/papers/2302.04761" rel="nofollow">ToolFormer</a> and <a href="https://huggingface.co/papers/2305.16504" rel="nofollow">ToolBench</a>. In TRL, we provide a simple example of how to teach LLM to use tools with reinforcement learning.</p> <p data-svelte-h="svelte-11mt98v">Here’s an overview of the scripts in the <a href="https://github.com/lvwerra/trl/tree/main/examples/research_projects/tools" rel="nofollow">trl repository</a>:</p> <table data-svelte-h="svelte-v8ueqj"><thead><tr><th>File</th> <th>Description</th></tr></thead> <tbody><tr><td><a href="https://github.com/lvwerra/trl/blob/main/examples/research_projects/tools/calculator.py" rel="nofollow"><code>calculator.py</code></a></td> <td>Script to train LLM to use a calculator with reinforcement learning.</td></tr> <tr><td><a href="https://github.com/lvwerra/trl/blob/main/examples/research_projects/tools/triviaqa.py" rel="nofollow"><code>triviaqa.py</code></a></td> <td>Script to train LLM to use a wiki tool to answer questions.</td></tr> <tr><td><a href="https://github.com/lvwerra/trl/blob/main/examples/research_projects/tools/python_interpreter.py" rel="nofollow"><code>python_interpreter.py</code></a></td> <td>Script to train LLM to use python interpreter to solve math puzzles.</td></tr></tbody></table> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-95mlun">Note that the scripts above 
rely heavily on the <code>TextEnvironment</code> API which is still under active development. The API may change in the future. Please see <a href="text_environment"><code>TextEnvironment</code></a> for the related docs.</p></div> <h2 class="relative group"><a id="learning-to-use-a-calculator" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#learning-to-use-a-calculator"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Learning to Use a Calculator</span></h2> <p data-svelte-h="svelte-dlnuxr">The rough idea is as follows:</p> <ol><li><p data-svelte-h="svelte-1cfjd6x">Load a tool such as <a href="https://huggingface.co/spaces/ybelkada/simple-calculator" rel="nofollow">ybelkada/simple-calculator</a> that parses a text calculation like <code>"14 + 34"</code> and returns the calculated number:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" 
aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, load_tool
	tool = load_tool(<span class="hljs-string">"ybelkada/simple-calculator"</span>)
	tool_fn = <span class="hljs-keyword">lambda</span> text: <span class="hljs-built_in">str</span>(<span class="hljs-built_in">round</span>(<span class="hljs-built_in">float</span>(tool(text)), <span class="hljs-number">2</span>)) <span class="hljs-comment"># rounding to 2 decimal places</span><!-- HTML_TAG_END --></pre></div></li> <li data-svelte-h="svelte-ssyycj"><p>Define a reward function that returns a positive reward if the tool returns the correct answer. In the script we create a dummy reward function like <code>reward_fn = lambda x: 1</code>, but we override the rewards directly later.</p></li> <li><p data-svelte-h="svelte-em0npc">Create a prompt on how to use the tools</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># system prompt</span>
	prompt = <span class="hljs-string">"""\
	What is 13.1-3?

	&lt;request&gt;&lt;SimpleCalculatorTool&gt;13.1-3&lt;call&gt;10.1&lt;response&gt;

	Result=10.1&lt;submit&gt;

	What is 4*3?

	&lt;request&gt;&lt;SimpleCalculatorTool&gt;4*3&lt;call&gt;12&lt;response&gt;

	Result=12&lt;submit&gt;

	What is 12.1+1?

	&lt;request&gt;&lt;SimpleCalculatorTool&gt;12.1+1&lt;call&gt;13.1&lt;response&gt;

	Result=13.1&lt;submit&gt;

	What is 12.1-20?

	&lt;request&gt;&lt;SimpleCalculatorTool&gt;12.1-20&lt;call&gt;-7.9&lt;response&gt;

	Result=-7.9<submit>"""</span><!-- HTML_TAG_END --></pre></div></li> <li><p data-svelte-h="svelte-idnct8">Create a <code>trl.TextEnvironment</code> with the model</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->env = TextEnvironment(
	model,
	tokenizer,
	{<span class="hljs-string">"SimpleCalculatorTool"</span>: tool_fn},
	reward_fn,
	prompt,
	generation_kwargs=generation_kwargs,
	)<!-- HTML_TAG_END --></pre></div></li> <li data-svelte-h="svelte-jdeihy"><p>Then generate some data such as <code>tasks = ["\n\nWhat is 13.1-3?", "\n\nWhat is 4*3?"]</code> and run the environment with <code>queries, responses, masks, rewards, histories = env.run(tasks)</code>. The environment will look for the <code>&lt;call&gt;</code> token in the prompt and append the tool output to the response; it will also return the mask associated with the response. You can further use the <code>histories</code> to visualize the interaction between the model and the tool; <code>histories[0].show_text()</code> will show the text with color-coded tool output and <code>histories[0].show_tokens(tokenizer)</code> will visualize the tokens.
	<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/learning_tools.png"></p></li> <li data-svelte-h="svelte-3ih7t1"><p>Finally, we can train the model with <code>train_stats = ppo_trainer.step(queries, responses, rewards, masks)</code>. The trainer will use the mask to ignore the tool output when computing the loss, make sure to pass that argument to <code>step</code>.</p></li></ol> <h2 class="relative group"><a id="experiment-results" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#experiment-results"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Experiment results</span></h2> <p data-svelte-h="svelte-tbi3iu">We trained a model with the above script for 10 random seeds. You can reproduce the run with the following command. 
Feel free to remove the <code>--slurm-*</code> arguments if you don’t have access to a slurm cluster.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->WANDB_TAGS=<span class="hljs-string">"calculator_final"</span> python benchmark/benchmark<span class="hljs-selector-class">.py</span> \
	<span class="hljs-attr">--command</span> <span class="hljs-string">"python examples/research_projects/tools/calculator.py"</span> \
	<span class="hljs-attr">--num-seeds</span> <span class="hljs-number">10</span> \
	<span class="hljs-attr">--start-seed</span> <span class="hljs-number">1</span> \
	<span class="hljs-attr">--workers</span> <span class="hljs-number">10</span> \
	<span class="hljs-attr">--slurm-gpus-per-task</span> <span class="hljs-number">1</span> \
	<span class="hljs-attr">--slurm-ntasks</span> <span class="hljs-number">1</span> \
	<span class="hljs-attr">--slurm-total-cpus</span> <span class="hljs-number">8</span> \
	<span class="hljs-attr">--slurm-template-path</span> benchmark/trl.slurm_template<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xurwqg">We can then use <a href="https://github.com/openrlbenchmark/openrlbenchmark" rel="nofollow"><code>openrlbenchmark</code></a> which generates the following plot.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->python -m openrlbenchmark<span class="hljs-selector-class">.rlops_multi_metrics</span> \
	<span class="hljs-attr">--filters</span> <span class="hljs-string">'?we=openrlbenchmark&amp;wpn=trl&amp;xaxis=_step&amp;ceik=trl_ppo_trainer_config.value.tracker_project_name&amp;cen=trl_ppo_trainer_config.value.log_with&amp;metrics=env/reward_mean&amp;metrics=objective/kl'</span> \
	<span class="hljs-string">'wandb?tag=calculator_final&amp;cl=calculator_mask'</span> \
	<span class="hljs-attr">--env-ids</span> trl \
	<span class="hljs-attr">--check-empty-runs</span> \
	<span class="hljs-attr">--pc</span><span class="hljs-selector-class">.ncols</span> <span class="hljs-number">2</span> \
	<span class="hljs-attr">--pc</span><span class="hljs-selector-class">.ncols-legend</span> <span class="hljs-number">1</span> \
	<span class="hljs-attr">--output-filename</span> static/<span class="hljs-number">0</span>compare \
	<span class="hljs-attr">--scan-history</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1g846fo"><img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/learning_tools_chart.png"></p> <p data-svelte-h="svelte-vgz8f5">As we can see, while 1-2 experiments crashed for some reason, most of the runs obtained near perfect proficiency in the calculator task.</p> <h2 class="relative group"><a id="early-experiments--learning-to-use-a-wiki-tool-for-question-answering" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#early-experiments--learning-to-use-a-wiki-tool-for-question-answering"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>(Early Experiments 🧪): learning to use a wiki tool for question answering</span></h2> <p data-svelte-h="svelte-1mpbqn3">In the <a href="https://huggingface.co/papers/2302.04761" rel="nofollow">ToolFormer</a> paper, it shows an interesting use case that utilizes a Wikipedia Search tool to help answer questions. 
In this section, we attempt to perform similar experiments but use RL instead to teach the model to use a wiki tool on the <a href="https://nlp.cs.washington.edu/triviaqa/" rel="nofollow">TriviaQA</a> dataset.</p> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-1quoiyn"><strong>Note that many settings are different so the results are not directly comparable.</strong></p></div> <h3 class="relative group"><a id="building-a-search-index" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#building-a-search-index"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Building a search index</span></h3> <p data-svelte-h="svelte-gx207">Since <a href="https://huggingface.co/papers/2302.04761" rel="nofollow">ToolFormer</a> was not open-sourced, we needed to first replicate the search index. 
It is mentioned in their paper that the authors built the search index using a BM25 retriever that indexes the Wikipedia dump from <a href="https://github.com/facebookresearch/KILT" rel="nofollow">KILT</a>.</p> <p data-svelte-h="svelte-1ea9kw5">Fortunately, <a href="https://github.com/castorini/pyserini" rel="nofollow"><code>pyserini</code></a> already implements the BM25 retriever and provides a prebuilt index for the KILT Wikipedia dump. We can use the following code to search the index.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> pyserini.search.lucene <span class="hljs-keyword">import</span> LuceneSearcher
	<span class="hljs-keyword">import</span> json
	searcher = LuceneSearcher.from_prebuilt_index(<span class="hljs-string">'wikipedia-kilt-doc'</span>)
	<span class="hljs-keyword">def</span> <span class="hljs-title function_">search</span>(<span class="hljs-params">query</span>):
	    hits = searcher.search(query, k=<span class="hljs-number">1</span>)
	    hit = hits[<span class="hljs-number">0</span>]
	    contents = json.loads(hit.raw)[<span class="hljs-string">'contents'</span>]
	    <span class="hljs-keyword">return</span> contents
	<span class="hljs-built_in">print</span>(search(<span class="hljs-string">"tennis racket"</span>))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Racket (sports equipment)
	A racket <span class="hljs-keyword">or</span> racquet is <span class="hljs-keyword">a</span> sports implement consisting <span class="hljs-keyword">of</span> <span class="hljs-keyword">a</span> handled frame <span class="hljs-keyword">with</span> <span class="hljs-keyword">an</span> <span class="hljs-built_in">open</span> hoop across which <span class="hljs-keyword">a</span> network <span class="hljs-keyword">of</span> strings <span class="hljs-keyword">or</span> catgut is stretched tightly. It is used <span class="hljs-keyword">for</span> striking <span class="hljs-keyword">a</span> ball <span class="hljs-keyword">or</span> shuttlecock <span class="hljs-keyword">in</span> games such <span class="hljs-keyword">as</span> squash, tennis, racquetball, <span class="hljs-keyword">and</span> badminton. Collectively, these games are known <span class="hljs-keyword">as</span> racket sports. Racket design <span class="hljs-keyword">and</span> manufacturing has changed considerably over <span class="hljs-keyword">the</span> centuries.

	The frame <span class="hljs-keyword">of</span> rackets <span class="hljs-keyword">for</span> all sports was traditionally made <span class="hljs-keyword">of</span> solid wood (later laminated wood) <span class="hljs-keyword">and</span> <span class="hljs-keyword">the</span> strings <span class="hljs-keyword">of</span> animal intestine known <span class="hljs-keyword">as</span> catgut. The traditional racket size was limited <span class="hljs-keyword">by</span> <span class="hljs-keyword">the</span> strength <span class="hljs-keyword">and</span> weight <span class="hljs-keyword">of</span> <span class="hljs-keyword">the</span> wooden frame which had <span class="hljs-built_in">to</span> be strong enough <span class="hljs-built_in">to</span> hold <span class="hljs-keyword">the</span> strings <span class="hljs-keyword">and</span> stiff enough <span class="hljs-built_in">to</span> hit <span class="hljs-keyword">the</span> ball <span class="hljs-keyword">or</span> shuttle. Manufacturers started adding non-wood laminates <span class="hljs-built_in">to</span> wood rackets <span class="hljs-built_in">to</span> improve stiffness. Non-wood rackets were made <span class="hljs-keyword">first</span> <span class="hljs-keyword">of</span> steel, <span class="hljs-keyword">then</span> <span class="hljs-keyword">of</span> aluminum, <span class="hljs-keyword">and</span> <span class="hljs-keyword">then</span> carbon fiber composites. Wood is still used <span class="hljs-keyword">for</span> real tennis, rackets, <span class="hljs-keyword">and</span> xare. Most rackets are now made <span class="hljs-keyword">of</span> composite materials including carbon fiber <span class="hljs-keyword">or</span> fiberglass, metals such <span class="hljs-keyword">as</span> titanium alloys, <span class="hljs-keyword">or</span> ceramics.
	...<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1oofcgb">We then basically deployed this snippet as a Hugging Face space <a href="https://huggingface.co/spaces/vwxyzjn/pyserini-wikipedia-kilt-doc" rel="nofollow">here</a>, so that we can use the space as a <code>transformers.Tool</code> later.</p> <p data-svelte-h="svelte-682f8q"><img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/pyserini.png"></p> <h3 class="relative group"><a id="experiment-settings" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#experiment-settings"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Experiment settings</span></h3> <p data-svelte-h="svelte-1nttfiy">We use the following settings:</p> <ul data-svelte-h="svelte-1prllvm"><li>use the <code>bigcode/starcoderbase</code> model as the base model</li> <li>use the <code>pyserini-wikipedia-kilt-doc</code> space as the wiki tool and only use the first paragraphs of the search result, allowing the <code>TextEnvironment</code> to obtain at most <code>max_tool_reponse=400</code> response tokens from the tool.</li> <li>test if the response contains 
the answer string, if so, give a reward of 1, otherwise, give a reward of 0.<ul><li>note that this is a simplified evaluation criterion. In <a href="https://huggingface.co/papers/2302.04761" rel="nofollow">ToolFormer</a>, the authors check if the first 20 words of the response contain the correct answer.</li></ul></li> <li>use the following prompt that demonstrates the usage of the wiki tool.</li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->prompt = <span class="hljs-string">"""\
	Answer the following question:

	Q: In which branch of the arts is Patricia Neary famous?
	A: Ballets
	A2: <request><Wiki>Patricia Neary<call>Patricia Neary (born October 27, 1942) is an American ballerina, choreographer and ballet director, who has been particularly active in Switzerland. She has also been a highly successful ambassador for the Balanchine Trust, bringing George Balanchine's ballets to 60 cities around the globe.<response>
	Result=Ballets<submit>

	Q: Who won Super Bowl XX?
	A: Chicago Bears
	A2: <request><Wiki>Super Bowl XX<call>Super Bowl XX was an American football game between the National Football Conference (NFC) champion Chicago Bears and the American Football Conference (AFC) champion New England Patriots to decide the National Football League (NFL) champion for the 1985 season. The Bears defeated the Patriots by the score of 46–10, capturing their first NFL championship (and Chicago's first overall sports victory) since 1963, three years prior to the birth of the Super Bowl. Super Bowl XX was played on January 26, 1986 at the Louisiana Superdome in New Orleans.<response>
	Result=Chicago Bears<submit>

	Q: """</span><!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="result-and-discussion" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#result-and-discussion"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Result and Discussion</span></h3> <p data-svelte-h="svelte-1uznutc">Our experiments show that the agent can learn to use the wiki tool to answer questions. 
The learning curves would go up mostly, but one of the experiments did crash.</p> <p data-svelte-h="svelte-xtcpx0"><img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/triviaqa_learning_curves.png"></p> <p data-svelte-h="svelte-1kk3wng">Wandb report is <a href="https://wandb.ai/costa-huang/cleanRL/reports/TriviaQA-Final-Experiments--Vmlldzo1MjY0ODk5" rel="nofollow">here</a> for further inspection.</p> <p data-svelte-h="svelte-1gz9d0a">Note that the correct rate of the trained model is on the low end, which could be due to the following reasons:</p> <ul data-svelte-h="svelte-16k5ke5"><li><strong>incorrect searches:</strong> When given the question <code>"What is Bruce Willis' real first name?"</code> if the model searches for <code>Bruce Willis</code>, our wiki tool returns “Patrick Poivey (born 18 February 1948) is a French actor. He is especially known for his voice: he is the French dub voice of Bruce Willis since 1988.” But a correct search should be “Walter Bruce Willis (born March 19, 1955) is an American former actor. He achieved fame with a leading role on the comedy-drama series Moonlighting (1985–1989) and appeared in over a hundred films, gaining recognition as an action hero after his portrayal of John McClane in the Die Hard franchise (1988–2013) and other roles.[1][2]”</li></ul> <p data-svelte-h="svelte-7ohcuc"><img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/real_first_name.png"></p> <ul data-svelte-h="svelte-1xqpxtt"><li><p><strong>unnecessarily long response</strong>: The wiki tool by default sometimes outputs very long sequences. E.g., when the wiki tool searches for “Brown Act”</p> <ul><li><p>Our wiki tool returns “The Ralph M. Brown Act, located at California Government Code 54950 “et seq.”, is an act of the California State Legislature, authored by Assemblymember Ralph M. 
Brown and passed in 1953, that guarantees the public’s right to attend and participate in meetings of local legislative bodies.”</p></li> <li><p><a href="https://huggingface.co/papers/2302.04761" rel="nofollow">ToolFormer</a>’s wiki tool returns “The Ralph M. Brown Act is an act of the California State Legislature that guarantees the public’s right to attend and participate in meetings of local legislative bodies.” which is more succinct.</p> <p><img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/brown_act.png"></p></li></ul></li></ul> <h2 class="relative group"><a id="early-experiments--solving-math-puzzles-with-python-interpreter" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#early-experiments--solving-math-puzzles-with-python-interpreter"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>(Early Experiments 🧪): solving math puzzles with python interpreter</span></h2> <p data-svelte-h="svelte-9l5kiy">In this section, we attempt to teach the model to use a python interpreter to solve math puzzles. 
The rough idea is to give the agent a prompt like the following:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->prompt = <span class="hljs-string">"""\
	Example of using a Python API to solve math questions.

	Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?

	<request><PythonInterpreter>
	def solution():
	money_initial = 23
	bagels = 5
	bagel_cost = 3
	money_spent = bagels * bagel_cost
	money_left = money_initial - money_spent
	result = money_left
	return result
	print(solution())
	<call>8<response>

	Result = 8 <submit>

	Q: """</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1dnddfe">Training experiment can be found at <a href="https://wandb.ai/lvwerra/trl-gsm8k/runs/a5odv01y" rel="nofollow">https://wandb.ai/lvwerra/trl-gsm8k/runs/a5odv01y</a></p> <p data-svelte-h="svelte-1cvjnh6"><img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/gms8k_learning_curve.png"></p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/trl/blob/main/docs/source/learning_tools.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	// Inline client bootstrap for this page (generated by the build; the
	// hashed file names below must match the emitted assets).
	{
	// Global configuration object: asset and base paths both point at the
	// docs root for this page, and no environment values are exposed.
	__sveltekit_5yobsv = {
	assets: "/docs/trl/main/en",
	base: "/docs/trl/main/en",
	env: {}
	};

	// Must be read synchronously: document.currentScript is only set while
	// this script is executing, not inside the promise callback below.
	const element = document.currentScript.parentElement;

	// One entry per id in node_ids below; both are null here.
	const data = [null,null];

	// Load the start entry and the app entry in parallel, then start the
	// client once both modules have resolved.
	Promise.all([
	import("/docs/trl/main/en/_app/immutable/entry/start.183b226a.js"),
	import("/docs/trl/main/en/_app/immutable/entry/app.9853b7f5.js")
	]).then(([kit, app]) => {
	// NOTE(review): presumably this hydrates the server-rendered markup
	// inside `element` — confirm against the SvelteKit version in use.
	kit.start(app, element, {
	// Ids 0 and 19 correspond to the nodes/0.*.js and nodes/19.*.js chunks
	// preloaded in <head>.
	node_ids: [0, 19],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 44.8 kB
Xet hash:: b4a239ce8d9154b3ecd9ed14436919bb2985376fb5604802d96cbcfc42d4e8ea

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.