Buckets:

hf-doc-build
/

doc-dev

hf-doc-build/doc-dev/deep-rl-course/pr_661/en/_app/immutable/nodes/46.f00a6b99.js

rtrm's picture

about 2 months ago

12.6 kB

	import { s as rt, n as pt, o as ct } from "../chunks/scheduler.ef843396.js";
import {
  S as mt,
  i as ht,
  e as l,
  s as a,
  c as g,
  h as gt,
  a as o,
  d as i,
  b as s,
  f as ot,
  g as u,
  j as r,
  k as Y,
  l as ut,
  m as n,
  n as d,
  t as f,
  o as v,
  p as y,
} from "../chunks/index.05ef1181.js";
import { H as K, E as dt } from "../chunks/MermaidChart.svelte_svelte_type_style_lang.ef6649bf.js";

/*
 * Compiled page chunk for the course page
 * "The advantages and disadvantages of policy-gradient methods".
 *
 * This is build output, reformatted for readability. Two defects of the
 * previous text are repaired here:
 *   - the guard in `i()` read `ze\|\|(...)`; the backslash-escaped pipes are
 *     not valid JavaScript and are restored to a plain "run once" guard;
 *   - hard line wraps had been inserted inside string literals and in the
 *     middle of expressions; the pieces are rejoined with nothing added.
 *
 * The helper names (`l`, `a`, `g`, `o`, `s`, `u`, `r`, `Y`, `n`, `d`, `f`,
 * `v`, `y`, ...) are aliases of exports from `../chunks/index.05ef1181.js`;
 * their definitions are not visible in this file.
 * NOTE(review): the call shapes match the Svelte runtime helpers (create
 * element / whitespace / claim / insert / detach, component create / mount /
 * destroy, transition in / out) — confirm against the chunk before relying
 * on that reading.
 */

/**
 * Builds the fragment object for this page.
 *
 * @param {*} Ue - Unused here; kept because the caller (`ht`, via `bt`)
 *   supplies it.
 * @returns {{c: Function, l: Function, h: Function, m: Function, p: Function,
 *   i: Function, o: Function, d: Function}} Object with the lifecycle
 *   callbacks defined below.
 */
function ft(Ue) {
  // ---- Static page content ------------------------------------------------
  // These strings are assigned to `innerHTML` / `textContent` in `c()` and
  // `l()`. In `l()` each one is paired with a "svelte-..." marker literal.
  // NOTE(review): those markers look like content hashes — do not edit the
  // text below without regenerating the chunk.
  const Fe = "At this point, you might ask, “but Deep Q-Learning is excellent! Why use policy-gradient methods?“. To answer this question, let’s study the <strong>advantages and disadvantages of policy-gradient methods</strong>.";
  const Ie = "There are multiple advantages over value-based methods. Let’s see some of them:";
  const Oe = "We can estimate the policy directly without storing additional data (action values).";
  const Qe = "Policy-gradient methods can <strong>learn a stochastic policy while value functions can’t</strong>.";
  const We = "This has two consequences:";
  const Ae = "<li><p>We <strong>don’t need to implement an exploration/exploitation trade-off by hand</strong>. Since we output a probability distribution over actions, the agent explores <strong>the state space without always taking the same trajectory.</strong></p></li> <li><p>We also get rid of the problem of <strong>perceptual aliasing</strong>. Perceptual aliasing is when two states seem (or are) the same but need different actions.</p></li>";
  const Re = "Let’s take an example: we have an intelligent vacuum cleaner whose goal is to suck the dust and avoid killing the hamsters.";
  const De = '<img src="https://huggingface.co/datasets/huggingface-deep-rl-course/course-images/resolve/main/en/unit6/hamster1.jpg" alt="Hamster 1"/>';
  const Ge = "Our vacuum cleaner can only perceive where the walls are.";
  const Se = "The problem is that the <strong>two red (colored) states are aliased states because the agent perceives an upper and lower wall for each</strong>.";
  const Be = '<img src="https://huggingface.co/datasets/huggingface-deep-rl-course/course-images/resolve/main/en/unit6/hamster2.jpg" alt="Hamster 1"/>';
  const Ne = "Under a deterministic policy, the policy will either always move right when in a red state or always move left. <strong>Either case will cause our agent to get stuck and never suck the dust</strong>.";
  const Je = "Under a value-based Reinforcement learning algorithm, we learn a <strong>quasi-deterministic policy</strong> (“greedy epsilon strategy”). Consequently, our agent can <strong>spend a lot of time before finding the dust</strong>.";
  const Ke = "On the other hand, an optimal stochastic policy <strong>will randomly move left or right in red (colored) states</strong>. Consequently, <strong>it will not be stuck and will reach the goal state with a high probability</strong>.";
  const Ve = '<img src="https://huggingface.co/datasets/huggingface-deep-rl-course/course-images/resolve/main/en/unit6/hamster3.jpg" alt="Hamster 1"/>';
  const Xe = "The problem with Deep Q-learning is that their <strong>predictions assign a score (maximum expected future reward) for each possible action</strong>, at each time step, given the current state.";
  const Ye = "But what if we have an infinite possibility of actions?";
  const Ze = "For instance, with a self-driving car, at each state, you can have a (near) infinite choice of actions (turning the wheel at 15°, 17.2°, 19,4°, honking, etc.). <strong>We’ll need to output a Q-value for each possible action</strong>! And <strong>taking the max action of a continuous output is an optimization problem itself</strong>!";
  const et = "Instead, with policy-gradient methods, we output a <strong>probability distribution over actions.</strong>";
  // Template literal: the embedded line break is part of the string.
  // NOTE(review): the tab that starts its second line is reproduced exactly
  // as it appeared; confirm against the build output that it is not an
  // artifact of how the file was captured.
  const tt = `In value-based methods, we use an aggressive operator to <strong>change the value function: we take the maximum over Q-estimates</strong>.
	Consequently, the action probabilities may change dramatically for an arbitrarily small change in the estimated action values if that change results in a different action having the maximal value.`;
  const it = "For instance, if during the training, the best action was left (with a Q-value of 0.22) and the training step after it’s right (since the right Q-value becomes 0.23), we dramatically changed the policy since now the policy will take most of the time right instead of left.";
  const nt = "On the other hand, in policy-gradient methods, stochastic policy action preferences (probability of taking action) <strong>change smoothly over time</strong>.";
  const at = "Naturally, policy-gradient methods also have some disadvantages:";
  const st = "<li><strong>Frequently, policy-gradient methods converges to a local maximum instead of a global optimum.</strong></li> <li>Policy-gradient goes slower, <strong>step by step: it can take longer to train (inefficient).</strong></li> <li>Policy-gradient can have high variance. We’ll see in the actor-critic unit why, and how we can solve this problem.</li>";
  const lt = '👉 If you want to go deeper into the advantages and disadvantages of policy-gradient methods, <a href="https://youtu.be/y3oqOjHilio" rel="nofollow">you can check this video</a>.';

  // ---- Node handles, filled in by c() or l() ------------------------------
  let p, Z, V, ee, te, $, ie, ne, b, ae, se, C, le, oe, H, re, L, pe, M, ce, P, me, c, he, k, ge, q, ue, m, de, j, fe, E, ve, z, ye, h, we, $e, F, xe, I, be, O, Te, Q, Ce, _e, A, He, R, Le, D, Me, Pe, S, ke, B, qe, N, je, Ee, X;
  // Set to true after i() has run and back to false by o().
  let ze;

  // ---- Child components ---------------------------------------------------
  const w = new K({
    props: {
      title: "The advantages and disadvantages of policy-gradient methods",
      local: "the-advantages-and-disadvantages-of-policy-gradient-methods",
      headingTag: "h1",
    },
  });
  const x = new K({ props: { title: "Advantages", local: "advantages", headingTag: "h2" } });
  const T = new K({
    props: {
      title: "The simplicity of integration",
      local: "the-simplicity-of-integration",
      headingTag: "h3",
    },
  });
  const _ = new K({
    props: {
      title: "Policy-gradient methods can learn a stochastic policy",
      local: "policy-gradient-methods-can-learn-a-stochastic-policy",
      headingTag: "h3",
    },
  });
  const U = new K({
    props: {
      title: "Policy-gradient methods are more effective in high-dimensional action spaces and continuous actions spaces",
      local: "policy-gradient-methods-are-more-effective-in-high-dimensional-action-spaces-and-continuous-actions-spaces",
      headingTag: "h3",
    },
  });
  const W = new K({
    props: {
      title: "Policy-gradient methods have better convergence properties",
      local: "policy-gradient-methods-have-better-convergence-properties",
      headingTag: "h3",
    },
  });
  const G = new K({ props: { title: "Disadvantages", local: "disadvantages", headingTag: "h2" } });
  const J = new dt({
    props: {
      source: "https://github.com/huggingface/deep-rl-class/blob/main/units/en/unit4/advantages-disadvantages.mdx",
    },
  });

  // Every child component, in document order; i(), o() and d() walk this list.
  const children = [w, x, T, _, U, W, G, J];

  return {
    // Builds all nodes from scratch, in document order, then applies
    // attributes through h().
    c() {
      p = l("meta"); Z = a();
      V = l("p"); ee = a();
      g(w.$$.fragment); te = a();
      $ = l("p"); $.innerHTML = Fe; ie = a();
      g(x.$$.fragment); ne = a();
      b = l("p"); b.textContent = Ie; ae = a();
      g(T.$$.fragment); se = a();
      C = l("p"); C.textContent = Oe; le = a();
      g(_.$$.fragment); oe = a();
      H = l("p"); H.innerHTML = Qe; re = a();
      L = l("p"); L.textContent = We; pe = a();
      M = l("ol"); M.innerHTML = Ae; ce = a();
      P = l("p"); P.textContent = Re; me = a();
      c = l("figure"); c.innerHTML = De; he = a();
      k = l("p"); k.textContent = Ge; ge = a();
      q = l("p"); q.innerHTML = Se; ue = a();
      m = l("figure"); m.innerHTML = Be; de = a();
      j = l("p"); j.innerHTML = Ne; fe = a();
      E = l("p"); E.innerHTML = Je; ve = a();
      z = l("p"); z.innerHTML = Ke; ye = a();
      h = l("figure"); h.innerHTML = Ve; we = a();
      g(U.$$.fragment); $e = a();
      F = l("p"); F.innerHTML = Xe; xe = a();
      I = l("p"); I.textContent = Ye; be = a();
      O = l("p"); O.innerHTML = Ze; Te = a();
      Q = l("p"); Q.innerHTML = et; Ce = a();
      g(W.$$.fragment); _e = a();
      A = l("p"); A.innerHTML = tt; He = a();
      R = l("p"); R.textContent = it; Le = a();
      D = l("p"); D.innerHTML = nt; Me = a();
      g(G.$$.fragment); Pe = a();
      S = l("p"); S.textContent = at; ke = a();
      B = l("ul"); B.innerHTML = st; qe = a();
      N = l("p"); N.innerHTML = lt; je = a();
      g(J.$$.fragment); Ee = a();
      X = l("p");
      this.h();
    },

    // Same node sequence as c(), but each handle is obtained from the node
    // list `e` through `o` / `s` / `u` instead of being created. Content is
    // only (re)assigned when `r(node)` differs from the paired marker.
    l(e) {
      const t = gt("svelte-u9bgzb", document.head);
      p = o(t, "META", { name: true, content: true });
      t.forEach(i);
      Z = s(e);
      V = o(e, "P", {}); ot(V).forEach(i); ee = s(e);
      u(w.$$.fragment, e); te = s(e);
      $ = o(e, "P", { "data-svelte-h": true });
      if (r($) !== "svelte-tdbzv5") $.innerHTML = Fe;
      ie = s(e);
      u(x.$$.fragment, e); ne = s(e);
      b = o(e, "P", { "data-svelte-h": true });
      if (r(b) !== "svelte-maw3p6") b.textContent = Ie;
      ae = s(e);
      u(T.$$.fragment, e); se = s(e);
      C = o(e, "P", { "data-svelte-h": true });
      if (r(C) !== "svelte-2fsuyn") C.textContent = Oe;
      le = s(e);
      u(_.$$.fragment, e); oe = s(e);
      H = o(e, "P", { "data-svelte-h": true });
      if (r(H) !== "svelte-1e82bsg") H.innerHTML = Qe;
      re = s(e);
      L = o(e, "P", { "data-svelte-h": true });
      if (r(L) !== "svelte-1dwxfce") L.textContent = We;
      pe = s(e);
      M = o(e, "OL", { "data-svelte-h": true });
      if (r(M) !== "svelte-3l8e2h") M.innerHTML = Ae;
      ce = s(e);
      P = o(e, "P", { "data-svelte-h": true });
      if (r(P) !== "svelte-hzq0kh") P.textContent = Re;
      me = s(e);
      c = o(e, "FIGURE", { class: true, "data-svelte-h": true });
      if (r(c) !== "svelte-juzukv") c.innerHTML = De;
      he = s(e);
      k = o(e, "P", { "data-svelte-h": true });
      if (r(k) !== "svelte-sba7bj") k.textContent = Ge;
      ge = s(e);
      q = o(e, "P", { "data-svelte-h": true });
      if (r(q) !== "svelte-14k1z7") q.innerHTML = Se;
      ue = s(e);
      m = o(e, "FIGURE", { class: true, "data-svelte-h": true });
      if (r(m) !== "svelte-zv8qjk") m.innerHTML = Be;
      de = s(e);
      j = o(e, "P", { "data-svelte-h": true });
      if (r(j) !== "svelte-wo6gfi") j.innerHTML = Ne;
      fe = s(e);
      E = o(e, "P", { "data-svelte-h": true });
      if (r(E) !== "svelte-h0f6wc") E.innerHTML = Je;
      ve = s(e);
      z = o(e, "P", { "data-svelte-h": true });
      if (r(z) !== "svelte-140fl91") z.innerHTML = Ke;
      ye = s(e);
      h = o(e, "FIGURE", { class: true, "data-svelte-h": true });
      if (r(h) !== "svelte-y1qp89") h.innerHTML = Ve;
      we = s(e);
      u(U.$$.fragment, e); $e = s(e);
      F = o(e, "P", { "data-svelte-h": true });
      if (r(F) !== "svelte-1mhp325") F.innerHTML = Xe;
      xe = s(e);
      I = o(e, "P", { "data-svelte-h": true });
      if (r(I) !== "svelte-19u4xo5") I.textContent = Ye;
      be = s(e);
      O = o(e, "P", { "data-svelte-h": true });
      if (r(O) !== "svelte-od3dsl") O.innerHTML = Ze;
      Te = s(e);
      Q = o(e, "P", { "data-svelte-h": true });
      if (r(Q) !== "svelte-6wxwpi") Q.innerHTML = et;
      Ce = s(e);
      u(W.$$.fragment, e); _e = s(e);
      A = o(e, "P", { "data-svelte-h": true });
      if (r(A) !== "svelte-1ewv4ss") A.innerHTML = tt;
      He = s(e);
      R = o(e, "P", { "data-svelte-h": true });
      if (r(R) !== "svelte-xfh67p") R.textContent = it;
      Le = s(e);
      D = o(e, "P", { "data-svelte-h": true });
      if (r(D) !== "svelte-1nkvz6") D.innerHTML = nt;
      Me = s(e);
      u(G.$$.fragment, e); Pe = s(e);
      S = o(e, "P", { "data-svelte-h": true });
      if (r(S) !== "svelte-1fe6jwi") S.textContent = at;
      ke = s(e);
      B = o(e, "UL", { "data-svelte-h": true });
      if (r(B) !== "svelte-posk1q") B.innerHTML = st;
      qe = s(e);
      N = o(e, "P", { "data-svelte-h": true });
      if (r(N) !== "svelte-s99z02") N.innerHTML = lt;
      je = s(e);
      u(J.$$.fragment, e); Ee = s(e);
      X = o(e, "P", {}); ot(X).forEach(i);
      this.h();
    },

    // Applies the attributes shared by c() and l(): the metadata <meta> tag
    // and the class of the three <figure> wrappers.
    h() {
      Y(p, "name", "hf:doc:metadata");
      Y(p, "content", vt);
      Y(c, "class", "image table text-center m-0 w-full");
      Y(m, "class", "image table text-center m-0 w-full");
      Y(h, "class", "image table text-center m-0 w-full");
    },

    // Places the <meta> node in document.head and everything else into
    // target `e` before anchor `t`, in document order.
    m(e, t) {
      ut(document.head, p);
      n(e, Z, t); n(e, V, t); n(e, ee, t);
      d(w, e, t); n(e, te, t);
      n(e, $, t); n(e, ie, t);
      d(x, e, t); n(e, ne, t);
      n(e, b, t); n(e, ae, t);
      d(T, e, t); n(e, se, t);
      n(e, C, t); n(e, le, t);
      d(_, e, t); n(e, oe, t);
      n(e, H, t); n(e, re, t);
      n(e, L, t); n(e, pe, t);
      n(e, M, t); n(e, ce, t);
      n(e, P, t); n(e, me, t);
      n(e, c, t); n(e, he, t);
      n(e, k, t); n(e, ge, t);
      n(e, q, t); n(e, ue, t);
      n(e, m, t); n(e, de, t);
      n(e, j, t); n(e, fe, t);
      n(e, E, t); n(e, ve, t);
      n(e, z, t); n(e, ye, t);
      n(e, h, t); n(e, we, t);
      d(U, e, t); n(e, $e, t);
      n(e, F, t); n(e, xe, t);
      n(e, I, t); n(e, be, t);
      n(e, O, t); n(e, Te, t);
      n(e, Q, t); n(e, Ce, t);
      d(W, e, t); n(e, _e, t);
      n(e, A, t); n(e, He, t);
      n(e, R, t); n(e, Le, t);
      n(e, D, t); n(e, Me, t);
      d(G, e, t); n(e, Pe, t);
      n(e, S, t); n(e, ke, t);
      n(e, B, t); n(e, qe, t);
      n(e, N, t); n(e, je, t);
      d(J, e, t); n(e, Ee, t);
      n(e, X, t);
      ze = true;
    },

    // Update hook: the imported `pt` is used as-is; this fragment defines no
    // update logic of its own.
    p: pt,

    // Runs `f` on every child fragment once; `ze` prevents a second pass
    // until o() has reset it.
    i(e) {
      if (ze) return;
      for (const child of children) f(child.$$.fragment, e);
      ze = true;
    },

    // Runs `v` on every child fragment and clears the `ze` flag.
    o(e) {
      for (const child of children) v(child.$$.fragment, e);
      ze = false;
    },

    // Teardown. When `e` is truthy the fragment's own nodes are passed to
    // `i`; the <meta> node and the child components are always released.
    d(e) {
      if (e) {
        const ownNodes = [
          Z, V, ee, te, $, ie, ne, b, ae, se, C, le, oe, H, re, L, pe, M, ce,
          P, me, c, he, k, ge, q, ue, m, de, j, fe, E, ve, z, ye, h, we, $e,
          F, xe, I, be, O, Te, Q, Ce, _e, A, He, R, Le, D, Me, Pe, S, ke, B,
          qe, N, je, Ee, X,
        ];
        for (const node of ownNodes) i(node);
      }
      i(p);
      for (const child of children) y(child, e);
    },
  };
}

// JSON text written to the `content` attribute of the "hf:doc:metadata"
// <meta> tag in h(): the page's heading outline.
const vt = '{"title":"The advantages and disadvantages of policy-gradient methods","local":"the-advantages-and-disadvantages-of-policy-gradient-methods","sections":[{"title":"Advantages","local":"advantages","sections":[{"title":"The simplicity of integration","local":"the-simplicity-of-integration","sections":[],"depth":3},{"title":"Policy-gradient methods can learn a stochastic policy","local":"policy-gradient-methods-can-learn-a-stochastic-policy","sections":[],"depth":3},{"title":"Policy-gradient methods are more effective in high-dimensional action spaces and continuous actions spaces","local":"policy-gradient-methods-are-more-effective-in-high-dimensional-action-spaces-and-continuous-actions-spaces","sections":[],"depth":3},{"title":"Policy-gradient methods have better convergence properties","local":"policy-gradient-methods-have-better-convergence-properties","sections":[],"depth":3}],"depth":2},{"title":"Disadvantages","local":"disadvantages","sections":[],"depth":2}],"depth":1}';

/**
 * Instance initialiser passed to `ht`.
 *
 * Registers a callback through `ct` that reads the `fw` query parameter from
 * the current URL and discards the value, then returns an empty array.
 *
 * @param {*} Ue - Unused.
 * @returns {Array} Always an empty array.
 */
function yt(Ue) {
  ct(() => {
    new URLSearchParams(window.location.search).get("fw");
  });
  return [];
}

/**
 * Page component: wires `yt` and `ft` into the imported base class `mt`
 * through the imported initialiser `ht`.
 */
class bt extends mt {
  /**
   * @param {*} p - Options object forwarded unchanged to `ht`.
   */
  constructor(p) {
    super();
    ht(this, p, yt, ft, rt, {});
  }
}

export { bt as component };

Xet Storage Details

Size: 12.6 kB
Xet hash: 85e16592d409cd0d786b7fb6809bcf4948d7832dd8352b266a2ab230c359fb23

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.