Buckets:
| import{s as _s,n as $s,o as Us}from"../chunks/scheduler.d75c11ed.js";import{S as Cs,i as Is,e as p,s as n,c as i,h as ks,a as r,d as t,b as l,f as ws,g as o,j as u,k as Js,l as Qs,m as e,n as m,t as c,o as d,p as h}from"../chunks/index.4ec9dfe9.js";import{C as Rs,H as ms,E as qs}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.ee0f129e.js";import{C as H}from"../chunks/CodeBlock.5919a092.js";function Gs(cs){let f,Y,P,v,M,L,j,B,g,ds=`This document is a quick introduction to using <code>datasets</code> with Pandas, with a particular focus on how to process | |
| datasets using Pandas functions, and how to convert a dataset to Pandas or from Pandas.`,E,y,hs="This is particularly useful as it allows fast operations, since <code>datasets</code> uses PyArrow under the hood and PyArrow is well integrated with Pandas.",A,b,F,T,us="By default, datasets return regular Python objects: integers, floats, strings, lists, etc.",S,w,fs='To get Pandas DataFrames or Series instead, you can set the format of the dataset to <code>pandas</code> using <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.with_format">Dataset.with_format()</a>:',N,J,W,_,Ms="This also works for <code>IterableDataset</code> objects obtained e.g. using <code>load_dataset(..., streaming=True)</code>:",X,$,V,U,K,C,js='Pandas functions are generally faster than regular hand-written python functions, and therefore they are a good option to optimize data processing. You can use Pandas functions to process a dataset in <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">Dataset.map()</a> or <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.filter">Dataset.filter()</a>:',O,I,ss,k,gs="We use <code>batched=True</code> because it is faster to process batches of data in Pandas rather than row by row. It’s also possible to use <code>batch_size=</code> in <code>map()</code> to set the size of each <code>df</code>.",as,Q,ys='This also works for <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.map">IterableDataset.map()</a> and <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.filter">IterableDataset.filter()</a>.',ts,R,es,q,bs='To import data from Pandas, you can use <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.from_pandas">Dataset.from_pandas()</a>:',ns,G,ls,Z,Ts='And you can use <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.to_pandas">Dataset.to_pandas()</a> to export a Dataset to a Pandas DataFrame:',ps,z,rs,D,is,x,os;return M=new Rs({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),j=new ms({props:{title:"Use with Pandas",local:"use-with-pandas",headingTag:"h1"}}),b=new ms({props:{title:"Dataset format",local:"dataset-format",headingTag:"h2"}}),J=new H({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwRGF0YXNldCUwQWRhdGElMjAlM0QlMjAlN0IlMjJjb2xfMCUyMiUzQSUyMCU1QiUyMmElMjIlMkMlMjAlMjJiJTIyJTJDJTIwJTIyYyUyMiUyQyUyMCUyMmQlMjIlNUQlMkMlMjAlMjJjb2xfMSUyMiUzQSUyMCU1QjAuJTJDJTIwMC4lMkMlMjAxLiUyQyUyMDEuJTVEJTdEJTBBZHMlMjAlM0QlMjBEYXRhc2V0LmZyb21fZGljdChkYXRhKSUwQWRzJTIwJTNEJTIwZHMud2l0aF9mb3JtYXQoJTIycGFuZGFzJTIyKSUwQWRzJTVCMCU1RCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMHBkLkRhdGFGcmFtZSUwQWRzJTVCJTNBMiU1RCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMHBkLkRhdGFGcmFtZSUwQWRzJTVCJTIyZGF0YSUyMiU1RCUyMCUyMCUyMyUyMHBkLlNlcmllcw==",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset | |
| <span class="hljs-meta">>>> </span>data = {<span class="hljs-string">"col_0"</span>: [<span class="hljs-string">"a"</span>, <span class="hljs-string">"b"</span>, <span class="hljs-string">"c"</span>, <span class="hljs-string">"d"</span>], <span class="hljs-string">"col_1"</span>: [<span class="hljs-number">0.</span>, <span class="hljs-number">0.</span>, <span class="hljs-number">1.</span>, <span class="hljs-number">1.</span>]} | |
| <span class="hljs-meta">>>> </span>ds = Dataset.from_dict(data) | |
| <span class="hljs-meta">>>> </span>ds = ds.with_format(<span class="hljs-string">"pandas"</span>) | |
| <span class="hljs-meta">>>> </span>ds[<span class="hljs-number">0</span>] <span class="hljs-comment"># pd.DataFrame</span> | |
| col_0 col_1 | |
| <span class="hljs-number">0</span> a <span class="hljs-number">0.0</span> | |
| <span class="hljs-meta">>>> </span>ds[:<span class="hljs-number">2</span>] <span class="hljs-comment"># pd.DataFrame</span> | |
| col_0 col_1 | |
| <span class="hljs-number">0</span> a <span class="hljs-number">0.0</span> | |
| <span class="hljs-number">1</span> b <span class="hljs-number">0.0</span> | |
| <span class="hljs-meta">>>> </span>ds[<span class="hljs-string">"data"</span>] <span class="hljs-comment"># pd.Series</span> | |
| <span class="hljs-number">0</span> a | |
| <span class="hljs-number">1</span> b | |
| <span class="hljs-number">2</span> c | |
| <span class="hljs-number">3</span> d | |
| Name: col_0, dtype: <span class="hljs-built_in">object</span>`,wrap:!1}}),$=new H({props:{code:"ZHMlMjAlM0QlMjBkcy53aXRoX2Zvcm1hdCglMjJwYW5kYXMlMjIpJTBBZm9yJTIwZGYlMjBpbiUyMGRzLml0ZXIoYmF0Y2hfc2l6ZSUzRDIpJTNBJTBBJTIwJTIwJTIwJTIwcHJpbnQoZGYpJTBBJTIwJTIwJTIwJTIwYnJlYWs=",highlighted:`<span class="hljs-meta">>>> </span>ds = ds.with_format(<span class="hljs-string">"pandas"</span>) | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">for</span> df <span class="hljs-keyword">in</span> ds.<span class="hljs-built_in">iter</span>(batch_size=<span class="hljs-number">2</span>): | |
| <span class="hljs-meta">... </span> <span class="hljs-built_in">print</span>(df) | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">break</span> | |
| col_0 col_1 | |
| <span class="hljs-number">0</span> a <span class="hljs-number">0.0</span> | |
| <span class="hljs-number">1</span> b <span class="hljs-number">0.0</span>`,wrap:!1}}),U=new ms({props:{title:"Process data",local:"process-data",headingTag:"h2"}}),I=new H({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwRGF0YXNldCUwQWRhdGElMjAlM0QlMjAlN0IlMjJjb2xfMCUyMiUzQSUyMCU1QiUyMmElMjIlMkMlMjAlMjJiJTIyJTJDJTIwJTIyYyUyMiUyQyUyMCUyMmQlMjIlNUQlMkMlMjAlMjJjb2xfMSUyMiUzQSUyMCU1QjAuJTJDJTIwMC4lMkMlMjAxLiUyQyUyMDEuJTVEJTdEJTBBZHMlMjAlM0QlMjBEYXRhc2V0LmZyb21fZGljdChkYXRhKSUwQWRzJTIwJTNEJTIwZHMud2l0aF9mb3JtYXQoJTIycGFuZGFzJTIyKSUwQWRzJTIwJTNEJTIwZHMubWFwKGxhbWJkYSUyMGRmJTNBJTIwZGYuYXNzaWduKGNvbF8yJTNEZGYuY29sXzElMjAlMkIlMjAxKSUyQyUyMGJhdGNoZWQlM0RUcnVlKSUwQWRzJTVCJTNBMiU1RCUwQWRzJTIwJTNEJTIwZHMuZmlsdGVyKGxhbWJkYSUyMGRmJTNBJTIwZGYuY29sXzAlMjAlM0QlM0QlMjAlMjJiJTIyJTJDJTIwYmF0Y2hlZCUzRFRydWUpJTBBZHMlNUIwJTVE",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset | |
| <span class="hljs-meta">>>> </span>data = {<span class="hljs-string">"col_0"</span>: [<span class="hljs-string">"a"</span>, <span class="hljs-string">"b"</span>, <span class="hljs-string">"c"</span>, <span class="hljs-string">"d"</span>], <span class="hljs-string">"col_1"</span>: [<span class="hljs-number">0.</span>, <span class="hljs-number">0.</span>, <span class="hljs-number">1.</span>, <span class="hljs-number">1.</span>]} | |
| <span class="hljs-meta">>>> </span>ds = Dataset.from_dict(data) | |
| <span class="hljs-meta">>>> </span>ds = ds.with_format(<span class="hljs-string">"pandas"</span>) | |
| <span class="hljs-meta">>>> </span>ds = ds.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> df: df.assign(col_2=df.col_1 + <span class="hljs-number">1</span>), batched=<span class="hljs-literal">True</span>) | |
| <span class="hljs-meta">>>> </span>ds[:<span class="hljs-number">2</span>] | |
| col_0 col_1 col_2 | |
| <span class="hljs-number">0</span> a <span class="hljs-number">0.0</span> <span class="hljs-number">1.0</span> | |
| <span class="hljs-number">1</span> b <span class="hljs-number">0.0</span> <span class="hljs-number">1.0</span> | |
| <span class="hljs-meta">>>> </span>ds = ds.<span class="hljs-built_in">filter</span>(<span class="hljs-keyword">lambda</span> df: df.col_0 == <span class="hljs-string">"b"</span>, batched=<span class="hljs-literal">True</span>) | |
| <span class="hljs-meta">>>> </span>ds[<span class="hljs-number">0</span>] | |
| col_0 col_1 col_2 | |
| <span class="hljs-number">0</span> b <span class="hljs-number">0.0</span> <span class="hljs-number">1.0</span>`,wrap:!1}}),R=new ms({props:{title:"Import or Export from Pandas",local:"import-or-export-from-pandas",headingTag:"h2"}}),G=new H({props:{code:"ZHMlMjAlM0QlMjBEYXRhc2V0LmZyb21fcGFuZGFzKGRmKQ==",highlighted:"ds = Dataset.from_pandas(df)",wrap:!1}}),z=new H({props:{code:"ZGYlMjAlM0QlMjBEYXRhc2V0LnRvX3BhbmRhcygp",highlighted:"df = Dataset.to_pandas()",wrap:!1}}),D=new qs({props:{source:"https://github.com/huggingface/datasets/blob/main/docs/source/use_with_pandas.mdx"}}),{c(){f=p("meta"),Y=n(),P=p("p"),v=n(),i(M.$$.fragment),L=n(),i(j.$$.fragment),B=n(),g=p("p"),g.innerHTML=ds,E=n(),y=p("p"),y.innerHTML=hs,A=n(),i(b.$$.fragment),F=n(),T=p("p"),T.textContent=us,S=n(),w=p("p"),w.innerHTML=fs,N=n(),i(J.$$.fragment),W=n(),_=p("p"),_.innerHTML=Ms,X=n(),i($.$$.fragment),V=n(),i(U.$$.fragment),K=n(),C=p("p"),C.innerHTML=js,O=n(),i(I.$$.fragment),ss=n(),k=p("p"),k.innerHTML=gs,as=n(),Q=p("p"),Q.innerHTML=ys,ts=n(),i(R.$$.fragment),es=n(),q=p("p"),q.innerHTML=bs,ns=n(),i(G.$$.fragment),ls=n(),Z=p("p"),Z.innerHTML=Ts,ps=n(),i(z.$$.fragment),rs=n(),i(D.$$.fragment),is=n(),x=p("p"),this.h()},l(s){const a=ks("svelte-u9bgzb",document.head);f=r(a,"META",{name:!0,content:!0}),a.forEach(t),Y=l(s),P=r(s,"P",{}),ws(P).forEach(t),v=l(s),o(M.$$.fragment,s),L=l(s),o(j.$$.fragment,s),B=l(s),g=r(s,"P",{"data-svelte-h":!0}),u(g)!=="svelte-rghtkp"&&(g.innerHTML=ds),E=l(s),y=r(s,"P",{"data-svelte-h":!0}),u(y)!=="svelte-rzzdyf"&&(y.innerHTML=hs),A=l(s),o(b.$$.fragment,s),F=l(s),T=r(s,"P",{"data-svelte-h":!0}),u(T)!=="svelte-ej8pz8"&&(T.textContent=us),S=l(s),w=r(s,"P",{"data-svelte-h":!0}),u(w)!=="svelte-1vasaf"&&(w.innerHTML=fs),N=l(s),o(J.$$.fragment,s),W=l(s),_=r(s,"P",{"data-svelte-h":!0}),u(_)!=="svelte-2qljfh"&&(_.innerHTML=Ms),X=l(s),o($.$$.fragment,s),V=l(s),o(U.$$.fragment,s),K=l(s),C=r(s,"P",{"data-svelte-h":!0}),u(C)!=="svelte-bzmsm5"&&(C.innerHTML=js),O=l(s),o(I.$$.fragment,s),ss=l(s),k=r(s,"P",{"data-svelte-h":!0}),u(k)!=="svelte-1iws501"&&(k.innerHTML=gs),as=l(s),Q=r(s,"P",{"data-svelte-h":!0}),u(Q)!=="svelte-1lzi5rt"&&(Q.innerHTML=ys),ts=l(s),o(R.$$.fragment,s),es=l(s),q=r(s,"P",{"data-svelte-h":!0}),u(q)!=="svelte-117nd8q"&&(q.innerHTML=bs),ns=l(s),o(G.$$.fragment,s),ls=l(s),Z=r(s,"P",{"data-svelte-h":!0}),u(Z)!=="svelte-1cg4cwe"&&(Z.innerHTML=Ts),ps=l(s),o(z.$$.fragment,s),rs=l(s),o(D.$$.fragment,s),is=l(s),x=r(s,"P",{}),ws(x).forEach(t),this.h()},h(){Js(f,"name","hf:doc:metadata"),Js(f,"content",Zs)},m(s,a){Qs(document.head,f),e(s,Y,a),e(s,P,a),e(s,v,a),m(M,s,a),e(s,L,a),m(j,s,a),e(s,B,a),e(s,g,a),e(s,E,a),e(s,y,a),e(s,A,a),m(b,s,a),e(s,F,a),e(s,T,a),e(s,S,a),e(s,w,a),e(s,N,a),m(J,s,a),e(s,W,a),e(s,_,a),e(s,X,a),m($,s,a),e(s,V,a),m(U,s,a),e(s,K,a),e(s,C,a),e(s,O,a),m(I,s,a),e(s,ss,a),e(s,k,a),e(s,as,a),e(s,Q,a),e(s,ts,a),m(R,s,a),e(s,es,a),e(s,q,a),e(s,ns,a),m(G,s,a),e(s,ls,a),e(s,Z,a),e(s,ps,a),m(z,s,a),e(s,rs,a),m(D,s,a),e(s,is,a),e(s,x,a),os=!0},p:$s,i(s){os||(c(M.$$.fragment,s),c(j.$$.fragment,s),c(b.$$.fragment,s),c(J.$$.fragment,s),c($.$$.fragment,s),c(U.$$.fragment,s),c(I.$$.fragment,s),c(R.$$.fragment,s),c(G.$$.fragment,s),c(z.$$.fragment,s),c(D.$$.fragment,s),os=!0)},o(s){d(M.$$.fragment,s),d(j.$$.fragment,s),d(b.$$.fragment,s),d(J.$$.fragment,s),d($.$$.fragment,s),d(U.$$.fragment,s),d(I.$$.fragment,s),d(R.$$.fragment,s),d(G.$$.fragment,s),d(z.$$.fragment,s),d(D.$$.fragment,s),os=!1},d(s){s&&(t(Y),t(P),t(v),t(L),t(B),t(g),t(E),t(y),t(A),t(F),t(T),t(S),t(w),t(N),t(W),t(_),t(X),t(V),t(K),t(C),t(O),t(ss),t(k),t(as),t(Q),t(ts),t(es),t(q),t(ns),t(ls),t(Z),t(ps),t(rs),t(is),t(x)),t(f),h(M,s),h(j,s),h(b,s),h(J,s),h($,s),h(U,s),h(I,s),h(R,s),h(G,s),h(z,s),h(D,s)}}}const Zs='{"title":"Use with Pandas","local":"use-with-pandas","sections":[{"title":"Dataset format","local":"dataset-format","sections":[],"depth":2},{"title":"Process data","local":"process-data","sections":[],"depth":2},{"title":"Import or Export from Pandas","local":"import-or-export-from-pandas","sections":[],"depth":2}],"depth":1}';function zs(cs){return Us(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Ys extends Cs{constructor(f){super(),Is(this,f,zs,Gs,_s,{})}}export{Ys as component}; | |
Xet Storage Details
- Size:
- 13 kB
- Xet hash:
- f2a16f1f04f98de08ae8e1b40e6a05dd2beaab45b3e188e4becda557f82c9ba8
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.