Buckets:

rtrm's picture
download
raw
32.3 kB
/**
 * Compiled (minified) Svelte page module for the tutorial whose metadata title is
 * Build an embedding pipeline with datasets (see the Gn string near the end).
 * The code below is build output and is reproduced unchanged; only this header was added.
 *
 * NOTE(review): in this listing some physical line breaks fall inside quoted string
 * literals, inside one base64 code payload and inside one identifier. They look like
 * wrapping artifacts of the listing rather than part of the program - confirm against
 * the built file before relying on exact string contents.
 *
 * Imports
 *   - scheduler chunk: yn, fn, kt (kt is used as the p member of the small fragments,
 *     presumably a no-op - TODO confirm).
 *   - index chunk: the base class Tn, the initialiser gn and short-named DOM and
 *     component helpers (o, s, u, bn, p, n, a, vt, r, d, wn, Wt, l, c, M, h, w).
 *   - Tip chunk: Gt. CodeBlock chunk: Ue. A third chunk supplies Jn, b and jn.
 *     From their props, b receives title, local and headingTag (presumably a heading),
 *     Ue receives code, highlighted and wrap, jn receives a source URL.
 *
 * Definitions, in order
 *   In, Un, $n, Zn
 *     Fragment factories for static paragraphs. Each returns an object whose c member
 *     creates p elements and assigns fixed innerHTML, whose l member claims existing
 *     P nodes and re-assigns innerHTML only when d(node) differs from a fixed
 *     svelte-prefixed string (presumably a hydration hash - TODO confirm), whose m
 *     member inserts the nodes, and whose d member removes them when its argument is
 *     truthy. They are passed as the default slot of the Gt instances I, U, $ and Z.
 *   Cn
 *     Main page fragment. Declares every node and static string, instantiates the
 *     Jn, b, Gt, Ue and jn components, and returns an object with these members:
 *       c - create all elements and component fragments, then call this.h().
 *       l - claim the same structure from existing DOM (head lookup via bn), then
 *           call this.h().
 *       h - set the attributes name = hf:doc:metadata and content = Gn on the meta
 *           element i.
 *       m - append the meta element to document.head and insert or mount everything
 *           else in document order; sets the Ct flag to true.
 *       p - when bit 2 of the dirty mask t is set, hand a new $$scope object to the
 *           four slot-bearing components I, U, $ and Z through $set.
 *       i and o - call M or h on every child component fragment; guarded and
 *           tracked by the Ct flag.
 *       d - remove the plain nodes when e is truthy, always remove the meta element,
 *           and call w on every child component.
 *   Gn
 *     JSON text describing the page title, anchor and nested sections with depths.
 *   kn
 *     Instance function. Registers a callback through fn that reads the fw query
 *     parameter from window.location.search and discards the value, then returns an
 *     empty array.
 *   Rn
 *     Component class extending Tn. Its constructor calls
 *     gn(this, i, kn, Cn, yn, {}). It is exported under the name component.
 */
import{s as yn,o as fn,n as kt}from"../chunks/scheduler.eb244325.js";import{S as Tn,i as gn,e as o,s,c as u,h as bn,a as p,d as n,b as a,f as vt,g as r,j as d,k as wn,l as Wt,m as l,n as c,t as M,o as h,p as w}from"../chunks/index.661680a1.js";import{T as Gt}from"../chunks/Tip.76637dd3.js";import{C as Jn,H as b,E as jn}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.dcdac39a.js";import{C as Ue}from"../chunks/CodeBlock.0026e562.js";function In(J){let i,f="This tutorial focuses on creating a production-ready script that can process any dataset and add embeddings using the <strong>Text Embeddings Inference (TEI)</strong> engine for optimized performance.";return{c(){i=o("p"),i.innerHTML=f},l(m){i=p(m,"P",{"data-svelte-h":!0}),d(i)!=="svelte-1s3ow1a"&&(i.innerHTML=f)},m(m,y){l(m,i,y)},p:kt,d(m){m&&n(i)}}}function Un(J){let i,f='If you’re looking for a model with less compute requirements, you can use the <a href="https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2" rel="nofollow">sentence-transformers/all-MiniLM-L6-v2</a> model.';return{c(){i=o("p"),i.innerHTML=f},l(m){i=p(m,"P",{"data-svelte-h":!0}),d(i)!=="svelte-1a40r0f"&&(i.innerHTML=f)},m(m,y){l(m,i,y)},p:kt,d(m){m&&n(i)}}}function $n(J){let i,f="The <code>datasets</code> library will pass our function a batch of examples from the dataset, as a dictionary of batch values. The key will be the name of the column we want to embed, and the value will be a list of values from that column.";return{c(){i=o("p"),i.innerHTML=f},l(m){i=p(m,"P",{"data-svelte-h":!0}),d(i)!=="svelte-ybk8oj"&&(i.innerHTML=f)},m(m,y){l(m,i,y)},p:kt,d(m){m&&n(i)}}}function Zn(J){let i,f="The <code>datasets</code> library’s <code>map</code> function is optimized for performance and will automatically batch the rows for us. 
Inference Endpoints can also scale to meet the demand of the batch size, so to get the best performance, you should calibrate the batch size with your Inference Endpoints’s configuration.",m,y,j="For example, select the highest possible batch size for you model and synchronize the batch size with your Inference Endpoint’s configuration in <code>max_concurrent_requests</code>.";return{c(){i=o("p"),i.innerHTML=f,m=s(),y=o("p"),y.innerHTML=j},l(T){i=p(T,"P",{"data-svelte-h":!0}),d(i)!=="svelte-1sfo5v4"&&(i.innerHTML=f),m=a(T),y=p(T,"P",{"data-svelte-h":!0}),d(y)!=="svelte-1u02m5o"&&(y.innerHTML=j)},m(T,g){l(T,i,g),l(T,m,g),l(T,y,g)},p:kt,d(T){T&&(n(i),n(m),n(y))}}}function Cn(J){let i,f,m,y,j,T,g,Ce,G,Et='This tutorial will guide you through deploying an embedding endpoint and building a Python script to efficiently process datasets with embeddings. We’ll use the powerful <a href="https://huggingface.co/Qwen/Qwen3-Embedding-4B" rel="nofollow">Qwen/Qwen3-Embedding-4B</a> model to create high-quality embeddings for your data.',Ge,I,ke,k,Be,B,xt="First, we need to create an Inference Endpoint optimized for embeddings.",ve,v,Rt=`Start by navigating to the Inference Endpoints UI, and once you have logged in you should see a button for creating a new Inference
Endpoint. Click the “New” button.`,We,W,_t='<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/quick_start/1-new-button.png" alt="new-button"/>',Ee,E,Ft=`From there you’ll be directed to the catalog. The Model Catalog consists of popular models which have tuned configurations to work as one-click
deploys. You can search for embedding models or create a custom endpoint.`,xe,x,Vt='<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/quick_start/2-catalog.png" alt="catalog"/>',Re,R,Xt="For this tutorial, we’ll use the Qwen3-Embedding-4B model available in the Inference Endpoints Model Catalog. Note if it’s ever not in the catalog, you can deploy a model as a custom Endpoint from the Hugging Face Hub by entering the model repository ID <code>Qwen/Qwen3-Embedding-4B</code>.",_e,_,zt="For embedding models, we recommend:",Fe,F,Ht="<li><strong>GPU</strong>: NVIDIA, T4, L4 or A10G for good performance.</li> <li><strong>Instance Size</strong>: x1 (sufficient for most embedding workloads)</li> <li><strong>Auto-scaling</strong>: Enable scale-to-zero to save costs by switching the endpoint to a paused state when it’s not in use.</li> <li><strong>Timeout</strong>: Set a timeout of 10 minutes to avoid long-running requests. You should define a timeout based on how you expect your endpoint to be used.</li>",Ve,U,Xe,V,Yt="The Qwen3-Embedding-4B model will automatically use the <strong>Text Embeddings Inference (TEI)</strong> engine, which provides optimized inference and automatic batching.",ze,X,Qt="Click “Create Endpoint” to deploy your embedding service.",He,z,qt='<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/quick_start/4-config.png" alt="config"/>',Ye,H,St="This may take about 5 minutes to initialize.",Qe,Y,qe,Q,At="Once your Inference Endpoint is running, you can test it directly in the playground. 
It accepts text input and returns high-dimensional vectors.",Se,q,Nt='<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/embedding-tutorial/assets/tutorials/embedding/playground.png" alt="playground"/>',Ae,S,Lt="Try entering some sample text like “Machine learning is transforming how we process data” and see the embedding output.",Ne,A,Le,N,Pt='To use your endpoint programmatically, you’ll need these details from the Endpoint’s <a href="https://endpoints.huggingface.co/" rel="nofollow">Overview</a>:',Pe,L,Ot='<li><strong>Base URL</strong>: <code>https://&lt;endpoint-name&gt;.endpoints.huggingface.cloud/v1/</code></li> <li><strong>Model name</strong>: The name of your endpoint</li> <li><strong>Token</strong>: Your HF token from <a href="https://huggingface.co/settings/tokens" rel="nofollow">settings</a></li>',Oe,P,Kt='<img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/tutorials/embedding/endpoint-page.png" alt="endpoint-details"/>',Ke,O,De,K,Dt="Now let’s build a script step by step to process datasets with embeddings. We’ll break it down into logical blocks.",et,D,tt,ee,en="We’ll use the OpenAI client to connect to the endpoint and the datasets library to load and process the dataset. So let’s install the required packages:",nt,te,lt,ne,tn="Then, set up your imports in a new Python file:",st,le,at,se,it,ae,nn="Set up the configuration to connect to your Inference Endpoint based on the details you collected in the previous step.",ot,ie,pt,oe,ln='Your OpenAI client is now configured to connect to your Inference Endpoint. 
For further reading you can check out the client documentation on text embeddings <a href="https://platform.openai.com/docs/api-reference/embeddings" target="_blank" rel="noopener noreferrer">here</a>.',dt,pe,mt,de,sn="Next, we’ll create a function to process batches of text and return embeddings.",ut,me,rt,$,ct,ue,Mt,re,an="Load your dataset and apply the embedding function:",ht,ce,wt,Z,yt,Me,ft,he,on="Finally, let’s save our embedded dataset locally or push it to the Hugging Face Hub:",Tt,we,gt,ye,bt,fe,pn="Nice work! You’ve now built an embedding pipeline that can process any dataset. Here’s the complete script:",Jt,C,$e,dn="Click to view the complete script",Bt,Te,jt,ge,mn="Here are some ways to extend your script:",It,be,un="<li><strong>Process multiple datasets</strong>: Modify the script to handle different dataset sources</li> <li><strong>Add error handling</strong>: Implement retry logic for failed API calls</li> <li><strong>Optimize batch sizes</strong>: Experiment with different batch sizes for better performance</li> <li><strong>Add validation</strong>: Check embedding quality and dimensions</li> <li><strong>Custom preprocessing</strong>: Add text cleaning or normalization steps</li> <li><strong>Build a Semantic Search Application</strong>: Use the embeddings to build a semantic search application.</li>",Ut,Je,rn="Your embedded datasets are now ready for downstream tasks like semantic search, recommendation systems, or RAG applications!",$t,je,Zt,Ze,Ct;return j=new Jn({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),g=new b({props:{title:"Build an embedding pipeline with datasets",local:"build-an-embedding-pipeline-with-datasets",headingTag:"h1"}}),I=new Gt({props:{$$slots:{default:[In]},$$scope:{ctx:J}}}),k=new b({props:{title:"Create your embedding Endpoint",local:"create-your-embedding-endpoint",headingTag:"h2"}}),U=new Gt({props:{$$slots:{default:[Un]},$$scope:{ctx:J}}}),Y=new 
b({props:{title:"Test your Endpoint",local:"test-your-endpoint",headingTag:"h2"}}),A=new b({props:{title:"Get your Endpoint’s details",local:"get-your-endpoints-details",headingTag:"h2"}}),O=new b({props:{title:"Building the embedding script",local:"building-the-embedding-script",headingTag:"h2"}}),D=new b({props:{title:"Step 1: Set up dependencies and imports",local:"step-1-set-up-dependencies-and-imports",headingTag:"h3"}}),te=new Ue({props:{code:"cGlwJTIwaW5zdGFsbCUyMGRhdGFzZXRzJTIwb3BlbmFp",highlighted:"pip install datasets openai",wrap:!1}}),le=new Ue({props:{code:"aW1wb3J0JTIwb3MlMEFmcm9tJTIwZGF0YXNldHMlMjBpbXBvcnQlMjBsb2FkX2RhdGFzZXQlMEFmcm9tJTIwb3BlbmFpJTIwaW1wb3J0JTIwT3BlbkFJ",highlighted:`<span class="hljs-keyword">import</span> os
<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI`,wrap:!1}}),se=new b({props:{title:"Step 2: Configure the connection",local:"step-2-configure-the-connection",headingTag:"h3"}}),ie=new Ue({props:{code:"JTIzJTIwQ29uZmlndXJhdGlvbiUwQUVORFBPSU5UX1VSTCUyMCUzRCUyMCUyMmh0dHBzJTNBJTJGJTJGeW91ci1lbmRwb2ludC1uYW1lLmVuZHBvaW50cy5odWdnaW5nZmFjZS5jbG91ZCUyRnYxJTJGJTIyJTIwJTIzJTIwRW5kcG9pbnQlMjBVUkwlMjAlMkIlMjB2ZXJzaW9uJTBBSEZfVE9LRU4lMjAlM0QlMjBvcy5nZXRlbnYoJTIySEZfVE9LRU4lMjIpJTIwJTIzJTIwWW91ciUyMEh1Z2dpbmclMjBGYWNlJTIwSHViJTIwdG9rZW4lMjBmcm9tJTIwaGYuY28lMkZzZXR0aW5ncyUyRnRva2VucyUwQSUwQSUyMyUyMEluaXRpYWxpemUlMjBPcGVuQUklMjBjbGllbnQlMjBmb3IlMjB5b3VyJTIwZW5kcG9pbnQlMEFjbGllbnQlMjAlM0QlMjBPcGVuQUkoJTBBJTIwJTIwJTIwJTIwYmFzZV91cmwlM0RFTkRQT0lOVF9VUkwlMkMlMEElMjAlMjAlMjAlMjBhcGlfa2V5JTNESEZfVE9LRU4lMkMlMEEp",highlighted:`<span class="hljs-comment"># Configuration</span>
ENDPOINT_URL = <span class="hljs-string">&quot;https://your-endpoint-name.endpoints.huggingface.cloud/v1/&quot;</span> <span class="hljs-comment"># Endpoint URL + version</span>
HF_TOKEN = os.getenv(<span class="hljs-string">&quot;HF_TOKEN&quot;</span>) <span class="hljs-comment"># Your Hugging Face Hub token from hf.co/settings/tokens</span>
<span class="hljs-comment"># Initialize OpenAI client for your endpoint</span>
client = OpenAI(
base_url=ENDPOINT_URL,
api_key=HF_TOKEN,
)`,wrap:!1}}),pe=new b({props:{title:"Step 3: Create the embedding function",local:"step-3-create-the-embedding-function",headingTag:"h3"}}),me=new Ue({props:{code:"ZGVmJTIwZ2V0X2VtYmVkZGluZ3MoZXhhbXBsZXMpJTNBJTBBJTIwJTIwJTIwJTIwJTIyJTIyJTIyR2V0JTIwZW1iZWRkaW5ncyUyMGZvciUyMGElMjBiYXRjaCUyMG9mJTIwdGV4dHMuJTIyJTIyJTIyJTBBJTIwJTIwJTIwJTIwcmVzcG9uc2UlMjAlM0QlMjBjbGllbnQuZW1iZWRkaW5ncy5jcmVhdGUoJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbW9kZWwlM0QlMjJ5b3VyLWVuZHBvaW50LW5hbWUlMjIlMkMlMjAlMjAlMjMlMjBSZXBsYWNlJTIwd2l0aCUyMHlvdXIlMjBhY3R1YWwlMjBlbmRwb2ludCUyMG5hbWUlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpbnB1dCUzRGV4YW1wbGVzJTVCJTIyY29udGV4dCUyMiU1RCUyQyUyMCUyMyUyMEluJTIwdGhlJTIwc3F1YWQlMjBkYXRhc2V0JTJDJTIwdGhlJTIwdGV4dCUyMGlzJTIwaW4lMjB0aGUlMjAlMjJjb250ZXh0JTIyJTIwY29sdW1uJTBBJTIwJTIwJTIwJTIwKSUwQSUyMCUyMCUyMCUyMCUwQSUyMCUyMCUyMCUyMCUyMyUyMEV4dHJhY3QlMjBlbWJlZGRpbmdzJTIwZnJvbSUyMHJlc3BvbnNlJTIwb2JqZWN0cyUwQSUyMCUyMCUyMCUyMGVtYmVkZGluZ3MlMjAlM0QlMjAlNUJzYW1wbGUuZW1iZWRkaW5nJTIwZm9yJTIwc2FtcGxlJTIwaW4lMjByZXNwb25zZS5kYXRhJTVEJTBBJTIwJTIwJTIwJTIwJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwJTdCJTIyZW1iZWRkaW5ncyUyMiUzQSUyMGVtYmVkZGluZ3MlN0QlMjAlMjMlMjBkYXRhc2V0cyUyMGV4cGVjdHMlMjBhJTIwZGljdGlvbmFyeSUyMHdpdGglMjBhJTIwa2V5JTIwJTIyZW1iZWRkaW5ncyUyMiUyMGFuZCUyMGElMjB2YWx1ZSUyMG9mJTIwYSUyMGxpc3QlMjBvZiUyMGVtYmVkZGluZ3M=",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">get_embeddings</span>(<span class="hljs-params">examples</span>):
<span class="hljs-string">&quot;&quot;&quot;Get embeddings for a batch of texts.&quot;&quot;&quot;</span>
response = client.embeddings.create(
model=<span class="hljs-string">&quot;your-endpoint-name&quot;</span>, <span class="hljs-comment"># Replace with your actual endpoint name</span>
<span class="hljs-built_in">input</span>=examples[<span class="hljs-string">&quot;context&quot;</span>], <span class="hljs-comment"># In the squad dataset, the text is in the &quot;context&quot; column</span>
)
<span class="hljs-comment"># Extract embeddings from response objects</span>
embeddings = [sample.embedding <span class="hljs-keyword">for</span> sample <span class="hljs-keyword">in</span> response.data]
<span class="hljs-keyword">return</span> {<span class="hljs-string">&quot;embeddings&quot;</span>: embeddings} <span class="hljs-comment"># datasets expects a dictionary with a key &quot;embeddings&quot; and a value of a list of embeddings</span>`,wrap:!1}}),$=new Gt({props:{$$slots:{default:[$n]},$$scope:{ctx:J}}}),ue=new b({props:{title:"Step 4: Load and process your dataset",local:"step-4-load-and-process-your-dataset",headingTag:"h3"}}),ce=new Ue({props:{code:"JTIzJTIwTG9hZCUyMGElMjBzYW1wbGUlMjBkYXRhc2V0JTIwKHlvdSUyMGNhbiUyMHJlcGxhY2UlMjB0aGlzJTIwd2l0aCUyMHlvdXIlMjBvd24pJTBBZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJzcXVhZCUyMiUyQyUyMHNwbGl0JTNEJTIydHJhaW4lNUIlM0ExMDAlNUQlMjIpJTIwJTIwJTIzJTIwVXNpbmclMjBmaXJzdCUyMDEwMCUyMGV4YW1wbGVzJTIwZm9yJTIwZGVtbyUwQSUwQSUyMyUyMFByb2Nlc3MlMjB0aGUlMjBkYXRhc2V0JTIwd2l0aCUyMGVtYmVkZGluZ3MlMEFkYXRhc2V0X3dpdGhfZW1iZWRkaW5ncyUyMCUzRCUyMGRhdGFzZXQubWFwKCUwQSUyMCUyMCUyMCUyMGdldF9lbWJlZGRpbmdzJTJDJTBBJTIwJTIwJTIwJTIwYmF0Y2hlZCUzRFRydWUlMkMlMEElMjAlMjAlMjAlMjBiYXRjaF9zaXplJTNEMTAlMkMlMjAlMjAlMjMlMjBQcm9jZXNzJTIwaW4lMjBzbWFsbCUyMGJhdGNoZXMlMjB0byUyMGF2b2lkJTIwdGltZW91dHMlMEElMjAlMjAlMjAlMjBkZXNjJTNEJTIyQWRkaW5nJTIwZW1iZWRkaW5ncyUyMiUyQyUwQSk=",highlighted:`<span class="hljs-comment"># Load a sample dataset (you can replace this with your own)</span>
dataset = load_dataset(<span class="hljs-string">&quot;squad&quot;</span>, split=<span class="hljs-string">&quot;train[:100]&quot;</span>) <span class="hljs-comment"># Using first 100 examples for demo</span>
<span class="hljs-comment"># Process the dataset with embeddings</span>
dataset_with_embeddings = dataset.<span class="hljs-built_in">map</span>(
get_embeddings,
batched=<span class="hljs-literal">True</span>,
batch_size=<span class="hljs-number">10</span>, <span class="hljs-comment"># Process in small batches to avoid timeouts</span>
desc=<span class="hljs-string">&quot;Adding embeddings&quot;</span>,
)`,wrap:!1}}),Z=new Gt({props:{$$slots:{default:[Zn]},$$scope:{ctx:J}}}),Me=new b({props:{title:"Step 5: Save and Share your results",local:"step-5-save-and-share-your-results",headingTag:"h3"}}),we=new Ue({props:{code:"JTIzJTIwU2F2ZSUyMHRoZSUyMHByb2Nlc3NlZCUyMGRhdGFzZXQlMjBsb2NhbGx5JTBBZGF0YXNldF93aXRoX2VtYmVkZGluZ3Muc2F2ZV90b19kaXNrKCUyMi4lMkZlbWJlZGRlZF9kYXRhc2V0JTIyKSUwQSUwQSUyMyUyME9yJTIwcHVzaCUyMGRpcmVjdGx5JTIwdG8lMjBIdWdnaW5nJTIwRmFjZSUyMEh1YiUwQWRhdGFzZXRfd2l0aF9lbWJlZGRpbmdzLnB1c2hfdG9faHViKCUyMnlvdXItdXNlcm5hbWUlMkZzcXVhZC1lbWJlZGRpbmdzJTIyKQ==",highlighted:`<span class="hljs-comment"># Save the processed dataset locally</span>
dataset_with_embeddings.save_to_disk(<span class="hljs-string">&quot;./embedded_dataset&quot;</span>)
<span class="hljs-comment"># Or push directly to Hugging Face Hub</span>
dataset_with_embeddings.push_to_hub(<span class="hljs-string">&quot;your-username/squad-embeddings&quot;</span>)`,wrap:!1}}),ye=new b({props:{title:"Next steps",local:"next-steps",headingTag:"h2"}}),Te=new Ue({props:{code:"aW1wb3J0JTIwb3MlMEFmcm9tJTIwZGF0YXNldHMlMjBpbXBvcnQlMjBsb2FkX2RhdGFzZXQlMEFmcm9tJTIwZG90ZW52JTIwaW1wb3J0JTIwbG9hZF9kb3RlbnYlMEFmcm9tJTIwb3BlbmFpJTIwaW1wb3J0JTIwT3BlbkFJJTBBJTBBbG9hZF9kb3RlbnYoKSUwQSUwQSUyMyUyMENvbmZpZ3VyYXRpb24lMEFFTkRQT0lOVF9VUkwlMjAlM0QlMjAlMjJodHRwcyUzQSUyRiUyRnlvdXItZW5kcG9pbnQtbmFtZS5lbmRwb2ludHMuaHVnZ2luZ2ZhY2UuY2xvdWQlMkZ2MSUyRiUyMiUwQUhGX1RPS0VOJTIwJTNEJTIwb3MuZ2V0ZW52KCUyMkhGX1RPS0VOJTIyKSUwQSUwQSUyMyUyMEluaXRpYWxpemUlMjBPcGVuQUklMjBjbGllbnQlMjBmb3IlMjB5b3VyJTIwZW5kcG9pbnQlMEFjbGllbnQlMjAlM0QlMjBPcGVuQUkoJTBBJTIwJTIwJTIwJTIwYmFzZV91cmwlM0RFTkRQT0lOVF9VUkwlMkMlMEElMjAlMjAlMjAlMjBhcGlfa2V5JTNESEZfVE9LRU4lMkMlMEEpJTBBJTBBZGVmJTIwZ2V0X2VtYmVkZGluZ3MoZXhhbXBsZXMpJTNBJTBBJTIwJTIwJTIwJTIwJTIyJTIyJTIyR2V0JTIwZW1iZWRkaW5ncyUyMGZvciUyMGElMjBiYXRjaCUyMG9mJTIwdGV4dHMuJTIyJTIyJTIyJTBBJTIwJTIwJTIwJTIwcmVzcG9uc2UlMjAlM0QlMjBjbGllbnQuZW1iZWRkaW5ncy5jcmVhdGUoJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbW9kZWwlM0QlMjJ5b3VyLWVuZHBvaW50LW5hbWUlMjIlMkMlMjAlMjAlMjMlMjBSZXBsYWNlJTIwd2l0aCUyMHlvdXIlMjBhY3R1YWwlMjBlbmRwb2ludCUyMG5hbWUlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpbnB1dCUzRGV4YW1wbGVzJTVCJTIyY29udGV4dCUyMiU1RCUyQyUwQSUyMCUyMCUyMCUyMCklMEElMjAlMjAlMjAlMjAlMEElMjAlMjAlMjAlMjAlMjMlMjBFeHRyYWN0JTIwZW1iZWRkaW5ncyUyMGZyb20lMjByZXNwb25zZSUwQSUyMCUyMCUyMCUyMGVtYmVkZGluZ3MlMjAlM0QlMjAlNUJzYW1wbGUuZW1iZWRkaW5nJTIwZm9yJTIwc2FtcGxlJTIwaW4lMjByZXNwb25zZS5kYXRhJTVEJTBBJTIwJTIwJTIwJTIwJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwJTdCJTIyZW1iZWRkaW5ncyUyMiUzQSUyMGVtYmVkZGluZ3MlN0QlMEElMEElMjMlMjBMb2FkJTIwYSUyMHNhbXBsZSUyMGRhdGFzZXQlMjAoeW91JTIwY2FuJTIwcmVwbGFjZSUyMHRoaXMlMjB3aXRoJTIweW91ciUyMG93biklMEFwcmludCglMjJMb2FkaW5nJTIwZGF0YXNldC4uLiUyMiklMEFkYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMnNxdWFkJTIyJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiU1QiUzQTEwMDAlNUQlMjIpJTIwJTIwJTIzJTIwVXNpbmclM
jBmaXJzdCUyMDEwMDAlMjBleGFtcGxlcyUyMGZvciUyMGRlbW8lMEElMEElMjMlMjBQcm9jZXNzJTIwdGhlJTIwZGF0YXNldCUyMHdpdGglMjBlbWJlZGRpbmdzJTBBcHJpbnQoJTIyUHJvY2Vzc2luZyUyMGRhdGFzZXQlMjB3aXRoJTIwZW1iZWRkaW5ncy4uLiUyMiklMEFkYXRhc2V0X3dpdGhfZW1iZWRkaW5ncyUyMCUzRCUyMGRhdGFzZXQubWFwKCUwQSUyMCUyMCUyMCUyMGdldF9lbWJlZGRpbmdzJTJDJTBBJTIwJTIwJTIwJTIwYmF0Y2hlZCUzRFRydWUlMkMlMEElMjAlMjAlMjAlMjBiYXRjaF9zaXplJTNEMTAlMkMlMjAlMjAlMjMlMjBQcm9jZXNzJTIwaW4lMjBzbWFsbCUyMGJhdGNoZXMlMjB0byUyMGF2b2lkJTIwdGltZW91dHMlMEElMjAlMjAlMjAlMjBkZXNjJTNEJTIyQWRkaW5nJTIwZW1iZWRkaW5ncyUyMiUyQyUwQSklMEElMEElMjMlMjBTYXZlJTIwdGhlJTIwcHJvY2Vzc2VkJTIwZGF0YXNldCUyMGxvY2FsbHklMEFwcmludCglMjJTYXZpbmclMjBwcm9jZXNzZWQlMjBkYXRhc2V0Li4uJTIyKSUwQWRhdGFzZXRfd2l0aF9lbWJlZGRpbmdzLnNhdmVfdG9fZGlzayglMjIuJTJGZW1iZWRkZWRfZGF0YXNldCUyMiklMEElMEElMjMlMjBPciUyMHB1c2glMjBkaXJlY3RseSUyMHRvJTIwSHVnZ2luZyUyMEZhY2UlMjBIdWIlMEFwcmludCglMjJQdXNoaW5nJTIwdG8lMjBIdWdnaW5nJTIwRmFjZSUyMEh1Yi4uLiUyMiklMEFkYXRhc2V0X3dpdGhfZW1iZWRkaW5ncy5wdXNoX3RvX2h1YiglMjJ5b3VyLXVzZXJuYW1lJTJGc3F1YWQtZW1iZWRkaW5ncyUyMiklMEElMEFwcmludCglMjJEYXRhc2V0JTIwcHJvY2Vzc2luZyUyMGNvbXBsZXRlISUyMik=",highlighted:`<span class="hljs-keyword">import</span> os
<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-keyword">from</span> dotenv <span class="hljs-keyword">import</span> load_dotenv
<span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI
load_dotenv()
<span class="hljs-comment"># Configuration</span>
ENDPOINT_URL = <span class="hljs-string">&quot;https://your-endpoint-name.endpoints.huggingface.cloud/v1/&quot;</span>
HF_TOKEN = os.getenv(<span class="hljs-string">&quot;HF_TOKEN&quot;</span>)
<span class="hljs-comment"># Initialize OpenAI client for your endpoint</span>
client = OpenAI(
base_url=ENDPOINT_URL,
api_key=HF_TOKEN,
)
<span class="hljs-keyword">def</span> <span class="hljs-title function_">get_embeddings</span>(<span class="hljs-params">examples</span>):
<span class="hljs-string">&quot;&quot;&quot;Get embeddings for a batch of texts.&quot;&quot;&quot;</span>
response = client.embeddings.create(
model=<span class="hljs-string">&quot;your-endpoint-name&quot;</span>, <span class="hljs-comment"># Replace with your actual endpoint name</span>
<span class="hljs-built_in">input</span>=examples[<span class="hljs-string">&quot;context&quot;</span>],
)
<span class="hljs-comment"># Extract embeddings from response</span>
embeddings = [sample.embedding <span class="hljs-keyword">for</span> sample <span class="hljs-keyword">in</span> response.data]
<span class="hljs-keyword">return</span> {<span class="hljs-string">&quot;embeddings&quot;</span>: embeddings}
<span class="hljs-comment"># Load a sample dataset (you can replace this with your own)</span>
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Loading dataset...&quot;</span>)
dataset = load_dataset(<span class="hljs-string">&quot;squad&quot;</span>, split=<span class="hljs-string">&quot;train[:1000]&quot;</span>) <span class="hljs-comment"># Using first 1000 examples for demo</span>
<span class="hljs-comment"># Process the dataset with embeddings</span>
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Processing dataset with embeddings...&quot;</span>)
dataset_with_embeddings = dataset.<span class="hljs-built_in">map</span>(
get_embeddings,
batched=<span class="hljs-literal">True</span>,
batch_size=<span class="hljs-number">10</span>, <span class="hljs-comment"># Process in small batches to avoid timeouts</span>
desc=<span class="hljs-string">&quot;Adding embeddings&quot;</span>,
)
<span class="hljs-comment"># Save the processed dataset locally</span>
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Saving processed dataset...&quot;</span>)
dataset_with_embeddings.save_to_disk(<span class="hljs-string">&quot;./embedded_dataset&quot;</span>)
<span class="hljs-comment"># Or push directly to Hugging Face Hub</span>
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Pushing to Hugging Face Hub...&quot;</span>)
dataset_with_embeddings.push_to_hub(<span class="hljs-string">&quot;your-username/squad-embeddings&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Dataset processing complete!&quot;</span>)`,wrap:!1}}),je=new jn({props:{source:"https://github.com/huggingface/hf-endpoints-documentation/blob/main/docs/source/tutorials/embedding.md"}}),{c(){i=o("meta"),f=s(),m=o("p"),y=s(),u(j.$$.fragment),T=s(),u(g.$$.fragment),Ce=s(),G=o("p"),G.innerHTML=Et,Ge=s(),u(I.$$.fragment),ke=s(),u(k.$$.fragment),Be=s(),B=o("p"),B.textContent=xt,ve=s(),v=o("p"),v.textContent=Rt,We=s(),W=o("p"),W.innerHTML=_t,Ee=s(),E=o("p"),E.textContent=Ft,xe=s(),x=o("p"),x.innerHTML=Vt,Re=s(),R=o("p"),R.innerHTML=Xt,_e=s(),_=o("p"),_.textContent=zt,Fe=s(),F=o("ul"),F.innerHTML=Ht,Ve=s(),u(U.$$.fragment),Xe=s(),V=o("p"),V.innerHTML=Yt,ze=s(),X=o("p"),X.textContent=Qt,He=s(),z=o("p"),z.innerHTML=qt,Ye=s(),H=o("p"),H.textContent=St,Qe=s(),u(Y.$$.fragment),qe=s(),Q=o("p"),Q.textContent=At,Se=s(),q=o("p"),q.innerHTML=Nt,Ae=s(),S=o("p"),S.textContent=Lt,Ne=s(),u(A.$$.fragment),Le=s(),N=o("p"),N.innerHTML=Pt,Pe=s(),L=o("ul"),L.innerHTML=Ot,Oe=s(),P=o("p"),P.innerHTML=Kt,Ke=s(),u(O.$$.fragment),De=s(),K=o("p"),K.textContent=Dt,et=s(),u(D.$$.fragment),tt=s(),ee=o("p"),ee.textContent=en,nt=s(),u(te.$$.fragment),lt=s(),ne=o("p"),ne.textContent=tn,st=s(),u(le.$$.fragment),at=s(),u(se.$$.fragment),it=s(),ae=o("p"),ae.textContent=nn,ot=s(),u(ie.$$.fragment),pt=s(),oe=o("p"),oe.innerHTML=ln,dt=s(),u(pe.$$.fragment),mt=s(),de=o("p"),de.textContent=sn,ut=s(),u(me.$$.fragment),rt=s(),u($.$$.fragment),ct=s(),u(ue.$$.fragment),Mt=s(),re=o("p"),re.textContent=an,ht=s(),u(ce.$$.fragment),wt=s(),u(Z.$$.fragment),yt=s(),u(Me.$$.fragment),ft=s(),he=o("p"),he.textContent=on,Tt=s(),u(we.$$.fragment),gt=s(),u(ye.$$.fragment),bt=s(),fe=o("p"),fe.textContent=pn,Jt=s(),C=o("details"),$e=o("summary"),$e.textContent=dn,Bt=s(),u(Te.$$.fragment),jt=s(),ge=o("p"),ge.textContent=mn,It=s(),be=o("ul"),be.innerHTML=un,Ut=s(),Je=o("p"),Je.textContent=rn,$t=s(),u(je.$$.fragment),Zt=s(),Ze=o("p"),this.h()},l(e){const 
t=bn("svelte-u9bgzb",document.head);i=p(t,"META",{name:!0,content:!0}),t.forEach(n),f=a(e),m=p(e,"P",{}),vt(m).forEach(n),y=a(e),r(j.$$.fragment,e),T=a(e),r(g.$$.fragment,e),Ce=a(e),G=p(e,"P",{"data-svelte-h":!0}),d(G)!=="svelte-y0ssck"&&(G.innerHTML=Et),Ge=a(e),r(I.$$.fragment,e),ke=a(e),r(k.$$.fragment,e),Be=a(e),B=p(e,"P",{"data-svelte-h":!0}),d(B)!=="svelte-qqd7zu"&&(B.textContent=xt),ve=a(e),v=p(e,"P",{"data-svelte-h":!0}),d(v)!=="svelte-147h3qd"&&(v.textContent=Rt),We=a(e),W=p(e,"P",{"data-svelte-h":!0}),d(W)!=="svelte-dnyg4"&&(W.innerHTML=_t),Ee=a(e),E=p(e,"P",{"data-svelte-h":!0}),d(E)!=="svelte-161g95l"&&(E.textContent=Ft),xe=a(e),x=p(e,"P",{"data-svelte-h":!0}),d(x)!=="svelte-fxecmn"&&(x.innerHTML=Vt),Re=a(e),R=p(e,"P",{"data-svelte-h":!0}),d(R)!=="svelte-hre7rz"&&(R.innerHTML=Xt),_e=a(e),_=p(e,"P",{"data-svelte-h":!0}),d(_)!=="svelte-47pfzw"&&(_.textContent=zt),Fe=a(e),F=p(e,"UL",{"data-svelte-h":!0}),d(F)!=="svelte-me3ygp"&&(F.innerHTML=Ht),Ve=a(e),r(U.$$.fragment,e),Xe=a(e),V=p(e,"P",{"data-svelte-h":!0}),d(V)!=="svelte-qj37e3"&&(V.innerHTML=Yt),ze=a(e),X=p(e,"P",{"data-svelte-h":!0}),d(X)!=="svelte-g3g7p9"&&(X.textContent=Qt),He=a(e),z=p(e,"P",{"data-svelte-h":!0}),d(z)!=="svelte-14nerpn"&&(z.innerHTML=qt),Ye=a(e),H=p(e,"P",{"data-svelte-h":!0}),d(H)!=="svelte-1ih12xw"&&(H.textContent=St),Qe=a(e),r(Y.$$.fragment,e),qe=a(e),Q=p(e,"P",{"data-svelte-h":!0}),d(Q)!=="svelte-8cmlaj"&&(Q.textContent=At),Se=a(e),q=p(e,"P",{"data-svelte-h":!0}),d(q)!=="svelte-meccj2"&&(q.innerHTML=Nt),Ae=a(e),S=p(e,"P",{"data-svelte-h":!0}),d(S)!=="svelte-eb5krh"&&(S.textContent=Lt),Ne=a(e),r(A.$$.fragment,e),Le=a(e),N=p(e,"P",{"data-svelte-h":!0}),d(N)!=="svelte-grmwad"&&(N.innerHTML=Pt),Pe=a(e),L=p(e,"UL",{"data-svelte-h":!0}),d(L)!=="svelte-1mxq0ya"&&(L.innerHTML=Ot),Oe=a(e),P=p(e,"P",{"data-svelte-h":!0}),d(P)!=="svelte-hjx1sa"&&(P.innerHTML=Kt),Ke=a(e),r(O.$$.fragment,e),De=a(e),K=p(e,"P",{"data-svelte-h":!0}),d(K)!=="svelte-1jb2qdd"&&(K.textContent=Dt),et=a(e),r(D.$$.frag
ment,e),tt=a(e),ee=p(e,"P",{"data-svelte-h":!0}),d(ee)!=="svelte-1uq3z6b"&&(ee.textContent=en),nt=a(e),r(te.$$.fragment,e),lt=a(e),ne=p(e,"P",{"data-svelte-h":!0}),d(ne)!=="svelte-16rd7fj"&&(ne.textContent=tn),st=a(e),r(le.$$.fragment,e),at=a(e),r(se.$$.fragment,e),it=a(e),ae=p(e,"P",{"data-svelte-h":!0}),d(ae)!=="svelte-1rvx9zz"&&(ae.textContent=nn),ot=a(e),r(ie.$$.fragment,e),pt=a(e),oe=p(e,"P",{"data-svelte-h":!0}),d(oe)!=="svelte-6mzhuo"&&(oe.innerHTML=ln),dt=a(e),r(pe.$$.fragment,e),mt=a(e),de=p(e,"P",{"data-svelte-h":!0}),d(de)!=="svelte-103gfbk"&&(de.textContent=sn),ut=a(e),r(me.$$.fragment,e),rt=a(e),r($.$$.fragment,e),ct=a(e),r(ue.$$.fragment,e),Mt=a(e),re=p(e,"P",{"data-svelte-h":!0}),d(re)!=="svelte-75q4le"&&(re.textContent=an),ht=a(e),r(ce.$$.fragment,e),wt=a(e),r(Z.$$.fragment,e),yt=a(e),r(Me.$$.fragment,e),ft=a(e),he=p(e,"P",{"data-svelte-h":!0}),d(he)!=="svelte-1aql2k2"&&(he.textContent=on),Tt=a(e),r(we.$$.fragment,e),gt=a(e),r(ye.$$.fragment,e),bt=a(e),fe=p(e,"P",{"data-svelte-h":!0}),d(fe)!=="svelte-vzfm8t"&&(fe.textContent=pn),Jt=a(e),C=p(e,"DETAILS",{});var 
Ie=vt(C);$e=p(Ie,"SUMMARY",{"data-svelte-h":!0}),d($e)!=="svelte-1ri4alz"&&($e.textContent=dn),Bt=a(Ie),r(Te.$$.fragment,Ie),Ie.forEach(n),jt=a(e),ge=p(e,"P",{"data-svelte-h":!0}),d(ge)!=="svelte-1kuxjv9"&&(ge.textContent=mn),It=a(e),be=p(e,"UL",{"data-svelte-h":!0}),d(be)!=="svelte-1sd2to"&&(be.innerHTML=un),Ut=a(e),Je=p(e,"P",{"data-svelte-h":!0}),d(Je)!=="svelte-vw83lh"&&(Je.textContent=rn),$t=a(e),r(je.$$.fragment,e),Zt=a(e),Ze=p(e,"P",{}),vt(Ze).forEach(n),this.h()},h(){wn(i,"name","hf:doc:metadata"),wn(i,"content",Gn)},m(e,t){Wt(document.head,i),l(e,f,t),l(e,m,t),l(e,y,t),c(j,e,t),l(e,T,t),c(g,e,t),l(e,Ce,t),l(e,G,t),l(e,Ge,t),c(I,e,t),l(e,ke,t),c(k,e,t),l(e,Be,t),l(e,B,t),l(e,ve,t),l(e,v,t),l(e,We,t),l(e,W,t),l(e,Ee,t),l(e,E,t),l(e,xe,t),l(e,x,t),l(e,Re,t),l(e,R,t),l(e,_e,t),l(e,_,t),l(e,Fe,t),l(e,F,t),l(e,Ve,t),c(U,e,t),l(e,Xe,t),l(e,V,t),l(e,ze,t),l(e,X,t),l(e,He,t),l(e,z,t),l(e,Ye,t),l(e,H,t),l(e,Qe,t),c(Y,e,t),l(e,qe,t),l(e,Q,t),l(e,Se,t),l(e,q,t),l(e,Ae,t),l(e,S,t),l(e,Ne,t),c(A,e,t),l(e,Le,t),l(e,N,t),l(e,Pe,t),l(e,L,t),l(e,Oe,t),l(e,P,t),l(e,Ke,t),c(O,e,t),l(e,De,t),l(e,K,t),l(e,et,t),c(D,e,t),l(e,tt,t),l(e,ee,t),l(e,nt,t),c(te,e,t),l(e,lt,t),l(e,ne,t),l(e,st,t),c(le,e,t),l(e,at,t),c(se,e,t),l(e,it,t),l(e,ae,t),l(e,ot,t),c(ie,e,t),l(e,pt,t),l(e,oe,t),l(e,dt,t),c(pe,e,t),l(e,mt,t),l(e,de,t),l(e,ut,t),c(me,e,t),l(e,rt,t),c($,e,t),l(e,ct,t),c(ue,e,t),l(e,Mt,t),l(e,re,t),l(e,ht,t),c(ce,e,t),l(e,wt,t),c(Z,e,t),l(e,yt,t),c(Me,e,t),l(e,ft,t),l(e,he,t),l(e,Tt,t),c(we,e,t),l(e,gt,t),c(ye,e,t),l(e,bt,t),l(e,fe,t),l(e,Jt,t),l(e,C,t),Wt(C,$e),Wt(C,Bt),c(Te,C,null),l(e,jt,t),l(e,ge,t),l(e,It,t),l(e,be,t),l(e,Ut,t),l(e,Je,t),l(e,$t,t),c(je,e,t),l(e,Zt,t),l(e,Ze,t),Ct=!0},p(e,[t]){const Ie={};t&2&&(Ie.$$scope={dirty:t,ctx:e}),I.$set(Ie);const cn={};t&2&&(cn.$$scope={dirty:t,ctx:e}),U.$set(cn);const Mn={};t&2&&(Mn.$$scope={dirty:t,ctx:e}),$.$set(Mn);const 
hn={};t&2&&(hn.$$scope={dirty:t,ctx:e}),Z.$set(hn)},i(e){Ct||(M(j.$$.fragment,e),M(g.$$.fragment,e),M(I.$$.fragment,e),M(k.$$.fragment,e),M(U.$$.fragment,e),M(Y.$$.fragment,e),M(A.$$.fragment,e),M(O.$$.fragment,e),M(D.$$.fragment,e),M(te.$$.fragment,e),M(le.$$.fragment,e),M(se.$$.fragment,e),M(ie.$$.fragment,e),M(pe.$$.fragment,e),M(me.$$.fragment,e),M($.$$.fragment,e),M(ue.$$.fragment,e),M(ce.$$.fragment,e),M(Z.$$.fragment,e),M(Me.$$.fragment,e),M(we.$$.fragment,e),M(ye.$$.fragment,e),M(Te.$$.fragment,e),M(je.$$.fragment,e),Ct=!0)},o(e){h(j.$$.fragment,e),h(g.$$.fragment,e),h(I.$$.fragment,e),h(k.$$.fragment,e),h(U.$$.fragment,e),h(Y.$$.fragment,e),h(A.$$.fragment,e),h(O.$$.fragment,e),h(D.$$.fragment,e),h(te.$$.fragment,e),h(le.$$.fragment,e),h(se.$$.fragment,e),h(ie.$$.fragment,e),h(pe.$$.fragment,e),h(me.$$.fragment,e),h($.$$.fragment,e),h(ue.$$.fragment,e),h(ce.$$.fragment,e),h(Z.$$.fragment,e),h(Me.$$.fragment,e),h(we.$$.fragment,e),h(ye.$$.fragment,e),h(Te.$$.fragment,e),h(je.$$.fragment,e),Ct=!1},d(e){e&&(n(f),n(m),n(y),n(T),n(Ce),n(G),n(Ge),n(ke),n(Be),n(B),n(ve),n(v),n(We),n(W),n(Ee),n(E),n(xe),n(x),n(Re),n(R),n(_e),n(_),n(Fe),n(F),n(Ve),n(Xe),n(V),n(ze),n(X),n(He),n(z),n(Ye),n(H),n(Qe),n(qe),n(Q),n(Se),n(q),n(Ae),n(S),n(Ne),n(Le),n(N),n(Pe),n(L),n(Oe),n(P),n(Ke),n(De),n(K),n(et),n(tt),n(ee),n(nt),n(lt),n(ne),n(st),n(at),n(it),n(ae),n(ot),n(pt),n(oe),n(dt),n(mt),n(de),n(ut),n(rt),n(ct),n(Mt),n(re),n(ht),n(wt),n(yt),n(ft),n(he),n(Tt),n(gt),n(bt),n(fe),n(Jt),n(C),n(jt),n(ge),n(It),n(be),n(Ut),n(Je),n($t),n(Zt),n(Ze)),n(i),w(j,e),w(g,e),w(I,e),w(k,e),w(U,e),w(Y,e),w(A,e),w(O,e),w(D,e),w(te,e),w(le,e),w(se,e),w(ie,e),w(pe,e),w(me,e),w($,e),w(ue,e),w(ce,e),w(Z,e),w(Me,e),w(we,e),w(ye,e),w(Te),w(je,e)}}}const Gn='{"title":"Build an embedding pipeline with datasets","local":"build-an-embedding-pipeline-with-datasets","sections":[{"title":"Create your embedding Endpoint","local":"create-your-embedding-endpoint","sections":[],"depth":2},{"title":"Test your 
Endpoint","local":"test-your-endpoint","sections":[],"depth":2},{"title":"Get your Endpoint’s details","local":"get-your-endpoints-details","sections":[],"depth":2},{"title":"Building the embedding script","local":"building-the-embedding-script","sections":[{"title":"Step 1: Set up dependencies and imports","local":"step-1-set-up-dependencies-and-imports","sections":[],"depth":3},{"title":"Step 2: Configure the connection","local":"step-2-configure-the-connection","sections":[],"depth":3},{"title":"Step 3: Create the embedding function","local":"step-3-create-the-embedding-function","sections":[],"depth":3},{"title":"Step 4: Load and process your dataset","local":"step-4-load-and-process-your-dataset","sections":[],"depth":3},{"title":"Step 5: Save and Share your results","local":"step-5-save-and-share-your-results","sections":[],"depth":3}],"depth":2},{"title":"Next steps","local":"next-steps","sections":[],"depth":2}],"depth":1}';function kn(J){return fn(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Rn extends Tn{constructor(i){super(),gn(this,i,kn,Cn,yn,{})}}export{Rn as component};

Xet Storage Details

Size:
32.3 kB
·
Xet hash:
80515f45c157d7055508f134c3d0655021f5df4b05ef2503747d789c5d955844

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.