Buckets:

rtrm's picture
download
raw
114 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;FAISS ဖြင့် Semantic Search ပြုလုပ်ခြင်း&quot;,&quot;local&quot;:&quot;semantic-search-with-faiss&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Semantic Search အတွက် Embeddings များကို အသုံးပြုခြင်း&quot;,&quot;local&quot;:&quot;using-embeddings-for-semantic-search&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Dataset ကို Loading လုပ်ပြီး ပြင်ဆင်ခြင်း&quot;,&quot;local&quot;:&quot;loading-and-preparing-the-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Text Embeddings များ ဖန်တီးခြင်း&quot;,&quot;local&quot;:&quot;creating-text-embeddings&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;FAISS ကို အသုံးပြု၍ ထိရောက်သော Similarity Search&quot;,&quot;local&quot;:&quot;using-faiss-for-efficient-similarity-search&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;ဝေါဟာရ ရှင်းလင်းချက် (Glossary)&quot;,&quot;local&quot;:&quot;ဝဟရ-ရငလငခက-glossary&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1114/my/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/entry/start.14794ee9.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/scheduler.893fe8c9.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/singletons.10fda3ce.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/index.bce52c8a.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/paths.89c82153.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/entry/app.a133f5c6.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/preload-helper.b1a719fd.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/index.b1df2166.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/nodes/0.510afdc1.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/nodes/41.642d1817.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.762ed9cc.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/Youtube.ec5d7916.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/CodeBlock.6cef0479.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/CourseFloatingBanner.c1c08878.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/FrameworkSwitchCourse.4480e339.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;FAISS ဖြင့် Semantic Search ပြုလုပ်ခြင်း&quot;,&quot;local&quot;:&quot;semantic-search-with-faiss&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Semantic Search အတွက် Embeddings များကို အသုံးပြုခြင်း&quot;,&quot;local&quot;:&quot;using-embeddings-for-semantic-search&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Dataset ကို Loading လုပ်ပြီး ပြင်ဆင်ခြင်း&quot;,&quot;local&quot;:&quot;loading-and-preparing-the-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Text Embeddings များ ဖန်တီးခြင်း&quot;,&quot;local&quot;:&quot;creating-text-embeddings&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;FAISS ကို အသုံးပြု၍ ထိရောက်သော Similarity Search&quot;,&quot;local&quot;:&quot;using-faiss-for-efficient-similarity-search&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;ဝေါဟာရ ရှင်းလင်းချက် (Glossary)&quot;,&quot;local&quot;:&quot;ဝဟရ-ရငလငခက-glossary&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="bg-white leading-none border border-gray-100 rounded-lg flex p-0.5 w-56 text-sm mb-4"><a class="flex justify-center flex-1 py-1.5 px-2.5 focus:outline-none !no-underline rounded-l bg-red-50 dark:bg-transparent text-red-600" href="?fw=pt"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><defs><clipPath id="a"><rect x="3.05" y="0.5" width="25.73" height="31" fill="none"></rect></clipPath></defs><g clip-path="url(#a)"><path 
d="M24.94,9.51a12.81,12.81,0,0,1,0,18.16,12.68,12.68,0,0,1-18,0,12.81,12.81,0,0,1,0-18.16l9-9V5l-.84.83-6,6a9.58,9.58,0,1,0,13.55,0ZM20.44,9a1.68,1.68,0,1,1,1.67-1.67A1.68,1.68,0,0,1,20.44,9Z" fill="#ee4c2c"></path></g></svg> Pytorch </a><a class="flex justify-center flex-1 py-1.5 px-2.5 focus:outline-none !no-underline rounded-r text-gray-500 filter grayscale" href="?fw=tf"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="0.94em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 274"><path d="M145.726 42.065v42.07l72.861 42.07v-42.07l-72.86-42.07zM0 84.135v42.07l36.43 21.03V105.17L0 84.135zm109.291 21.035l-36.43 21.034v126.2l36.43 21.035v-84.135l36.435 21.035v-42.07l-36.435-21.034V105.17z" fill="#E55B2D"></path><path d="M145.726 42.065L36.43 105.17v42.065l72.861-42.065v42.065l36.435-21.03v-84.14zM255.022 63.1l-36.435 21.035v42.07l36.435-21.035V63.1zm-72.865 84.135l-36.43 21.035v42.07l36.43-21.036v-42.07zm-36.43 63.104l-36.436-21.035v84.135l36.435-21.035V210.34z" fill="#ED8E24"></path><path d="M145.726 0L0 84.135l36.43 21.035l109.296-63.105l72.861 42.07L255.022 63.1L145.726 0zm0 126.204l-36.435 21.03l36.435 21.036l36.43-21.035l-36.43-21.03z" fill="#F8BF3C"></path></svg> TensorFlow </a></div> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 
max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="semantic-search-with-faiss" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#semantic-search-with-faiss"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 
79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>FAISS ဖြင့် Semantic Search ပြုလုပ်ခြင်း</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0" style=""><a href="https://discuss.huggingface.co/t/chapter-5-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a 
href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter5/section6_pt.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter5/section6_pt.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-1fbtmqy"><a href="/course/chapter5/5">အပိုင်း ၅</a> မှာ ကျွန်တော်တို့ 🤗 Datasets repository ကနေ GitHub issues နဲ့ comments တွေရဲ့ dataset တစ်ခုကို ဖန်တီးခဲ့ပါတယ်။ ဒီအပိုင်းမှာတော့ ဒီအချက်အလက်တွေကို အသုံးပြုပြီး library နဲ့ ပတ်သက်တဲ့ ကျွန်တော်တို့ရဲ့ အရေးအကြီးဆုံး မေးခွန်းတွေရဲ့ အဖြေတွေကို ရှာဖွေနိုင်မယ့် search engine တစ်ခုကို တည်ဆောက်သွားမှာပါ။</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/OATCgQtNX2o" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <h2 class="relative group"><a id="using-embeddings-for-semantic-search" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-embeddings-for-semantic-search"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 
0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Semantic Search အတွက် Embeddings များကို အသုံးပြုခြင်း</span></h2> <p data-svelte-h="svelte-1iqee2c"><a href="/course/chapter1">အခန်း ၁</a> မှာ ကျွန်တော်တို့ မြင်ခဲ့ရတဲ့အတိုင်း၊ Transformer-based language models တွေက စာသားအပိုင်းအစတစ်ခုထဲက token တစ်ခုစီကို <em>embedding vector</em> အဖြစ် ကိုယ်စားပြုပါတယ်။ တစ်ခါတစ်ရံမှာ sentences တွေ၊ paragraphs တွေ ဒါမှမဟုတ် (အချို့ကိစ္စတွေမှာ) documents တွေအတွက် vector representation တစ်ခု ဖန်တီးဖို့ individual embeddings တွေကို “pool” လုပ်နိုင်ပါတယ်။ ထို့နောက် ဤ embeddings များကို dot-product similarity (သို့မဟုတ် အခြား similarity metric တစ်ခုခု) ကို တွက်ချက်ခြင်းဖြင့် corpus ထဲရှိ ဆင်တူသော documents များကို ရှာဖွေနိုင်ပြီး အဆင်တူဆုံးသော documents များကို ပြန်ပေးနိုင်ပါတယ်။</p> <p data-svelte-h="svelte-1i1ppnf">ဒီအပိုင်းမှာတော့ embeddings တွေကို အသုံးပြုပြီး semantic search engine တစ်ခုကို ကျွန်တော်တို့ တည်ဆောက်သွားမှာပါ။ ဒီ search engines တွေက query ထဲက keywords တွေကို documents တွေနဲ့ ကိုက်ညီအောင် လုပ်ဆောင်တဲ့ ရိုးရာနည်းလမ်းတွေထက် အားသာချက်များစွာကို ပေးစွမ်းပါတယ်။</p> <div class="flex justify-center" data-svelte-h="svelte-yxatr"><img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/semantic-search.svg" alt="Semantic search."> <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/semantic-search-dark.svg" alt="Semantic search."></div> <h2 class="relative group"><a id="loading-and-preparing-the-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#loading-and-preparing-the-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" 
xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Dataset ကို Loading လုပ်ပြီး ပြင်ဆင်ခြင်း</span></h2> <p data-svelte-h="svelte-dij496">ပထမဆုံး ကျွန်တော်တို့ လုပ်ရမယ့်အရာက GitHub issues တွေရဲ့ dataset ကို download လုပ်ဖို့ပါပဲ၊ ဒါကြောင့် ပုံမှန်အတိုင်း <code>load_dataset()</code> function ကို အသုံးပြုရအောင်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform 
-translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
issues_dataset = load_dataset(<span class="hljs-string">&quot;lewtun/github-issues&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
issues_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({
features: [<span class="hljs-string">&#x27;url&#x27;</span>, <span class="hljs-string">&#x27;repository_url&#x27;</span>, <span class="hljs-string">&#x27;labels_url&#x27;</span>, <span class="hljs-string">&#x27;comments_url&#x27;</span>, <span class="hljs-string">&#x27;events_url&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;id&#x27;</span>, <span class="hljs-string">&#x27;node_id&#x27;</span>, <span class="hljs-string">&#x27;number&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;user&#x27;</span>, <span class="hljs-string">&#x27;labels&#x27;</span>, <span class="hljs-string">&#x27;state&#x27;</span>, <span class="hljs-string">&#x27;locked&#x27;</span>, <span class="hljs-string">&#x27;assignee&#x27;</span>, <span class="hljs-string">&#x27;assignees&#x27;</span>, <span class="hljs-string">&#x27;milestone&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;created_at&#x27;</span>, <span class="hljs-string">&#x27;updated_at&#x27;</span>, <span class="hljs-string">&#x27;closed_at&#x27;</span>, <span class="hljs-string">&#x27;author_association&#x27;</span>, <span class="hljs-string">&#x27;active_lock_reason&#x27;</span>, <span class="hljs-string">&#x27;pull_request&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>, <span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>, <span class="hljs-string">&#x27;is_pull_request&#x27;</span>],
num_rows: <span class="hljs-number">2855</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-4j6vtu">ဒီနေရာမှာ ကျွန်တော်တို့ <code>load_dataset()</code> မှာ default <code>train</code> split ကို သတ်မှတ်ထားတာကြောင့် <code>DatasetDict</code> အစား <code>Dataset</code> ကို ပြန်ပေးပါတယ်။ ပထမဦးဆုံး လုပ်ရမယ့်အရာက pull requests တွေကို စစ်ထုတ်ပစ်ဖို့ပါပဲ၊ ဘာလို့လဲဆိုတော့ ဒါတွေက user queries တွေကို ဖြေဖို့အတွက် ရှားရှားပါးပါး အသုံးပြုတာကြောင့် search engine မှာ noise တွေ ဖြစ်ပေါ်စေပါလိမ့်မယ်။ အခုဆို ရင်းနှီးနေပြီဖြစ်တဲ့အတိုင်း၊ ကျွန်တော်တို့ dataset ထဲက ဒီ rows တွေကို ဖယ်ထုတ်ဖို့ <code>Dataset.filter()</code> function ကို အသုံးပြုနိုင်ပါတယ်။ အဲဒါကို လုပ်နေရင်းနဲ့၊ user queries တွေအတွက် အဖြေမပေးနိုင်တဲ့ comments မရှိတဲ့ rows တွေကိုလည်း စစ်ထုတ်ပစ်ရအောင်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_dataset = issues_dataset.<span 
class="hljs-built_in">filter</span>(
<span class="hljs-keyword">lambda</span> x: (x[<span class="hljs-string">&quot;is_pull_request&quot;</span>] == <span class="hljs-literal">False</span> <span class="hljs-keyword">and</span> <span class="hljs-built_in">len</span>(x[<span class="hljs-string">&quot;comments&quot;</span>]) &gt; <span class="hljs-number">0</span>)
)
issues_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({
features: [<span class="hljs-string">&#x27;url&#x27;</span>, <span class="hljs-string">&#x27;repository_url&#x27;</span>, <span class="hljs-string">&#x27;labels_url&#x27;</span>, <span class="hljs-string">&#x27;comments_url&#x27;</span>, <span class="hljs-string">&#x27;events_url&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;id&#x27;</span>, <span class="hljs-string">&#x27;node_id&#x27;</span>, <span class="hljs-string">&#x27;number&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;user&#x27;</span>, <span class="hljs-string">&#x27;labels&#x27;</span>, <span class="hljs-string">&#x27;state&#x27;</span>, <span class="hljs-string">&#x27;locked&#x27;</span>, <span class="hljs-string">&#x27;assignee&#x27;</span>, <span class="hljs-string">&#x27;assignees&#x27;</span>, <span class="hljs-string">&#x27;milestone&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;created_at&#x27;</span>, <span class="hljs-string">&#x27;updated_at&#x27;</span>, <span class="hljs-string">&#x27;closed_at&#x27;</span>, <span class="hljs-string">&#x27;author_association&#x27;</span>, <span class="hljs-string">&#x27;active_lock_reason&#x27;</span>, <span class="hljs-string">&#x27;pull_request&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>, <span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>, <span class="hljs-string">&#x27;is_pull_request&#x27;</span>],
num_rows: <span class="hljs-number">771</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1f9xhou">ကျွန်တော်တို့ dataset မှာ columns တွေ အများကြီးပါတာကို တွေ့ရပါတယ်။ ဒါတွေထဲက အများစုကို search engine တည်ဆောက်ဖို့ ကျွန်တော်တို့ မလိုအပ်ပါဘူး။ search ရှုထောင့်ကကြည့်မယ်ဆိုရင်၊ အချက်အလက်အများဆုံး columns တွေက <code>title</code>၊ <code>body</code> နဲ့ <code>comments</code> တွေဖြစ်ပြီး၊ <code>html_url</code> ကတော့ ကျွန်တော်တို့ကို source issue ကို ပြန်လည်ညွှန်ပြတဲ့ link ကို ပေးပါတယ်။ ကျန်တာတွေကို ဖယ်ရှားဖို့ <code>Dataset.remove_columns()</code> function ကို အသုံးပြုရအောင်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->columns = issues_dataset.column_names
columns_to_keep = [<span class="hljs-string">&quot;title&quot;</span>, <span class="hljs-string">&quot;body&quot;</span>, <span class="hljs-string">&quot;html_url&quot;</span>, <span class="hljs-string">&quot;comments&quot;</span>]
columns_to_remove = <span class="hljs-built_in">set</span>(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({
features: [<span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>],
num_rows: <span class="hljs-number">771</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1nb7vsg">ကျွန်တော်တို့ရဲ့ embeddings တွေ ဖန်တီးဖို့အတွက် issue ရဲ့ title နဲ့ body ကို comment တစ်ခုစီတိုင်းမှာ ပေါင်းထည့်ပါမယ်၊ ဘာလို့လဲဆိုတော့ ဒီ fields တွေက မကြာခဏဆိုသလို အသုံးဝင်တဲ့ context information တွေ ပါဝင်လို့ပါ။ ကျွန်တော်တို့ရဲ့ <code>comments</code> column က လက်ရှိမှာ issue တစ်ခုစီအတွက် comments တွေရဲ့ list တစ်ခုဖြစ်နေတာကြောင့်၊ row တစ်ခုစီမှာ <code>(html_url, title, body, comment)</code> tuple တစ်ခုပါဝင်အောင် column ကို “explode” လုပ်ဖို့ လိုအပ်ပါတယ်။ Pandas မှာ ဒါကို <a href="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.explode.html" rel="nofollow"><code>DataFrame.explode()</code> function</a> နဲ့ လုပ်ဆောင်နိုင်ပါတယ်။ ဒါက list-like column တစ်ခုစီမှာ element တစ်ခုစီအတွက် new row တစ်ခု ဖန်တီးပေးပြီး ကျန်တဲ့ column values တွေအားလုံးကို ပွားပေးပါတယ်။ ဒါကို လက်တွေ့မြင်ရဖို့၊ ပထမဆုံး Pandas <code>DataFrame</code> format သို့ ပြောင်းရအောင်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black 
border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_dataset.set_format(<span class="hljs-string">&quot;pandas&quot;</span>)
df = issues_dataset[:]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1qxpxlr">ဒီ <code>DataFrame</code> ထဲက ပထမဆုံး row ကို စစ်ဆေးကြည့်မယ်ဆိုရင် ဒီ issue နဲ့ ဆက်စပ်နေတဲ့ comments လေးခုရှိတာကို တွေ့ရပါတယ်-</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->df[<span class="hljs-string">&quot;comments&quot;</span>][<span class="hljs-number">0</span>].tolist()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" 
preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;the bug code locate in :\r\n if data_args.task_name is not None:\r\n # Downloading and loading a dataset from the hub.\r\n datasets = load_dataset(&quot;glue&quot;, data_args.task_name, cache_dir=model_args.cache_dir)&#x27;</span>,
<span class="hljs-string">&#x27;Hi @jinec,\r\n\r\nFrom time to time we get this kind of `ConnectionError` coming from the github.com website: https://raw.githubusercontent.com\r\n\r\nNormally, it should work if you wait a little and then retry.\r\n\r\nCould you please confirm if the problem persists?&#x27;</span>,
<span class="hljs-string">&#x27;cannot connect,even by Web browser,please check that there is some problems。&#x27;</span>,
<span class="hljs-string">&#x27;I can access https://raw.githubusercontent.com/huggingface/datasets/1.7.0/datasets/glue/glue.py without problem...&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-qjwn53">ကျွန်တော်တို့ <code>df</code> ကို explode လုပ်တဲ့အခါ၊ ဒီ comments တစ်ခုစီအတွက် row တစ်ခုရရှိဖို့ မျှော်လင့်ပါတယ်။ ဒါဟုတ်မဟုတ် စစ်ကြည့်ရအောင်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->comments_df = df.explode(<span class="hljs-string">&quot;comments&quot;</span>, ignore_index=<span class="hljs-literal">True</span>)
comments_df.head(<span class="hljs-number">4</span>)<!-- HTML_TAG_END --></pre></div> <table border="1" class="dataframe" style="table-layout: fixed; word-wrap:break-word; width: 100%;" data-svelte-h="svelte-1g5whzd"><thead><tr style="text-align: right;"><th></th> <th>html_url</th> <th>title</th> <th>comments</th> <th>body</th></tr></thead> <tbody><tr><th>0</th> <td>https://github.com/huggingface/datasets/issues/2787</td> <td>ConnectionError: Couldn&#39;t reach https://raw.githubusercontent.com</td> <td>the bug code locate in :\r\n if data_args.task_name is not None...</td> <td>Hello,\r\nI am trying to run run_glue.py and it gives me this error...</td></tr> <tr><th>1</th> <td>https://github.com/huggingface/datasets/issues/2787</td> <td>ConnectionError: Couldn&#39;t reach https://raw.githubusercontent.com</td> <td>Hi @jinec,\r\n\r\nFrom time to time we get this kind of `ConnectionError` coming from the github.com website: https://raw.githubusercontent.com...</td> <td>Hello,\r\nI am trying to run run_glue.py and it gives me this error...</td></tr> <tr><th>2</th> <td>https://github.com/huggingface/datasets/issues/2787</td> <td>ConnectionError: Couldn&#39;t reach https://raw.githubusercontent.com</td> <td>cannot connect,even by Web browser,please check that there is some problems。</td> <td>Hello,\r\nI am trying to run run_glue.py and it gives me this error...</td></tr> <tr><th>3</th> <td>https://github.com/huggingface/datasets/issues/2787</td> <td>ConnectionError: Couldn&#39;t reach https://raw.githubusercontent.com</td> <td>I can access https://raw.githubusercontent.com/huggingface/datasets/1.7.0/datasets/glue/glue.py without problem...</td> <td>Hello,\r\nI am trying to run run_glue.py and it gives me this error...</td></tr></tbody></table> <p data-svelte-h="svelte-qqrkoi">ကောင်းပါပြီ၊ rows တွေ ပွားနေတာကို တွေ့ရပြီး <code>comments</code> column မှာ individual comments တွေ ပါဝင်တာကို မြင်ရပါတယ်။ Pandas နဲ့ ကျွန်တော်တို့ လုပ်ဆောင်တာ ပြီးစီးသွားပြီဆိုတော့ 
<code>DataFrame</code> ကို memory ထဲမှာ loading လုပ်ခြင်းဖြင့် <code>Dataset</code> သို့ လျင်မြန်စွာ ပြန်ပြောင်းနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset
comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({
features: [<span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>],
num_rows: <span class="hljs-number">2842</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1418x6t">ကောင်းပါပြီ၊ ဒါက ကျွန်တော်တို့ကို အလုပ်လုပ်ဖို့ comments အနည်းငယ် ထောင်ချီပြီး ပေးထားပါတယ်။</p> <blockquote class="tip" data-svelte-h="svelte-1vb38oc"><p>✏️ <strong>စမ်းသပ်ကြည့်ပါ။</strong> Pandas ကို အသုံးမပြုဘဲ <code>issues_dataset</code> ရဲ့ <code>comments</code> column ကို explode လုပ်ဖို့ <code>Dataset.map()</code> ကို အသုံးပြုနိုင်မလား ကြည့်ပါ။ ဒါက နည်းနည်းလေး ခက်ပါတယ်၊ ဒီ task အတွက် 🤗 Datasets documentation ရဲ့ <a href="https://huggingface.co/docs/datasets/about_map_batch#batch-mapping" rel="nofollow">“Batch mapping”</a> အပိုင်းက အထောက်အကူ ဖြစ်နိုင်ပါတယ်။</p></blockquote> <p data-svelte-h="svelte-ua3af1">အခု row တစ်ခုစီမှာ comment တစ်ခုစီ ရှိပြီဆိုတော့၊ comment တစ်ခုစီမှာရှိတဲ့ စကားလုံးအရေအတွက် ပါဝင်တဲ့ <code>comment_length</code> column အသစ်တစ်ခု ဖန်တီးရအောင်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> 
Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->comments_dataset = comments_dataset.<span class="hljs-built_in">map</span>(
<span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">&quot;comment_length&quot;</span>: <span class="hljs-built_in">len</span>(x[<span class="hljs-string">&quot;comments&quot;</span>].split())}
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1odu5n6">ဒီ column အသစ်ကို အသုံးပြုပြီး “cc @lewtun” ဒါမှမဟုတ် “Thanks!” လိုမျိုး search engine နဲ့ မသက်ဆိုင်တဲ့ တိုတိုတောင်းတောင်း comments တွေကို ဖယ်ရှားနိုင်ပါတယ်။ filter အတွက် သတ်မှတ်ထားတဲ့ နံပါတ်မရှိပေမယ့်၊ စကားလုံး ၁၅ လုံးဝန်းကျင်က ကောင်းမွန်တဲ့ အစတစ်ခု ဖြစ်ပါလိမ့်မယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->comments_dataset = comments_dataset.<span class="hljs-built_in">filter</span>(<span class="hljs-keyword">lambda</span> x: x[<span class="hljs-string">&quot;comment_length&quot;</span>] &gt; <span class="hljs-number">15</span>)
comments_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({
features: [<span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>, <span class="hljs-string">&#x27;comment_length&#x27;</span>],
num_rows: <span class="hljs-number">2098</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xdogon">ကျွန်တော်တို့ dataset ကို နည်းနည်း သန့်ရှင်းရေး လုပ်ပြီးပြီဆိုတော့၊ issue title၊ description နဲ့ comments တွေကို <code>text</code> column အသစ်တစ်ခုထဲမှာ ပေါင်းစပ်လိုက်ရအောင်။ ပုံမှန်အတိုင်းပဲ၊ <code>Dataset.map()</code> ကို ပေးပို့နိုင်မယ့် ရိုးရှင်းတဲ့ function တစ်ခုကို ကျွန်တော်တို့ ရေးပါမယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">concatenate_text</span>(<span class="hljs-params">examples</span>):
<span class="hljs-keyword">return</span> {
<span class="hljs-string">&quot;text&quot;</span>: examples[<span class="hljs-string">&quot;title&quot;</span>]
+ <span class="hljs-string">&quot; \n &quot;</span>
+ examples[<span class="hljs-string">&quot;body&quot;</span>]
+ <span class="hljs-string">&quot; \n &quot;</span>
+ examples[<span class="hljs-string">&quot;comments&quot;</span>]
}
comments_dataset = comments_dataset.<span class="hljs-built_in">map</span>(concatenate_text)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-6aqvnf">နောက်ဆုံးတော့ embeddings တွေ ဖန်တီးဖို့ အဆင်သင့်ဖြစ်ပါပြီ။ ကြည့်ရအောင်။</p> <h2 class="relative group"><a id="creating-text-embeddings" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#creating-text-embeddings"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Text Embeddings များ ဖန်တီးခြင်း</span></h2> <p data-svelte-h="svelte-kaqp79"><a href="/course/chapter2">Chapter 2</a> မှာ <code>AutoModel</code> class ကို အသုံးပြုပြီး token embeddings တွေရနိုင်တယ်ဆိုတာ ကျွန်တော်တို့ တွေ့ခဲ့ရပါတယ်။ ကျွန်တော်တို့ လုပ်ဖို့လိုတာက model ကို load လုပ်ဖို့ သင့်လျော်တဲ့ checkpoint တစ်ခုကို ရွေးချယ်ဖို့ပါပဲ။ ကံကောင်းစွာနဲ့ပဲ၊ embeddings တွေ ဖန်တီးဖို့အတွက် သီးသန့် library တစ်ခုဖြစ်တဲ့ <code>sentence-transformers</code> ရှိပါတယ်။ library ရဲ့ <a href="https://www.sbert.net/examples/applications/semantic-search/README.html#symmetric-vs-asymmetric-semantic-search" rel="nofollow">documentation</a> မှာ ဖော်ပြထားတဲ့အတိုင်း၊ ကျွန်တော်တို့ရဲ့ use case က <em>asymmetric semantic search</em> ရဲ့ 
ဥပမာတစ်ခုပါပဲ၊ ဘာလို့လဲဆိုတော့ ကျွန်တော်တို့မှာ တိုတောင်းတဲ့ query တစ်ခုရှိပြီး အဲဒီ query ရဲ့ အဖြေကို issue comment လိုမျိုး ပိုရှည်တဲ့ document တစ်ခုထဲမှာ ရှာဖွေလိုတာကြောင့်ပါ။ documentation ထဲက အသုံးဝင်တဲ့ <a href="https://www.sbert.net/docs/pretrained_models.html#model-overview" rel="nofollow">model overview table</a> က <code>multi-qa-mpnet-base-dot-v1</code> checkpoint ဟာ semantic search အတွက် အကောင်းဆုံး စွမ်းဆောင်ရည် ရှိတယ်လို့ ညွှန်ပြထားတာကြောင့် ကျွန်တော်တို့ application အတွက် ဒါကို အသုံးပြုပါမယ်။ tokenizer ကိုလည်း တူညီတဲ့ checkpoint ကို အသုံးပြုပြီး load လုပ်ပါမယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModel
model_ckpt = <span class="hljs-string">&quot;sentence-transformers/multi-qa-mpnet-base-dot-v1&quot;</span>
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8awzut">embedding လုပ်ငန်းစဉ်ကို မြန်ဆန်စေဖို့၊ model နဲ့ inputs တွေကို GPU device ပေါ်မှာ ထားတာက အထောက်အကူဖြစ်စေပါတယ်၊ ဒါကြောင့် အခုပဲ လုပ်ရအောင်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
device = torch.device(<span class="hljs-string">&quot;cuda&quot;</span>)
model.to(device)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-fclw3b">ကျွန်တော်တို့ အစောပိုင်းက ဖော်ပြခဲ့တဲ့အတိုင်း၊ ကျွန်တော်တို့ရဲ့ GitHub issues corpus ထဲက entry တစ်ခုစီကို single vector အဖြစ် ကိုယ်စားပြုချင်တာကြောင့်၊ ကျွန်တော်တို့ token embeddings တွေကို နည်းလမ်းတစ်ခုခုနဲ့ “pool” သို့မဟုတ် average လုပ်ဖို့ လိုအပ်ပါတယ်။ လူကြိုက်များတဲ့ နည်းလမ်းတစ်ခုကတော့ model ရဲ့ outputs တွေပေါ်မှာ <em>CLS pooling</em> ကို လုပ်ဆောင်တာပါပဲ၊ ဒီနေရာမှာ ကျွန်တော်တို့ဟာ special <code>[CLS]</code> token အတွက် last hidden state ကို ရိုးရှင်းစွာ စုဆောင်းပါတယ်။ အောက်ပါ function က ကျွန်တော်တို့အတွက် လုပ်ဆောင်ပေးပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">cls_pooling</span>(<span class="hljs-params">model_output</span>):
<span class="hljs-keyword">return</span> model_output.last_hidden_state[:, <span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-isfav7">နောက်ထပ်အနေနဲ့၊ documents တွေရဲ့ list တစ်ခုကို tokenize လုပ်ပေးမယ့်၊ tensors တွေကို GPU ပေါ်မှာ ထားပေးမယ့်၊ model ကို feed လုပ်ပေးမယ့်၊ ပြီးတော့ နောက်ဆုံးမှာ outputs တွေကို CLS pooling လုပ်ပေးမယ့် helper function တစ်ခုကို ကျွန်တော်တို့ ဖန်တီးပါမယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">get_embeddings</span>(<span class="hljs-params">text_list</span>):
encoded_input = tokenizer(
text_list, padding=<span class="hljs-literal">True</span>, truncation=<span class="hljs-literal">True</span>, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>
)
encoded_input = {k: v.to(device) <span class="hljs-keyword">for</span> k, v <span class="hljs-keyword">in</span> encoded_input.items()}
model_output = model(**encoded_input)
<span class="hljs-keyword">return</span> cls_pooling(model_output)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-dex6at">function က အလုပ်ဖြစ်မဖြစ် စစ်ဆေးဖို့အတွက် ကျွန်တော်တို့ corpus ထဲက ပထမဆုံး text entry ကို feed လုပ်ပြီး output shape ကို စစ်ဆေးနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->embedding = get_embeddings(comments_dataset[<span class="hljs-string">&quot;text&quot;</span>][<span class="hljs-number">0</span>])
embedding.shape<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torch.Size([<span class="hljs-number">1</span>, <span class="hljs-number">768</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ppebol">ကောင်းပါပြီ၊ ကျွန်တော်တို့ corpus ထဲက ပထမဆုံး entry ကို 768-dimensional vector တစ်ခုအဖြစ် ပြောင်းလဲခဲ့ပါပြီ။ ကျွန်တော်တို့ရဲ့ <code>get_embeddings()</code> function ကို corpus ထဲက row တစ်ခုစီတိုင်းမှာ အသုံးပြုဖို့ <code>Dataset.map()</code> ကို အသုံးပြုနိုင်တာကြောင့်၊ အောက်ပါအတိုင်း <code>embeddings</code> column အသစ်တစ်ခု ဖန်တီးရအောင်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 
text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->embeddings_dataset = comments_dataset.<span class="hljs-built_in">map</span>(
<span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">&quot;embeddings&quot;</span>: get_embeddings(x[<span class="hljs-string">&quot;text&quot;</span>]).detach().cpu().numpy()[<span class="hljs-number">0</span>]}
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jxc79k">embeddings တွေကို NumPy arrays တွေအဖြစ် ပြောင်းလဲထားတာကို သတိပြုပါ။ ဒါက 🤗 Datasets က ၎င်းတို့ကို FAISS နဲ့ index လုပ်ဖို့ ကြိုးစားတဲ့အခါ ဒီ format ကို လိုအပ်လို့ပါ။ ဒါကို နောက်တစ်ဆင့်မှာ ကျွန်တော်တို့ လုပ်ဆောင်ပါမယ်။</p> <h2 class="relative group"><a id="using-faiss-for-efficient-similarity-search" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-faiss-for-efficient-similarity-search"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>FAISS ကို အသုံးပြု၍ ထိရောက်သော Similarity Search</span></h2> <p data-svelte-h="svelte-8to1v1">အခု ကျွန်တော်တို့မှာ embeddings တွေရဲ့ dataset တစ်ခုရှိပြီဆိုတော့ ၎င်းတို့ပေါ်မှာ search လုပ်ဖို့ နည်းလမ်းတစ်ခု လိုအပ်ပါတယ်။ ဒါကိုလုပ်ဖို့၊ 🤗 Datasets မှာ <em>FAISS index</em> လို့ခေါ်တဲ့ အထူး data structure တစ်ခုကို ကျွန်တော်တို့ အသုံးပြုပါမယ်။ <a href="https://faiss.ai/" rel="nofollow">FAISS</a> (Facebook AI Similarity Search ရဲ့ အတိုကောက်) က embedding vectors တွေကို လျင်မြန်စွာ ရှာဖွေပြီး cluster လုပ်ဖို့ ထိရောက်တဲ့ algorithms တွေကို ပံ့ပိုးပေးတဲ့ library တစ်ခုပါ။</p> <p data-svelte-h="svelte-1cq75lw">FAISS ရဲ့ အခြေခံသဘောတရားက 
input embedding တစ်ခုနဲ့ ဆင်တူတဲ့ embeddings တွေကို ရှာဖွေနိုင်စေမယ့် <em>index</em> လို့ခေါ်တဲ့ အထူး data structure တစ်ခုကို ဖန်တီးဖို့ပါပဲ။ 🤗 Datasets မှာ FAISS index တစ်ခု ဖန်တီးတာက ရိုးရှင်းပါတယ် — ကျွန်တော်တို့ <code>Dataset.add_faiss_index()</code> function ကို အသုံးပြုပြီး ကျွန်တော်တို့ dataset ထဲက ဘယ် column ကို index လုပ်ချင်တယ်ဆိုတာ သတ်မှတ်ပေးရုံပါပဲ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->embeddings_dataset.add_faiss_index(column=<span class="hljs-string">&quot;embeddings&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jpk9d1">အခု ကျွန်တော်တို့ <code>Dataset.get_nearest_examples()</code> function နဲ့ nearest neighbor lookup လုပ်ခြင်းဖြင့် ဒီ index ပေါ်မှာ queries တွေ လုပ်ဆောင်နိုင်ပါပြီ။ ဒါကို ပထမဆုံး မေးခွန်းတစ်ခုကို အောက်ပါအတိုင်း embedding လုပ်ခြင်းဖြင့် စမ်းကြည့်ရအောင်။</p> 
<div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->question = <span class="hljs-string">&quot;How can I load a dataset offline?&quot;</span>
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-number">1</span>, <span class="hljs-number">768</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-18wbl67">documents တွေနဲ့အတူတူပဲ၊ ကျွန်တော်တို့မှာ အခု query ကို ကိုယ်စားပြုတဲ့ 768-dimensional vector တစ်ခု ရှိပါတယ်။ ဒါကို အတူဆုံး embeddings တွေကို ရှာဖွေဖို့ corpus တစ်ခုလုံးနဲ့ နှိုင်းယှဉ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" 
focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->scores, samples = embeddings_dataset.get_nearest_examples(
<span class="hljs-string">&quot;embeddings&quot;</span>, question_embedding, k=<span class="hljs-number">5</span>
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1j51c7q"><code>Dataset.get_nearest_examples()</code> function က query နဲ့ document ကြား တူညီမှုအဆင့်ကို အဆင့်သတ်မှတ်ပေးတဲ့ scores တွေရဲ့ tuple တစ်ခုနဲ့ သက်ဆိုင်ရာ samples အစုအဝေးတစ်ခု (ဒီနေရာမှာတော့ အကောင်းဆုံး ကိုက်ညီမှု ၅ ခု) ကို ပြန်ပေးပါတယ်။ ဒါတွေကို <code>pandas.DataFrame</code> ထဲမှာ စုဆောင်းပြီး အလွယ်တကူ စီစဉ်နိုင်အောင် လုပ်ရအောင်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> pandas <span class="hljs-keyword">as</span> pd
samples_df = pd.DataFrame.from_dict(samples)
samples_df[<span class="hljs-string">&quot;scores&quot;</span>] = scores
samples_df.sort_values(<span class="hljs-string">&quot;scores&quot;</span>, ascending=<span class="hljs-literal">False</span>, inplace=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-57k7t9">အခု ကျွန်တော်တို့ query က ရရှိနိုင်တဲ့ comments တွေနဲ့ ဘယ်လောက် ကောင်းကောင်း ကိုက်ညီလဲဆိုတာကို ကြည့်ဖို့ ပထမဆုံး rows အနည်းငယ်ကို iterate လုပ်နိုင်ပါပြီ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">for</span> _, row <span class="hljs-keyword">in</span> samples_df.iterrows():
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;COMMENT: <span class="hljs-subst">{row.comments}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;SCORE: <span class="hljs-subst">{row.scores}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;TITLE: <span class="hljs-subst">{row.title}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;URL: <span class="hljs-subst">{row.html_url}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;=&quot;</span> * <span class="hljs-number">50</span>)
<span class="hljs-built_in">print</span>()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">&quot;&quot;&quot;
COMMENT: Requiring online connection is a deal breaker in some cases unfortunately so it&#x27;d be great if offline mode is added similar to how `transformers` loads models offline fine.
@mandubian&#x27;s second bullet point suggests that there&#x27;s a workaround allowing you to use your offline (custom?) dataset with `datasets`. Could you please elaborate on how that should look like?
SCORE: 25.505046844482422
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824
==================================================
COMMENT: The local dataset builders (csv, text , json and pandas) are now part of the `datasets` package since #1726 :)
You can now use them offline
```python
datasets = load_dataset(&quot;text&quot;, data_files=data_files)
```
We&#x27;ll do a new release soon
SCORE: 24.555509567260742
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824
==================================================
COMMENT: I opened a PR that allows to reload modules that have already been loaded once even if there&#x27;s no internet.
Let me know if you know other ways that can make the offline mode experience better. I&#x27;d be happy to add them :)
I already note the &quot;freeze&quot; modules option, to prevent local modules updates. It would be a cool feature.
----------
&gt; @mandubian&#x27;s second bullet point suggests that there&#x27;s a workaround allowing you to use your offline (custom?) dataset with `datasets`. Could you please elaborate on how that should look like?
Indeed `load_dataset` allows to load remote dataset script (squad, glue, etc.) but also you own local ones.
For example if you have a dataset script at `./my_dataset/my_dataset.py` then you can do
```python
load_dataset(&quot;./my_dataset&quot;)
```
and the dataset script will generate your dataset once and for all.
----------
About I&#x27;m looking into having `csv`, `json`, `text`, `pandas` dataset builders already included in the `datasets` package, so that they are available offline by default, as opposed to the other datasets that require the script to be downloaded.
cf #1724
SCORE: 24.14896583557129
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824
==================================================
COMMENT: &gt; here is my way to load a dataset offline, but it **requires** an online machine
&gt;
&gt; 1. (online machine)
&gt;
&gt; ```
&gt;
&gt; import datasets
&gt;
&gt; data = datasets.load_dataset(...)
&gt;
&gt; data.save_to_disk(/YOUR/DATASET/DIR)
&gt;
&gt; ```
&gt;
&gt; 2. copy the dir from online to the offline machine
&gt;
&gt; 3. (offline machine)
&gt;
&gt; ```
&gt;
&gt; import datasets
&gt;
&gt; data = datasets.load_from_disk(/SAVED/DATA/DIR)
&gt;
&gt; ```
&gt;
&gt;
&gt;
&gt; HTH.
SCORE: 22.893993377685547
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824
==================================================
COMMENT: here is my way to load a dataset offline, but it **requires** an online machine
1. (online machine)
```
import datasets
data = datasets.load_dataset(...)
data.save_to_disk(/YOUR/DATASET/DIR)
```
2. copy the dir from online to the offline machine
3. (offline machine)
```
import datasets
data = datasets.load_from_disk(/SAVED/DATA/DIR)
```
HTH.
SCORE: 22.406635284423828
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824
==================================================
&quot;&quot;&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-it9kam">မဆိုးပါဘူး! ကျွန်တော်တို့ရဲ့ ဒုတိယ hit က query နဲ့ ကိုက်ညီပုံရပါတယ်။</p> <blockquote class="tip" data-svelte-h="svelte-1a3bugv"><p>✏️ <strong>စမ်းသပ်ကြည့်ပါ။</strong> သင့်ကိုယ်ပိုင် query တစ်ခုကို ဖန်တီးပြီး ပြန်လည်ရရှိထားတဲ့ documents တွေထဲမှာ အဖြေရှာနိုင်မလား ကြည့်ပါ။ search ကို ပိုမိုကျယ်ပြန့်စေဖို့ <code>Dataset.get_nearest_examples()</code> မှာရှိတဲ့ <code>k</code> parameter ကို တိုးမြှင့်ရပါလိမ့်မယ်။</p></blockquote> <h2 class="relative group"><a id="ဝဟရ-ရငလငခက-glossary" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ဝဟရ-ရငလငခက-glossary"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ဝေါဟာရ ရှင်းလင်းချက် (Glossary)</span></h2> <ul data-svelte-h="svelte-1m73ph3"><li><strong>FAISS (Facebook AI Similarity Search)</strong>: embedding vectors များကို လျင်မြန်စွာ ရှာဖွေပြီး cluster လုပ်ရန်အတွက် ထိရောက်သော algorithms များကို ပံ့ပိုးပေးသော library တစ်ခု။</li> <li><strong>Semantic Search</strong>: အဓိပ္ပာယ်ကို နားလည်ပြီး query တစ်ခု၏ ရည်ရွယ်ချက်နှင့် ကိုက်ညီသော documents များကို ရှာဖွေပေးသည့် search engine အမျိုးအစား။</li> 
<li><strong>Embeddings</strong>: စာသား (သို့မဟုတ် အခြားဒေတာ) ကို multi-dimensional vector space ထဲရှိ ဂဏန်းများအဖြစ် ကိုယ်စားပြုခြင်း။</li> <li><strong>GitHub Issues</strong>: GitHub repository များတွင် ပြဿနာများ၊ bug များ သို့မဟုတ် အင်္ဂါရပ် တောင်းဆိုမှုများကို မှတ်တမ်းတင်ရန် အသုံးပြုသော အင်္ဂါရပ်။</li> <li><strong>Comments</strong>: GitHub issue တစ်ခု သို့မဟုတ် pull request တစ်ခုအောက်တွင် အသုံးပြုသူများက ပေါင်းထည့်သော စာသားမှတ်ချက်များ။</li> <li><strong>Search Engine</strong>: အသုံးပြုသူ၏ query နှင့် ကိုက်ညီသော အချက်အလက်များကို ရှာဖွေပေးသည့် စနစ်။</li> <li><strong>Transformer-based Language Models</strong>: Transformer architecture ပေါ်တွင် အခြေခံထားသော language models များ။</li> <li><strong>Embedding Vector</strong>: စာသားအပိုင်းအစတစ်ခု (token, sentence, paragraph) ကို ဂဏန်းတန်ဖိုးများဖြင့် ကိုယ်စားပြုထားသော vector။</li> <li><strong>Pooling</strong>: individual embeddings များကို ပေါင်းစပ်ပြီး ပိုကြီးသော text unit (ဥပမာ- sentence, document) အတွက် single vector representation တစ်ခု ဖန်တီးခြင်း။</li> <li><strong>Corpus</strong>: သုတေသနပြုရန်အတွက် အသုံးပြုသော စာသားအစုအဝေးကြီး။</li> <li><strong>Dot-product Similarity</strong>: vectors နှစ်ခုကြားရှိ similarity ကို တွက်ချက်သော metric တစ်ခု။</li> <li><strong>Similarity Metric</strong>: အရာဝတ္ထုနှစ်ခု (ဥပမာ- embeddings) မည်မျှတူညီသည်ကို တိုင်းတာသော နည်းလမ်း။</li> <li><strong>Query</strong>: search engine သို့မဟုတ် database မှ အချက်အလက်များ တောင်းဆိုခြင်း။</li> <li><strong>Conventional Approaches</strong>: ရိုးရာ သို့မဟုတ် သမားရိုးကျ နည်းလမ်းများ။</li> <li><strong>Keywords</strong>: Search query တွင် အသုံးပြုသော အဓိကစကားလုံးများ။</li> <li><strong><code>load_dataset()</code> Function</strong>: Hugging Face Datasets library မှ dataset များကို download လုပ်ပြီး cache လုပ်ရန် အသုံးပြုသော function။</li> <li><strong><code>split=&quot;train&quot;</code></strong>: <code>load_dataset()</code> function တွင် training split ကို ရွေးချယ်ရန်အတွက် argument။</li> <li><strong><code>Dataset</code> Object</strong>: Hugging Face Datasets 
library မှ dataset တစ်ခုကို ကိုယ်စားပြုသော object။</li> <li><strong><code>DatasetDict</code> Object</strong>: Training set, validation set, နှင့် test set ကဲ့သို့သော dataset အများအပြားကို dictionary ပုံစံဖြင့် သိမ်းဆည်းထားသော object။</li> <li><strong>Pull Requests</strong>: GitHub တွင် code အပြောင်းအလဲများကို project ၏ main branch သို့ ပေါင်းစည်းရန် တောင်းဆိုခြင်း။</li> <li><strong><code>Dataset.filter()</code> Function</strong>: Dataset မှ သတ်မှတ်ထားသော အခြေအနေများနှင့် မကိုက်ညီသော rows များကို ဖယ်ရှားရန် အသုံးပြုသော function။</li> <li><strong><code>is_pull_request</code></strong>: GitHub issue တစ်ခုသည် pull request ဟုတ်မဟုတ်ကို ဖော်ပြသော feature (boolean value)။</li> <li><strong><code>len(x[&quot;comments&quot;]) &gt; 0</code></strong>: comment list ၏ အရှည်သည် သုညထက် ကြီးမားခြင်းရှိမရှိ စစ်ဆေးခြင်း။</li> <li><strong><code>title</code> Column</strong>: Issue ၏ ခေါင်းစဉ်ကို သိမ်းဆည်းထားသော column။</li> <li><strong><code>body</code> Column</strong>: Issue ၏ ဖော်ပြချက်ကို သိမ်းဆည်းထားသော column။</li> <li><strong><code>comments</code> Column</strong>: Issue နှင့် သက်ဆိုင်သော comments များကို သိမ်းဆည်းထားသော column (list of strings)။</li> <li><strong><code>html_url</code> Column</strong>: Issue ၏ GitHub URL ကို သိမ်းဆည်းထားသော column။</li> <li><strong><code>Dataset.remove_columns()</code> Function</strong>: Dataset မှ မလိုအပ်သော columns များကို ဖယ်ရှားရန် အသုံးပြုသော function။</li> <li><strong><code>set()</code></strong>: Python တွင် item များကို စုစည်းသိမ်းဆည်းထားသော unordered collection ဖြစ်ပြီး duplicate များ မပါဝင်ပါ။</li> <li><strong><code>symmetric_difference()</code></strong>: set နှစ်ခုကြားရှိ မတူညီသော items များကို ရှာဖွေသော method။</li> <li><strong>Contextual Information</strong>: အခြေအနေတစ်ခု သို့မဟုတ် စာသားတစ်ခု၏ အဓိပ္ပာယ်ကို နားလည်ရန် ကူညီပေးသော နောက်ခံအချက်အလက်များ။</li> <li><strong>“Explode” a Column</strong>: Pandas DataFrame တွင် list-like column တစ်ခုရှိ element တစ်ခုစီအတွက် new row တစ်ခု ဖန်တီးခြင်း။</li> <li><strong>Pandas 
<code>DataFrame</code></strong>: Python တွင် tabular data (ဇယားပုံစံဒေတာ) ကို ကိုင်တွယ်ရန် အသုံးပြုသော two-dimensional data structure။</li> <li><strong><code>DataFrame.explode()</code> Function</strong>: Pandas မှ list-like column တစ်ခုရှိ element တစ်ခုစီအတွက် new row တစ်ခု ဖန်တီးပေးသော function။</li> <li><strong><code>issues_dataset.set_format(&quot;pandas&quot;)</code></strong>: Dataset ကို Pandas DataFrame format သို့ ပြောင်းလဲခြင်း။</li> <li><strong><code>issues_dataset[:]</code></strong>: Dataset တစ်ခုလုံးကို (Pandas format တွင်) selection လုပ်ခြင်း။</li> <li><strong><code>comments_df.head(4)</code></strong>: DataFrame ၏ ပထမဆုံး rows လေးခုကို ပြသခြင်း။</li> <li><strong><code>ignore_index=True</code></strong>: <code>explode()</code> function တွင် original index ကို မထိန်းသိမ်းဘဲ new index ကို ဖန်တီးရန် argument။</li> <li><strong><code>Dataset.from_pandas()</code></strong>: Pandas DataFrame တစ်ခုမှ Hugging Face Dataset object တစ်ခုကို ဖန်တီးသော method။</li> <li><strong><code>comments_length</code> Column</strong>: Comment တစ်ခုစီရှိ စကားလုံးအရေအတွက်ကို သိမ်းဆည်းထားသော column။</li> <li><strong><code>x[&quot;comments&quot;].split()</code></strong>: Comment စာသားကို စကားလုံးများအဖြစ် ပိုင်းခြားခြင်း။</li> <li><strong><code>Dataset.map()</code></strong>: 🤗 Datasets library မှာ ပါဝင်တဲ့ method တစ်ခုဖြစ်ပြီး dataset ရဲ့ element တစ်ခုစီ ဒါမှမဟုတ် batch တစ်ခုစီပေါ်မှာ function တစ်ခုကို အသုံးပြုနိုင်စေသည်။</li> <li><strong><code>AutoModel</code> Class</strong>: Hugging Face Transformers library မှ မော်ဒယ်အမည်ကို အသုံးပြုပြီး သက်ဆိုင်ရာ model class ကို အလိုအလျောက် load လုပ်ပေးသော class။</li> <li><strong>Checkpoint</strong>: မော်ဒယ်၏ weights များနှင့် အခြားဖွဲ့စည်းပုံများ (configuration) ကို သတ်မှတ်ထားသော အချိန်တစ်ခုတွင် သိမ်းဆည်းထားခြင်း။</li> <li><strong><code>sentence-transformers</code> Library</strong>: Sentence embeddings များ ဖန်တီးရန်အတွက် ဒီဇိုင်းထုတ်ထားသော Python library။</li> <li><strong>Asymmetric Semantic Search</strong>: query သည် တိုတောင်းပြီး document သည် 
ရှည်လျားသော semantic search အမျိုးအစား (ဥပမာ- မေးခွန်းတစ်ခုကို အဖြေရှာခြင်း)။</li> <li><strong><code>multi-qa-mpnet-base-dot-v1</code></strong>: Semantic search အတွက် စွမ်းဆောင်ရည်ကောင်းမွန်သော sentence-transformer model checkpoint။</li> <li><strong><code>AutoTokenizer</code></strong>: Hugging Face Transformers library မှာ ပါဝင်တဲ့ class တစ်ခုဖြစ်ပြီး မော်ဒယ်အမည်ကို အသုံးပြုပြီး သက်ဆိုင်ရာ tokenizer ကို အလိုအလျောက် load လုပ်ပေးသည်။</li> <li><strong>GPU (Graphics Processing Unit)</strong>: ဂရပ်ဖစ်လုပ်ဆောင်မှုအတွက် အထူးဒီဇိုင်းထုတ်ထားသော processor တစ်မျိုးဖြစ်သော်လည်း AI/ML လုပ်ငန်းများတွင် အရှိန်မြှင့်ရန် အသုံးများသည်။</li> <li><strong><code>torch.device(&quot;cuda&quot;)</code></strong>: PyTorch တွင် GPU device ကို ရည်ညွှန်းသည်။</li> <li><strong><code>model.to(device)</code></strong>: PyTorch model ကို သတ်မှတ်ထားသော device (GPU) သို့ ရွှေ့ပြောင်းခြင်း။</li> <li><strong><code>TFAutoModel</code></strong>: TensorFlow framework အတွက် <code>AutoModel</code> နှင့် တူညီသော လုပ်ဆောင်ချက်များရှိသည်။</li> <li><strong><code>from_pt=True</code></strong>: <code>TFAutoModel.from_pretrained()</code> တွင် PyTorch weights များကို TensorFlow format သို့ အလိုအလျောက် ပြောင်းလဲရန် argument။</li> <li><strong>CLS Pooling</strong>: Transformer model ၏ output မှ <code>[CLS]</code> token ၏ last hidden state ကို အသုံးပြု၍ text sequence အတွက် single vector representation တစ်ခု ဖန်တီးခြင်း။</li> <li><strong><code>[CLS]</code> Token</strong>: BERT model တွင် sequence ၏ အစကို ကိုယ်စားပြုသော special token။</li> <li><strong>Last Hidden State</strong>: Transformer model ၏ နောက်ဆုံး layer မှ output embeddings များ။</li> <li><strong><code>encoded_input</code></strong>: Tokenizer မှ ထုတ်ပေးသော input IDs, attention masks စသည်တို့ ပါဝင်သော dictionary။</li> <li><strong><code>padding=True</code></strong>: Tokenization လုပ်ရာတွင် sequence အရှည်များ ကွဲပြားပါက အရှည်ဆုံး sequence အရှည်အတိုင်း ဖြည့်ပေးခြင်း။</li> <li><strong><code>truncation=True</code></strong>: sequence အရှည်သည် model ၏ အများဆုံး input 
အရှည်ထက် ရှည်လျားပါက ဖြတ်တောက်ခြင်း။</li> <li><strong><code>return_tensors=&quot;pt&quot;</code></strong>: PyTorch tensors များအဖြစ် output ပြန်ပေးရန် argument။</li> <li><strong><code>encoded_input.items()</code></strong>: dictionary မှ key-value pairs များကို ရယူခြင်း။</li> <li><strong><code>k: v.to(device)</code></strong>: dictionary comprehension ဖြင့် input tensors များကို GPU သို့ ရွှေ့ခြင်း။</li> <li><strong><code>model(**encoded_input)</code></strong>: model ကို encoded inputs များဖြင့် run ခြင်း။</li> <li><strong><code>embedding.shape</code></strong>: embedding vector ၏ ပုံသဏ္ဍာန် (dimensions) ကို ပြသခြင်း။</li> <li><strong>768-dimensional Vector</strong>: dimensions ၇၆၈ ခုပါဝင်သော vector။</li> <li><strong><code>detach().cpu().numpy()[0]</code></strong>: PyTorch tensor ကို detach (computation graph မှ ဖြတ်တောက်) ပြီး CPU သို့ ရွှေ့၊ ထို့နောက် NumPy array အဖြစ် ပြောင်းလဲခြင်း။</li> <li><strong>NumPy Arrays</strong>: Python တွင် ဂဏန်းဆိုင်ရာ တွက်ချက်မှုများအတွက် အသုံးပြုသော array object။</li> <li><strong>FAISS Index</strong>: FAISS library မှ efficient similarity search အတွက် အသုံးပြုသော data structure။</li> <li><strong><code>Dataset.add_faiss_index()</code> Function</strong>: Hugging Face Dataset တွင် FAISS index တစ်ခုကို ထည့်သွင်းရန် အသုံးပြုသော function။</li> <li><strong>Nearest Neighbor Lookup</strong>: input query နှင့် အတူဆုံးသော item များကို ရှာဖွေခြင်း။</li> <li><strong><code>Dataset.get_nearest_examples()</code> Function</strong>: Dataset ၏ FAISS index ကို အသုံးပြုပြီး query နှင့် အတူဆုံးသော examples များကို ပြန်ပေးသော function။</li> <li><strong><code>question_embedding</code></strong>: Query စာသား၏ embedding vector။</li> <li><strong><code>k</code> Parameter</strong>: <code>get_nearest_examples()</code> function တွင် အနီးဆုံး examples အရေအတွက် (k) ကို သတ်မှတ်ရန် အသုံးပြုသော parameter။</li> <li><strong><code>pd.DataFrame.from_dict()</code></strong>: dictionary တစ်ခုမှ Pandas DataFrame တစ်ခုကို ဖန်တီးသော method။</li> 
<li><strong><code>samples_df.sort_values(&quot;scores&quot;, ascending=False, inplace=True)</code></strong>: DataFrame ကို <code>scores</code> column အလိုက် အများဆုံးမှ အနည်းဆုံးသို့ စီစဉ်ခြင်း။</li> <li><strong><code>iterrows()</code></strong>: DataFrame ၏ rows များကို iterate လုပ်ရန် အသုံးပြုသော method။</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/my/chapter5/6.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
// Client bootstrap for this page: publish path configuration on a global,
// then load the two entry modules and start the app on this script's parent.
// The name below is not declared anywhere in this block, so the assignment
// creates (or overwrites) a global. Presumably the imported entry modules
// read it — confirm against the build output.
__sveltekit_tyugt6 = {
assets: "/docs/course/pr_1114/my",
base: "/docs/course/pr_1114/my",
env: {}
};
// Capture the mount target synchronously: document.currentScript is only
// available while this script is executing, not inside the .then() callback.
const element = document.currentScript.parentElement;
// NOTE(review): two slots here and two ids in node_ids below — looks like one
// data entry per node; confirm against the framework's start() contract.
const data = [null,null];
// Fetch both entry modules in parallel; the app starts only once both resolve.
Promise.all([
import("/docs/course/pr_1114/my/_app/immutable/entry/start.14794ee9.js"),
import("/docs/course/pr_1114/my/_app/immutable/entry/app.a133f5c6.js")
]).then(([kit, app]) => {
// Hand the app module, the captured element and the initial state to start().
kit.start(app, element, {
node_ids: [0, 41],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
114 kB
·
Xet hash:
4752b24f5f43375e24d96f0ddd512c82a076d68db0c8a180f1e80c2f152289be

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.