Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"使用 FAISS 进行语义搜索","local":"使用 FAISS 进行语义搜索","sections":[{"title":"使用文本嵌入进行语义搜索","local":"使用文本嵌入进行语义搜索","sections":[],"depth":2},{"title":"加载和准备数据集","local":"加载和准备数据集","sections":[],"depth":2},{"title":"创建文本嵌入","local":"创建文本嵌入","sections":[],"depth":2},{"title":"使用 FAISS 进行高效的相似性搜索","local":"使用 FAISS 进行高效的相似性搜索","sections":[],"depth":2}],"depth":1}"> | |
| <link href="/docs/course/pr_1021/zh-CN/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/entry/start.f3a1a511.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/scheduler.37c15a92.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/singletons.9bf55235.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/index.18351ede.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/paths.0ba10750.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/entry/app.c39e37cf.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/index.2bf4358c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/nodes/0.dad18ce3.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/nodes/39.43742bc9.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/Tip.363c041f.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/Youtube.1e50a667.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/CodeBlock.4e987730.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/CourseFloatingBanner.9ff4c771.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/DocNotebookDropdown.efc1fb7c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/FrameworkSwitchCourse.8d4d4ab6.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/getInferenceSnippets.ebf8be91.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"使用 FAISS 进行语义搜索","local":"使用 FAISS 进行语义搜索","sections":[{"title":"使用文本嵌入进行语义搜索","local":"使用文本嵌入进行语义搜索","sections":[],"depth":2},{"title":"加载和准备数据集","local":"加载和准备数据集","sections":[],"depth":2},{"title":"创建文本嵌入","local":"创建文本嵌入","sections":[],"depth":2},{"title":"使用 FAISS 进行高效的相似性搜索","local":"使用 FAISS 进行高效的相似性搜索","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="bg-white leading-none border border-gray-100 rounded-lg flex p-0.5 w-56 text-sm mb-4"><a class="flex justify-center flex-1 py-1.5 px-2.5 focus:outline-none !no-underline rounded-l bg-red-50 dark:bg-transparent text-red-600" href="?fw=pt"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><defs><clipPath id="a"><rect x="3.05" y="0.5" width="25.73" height="31" fill="none"></rect></clipPath></defs><g clip-path="url(#a)"><path d="M24.94,9.51a12.81,12.81,0,0,1,0,18.16,12.68,12.68,0,0,1-18,0,12.81,12.81,0,0,1,0-18.16l9-9V5l-.84.83-6,6a9.58,9.58,0,1,0,13.55,0ZM20.44,9a1.68,1.68,0,1,1,1.67-1.67A1.68,1.68,0,0,1,20.44,9Z" fill="#ee4c2c"></path></g></svg> Pytorch </a><a class="flex justify-center flex-1 py-1.5 px-2.5 focus:outline-none !no-underline rounded-r text-gray-500 filter grayscale" href="?fw=tf"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="0.94em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 274"><path d="M145.726 42.065v42.07l72.861 42.07v-42.07l-72.86-42.07zM0 84.135v42.07l36.43 21.03V105.17L0 84.135zm109.291 21.035l-36.43 21.034v126.2l36.43 21.035v-84.135l36.435 21.035v-42.07l-36.435-21.034V105.17z" fill="#E55B2D"></path><path d="M145.726 42.065L36.43 105.17v42.065l72.861-42.065v42.065l36.435-21.03v-84.14zM255.022 63.1l-36.435 21.035v42.07l36.435-21.035V63.1zm-72.865 84.135l-36.43 21.035v42.07l36.43-21.036v-42.07zm-36.43 63.104l-36.436-21.035v84.135l36.435-21.035V210.34z" fill="#ED8E24"></path><path d="M145.726 0L0 84.135l36.43 21.035l109.296-63.105l72.861 42.07L255.022 63.1L145.726 0zm0 126.204l-36.435 21.03l36.435 21.036l36.43-21.035l-36.43-21.03z" fill="#F8BF3C"></path></svg> TensorFlow </a></div> <h1 class="relative group"><a id="使用 FAISS 进行语义搜索" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#使用 FAISS 进行语义搜索"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>使用 FAISS 进行语义搜索</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-5-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/zh-CN/chapter5/section6_pt.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/zh-CN/chapter5/section6_pt.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-1k1uitg">在 <a href="/course/chapter5/5">第5小节</a> ,我们创建了一个 🤗 Datasets 仓库的 GitHub issues 和评论组成的数据集。在本节,我们将使用这些信息构建一个搜索引擎,帮助我们找到关于该库的最相关的 issue 的答案!</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/OATCgQtNX2o" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <h2 class="relative group"><a id="使用文本嵌入进行语义搜索" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#使用文本嵌入进行语义搜索"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>使用文本嵌入进行语义搜索</span></h2> <p data-svelte-h="svelte-1bo3r1m">正如我们在 <a href="/course/chapter1">第一章</a> ,学习的,基于 Transformer 的语言模型会将文本中的每个 token 转换为嵌入向量。事实证明,我们可以 “池化(pool)” 嵌入向量以创建整个句子、段落或(在某些情况下)文档的向量表示。然后,通过计算每个嵌入之间的点积相似度(或其他一些相似度度量)并返回相似度最大的文档,这些嵌入可用于在语料库中找到相似的文档。</p> <p data-svelte-h="svelte-17w33xe">在本节中,我们将使用文本嵌入向量来开发语义搜索引擎。与基于将查询中的关键字的传统方法相比,这些搜索引擎具有多种优势。</p> <div class="flex justify-center" data-svelte-h="svelte-yxatr"><img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/semantic-search.svg" alt="Semantic search."> <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/semantic-search-dark.svg" alt="Semantic search."></div> <h2 class="relative group"><a id="加载和准备数据集" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#加载和准备数据集"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>加载和准备数据集</span></h2> <p data-svelte-h="svelte-1j4s9k3">首先,我们需要下载我们的 GitHub issues 数据集,所以让我们像往常那样使用 <code>load_dataset()</code> 函数:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| issues_dataset = load_dataset(<span class="hljs-string">"lewtun/github-issues"</span>, split=<span class="hljs-string">"train"</span>) | |
| issues_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({ | |
| features: [<span class="hljs-string">'url'</span>, <span class="hljs-string">'repository_url'</span>, <span class="hljs-string">'labels_url'</span>, <span class="hljs-string">'comments_url'</span>, <span class="hljs-string">'events_url'</span>, <span class="hljs-string">'html_url'</span>, <span class="hljs-string">'id'</span>, <span class="hljs-string">'node_id'</span>, <span class="hljs-string">'number'</span>, <span class="hljs-string">'title'</span>, <span class="hljs-string">'user'</span>, <span class="hljs-string">'labels'</span>, <span class="hljs-string">'state'</span>, <span class="hljs-string">'locked'</span>, <span class="hljs-string">'assignee'</span>, <span class="hljs-string">'assignees'</span>, <span class="hljs-string">'milestone'</span>, <span class="hljs-string">'comments'</span>, <span class="hljs-string">'created_at'</span>, <span class="hljs-string">'updated_at'</span>, <span class="hljs-string">'closed_at'</span>, <span class="hljs-string">'author_association'</span>, <span class="hljs-string">'active_lock_reason'</span>, <span class="hljs-string">'pull_request'</span>, <span class="hljs-string">'body'</span>, <span class="hljs-string">'performed_via_github_app'</span>, <span class="hljs-string">'is_pull_request'</span>], | |
| num_rows: <span class="hljs-number">2855</span> | |
| })<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-2cqt6s">在此,我们在 <code>load_dataset()</code> 中指定了默认的 <code>train(训练集)</code> 部分,因此它返回一个 <code>Dataset</code> 而不是 <code>DatasetDict</code> 。首要任务是排除掉拉取请求(pull),因为这些请求往往很少用于回答提出的 issue,会为我们的搜索引擎引入干扰。正如我们现在熟悉的那样,我们可以使用 <code>Dataset.filter()</code> 函数来排除数据集中的这些行。与此同时,让我们也筛选掉没有评论的行,因为这些行没有为用户提问提供回答:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_dataset = issues_dataset.<span class="hljs-built_in">filter</span>( | |
| <span class="hljs-keyword">lambda</span> x: (x[<span class="hljs-string">"is_pull_request"</span>] == <span class="hljs-literal">False</span> <span class="hljs-keyword">and</span> <span class="hljs-built_in">len</span>(x[<span class="hljs-string">"comments"</span>]) > <span class="hljs-number">0</span>) | |
| ) | |
| issues_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({ | |
| features: [<span class="hljs-string">'url'</span>, <span class="hljs-string">'repository_url'</span>, <span class="hljs-string">'labels_url'</span>, <span class="hljs-string">'comments_url'</span>, <span class="hljs-string">'events_url'</span>, <span class="hljs-string">'html_url'</span>, <span class="hljs-string">'id'</span>, <span class="hljs-string">'node_id'</span>, <span class="hljs-string">'number'</span>, <span class="hljs-string">'title'</span>, <span class="hljs-string">'user'</span>, <span class="hljs-string">'labels'</span>, <span class="hljs-string">'state'</span>, <span class="hljs-string">'locked'</span>, <span class="hljs-string">'assignee'</span>, <span class="hljs-string">'assignees'</span>, <span class="hljs-string">'milestone'</span>, <span class="hljs-string">'comments'</span>, <span class="hljs-string">'created_at'</span>, <span class="hljs-string">'updated_at'</span>, <span class="hljs-string">'closed_at'</span>, <span class="hljs-string">'author_association'</span>, <span class="hljs-string">'active_lock_reason'</span>, <span class="hljs-string">'pull_request'</span>, <span class="hljs-string">'body'</span>, <span class="hljs-string">'performed_via_github_app'</span>, <span class="hljs-string">'is_pull_request'</span>], | |
| num_rows: <span class="hljs-number">771</span> | |
| })<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15hujmg">我们可以看到,我们的数据集中有很多列,其中大部分在构建我们的搜索引擎都不会使用。从搜索的角度来看,信息量最大的列是 <code>title</code> , <code>body</code> ,和 <code>comments</code> ,而 <code>html_url</code> 为我们提供了一个回到原 issue 的链接。让我们使用 <code>Dataset.remove_columns()</code> 删除其余的列:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->columns = issues_dataset.column_names | |
| columns_to_keep = [<span class="hljs-string">"title"</span>, <span class="hljs-string">"body"</span>, <span class="hljs-string">"html_url"</span>, <span class="hljs-string">"comments"</span>] | |
| columns_to_remove = <span class="hljs-built_in">set</span>(columns_to_keep).symmetric_difference(columns) | |
| issues_dataset = issues_dataset.remove_columns(columns_to_remove) | |
| issues_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({ | |
| features: [<span class="hljs-string">'html_url'</span>, <span class="hljs-string">'title'</span>, <span class="hljs-string">'comments'</span>, <span class="hljs-string">'body'</span>], | |
| num_rows: <span class="hljs-number">771</span> | |
| })<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-13qbpuw">为了创建我们的文本嵌入数据集,我们将用 issue 的标题和正文来扩充每条评论,因为这些字段通常包含有用的上下文信息。因为我们的 <code>comments</code> 列当前是每个 issue 的评论列表,我们需要“重新组合”列,使得每一行都是由一个 <code>(html_url, title, body, comment)</code> 元组组成。在 Pandas 中,我们可以使用 <a href="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.explode.html" rel="nofollow">DataFrame.explode() 函数</a> 完成这个操作 它为类似列表的列中的每个元素创建一个新行,同时复制所有其他列值。让我们首先切换到 Pandas 的 <code>DataFrame</code> 格式:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_dataset.set_format(<span class="hljs-string">"pandas"</span>) | |
| df = issues_dataset[:]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1y6s0lg">如果我们检查这个 <code>DataFrame</code> 的第一行,我们可以看到这个 issue 有四个相关评论:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->df[<span class="hljs-string">"comments"</span>][<span class="hljs-number">0</span>].tolist()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">'the bug code locate in :\r\n if data_args.task_name is not None:\r\n # Downloading and loading a dataset from the hub.\r\n datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir)'</span>, | |
| <span class="hljs-string">'Hi @jinec,\r\n\r\nFrom time to time we get this kind of `ConnectionError` coming from the github.com website: https://raw.githubusercontent.com\r\n\r\nNormally, it should work if you wait a little and then retry.\r\n\r\nCould you please confirm if the problem persists?'</span>, | |
| <span class="hljs-string">'cannot connect,even by Web browser,please check that there is some problems。'</span>, | |
| <span class="hljs-string">'I can access https://raw.githubusercontent.com/huggingface/datasets/1.7.0/datasets/glue/glue.py without problem...'</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-62usvk">我们希望使用 <code>explode()</code> 将这些评论中的每一条都展开成为一行。让我们看看是否可以做到:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->comments_df = df.explode(<span class="hljs-string">"comments"</span>, ignore_index=<span class="hljs-literal">True</span>) | |
| comments_df.head(<span class="hljs-number">4</span>)<!-- HTML_TAG_END --></pre></div> <table border="1" class="dataframe" style="table-layout: fixed; word-wrap:break-word; width: 100%;" data-svelte-h="svelte-1g5whzd"><thead><tr style="text-align: right;"><th></th> <th>html_url</th> <th>title</th> <th>comments</th> <th>body</th></tr></thead> <tbody><tr><th>0</th> <td>https://github.com/huggingface/datasets/issues/2787</td> <td>ConnectionError: Couldn't reach https://raw.githubusercontent.com</td> <td>the bug code locate in :\r\n if data_args.task_name is not None...</td> <td>Hello,\r\nI am trying to run run_glue.py and it gives me this error...</td></tr> <tr><th>1</th> <td>https://github.com/huggingface/datasets/issues/2787</td> <td>ConnectionError: Couldn't reach https://raw.githubusercontent.com</td> <td>Hi @jinec,\r\n\r\nFrom time to time we get this kind of `ConnectionError` coming from the github.com website: https://raw.githubusercontent.com...</td> <td>Hello,\r\nI am trying to run run_glue.py and it gives me this error...</td></tr> <tr><th>2</th> <td>https://github.com/huggingface/datasets/issues/2787</td> <td>ConnectionError: Couldn't reach https://raw.githubusercontent.com</td> <td>cannot connect,even by Web browser,please check that there is some problems。</td> <td>Hello,\r\nI am trying to run run_glue.py and it gives me this error...</td></tr> <tr><th>3</th> <td>https://github.com/huggingface/datasets/issues/2787</td> <td>ConnectionError: Couldn't reach https://raw.githubusercontent.com</td> <td>I can access https://raw.githubusercontent.com/huggingface/datasets/1.7.0/datasets/glue/glue.py without problem...</td> <td>Hello,\r\nI am trying to run run_glue.py and it gives me this error...</td></tr></tbody></table> <p data-svelte-h="svelte-axvn69">非常好,我们可以看到其他三列已经被复制了,并且 <code>comments</code> 列里存放着单独的评论!现在我们已经完成了 Pandas 要完成的部分功能,我们可以通过加载内存中的 <code>DataFrame</code> 快速切换回 <code>Dataset</code> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset | |
| comments_dataset = Dataset.from_pandas(comments_df) | |
| comments_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({ | |
| features: [<span class="hljs-string">'html_url'</span>, <span class="hljs-string">'title'</span>, <span class="hljs-string">'comments'</span>, <span class="hljs-string">'body'</span>], | |
| num_rows: <span class="hljs-number">2842</span> | |
| })<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-x6393a">太好了,我们获取到了几千条的评论!</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-17yt0fw">✏️ <strong>试试看!</strong> 看看你是否可以使用 <code>Dataset.map()</code> 展开 <code>issues_dataset</code> 的 <code>comments</code> 列,这有点棘手;你可能会发现 🤗 Datasets 文档的 <a href="https://huggingface.co/docs/datasets/about_map_batch#batch-mapping" rel="nofollow">“批处理映射(Batch mapping)”</a> 对这个任务很有用。</p></div> <p data-svelte-h="svelte-vg61zh">既然我们每行有一个评论,让我们创建一个新的 <code>comments_length</code> 列来存放每条评论的字数:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->comments_dataset = comments_dataset.<span class="hljs-built_in">map</span>( | |
| <span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">"comment_length"</span>: <span class="hljs-built_in">len</span>(x[<span class="hljs-string">"comments"</span>].split())} | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1v6qqei">我们可以使用这个新列来过滤掉简短的评论,其中通常包括“cc @lewtun”或“谢谢!”之类与我们的搜索引擎无关的内容。筛选的精确数字没有硬性规定,但大约大于 15 个单词似乎是一个不错的选择:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->comments_dataset = comments_dataset.<span class="hljs-built_in">filter</span>(<span class="hljs-keyword">lambda</span> x: x[<span class="hljs-string">"comment_length"</span>] > <span class="hljs-number">15</span>) | |
| comments_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({ | |
| features: [<span class="hljs-string">'html_url'</span>, <span class="hljs-string">'title'</span>, <span class="hljs-string">'comments'</span>, <span class="hljs-string">'body'</span>, <span class="hljs-string">'comment_length'</span>], | |
| num_rows: <span class="hljs-number">2098</span> | |
| })<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-s6nife">稍微清理了我们的数据集后,让我们使用 issue 标题、描述和评论构建一个新的 <code>text</code> 列。像往常一样,我们可以编写一个简单的函数,并将其传递给 <code>Dataset.map()</code> 来完成这些操作</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">concatenate_text</span>(<span class="hljs-params">examples</span>): | |
| <span class="hljs-keyword">return</span> { | |
| <span class="hljs-string">"text"</span>: examples[<span class="hljs-string">"title"</span>] | |
| + <span class="hljs-string">" \n "</span> | |
| + examples[<span class="hljs-string">"body"</span>] | |
| + <span class="hljs-string">" \n "</span> | |
| + examples[<span class="hljs-string">"comments"</span>] | |
| } | |
| comments_dataset = comments_dataset.<span class="hljs-built_in">map</span>(concatenate_text)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-lpyv6q">我们终于准备好创建一些文本嵌入了!让我们继续。</p> <h2 class="relative group"><a id="创建文本嵌入" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#创建文本嵌入"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>创建文本嵌入</span></h2> <p data-svelte-h="svelte-1ou84a4">正如我们在 <a href="/course/chapter2">第二章</a> 学过,我们可以通过使用 <code>AutoModel</code> 类来完成文本嵌入。首先需要做的就是选择一个合适的 checkpoint。幸运的是,有一个名为 <code>sentence-transformers</code> 的库专门用于创建文本嵌入。如库中的 <a href="https://www.sbert.net/examples/applications/semantic-search/README.html#symmetric-vs-asymmetric-semantic-search" rel="nofollow">文档</a> 所述的,我们这次要实现的是非对称语义搜索(asymmetric semantic search),因为我们有一个简短的查询,我们希望在比如 issue 评论等更长的文档中找到其匹配的文本。通过查看 <a href="https://www.sbert.net/docs/pretrained_models.html#model-overview" rel="nofollow">模型概述表</a> 我们可以发现 <code>multi-qa-mpnet-base-dot-v1</code> checkpoint 在语义搜索方面具有最佳性能,因此我们将使用它。我们还将使用这个 checkpoint 加载了对应的 tokenizer :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModel | |
| model_ckpt = <span class="hljs-string">"sentence-transformers/multi-qa-mpnet-base-dot-v1"</span> | |
| tokenizer = AutoTokenizer.from_pretrained(model_ckpt) | |
| model = AutoModel.from_pretrained(model_ckpt)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1qindrm">将模型和输入的文本放到 GPU 上会加速嵌入过程,所以让我们现在就这么做:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| device = torch.device(<span class="hljs-string">"cuda"</span>) | |
| model.to(device)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1o32zfz">根据我们之前的想法,我们希望将我们的 GitHub issue 中的每一条记录转化为一个单一的向量,所以我们需要以某种方式“池化(pool)”或平均每个词的嵌入向量。一种流行的方法是在我们模型的输出上进行 <code>CLS 池化</code> ,我们只需要收集 <code>[CLS]</code> token 的的最后一个隐藏状态。以下函数实现了这个功能:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">cls_pooling</span>(<span class="hljs-params">model_output</span>): | |
| <span class="hljs-keyword">return</span> model_output.last_hidden_state[:, <span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-d2x713">接下来,我们将创建一个辅助函数,它将对一组文档进行 tokenize,然后将张量放在 GPU 上,接着将它们喂给模型,最后对输出进行 CLS 池化:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">get_embeddings</span>(<span class="hljs-params">text_list</span>): | |
| encoded_input = tokenizer( | |
| text_list, padding=<span class="hljs-literal">True</span>, truncation=<span class="hljs-literal">True</span>, return_tensors=<span class="hljs-string">"pt"</span> | |
| ) | |
| encoded_input = {k: v.to(device) <span class="hljs-keyword">for</span> k, v <span class="hljs-keyword">in</span> encoded_input.items()} | |
| model_output = model(**encoded_input) | |
| <span class="hljs-keyword">return</span> cls_pooling(model_output)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1bda0jo">我们可以将第一条数据喂给它并检查输出的形状来测试这个函数是否正常工作:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->embedding = get_embeddings(comments_dataset[<span class="hljs-string">"text"</span>][<span class="hljs-number">0</span>]) | |
| embedding.shape<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torch.Size([<span class="hljs-number">1</span>, <span class="hljs-number">768</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17bz3th">太好了,我们已经将语料库中的第一个条目转换为了一个 768 维向量!我们可以用 <code>Dataset.map()</code> 将我们的 <code>get_embeddings()</code> 函数应用到我们语料库中的每一行,然后创建一个新的 <code>embeddings</code> 列:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->embeddings_dataset = comments_dataset.<span class="hljs-built_in">map</span>( | |
| <span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">"embeddings"</span>: get_embeddings(x[<span class="hljs-string">"text"</span>]).detach().cpu().numpy()[<span class="hljs-number">0</span>]} | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-6mj4g9">请注意,在上面的代码中我们已经将文本嵌入转换为 NumPy 数组——这是因为当我们尝试使用 FAISS 搜索它们时,🤗 Datasets 需要这种格式,让我们继续吧。</p> <h2 class="relative group"><a id="使用 FAISS 进行高效的相似性搜索" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#使用 FAISS 进行高效的相似性搜索"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>使用 FAISS 进行高效的相似性搜索</span></h2> <p data-svelte-h="svelte-cma3ou">现在我们有了一个文本嵌入数据集,我们需要一些方法来搜索它们。为此,我们将使用🤗 Datasets 中一种特殊的数据结构,称为 FAISS 指数。 <a href="https://faiss.ai/" rel="nofollow">FAISS</a> (Facebook AI Similarity Search 的缩写)是一个库,提供了用于快速搜索和聚类嵌入向量的高效算法。</p> <p data-svelte-h="svelte-1wc0p1">FAISS 背后的基本思想是创建一个特殊的数据结构,称为 <code>index(索引)</code> 它可以找到哪些嵌入与输入嵌入相似。在 🤗 Datasets 中创建一个 FAISS index(索引)很简单——我们使用 <code>Dataset.add_faiss_index()</code> 函数并指定我们要索引的数据集的哪一列:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->embeddings_dataset.add_faiss_index(column=<span class="hljs-string">"embeddings"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xjel5k">现在,我们可以使用 <code>Dataset.get_nearest_examples()</code> 函数进行最近邻居查找。让我们通过首先嵌入一个 issue 来测试这一点,如下所示:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->question = <span class="hljs-string">"How can I load a dataset offline?"</span> | |
| question_embedding = get_embeddings([question]).cpu().detach().numpy() | |
| question_embedding.shape<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torch.Size([<span class="hljs-number">1</span>, <span class="hljs-number">768</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-iayzgi">就像对文档进行嵌入一样,我们现在有一个表示查询的 768 维向量,我们可以将其与整个语料库进行比较以找到最相似的嵌入:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->scores, samples = embeddings_dataset.get_nearest_examples( | |
| <span class="hljs-string">"embeddings"</span>, question_embedding, k=<span class="hljs-number">5</span> | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-11tm5f0"><code>Dataset.get_nearest_examples()</code> 函数返回一个元组,包括评分(评价查询和文档之间的相似程度)和对应的样本(这里是 5 个最佳匹配)。让我们把这些收集到一个 <code>pandas.DataFrame</code> ,这样我们就可以轻松地对它们进行排序:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> pandas <span class="hljs-keyword">as</span> pd | |
| samples_df = pd.DataFrame.from_dict(samples) | |
| samples_df[<span class="hljs-string">"scores"</span>] = scores | |
| samples_df.sort_values(<span class="hljs-string">"scores"</span>, ascending=<span class="hljs-literal">False</span>, inplace=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1v4c8w0">现在我们可以遍历前几行来查看我们的查询与评论的匹配程度如何:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">for</span> _, row <span class="hljs-keyword">in</span> samples_df.iterrows(): | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"COMMENT: <span class="hljs-subst">{row.comments}</span>"</span>) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"SCORE: <span class="hljs-subst">{row.scores}</span>"</span>) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"TITLE: <span class="hljs-subst">{row.title}</span>"</span>) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"URL: <span class="hljs-subst">{row.html_url}</span>"</span>) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">"="</span> * <span class="hljs-number">50</span>) | |
| <span class="hljs-built_in">print</span>()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">""" | |
| COMMENT: Requiring online connection is a deal breaker in some cases unfortunately so it'd be great if offline mode is added similar to how `transformers` loads models offline fine. | |
| @mandubian's second bullet point suggests that there's a workaround allowing you to use your offline (custom?) dataset with `datasets`. Could you please elaborate on how that should look like? | |
| SCORE: 25.505046844482422 | |
| TITLE: Discussion using datasets in offline mode | |
| URL: https://github.com/huggingface/datasets/issues/824 | |
| ================================================== | |
| COMMENT: The local dataset builders (csv, text , json and pandas) are now part of the `datasets` package since #1726 :) | |
| You can now use them offline | |
| \`\`\`python | |
| datasets = load_dataset("text", data_files=data_files) | |
| \`\`\` | |
| We'll do a new release soon | |
| SCORE: 24.555509567260742 | |
| TITLE: Discussion using datasets in offline mode | |
| URL: https://github.com/huggingface/datasets/issues/824 | |
| ================================================== | |
| COMMENT: I opened a PR that allows to reload modules that have already been loaded once even if there's no internet. | |
| Let me know if you know other ways that can make the offline mode experience better. I'd be happy to add them :) | |
| I already note the "freeze" modules option, to prevent local modules updates. It would be a cool feature. | |
| ---------- | |
| > @mandubian's second bullet point suggests that there's a workaround allowing you to use your offline (custom?) dataset with `datasets`. Could you please elaborate on how that should look like? | |
| Indeed `load_dataset` allows to load remote dataset script (squad, glue, etc.) but also you own local ones. | |
| For example if you have a dataset script at `./my_dataset/my_dataset.py` then you can do | |
| \`\`\`python | |
| load_dataset("./my_dataset") | |
| \`\`\` | |
| and the dataset script will generate your dataset once and for all. | |
| ---------- | |
| About I'm looking into having `csv`, `json`, `text`, `pandas` dataset builders already included in the `datasets` package, so that they are available offline by default, as opposed to the other datasets that require the script to be downloaded. | |
| cf #1724 | |
| SCORE: 24.14896583557129 | |
| TITLE: Discussion using datasets in offline mode | |
| URL: https://github.com/huggingface/datasets/issues/824 | |
| ================================================== | |
| COMMENT: > here is my way to load a dataset offline, but it **requires** an online machine | |
| > | |
| > 1. (online machine) | |
| > | |
| > ``` | |
| > | |
| > import datasets | |
| > | |
| > data = datasets.load_dataset(.) | |
| > | |
| > data.save_to_disk(/YOUR/DATASET/DIR) | |
| > | |
| > ``` | |
| > | |
| > 2. copy the dir from online to the offline machine | |
| > | |
| > 3. (offline machine) | |
| > | |
| > ``` | |
| > | |
| > import datasets | |
| > | |
| > data = datasets.load_from_disk(/SAVED/DATA/DIR) | |
| > | |
| > ``` | |
| > | |
| > | |
| > | |
| > HTH. | |
| SCORE: 22.893993377685547 | |
| TITLE: Discussion using datasets in offline mode | |
| URL: https://github.com/huggingface/datasets/issues/824 | |
| ================================================== | |
| COMMENT: here is my way to load a dataset offline, but it **requires** an online machine | |
| 1. (online machine) | |
| \`\`\` | |
| import datasets | |
| data = datasets.load_dataset(...) | |
| data.save_to_disk(/YOUR/DATASET/DIR) | |
| \`\`\` | |
| 2. copy the dir from online to the offline machine | |
| 3. (offline machine) | |
| \`\`\` | |
| import datasets | |
| data = datasets.load_from_disk(/SAVED/DATA/DIR) | |
| \`\`\` | |
| HTH. | |
| SCORE: 22.406635284423828 | |
| TITLE: Discussion using datasets in offline mode | |
| URL: https://github.com/huggingface/datasets/issues/824 | |
| ================================================== | |
| """</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1l7ida9">不错!我们的输出的第 2 个结果似乎与查询匹配。</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-sglmwj">✏️ 试试看!创建你自己的查询并查看你是否可以在检索到的文档中找到答案。你可能需要在 <code>Dataset.get_nearest_examples()</code> 增加参数 <code>k</code> 以扩大搜索范围。</p></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/zh-CN/chapter5/6.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_9aawfw = { | |
| assets: "/docs/course/pr_1021/zh-CN", | |
| base: "/docs/course/pr_1021/zh-CN", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/course/pr_1021/zh-CN/_app/immutable/entry/start.f3a1a511.js"), | |
| import("/docs/course/pr_1021/zh-CN/_app/immutable/entry/app.c39e37cf.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 39], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 79.4 kB
- Xet hash:
- 26627d0c9da674fbe66ae1f8038a1c9d34a3467471206f333c9ff3ac39c33717
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.