Question Answering
sanjudebnath commited on
Commit
22f3dba
verified
1 Parent(s): 5c41cbc

Delete load_data.ipynb

Browse files
Files changed (1) hide show
  1. load_data.ipynb +0 -1209
load_data.ipynb DELETED
@@ -1,1209 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "12d87b30",
6
- "metadata": {},
7
- "source": [
8
- "# Load Data\n",
9
- "This notebook loads and preprocesses all necessary data, namely the following.\n",
10
- "* OpenWebTextCorpus: for base DistilBERT model\n",
11
- "* SQuAD dataset: for Q&A\n",
12
- "* Natural Questions (needs to be downloaded externally but is preprocessed here): for Q&A\n",
13
- "* HotPotQA: for Q&A"
14
- ]
15
- },
16
- {
17
- "cell_type": "code",
18
- "execution_count": 4,
19
- "id": "7c82d7fa",
20
- "metadata": {},
21
- "outputs": [],
22
- "source": [
23
- "from tqdm.auto import tqdm\n",
24
- "from datasets import load_dataset\n",
25
- "import os\n",
26
- "import pandas as pd\n",
27
- "import random"
28
- ]
29
- },
30
- {
31
- "cell_type": "markdown",
32
- "id": "1737f219",
33
- "metadata": {},
34
- "source": [
35
- "## Distilbert Data\n",
36
- "In the following, we download the english openwebtext dataset from huggingface (https://huggingface.co/datasets/openwebtext). The dataset is provided by Aaron Gokaslan and Vanya Cohen from Brown University (https://skylion007.github.io/OpenWebTextCorpus/).\n",
37
- "\n",
38
- "We first load the data, investigate the structure and write the dataset into files of 10,000 texts each."
39
- ]
40
- },
41
- {
42
- "cell_type": "code",
43
- "execution_count": null,
44
- "id": "cce7623c",
45
- "metadata": {},
46
- "outputs": [],
47
- "source": [
48
- "ds = load_dataset(\"openwebtext\")"
49
- ]
50
- },
51
- {
52
- "cell_type": "code",
53
- "execution_count": 4,
54
- "id": "678a5e86",
55
- "metadata": {},
56
- "outputs": [
57
- {
58
- "data": {
59
- "text/plain": [
60
- "DatasetDict({\n",
61
- " train: Dataset({\n",
62
- " features: ['text'],\n",
63
- " num_rows: 8013769\n",
64
- " })\n",
65
- "})"
66
- ]
67
- },
68
- "execution_count": 4,
69
- "metadata": {},
70
- "output_type": "execute_result"
71
- }
72
- ],
73
- "source": [
74
- "# we have a text-only training dataset with 8 million entries\n",
75
- "ds"
76
- ]
77
- },
78
- {
79
- "cell_type": "code",
80
- "execution_count": 5,
81
- "id": "b141bce7",
82
- "metadata": {},
83
- "outputs": [],
84
- "source": [
85
- "# create necessary folders\n",
86
- "os.mkdir('data')\n",
87
- "os.mkdir('data/original')"
88
- ]
89
- },
90
- {
91
- "cell_type": "code",
92
- "execution_count": null,
93
- "id": "ca94f995",
94
- "metadata": {},
95
- "outputs": [],
96
- "source": [
97
- "# save text in chunks of 10000 samples\n",
98
- "text = []\n",
99
- "i = 0\n",
100
- "\n",
101
- "for sample in tqdm(ds['train']):\n",
102
- " # replace all newlines\n",
103
- " sample = sample['text'].replace('\\n','')\n",
104
- " \n",
105
- " # append cleaned sample to all texts\n",
106
- " text.append(sample)\n",
107
- " \n",
108
- " # if we processed 10000 samples, write them to a file and start over\n",
109
- " if len(text) == 10000:\n",
110
- " with open(f\"data/original/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
111
- " f.write('\\n'.join(text))\n",
112
- " text = []\n",
113
- " i += 1 \n",
114
- "\n",
115
- "# write remaining samples to a file\n",
116
- "with open(f\"data/original/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
117
- " f.write('\\n'.join(text))"
118
- ]
119
- },
120
- {
121
- "cell_type": "markdown",
122
- "id": "f131dcfc",
123
- "metadata": {},
124
- "source": [
125
- "### Testing\n",
126
- "If we load the first file, we should get a file that is 10000 lines long and has one column\n",
127
- "\n",
128
- "As we do not preprocess the data in any way, but just write the read text into the file, this is all testing necessary"
129
- ]
130
- },
131
- {
132
- "cell_type": "code",
133
- "execution_count": 13,
134
- "id": "df50af74",
135
- "metadata": {},
136
- "outputs": [],
137
- "source": [
138
- "with open(\"data/original/text_0.txt\", 'r', encoding='utf-8') as f:\n",
139
- " lines = f.read().split('\\n')\n",
140
- "lines = pd.DataFrame(lines)"
141
- ]
142
- },
143
- {
144
- "cell_type": "code",
145
- "execution_count": 14,
146
- "id": "8ddb0085",
147
- "metadata": {},
148
- "outputs": [
149
- {
150
- "name": "stdout",
151
- "output_type": "stream",
152
- "text": [
153
- "Passed\n"
154
- ]
155
- }
156
- ],
157
- "source": [
158
- "assert lines.shape==(10000,1)\n",
159
- "print(\"Passed\")"
160
- ]
161
- },
162
- {
163
- "cell_type": "markdown",
164
- "id": "1a65b268",
165
- "metadata": {},
166
- "source": [
167
- "## SQuAD Data\n",
168
- "In the following, we download the SQuAD dataset from huggingface (https://huggingface.co/datasets/squad). It was initially provided by Rajpurkar et al. from Stanford University.\n",
169
- "\n",
170
- "We again load the dataset and store it in chunks of 1000 into files."
171
- ]
172
- },
173
- {
174
- "cell_type": "code",
175
- "execution_count": 6,
176
- "id": "6750ce6e",
177
- "metadata": {},
178
- "outputs": [
179
- {
180
- "ename": "AssertionError",
181
- "evalue": "",
182
- "output_type": "error",
183
- "traceback": [
184
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
185
- "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
186
- "Cell \u001b[0;32mIn [6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m dataset \u001b[38;5;241m=\u001b[39m load_dataset(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msquad\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
187
- "File \u001b[0;32m~/anaconda3/envs/myenv/lib/python3.10/site-packages/datasets/load.py:1670\u001b[0m, in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, **config_kwargs)\u001b[0m\n\u001b[1;32m 1667\u001b[0m ignore_verifications \u001b[38;5;241m=\u001b[39m ignore_verifications \u001b[38;5;129;01mor\u001b[39;00m save_infos\n\u001b[1;32m 1669\u001b[0m \u001b[38;5;66;03m# Create a dataset builder\u001b[39;00m\n\u001b[0;32m-> 1670\u001b[0m builder_instance \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset_builder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1671\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1672\u001b[0m \u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1673\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1674\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1675\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcache_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1676\u001b[0m \u001b[43m \u001b[49m\u001b[43mfeatures\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfeatures\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1677\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1678\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1679\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1680\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_auth_token\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_auth_token\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1681\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mconfig_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1682\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1684\u001b[0m \u001b[38;5;66;03m# Return iterable dataset in case of streaming\u001b[39;00m\n\u001b[1;32m 1685\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m streaming:\n",
188
- "File \u001b[0;32m~/anaconda3/envs/myenv/lib/python3.10/site-packages/datasets/load.py:1447\u001b[0m, in \u001b[0;36mload_dataset_builder\u001b[0;34m(path, name, data_dir, data_files, cache_dir, features, download_config, download_mode, revision, use_auth_token, **config_kwargs)\u001b[0m\n\u001b[1;32m 1445\u001b[0m download_config \u001b[38;5;241m=\u001b[39m download_config\u001b[38;5;241m.\u001b[39mcopy() \u001b[38;5;28;01mif\u001b[39;00m download_config \u001b[38;5;28;01melse\u001b[39;00m DownloadConfig()\n\u001b[1;32m 1446\u001b[0m download_config\u001b[38;5;241m.\u001b[39muse_auth_token \u001b[38;5;241m=\u001b[39m use_auth_token\n\u001b[0;32m-> 1447\u001b[0m dataset_module \u001b[38;5;241m=\u001b[39m \u001b[43mdataset_module_factory\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1448\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1449\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1450\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1451\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1452\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1453\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1454\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1456\u001b[0m \u001b[38;5;66;03m# Get dataset builder class from the processing script\u001b[39;00m\n\u001b[1;32m 1457\u001b[0m builder_cls \u001b[38;5;241m=\u001b[39m 
import_main_class(dataset_module\u001b[38;5;241m.\u001b[39mmodule_path)\n",
189
- "File \u001b[0;32m~/anaconda3/envs/myenv/lib/python3.10/site-packages/datasets/load.py:1172\u001b[0m, in \u001b[0;36mdataset_module_factory\u001b[0;34m(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, **download_kwargs)\u001b[0m\n\u001b[1;32m 1167\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(e1, \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m):\n\u001b[1;32m 1168\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\n\u001b[1;32m 1169\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCouldn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt find a dataset script at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrelative_to_absolute_path(combined_path)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m or any data file in the same directory. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1170\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCouldn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt find \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m on the Hugging Face Hub either: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mtype\u001b[39m(e1)\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me1\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1171\u001b[0m ) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[0;32m-> 1172\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e1 \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28mNone\u001b[39m\n\u001b[1;32m 1173\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1174\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\n\u001b[1;32m 1175\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCouldn\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt find a dataset script at \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrelative_to_absolute_path(combined_path)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m or any data file in the same directory.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1176\u001b[0m )\n",
190
- "File \u001b[0;32m~/anaconda3/envs/myenv/lib/python3.10/site-packages/datasets/load.py:1151\u001b[0m, in \u001b[0;36mdataset_module_factory\u001b[0;34m(path, revision, download_config, download_mode, dynamic_modules_path, data_dir, data_files, **download_kwargs)\u001b[0m\n\u001b[1;32m 1143\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m HubDatasetModuleFactoryWithScript(\n\u001b[1;32m 1144\u001b[0m path,\n\u001b[1;32m 1145\u001b[0m revision\u001b[38;5;241m=\u001b[39mrevision,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1148\u001b[0m dynamic_modules_path\u001b[38;5;241m=\u001b[39mdynamic_modules_path,\n\u001b[1;32m 1149\u001b[0m )\u001b[38;5;241m.\u001b[39mget_module()\n\u001b[1;32m 1150\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1151\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mHubDatasetModuleFactoryWithoutScript\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1152\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1153\u001b[0m \u001b[43m \u001b[49m\u001b[43mrevision\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrevision\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1154\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_dir\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1155\u001b[0m \u001b[43m \u001b[49m\u001b[43mdata_files\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdata_files\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1156\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1157\u001b[0m \u001b[43m \u001b[49m\u001b[43mdownload_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdownload_mode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1158\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mget_module()\n\u001b[1;32m 1159\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m 
\u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e1: \u001b[38;5;66;03m# noqa: all the attempts failed, before raising the error we should check if the module is already cached.\u001b[39;00m\n\u001b[1;32m 1160\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
191
- "File \u001b[0;32m~/anaconda3/envs/myenv/lib/python3.10/site-packages/datasets/load.py:744\u001b[0m, in \u001b[0;36mHubDatasetModuleFactoryWithoutScript.__init__\u001b[0;34m(self, name, revision, data_dir, data_files, download_config, download_mode)\u001b[0m\n\u001b[1;32m 742\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload_config \u001b[38;5;241m=\u001b[39m download_config \u001b[38;5;129;01mor\u001b[39;00m DownloadConfig()\n\u001b[1;32m 743\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdownload_mode \u001b[38;5;241m=\u001b[39m download_mode\n\u001b[0;32m--> 744\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname\u001b[38;5;241m.\u001b[39mcount(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 745\u001b[0m increase_load_count(name, resource_type\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdataset\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
192
- "\u001b[0;31mAssertionError\u001b[0m: "
193
- ]
194
- }
195
- ],
196
- "source": [
197
- "dataset = load_dataset(\"squad\")"
198
- ]
199
- },
200
- {
201
- "cell_type": "code",
202
- "execution_count": null,
203
- "id": "65a7ee23",
204
- "metadata": {},
205
- "outputs": [
206
- {
207
- "ename": "",
208
- "evalue": "",
209
- "output_type": "error",
210
- "traceback": [
211
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
212
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
213
- ]
214
- },
215
- {
216
- "ename": "",
217
- "evalue": "",
218
- "output_type": "error",
219
- "traceback": [
220
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
221
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
222
- ]
223
- }
224
- ],
225
- "source": [
226
- "os.mkdir(\"data/training_squad\")\n",
227
- "os.mkdir(\"data/test_squad\")"
228
- ]
229
- },
230
- {
231
- "cell_type": "code",
232
- "execution_count": null,
233
- "id": "f6ebf63e",
234
- "metadata": {},
235
- "outputs": [
236
- {
237
- "ename": "",
238
- "evalue": "",
239
- "output_type": "error",
240
- "traceback": [
241
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
242
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
243
- ]
244
- },
245
- {
246
- "ename": "",
247
- "evalue": "",
248
- "output_type": "error",
249
- "traceback": [
250
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
251
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
252
- ]
253
- }
254
- ],
255
- "source": [
256
- "# we already have a training and test split. Each sample has an id, title, context, question and answers.\n",
257
- "dataset"
258
- ]
259
- },
260
- {
261
- "cell_type": "code",
262
- "execution_count": null,
263
- "id": "f67ae448",
264
- "metadata": {},
265
- "outputs": [
266
- {
267
- "ename": "",
268
- "evalue": "",
269
- "output_type": "error",
270
- "traceback": [
271
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
272
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
273
- ]
274
- },
275
- {
276
- "ename": "",
277
- "evalue": "",
278
- "output_type": "error",
279
- "traceback": [
280
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
281
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
282
- ]
283
- }
284
- ],
285
- "source": [
286
- "# answers are provided like that - we need to extract answer_end for the model\n",
287
- "dataset['train']['answers'][0]"
288
- ]
289
- },
290
- {
291
- "cell_type": "code",
292
- "execution_count": null,
293
- "id": "101cd650",
294
- "metadata": {},
295
- "outputs": [
296
- {
297
- "ename": "",
298
- "evalue": "",
299
- "output_type": "error",
300
- "traceback": [
301
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
302
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
303
- ]
304
- },
305
- {
306
- "ename": "",
307
- "evalue": "",
308
- "output_type": "error",
309
- "traceback": [
310
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
311
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
312
- ]
313
- }
314
- ],
315
- "source": [
316
- "# column contains the split (either train or validation), save_dir is the directory\n",
317
- "def save_samples(column, save_dir):\n",
318
- " text = []\n",
319
- " i = 0\n",
320
- "\n",
321
- " for sample in tqdm(dataset[column]):\n",
322
- " \n",
323
- " # preprocess the context and question by removing the newlines\n",
324
- " context = sample['context'].replace('\\n','')\n",
325
- " question = sample['question'].replace('\\n','')\n",
326
- "\n",
327
- " # get the answer as text and start character index\n",
328
- " answer_text = sample['answers']['text'][0]\n",
329
- " answer_start = str(sample['answers']['answer_start'][0])\n",
330
- " \n",
331
- " text.append([context, question, answer_text, answer_start])\n",
332
- "\n",
333
- " # we choose chunks of 1000\n",
334
- " if len(text) == 1000:\n",
335
- " with open(f\"data/{save_dir}/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
336
- " f.write(\"\\n\".join([\"\\t\".join(t) for t in text]))\n",
337
- " text = []\n",
338
- " i += 1\n",
339
- "\n",
340
- " # save remaining\n",
341
- " with open(f\"data/{save_dir}/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
342
- " f.write(\"\\n\".join([\"\\t\".join(t) for t in text]))\n",
343
- "\n",
344
- "save_samples(\"train\", \"training_squad\")\n",
345
- "save_samples(\"validation\", \"test_squad\")\n",
346
- " "
347
- ]
348
- },
349
- {
350
- "cell_type": "markdown",
351
- "id": "67044d13",
352
- "metadata": {
353
- "collapsed": false,
354
- "jupyter": {
355
- "outputs_hidden": false
356
- }
357
- },
358
- "source": [
359
- "### Testing\n",
360
- "If we load a file, we should get a file with 1000 lines and 4 columns\n",
361
- "\n",
362
- "Also, we want to assure the correct interval. Hence, the second test."
363
- ]
364
- },
365
- {
366
- "cell_type": "code",
367
- "execution_count": null,
368
- "id": "446281cf",
369
- "metadata": {},
370
- "outputs": [
371
- {
372
- "ename": "",
373
- "evalue": "",
374
- "output_type": "error",
375
- "traceback": [
376
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
377
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
378
- ]
379
- },
380
- {
381
- "ename": "",
382
- "evalue": "",
383
- "output_type": "error",
384
- "traceback": [
385
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
386
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
387
- ]
388
- }
389
- ],
390
- "source": [
391
- "with open(\"data/training_squad/text_0.txt\", 'r', encoding='utf-8') as f:\n",
392
- " lines = f.read().split('\\n')\n",
393
- " \n",
394
- "lines = pd.DataFrame([line.split(\"\\t\") for line in lines], columns=[\"context\", \"question\", \"answer\", \"answer_start\"])"
395
- ]
396
- },
397
- {
398
- "cell_type": "code",
399
- "execution_count": null,
400
- "id": "ccd5c650",
401
- "metadata": {},
402
- "outputs": [
403
- {
404
- "ename": "",
405
- "evalue": "",
406
- "output_type": "error",
407
- "traceback": [
408
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
409
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
410
- ]
411
- },
412
- {
413
- "ename": "",
414
- "evalue": "",
415
- "output_type": "error",
416
- "traceback": [
417
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
418
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
419
- ]
420
- }
421
- ],
422
- "source": [
423
- "assert lines.shape==(1000,4)\n",
424
- "print(\"Passed\")"
425
- ]
426
- },
427
- {
428
- "cell_type": "code",
429
- "execution_count": null,
430
- "id": "2c9e4b70",
431
- "metadata": {},
432
- "outputs": [
433
- {
434
- "ename": "",
435
- "evalue": "",
436
- "output_type": "error",
437
- "traceback": [
438
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
439
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
440
- ]
441
- },
442
- {
443
- "ename": "",
444
- "evalue": "",
445
- "output_type": "error",
446
- "traceback": [
447
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
448
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
449
- ]
450
- }
451
- ],
452
- "source": [
453
- "# we assert that we have the right interval\n",
454
- "for ind, line in lines.iterrows():\n",
455
- " sample = line\n",
456
- " answer_start = int(sample['answer_start'])\n",
457
- " assert sample['context'][answer_start:answer_start+len(sample['answer'])] == sample['answer']\n",
458
- "print(\"Passed\")"
459
- ]
460
- },
461
- {
462
- "cell_type": "markdown",
463
- "id": "02265ace",
464
- "metadata": {},
465
- "source": [
466
- "## Natural Questions Dataset\n",
467
- "* Download from https://ai.google.com/research/NaturalQuestions via gsutil (the one from huggingface has 134.92GB, the one from google cloud is in archives)\n",
468
- "* Use gunzip to get some samples - we then get `.jsonl` files\n",
469
- "* The dataset is a lot more messy, as it is just wikipedia articles with all web artifacts\n",
470
- " * I cleaned the html tags\n",
471
- " * Also I chose a random interval (containing the answer) from the dataset\n",
472
- " * We can't send the whole text into the model anyways"
473
- ]
474
- },
475
- {
476
- "cell_type": "code",
477
- "execution_count": null,
478
- "id": "f3bce0c1",
479
- "metadata": {},
480
- "outputs": [
481
- {
482
- "ename": "",
483
- "evalue": "",
484
- "output_type": "error",
485
- "traceback": [
486
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
487
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
488
- ]
489
- },
490
- {
491
- "ename": "",
492
- "evalue": "",
493
- "output_type": "error",
494
- "traceback": [
495
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
496
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
497
- ]
498
- }
499
- ],
500
- "source": [
501
- "from pathlib import Path\n",
502
- "paths = [str(x) for x in Path('data/natural_questions/v1.0/train/').glob('**/*.jsonl')]"
503
- ]
504
- },
505
- {
506
- "cell_type": "code",
507
- "execution_count": null,
508
- "id": "e9c58c00",
509
- "metadata": {},
510
- "outputs": [
511
- {
512
- "ename": "",
513
- "evalue": "",
514
- "output_type": "error",
515
- "traceback": [
516
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
517
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
518
- ]
519
- },
520
- {
521
- "ename": "",
522
- "evalue": "",
523
- "output_type": "error",
524
- "traceback": [
525
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
526
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
527
- ]
528
- }
529
- ],
530
- "source": [
531
- "os.mkdir(\"data/natural_questions_train\")"
532
- ]
533
- },
534
- {
535
- "cell_type": "code",
536
- "execution_count": null,
537
- "id": "0ed7ba6c",
538
- "metadata": {},
539
- "outputs": [
540
- {
541
- "ename": "",
542
- "evalue": "",
543
- "output_type": "error",
544
- "traceback": [
545
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
546
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
547
- ]
548
- },
549
- {
550
- "ename": "",
551
- "evalue": "",
552
- "output_type": "error",
553
- "traceback": [
554
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
555
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
556
- ]
557
- }
558
- ],
559
- "source": [
560
- "import re\n",
561
- "\n",
562
- "# clean html tags\n",
563
- "CLEANR = re.compile('<.+?>')\n",
564
- "# clean multiple spaces\n",
565
- "CLEANMULTSPACE = re.compile('(\\s)+')\n",
566
- "\n",
567
- "# the function takes an html documents and removes artifacts\n",
568
- "def cleanhtml(raw_html):\n",
569
- " # tags\n",
570
- " cleantext = re.sub(CLEANR, '', raw_html)\n",
571
- " # newlines\n",
572
- " cleantext = cleantext.replace(\"\\n\", '')\n",
573
- " # tabs\n",
574
- " cleantext = cleantext.replace(\"\\t\", '')\n",
575
- " # character encodings\n",
576
- " cleantext = cleantext.replace(\"&#39;\", \"'\")\n",
577
- " cleantext = cleantext.replace(\"&amp;\", \"'\")\n",
578
- " cleantext = cleantext.replace(\"&quot;\", '\"')\n",
579
- " # multiple spaces\n",
580
- " cleantext = re.sub(CLEANMULTSPACE, ' ', cleantext)\n",
581
- " # documents end with this tags, if it is present in the string, cut it off\n",
582
- " idx = cleantext.find(\"<!-- NewPP limit\")\n",
583
- " if idx > -1:\n",
584
- " cleantext = cleantext[:idx]\n",
585
- " return cleantext.strip()"
586
- ]
587
- },
588
- {
589
- "cell_type": "code",
590
- "execution_count": null,
591
- "id": "66ca19ac",
592
- "metadata": {},
593
- "outputs": [
594
- {
595
- "ename": "",
596
- "evalue": "",
597
- "output_type": "error",
598
- "traceback": [
599
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
600
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
601
- ]
602
- },
603
- {
604
- "ename": "",
605
- "evalue": "",
606
- "output_type": "error",
607
- "traceback": [
608
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
609
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
610
- ]
611
- }
612
- ],
613
- "source": [
614
- "import json\n",
615
- "\n",
616
- "# file count\n",
617
- "i = 0\n",
618
- "data = []\n",
619
- "\n",
620
- "# iterate over all json files\n",
621
- "for path in paths:\n",
622
- " print(path)\n",
623
- " # read file and store as list (this requires much memory, as the files are huge)\n",
624
- " with open(path, 'r') as json_file:\n",
625
- " json_list = list(json_file)\n",
626
- " \n",
627
- " # process every context, question, answer pair\n",
628
- " for json_str in json_list:\n",
629
- " result = json.loads(json_str)\n",
630
- "\n",
631
- " # append a question mark - SQuAD questions end with a qm too\n",
632
- " question = result['question_text'] + \"?\"\n",
633
- " \n",
634
- " # some question do not contain an answer - we do not need them\n",
635
- " if(len(result['annotations'][0]['short_answers'])==0):\n",
636
- " continue\n",
637
- "\n",
638
- " # get true start/end byte\n",
639
- " true_start = result['annotations'][0]['short_answers'][0]['start_byte']\n",
640
- " true_end = result['annotations'][0]['short_answers'][0]['end_byte']\n",
641
- "\n",
642
- " # convert to bytes\n",
643
- " byte_encoding = bytes(result['document_html'], encoding='utf-8')\n",
644
- " \n",
645
- " # the document is the whole wikipedia article, we randomly choose an appropriate part (containing the\n",
646
- " # answer): we have 512 tokens as the input for the model - 4000 bytes lead to a good length\n",
647
- " max_back = 3500 if true_start >= 3500 else true_start\n",
648
- " first = random.randint(int(true_start)-max_back, int(true_start))\n",
649
- " end = first + 3500 + true_end - true_start\n",
650
- " \n",
651
- " # get chosen context\n",
652
- " cleanbytes = byte_encoding[first:end]\n",
653
- " # decode back to text - if our end byte is the middle of a word, we ignore it and cut it off\n",
654
- " cleantext = bytes.decode(cleanbytes, errors='ignore')\n",
655
- " # clean html tags\n",
656
- " cleantext = cleanhtml(cleantext)\n",
657
- "\n",
658
- " # find the true answer\n",
659
- " answer_start = cleanbytes.find(byte_encoding[true_start:true_end])\n",
660
- " true_answer = bytes.decode(cleanbytes[answer_start:answer_start+(true_end-true_start)])\n",
661
- " \n",
662
- " # clean html tags\n",
663
- " true_answer = cleanhtml(true_answer)\n",
664
- " \n",
665
- " start_ind = cleantext.find(true_answer)\n",
666
- " \n",
667
- " # If cleaning the string makes the answer not findable skip it\n",
668
- "    # this hardly ever happens, except if there is an immense amount of web artifacts\n",
669
- " if start_ind == -1:\n",
670
- " continue\n",
671
- " \n",
672
- " data.append([cleantext, question, true_answer, str(start_ind)])\n",
673
- "\n",
674
- " if len(data) == 1000:\n",
675
- " with open(f\"data/natural_questions_train/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
676
- " f.write(\"\\n\".join([\"\\t\".join(t) for t in data]))\n",
677
- " i += 1\n",
678
- " data = []\n",
679
- "with open(f\"data/natural_questions_train/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
680
- " f.write(\"\\n\".join([\"\\t\".join(t) for t in data]))"
681
- ]
682
- },
683
- {
684
- "cell_type": "markdown",
685
- "id": "30f26b4e",
686
- "metadata": {},
687
- "source": [
688
- "### Testing\n",
689
- "In the following, we first check if the shape of the file is correct.\n",
690
- "\n",
691
- "Then we iterate over the file and check if the answers according to the file are the same as in the original file."
692
- ]
693
- },
694
- {
695
- "cell_type": "code",
696
- "execution_count": null,
697
- "id": "490ac0db",
698
- "metadata": {},
699
- "outputs": [
700
- {
701
- "ename": "",
702
- "evalue": "",
703
- "output_type": "error",
704
- "traceback": [
705
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
706
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
707
- ]
708
- },
709
- {
710
- "ename": "",
711
- "evalue": "",
712
- "output_type": "error",
713
- "traceback": [
714
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
715
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
716
- ]
717
- }
718
- ],
719
- "source": [
720
- "with open(\"data/natural_questions_train/text_0.txt\", 'r', encoding='utf-8') as f:\n",
721
- " lines = f.read().split('\\n')\n",
722
- " \n",
723
- "lines = pd.DataFrame([line.split(\"\\t\") for line in lines], columns=[\"context\", \"question\", \"answer\", \"answer_start\"])"
724
- ]
725
- },
726
- {
727
- "cell_type": "code",
728
- "execution_count": null,
729
- "id": "0d7cc3ee",
730
- "metadata": {},
731
- "outputs": [
732
- {
733
- "ename": "",
734
- "evalue": "",
735
- "output_type": "error",
736
- "traceback": [
737
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
738
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
739
- ]
740
- },
741
- {
742
- "ename": "",
743
- "evalue": "",
744
- "output_type": "error",
745
- "traceback": [
746
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
747
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
748
- ]
749
- }
750
- ],
751
- "source": [
752
- "assert lines.shape == (1000, 4)\n",
753
- "print(\"Passed\")"
754
- ]
755
- },
756
- {
757
- "cell_type": "code",
758
- "execution_count": null,
759
- "id": "0fd8a854",
760
- "metadata": {},
761
- "outputs": [
762
- {
763
- "ename": "",
764
- "evalue": "",
765
- "output_type": "error",
766
- "traceback": [
767
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
768
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
769
- ]
770
- },
771
- {
772
- "ename": "",
773
- "evalue": "",
774
- "output_type": "error",
775
- "traceback": [
776
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
777
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
778
- ]
779
- }
780
- ],
781
- "source": [
782
- "with open(\"data/natural_questions/v1.0/train/nq-train-00.jsonl\", 'r') as json_file:\n",
783
- " json_list = list(json_file)[:500]\n",
784
- "del json_file"
785
- ]
786
- },
787
- {
788
- "cell_type": "code",
789
- "execution_count": null,
790
- "id": "170bff30",
791
- "metadata": {},
792
- "outputs": [
793
- {
794
- "ename": "",
795
- "evalue": "",
796
- "output_type": "error",
797
- "traceback": [
798
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
799
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
800
- ]
801
- },
802
- {
803
- "ename": "",
804
- "evalue": "",
805
- "output_type": "error",
806
- "traceback": [
807
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
808
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
809
- ]
810
- }
811
- ],
812
- "source": [
813
- "lines_index = 0\n",
814
- "for i in range(len(json_list)):\n",
815
- " result = json.loads(json_list[i])\n",
816
- " \n",
817
- " if(len(result['annotations'][0]['short_answers'])==0):\n",
818
- " pass\n",
819
- " else: \n",
820
- " # assert that the question text is the same\n",
821
- " assert result['question_text'] + \"?\" == lines.loc[lines_index, 'question']\n",
822
- " true_start = result['annotations'][0]['short_answers'][0]['start_byte']\n",
823
- " true_end = result['annotations'][0]['short_answers'][0]['end_byte']\n",
824
- " true_answer = bytes.decode(bytes(result['document_html'], encoding='utf-8')[true_start:true_end])\n",
825
- " \n",
826
- " processed_answer = lines.loc[lines_index, 'answer']\n",
827
- " # assert that the answer is the same\n",
828
- " assert cleanhtml(true_answer) == processed_answer\n",
829
- " \n",
830
- " start_ind = int(lines.loc[lines_index, 'answer_start'])\n",
831
- " # assert that the answer (according to the index) is the same\n",
832
- " assert cleanhtml(true_answer) == lines.loc[lines_index, 'context'][start_ind:start_ind+len(processed_answer)]\n",
833
- " \n",
834
- " lines_index += 1\n",
835
- " \n",
836
- " if lines_index == len(lines):\n",
837
- " break\n",
838
- "print(\"Passed\")"
839
- ]
840
- },
841
- {
842
- "cell_type": "markdown",
843
- "id": "78e6e737",
844
- "metadata": {},
845
- "source": [
846
- "## Hotpot QA"
847
- ]
848
- },
849
- {
850
- "cell_type": "code",
851
- "execution_count": null,
852
- "id": "27efcc8c",
853
- "metadata": {},
854
- "outputs": [
855
- {
856
- "ename": "",
857
- "evalue": "",
858
- "output_type": "error",
859
- "traceback": [
860
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
861
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
862
- ]
863
- },
864
- {
865
- "ename": "",
866
- "evalue": "",
867
- "output_type": "error",
868
- "traceback": [
869
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
870
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
871
- ]
872
- }
873
- ],
874
- "source": [
875
- "ds = load_dataset(\"hotpot_qa\", 'fullwiki')"
876
- ]
877
- },
878
- {
879
- "cell_type": "code",
880
- "execution_count": null,
881
- "id": "1493f21f",
882
- "metadata": {},
883
- "outputs": [
884
- {
885
- "ename": "",
886
- "evalue": "",
887
- "output_type": "error",
888
- "traceback": [
889
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
890
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
891
- ]
892
- },
893
- {
894
- "ename": "",
895
- "evalue": "",
896
- "output_type": "error",
897
- "traceback": [
898
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
899
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
900
- ]
901
- }
902
- ],
903
- "source": [
904
- "ds"
905
- ]
906
- },
907
- {
908
- "cell_type": "code",
909
- "execution_count": null,
910
- "id": "2a047946",
911
- "metadata": {},
912
- "outputs": [
913
- {
914
- "ename": "",
915
- "evalue": "",
916
- "output_type": "error",
917
- "traceback": [
918
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
919
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
920
- ]
921
- },
922
- {
923
- "ename": "",
924
- "evalue": "",
925
- "output_type": "error",
926
- "traceback": [
927
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
928
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
929
- ]
930
- }
931
- ],
932
- "source": [
933
- "os.mkdir('data/hotpotqa_training')\n",
934
- "os.mkdir('data/hotpotqa_test')"
935
- ]
936
- },
937
- {
938
- "cell_type": "code",
939
- "execution_count": null,
940
- "id": "e65b6485",
941
- "metadata": {},
942
- "outputs": [
943
- {
944
- "ename": "",
945
- "evalue": "",
946
- "output_type": "error",
947
- "traceback": [
948
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
949
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
950
- ]
951
- },
952
- {
953
- "ename": "",
954
- "evalue": "",
955
- "output_type": "error",
956
- "traceback": [
957
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
958
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
959
- ]
960
- }
961
- ],
962
- "source": [
963
- "# column contains the split (either train or validation), save_dir is the directory\n",
964
- "def save_samples(column, save_dir):\n",
965
- " text = []\n",
966
- " i = 0\n",
967
- "\n",
968
- " for sample in tqdm(ds[column]):\n",
969
- " \n",
970
- " # preprocess the context and question by removing the newlines\n",
971
- " context = sample['context']['sentences']\n",
972
- " context = \" \".join([\"\".join(sentence) for sentence in context])\n",
973
- " question = sample['question'].replace('\\n','')\n",
974
- " \n",
975
- " # get the answer as text and start character index\n",
976
- " answer_text = sample['answer']\n",
977
- " answer_start = context.find(answer_text)\n",
978
- " if answer_start == -1:\n",
979
- " continue\n",
980
- " \n",
981
- " \n",
982
- " \n",
983
- " if answer_start > 1500:\n",
984
- " first = random.randint(answer_start-1500, answer_start)\n",
985
- " end = first + 1500 + len(answer_text)\n",
986
- " \n",
987
- " context = context[first:end+1]\n",
988
- " answer_start = context.find(answer_text)\n",
989
- " \n",
990
- " if answer_start == -1:continue\n",
991
- " \n",
992
- " text.append([context, question, answer_text, str(answer_start)])\n",
993
- "\n",
994
- " # we choose chunks of 1000\n",
995
- " if len(text) == 1000:\n",
996
- " with open(f\"data/{save_dir}/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
997
- " f.write(\"\\n\".join([\"\\t\".join(t) for t in text]))\n",
998
- " text = []\n",
999
- " i += 1\n",
1000
- "\n",
1001
- " # save remaining\n",
1002
- " with open(f\"data/{save_dir}/text_{i}.txt\", 'w', encoding='utf-8') as f:\n",
1003
- " f.write(\"\\n\".join([\"\\t\".join(t) for t in text]))\n",
1004
- "\n",
1005
- "save_samples(\"train\", \"hotpotqa_training\")\n",
1006
- "save_samples(\"validation\", \"hotpotqa_test\")"
1007
- ]
1008
- },
1009
- {
1010
- "cell_type": "markdown",
1011
- "id": "97cc358f",
1012
- "metadata": {},
1013
- "source": [
1014
- "## Testing"
1015
- ]
1016
- },
1017
- {
1018
- "cell_type": "code",
1019
- "execution_count": null,
1020
- "id": "f321483c",
1021
- "metadata": {},
1022
- "outputs": [
1023
- {
1024
- "ename": "",
1025
- "evalue": "",
1026
- "output_type": "error",
1027
- "traceback": [
1028
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
1029
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1030
- ]
1031
- },
1032
- {
1033
- "ename": "",
1034
- "evalue": "",
1035
- "output_type": "error",
1036
- "traceback": [
1037
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
1038
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1039
- ]
1040
- }
1041
- ],
1042
- "source": [
1043
- "with open(\"data/hotpotqa_training/text_0.txt\", 'r', encoding='utf-8') as f:\n",
1044
- " lines = f.read().split('\\n')\n",
1045
- " \n",
1046
- "lines = pd.DataFrame([line.split(\"\\t\") for line in lines], columns=[\"context\", \"question\", \"answer\", \"answer_start\"])"
1047
- ]
1048
- },
1049
- {
1050
- "cell_type": "code",
1051
- "execution_count": null,
1052
- "id": "72a96e78",
1053
- "metadata": {},
1054
- "outputs": [
1055
- {
1056
- "ename": "",
1057
- "evalue": "",
1058
- "output_type": "error",
1059
- "traceback": [
1060
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
1061
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1062
- ]
1063
- },
1064
- {
1065
- "ename": "",
1066
- "evalue": "",
1067
- "output_type": "error",
1068
- "traceback": [
1069
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
1070
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1071
- ]
1072
- }
1073
- ],
1074
- "source": [
1075
- "assert lines.shape == (1000, 4)\n",
1076
- "print(\"Passed\")"
1077
- ]
1078
- },
1079
- {
1080
- "cell_type": "code",
1081
- "execution_count": null,
1082
- "id": "c32c2f16",
1083
- "metadata": {},
1084
- "outputs": [
1085
- {
1086
- "ename": "",
1087
- "evalue": "",
1088
- "output_type": "error",
1089
- "traceback": [
1090
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
1091
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1092
- ]
1093
- },
1094
- {
1095
- "ename": "",
1096
- "evalue": "",
1097
- "output_type": "error",
1098
- "traceback": [
1099
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
1100
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1101
- ]
1102
- }
1103
- ],
1104
- "source": [
1105
- "# we assert that we have the right interval\n",
1106
- "for ind, line in lines.iterrows():\n",
1107
- " sample = line\n",
1108
- " answer_start = int(sample['answer_start'])\n",
1109
- " assert sample['context'][answer_start:answer_start+len(sample['answer'])] == sample['answer']\n",
1110
- "print(\"Passed\")"
1111
- ]
1112
- },
1113
- {
1114
- "cell_type": "code",
1115
- "execution_count": null,
1116
- "id": "bc36fe7d",
1117
- "metadata": {},
1118
- "outputs": [
1119
- {
1120
- "ename": "",
1121
- "evalue": "",
1122
- "output_type": "error",
1123
- "traceback": [
1124
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
1125
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1126
- ]
1127
- },
1128
- {
1129
- "ename": "",
1130
- "evalue": "",
1131
- "output_type": "error",
1132
- "traceback": [
1133
- "\u001b[1;31mnotebook controller is DISPOSED. \n",
1134
- "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
1135
- ]
1136
- }
1137
- ],
1138
- "source": []
1139
- }
1140
- ],
1141
- "metadata": {
1142
- "kernelspec": {
1143
- "display_name": "Python 3 (ipykernel)",
1144
- "language": "python",
1145
- "name": "python3"
1146
- },
1147
- "language_info": {
1148
- "codemirror_mode": {
1149
- "name": "ipython",
1150
- "version": 3
1151
- },
1152
- "file_extension": ".py",
1153
- "mimetype": "text/x-python",
1154
- "name": "python",
1155
- "nbconvert_exporter": "python",
1156
- "pygments_lexer": "ipython3",
1157
- "version": "3.10.16"
1158
- },
1159
- "toc": {
1160
- "base_numbering": 1,
1161
- "nav_menu": {},
1162
- "number_sections": true,
1163
- "sideBar": true,
1164
- "skip_h1_title": false,
1165
- "title_cell": "Table of Contents",
1166
- "title_sidebar": "Contents",
1167
- "toc_cell": false,
1168
- "toc_position": {},
1169
- "toc_section_display": true,
1170
- "toc_window_display": false
1171
- },
1172
- "varInspector": {
1173
- "cols": {
1174
- "lenName": 16,
1175
- "lenType": 16,
1176
- "lenVar": 40
1177
- },
1178
- "kernels_config": {
1179
- "python": {
1180
- "delete_cmd_postfix": "",
1181
- "delete_cmd_prefix": "del ",
1182
- "library": "var_list.py",
1183
- "varRefreshCmd": "print(var_dic_list())"
1184
- },
1185
- "r": {
1186
- "delete_cmd_postfix": ") ",
1187
- "delete_cmd_prefix": "rm(",
1188
- "library": "var_list.r",
1189
- "varRefreshCmd": "cat(var_dic_list()) "
1190
- }
1191
- },
1192
- "types_to_exclude": [
1193
- "module",
1194
- "function",
1195
- "builtin_function_or_method",
1196
- "instance",
1197
- "_Feature"
1198
- ],
1199
- "window_display": false
1200
- },
1201
- "vscode": {
1202
- "interpreter": {
1203
- "hash": "85bf9c14e9ba73b783ed1274d522bec79eb0b2b739090180d8ce17bb11aff4aa"
1204
- }
1205
- }
1206
- },
1207
- "nbformat": 4,
1208
- "nbformat_minor": 5
1209
- }