franz96521
committed on
Commit
·
a53f410
1
Parent(s):
3393cd3
test
Browse files- .gitattributes +2 -0
- AbstractGenerator.ipynb +434 -0
- AbstractGenerator/TrainigData/en.txt +3 -0
- AbstractGenerator/TrainigData/es.txt +3 -0
- AbstractGenerator/weights/run1/encoder.json +3 -0
- AbstractGenerator/weights/run1/events.out.tfevents.1648184225.FRANZ96521-W11 +3 -0
- AbstractGenerator/weights/run1/events.out.tfevents.1648184499.FRANZ96521-W11 +3 -0
- AbstractGenerator/weights/run1/events.out.tfevents.1648229481.FRANZ96521-W11 +3 -0
- AbstractGenerator/weights/run1/hparams.json +3 -0
- AbstractGenerator/weights/run1/vocab.bpe +3 -0
- Descarga.ipynb +278 -0
- PDF_a_TXT.ipynb +105 -0
- models/124M/checkpoint +3 -0
- models/124M/encoder.json +3 -0
- models/124M/hparams.json +3 -0
- models/124M/model.ckpt.data-00000-of-00001 +3 -0
- models/124M/model.ckpt.index +3 -0
- models/124M/model.ckpt.meta +3 -0
- models/124M/vocab.bpe +3 -0
- txt_to_csv.ipynb +662 -0
.gitattributes
CHANGED
|
@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 25 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 26 |
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 27 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
models/** filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
AbstractGenerator/** filter=lfs diff=lfs merge=lfs -text
|
AbstractGenerator.ipynb
ADDED
|
@@ -0,0 +1,434 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [
|
| 8 |
+
{
|
| 9 |
+
"name": "stdout",
|
| 10 |
+
"output_type": "stream",
|
| 11 |
+
"text": [
|
| 12 |
+
"WARNING:tensorflow:From C:\\Users\\franz\\AppData\\Local\\Temp\\ipykernel_14092\\1198363771.py:6: is_gpu_available (from tensorflow.python.framework.test_util) is deprecated and will be removed in a future version.\n",
|
| 13 |
+
"Instructions for updating:\n",
|
| 14 |
+
"Use `tf.config.list_physical_devices('GPU')` instead.\n",
|
| 15 |
+
"GPU is available\n"
|
| 16 |
+
]
|
| 17 |
+
}
|
| 18 |
+
],
|
| 19 |
+
"source": [
|
| 20 |
+
"\n",
|
| 21 |
+
"import gpt_2_simple as gpt2\n",
|
| 22 |
+
"import os\n",
|
| 23 |
+
"import tensorflow as tf\n",
|
| 24 |
+
"import pandas as pd\n",
|
| 25 |
+
"import re\n",
|
| 26 |
+
"print(\"GPU is\", \"available\" if tf.test.is_gpu_available() else \"NOT AVAILABLE\")"
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"cell_type": "code",
|
| 31 |
+
"execution_count": 2,
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"outputs": [],
|
| 34 |
+
"source": [
|
| 35 |
+
"model_name = \"124M\"\n",
|
| 36 |
+
"if not os.path.isdir(os.path.join(\"models\", model_name)):\n",
|
| 37 |
+
"\tprint(f\"Downloading {model_name} model...\")\n",
|
| 38 |
+
"\tgpt2.download_gpt2(model_name=model_name) "
|
| 39 |
+
]
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"cell_type": "code",
|
| 43 |
+
"execution_count": 3,
|
| 44 |
+
"metadata": {},
|
| 45 |
+
"outputs": [],
|
| 46 |
+
"source": [
|
| 47 |
+
"path = 'AbstractGenerator/'\n",
|
| 48 |
+
"checkpoint_dir =path+'weights/'\n",
|
| 49 |
+
"data_path = path+'TrainigData/'\n",
|
| 50 |
+
"\n",
|
| 51 |
+
"\n",
|
| 52 |
+
"\n",
|
| 53 |
+
"file_name_en = 'en'\n",
|
| 54 |
+
"file_path_en = data_path+file_name_en\n",
|
| 55 |
+
"\n",
|
| 56 |
+
"file_name_es = 'es'\n",
|
| 57 |
+
"file_path_es = data_path+file_name_es\n",
|
| 58 |
+
"\n",
|
| 59 |
+
"\n",
|
| 60 |
+
"prefix= '<|startoftext|>'\n",
|
| 61 |
+
"sufix ='<|endoftext|>'"
|
| 62 |
+
]
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"cell_type": "markdown",
|
| 66 |
+
"metadata": {},
|
| 67 |
+
"source": [
|
| 68 |
+
"# create training data"
|
| 69 |
+
]
|
| 70 |
+
},
|
| 71 |
+
{
|
| 72 |
+
"cell_type": "code",
|
| 73 |
+
"execution_count": 13,
|
| 74 |
+
"metadata": {},
|
| 75 |
+
"outputs": [],
|
| 76 |
+
"source": [
|
| 77 |
+
"en = pd.read_csv('CSV\\scientific_paper_en.csv')[0:1000]\n",
|
| 78 |
+
"es = pd.read_csv('CSV\\scientific_paper_es.csv')[0:1000]"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"cell_type": "code",
|
| 83 |
+
"execution_count": 14,
|
| 84 |
+
"metadata": {},
|
| 85 |
+
"outputs": [],
|
| 86 |
+
"source": [
|
| 87 |
+
"import codecs\n",
|
| 88 |
+
"def createTrainingData(ds,fileName= 'resumen.txt' ,path ='TrainigData/'):\n",
|
| 89 |
+
" with codecs.open(path+fileName,'a','utf-8') as f:\n",
|
| 90 |
+
" for i in ds.index:\n",
|
| 91 |
+
" f.write(prefix+\"\\n\")\n",
|
| 92 |
+
" f.write(ds.iloc[i]['text_no_abstract'])\n",
|
| 93 |
+
" f.write(\"ABSTRACT\\n\")\n",
|
| 94 |
+
" f.write(ds.iloc[i]['abstract']+\"\\n\")\n",
|
| 95 |
+
" f.write(sufix)\n",
|
| 96 |
+
" "
|
| 97 |
+
]
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"cell_type": "code",
|
| 101 |
+
"execution_count": 15,
|
| 102 |
+
"metadata": {},
|
| 103 |
+
"outputs": [],
|
| 104 |
+
"source": [
|
| 105 |
+
"createTrainingData(en,'en.txt',data_path)\n",
|
| 106 |
+
"createTrainingData(es,'es.txt',data_path)"
|
| 107 |
+
]
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"cell_type": "markdown",
|
| 111 |
+
"metadata": {},
|
| 112 |
+
"source": [
|
| 113 |
+
"# pretrained"
|
| 114 |
+
]
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"cell_type": "code",
|
| 118 |
+
"execution_count": null,
|
| 119 |
+
"metadata": {},
|
| 120 |
+
"outputs": [],
|
| 121 |
+
"source": [
|
| 122 |
+
"sess = gpt2.start_tf_sess()\n",
|
| 123 |
+
"gpt2.load_gpt2(sess,checkpoint_dir=checkpoint_dir,run_name='run1')"
|
| 124 |
+
]
|
| 125 |
+
},
|
| 126 |
+
{
|
| 127 |
+
"cell_type": "markdown",
|
| 128 |
+
"metadata": {},
|
| 129 |
+
"source": [
|
| 130 |
+
"# train "
|
| 131 |
+
]
|
| 132 |
+
},
|
| 133 |
+
{
|
| 134 |
+
"cell_type": "code",
|
| 135 |
+
"execution_count": 16,
|
| 136 |
+
"metadata": {},
|
| 137 |
+
"outputs": [],
|
| 138 |
+
"source": [
|
| 139 |
+
"tf.compat.v1.reset_default_graph()\n",
|
| 140 |
+
"sess = gpt2.start_tf_sess()"
|
| 141 |
+
]
|
| 142 |
+
},
|
| 143 |
+
{
|
| 144 |
+
"cell_type": "markdown",
|
| 145 |
+
"metadata": {},
|
| 146 |
+
"source": [
|
| 147 |
+
"## en"
|
| 148 |
+
]
|
| 149 |
+
},
|
| 150 |
+
{
|
| 151 |
+
"cell_type": "code",
|
| 152 |
+
"execution_count": null,
|
| 153 |
+
"metadata": {},
|
| 154 |
+
"outputs": [],
|
| 155 |
+
"source": [
|
| 156 |
+
"gpt2.finetune(sess,\n",
|
| 157 |
+
" file_path_en+'.txt',\n",
|
| 158 |
+
" model_name=model_name,\n",
|
| 159 |
+
" checkpoint_dir=checkpoint_dir, \n",
|
| 160 |
+
" steps=1000\n",
|
| 161 |
+
" ) "
|
| 162 |
+
]
|
| 163 |
+
},
|
| 164 |
+
{
|
| 165 |
+
"cell_type": "markdown",
|
| 166 |
+
"metadata": {},
|
| 167 |
+
"source": [
|
| 168 |
+
"## es"
|
| 169 |
+
]
|
| 170 |
+
},
|
| 171 |
+
{
|
| 172 |
+
"cell_type": "code",
|
| 173 |
+
"execution_count": 17,
|
| 174 |
+
"metadata": {},
|
| 175 |
+
"outputs": [
|
| 176 |
+
{
|
| 177 |
+
"name": "stdout",
|
| 178 |
+
"output_type": "stream",
|
| 179 |
+
"text": [
|
| 180 |
+
"Loading checkpoint models\\124M\\model.ckpt\n",
|
| 181 |
+
"INFO:tensorflow:Restoring parameters from models\\124M\\model.ckpt\n",
|
| 182 |
+
"Loading dataset...\n"
|
| 183 |
+
]
|
| 184 |
+
},
|
| 185 |
+
{
|
| 186 |
+
"name": "stderr",
|
| 187 |
+
"output_type": "stream",
|
| 188 |
+
"text": [
|
| 189 |
+
"100%|██████████| 1/1 [00:51<00:00, 51.03s/it]\n"
|
| 190 |
+
]
|
| 191 |
+
},
|
| 192 |
+
{
|
| 193 |
+
"name": "stdout",
|
| 194 |
+
"output_type": "stream",
|
| 195 |
+
"text": [
|
| 196 |
+
"dataset has 17511492 tokens\n",
|
| 197 |
+
"Training...\n"
|
| 198 |
+
]
|
| 199 |
+
},
|
| 200 |
+
{
|
| 201 |
+
"ename": "ResourceExhaustedError",
|
| 202 |
+
"evalue": "Graph execution error:\n\nfailed to allocate memory\n\t [[{{node model/h10/attn/ArithmeticOptimizer/ReorderCastLikeAndValuePreserving_float_Cast_1}}]]\nHint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.",
|
| 203 |
+
"output_type": "error",
|
| 204 |
+
"traceback": [
|
| 205 |
+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
| 206 |
+
"\u001b[1;31mResourceExhaustedError\u001b[0m Traceback (most recent call last)",
|
| 207 |
+
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\session.py:1377\u001b[0m, in \u001b[0;36mBaseSession._do_call\u001b[1;34m(self, fn, *args)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1375'>1376</a>\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1376'>1377</a>\u001b[0m \u001b[39mreturn\u001b[39;00m fn(\u001b[39m*\u001b[39;49margs)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1377'>1378</a>\u001b[0m \u001b[39mexcept\u001b[39;00m errors\u001b[39m.\u001b[39mOpError \u001b[39mas\u001b[39;00m e:\n",
|
| 208 |
+
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\session.py:1360\u001b[0m, in \u001b[0;36mBaseSession._do_run.<locals>._run_fn\u001b[1;34m(feed_dict, fetch_list, target_list, options, run_metadata)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1358'>1359</a>\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_extend_graph()\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1359'>1360</a>\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_call_tf_sessionrun(options, feed_dict, fetch_list,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1360'>1361</a>\u001b[0m target_list, run_metadata)\n",
|
| 209 |
+
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\session.py:1453\u001b[0m, in \u001b[0;36mBaseSession._call_tf_sessionrun\u001b[1;34m(self, options, feed_dict, fetch_list, target_list, run_metadata)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1450'>1451</a>\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_call_tf_sessionrun\u001b[39m(\u001b[39mself\u001b[39m, options, feed_dict, fetch_list, target_list,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1451'>1452</a>\u001b[0m run_metadata):\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1452'>1453</a>\u001b[0m \u001b[39mreturn\u001b[39;00m tf_session\u001b[39m.\u001b[39;49mTF_SessionRun_wrapper(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_session, options, feed_dict,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1453'>1454</a>\u001b[0m fetch_list, target_list,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1454'>1455</a>\u001b[0m run_metadata)\n",
|
| 210 |
+
"\u001b[1;31mResourceExhaustedError\u001b[0m: failed to allocate memory\n\t [[{{node model/h10/attn/ArithmeticOptimizer/ReorderCastLikeAndValuePreserving_float_Cast_1}}]]\nHint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.\n",
|
| 211 |
+
"\nDuring handling of the above exception, another exception occurred:\n",
|
| 212 |
+
"\u001b[1;31mResourceExhaustedError\u001b[0m Traceback (most recent call last)",
|
| 213 |
+
"\u001b[1;32mc:\\Users\\franz\\OneDrive\\Documentos\\GitHub\\Generador-de-abstracts\\AbstractGenerator.ipynb Cell 15'\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[1;32m----> <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=0'>1</a>\u001b[0m gpt2\u001b[39m.\u001b[39;49mfinetune(sess,\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=1'>2</a>\u001b[0m file_path_es\u001b[39m+\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39m.txt\u001b[39;49m\u001b[39m'\u001b[39;49m,\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=2'>3</a>\u001b[0m model_name\u001b[39m=\u001b[39;49mmodel_name,\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=3'>4</a>\u001b[0m checkpoint_dir\u001b[39m=\u001b[39;49mcheckpoint_dir, \n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=4'>5</a>\u001b[0m steps\u001b[39m=\u001b[39;49m\u001b[39m1000\u001b[39;49m\n\u001b[0;32m <a href='vscode-notebook-cell:/c%3A/Users/franz/OneDrive/Documentos/GitHub/Generador-de-abstracts/AbstractGenerator.ipynb#ch0000014?line=5'>6</a>\u001b[0m )\n",
|
| 214 |
+
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\gpt_2_simple\\gpt_2.py:339\u001b[0m, in \u001b[0;36mfinetune\u001b[1;34m(sess, dataset, steps, model_name, model_dir, combine, batch_size, learning_rate, accumulate_gradients, restore_from, run_name, checkpoint_dir, sample_every, sample_length, sample_num, multi_gpu, save_every, print_every, max_checkpoints, use_memory_saving_gradients, only_train_transformer_layers, optimizer, overwrite, reuse)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/gpt_2.py?line=336'>337</a>\u001b[0m sess\u001b[39m.\u001b[39mrun(opt_reset)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/gpt_2.py?line=337'>338</a>\u001b[0m \u001b[39mfor\u001b[39;00m _ \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(accumulate_gradients):\n\u001b[1;32m--> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/gpt_2.py?line=338'>339</a>\u001b[0m sess\u001b[39m.\u001b[39;49mrun(\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/gpt_2.py?line=339'>340</a>\u001b[0m opt_compute, feed_dict\u001b[39m=\u001b[39;49m{context: sample_batch()})\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/gpt_2.py?line=340'>341</a>\u001b[0m (v_loss, v_summary) \u001b[39m=\u001b[39m sess\u001b[39m.\u001b[39mrun((opt_apply, summary_loss))\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/gpt_2_simple/gpt_2.py?line=341'>342</a>\u001b[0m \u001b[39melse\u001b[39;00m:\n",
|
| 215 |
+
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\session.py:967\u001b[0m, in \u001b[0;36mBaseSession.run\u001b[1;34m(self, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=963'>964</a>\u001b[0m run_metadata_ptr \u001b[39m=\u001b[39m tf_session\u001b[39m.\u001b[39mTF_NewBuffer() \u001b[39mif\u001b[39;00m run_metadata \u001b[39melse\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=965'>966</a>\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=966'>967</a>\u001b[0m result \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_run(\u001b[39mNone\u001b[39;49;00m, fetches, feed_dict, options_ptr,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=967'>968</a>\u001b[0m run_metadata_ptr)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=968'>969</a>\u001b[0m \u001b[39mif\u001b[39;00m run_metadata:\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=969'>970</a>\u001b[0m proto_data \u001b[39m=\u001b[39m tf_session\u001b[39m.\u001b[39mTF_GetBuffer(run_metadata_ptr)\n",
|
| 216 |
+
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\session.py:1190\u001b[0m, in \u001b[0;36mBaseSession._run\u001b[1;34m(self, handle, fetches, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1186'>1187</a>\u001b[0m \u001b[39m# We only want to really perform the run if fetches or targets are provided,\u001b[39;00m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1187'>1188</a>\u001b[0m \u001b[39m# or if the call is a partial run that specifies feeds.\u001b[39;00m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1188'>1189</a>\u001b[0m \u001b[39mif\u001b[39;00m final_fetches \u001b[39mor\u001b[39;00m final_targets \u001b[39mor\u001b[39;00m (handle \u001b[39mand\u001b[39;00m feed_dict_tensor):\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1189'>1190</a>\u001b[0m results \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_do_run(handle, final_targets, final_fetches,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1190'>1191</a>\u001b[0m feed_dict_tensor, options, run_metadata)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1191'>1192</a>\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1192'>1193</a>\u001b[0m results \u001b[39m=\u001b[39m []\n",
|
| 217 |
+
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\session.py:1370\u001b[0m, in \u001b[0;36mBaseSession._do_run\u001b[1;34m(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1366'>1367</a>\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_call_tf_sessionprun(handle, feed_dict, fetch_list)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1368'>1369</a>\u001b[0m \u001b[39mif\u001b[39;00m handle \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1369'>1370</a>\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_do_call(_run_fn, feeds, fetches, targets, options,\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1370'>1371</a>\u001b[0m run_metadata)\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1371'>1372</a>\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1372'>1373</a>\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_do_call(_prun_fn, handle, feeds, fetches)\n",
|
| 218 |
+
"File \u001b[1;32m~\\.conda\\envs\\tf-gpu\\lib\\site-packages\\tensorflow\\python\\client\\session.py:1396\u001b[0m, in \u001b[0;36mBaseSession._do_call\u001b[1;34m(self, fn, *args)\u001b[0m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1390'>1391</a>\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39m'\u001b[39m\u001b[39monly supports NHWC tensor format\u001b[39m\u001b[39m'\u001b[39m \u001b[39min\u001b[39;00m message:\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1391'>1392</a>\u001b[0m message \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m (\u001b[39m'\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mA possible workaround: Try disabling Grappler optimizer\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1392'>1393</a>\u001b[0m \u001b[39m'\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39mby modifying the config for creating the session eg.\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1393'>1394</a>\u001b[0m \u001b[39m'\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39msession_config.graph_options.rewrite_options.\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1394'>1395</a>\u001b[0m \u001b[39m'\u001b[39m\u001b[39mdisable_meta_optimizer = True\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m-> <a href='file:///c%3A/Users/franz/.conda/envs/tf-gpu/lib/site-packages/tensorflow/python/client/session.py?line=1395'>1396</a>\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mtype\u001b[39m(e)(node_def, op, message)\n",
|
| 219 |
+
"\u001b[1;31mResourceExhaustedError\u001b[0m: Graph execution error:\n\nfailed to allocate memory\n\t [[{{node model/h10/attn/ArithmeticOptimizer/ReorderCastLikeAndValuePreserving_float_Cast_1}}]]\nHint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode."
|
| 220 |
+
]
|
| 221 |
+
}
|
| 222 |
+
],
|
| 223 |
+
"source": [
|
| 224 |
+
"gpt2.finetune(sess,\n",
|
| 225 |
+
" file_path_es+'.txt',\n",
|
| 226 |
+
" model_name=model_name,\n",
|
| 227 |
+
" checkpoint_dir=checkpoint_dir, \n",
|
| 228 |
+
" steps=1000\n",
|
| 229 |
+
" ) "
|
| 230 |
+
]
|
| 231 |
+
},
|
| 232 |
+
{
|
| 233 |
+
"cell_type": "markdown",
|
| 234 |
+
"metadata": {},
|
| 235 |
+
"source": [
|
| 236 |
+
"# test"
|
| 237 |
+
]
|
| 238 |
+
},
|
| 239 |
+
{
|
| 240 |
+
"cell_type": "markdown",
|
| 241 |
+
"metadata": {},
|
| 242 |
+
"source": [
|
| 243 |
+
"## en "
|
| 244 |
+
]
|
| 245 |
+
},
|
| 246 |
+
{
|
| 247 |
+
"cell_type": "code",
|
| 248 |
+
"execution_count": null,
|
| 249 |
+
"metadata": {},
|
| 250 |
+
"outputs": [],
|
| 251 |
+
"source": [
|
| 252 |
+
"text = \"\"\"Introduction and preliminaries\n",
|
| 253 |
+
"The focus of this paper is decompositions of (k, `)-sparse graphs into edge-disjoint subgraphs\n",
|
| 254 |
+
"that certify sparsity. We use graph to mean a multigraph, possibly with loops. We say that a\n",
|
| 255 |
+
"graph is (k, `)-sparse if no subset of n′ vertices spans more than kn′− ` edges in the graph; a\n",
|
| 256 |
+
"(k, `)-sparse graph with kn′− ` edges is (k, `)-tight. We call the range k ≤ `≤ 2k−1 the upper\n",
|
| 257 |
+
"range of sparse graphs and 0≤ `≤ k the lower range.\n",
|
| 258 |
+
"In this paper, we present efficient algorithms for finding decompositions that certify sparsity\n",
|
| 259 |
+
"in the upper range of `. Our algorithms also apply in the lower range, which was already ad-\n",
|
| 260 |
+
"dressed by [3, 4, 5, 6, 19]. A decomposition certifies the sparsity of a graph if the sparse graphs\n",
|
| 261 |
+
"and graphs admitting the decomposition coincide.\n",
|
| 262 |
+
"Our algorithms are based on a new characterization of sparse graphs, which we call the\n",
|
| 263 |
+
"pebble game with colors. The pebble game with colors is a simple graph construction rule that\n",
|
| 264 |
+
"produces a sparse graph along with a sparsity-certifying decomposition.\n",
|
| 265 |
+
"We define and study a canonical class of pebble game constructions, which correspond to\n",
|
| 266 |
+
"previously studied decompositions of sparse graphs into edge disjoint trees. Our results provide\n",
|
| 267 |
+
"a unifying framework for all the previously known special cases, including Nash-Williams-\n",
|
| 268 |
+
"Tutte and [7, 24]. Indeed, in the lower range, canonical pebble game constructions capture the\n",
|
| 269 |
+
"properties of the augmenting paths used in matroid union and intersection algorithms[5, 6].\n",
|
| 270 |
+
"Since the sparse graphs in the upper range are not known to be unions or intersections of the\n",
|
| 271 |
+
"matroids for which there are efficient augmenting path algorithms, these do not easily apply in\n",
|
| 272 |
+
"∗ Research of both authors funded by the NSF under grants NSF CCF-0430990 and NSF-DARPA CARGO\n",
|
| 273 |
+
"CCR-0310661 to the first author.\n",
|
| 274 |
+
"2 Ileana Streinu, Louis Theran\n",
|
| 275 |
+
"Term Meaning\n",
|
| 276 |
+
"Sparse graph G Every non-empty subgraph on n′ vertices has ≤ kn′− ` edges\n",
|
| 277 |
+
"Tight graph G G = (V,E) is sparse and |V |= n, |E|= kn− `\n",
|
| 278 |
+
"Block H in G G is sparse, and H is a tight subgraph\n",
|
| 279 |
+
"Component H of G G is sparse and H is a maximal block\n",
|
| 280 |
+
"Map-graph Graph that admits an out-degree-exactly-one orientation\n",
|
| 281 |
+
"(k, `)-maps-and-trees Edge-disjoint union of ` trees and (k− `) map-grpahs\n",
|
| 282 |
+
"`Tk Union of ` trees, each vertex is in exactly k of them\n",
|
| 283 |
+
"Set of tree-pieces of an `Tk induced on V ′ ⊂V Pieces of trees in the `Tk spanned by E(V ′)\n",
|
| 284 |
+
"Proper `Tk Every V ′ ⊂V contains ≥ ` pieces of trees from the `Tk\n",
|
| 285 |
+
"Table 1. Sparse graph and decomposition terminology used in this paper.\n",
|
| 286 |
+
"the upper range. Pebble game with colors constructions may thus be considered a strengthening\n",
|
| 287 |
+
"of augmenting paths to the upper range of matroidal sparse graphs.\n",
|
| 288 |
+
"1.1. Sparse graphs\n",
|
| 289 |
+
"\n",
|
| 290 |
+
"ABSTRACT\n",
|
| 291 |
+
"\"\"\""
|
| 292 |
+
]
|
| 293 |
+
},
|
| 294 |
+
{
|
| 295 |
+
"cell_type": "code",
|
| 296 |
+
"execution_count": null,
|
| 297 |
+
"metadata": {},
|
| 298 |
+
"outputs": [],
|
| 299 |
+
"source": [
|
| 300 |
+
"gpt2.generate(sess,prefix=text,truncate=sufix,checkpoint_dir=checkpoint_dir,nsamples=1)"
|
| 301 |
+
]
|
| 302 |
+
},
|
| 303 |
+
{
|
| 304 |
+
"cell_type": "markdown",
|
| 305 |
+
"metadata": {},
|
| 306 |
+
"source": [
|
| 307 |
+
"## es"
|
| 308 |
+
]
|
| 309 |
+
},
|
| 310 |
+
{
|
| 311 |
+
"cell_type": "code",
|
| 312 |
+
"execution_count": null,
|
| 313 |
+
"metadata": {},
|
| 314 |
+
"outputs": [],
|
| 315 |
+
"source": [
|
| 316 |
+
"text = \"\"\"El foco de este documento son las descomposicións de (k, `)-sparse gráficos en bordes-disjunto subgraphs\n",
|
| 317 |
+
"que certifique la escasez. Usamos el gráfico para significar un múltiplo, posiblemente con bucles. Nosotros decimos que un\n",
|
| 318 |
+
"grafo es (k, `)-sparse si ningún subconjunto de n′ vértices abarca más de kn ` bordes en el gráfico; a\n",
|
| 319 |
+
"(k, `)-sparse gráfico con kn ` bordes es (k, `)-estrechado. Llamamos al rango k ≤ 2k−1 el superior\n",
|
| 320 |
+
"rango de gráficos escasos y 0≤ k el rango inferior.\n",
|
| 321 |
+
"En este artículo, presentamos algoritmos eficientes para encontrar descomposicións que certifiquen la escasez\n",
|
| 322 |
+
"en el rango superior de `. Nuestros algoritmos también se aplican en el rango inferior, que ya era ad-\n",
|
| 323 |
+
"vestido por [3, 4, 5, 6, 19]. Una descomposición certifica la escasez de un gráfico si los gráficos dispersos\n",
|
| 324 |
+
"y los gráficos que admiten la descomposición coinciden.\n",
|
| 325 |
+
"Nuestros algoritmos se basan en una nueva caracterización de gráficos escasos, que llamamos el\n",
|
| 326 |
+
"juego de guijarros con colores. El juego de guijarros con colores es una regla de construcción de gráficos simples que\n",
|
| 327 |
+
"produce un gráfico escaso junto con una descomposición certificadora de la escasez.\n",
|
| 328 |
+
"Definimos y estudiamos una clase canónica de construcciones de juego de guijarros, que corresponden a\n",
|
| 329 |
+
"previamente estudiado las descomposiciones de los gráficos escasos en los árboles disjuntos del borde. Nuestros resultados proporcionan\n",
|
| 330 |
+
"un marco unificador para todos los casos especiales conocidos anteriormente, incluidos Nash-Williams-\n",
|
| 331 |
+
"Tutte y [7, 24]. De hecho, en el rango inferior, las construcciones canónicas de juego de guijarros capturan la\n",
|
| 332 |
+
"propiedades de las rutas de aumento utilizadas en los algoritmos de unión de matroides y de intersección[5, 6].\n",
|
| 333 |
+
"Dado que los gráficos escasos en el rango superior no se sabe que son uniones o intersecciones de la\n",
|
| 334 |
+
"matroides para los que hay algoritmos de ruta de aumento eficiente, estos no se aplican fácilmente en\n",
|
| 335 |
+
"* Investigación de ambos autores financiada por la NSF bajo subvenciones NSF CCF-0430990 y NSF-DARPA CARGO\n",
|
| 336 |
+
"CCR-0310661 al primer autor.\n",
|
| 337 |
+
"2 Ileana Streinu, Louis Theran\n",
|
| 338 |
+
"Significado del término\n",
|
| 339 |
+
"Gráfico escaso G Cada subgrafo no vacío en n′ vértices tiene ≤ kn ` bordes\n",
|
| 340 |
+
"El gráfico ajustado G G = (V,E) es escaso y V = n, E= kn− `\n",
|
| 341 |
+
"El bloque H en G G es escaso, y H es un subgrafo apretado\n",
|
| 342 |
+
"El componente H de G G es escaso y H es un bloqueo máximo\n",
|
| 343 |
+
"Gráfico cartográfico que admite una orientación de grado-exactamente-uno\n",
|
| 344 |
+
"(k, `)-maps-and-trees Edge-disjunt union de ` árboles y (k- `) map-grpahs\n",
|
| 345 |
+
"`Tk Unión de ` árboles, cada vértice está exactamente en k de ellos\n",
|
| 346 |
+
"Conjunto de piezas arbóreas de un `Tk inducido en V ′ ́V Piezas de árboles en el `Tk extendido por E(V ′)\n",
|
| 347 |
+
"`Tk Apropiado Cada V ′ V contiene ≥ ` pedazos de árboles de la `Tk\n",
|
| 348 |
+
"Cuadro 1 Gráfico escaso y terminología de descomposición utilizada en este artículo.\n",
|
| 349 |
+
"el rango superior. Pebble juego con construcciones de colores por lo tanto puede ser considerado un fortalecimiento\n",
|
| 350 |
+
"de caminos de aumento a la gama superior de gráficos de la escasez matroidal.\n",
|
| 351 |
+
"1.1. Gráficos escasos\n",
|
| 352 |
+
"Un gráfico es (k, `)-sparse si para cualquier subgrafo no vacío con bordes m′ y n′ vértices, m′ ≤\n",
|
| 353 |
+
"kn `. Observamos que esta condición implica que 0 ≤ ` ≤ 2k− 1, y a partir de ahora en este\n",
|
| 354 |
+
"Haremos esta suposición. Un gráfico escaso que tiene n vértices y exactamente bordes kn\n",
|
| 355 |
+
"se llama apretado.\n",
|
| 356 |
+
"Para un gráfico G = (V,E), y V ′ V, utilizamos el intervalo de notación (V ′) para el número de bordes\n",
|
| 357 |
+
"en el subgráfico inducido por V ′. En un gráfico dirigido, out(V ′) es el número de bordes con la cola\n",
|
| 358 |
+
"en V ′ y la cabeza en V −V ′; para un subgráfico inducido por V ′, llamamos a tal borde un borde superior.\n",
|
| 359 |
+
"Hay dos tipos importantes de subgrafías de gráficos escasos. Un bloque es un subgrafo apretado de\n",
|
| 360 |
+
"un gráfico escaso. Un componente es un bloque máximo.\n",
|
| 361 |
+
"La Tabla 1 resume la escasa terminología gráfica utilizada en este artículo.\n",
|
| 362 |
+
"1.2. Descomposiciónes de certificación de la sparsidad\n",
|
| 363 |
+
"Un k-arborescencia es un gráfico que admite una descomposición en k borde-desjunto que abarca los árboles.\n",
|
| 364 |
+
"La Figura 1(a) muestra un ejemplo de una 3-arborescencia. Se describen los gráficos k-arborescentes\n",
|
| 365 |
+
"por los conocidos teoremas de Tutte [23] y Nash-Williams [17] como exactamente el (k,k) apretado\n",
|
| 366 |
+
"gráficos.\n",
|
| 367 |
+
"ABSTRACT\n",
|
| 368 |
+
"\"\"\""
|
| 369 |
+
]
|
| 370 |
+
},
|
| 371 |
+
{
|
| 372 |
+
"cell_type": "code",
|
| 373 |
+
"execution_count": null,
|
| 374 |
+
"metadata": {},
|
| 375 |
+
"outputs": [],
|
| 376 |
+
"source": [
|
| 377 |
+
"gpt2.generate(sess,prefix=text,truncate=sufix,checkpoint_dir=checkpoint_dir,nsamples=1)"
|
| 378 |
+
]
|
| 379 |
+
},
|
| 380 |
+
{
|
| 381 |
+
"cell_type": "markdown",
|
| 382 |
+
"metadata": {},
|
| 383 |
+
"source": [
|
| 384 |
+
"# gradio interface"
|
| 385 |
+
]
|
| 386 |
+
},
|
| 387 |
+
{
|
| 388 |
+
"cell_type": "code",
|
| 389 |
+
"execution_count": null,
|
| 390 |
+
"metadata": {},
|
| 391 |
+
"outputs": [],
|
| 392 |
+
"source": [
|
| 393 |
+
"def generateAbstract(text):\n",
|
| 394 |
+
" # with tf.compat.v1.variable_scope(\"weight\", reuse = True):\n",
|
| 395 |
+
" #sess = tf.compat.v1.get_variable('sess',gpt2.start_tf_sess())\n",
|
| 396 |
+
" tf.compat.v1.reset_default_graph()\n",
|
| 397 |
+
" sess = gpt2.start_tf_sess()\n",
|
| 398 |
+
" gpt2.load_gpt2(sess,checkpoint_dir=checkpoint_dir,run_name='run1')\n",
|
| 399 |
+
" txt = gpt2.generate(sess,prefix=str(text)+\"\\nABSTRACT\", return_as_list=True,truncate=sufix,checkpoint_dir=checkpoint_dir,nsamples=1)[0]\n",
|
| 400 |
+
" return str(txt[txt.find('ABSTRACT'):])\n",
|
| 401 |
+
"\n",
|
| 402 |
+
"\n",
|
| 403 |
+
"\n",
|
| 404 |
+
"iface = gr.Interface(fn=generateAbstract, inputs=gr.inputs.Textbox(lines=10, placeholder=\"text\"), outputs=\"textbox\")\n",
|
| 405 |
+
"iface.launch(debug = True )"
|
| 406 |
+
]
|
| 407 |
+
}
|
| 408 |
+
],
|
| 409 |
+
"metadata": {
|
| 410 |
+
"interpreter": {
|
| 411 |
+
"hash": "53fbdc69e3e12c371950068c144423682c30d04ec68c2bd46937202e33e0058d"
|
| 412 |
+
},
|
| 413 |
+
"kernelspec": {
|
| 414 |
+
"display_name": "Python 3.7.11 ('receta')",
|
| 415 |
+
"language": "python",
|
| 416 |
+
"name": "python3"
|
| 417 |
+
},
|
| 418 |
+
"language_info": {
|
| 419 |
+
"codemirror_mode": {
|
| 420 |
+
"name": "ipython",
|
| 421 |
+
"version": 3
|
| 422 |
+
},
|
| 423 |
+
"file_extension": ".py",
|
| 424 |
+
"mimetype": "text/x-python",
|
| 425 |
+
"name": "python",
|
| 426 |
+
"nbconvert_exporter": "python",
|
| 427 |
+
"pygments_lexer": "ipython3",
|
| 428 |
+
"version": "3.9.7"
|
| 429 |
+
},
|
| 430 |
+
"orig_nbformat": 4
|
| 431 |
+
},
|
| 432 |
+
"nbformat": 4,
|
| 433 |
+
"nbformat_minor": 2
|
| 434 |
+
}
|
AbstractGenerator/TrainigData/en.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:814f983aa49ccc33a993a7d12f67a2eb2a7ca0b15d8697e82b50d3a19f3e1595
|
| 3 |
+
size 35400974
|
AbstractGenerator/TrainigData/es.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2454067cfe384e1d824b3f5d29cb5c4e1ff292289ad4b37c6cbd22f5cc715295
|
| 3 |
+
size 44460970
|
AbstractGenerator/weights/run1/encoder.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783
|
| 3 |
+
size 1042301
|
AbstractGenerator/weights/run1/events.out.tfevents.1648184225.FRANZ96521-W11
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:83a88ba7f3268f11289fb24fd13db1367b91acce6466c4ad394011e10ea4c304
|
| 3 |
+
size 82
|
AbstractGenerator/weights/run1/events.out.tfevents.1648184499.FRANZ96521-W11
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cb8646e6bf1e1b8cc26f8128ec4e4c2e797dac297939450a8bf46057e7388a6a
|
| 3 |
+
size 82
|
AbstractGenerator/weights/run1/events.out.tfevents.1648229481.FRANZ96521-W11
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:04d1f71db542da83fee4fe8574bf382cb5324b6decef506206250b8fea85abd0
|
| 3 |
+
size 82
|
AbstractGenerator/weights/run1/hparams.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d9d56e4121c427164e0c55c6f03c08e1daf9002b9b672825112d19097b680318
|
| 3 |
+
size 90
|
AbstractGenerator/weights/run1/vocab.bpe
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5
|
| 3 |
+
size 456318
|
Descarga.ipynb
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 2,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"import pandas as pd\n",
|
| 10 |
+
"import json\n",
|
| 11 |
+
"from pandas import json_normalize\n",
|
| 12 |
+
"import requests\n",
|
| 13 |
+
"from pathlib import Path\n",
|
| 14 |
+
"from multiprocessing.pool import ThreadPool as Pool\n",
|
| 15 |
+
"import codecs\n",
|
| 16 |
+
"import random\n",
|
| 17 |
+
"import re"
|
| 18 |
+
]
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"cell_type": "code",
|
| 22 |
+
"execution_count": 3,
|
| 23 |
+
"metadata": {},
|
| 24 |
+
"outputs": [],
|
| 25 |
+
"source": [
|
| 26 |
+
"URL_BASE = \"https://arxiv.org/pdf/\"\n",
|
| 27 |
+
"PDF_PATH = 'PDF'\n",
|
| 28 |
+
"TXT_PATH= 'TXT'"
|
| 29 |
+
]
|
| 30 |
+
},
|
| 31 |
+
{
|
| 32 |
+
"cell_type": "markdown",
|
| 33 |
+
"metadata": {},
|
| 34 |
+
"source": [
|
| 35 |
+
"# Arxiv\n"
|
| 36 |
+
]
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"cell_type": "code",
|
| 40 |
+
"execution_count": 4,
|
| 41 |
+
"metadata": {},
|
| 42 |
+
"outputs": [
|
| 43 |
+
{
|
| 44 |
+
"name": "stdout",
|
| 45 |
+
"output_type": "stream",
|
| 46 |
+
"text": [
|
| 47 |
+
"<class 'pandas.core.frame.DataFrame'>\n"
|
| 48 |
+
]
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"data": {
|
| 52 |
+
"text/html": [
|
| 53 |
+
"<div>\n",
|
| 54 |
+
"<style scoped>\n",
|
| 55 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 56 |
+
" vertical-align: middle;\n",
|
| 57 |
+
" }\n",
|
| 58 |
+
"\n",
|
| 59 |
+
" .dataframe tbody tr th {\n",
|
| 60 |
+
" vertical-align: top;\n",
|
| 61 |
+
" }\n",
|
| 62 |
+
"\n",
|
| 63 |
+
" .dataframe thead th {\n",
|
| 64 |
+
" text-align: right;\n",
|
| 65 |
+
" }\n",
|
| 66 |
+
"</style>\n",
|
| 67 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 68 |
+
" <thead>\n",
|
| 69 |
+
" <tr style=\"text-align: right;\">\n",
|
| 70 |
+
" <th></th>\n",
|
| 71 |
+
" <th>id</th>\n",
|
| 72 |
+
" <th>title</th>\n",
|
| 73 |
+
" <th>abstract</th>\n",
|
| 74 |
+
" <th>Text</th>\n",
|
| 75 |
+
" </tr>\n",
|
| 76 |
+
" </thead>\n",
|
| 77 |
+
" <tbody>\n",
|
| 78 |
+
" <tr>\n",
|
| 79 |
+
" <th>0</th>\n",
|
| 80 |
+
" <td>0704.0001</td>\n",
|
| 81 |
+
" <td>Calculation of prompt diphoton production cros...</td>\n",
|
| 82 |
+
" <td>A fully differential calculation in perturba...</td>\n",
|
| 83 |
+
" <td></td>\n",
|
| 84 |
+
" </tr>\n",
|
| 85 |
+
" <tr>\n",
|
| 86 |
+
" <th>1</th>\n",
|
| 87 |
+
" <td>0704.0002</td>\n",
|
| 88 |
+
" <td>Sparsity-certifying Graph Decompositions</td>\n",
|
| 89 |
+
" <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n",
|
| 90 |
+
" <td></td>\n",
|
| 91 |
+
" </tr>\n",
|
| 92 |
+
" <tr>\n",
|
| 93 |
+
" <th>2</th>\n",
|
| 94 |
+
" <td>0704.0003</td>\n",
|
| 95 |
+
" <td>The evolution of the Earth-Moon system based o...</td>\n",
|
| 96 |
+
" <td>The evolution of Earth-Moon system is descri...</td>\n",
|
| 97 |
+
" <td></td>\n",
|
| 98 |
+
" </tr>\n",
|
| 99 |
+
" <tr>\n",
|
| 100 |
+
" <th>3</th>\n",
|
| 101 |
+
" <td>0704.0004</td>\n",
|
| 102 |
+
" <td>A determinant of Stirling cycle numbers counts...</td>\n",
|
| 103 |
+
" <td>We show that a determinant of Stirling cycle...</td>\n",
|
| 104 |
+
" <td></td>\n",
|
| 105 |
+
" </tr>\n",
|
| 106 |
+
" <tr>\n",
|
| 107 |
+
" <th>4</th>\n",
|
| 108 |
+
" <td>0704.0005</td>\n",
|
| 109 |
+
" <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
|
| 110 |
+
" <td>In this paper we show how to compute the $\\L...</td>\n",
|
| 111 |
+
" <td></td>\n",
|
| 112 |
+
" </tr>\n",
|
| 113 |
+
" <tr>\n",
|
| 114 |
+
" <th>...</th>\n",
|
| 115 |
+
" <td>...</td>\n",
|
| 116 |
+
" <td>...</td>\n",
|
| 117 |
+
" <td>...</td>\n",
|
| 118 |
+
" <td>...</td>\n",
|
| 119 |
+
" </tr>\n",
|
| 120 |
+
" <tr>\n",
|
| 121 |
+
" <th>1996</th>\n",
|
| 122 |
+
" <td>0704.1997</td>\n",
|
| 123 |
+
" <td>Query on Negative Temperature, Internal Intera...</td>\n",
|
| 124 |
+
" <td>After negative temperature is restated, we f...</td>\n",
|
| 125 |
+
" <td></td>\n",
|
| 126 |
+
" </tr>\n",
|
| 127 |
+
" <tr>\n",
|
| 128 |
+
" <th>1997</th>\n",
|
| 129 |
+
" <td>0704.1998</td>\n",
|
| 130 |
+
" <td>Absence of the Fifth Force Problem in a Model ...</td>\n",
|
| 131 |
+
" <td>A scale invariant model containing dilaton $...</td>\n",
|
| 132 |
+
" <td></td>\n",
|
| 133 |
+
" </tr>\n",
|
| 134 |
+
" <tr>\n",
|
| 135 |
+
" <th>1998</th>\n",
|
| 136 |
+
" <td>0704.1999</td>\n",
|
| 137 |
+
" <td>Dark matter caustics and the enhancement of se...</td>\n",
|
| 138 |
+
" <td>Cold dark matter haloes are populated by cau...</td>\n",
|
| 139 |
+
" <td></td>\n",
|
| 140 |
+
" </tr>\n",
|
| 141 |
+
" <tr>\n",
|
| 142 |
+
" <th>1999</th>\n",
|
| 143 |
+
" <td>0704.2000</td>\n",
|
| 144 |
+
" <td>Search for a Higgs boson produced in associati...</td>\n",
|
| 145 |
+
" <td>We describe a search for the standard model ...</td>\n",
|
| 146 |
+
" <td></td>\n",
|
| 147 |
+
" </tr>\n",
|
| 148 |
+
" <tr>\n",
|
| 149 |
+
" <th>2000</th>\n",
|
| 150 |
+
" <td>0704.2001</td>\n",
|
| 151 |
+
" <td>Geometry of Parallelizable Manifolds in the Co...</td>\n",
|
| 152 |
+
" <td>In this paper, we deal with a generalization...</td>\n",
|
| 153 |
+
" <td></td>\n",
|
| 154 |
+
" </tr>\n",
|
| 155 |
+
" </tbody>\n",
|
| 156 |
+
"</table>\n",
|
| 157 |
+
"<p>2001 rows × 4 columns</p>\n",
|
| 158 |
+
"</div>"
|
| 159 |
+
],
|
| 160 |
+
"text/plain": [
|
| 161 |
+
" id title \\\n",
|
| 162 |
+
"0 0704.0001 Calculation of prompt diphoton production cros... \n",
|
| 163 |
+
"1 0704.0002 Sparsity-certifying Graph Decompositions \n",
|
| 164 |
+
"2 0704.0003 The evolution of the Earth-Moon system based o... \n",
|
| 165 |
+
"3 0704.0004 A determinant of Stirling cycle numbers counts... \n",
|
| 166 |
+
"4 0704.0005 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n",
|
| 167 |
+
"... ... ... \n",
|
| 168 |
+
"1996 0704.1997 Query on Negative Temperature, Internal Intera... \n",
|
| 169 |
+
"1997 0704.1998 Absence of the Fifth Force Problem in a Model ... \n",
|
| 170 |
+
"1998 0704.1999 Dark matter caustics and the enhancement of se... \n",
|
| 171 |
+
"1999 0704.2000 Search for a Higgs boson produced in associati... \n",
|
| 172 |
+
"2000 0704.2001 Geometry of Parallelizable Manifolds in the Co... \n",
|
| 173 |
+
"\n",
|
| 174 |
+
" abstract Text \n",
|
| 175 |
+
"0 A fully differential calculation in perturba... \n",
|
| 176 |
+
"1 We describe a new algorithm, the $(k,\\ell)$-... \n",
|
| 177 |
+
"2 The evolution of Earth-Moon system is descri... \n",
|
| 178 |
+
"3 We show that a determinant of Stirling cycle... \n",
|
| 179 |
+
"4 In this paper we show how to compute the $\\L... \n",
|
| 180 |
+
"... ... ... \n",
|
| 181 |
+
"1996 After negative temperature is restated, we f... \n",
|
| 182 |
+
"1997 A scale invariant model containing dilaton $... \n",
|
| 183 |
+
"1998 Cold dark matter haloes are populated by cau... \n",
|
| 184 |
+
"1999 We describe a search for the standard model ... \n",
|
| 185 |
+
"2000 In this paper, we deal with a generalization... \n",
|
| 186 |
+
"\n",
|
| 187 |
+
"[2001 rows x 4 columns]"
|
| 188 |
+
]
|
| 189 |
+
},
|
| 190 |
+
"execution_count": 4,
|
| 191 |
+
"metadata": {},
|
| 192 |
+
"output_type": "execute_result"
|
| 193 |
+
}
|
| 194 |
+
],
|
| 195 |
+
"source": [
|
| 196 |
+
"data = pd.read_json('ARxiv/arxiv-metadata-oai-snapshot.json',lines=True, chunksize=2001,dtype={'id':'str'})\n",
|
| 197 |
+
"df = None\n",
|
| 198 |
+
"for i in data:\n",
|
| 199 |
+
" df = i \n",
|
| 200 |
+
" print(type(i))\n",
|
| 201 |
+
" break\n",
|
| 202 |
+
"df = df[['id','title','abstract']]\n",
|
| 203 |
+
"df.insert(3, \"Text\", \"\") \n",
|
| 204 |
+
"df"
|
| 205 |
+
]
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"cell_type": "code",
|
| 209 |
+
"execution_count": 7,
|
| 210 |
+
"metadata": {},
|
| 211 |
+
"outputs": [],
|
| 212 |
+
"source": [
|
| 213 |
+
"def GetFileURL(file_id):\n",
|
| 214 |
+
" url = URL_BASE+file_id\n",
|
| 215 |
+
" r = requests.get(url, stream=True) \n",
|
| 216 |
+
" filename = Path(PDF_PATH+'/'+file_id+'.pdf')\n",
|
| 217 |
+
" response = requests.get(url)\n",
|
| 218 |
+
" filename.write_bytes(response.content)"
|
| 219 |
+
]
|
| 220 |
+
},
|
| 221 |
+
{
|
| 222 |
+
"cell_type": "code",
|
| 223 |
+
"execution_count": 35,
|
| 224 |
+
"metadata": {},
|
| 225 |
+
"outputs": [],
|
| 226 |
+
"source": [
|
| 227 |
+
"pool_size = 16 \n",
|
| 228 |
+
"def worker(file):\n",
|
| 229 |
+
" try:\n",
|
| 230 |
+
" GetFileURL(file)\n",
|
| 231 |
+
" except:\n",
|
| 232 |
+
" print('error with item '+ file)\n",
|
| 233 |
+
" try:\n",
|
| 234 |
+
" with codecs.open(PDF_PATH+'/log.txt', 'a') as the_file: \n",
|
| 235 |
+
" the_file.writelines(str(file)+\"\\n\")\n",
|
| 236 |
+
" except:\n",
|
| 237 |
+
" print('error en log '+ file)\n",
|
| 238 |
+
"def get_ids(iteracion,batch=100): \n",
|
| 239 |
+
" inicio = int(iteracion*batch)\n",
|
| 240 |
+
" filesId = data[inicio :inicio + batch]['id']\n",
|
| 241 |
+
" return filesId\n",
|
| 242 |
+
"\n",
|
| 243 |
+
"pool = Pool(pool_size)\n",
|
| 244 |
+
"filesId = get_ids(19)\n",
|
| 245 |
+
"for file in filesId:\n",
|
| 246 |
+
" pool.apply_async(worker, (file,))\n",
|
| 247 |
+
"\n",
|
| 248 |
+
"pool.close()\n",
|
| 249 |
+
"pool.join()"
|
| 250 |
+
]
|
| 251 |
+
}
|
| 252 |
+
],
|
| 253 |
+
"metadata": {
|
| 254 |
+
"interpreter": {
|
| 255 |
+
"hash": "3f7e9d73c32ad96f75174922c475a50b168aad887cbaa14717912a88f31d3802"
|
| 256 |
+
},
|
| 257 |
+
"kernelspec": {
|
| 258 |
+
"display_name": "Python 3.9.7 ('tf-gpu')",
|
| 259 |
+
"language": "python",
|
| 260 |
+
"name": "python3"
|
| 261 |
+
},
|
| 262 |
+
"language_info": {
|
| 263 |
+
"codemirror_mode": {
|
| 264 |
+
"name": "ipython",
|
| 265 |
+
"version": 3
|
| 266 |
+
},
|
| 267 |
+
"file_extension": ".py",
|
| 268 |
+
"mimetype": "text/x-python",
|
| 269 |
+
"name": "python",
|
| 270 |
+
"nbconvert_exporter": "python",
|
| 271 |
+
"pygments_lexer": "ipython3",
|
| 272 |
+
"version": "3.9.7"
|
| 273 |
+
},
|
| 274 |
+
"orig_nbformat": 4
|
| 275 |
+
},
|
| 276 |
+
"nbformat": 4,
|
| 277 |
+
"nbformat_minor": 2
|
| 278 |
+
}
|
PDF_a_TXT.ipynb
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 3,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [
|
| 8 |
+
{
|
| 9 |
+
"name": "stdout",
|
| 10 |
+
"output_type": "stream",
|
| 11 |
+
"text": [
|
| 12 |
+
"Requirement already satisfied: PyPDF2 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (1.26.0)\n",
|
| 13 |
+
"Requirement already satisfied: tika in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (1.24)\n",
|
| 14 |
+
"Requirement already satisfied: requests in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from tika) (2.27.1)\n",
|
| 15 |
+
"Requirement already satisfied: setuptools in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from tika) (58.0.4)\n",
|
| 16 |
+
"Requirement already satisfied: charset-normalizer~=2.0.0 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (2.0.4)\n",
|
| 17 |
+
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (2021.10.8)\n",
|
| 18 |
+
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (3.3)\n",
|
| 19 |
+
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\franz\\.conda\\envs\\tensorflow\\lib\\site-packages (from requests->tika) (1.26.8)\n"
|
| 20 |
+
]
|
| 21 |
+
}
|
| 22 |
+
],
|
| 23 |
+
"source": [
|
| 24 |
+
"! pip install PyPDF2\n",
|
| 25 |
+
"! pip install tika"
|
| 26 |
+
]
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"cell_type": "code",
|
| 30 |
+
"execution_count": 1,
|
| 31 |
+
"metadata": {},
|
| 32 |
+
"outputs": [],
|
| 33 |
+
"source": [
|
| 34 |
+
"from tika import parser\n",
|
| 35 |
+
"import codecs\n",
|
| 36 |
+
"import os"
|
| 37 |
+
]
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"cell_type": "code",
|
| 41 |
+
"execution_count": 2,
|
| 42 |
+
"metadata": {},
|
| 43 |
+
"outputs": [],
|
| 44 |
+
"source": [
|
| 45 |
+
"def obtener_texto(file_path,store_path):\n",
|
| 46 |
+
" file_data = parser.from_file(file_path)\n",
|
| 47 |
+
" output = file_data['content']\n",
|
| 48 |
+
" output = output.strip() \n",
|
| 49 |
+
" output= output.split('\\n')\n",
|
| 50 |
+
" with codecs.open(store_path+'.txt', 'w','utf-8') as the_file: \n",
|
| 51 |
+
" for line in output:\n",
|
| 52 |
+
" #print(line)\n",
|
| 53 |
+
" if len(line)>4: \n",
|
| 54 |
+
" the_file.write(str(line)+'\\n')\n"
|
| 55 |
+
]
|
| 56 |
+
},
|
| 57 |
+
{
|
| 58 |
+
"cell_type": "code",
|
| 59 |
+
"execution_count": 3,
|
| 60 |
+
"metadata": {},
|
| 61 |
+
"outputs": [
|
| 62 |
+
{
|
| 63 |
+
"name": "stderr",
|
| 64 |
+
"output_type": "stream",
|
| 65 |
+
"text": [
|
| 66 |
+
"2022-03-17 17:02:20,018 [MainThread ] [WARNI] Failed to see startup log message; retrying...\n"
|
| 67 |
+
]
|
| 68 |
+
}
|
| 69 |
+
],
|
| 70 |
+
"source": [
|
| 71 |
+
"PDF_PATH = 'PDF'\n",
|
| 72 |
+
"TXT_PATH= 'TXT'\n",
|
| 73 |
+
"files = os.listdir(PDF_PATH)\n",
|
| 74 |
+
"for file in files:\n",
|
| 75 |
+
" obtener_texto(PDF_PATH+'/'+file,TXT_PATH+'/'+file)\n",
|
| 76 |
+
" "
|
| 77 |
+
]
|
| 78 |
+
}
|
| 79 |
+
],
|
| 80 |
+
"metadata": {
|
| 81 |
+
"interpreter": {
|
| 82 |
+
"hash": "3f7e9d73c32ad96f75174922c475a50b168aad887cbaa14717912a88f31d3802"
|
| 83 |
+
},
|
| 84 |
+
"kernelspec": {
|
| 85 |
+
"display_name": "Python 3.9.7 ('tf-gpu')",
|
| 86 |
+
"language": "python",
|
| 87 |
+
"name": "python3"
|
| 88 |
+
},
|
| 89 |
+
"language_info": {
|
| 90 |
+
"codemirror_mode": {
|
| 91 |
+
"name": "ipython",
|
| 92 |
+
"version": 3
|
| 93 |
+
},
|
| 94 |
+
"file_extension": ".py",
|
| 95 |
+
"mimetype": "text/x-python",
|
| 96 |
+
"name": "python",
|
| 97 |
+
"nbconvert_exporter": "python",
|
| 98 |
+
"pygments_lexer": "ipython3",
|
| 99 |
+
"version": "3.9.7"
|
| 100 |
+
},
|
| 101 |
+
"orig_nbformat": 4
|
| 102 |
+
},
|
| 103 |
+
"nbformat": 4,
|
| 104 |
+
"nbformat_minor": 2
|
| 105 |
+
}
|
models/124M/checkpoint
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dd1b025d2e155283f5e300ce95bf6d5b6bc0f7fe010db73daa6975eb896ab9cb
|
| 3 |
+
size 77
|
models/124M/encoder.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783
|
| 3 |
+
size 1042301
|
models/124M/hparams.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d9d56e4121c427164e0c55c6f03c08e1daf9002b9b672825112d19097b680318
|
| 3 |
+
size 90
|
models/124M/model.ckpt.data-00000-of-00001
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2060c885360cc0cf41d7a6dbc4d24b5127aae20260c8b5ae521b5a6578407118
|
| 3 |
+
size 497759232
|
models/124M/model.ckpt.index
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:71916f763f9746f9b2a06b12d91996cf1084ae008d0424543d39391c5f2dc687
|
| 3 |
+
size 5215
|
models/124M/model.ckpt.meta
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4668c448fa11531fd6700460487f73e82d3272960cea942252f8744bf225c77b
|
| 3 |
+
size 471155
|
models/124M/vocab.bpe
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5
|
| 3 |
+
size 456318
|
txt_to_csv.ipynb
ADDED
|
@@ -0,0 +1,662 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": null,
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"outputs": [],
|
| 8 |
+
"source": [
|
| 9 |
+
"import pandas as pd\n",
|
| 10 |
+
"import os\n",
|
| 11 |
+
"from easynmt import EasyNMT\n"
|
| 12 |
+
]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"cell_type": "code",
|
| 16 |
+
"execution_count": null,
|
| 17 |
+
"metadata": {},
|
| 18 |
+
"outputs": [],
|
| 19 |
+
"source": [
|
| 20 |
+
"URL_BASE = \"https://arxiv.org/pdf/\"\n",
|
| 21 |
+
"PDF_PATH = 'PDF'\n",
|
| 22 |
+
"TXT_PATH= 'TXT'\n",
|
| 23 |
+
"CSV_PATH = 'CSV'"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"cell_type": "markdown",
|
| 28 |
+
"metadata": {},
|
| 29 |
+
"source": [
|
| 30 |
+
"# Get Data from TXT"
|
| 31 |
+
]
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"cell_type": "code",
|
| 35 |
+
"execution_count": null,
|
| 36 |
+
"metadata": {},
|
| 37 |
+
"outputs": [],
|
| 38 |
+
"source": [
|
| 39 |
+
"data = pd.read_json('ARxiv/arxiv-metadata-oai-snapshot.json',lines=True, chunksize=2001,dtype={'id':'str'})\n",
|
| 40 |
+
"df = None\n",
|
| 41 |
+
"for i in data:\n",
|
| 42 |
+
" df = i \n",
|
| 43 |
+
" print(type(i))\n",
|
| 44 |
+
" break\n",
|
| 45 |
+
"df = df[['id','title','abstract']]\n"
|
| 46 |
+
]
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"cell_type": "code",
|
| 50 |
+
"execution_count": null,
|
| 51 |
+
"metadata": {},
|
| 52 |
+
"outputs": [],
|
| 53 |
+
"source": [
|
| 54 |
+
"for file in df['id']:\n",
|
| 55 |
+
" file_path = TXT_PATH+'/'+str(file)+'.pdf.txt'\n",
|
| 56 |
+
" if os.path.isfile(file_path):\n",
|
| 57 |
+
" with open(file_path,'r',encoding='utf8') as f:\n",
|
| 58 |
+
" s =str( f.read()) \n",
|
| 59 |
+
" df.loc[df['id'] == str(file),'full_text'] = s "
|
| 60 |
+
]
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"cell_type": "code",
|
| 64 |
+
"execution_count": null,
|
| 65 |
+
"metadata": {},
|
| 66 |
+
"outputs": [],
|
| 67 |
+
"source": [
|
| 68 |
+
"df = df.dropna()\n",
|
| 69 |
+
"df.reset_index()\n",
|
| 70 |
+
"df.to_csv(CSV_PATH+'/scientific_paper_en.csv',index=False,encoding='utf-8')\n",
|
| 71 |
+
"df"
|
| 72 |
+
]
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"cell_type": "markdown",
|
| 76 |
+
"metadata": {},
|
| 77 |
+
"source": [
|
| 78 |
+
"# First run only: seed the working CSV from the English CSV\n"
|
| 79 |
+
]
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"cell_type": "code",
|
| 83 |
+
"execution_count": null,
|
| 84 |
+
"metadata": {},
|
| 85 |
+
"outputs": [],
|
| 86 |
+
"source": [
|
| 87 |
+
"df = pd.read_csv(CSV_PATH +'/scientific_paper_en.csv',dtype={'id':'str'})\n",
|
| 88 |
+
"df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')"
|
| 89 |
+
]
|
| 90 |
+
},
|
| 91 |
+
{
|
| 92 |
+
"cell_type": "markdown",
|
| 93 |
+
"metadata": {},
|
| 94 |
+
"source": [
|
| 95 |
+
"# Read data"
|
| 96 |
+
]
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"cell_type": "code",
|
| 100 |
+
"execution_count": null,
|
| 101 |
+
"metadata": {},
|
| 102 |
+
"outputs": [],
|
| 103 |
+
"source": [
|
| 104 |
+
"df = pd.read_csv(CSV_PATH +'/scientific_paper_full_text_translated.csv',dtype={'id':'str'})\n",
|
| 105 |
+
"print(len(df.index))\n",
|
| 106 |
+
"df"
|
| 107 |
+
]
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
"cell_type": "markdown",
|
| 111 |
+
"metadata": {},
|
| 112 |
+
"source": [
|
| 113 |
+
"# Translate to Spanish"
|
| 114 |
+
]
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"cell_type": "code",
|
| 118 |
+
"execution_count": null,
|
| 119 |
+
"metadata": {},
|
| 120 |
+
"outputs": [],
|
| 121 |
+
"source": [
|
| 122 |
+
"model = EasyNMT('opus-mt')"
|
| 123 |
+
]
|
| 124 |
+
},
|
| 125 |
+
{
|
| 126 |
+
"cell_type": "markdown",
|
| 127 |
+
"metadata": {},
|
| 128 |
+
"source": [
|
| 129 |
+
"## translate full text"
|
| 130 |
+
]
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"cell_type": "code",
|
| 134 |
+
"execution_count": null,
|
| 135 |
+
"metadata": {},
|
| 136 |
+
"outputs": [],
|
| 137 |
+
"source": [
|
| 138 |
+
"max = len(df.index)\n",
|
| 139 |
+
"for i in range(0,1754):\n",
|
| 140 |
+
" text = df.iloc[i]['full_text']\n",
|
| 141 |
+
" translated_text = model.translate(text, target_lang='es')\n",
|
| 142 |
+
" df.loc[i,'translated'] = translated_text \n",
|
| 143 |
+
" print(\"listo documento \",i)\n",
|
| 144 |
+
" if(i%10==0):\n",
|
| 145 |
+
" df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')\n",
|
| 146 |
+
"df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')"
|
| 147 |
+
]
|
| 148 |
+
},
|
| 149 |
+
{
|
| 150 |
+
"cell_type": "markdown",
|
| 151 |
+
"metadata": {},
|
| 152 |
+
"source": [
|
| 153 |
+
"## translate abstract"
|
| 154 |
+
]
|
| 155 |
+
},
|
| 156 |
+
{
|
| 157 |
+
"cell_type": "code",
|
| 158 |
+
"execution_count": null,
|
| 159 |
+
"metadata": {},
|
| 160 |
+
"outputs": [],
|
| 161 |
+
"source": [
|
| 162 |
+
"max = len(df.index)\n",
|
| 163 |
+
"for i in range(0,1754):\n",
|
| 164 |
+
" text = df.iloc[i]['abstract']\n",
|
| 165 |
+
" translated_text = model.translate(text, target_lang='es')\n",
|
| 166 |
+
" df.loc[i,'translated_abstract'] = translated_text \n",
|
| 167 |
+
" print(\"listo documento \",i)\n",
|
| 168 |
+
" if(i%100==0):\n",
|
| 169 |
+
" df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')\n",
|
| 170 |
+
"df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')\n",
|
| 171 |
+
"\n"
|
| 172 |
+
]
|
| 173 |
+
},
|
| 174 |
+
{
|
| 175 |
+
"cell_type": "markdown",
|
| 176 |
+
"metadata": {},
|
| 177 |
+
"source": [
|
| 178 |
+
"# Remove abstract (keep the text from the first section marker onward)"
|
| 179 |
+
]
|
| 180 |
+
},
|
| 181 |
+
{
|
| 182 |
+
"cell_type": "code",
|
| 183 |
+
"execution_count": null,
|
| 184 |
+
"metadata": {},
|
| 185 |
+
"outputs": [],
|
| 186 |
+
"source": [
|
| 187 |
+
"max = len(df.index)-1"
|
| 188 |
+
]
|
| 189 |
+
},
|
| 190 |
+
{
|
| 191 |
+
"cell_type": "code",
|
| 192 |
+
"execution_count": null,
|
| 193 |
+
"metadata": {},
|
| 194 |
+
"outputs": [],
|
| 195 |
+
"source": [
|
| 196 |
+
"end = 'Introducción'\n",
|
| 197 |
+
"for i in range(0,max):\n",
|
| 198 |
+
" text = df.iloc[i]['translated'] \n",
|
| 199 |
+
" p = text.find(end)\n",
|
| 200 |
+
" if(p != -1): \n",
|
| 201 |
+
" df.loc[i,'translated_no_abstract'] = text[p:] \n",
|
| 202 |
+
" else:\n",
|
| 203 |
+
" df.loc[i,'translated_no_abstract']= text\n",
|
| 204 |
+
" print(\"listo documento \",i,p)\n",
|
| 205 |
+
" if(i%1000==0):\n",
|
| 206 |
+
" df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')\n",
|
| 207 |
+
"df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')"
|
| 208 |
+
]
|
| 209 |
+
},
|
| 210 |
+
{
|
| 211 |
+
"cell_type": "code",
|
| 212 |
+
"execution_count": null,
|
| 213 |
+
"metadata": {},
|
| 214 |
+
"outputs": [],
|
| 215 |
+
"source": [
|
| 216 |
+
"end = 'Abstract'\n",
|
| 217 |
+
"for i in range(0,max):\n",
|
| 218 |
+
" text = df.iloc[i]['full_text'] \n",
|
| 219 |
+
" p = text.find(end)\n",
|
| 220 |
+
" if(p != -1): \n",
|
| 221 |
+
" df.loc[i,'text_no_abstract'] = text[p:] \n",
|
| 222 |
+
" else:\n",
|
| 223 |
+
" df.loc[i,'text_no_abstract']= text \n",
|
| 224 |
+
" if(i%1000==0):\n",
|
| 225 |
+
" df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')\n",
|
| 226 |
+
"df.to_csv(CSV_PATH+'/scientific_paper_full_text_translated.csv',index=False,encoding='utf-8')"
|
| 227 |
+
]
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"cell_type": "markdown",
|
| 231 |
+
"metadata": {},
|
| 232 |
+
"source": [
|
| 233 |
+
"# Split data into per-language CSV files"
|
| 234 |
+
]
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"cell_type": "code",
|
| 238 |
+
"execution_count": null,
|
| 239 |
+
"metadata": {},
|
| 240 |
+
"outputs": [],
|
| 241 |
+
"source": [
|
| 242 |
+
"df = pd.read_csv(CSV_PATH +'/scientific_paper_full_text_translated.csv',dtype={'id':'str'})\n",
|
| 243 |
+
"df"
|
| 244 |
+
]
|
| 245 |
+
},
|
| 246 |
+
{
|
| 247 |
+
"cell_type": "code",
|
| 248 |
+
"execution_count": 77,
|
| 249 |
+
"metadata": {},
|
| 250 |
+
"outputs": [
|
| 251 |
+
{
|
| 252 |
+
"data": {
|
| 253 |
+
"text/html": [
|
| 254 |
+
"<div>\n",
|
| 255 |
+
"<style scoped>\n",
|
| 256 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 257 |
+
" vertical-align: middle;\n",
|
| 258 |
+
" }\n",
|
| 259 |
+
"\n",
|
| 260 |
+
" .dataframe tbody tr th {\n",
|
| 261 |
+
" vertical-align: top;\n",
|
| 262 |
+
" }\n",
|
| 263 |
+
"\n",
|
| 264 |
+
" .dataframe thead th {\n",
|
| 265 |
+
" text-align: right;\n",
|
| 266 |
+
" }\n",
|
| 267 |
+
"</style>\n",
|
| 268 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 269 |
+
" <thead>\n",
|
| 270 |
+
" <tr style=\"text-align: right;\">\n",
|
| 271 |
+
" <th></th>\n",
|
| 272 |
+
" <th>id</th>\n",
|
| 273 |
+
" <th>title</th>\n",
|
| 274 |
+
" <th>full_text</th>\n",
|
| 275 |
+
" <th>abstract</th>\n",
|
| 276 |
+
" <th>text_no_abstract</th>\n",
|
| 277 |
+
" </tr>\n",
|
| 278 |
+
" </thead>\n",
|
| 279 |
+
" <tbody>\n",
|
| 280 |
+
" <tr>\n",
|
| 281 |
+
" <th>0</th>\n",
|
| 282 |
+
" <td>0704.0002</td>\n",
|
| 283 |
+
" <td>Sparsity-certifying Graph Decompositions</td>\n",
|
| 284 |
+
" <td>Descomposiciones del gráfico de certificación ...</td>\n",
|
| 285 |
+
" <td>Describimos un nuevo algoritmo, el juego de ...</td>\n",
|
| 286 |
+
" <td>Introducción y preliminares\\nEl foco de este d...</td>\n",
|
| 287 |
+
" </tr>\n",
|
| 288 |
+
" <tr>\n",
|
| 289 |
+
" <th>1</th>\n",
|
| 290 |
+
" <td>0704.0003</td>\n",
|
| 291 |
+
" <td>The evolution of the Earth-Moon system based o...</td>\n",
|
| 292 |
+
" <td>La evolución del sistema Tierra-Luna basado en...</td>\n",
|
| 293 |
+
" <td>La evolución del sistema Tierra-Luna es desc...</td>\n",
|
| 294 |
+
" <td>Introducción \\nLa teoría aceptada popularmente...</td>\n",
|
| 295 |
+
" </tr>\n",
|
| 296 |
+
" <tr>\n",
|
| 297 |
+
" <th>2</th>\n",
|
| 298 |
+
" <td>0704.0004</td>\n",
|
| 299 |
+
" <td>A determinant of Stirling cycle numbers counts...</td>\n",
|
| 300 |
+
" <td>Un determinante de los números de ciclo de Sti...</td>\n",
|
| 301 |
+
" <td>Demostramos que un determinante de los númer...</td>\n",
|
| 302 |
+
" <td>Introducción El propósito principal de este ar...</td>\n",
|
| 303 |
+
" </tr>\n",
|
| 304 |
+
" <tr>\n",
|
| 305 |
+
" <th>3</th>\n",
|
| 306 |
+
" <td>0704.0005</td>\n",
|
| 307 |
+
" <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
|
| 308 |
+
" <td>DE DÍA A DÍA\\nWAEL ABU-SHAMMALA Y ALBERTO TORC...</td>\n",
|
| 309 |
+
" <td>En este artículo mostramos cómo calcular la ...</td>\n",
|
| 310 |
+
" <td>DE DÍA A DÍA\\nWAEL ABU-SHAMMALA Y ALBERTO TORC...</td>\n",
|
| 311 |
+
" </tr>\n",
|
| 312 |
+
" <tr>\n",
|
| 313 |
+
" <th>4</th>\n",
|
| 314 |
+
" <td>0704.0007</td>\n",
|
| 315 |
+
" <td>Polymer Quantum Mechanics and its Continuum Limit</td>\n",
|
| 316 |
+
" <td>La mecánica cuántica de polímeros y su límite ...</td>\n",
|
| 317 |
+
" <td>Una representación cuántica no estándar de l...</td>\n",
|
| 318 |
+
" <td>La mecánica cuántica de polímeros y su límite ...</td>\n",
|
| 319 |
+
" </tr>\n",
|
| 320 |
+
" <tr>\n",
|
| 321 |
+
" <th>...</th>\n",
|
| 322 |
+
" <td>...</td>\n",
|
| 323 |
+
" <td>...</td>\n",
|
| 324 |
+
" <td>...</td>\n",
|
| 325 |
+
" <td>...</td>\n",
|
| 326 |
+
" <td>...</td>\n",
|
| 327 |
+
" </tr>\n",
|
| 328 |
+
" <tr>\n",
|
| 329 |
+
" <th>1749</th>\n",
|
| 330 |
+
" <td>0704.1996</td>\n",
|
| 331 |
+
" <td>A Wave-function for Stringy Universes</td>\n",
|
| 332 |
+
" <td>LPTENS–07/16\\nAbril de 2007\\nUna función de on...</td>\n",
|
| 333 |
+
" <td>Definimos una función de onda para los fondo...</td>\n",
|
| 334 |
+
" <td>Introducción\\nNuestro objetivo en este documen...</td>\n",
|
| 335 |
+
" </tr>\n",
|
| 336 |
+
" <tr>\n",
|
| 337 |
+
" <th>1750</th>\n",
|
| 338 |
+
" <td>0704.1997</td>\n",
|
| 339 |
+
" <td>Query on Negative Temperature, Internal Intera...</td>\n",
|
| 340 |
+
" <td>Microsoft Word - negEntr.doc\\nConsulta sobre t...</td>\n",
|
| 341 |
+
" <td>Después de que la temperatura negativa se vu...</td>\n",
|
| 342 |
+
" <td>Microsoft Word - negEntr.doc\\nConsulta sobre t...</td>\n",
|
| 343 |
+
" </tr>\n",
|
| 344 |
+
" <tr>\n",
|
| 345 |
+
" <th>1751</th>\n",
|
| 346 |
+
" <td>0704.1998</td>\n",
|
| 347 |
+
" <td>Absence of the Fifth Force Problem in a Model ...</td>\n",
|
| 348 |
+
" <td>Ausencia del problema de la quinta fuerza en u...</td>\n",
|
| 349 |
+
" <td>Un modelo de escala invariante que contiene ...</td>\n",
|
| 350 |
+
" <td>Introducción\\n\\tBase de Dos Medidas Teoría de ...</td>\n",
|
| 351 |
+
" </tr>\n",
|
| 352 |
+
" <tr>\n",
|
| 353 |
+
" <th>1752</th>\n",
|
| 354 |
+
" <td>0704.1999</td>\n",
|
| 355 |
+
" <td>Dark matter caustics and the enhancement of se...</td>\n",
|
| 356 |
+
" <td>Proyecto de versión 16 de noviembre de 2018\\nT...</td>\n",
|
| 357 |
+
" <td>Los haloes fríos de materia oscura están pob...</td>\n",
|
| 358 |
+
" <td>Proyecto de versión 16 de noviembre de 2018\\nT...</td>\n",
|
| 359 |
+
" </tr>\n",
|
| 360 |
+
" <tr>\n",
|
| 361 |
+
" <th>1753</th>\n",
|
| 362 |
+
" <td>0704.2000</td>\n",
|
| 363 |
+
" <td>Search for a Higgs boson produced in associati...</td>\n",
|
| 364 |
+
" <td>FERMILAB-PUB-07/076-E\\nBúsqueda de un bosón Hi...</td>\n",
|
| 365 |
+
" <td>Describimos una búsqueda para el modelo está...</td>\n",
|
| 366 |
+
" <td>FERMILAB-PUB-07/076-E\\nBúsqueda de un bosón Hi...</td>\n",
|
| 367 |
+
" </tr>\n",
|
| 368 |
+
" </tbody>\n",
|
| 369 |
+
"</table>\n",
|
| 370 |
+
"<p>1754 rows × 5 columns</p>\n",
|
| 371 |
+
"</div>"
|
| 372 |
+
],
|
| 373 |
+
"text/plain": [
|
| 374 |
+
" id title \\\n",
|
| 375 |
+
"0 0704.0002 Sparsity-certifying Graph Decompositions \n",
|
| 376 |
+
"1 0704.0003 The evolution of the Earth-Moon system based o... \n",
|
| 377 |
+
"2 0704.0004 A determinant of Stirling cycle numbers counts... \n",
|
| 378 |
+
"3 0704.0005 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n",
|
| 379 |
+
"4 0704.0007 Polymer Quantum Mechanics and its Continuum Limit \n",
|
| 380 |
+
"... ... ... \n",
|
| 381 |
+
"1749 0704.1996 A Wave-function for Stringy Universes \n",
|
| 382 |
+
"1750 0704.1997 Query on Negative Temperature, Internal Intera... \n",
|
| 383 |
+
"1751 0704.1998 Absence of the Fifth Force Problem in a Model ... \n",
|
| 384 |
+
"1752 0704.1999 Dark matter caustics and the enhancement of se... \n",
|
| 385 |
+
"1753 0704.2000 Search for a Higgs boson produced in associati... \n",
|
| 386 |
+
"\n",
|
| 387 |
+
" full_text \\\n",
|
| 388 |
+
"0 Descomposiciones del gráfico de certificación ... \n",
|
| 389 |
+
"1 La evolución del sistema Tierra-Luna basado en... \n",
|
| 390 |
+
"2 Un determinante de los números de ciclo de Sti... \n",
|
| 391 |
+
"3 DE DÍA A DÍA\\nWAEL ABU-SHAMMALA Y ALBERTO TORC... \n",
|
| 392 |
+
"4 La mecánica cuántica de polímeros y su límite ... \n",
|
| 393 |
+
"... ... \n",
|
| 394 |
+
"1749 LPTENS–07/16\\nAbril de 2007\\nUna función de on... \n",
|
| 395 |
+
"1750 Microsoft Word - negEntr.doc\\nConsulta sobre t... \n",
|
| 396 |
+
"1751 Ausencia del problema de la quinta fuerza en u... \n",
|
| 397 |
+
"1752 Proyecto de versión 16 de noviembre de 2018\\nT... \n",
|
| 398 |
+
"1753 FERMILAB-PUB-07/076-E\\nBúsqueda de un bosón Hi... \n",
|
| 399 |
+
"\n",
|
| 400 |
+
" abstract \\\n",
|
| 401 |
+
"0 Describimos un nuevo algoritmo, el juego de ... \n",
|
| 402 |
+
"1 La evolución del sistema Tierra-Luna es desc... \n",
|
| 403 |
+
"2 Demostramos que un determinante de los númer... \n",
|
| 404 |
+
"3 En este artículo mostramos cómo calcular la ... \n",
|
| 405 |
+
"4 Una representación cuántica no estándar de l... \n",
|
| 406 |
+
"... ... \n",
|
| 407 |
+
"1749 Definimos una función de onda para los fondo... \n",
|
| 408 |
+
"1750 Después de que la temperatura negativa se vu... \n",
|
| 409 |
+
"1751 Un modelo de escala invariante que contiene ... \n",
|
| 410 |
+
"1752 Los haloes fríos de materia oscura están pob... \n",
|
| 411 |
+
"1753 Describimos una búsqueda para el modelo está... \n",
|
| 412 |
+
"\n",
|
| 413 |
+
" text_no_abstract \n",
|
| 414 |
+
"0 Introducción y preliminares\\nEl foco de este d... \n",
|
| 415 |
+
"1 Introducción \\nLa teoría aceptada popularmente... \n",
|
| 416 |
+
"2 Introducción El propósito principal de este ar... \n",
|
| 417 |
+
"3 DE DÍA A DÍA\\nWAEL ABU-SHAMMALA Y ALBERTO TORC... \n",
|
| 418 |
+
"4 La mecánica cuántica de polímeros y su límite ... \n",
|
| 419 |
+
"... ... \n",
|
| 420 |
+
"1749 Introducción\\nNuestro objetivo en este documen... \n",
|
| 421 |
+
"1750 Microsoft Word - negEntr.doc\\nConsulta sobre t... \n",
|
| 422 |
+
"1751 Introducción\\n\\tBase de Dos Medidas Teoría de ... \n",
|
| 423 |
+
"1752 Proyecto de versión 16 de noviembre de 2018\\nT... \n",
|
| 424 |
+
"1753 FERMILAB-PUB-07/076-E\\nBúsqueda de un bosón Hi... \n",
|
| 425 |
+
"\n",
|
| 426 |
+
"[1754 rows x 5 columns]"
|
| 427 |
+
]
|
| 428 |
+
},
|
| 429 |
+
"execution_count": 77,
|
| 430 |
+
"metadata": {},
|
| 431 |
+
"output_type": "execute_result"
|
| 432 |
+
}
|
| 433 |
+
],
|
| 434 |
+
"source": [
|
| 435 |
+
"es = df[['id','title','translated','translated_abstract','translated_no_abstract']]\n",
|
| 436 |
+
"es.columns = [\"id\",\"title\", \"full_text\",\"abstract\",\"text_no_abstract\"]\n",
|
| 437 |
+
"es.to_csv(CSV_PATH+'/scientific_paper_es.csv',index=False,encoding='utf-8')\n",
|
| 438 |
+
"es"
|
| 439 |
+
]
|
| 440 |
+
},
|
| 441 |
+
{
|
| 442 |
+
"cell_type": "code",
|
| 443 |
+
"execution_count": 79,
|
| 444 |
+
"metadata": {},
|
| 445 |
+
"outputs": [
|
| 446 |
+
{
|
| 447 |
+
"data": {
|
| 448 |
+
"text/html": [
|
| 449 |
+
"<div>\n",
|
| 450 |
+
"<style scoped>\n",
|
| 451 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 452 |
+
" vertical-align: middle;\n",
|
| 453 |
+
" }\n",
|
| 454 |
+
"\n",
|
| 455 |
+
" .dataframe tbody tr th {\n",
|
| 456 |
+
" vertical-align: top;\n",
|
| 457 |
+
" }\n",
|
| 458 |
+
"\n",
|
| 459 |
+
" .dataframe thead th {\n",
|
| 460 |
+
" text-align: right;\n",
|
| 461 |
+
" }\n",
|
| 462 |
+
"</style>\n",
|
| 463 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 464 |
+
" <thead>\n",
|
| 465 |
+
" <tr style=\"text-align: right;\">\n",
|
| 466 |
+
" <th></th>\n",
|
| 467 |
+
" <th>id</th>\n",
|
| 468 |
+
" <th>title</th>\n",
|
| 469 |
+
" <th>full_text</th>\n",
|
| 470 |
+
" <th>abstract</th>\n",
|
| 471 |
+
" <th>text_no_abstract</th>\n",
|
| 472 |
+
" </tr>\n",
|
| 473 |
+
" </thead>\n",
|
| 474 |
+
" <tbody>\n",
|
| 475 |
+
" <tr>\n",
|
| 476 |
+
" <th>0</th>\n",
|
| 477 |
+
" <td>0704.0002</td>\n",
|
| 478 |
+
" <td>Sparsity-certifying Graph Decompositions</td>\n",
|
| 479 |
+
" <td>Sparsity-certifying Graph Decompositions\\nIlea...</td>\n",
|
| 480 |
+
" <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n",
|
| 481 |
+
" <td>Introduction and preliminaries\\nThe focus of t...</td>\n",
|
| 482 |
+
" </tr>\n",
|
| 483 |
+
" <tr>\n",
|
| 484 |
+
" <th>1</th>\n",
|
| 485 |
+
" <td>0704.0003</td>\n",
|
| 486 |
+
" <td>The evolution of the Earth-Moon system based o...</td>\n",
|
| 487 |
+
" <td>The evolution of the Earth-Moon system based o...</td>\n",
|
| 488 |
+
" <td>The evolution of Earth-Moon system is descri...</td>\n",
|
| 489 |
+
" <td>Introduction \\nThe popularly accepted theory f...</td>\n",
|
| 490 |
+
" </tr>\n",
|
| 491 |
+
" <tr>\n",
|
| 492 |
+
" <th>2</th>\n",
|
| 493 |
+
" <td>0704.0004</td>\n",
|
| 494 |
+
" <td>A determinant of Stirling cycle numbers counts...</td>\n",
|
| 495 |
+
" <td>A Determinant of Stirling Cycle Numbers Counts...</td>\n",
|
| 496 |
+
" <td>We show that a determinant of Stirling cycle...</td>\n",
|
| 497 |
+
" <td>Introduction The chief purpose of this paper i...</td>\n",
|
| 498 |
+
" </tr>\n",
|
| 499 |
+
" <tr>\n",
|
| 500 |
+
" <th>3</th>\n",
|
| 501 |
+
" <td>0704.0005</td>\n",
|
| 502 |
+
" <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
|
| 503 |
+
" <td>FROM DYADIC Λα TO Λα\\nWAEL ABU-SHAMMALA AND AL...</td>\n",
|
| 504 |
+
" <td>In this paper we show how to compute the $\\L...</td>\n",
|
| 505 |
+
" <td>FROM DYADIC Λα TO Λα\\nWAEL ABU-SHAMMALA AND AL...</td>\n",
|
| 506 |
+
" </tr>\n",
|
| 507 |
+
" <tr>\n",
|
| 508 |
+
" <th>4</th>\n",
|
| 509 |
+
" <td>0704.0007</td>\n",
|
| 510 |
+
" <td>Polymer Quantum Mechanics and its Continuum Limit</td>\n",
|
| 511 |
+
" <td>Polymer Quantum Mechanics and its Continuum Li...</td>\n",
|
| 512 |
+
" <td>A rather non-standard quantum representation...</td>\n",
|
| 513 |
+
" <td>Polymer Quantum Mechanics and its Continuum Li...</td>\n",
|
| 514 |
+
" </tr>\n",
|
| 515 |
+
" <tr>\n",
|
| 516 |
+
" <th>...</th>\n",
|
| 517 |
+
" <td>...</td>\n",
|
| 518 |
+
" <td>...</td>\n",
|
| 519 |
+
" <td>...</td>\n",
|
| 520 |
+
" <td>...</td>\n",
|
| 521 |
+
" <td>...</td>\n",
|
| 522 |
+
" </tr>\n",
|
| 523 |
+
" <tr>\n",
|
| 524 |
+
" <th>1749</th>\n",
|
| 525 |
+
" <td>0704.1996</td>\n",
|
| 526 |
+
" <td>A Wave-function for Stringy Universes</td>\n",
|
| 527 |
+
" <td>LPTENS–07/16\\nApril 2007\\nA Wave-function for ...</td>\n",
|
| 528 |
+
" <td>We define a wave-function for string theory ...</td>\n",
|
| 529 |
+
" <td>Introduction\\nOur goal in this paper is to emb...</td>\n",
|
| 530 |
+
" </tr>\n",
|
| 531 |
+
" <tr>\n",
|
| 532 |
+
" <th>1750</th>\n",
|
| 533 |
+
" <td>0704.1997</td>\n",
|
| 534 |
+
" <td>Query on Negative Temperature, Internal Intera...</td>\n",
|
| 535 |
+
" <td>Microsoft Word - negEntr.doc\\nQuery on Negativ...</td>\n",
|
| 536 |
+
" <td>After negative temperature is restated, we f...</td>\n",
|
| 537 |
+
" <td>Microsoft Word - negEntr.doc\\nQuery on Negativ...</td>\n",
|
| 538 |
+
" </tr>\n",
|
| 539 |
+
" <tr>\n",
|
| 540 |
+
" <th>1751</th>\n",
|
| 541 |
+
" <td>0704.1998</td>\n",
|
| 542 |
+
" <td>Absence of the Fifth Force Problem in a Model ...</td>\n",
|
| 543 |
+
" <td>Absence of the Fifth Force Problem in a Model ...</td>\n",
|
| 544 |
+
" <td>A scale invariant model containing dilaton $...</td>\n",
|
| 545 |
+
" <td>Introduction\\n\\tBasis of Two Measures Field Th...</td>\n",
|
| 546 |
+
" </tr>\n",
|
| 547 |
+
" <tr>\n",
|
| 548 |
+
" <th>1752</th>\n",
|
| 549 |
+
" <td>0704.1999</td>\n",
|
| 550 |
+
" <td>Dark matter caustics and the enhancement of se...</td>\n",
|
| 551 |
+
" <td>Draft version November 16, 2018\\nPreprint type...</td>\n",
|
| 552 |
+
" <td>Cold dark matter haloes are populated by cau...</td>\n",
|
| 553 |
+
" <td>Draft version November 16, 2018\\nPreprint type...</td>\n",
|
| 554 |
+
" </tr>\n",
|
| 555 |
+
" <tr>\n",
|
| 556 |
+
" <th>1753</th>\n",
|
| 557 |
+
" <td>0704.2000</td>\n",
|
| 558 |
+
" <td>Search for a Higgs boson produced in associati...</td>\n",
|
| 559 |
+
" <td>FERMILAB-PUB-07/076-E\\nSearch for a Higgs boso...</td>\n",
|
| 560 |
+
" <td>We describe a search for the standard model ...</td>\n",
|
| 561 |
+
" <td>FERMILAB-PUB-07/076-E\\nSearch for a Higgs boso...</td>\n",
|
| 562 |
+
" </tr>\n",
|
| 563 |
+
" </tbody>\n",
|
| 564 |
+
"</table>\n",
|
| 565 |
+
"<p>1754 rows × 5 columns</p>\n",
|
| 566 |
+
"</div>"
|
| 567 |
+
],
|
| 568 |
+
"text/plain": [
|
| 569 |
+
" id title \\\n",
|
| 570 |
+
"0 0704.0002 Sparsity-certifying Graph Decompositions \n",
|
| 571 |
+
"1 0704.0003 The evolution of the Earth-Moon system based o... \n",
|
| 572 |
+
"2 0704.0004 A determinant of Stirling cycle numbers counts... \n",
|
| 573 |
+
"3 0704.0005 From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a... \n",
|
| 574 |
+
"4 0704.0007 Polymer Quantum Mechanics and its Continuum Limit \n",
|
| 575 |
+
"... ... ... \n",
|
| 576 |
+
"1749 0704.1996 A Wave-function for Stringy Universes \n",
|
| 577 |
+
"1750 0704.1997 Query on Negative Temperature, Internal Intera... \n",
|
| 578 |
+
"1751 0704.1998 Absence of the Fifth Force Problem in a Model ... \n",
|
| 579 |
+
"1752 0704.1999 Dark matter caustics and the enhancement of se... \n",
|
| 580 |
+
"1753 0704.2000 Search for a Higgs boson produced in associati... \n",
|
| 581 |
+
"\n",
|
| 582 |
+
" full_text \\\n",
|
| 583 |
+
"0 Sparsity-certifying Graph Decompositions\\nIlea... \n",
|
| 584 |
+
"1 The evolution of the Earth-Moon system based o... \n",
|
| 585 |
+
"2 A Determinant of Stirling Cycle Numbers Counts... \n",
|
| 586 |
+
"3 FROM DYADIC Λα TO Λα\\nWAEL ABU-SHAMMALA AND AL... \n",
|
| 587 |
+
"4 Polymer Quantum Mechanics and its Continuum Li... \n",
|
| 588 |
+
"... ... \n",
|
| 589 |
+
"1749 LPTENS–07/16\\nApril 2007\\nA Wave-function for ... \n",
|
| 590 |
+
"1750 Microsoft Word - negEntr.doc\\nQuery on Negativ... \n",
|
| 591 |
+
"1751 Absence of the Fifth Force Problem in a Model ... \n",
|
| 592 |
+
"1752 Draft version November 16, 2018\\nPreprint type... \n",
|
| 593 |
+
"1753 FERMILAB-PUB-07/076-E\\nSearch for a Higgs boso... \n",
|
| 594 |
+
"\n",
|
| 595 |
+
" abstract \\\n",
|
| 596 |
+
"0 We describe a new algorithm, the $(k,\\ell)$-... \n",
|
| 597 |
+
"1 The evolution of Earth-Moon system is descri... \n",
|
| 598 |
+
"2 We show that a determinant of Stirling cycle... \n",
|
| 599 |
+
"3 In this paper we show how to compute the $\\L... \n",
|
| 600 |
+
"4 A rather non-standard quantum representation... \n",
|
| 601 |
+
"... ... \n",
|
| 602 |
+
"1749 We define a wave-function for string theory ... \n",
|
| 603 |
+
"1750 After negative temperature is restated, we f... \n",
|
| 604 |
+
"1751 A scale invariant model containing dilaton $... \n",
|
| 605 |
+
"1752 Cold dark matter haloes are populated by cau... \n",
|
| 606 |
+
"1753 We describe a search for the standard model ... \n",
|
| 607 |
+
"\n",
|
| 608 |
+
" text_no_abstract \n",
|
| 609 |
+
"0 Introduction and preliminaries\\nThe focus of t... \n",
|
| 610 |
+
"1 Introduction \\nThe popularly accepted theory f... \n",
|
| 611 |
+
"2 Introduction The chief purpose of this paper i... \n",
|
| 612 |
+
"3 FROM DYADIC Λα TO Λα\\nWAEL ABU-SHAMMALA AND AL... \n",
|
| 613 |
+
"4 Polymer Quantum Mechanics and its Continuum Li... \n",
|
| 614 |
+
"... ... \n",
|
| 615 |
+
"1749 Introduction\\nOur goal in this paper is to emb... \n",
|
| 616 |
+
"1750 Microsoft Word - negEntr.doc\\nQuery on Negativ... \n",
|
| 617 |
+
"1751 Introduction\\n\\tBasis of Two Measures Field Th... \n",
|
| 618 |
+
"1752 Draft version November 16, 2018\\nPreprint type... \n",
|
| 619 |
+
"1753 FERMILAB-PUB-07/076-E\\nSearch for a Higgs boso... \n",
|
| 620 |
+
"\n",
|
| 621 |
+
"[1754 rows x 5 columns]"
|
| 622 |
+
]
|
| 623 |
+
},
|
| 624 |
+
"execution_count": 79,
|
| 625 |
+
"metadata": {},
|
| 626 |
+
"output_type": "execute_result"
|
| 627 |
+
}
|
| 628 |
+
],
|
| 629 |
+
"source": [
|
| 630 |
+
"en = df[['id','title','full_text','abstract','text_no_abstract']]\n",
|
| 631 |
+
"en.columns = [\"id\",\"title\", \"full_text\",\"abstract\",\"text_no_abstract\"]\n",
|
| 632 |
+
"en.to_csv(CSV_PATH+'/scientific_paper_en.csv',index=False,encoding='utf-8')\n",
|
| 633 |
+
"en"
|
| 634 |
+
]
|
| 635 |
+
}
|
| 636 |
+
],
|
| 637 |
+
"metadata": {
|
| 638 |
+
"interpreter": {
|
| 639 |
+
"hash": "05def4d9d0834781cbeb6b95fd92421f8bd6a45e945308f90d88567f4afc1911"
|
| 640 |
+
},
|
| 641 |
+
"kernelspec": {
|
| 642 |
+
"display_name": "Python 3.8.12 ('tensorflow')",
|
| 643 |
+
"language": "python",
|
| 644 |
+
"name": "python3"
|
| 645 |
+
},
|
| 646 |
+
"language_info": {
|
| 647 |
+
"codemirror_mode": {
|
| 648 |
+
"name": "ipython",
|
| 649 |
+
"version": 3
|
| 650 |
+
},
|
| 651 |
+
"file_extension": ".py",
|
| 652 |
+
"mimetype": "text/x-python",
|
| 653 |
+
"name": "python",
|
| 654 |
+
"nbconvert_exporter": "python",
|
| 655 |
+
"pygments_lexer": "ipython3",
|
| 656 |
+
"version": "3.9.7"
|
| 657 |
+
},
|
| 658 |
+
"orig_nbformat": 4
|
| 659 |
+
},
|
| 660 |
+
"nbformat": 4,
|
| 661 |
+
"nbformat_minor": 2
|
| 662 |
+
}
|