{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "execution": { "iopub.execute_input": "2021-06-23T10:14:01.220679Z", "iopub.status.busy": "2021-06-23T10:14:01.219714Z", "iopub.status.idle": "2021-06-23T10:14:01.359474Z", "shell.execute_reply": "2021-06-23T10:14:01.357018Z", "shell.execute_reply.started": "2021-06-23T10:14:01.220489Z" }, "trusted": true }, "outputs": [], "source": [ "# This Python 3 environment comes with many helpful analytics libraries installed\n", "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n", "# For example, here's several helpful packages to load\n", "\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "\n", "# Input data files are available in the read-only \"../input/\" directory\n", "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", "\n", "import os\n", "for dirname, _, filenames in os.walk('/kaggle/input'):\n", "    for filename in filenames:\n", "        print(os.path.join(dirname, filename))\n", "\n", "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n", "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.status.busy": "2021-06-23T10:14:01.360753Z", "iopub.status.idle": "2021-06-23T10:14:01.361256Z" }, "trusted": true }, "outputs": [], "source": [ "import tensorflow as tf\n", "import psutil\n", "\n", "# Environment check: print the TensorFlow version and the number of GPUs it lists;\n", "# the trailing expression shows psutil's virtual-memory statistics as the cell output.\n", "print(\"TensorFlow version\",tf.__version__)\n", "print(\"Num GPUs Available: \", len(tf.config.list_physical_devices('GPU')))\n", "psutil.virtual_memory()" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.status.busy": "2021-06-23T10:14:01.362996Z", "iopub.status.idle": "2021-06-23T10:14:01.363698Z" }, "trusted": true }, "outputs": [], "source": [ "# Imported in this cell because neither of the two preceding cells imports spacy,\n", "# so the check below would otherwise raise NameError on a fresh kernel.\n", "import spacy\n", "\n", "# Print whether the en_core_web_lg model name resolves to an installed package.\n", "print(spacy.util.is_package(\"en_core_web_lg\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# **Readability Scores + spaCy**" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2021-06-25T19:37:17.991312Z", "iopub.status.busy": "2021-06-25T19:37:17.990794Z", "iopub.status.idle": "2021-06-25T19:41:20.773700Z", "shell.execute_reply": "2021-06-25T19:41:20.772698Z", "shell.execute_reply.started": "2021-06-25T19:37:17.991258Z" }, "trusted": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "working directory /kaggle/working\n", "\n", "spaCy features\n", "xtrain id url_legal license \\\n", "0 c12129c31 NaN NaN \n", "1 85aa80a4c NaN NaN \n", "2 b69ac6792 NaN NaN \n", "3 dd1000b26 NaN NaN \n", "4 37c1b32fb NaN NaN \n", "5 f9bf357fe NaN NaN \n", "6 eaf8e7355 NaN NaN \n", "\n", " excerpt target standard_error \n", "0 When the young people returned to the ballroom... -0.340259 0.464009 \n", "1 All through dinner time, Mrs. Fayre was somewh... -0.315372 0.480805 \n", "2 As Roger had predicted, the snow departed as q... -0.580118 0.476676 \n", "3 And outside before the palace a great garden w... -1.054013 0.450007 \n", "4 Once upon a time there were Three Bears who li... 0.247197 0.510845 \n", "5 Hal and Chester found ample time to take an in... -0.861809 0.480936 \n", "6 Hal Paine and Chester Crawford were typical Am... 
-1.759061 0.476507 \n", "\n", "xtrain target 0 -0.340259\n", "1 -0.315372\n", "2 -0.580118\n", "3 -1.054013\n", "4 0.247197\n", "5 -0.861809\n", "6 -1.759061\n", "Name: target, dtype: float64\n", "\n", "calculating Flesch score\n", "counting unique words\n", "\n", "--------------------------------------------------------------------------------\n", "text 100 When the two friends had thus become reconciled, they examined the cub, and s target -0.811519925\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 200 Maggie soon thought she had been hours in the attic, and it must be tea time, target 0.277737321\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 300 In Computer science, cloud computing describes a type of outsourcing of compu target -1.413698838\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 400 A habitat is an ecological or environmental area that is inhabited by a parti target -0.616799879\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 500 Nanotechnology has provided the possibility of delivering drugs to specific c target -1.6251413\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 600 Animal tissues are grouped into four basic types: connective, muscle, nervous target -2.601398503\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 700 A local area network (LAN) is a computer network in a small area like a home, target -1.10578373\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 800 You guessed it–science! Fluorescent colors are very special. 
You might have s target -0.101779002\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 900 Our immune systems work to protect us from illness by recognizing foreign mol target -0.986988435\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 1000 Fog is an accumulation of tiny water droplets or ice crystals suspended above target -0.691647347\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 1100 Chameli's mother had a lot of beautiful jewelry. \n", "One day, Chameli's mother s target -0.047198886\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 1200 December was an anxious month. Several German divisions were east of the Piav target -2.279783534\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 1300 The Government of the United States and the Imperial German Government are co target -1.532702197\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 1400 It was not, however, until the morning that we entered the harbor of Havre th target -0.841382898\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 1500 After a time the polished rocky sides of the shaft grew to be of a solemn sam target -1.826667527\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 1600 One morning, Grandma had two loaves of \"riz bread,\" and some election cakes, target -0.698302533\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 1700 This beautiful Agave is now in blossom in the garden here, and I am happy to target -2.162795917\n", "\n", "\n", 
"--------------------------------------------------------------------------------\n", "text 1800 Quickly he opens the paper to its full extent, and places it on the floor car target -0.584273721\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 1900 There is a great park here, known as the Maidan, where dogs run with bones to target -0.583532619\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 2000 Mabel lives on a hill, quite near a beautiful lake, and is very fond of going target 0.902661245\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 2100 He fallow-deer — This is the domestic or park deer; and no two animals can ma target -2.57511146\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 2200 The sun is a star, just like the other millions of stars you see when you loo target -0.580630824\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 2300 This may be a useful lesson to you, dear Isabel,\" she said. \"It will teach yo target -0.223570721\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 2400 By looking at any map of Europe, it will be seen that England is separated fr target -1.188880582\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 2500 It was a beautiful place to play. There were trees for hide-and-seek, flat sp target -0.025405297\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 2600 When Doris got home she opened her paint-box. What do you think? 
Of course it target 0.198997768\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 2700 Forty years ago women were given no representation in conventions where polit target -1.291127806\n", "\n", "\n", "--------------------------------------------------------------------------------\n", "text 2800 The American Civil War (1861–1865) was a civil war in the United States of Am target 0.223365705\n", "\n", "getting spaCy features\n", "namelist ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f112', 'f113', 'f114', 'f115', 'f116', 'f117', 'f118', 'f119', 'f120', 'f121', 'f122', 'f123', 'f124', 'f125', 'f126', 'f127', 'f128', 'f129', 'f130', 'f131', 'f132', 'f133', 'f134', 'f135', 'f136', 'f137', 'f138', 'f139', 'f140', 'f141', 'f142', 'f143', 'f144', 'f145', 'f146', 'f147', 'f148', 'f149', 'f150', 'f151', 'f152', 'f153', 'f154', 'f155', 'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164', 'f165', 'f166', 'f167', 'f168', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175', 'f176', 'f177', 'f178', 'f179', 'f180', 'f181', 'f182', 'f183', 'f184', 'f185', 'f186', 'f187', 'f188', 'f189', 'f190', 'f191', 'f192', 'f193', 'f194', 'f195', 'f196', 'f197', 'f198', 'f199', 'f200', 
'f201', 'f202', 'f203', 'f204', 'f205', 'f206', 'f207', 'f208', 'f209', 'f210', 'f211', 'f212', 'f213', 'f214', 'f215', 'f216', 'f217', 'f218', 'f219', 'f220', 'f221', 'f222', 'f223', 'f224', 'f225', 'f226', 'f227', 'f228', 'f229', 'f230', 'f231', 'f232', 'f233', 'f234', 'f235', 'f236', 'f237', 'f238', 'f239', 'f240', 'f241', 'f242', 'f243', 'f244', 'f245', 'f246', 'f247', 'f248', 'f249', 'f250', 'f251', 'f252', 'f253', 'f254', 'f255', 'f256', 'f257', 'f258', 'f259', 'f260', 'f261', 'f262', 'f263', 'f264', 'f265', 'f266', 'f267', 'f268', 'f269', 'f270', 'f271', 'f272', 'f273', 'f274', 'f275', 'f276', 'f277', 'f278', 'f279', 'f280', 'f281', 'f282', 'f283', 'f284', 'f285', 'f286', 'f287', 'f288', 'f289', 'f290', 'f291', 'f292', 'f293', 'f294', 'f295', 'f296', 'f297', 'f298', 'f299']\n", "train_vectors f0 f1 f2 f3 f4 f5 f6 \\\n", "0 0.054942 0.104693 -0.129839 -0.077362 0.066479 -0.007780 -0.010820 \n", "1 -0.014731 0.213879 -0.183907 -0.048958 0.112992 0.026316 0.028143 \n", "2 -0.006671 0.217069 -0.125178 -0.073087 0.106584 0.016715 0.011751 \n", "3 0.040802 0.123908 -0.106365 -0.109637 0.090377 0.020302 -0.004274 \n", "4 -0.018610 0.109863 -0.161603 -0.035534 0.143024 -0.053699 -0.026999 \n", "\n", " f7 f8 f9 ... f290 f291 f292 f293 \\\n", "0 -0.054412 -0.069162 2.085747 ... -0.238603 0.031822 0.059434 -0.093304 \n", "1 -0.137892 -0.094396 2.170374 ... -0.105756 0.051830 0.006578 -0.007093 \n", "2 -0.115226 -0.071845 2.141033 ... -0.116192 0.020743 0.012045 -0.028818 \n", "3 -0.007752 -0.103293 2.044155 ... -0.201305 0.014579 -0.002538 -0.046069 \n", "4 -0.152680 -0.012829 2.205194 ... 
-0.284280 0.039023 0.082985 -0.052379 \n", "\n", " f294 f295 f296 f297 f298 f299 \n", "0 -0.134071 0.053603 0.038264 -0.028437 -0.022459 0.068514 \n", "1 0.065427 -0.029093 -0.000948 0.012834 0.015731 0.002224 \n", "2 0.044761 -0.033804 0.015294 -0.019791 0.020420 0.016437 \n", "3 -0.155320 0.064200 0.079673 -0.035276 0.001438 0.066118 \n", "4 -0.135277 0.122106 0.057177 -0.104051 -0.100320 0.024026 \n", "\n", "[5 rows x 300 columns]\n", "xtrain id url_legal license \\\n", "0 c12129c31 NaN NaN \n", "1 85aa80a4c NaN NaN \n", "2 b69ac6792 NaN NaN \n", "3 dd1000b26 NaN NaN \n", "4 37c1b32fb NaN NaN \n", "\n", " excerpt target \\\n", "0 When the young people returned to the ballroom... -0.340259 \n", "1 All through dinner time, Mrs. Fayre was somewh... -0.315372 \n", "2 As Roger had predicted, the snow departed as q... -0.580118 \n", "3 And outside before the palace a great garden w... -1.054013 \n", "4 Once upon a time there were Three Bears who li... 0.247197 \n", "\n", " standard_error nof_char nof_words w2c nof_sentences ... \\\n", "0 0.464009 992 174 0.175403 11 ... \n", "1 0.480805 937 164 0.175027 10 ... \n", "2 0.476676 908 162 0.178414 11 ... \n", "3 0.450007 909 163 0.179318 5 ... \n", "4 0.510845 723 147 0.203320 5 ... 
\n", "\n", " f290 f291 f292 f293 f294 f295 f296 \\\n", "0 -0.238603 0.031822 0.059434 -0.093304 -0.134071 0.053603 0.038264 \n", "1 -0.105756 0.051830 0.006578 -0.007093 0.065427 -0.029093 -0.000948 \n", "2 -0.116192 0.020743 0.012045 -0.028818 0.044761 -0.033804 0.015294 \n", "3 -0.201305 0.014579 -0.002538 -0.046069 -0.155320 0.064200 0.079673 \n", "4 -0.284280 0.039023 0.082985 -0.052379 -0.135277 0.122106 0.057177 \n", "\n", " f297 f298 f299 \n", "0 -0.028437 -0.022459 0.068514 \n", "1 0.012834 0.015731 0.002224 \n", "2 -0.019791 0.020420 0.016437 \n", "3 -0.035276 0.001438 0.066118 \n", "4 -0.104051 -0.100320 0.024026 \n", "\n", "[5 rows x 318 columns]\n", "\n", "2267 566 2267 566\n", "X nof_words nof_sentences nof_syllables flesch_score txt_diversity \\\n", "981 165 12 235 72.387841 0.709091 \n", "683 182 8 321 34.531662 0.631868 \n", "195 172 9 249 64.963966 0.703488 \n", "1158 151 5 264 28.272066 0.741722 \n", "1553 173 9 281 49.910572 0.722543 \n", "2273 169 9 273 51.114017 0.662722 \n", "2056 188 14 241 84.755000 0.632979 \n", "\n", " nof_unique_words nof_char w2c flesch_score2 punctsPerSentence \\\n", "981 117 896 0.184152 6.578561 1.812500 \n", "683 115 1106 0.164557 14.094588 1.700000 \n", "195 121 988 0.174089 8.945891 4.000000 \n", "1158 112 927 0.162891 16.818464 4.833333 \n", "1553 125 1018 0.169941 11.073141 3.000000 \n", "2273 112 989 0.170880 10.794872 4.166667 \n", "2056 119 938 0.200426 4.773739 1.588235 \n", "\n", " ... f290 f291 f292 f293 f294 f295 \\\n", "981 ... -0.135388 0.006350 0.059477 -0.048736 0.037906 -0.004224 \n", "683 ... -0.209450 0.023985 -0.054420 -0.117216 -0.131639 0.101565 \n", "195 ... -0.155826 0.020981 0.004709 0.017356 0.054904 -0.006336 \n", "1158 ... -0.246229 0.019796 -0.079219 -0.012937 -0.049782 0.086845 \n", "1553 ... -0.115806 0.026836 0.069627 -0.061271 -0.058355 0.045644 \n", "2273 ... -0.201175 0.044129 0.031399 -0.086733 -0.077939 0.026245 \n", "2056 ... 
-0.159391 0.044991 0.042129 -0.041979 -0.002026 0.052522 \n", "\n", " f296 f297 f298 f299 \n", "981 -0.000829 0.038497 0.012296 0.009338 \n", "683 -0.080587 -0.070543 0.008028 0.024660 \n", "195 -0.047379 -0.007128 -0.015000 0.042302 \n", "1158 -0.022204 -0.057120 -0.018331 0.023094 \n", "1553 0.062768 -0.013423 -0.016792 0.006240 \n", "2273 -0.007248 -0.011341 -0.099948 0.014829 \n", "2056 0.005720 0.018664 0.004293 0.039324 \n", "\n", "[7 rows x 310 columns]\n", "\n", "y 981 1.597870\n", "683 -0.743435\n", "195 -0.280994\n", "1158 -1.464792\n", "1553 -1.884352\n", "2273 -0.578085\n", "2056 0.666116\n", "Name: target, dtype: float64\n", "\n", "Xtest nof_words nof_sentences nof_syllables flesch_score txt_diversity \\\n", "4 147 5 183 71.675633 0.346939 \n", "9 191 8 274 61.238524 0.675393 \n", "14 181 11 259 69.076178 0.696133 \n", "19 176 8 256 61.450455 0.693182 \n", "24 169 6 273 41.584295 0.733728 \n", "29 167 10 256 60.198272 0.694611 \n", "34 146 5 230 43.923027 0.630137 \n", "\n", " nof_unique_words nof_char w2c flesch_score2 punctsPerSentence \\\n", "4 51 723 0.203320 10.565796 6.400000 \n", "9 129 1026 0.186160 10.648999 2.625000 \n", "14 126 967 0.187177 7.712356 2.230769 \n", "19 122 957 0.183908 10.153636 3.125000 \n", "24 124 993 0.170191 14.456538 4.000000 \n", "29 116 937 0.178228 9.011623 2.083333 \n", "34 92 802 0.182045 14.387041 1.500000 \n", "\n", " ... f290 f291 f292 f293 f294 f295 f296 \\\n", "4 ... -0.284280 0.039023 0.082985 -0.052379 -0.135277 0.122106 0.057177 \n", "9 ... -0.187329 0.043337 0.025266 -0.083779 -0.068758 0.022553 0.040310 \n", "14 ... -0.173686 0.034872 0.118947 -0.056733 -0.031695 0.045877 0.025237 \n", "19 ... -0.191329 0.022130 0.035867 0.000256 0.020506 0.012822 0.054820 \n", "24 ... -0.214793 0.006693 0.043532 -0.063686 -0.038540 0.034346 0.009910 \n", "29 ... -0.135137 0.031284 0.025826 -0.051649 -0.006029 -0.006497 0.011574 \n", "34 ... 
-0.135040 0.043721 0.088910 -0.033691 0.038809 0.023250 -0.022141 \n", "\n", " f297 f298 f299 \n", "4 -0.104051 -0.100320 0.024026 \n", "9 0.023220 -0.014123 0.073939 \n", "14 0.017827 -0.013019 0.070423 \n", "19 0.012178 0.026124 0.014872 \n", "24 -0.043891 -0.028318 0.008500 \n", "29 0.073487 0.016896 0.012550 \n", "34 -0.017283 0.009402 0.041435 \n", "\n", "[7 rows x 310 columns]\n", "\n", "ytest 4 0.247197\n", "9 -1.238432\n", "14 0.245806\n", "19 -1.009999\n", "24 -1.483887\n", "29 -1.413744\n", "34 0.022598\n", "Name: target, dtype: float64\n", "\n", "testTexts 4 Once upon a time there were Three Bears who li...\n", "9 One day he had gone beyond any point which he ...\n", "14 Aunt Abigail was gone, Eleanor was gone. The r...\n", "19 Father had been away in the country for three ...\n", "24 One beautiful misummer night in 18— a large, h...\n", "29 Before Fred could complete the sentence his fo...\n", "34 When Josie arrived at her destination she went...\n", "Name: excerpt, dtype: object\n", "\n", "Coefficients: \n", " [ 1.48112616e-02 -1.55412549e-02 -7.73959046e-03 -2.88993529e-02\n", " 1.67597648e+00 -1.13562873e-02 -6.76790330e-04 2.31096998e-01\n", " -9.55646587e-02 -4.19500101e-02 1.36526000e-01 -4.64126500e-01\n", " -2.03473618e+00 9.43253633e-01 -2.02192115e-01 -1.34499713e+00\n", " 3.64497640e-01 6.57803691e-01 4.88818959e-01 2.15804127e+00\n", " 3.14023294e-01 1.67664341e+00 9.81973042e-01 -6.91690628e-01\n", " -1.01460977e+00 -2.02116650e+00 1.71898026e+00 1.74817617e-01\n", " -1.22450073e+00 -1.30771453e-01 2.11451440e+00 6.13313101e-01\n", " 6.78876769e-01 -2.25069093e-01 7.07877084e-01 2.04801343e-01\n", " 1.96560164e+00 -1.46212410e+00 1.31736764e+00 -1.05987346e+00\n", " 7.89324172e-01 -1.33164448e+00 9.95217519e-01 -2.49868056e+00\n", " 1.29056335e-01 1.49038129e+00 -5.42990407e-01 -8.39090134e-01\n", " -1.41186945e+00 7.54193366e-02 2.57674731e-01 1.30808569e-01\n", " -4.92950811e-01 -1.58826836e+00 5.96774858e-01 1.06580809e-01\n", " 
-1.99887650e+00 3.42666469e-01 6.91031626e-01 -1.36461958e+00\n", " -7.99921013e-01 1.06241437e+00 6.02372886e-01 1.34139155e-01\n", " 5.87312458e-01 -8.89612395e-01 2.31979457e-01 -2.72189030e-01\n", " 1.27046369e+00 -7.87424473e-01 -5.96105483e-01 1.24769951e+00\n", " 1.40308240e+00 1.27394400e+00 4.91412901e-01 -7.72385024e-01\n", " -5.05964685e-01 1.14412032e-01 8.54139656e-01 1.73251441e-01\n", " 5.39661393e-01 -1.52359023e+00 8.30725440e-01 -2.26389149e+00\n", " -4.30577198e-01 -2.44745684e-01 9.70829904e-01 9.66891873e-01\n", " 5.73260492e-01 2.39438422e+00 6.31846069e-01 -9.79336578e-01\n", " -7.77869900e-01 -8.86337073e-01 8.90635384e-01 1.50316880e+00\n", " 2.77835843e-01 3.36868622e+00 -6.44468462e-01 8.65056836e-01\n", " -7.77913999e-01 7.31843409e-01 -1.85950207e-01 -1.59319060e+00\n", " 1.45043870e+00 -8.12437286e-02 -2.70250697e+00 1.04160621e-02\n", " 6.05750580e-02 -6.64125045e-01 9.65134691e-01 1.08247921e+00\n", " 1.55597965e+00 2.14804900e+00 -1.78798007e+00 5.47404008e-01\n", " 5.62647384e-01 -7.78201603e-01 4.11461894e-02 -1.02381785e+00\n", " 4.75500088e-02 1.78260419e+00 4.47533117e-02 1.22368838e+00\n", " 7.79222434e-01 9.18839415e-01 -4.49509730e-01 -8.75912387e-01\n", " -6.25593418e-01 -9.89823633e-01 -2.26460072e-01 4.07520063e-01\n", " 1.01842585e+00 -1.75370348e+00 -1.88901046e+00 1.02996122e+00\n", " 6.23973502e-01 1.05602333e+00 -1.00035180e+00 8.30154535e-01\n", " -7.36426230e-01 -1.23791657e-01 -2.67737114e-01 4.15182059e-01\n", " 1.83684335e+00 2.83771232e-02 -1.23688909e+00 1.10544755e+00\n", " -1.28570229e+00 5.76877283e-02 -1.67473796e+00 5.93510889e-01\n", " -5.22998133e-01 8.88995961e-01 -6.45399566e-01 7.12948999e-01\n", " 1.14934601e+00 -8.13445538e-01 -5.62938695e-01 -3.20461496e-01\n", " 1.40390815e+00 -8.46474289e-01 -1.10234188e+00 1.14106878e+00\n", " 3.63046803e-01 -3.11109625e-01 4.11949925e-01 3.34908944e-02\n", " -8.05188696e-01 1.68705750e-01 9.73377722e-02 1.02001860e+00\n", " -3.23013411e-01 6.57740349e-01 
8.48421459e-02 -8.48525076e-01\n", " 1.13746190e+00 1.34074006e+00 1.01844132e+00 9.41773743e-01\n", " 2.95959229e-01 3.40927881e-01 2.09716200e-01 1.03042481e+00\n", " -4.80037659e-01 -1.53610920e+00 6.90516961e-01 2.50788383e+00\n", " -1.13485462e+00 5.40351535e-01 1.53150011e+00 2.50065741e-01\n", " -1.66960225e+00 -1.46658813e+00 1.25153767e+00 1.38410771e-01\n", " 2.24230354e+00 1.77595626e+00 9.52513188e-01 3.93867629e-02\n", " 5.24358446e-01 -2.91415353e+00 9.89378191e-01 -9.35222678e-01\n", " 2.09256753e+00 5.85871842e-01 5.67877375e-01 -2.64329976e-01\n", " 5.24332810e-01 4.07220229e-01 -9.98309823e-01 -1.77984935e+00\n", " -3.05702491e-01 -5.94020140e-01 1.64474431e+00 -9.60539779e-01\n", " 5.73586279e-01 -8.04563719e-01 2.85977044e-01 -7.12660606e-01\n", " 1.16091398e+00 5.21863006e-01 6.36117494e-01 -1.54157975e+00\n", " -2.48319258e-02 -7.80990052e-01 1.93422206e+00 -1.81617141e-01\n", " 8.74368246e-01 -9.95851975e-01 1.20673851e+00 6.25174832e-01\n", " -4.14509525e-01 3.47246439e-01 1.24975180e+00 -1.24743672e-01\n", " 2.83320953e+00 -2.47267863e-01 -4.76491480e-01 -1.15920506e-01\n", " -6.13646729e-01 -2.02507388e+00 -1.17955649e+00 -4.96551385e-01\n", " 7.02227766e-01 -1.35640131e+00 1.49416852e-01 1.19430963e+00\n", " -9.06887201e-02 -6.62766712e-01 -2.14906303e+00 -3.19839067e-01\n", " -1.20248193e+00 -1.31185676e+00 -2.09510746e-01 -2.99065619e-01\n", " -3.96931126e-01 -2.74447079e-01 -7.14053736e-01 -2.31750076e+00\n", " -1.63908613e+00 -1.18412058e+00 -1.59452523e+00 2.56873125e-01\n", " -1.75285642e+00 -1.52126210e-03 4.38063031e-01 5.63524168e-01\n", " -1.86617995e+00 5.31489429e-02 -6.01337084e-01 1.37594875e-01\n", " 2.94007258e-01 1.44702658e+00 -2.10128714e+00 -1.73651395e+00\n", " -6.93153045e-01 1.19929792e+00 -1.30700577e+00 9.80201534e-01\n", " 1.27176203e+00 -3.31401047e-01 -8.54530472e-01 -8.68841038e-01\n", " 2.37037887e-01 -6.57746733e-01 1.09954468e-01 -1.09134992e-01\n", " -9.02331545e-01 1.42930236e+00 -1.97150272e+00 
-2.42991619e+00\n", " 4.67394492e-01 6.05340704e-01 2.91821556e+00 7.67391194e-01\n", " -1.13204002e+00 -9.21798068e-01 -5.39630783e-01 -4.09563689e-01\n", " -1.44841898e+00 1.68235528e+00 -1.34297779e+00 9.36105670e-01\n", " -1.86417473e-02 2.78968737e+00 -4.39593949e-01 1.35258287e+00\n", " -7.39050533e-01 1.85308077e+00]\n", "Mean squared error: 0.3317574750\n", "Root Mean squared error: 0.5759839191\n", "Coefficient of determination: 0.69\n", "sklearn RMSE 0.5759839190527772\n", "ytest is a y_pred is a \n", "testTexts is a testResults is a \n", " excerpt predregr target\n", "4 Once upon a time there were Three Bears who li... 0.303138 0.247197\n", "9 One day he had gone beyond any point which he ... -1.311287 -1.238432\n", "14 Aunt Abigail was gone, Eleanor was gone. The r... 0.211536 0.245806\n", "19 Father had been away in the country for three ... -1.139146 -1.009999\n", "24 One beautiful misummer night in 18— a large, h... -1.115843 -1.483887\n", "29 Before Fred could complete the sentence his fo... -1.260631 -1.413744\n", "34 When Josie arrived at her destination she went... 
-0.316249 0.022598\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEGCAYAAABsLkJ6AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAA7DElEQVR4nO2de5wU5Znvf880DfQgMkMyHmV0BLMGAgEGIYqHVQMaceOCiLc1usaNl3h2s15iJsHoEUj0yGeJlz0mGyWay6prMGomGoxXMBpdjOIMIipqFJDGo0QYVKaBnp7n/NFdTXV1vVVv3bp6up/v5zMf6FvVUzXTz/O+z5WYGYIgCEL90RC3AIIgCEI8iAEQBEGoU8QACIIg1CliAARBEOoUMQCCIAh1yqC4BfDCZz/7WR49enTcYgiCIAwo1qxZ81dmbrE+P6AMwOjRo/HSSy/FLYYgCMKAgog22T0vLiBBEIQ6RQyAIAhCnRKbASCiQ4hoFRG9RkTrieiyuGQRBEGoR+KMAfQBuJKZXyai4QDWENETzPxajDIJghAB2WwWW7Zswe7du+MWpaYZOnQoDj74YCSTSa33x2YAmPl9AO8X/v8JEb0OoBWAGABBqDG2bNmC4cOHY/To0SCiuMWpSZgZH330EbZs2YIxY8ZofaYqsoCIaDSAKQBeiFkUQRAC0NmVxtLHNmBrTwajmlLomD0W86a0Yvfu3aL8I4aI8JnPfAbbtm3T/kzsBoCI9gPwAIDLmfljm9cvBnAxALS1tVVYOkEQdOnsSuOqB9chk80BANI9GVz14DoAwNihEOVfAbze41izgIgoibzyv4eZH7R7DzMvY+ZpzDytpaWsjkEQhCph6WMbisrfIJPNYeljG2KSSHAjziwgAnAngNeZ+aa45BAEIRy29mQ8PV9pEokE2tvb8cUvfhFnnHEGent7fR/r/PPPx/333w8AuPDCC/Haa+rQ5dNPP43nn3+++Pi2227Df/7nf/o+d5jEuQOYAeAfAcwiou7Cz1djlEcQhACMakp5er7SpFIpdHd349VXX8XgwYNx2223lbze19fn67h33HEHxo8fr3zdagAuueQSnHfeeb7OFTaxGQBm/hMzEzNPYub2ws8jcckjCEIwOmaPRSqZKHkulUygY/ZYz8fq7EpjxpKVGLNgBWYsWYnOrnRYYgIAjjnmGLz99tt4+umnccwxx2Du3LkYP348crkcOjo68KUvfQmTJk3C7bffDiCfYfOtb30LY8eOxQknnIAPP/yweKwvf/nLxRY1jz76KI444ghMnjwZxx9/PDZu3IjbbrsNN998M9rb2/Hss89i0aJF+NGPfgQA6O7uxvTp0zFp0iSceuqp2LFjR/GY3/ve93DkkUfi85//PJ599lkAwPr163HkkUeivb0dkyZNwltvvRXoPsQeBBYEoTaYN6UVAGyzgF5/vSy/Q4lTMNk4RxD6+vrwhz/8ASeddBIA4OWXX8arr76KMWPGYNmyZRgxYgRefPFF7NmzBzNmzMCJJ56Irq4ubNiwAa+99ho++OADjB8/Ht/4xjdKjrtt2zZcdNFFeOaZZzBmzBhs374dI0eOxCWXXIL99tsP3/nOdwAATz31VPEz5513Hm699VYcd9xxuPbaa7F48WLccsstRTn//Oc/45FHHsHixYvx5JNP4rbbbsNll12Gc845B3v37kUuVxpz8YoYAEEQQmPelNbAStopmBzk2JlMBu3t7QDyO4ALLrgAzz//PI488shi3vzjjz+OV155pejf37lzJ9566y0888wzOPvss5FIJDBq1CjMmjWr7PirV6/GscceWzzWyJEjHeXZuXMnenp6cNxxxwEAvv71r+OMM84ovj5//nwAwNSpU7Fx40YAwNFHH43rr78eW7Zswfz583H44Yf7vh+AGABBEKqMqILJRgzAyrBhw4r/Z2bceuutmD1
7dsl7Hnmk8t7pIUOGAMgHr434xNe+9jUcddRRWLFiBb761a/i9ttvtzVGukgzOEEQqoo4g8mzZ8/GT3/6U2SzWQDAm2++iV27duHYY4/F8uXLkcvl8P7772PVqlVln50+fTqeeeYZvPvuuwCA7du3AwCGDx+OTz75pOz9I0aMQHNzc9G/f9dddxV3AyreeecdHHbYYbj00ktxyimn4JVXXgl0vbIDEAShquiYPbYkBgD4DyZ75cILL8TGjRtxxBFHgJnR0tKCzs5OnHrqqVi5ciXGjx+PtrY2HH300WWfbWlpwbJlyzB//nz09/fjgAMOwBNPPIE5c+bg9NNPx+9+9zvceuutJZ/51a9+hUsuuQS9vb047LDD8Itf/MJRvvvuuw933XUXkskkDjzwQHz/+98PdL3EzIEOUEmmTZvGMhBGEAYer7/+Or7whS9ov1/VUkJwx+5eE9EaZp5mfa/sAARBqDrCCCYL7kgMQBAEoU4RAyAIQkUYSO7mgYrXeywGQBCEyBk6dCg++ugjMQIRYswDGDp0qPZnJAYgCELkHHzwwdiyZYunXvWCd4yJYLqIARAEIXKSyaT2lCqhcogLSBAEoU4RAyAIglCniAEQBEGoU8QACIIg1CliAARBEOoUMQCCIAh1ihgAQRCEOkUMgCAIQp0iBkAQBKFOEQMgCIJQp4gBEARBqFOkF5AgCEIVE+V0NDEAgiAIVUpnV7pkPnK6J4OrHlwHAKEYATEAgiAIPol6dvHSxzYUlb9BJpvD0sc2iAEQBEGIi6hX5wCwtSfj6XmvxBoEJqKfE9GHRPRqnHIIgmBPZ1caM5asxJgFKzBjyUp0dqXjFqlqcFqdh8WoppSn573iagCIaAYRPUFEbxLRO0T0LhG9E8rZgV8COCmkYwmCECLGCjfdkwFj3wo3iBGoJYMS9eocADpmj0UqmSh5LpVMoGP22FCOr+MCuhPAFQDWAMi5vNcTzPwMEY0O85iCIIRDmP7nzq40Fj20Hj2ZbPG5KFwmlWRUUwppG2Uf1uoc2Hdf4swC2snMfwjlbD4goosBXAwAbW1tcYkhCFVLVIHIsFa4Vl+5mTADmpWmY/bYsusi5A3bjCUrQ/s9zJvSGtn90YkBrCKipUR0NBEdYfxEIo0NzLyMmacx87SWlpZKnVYQBgRRuGkMwvI/2+0kzITpMqkk86a04ob5E9FauB8EgAuvhfl7iBIdA3AUgGkA/g+AGws/P4pSKEEQ9IgyEBmW/9lNwYfpMqk086a04rkFs9DalCoqf4OwA8JR4OoCYuaZlRBEEATvRB2IHJpsKBqYplQSi+ZO8OyOUPnKgXACmlHn4utQiYBwFOhkAY0gopuI6KXCz41ENCKMkxPRvQD+G8BYItpCRBeEcVxBqBeiShM0XEs7evcFbff09fs6lt1OAgCaG5O4Yf7EQMo6SheYF6JO14wKHRfQzwF8AuDMws/HAH4RxsmZ+WxmPoiZk8x8MDPfGcZxBaFeiCpNMEzXktlXTgBam1K45ax2dF17YlH5+00PrUQuvg5Rp2tGhU4W0OeY+TTT48VE1B2RPIIg2KByc0SVJhi2S8MpkyVIRW21uF6iTteMCh0DkCGiv2XmPwH5wjAA1e3YEoQawk1BRpEmWIkcd4Mg9QYqORuI0NmV9n1f/MQVokzXjAodF9D/AvATItpIRJsA/BjAJdGKJQiCQRxujkq6NLyu4s3uot69fUg2UNl7csy+YwHVEleoBK4GgJm7mXkygEkAJjLzFGZeG71ogiAA8bg57Pz2QQO2KrwEUK3KeUdvFqB8Dr4Vv0ayWuIKlUDpAiKic5n5biL6tuV5AAAz3xSxbIIgoLLuGDNeXRpmt0lTYxLMwM5M1tWFoqqonTmuvPDTTjlnc9YM/H34MZLVEleoBE47gGGFf4fb/OwXsVyCIBSohDsmaJM2u5V5TybrwYXCZY8eWJMu+4xXJezHSA7UlE4/KA0AM99e+O+TzLzY/APgqcqIJwh
C1O6YMHzebu0eVC6UfQHu8hoDu8+olHBzYzI0IzlQUzr9oJMFdCsAa+8fu+cEQYiIKDNMwuj6qbMyt3uP1z5Bdu6iVDKBhXMmFI8XNA1zoKZ0+sEpBnA0gP8JoMUSB9gfQHlZnyAIA5IwfN5O7R7M7/FzDnM6p5tyDktJD8SUTj847QAGI+/rH4S839/gYwCnRymUIAjlRNXzJowgc8fssej4zVpk++0DsioXipvhYKCsKEylnKuhJ9BAwykG8MeCv3+6JQZwEzO/VUEZBaHuiTI3PQyf97wprdhvqP16MkGkjFmo+gSZ0UnBrKfc/TDRKQS7g4iajAdE1ExEj0UnkiAIVqLMTQ8ryNxjahxnJseMpY9tsFXGxrndcHMV1VPufpjoBIE/y8w9xgNm3kFEB0QnkiAIVqLOTQ/D5+3kznHq7zNvSite2rQdd6/e7HhsJ+opdz9MdAxAPxG1MfNmACCiQ2FN2hUEIVLiKgYzsPOvA6XB2JnjWvDAmrQyq8cps+i6eRPx4Jot6LVJByXA1R0V9/3xQzXELIjZWZcT0UkAlgH4I/K/i2MAXMzMFXcDTZs2jV966aVKn1YQYsdurm4qmYisPYPbuZMJAhglQd9UMoHTprZi1RvblDsBAvDukpOVBsWuIvic6W24bp6zm8jp/gDVl9JZ6d8nEa1h5mnW53V6AT2KfM7/cgC/BjA1DuUvCPVMJXvzWFG1X7Bm/GSyOax6Y1txRKIdo5pSyoAtANwwfyKaG5PF949IJTHt0JGuMqruD4CqDA5XS8zCqQ5gHDO/YRoAv7Xwb1vBJfRy9OIJgmAQV266Fz/61p4MOrvS2LWnr+w1I7PISfl1zB6L3SY3UE8mqz0bwHp/OrvSuPK+tchxuaHyUuSmIogLp1piFk4xgCsBXIT8EHgrDGBWJBIJglBxnJSZTpGXQePgRJlrA8i3alg4Jz9P+Irl3baf3dqTCVyVbFxHuicDgjpYGVTRBhliA1RPzEJpAJj5osK/MhReEGoEHd+7VZnZtV9QsWuv/XsaBw9yNSijmlKBVsZWpewU3TRcUYaxSBAhx4xWzZV8UEOlamlR6X5DTi6g+U4fZOYHwxdHEAQrYWWLqFatQwY1OCozc/sF3Z2AFfPnnJSf6hw6K2O3vkLmc80c11Iig+EmSvdkcPnybix+eH1xx2JHUBdOtfQbcnIBzSn8ewDyPYFWFh7PBPA8ADEAghAxQV0NZlSrVpXSNCszwxCMWbDCVw44AZixZGVR2RnZQnbKz+/KWEf5GlXJbsZiR69z7CEMF0419BvSSQN9HMDXmfn9wuODAPySmWdXQL4SJA1UqDdmLFlpq2ham1J4boG3MJxX5U0FJ7pZQavk8YpTyqOda6YplQRRvtpYVYfQu7cvPyFM45y690J1n+NMy/WD7zRQAIcYyr/ABwDaQpNMEAQlqlVtuifjeXBLkym90sywwYl8Xr8FZpSlTqp69zQ3JtGUsj++HU4pj0bcIZVMFF0zPZksdvTuGzDT8Zu16Lh/bUl656e7+2yvA8iv/E+b2loSh9BBdf/jTMsNE51K4KcKvX/uLTw+C8CT0YkkCIKB3/YKdqg2+73ZnPI1A0NhG6thO991Z1calysyfOxwctm4uWjsuo5m+/M7hWFDBpVlAeWY8cCaNKYdOtJTYNswFKo4zEBT+FZcXUAAQESnAji28PAZZv5tpFIpEBeQUK1EVdZv52qwwyl7xexSCYJRxevENZ3rHHv6WFHJHSTW8O6Sk7VcZ6o6AQNzJfFAcvfYEcQFBAAvA1jBzFcAeIyIhrt9QBDqhShbERuuhmaF+8ZAdU6zbEHRcZtcN28ibjmrvcQ1cu70NmXLZ5XcfvPhjc/pZOnMm9KKG8+crHRpGQo+aNVu0HnLUeLqAiKiiwBcDGAkgM8BaAVwG4DjoxVNEAYGYYxUtGLdUWhs1G3PqZsa6UaygVwzcTq70lj88PpiILYplSyu7qcdOlK5C7GT20vtQVH
GxD4ZdbN0dNIx3YyJ0+4vzCyuKNCJAfwLgCMBvAAAzPxWWO2gC43m/h35EZN3MPOSMI4rCJXELVDr1R1kpzT8yhLGyh9A3rfiQGdXGh33r0U2t89S9WSy6PjNWgD7Uh5HL1hh+3mrnFbFPMKUBdRQyAyyMsxUbOal0MrNl+9kTNwUfBSLgzDRMQB7mHkvUf4vgIgGIYR20ESUAPATAF8BsAXAi0T0EDO/FvTYghAFqpWel0CtTqwgyKrdvMLt7Eo7tkOwI9FAyNkFWHOMxQ+vV8q+9LENJcq/+Ll+LrpKnFwmCSq3MCrFPEZhRHZm9qWAhllo5Va45qTgq6XnjwodA/BHIvo+gBQRfQXAPwN4OIRzHwngbWZ+BwCI6NcATgEgBkCoOpxWem7uCrO/WMcdEEQ57NrTVxyivvSxDY7KP0GE6Yc1Y/U7O5BjRgPBVvkb7OjNFt07ZtmNxyqMtE3VvGAAykCsHV7cO2Gssp2MiVNfIy+yxoWOAfgegAsBrAPwTQCPALgjhHO3AnjP9HgLgKOsbyKii5GPQaCtTcoPhHhwWumZUyNVitBLo7OmxqRtQdOwwQk0NQ4uGcCy4pX3S95rdrs4GRKjd/8Da9JF5eugn23JZHNY9NB67OkrH+Jihsg+bdNMg4uLyYyueyfMzCyVMXFT8NXS80eFowEouGnWM/M4AD+rjEilMPMy5AfSYNq0aTKJTIgFt628oSBU6YcNRI7GwYxqMZxMNJRVpf5+7ftl78v2MxY9tF6pnHTbIejQkyk3VFZ0Fvf9nE8hVbWHMKPj3gkj+KpjQNwUfLX0/FHhaACYOUdEG8wjIUMkDeAQ0+ODC88JQtWhu5VXuYOcXBwjLBW0OxVK1e55lQLuyWTx95MPKhvRaM5fV7kv4uKe1ZuLLqt0TwYd96/FoofWY2cmW6Y43dw7YbSV1jEgOgq+mgvGdFxAzQDWE9GfAewynmTmuQHP/SKAw4loDPKK/x8AfC3gMQUhEnS38laFoMpYMfPJ7iw+d9UjyDEjQYRUssF2Nq5Xv/EDa9KOTde89Pm3g6B2V/nBepeyOS4auHRPBlcs78ZLm7a7jocEgnfrVBmQy5d3FwfX6BqjakbHAPzvKE7MzH1E9C0AjyGfBvpzZl4fxbkEIShetvJmhaDKWDGTT57hwv8Zvdlyg9FA+QDvmAUrSs7d7KCAzSMa7fCTa2+GAYw/aDie+8t2X5/3c757Vm8utnNwImjw1clQVFsufxCUrSCIaCiASwD8DfIB4DuZuXzOWwWRVhBCtWP1G7t1qPSLuU2BNf/ejF37BmvBll+8ppgaNDcm8enuPtfAsAqdTqhBu3XqdD3105E1Lvy0gvgVgGnIK/+/g/1oSEEQCti1hHDqUBkEsz976emTbfPogfIVb2dXGlf+Zm0oRsmr+k4lE7jlrHZ0XXsilp4xudguQiW7CnMFrqrFQtBunaqup3ZyDGScXEDjmXkiABDRnQD+XBmRBGFgYuc3tnaoDBOzAko0ADkbT07v3n11AQCw+OH1jrn+OjSQ95RRABgyaN9606ubzIxOBa71HF7RmYJWLbn8QXDaARSXCHG7fgRhIKBSFD2ZLLb2ZNDalMKMz40srngTRDj8gGG+z2cowm/f1429ChfQjt4srljejWs61xUfB0VH+aeSedViXtv3ZLKeGr8NG5wo60ChU4HrB7vdxLwprXhuwSzcclZ72W6gmnL5g+BkACYT0ceFn08ATDL+T0QfV0pAQYgTL50cnVwZhkvoub9sL2YF5ZixZcfuEqOgi1kRuilkI3hqGIEoIQDnTm/DDfMnoYHK3UR2StrO3ZJKJnD9qRNxs6WzqOHGCbPFgls311oZ/mKH0gXEzM4OMEGocbwWE3lpZ2CQyeaw8aMMbjxzsnbPfnMPfd1cfsMI6NKUSuKT3X3Ka7ILADc3JrFwzgQA+cC0yjBZlbRbhlVYM3lVhV06NQMDOdXTCZ00UEGoOa7pXId7X3ivRMFZh5O
oFMMVy7uLk68MpTdvSitafebV6/TKMWPOPPGSy+/FPLlV+DLy98tOYc9YslKZlQTYK2mvCtZriwUnY17tDduiRAyAUNPYrfpe2rTddmqVdYWvUgBm1bajN4uO+/O9d2aOaympZtWF4N4rx4w5qNsxeyy+fV+3r6BsEIz5vwxg684MLi8YRae6BANDSQfp1eO1xYLTKr/aG7ZFidZIyGpB6gAEL6hywXf3Oc/ANfK7dXLBDYYNTqB3b65M+fvNmHGCCLj5zPaSoSPff/AV2+phN5obk9id7fdcDOb3ulLJBowcNqRsZm/+tX21DWH3zlGNmCQAN5/VPuBHProRdCSkIAw4VKs+tzWPsfLXyQU32GWj/IF8nx+nAG/SxzeQOe9jNwcp50892PNxUskEFs6ZoDVy0opfo5bJ9heNql2AeNFD6yMZr6lazY9qStV0kNcN1z8/IppPRG8R0U7JAhIGEn59uIaysCoGr5k6QH6CVb+DxfGxaM9/rjCgxeDeF95zeHc5DQScNnWf3323X0Ec8GpUgHzsIcz0TgNVppG5a+dzC2bh3SUn47kFs+pC+QN6MYB/AzCHmV+PWhhBCBOVb7dR0WwNyCuFmeNaMGPJyjIXRGdX2lOwFoBWMzi/mH3tXs/Rz8DyF98rzuoNY26wlcbBg7BwzgTt7CYnggZkq70tc1zoGIAPRPkLAxFVs7OMYoBJa2HIirmFsl3q56KH1hezZJobk2BWZ81EpfytJHwYmmyOiwpRBVHeT+7H5WPcO13jkkomMDTZYBtEDiMgW6upnEHQMQAvEdFyAJ0A9hhPMvODUQklCAZhZIqYFTZQOqDEGuybsWSlaxvg7oUnlrx+Tec6X9k/QUmZAgjTD2v21ZXTGLiuMmDMQDJBykpjNzLZnKNxMgLBRgouAGV6Z5gTvoQ8OgZgfwC9AMx/9QxADIAQKWFMdTIKfVQKLpPN4cr71uKK5d2OitA4v5Hyac7AeWBNuuLKHwD6+rkYHH15805fx2hqTOLTPc6dXvbm8vOCme1rCYYNTiCZaHDcBaWSiTLD2pRKYtHcCba/S6uiB/TmKQvekDRQoWpRpWEmiHDjmZO1v/iqFMAgNDcm0dObjdTHr0NrwTXix8eeTBCGDR6kNdbRIJ82mkPGEkNxSq81fl9BVu+qv4WB1JI5TlRpoK47ACI6GMCtAGYUnnoWwGXMvCVcEQWhFJVvOsdsu/pTuQiCTr6yw/BTx6n8Af/BUaOC2etYyB292bIGbQAc/fw55sD+93qu1o0SnSzkXwB4CMCows/DhecEITTsmq45Bf6sqYFODb1mjmuxVVq1ACOfaaSLuSe/YRz9nNMLCSKtZnpOOOXxC/7RMQAtzPwLZu4r/PwSQEvEcgl1hEp5zxzX4liIZV7Vq4q+Fj+8PjYffaXQ3YUkiMoKnGaOC++r3NyYtP195ZgDF3W55fEL/tAxAB8R0blElCj8nAvgo6gFE+oHlfJe9cY23DB/orIAi4CiMlG5Anb0lhcW6ZJMUM2UyqeSCdu4yao3tvk6nl2ffqOq2KlwTlXU5dZ2u56rdaNEJwvoG8jHAG5Gfvf3PIB/ilIoob5Q+ee39mSKX/ArlneXreIZKLbsDcPP35RKgihfvWvOPrnco5+82rB2OTWj40O39v1JJRM4bWorVr2xTdm+ubMrrbxv1nPqZntJHn/4uBoAZt4EYG4FZBFqFKf87c6utHK4uLklg5syUbUHHjJInZ5ohgDHlMSwg8iVggDHLBknw2nu7+8lg8dQ6E7nNKPTj1+IBqUBIKLvMvO/EdGtsPl+MvOlkUomDBjcFLxqdadqywzkFZfZv6vqtW82EoB7/rgK827Cem29ewfuRFS3IKmqWhrQ7w9k/f3v2tOnvN92fnvJ8IkPpx2A0f5BEu9riLCrKd2276rV3VUPvlKWS26GUbr91xkA4uQi0HHjuLkm/NDalELv3r5QZvF6heAe5DXu15X3rS0LJhtBdHO7aOvv1+7
374Sd3z6ufvxSWew8EvLhwr+/Mp4jogYA+zGzdAMdgIRRWWvFbfuuWsU5KX9gX4GTQZBmXoYhclNOZoXT2ZW2VYpeMO9i4ogjMIAH1qQx7dCRtjsb4544tWqwM1zGVLTFD6/3ZNhaC62XrXid7hUGUXwXBiI67aD/i4j2J6JhAF4F8BoRdUQvmhA2TsraL27bdz+rONWXP0jLXrfe/uZzGsohqPI/Z3obAJS0ba40dr9fc9ot4K+YjWFvHFQ4KfQ4Mnyi+C4MRHSygMYz88dEdA6APwBYAGANgKWRSiaEjkpZp3syGLNgha9tsNv2XbW629OXU3aYNKZC2bVk9ot5B5HuyZRkthg9aYxzBg34GhO7gPzgFqf5uDo0NyZx8qSDSrJudu3p027hYP2967Z/9hJEt5O5cfAg7d9fpTN8JO6QR8cAJIkoCWAegB8zc5aIAv1FE9EZABYB+AKAI5lZ4gwVwCnjw1yoA+hvg9227yrXjSoAfG5h1ex1e24e8p4gwtlHHYLr5k0s8/Na2z0DwJ6+fry0aXvZ834ZVMh/X/rYBi3ln0yQ4/u6rj2x7Dkv8QnrLkxHySWIcNrUVkw7dKTnOIhRE1DNrpR6ngNsRscA3A5gI4C1AJ4hokMBBI0BvApgfuHYQoVwyvgw8Jp+p+Obt1vdGY/tlPaUHzzuKS3wms51JcYkx4y7V2/Gu9s+xcubd5YYEru2zZlsrihHGGT73fvsl7y/0G3TbkdkjYUYWHc0qlRaO9eLTs1EjrkYP7hh/kTtGIZTzUE1EUfcoRrx1Q2UiAYxc+DcOCJ6GsB3dHcA0g00OOYVseo3TwDeXXJyJcUq4lRABNgrmDFXrXCd8xsHTS7tpa1YWyZ7GUze2ZUum3ugarfsZfdgdNuc8oPHHX3+lRqiHmbmTj1lAQXpBnoZ8s3fPgFwB4ApyMcBHg9bSMX5LwZwMQC0tbVV4pQ1jXk1rvJ3h7EN9vvlcgvC2bmDqlH5A8AuD/UDBDhW1+qwxzLpzPrYwLp7cMoCMnYxC+dMUMYzKrXqDztzRyqLNVtBMPO/E9FsAM0A/hHAXXAxAET0JIADbV66mpl/pysgMy8DsAzI7wB0Pye4E9U2OMgXVcdtErRKVOUuCRsvwV9Gvi+P3972XqtprcrPbTFQDTN1pWI4fHQMgNHR6asA7mLm9UTu/WeZ+YRAkgmRE9WXWvVFvfK+fdO0gvbuD5KtMTTZ4FqHEAdBriloVkvQQrtKIJk74aNjANYQ0eMAxgC4ioiGA6i+b4/gy+0Sxpfael6VAjcGuVgzbtI9GVyxvBuXL+9Gc2NSGRA1Y3ZTpTwq9GpU/kDeMPklaFZLNazw3ZDMnfBxDQIXqn/bAbzDzD1E9BkArcz8iu+TEp2KfIfRFgA9ALqZebbb5yQIrMYusFeJwJzded1cLE4+Z13Mufth5NpXCzM+NxIbP8qU9TOyprJaYwWA/TD1WmqZHNffeC2gCgLrGAACcA6Aw5j5B0TUBuBAZv5zNKKqEQOgJq6ZqarzVsLPnkomMDTZoFWRWim/f9gkEwRwPrVUhXFtdu2sa00x1lPmTpj4zgIC8B/Iu3xmAfgB8tlADwD4UqgSCoGIyz+qOj5DvdIPSxlnsjntAiWd80VpJJIJQq6fXV1bVnR2NsY7ejJZpJIJ3HxWe80qxbjjELWGjtPxKGb+FwC7AYCZdwAYHKlUgmfimpmqOn5rUwo3njm5rP9OsoHgYYRtxYhC+SeIir1tlp4+GfsPTYZ8hnKMrBi3CVuCAOjtALJElEDh+0FELajTIHA1bz+9pHSGeR1O57ULLPb07sWuvd7aLbQ69L5pSiWxM5PVVt6qALNX5W/0unHKWOpnLimou6JCHUGNtNsoOl1W83dA8I6OAfi/AH4L4AAiuh7A6QCuiVSqKqTa28fqZnFEUUzjdF7rln30ghWez+EU5Fw0d4LjYBkr+w/N+8mNuEGyAfCTFLSjN4vGwYNw7vQ25bmtuyOnDCm
i/O5obwjB7ARRJPny1f4dELzjGAQuZABNB7AdwPHI75SfYubXlR+KkDiDwHEFWcPGy3XYNVLzW6lq7j/vlaZUEsOGDCqpWrVWn7q1KjCz0bQq/9xVjwTKSGqA83a41ZSlY9c/35iva9eIzsktZbSZsL7H2k7CerwgLT5q5TtQj/gKAjNzPxH9hJmnAHgjMukGALVShKJ7HXarPfNK1271p3IP6PSeccrl78lki+6fHHOZiwnItyrwM70raDqq2+Yh3ZNBx/1rbTN5jFRWVXtmJ8mGDRmE7oUn2t5zlaENGg+qle+AsA8dF9BTRHQagAfZT+e4GqFWilB0r0OnZ7zZrWBnMK5Y3o2XNm3Hqje2OR4r2UC4Yf6ksmZmTudd/PD6soriEamkVlpoZ1e6aDzCqElwQ5XJM2zIIMyb0uorNmAoXVVWTBQtPmrlOyDsQycL6JsAfgNgDxF9TESfEFHdjYS0myhVyfaxYWV16F6H7qrOeJ+dwWAA96ze7Oj2aW7Mu3cMJZhM6KUI7ejN4prOdcXJVoz8TmF3th+3nNWOhEOq0VUPrivev+mHNWudLwrcpqY1NyahugrVZwyDmMnmivcgrAlbcX8HhPBxNQDMPJyZG5h5MDPvX3i8fyWEqyZUY+sARJ5uZx7hZx7c4udcuuP3dFd1o5pS6OxKOw6aUSnj5sYkdmf70VPI4unJZAHep/ham1JoblSnTt69erMy2Hn2UYcoP2dOlXx5807H6yPkh9So+vIHwTw1zU6xLpwzAedMbyszAoT834D1781u1KOdu8wvcYxuFKJFpxL4CJundwLYFMZMAC9UWyVwpUrT4wi+6fnt1QFMu/da7xOB0Wvj9zdfl3XYiy63nNXuOFeAoDcYxZDHLt1Vt3bArprX+ndiN6TdHEBWDX4xH0eCtIIKVRBYxwX0HwBWA/hZ4Wc18i6hDURUPquujqjUYOk4gm92qz1jJWxe/bn594F9ve7Nnz1taqut8gf2XVdnVxoPrPG3o3KbYDWqKeXJzWV3P24+q125M7AWgS09Y7LjynnelNbiTsCISZgD7c8tmIXWppTtNLNFD613nGVszHyWgjDBik4QeCuAC5h5PQAQ0XjkW0J8F8CDqNBgmGqkUoo5ruCbTtm9TgDTrtf9jCUrle83rkt3eLlXDLeIblqquSe+bsDVbhfodi/d+t2r/q7MWVIq/M58FmobnR3A5w3lDwDM/BqAccz8TnRiDQwq1X4h6uCb3wBzZ1caDZp9HazKy8lIGtcV1Q7HUM4zx7W4vtftPofpF3dbUITxdxXFDlUYuOjsANYT0U8B/Lrw+CwArxHREAD6A09rkEoNlo6yV7vf6k7jc7oplFblNcJhXu4Vy7ux9LENju/xS3Njspg+6uZe0h11GLRBmeH/V93JEal8INzu780PkrcvGOgYgPMB/DOAywuPnwPwHeSV/8xIpBogVHKIRlRdEFVuh8UPry/Jy29uTGLhnAkl12yniIiAQQ1UkvtuZxSdNg6GuyKZICQbyLEVsheSCcLCORMc5TfjVhkdxu9aJ9hu3Cu7v7cdu/bYxlLM1dNWJG9fMHA1AMycIaL/APB7ZrbuHT+NRqyBw0BvT6taDVqLqXb0ZvMVrYCjP5oZGDZ4kGtf+h6Ntg3ZHKMhYOdQI2vGupp3WwUTSgvGouqDo2OIzPfK+vc25QeP2xoAosrtUIWBi6sBIKK5AJYi3wJ6DBG1A/gBM8+NWDYhRFR9fbysrbM5LgYknVIoezJZJBsITY1JbO3JFH3OZsWlm4JpXvx7bdmcIMKNZ062VdBu7iUGSpqnRTWQXMcd47RiVxnSnt7sgBjzKMSLjgtoIYAjATwNAMzcTURjohRKCEZnV9q28ZiBta+PFwyF5eaPzvZz8fx2q2U//mw75e9kFPqZbZVdZ1cau/a6l7AYxVYds8dqZXz5cRG5GUK3FbtbhthA36EK0aKTBZRlZmu5ZN32BKp2OrvS6Lh/rXZnTCtuHhc
jIGnOftHBmn1ifN6pZYMOTpXGqpXz0sc2aM8QNoyXcd2qc/it1rbL8DKuRiejSNozCEHQqQS+E8BTABYAOA3ApQCSzHxJ9OKVUm2VwNWIU0FQWFj96V7O2VoowDIPMw9jqLtTG2RrAHvMghW+BsB8uruvJCCdbCAsPWNy4CrcoMFlGdIiuBFkJvC/ArgawB4A9wJ4FMAPwxUvfuL6EoV93kqk+FldOrruHKOHjfkYN8yfiGGDBwVK9zQMksrtZQ1g68YfrMcoa1RnehikKDCom0bcPIJfdJrB9TLz1cz8pYIFuQvAj6MXrXKE2Wwt7vP6TfFLJRNoUrg57DC7dKzFUE2pZJmytPPVG8fYGUD5m90dux1GexkBbACOBWAqd1KCqGyXYj6m16JAmdkrVANKA0BEk4jocSJ6lYiuI6KDiOgB5N1Br1VOxOgJu6eP7pc7il5CHbPHardUBlBSvTph1HBP5zKvbudNacVzC2bh3SUno3vhiVh6emnvG5XLxdj56MhpJZVsQAPl+/5cvrzbdQeS7sm4FoAZHTRLz5NQFryZg+K6vvi4FhyCYMVpB/AzAP+FvN//rwC6AfwFwN8w883Ri1Y5wuzp4+XLHUUvoXlTWrH09MmObZQNWptSeHfJyUUf9fN/2e7pXE6K22wQjEZmqmPYKU8zyQbCOZZGdOdOb0NfP3saMJ8gcs27N4yhtbWDk/zG9eq2hKhUE0FBcMMpBjCEmX9Z+P8GIrqUmb9bAZkqTpjN1rzki0fZ5K1x8KBiIdbMcS22LZu39mSKQ9qJvKV2ec00cSpKMu7LlfettV1p7zd0EK6bN7HkuRlLVnoOHOeYHY2rWR4/U7Z0ffEyWlGoFpx2AEOJaAoRHVGYCbDH8rhmsFuBJhOEXXv6PPtovXy5o0jhs9uBPLAmjdOmtpb5+M3q08tURD8Nz9xWyPOmtKJfIYRdsZMfZZlKNqBJsTNqIDheU5hN3yrVRFAQ3HDaAbwP4CbT4/9neswAambChLVisqmQ8mdkpngp+/eyqo+iUlO1A/n92vfxye7g83sI5T1yDNwymtxWyKp710CEMQtWYEQqWWwx0eBjlq9q6DwA7D80GXnTNwNp0SBUC0oDwMyRNXojoqUA5gDYi3xc4Z+YuSeq8+lg/nLPWLKyLJ1Qt+zf65c77BQ+p57xYaBapV7TuQ73rN5c3FX46ZWjSic1FL35GlTK3615nMoIBMlEMtBN6ZUWDUK1oFMJHAVPAPgiM08C8CaAq2KSw5agOd1xzk0N6kYwJlnZpXI6ZbWYlb+B18Cm9d55rRJOJRuKk7e80kAUKAvHa2aPNUguyl+IA51CsNBhZvMUsdUATo9DDhV+grN+CrqiKD4L0jPeXNnqRT6nXvZbC6mXTsdRvT6mEKDWJ28wOmaPxRXLu21lMgbR2+0ygnT39BL8l8pdoVpwbAVBRATgYGZ+LzIBiB4GsJyZ71a8fjGAiwGgra1t6qZNm6ISpYjXYe9+hsNHOVDeqmB69/Zp9Qa65ax2X+d2aq0wbHACvXtzJa8nE4RhgwdhZyZbjLfYDUzXHdloxmi9YHVJmY8LqDOO/A5QV90DAvDukpOLj6P8vQuCCl9D4TlvHR7xecInC0Vk1p9TTO+5GkAfgHscZFjGzNOYeVpLi/sIvzDw6sbxk9cdZS641b2wcM4Exzx7IH+NfhWQ085ol0X5A/kK2p5MFox8iwWrzz6TzeHK+9Yi3ZNxbU5nxXDTXTdvYnFou/V36JRx5DcVUzezR2oAhGpCxwX0MhF9iZlf9HJgZj7B6XUiOh/A3wM4nt060sWAl+Csn5hBJXPBzUFHQ6laV8ZeM1Cu6VyHe194DznOD21pAKDOsfGOsTpn7Gsj0aSRBWRWuE6/w7BrMHSD/1IDIFQTOgbgKADnENEmALtQ+D4WAri+IKKTAHwXwHHM3Ov3ONWCH2USZRGYHWZlGNQHfU3nupJ5AsYCPtkAOGRa+saY6GV2zdi5Usy1G27
XFXYqpm5mT6V/74LghI4BmB3BeX8MYAiAJ/JhBqyOo710WPhRJmErIJVSVz0fxN987wv2IaEolL+BdYUctHYjilRMnfsqNQBCNeE6D6D4RqIDAAw1HjOzv5FSAajmeQBxZgGpAounTW0tawERJOBoyBv2vAFzUFjl2nELzgbpx19pJAtIqDSqILDOQJi5AG4EMArAhwAOBfA6M0+IQlAnqtkARIWOslApv4RPZaqSw296qRPWub1+s2R0s3AEoR4JMhDmhwCmA3iSmacQ0UwA54YtoFCOVRmq3BqqAKKqWtZqLHTy9FVpk0Gxzu3165oR37ogeEfHAGSZ+SMiaiCiBmZeRUS3RC2YoF9cpFJ+qh0AIa/UjRiBk5ExXo9C+RuyW/EToxDfuiB4R8cA9BDRfgCeBXAPEX2IfDaQECGdXWmlrz3dk8GMJSuLK2RVu2eV0magaETcjIxb//wghKmgpb+OIHhHJwYwDEAG+VTvcwCMAHAPM38UvXil1EsMwM3fbpfHf0TbCDzncaBLUyqpbBJn+M79DFB3IkGEfmZR0IJQQXzHAJh5FxEdCuBwZv4VETUCcC4rFQLhtOpWzdb973e8KX/AuUOo4ZrxM0BdhZGZtOqNbdjakymZKRwUyawRBO+4dgMloosA3A/g9sJTrQA6I5Sp7nGqClWtxh06IHvG7JpxG9eoS2tTqpiWGvYsXJmxKwj+0GkH/S8AZgD4GACY+S0AB0QpVD1iHiTfoGiD3NqU8tXq2At2k7rMM3G99uYB8q6m5xbMwqo3tkXSB0f66wiCP3QMwB5m3ms8IKJB8DY+VnDBuoK1C94aq3LVGMlkCJMdEkS2vemN5nKtTSnHX3wyQWV/UMkGwqK5+ZKRqPrgSH8dQfCHThbQH4no+wBSRPQVAP8M4OFoxaovVD5/p4CpUZGbIEImm4PH2Sm2uKV6OinU1oKMhmx2vviocvWlBkAQ/KFjABYAuADAOgDfRL499B1RClXrWAOWqiBrjhkbC1WshotIlfqpk6afbCDsN3SQcjaAm3tJJau1srhSDdiiPq4g1Do6WUD9AH5W+BECYld4pcIYiWj3GbsRjCoIKGsQ50dhBlW0UeXqSw2AIPjD1QAQ0QwAi5DvATQI+9pBHxataLWJl8IqwyVj9xkvQRhrLxy/CjMMRRu0E2mljysItYyOC+hOAFcAWAMgmpLQOsJLYNJwyQQJZqrcOn4VpihaQagddAzATmb+Q+SS1DiG31+1cnea0qXyvdsVhZkxH8PcytnoEdTa5D43QBCE2kWnFcQS5Ct/HwSwx3iemV+OVrRyBmorCLfWDtYKWasCdur3b/7MzHEttsdwOr9qboBhXFrFGAjCgCdIO+ijCv+aP8wAqmvKRhXj5PdvdVDcBrq+986uNFa9sc3T+TPZXHG2rxnjkdtkLUEQBi46WUAzKyFILaPy4RPKM2tUCtfN9+7U1tkthuCW/2/XgloQhIGP0gAQ0bnMfDcRfdvudWa+KTqxagunQiXdnv9uOB3HraGbam6AGamqFYTaw6mBwLDCv8NtfvaLWK6qwtynZ8aSlZ6bjKnaN3TMHhtaGwOn4zg1dEslEzj7qENcG74x8qMnr+lcF+he6BD0fguCoIdyB8DMtxf+XWx9jYguj1CmqkJ3LKMTTj581ZB1r20MnHYZ5vOrsoCmHTqy+Loquyjdk8HdqzeXPA47PhDG/RYEQQ/XLCDbDxFtZua2CORxJI4sINXAdT+D1e3wOwQ9quMYx1IZJjvCuhdA9PdbEOqRIFlAtscLKM+AIepOk2G1MQizHYIRcNadBuZ0L7zWF0hnT0GoHH4NQN20g1a5VkakkqGdI6zq2rCrdHWngancVX7cOdLZUxAqhzIITESfENHHNj+fABhVQRljZea4FtvnP96drfngpM40MKdmcH4GtTgFzAVBCBelAWDm4cy8v83PcGb2u3MYcNgVVgH5EYy1PnHKPA2MkPfDnzu9reSxU4zBjzvH7px
+4hiCILgTiyInoh8COAVAP4APAZzPzFvjkMUNJ2VVD37pIG4lv+4caTgnCJUhhEGCvljKzJOYuR3A7wFcW4mT+skvd1JW4pd2Rtw5glDdxLIDYOaPTQ+HoQJBZb/55R2zx6Lj/rXI5kpFTDaQJ0VW6W6b1dDdUwa1CEJ1E5svn4iuB3AegJ0AIu835LflgvHa4ofXF0cpNqWSWDR3Qkm3TiclV+nipmoqphJ3jiBUL74KwbQOTPQkgANtXrqamX9net9VAIYy80LFcS4GcDEAtLW1Td20aZMveVQ57YTyiVle0CnAqnRxkxRTCYJgJuxCMFeY+QTNt96D/KB5WwPAzMsALAPylcB+5Ykqv1xnZ1Hp4qYg56sG15EgCJUhliAwER1uengKgDeiPmdUAUkdZasyMlEFkf2ez9jNpHsyYOxzHdV6vYMg1CtxZQEtIaJXiegVACcCuCzqE0aVX66jbCudDeP3fH4KtwRBGLjElQV0WhznjSIgaR3oApQr20pnw/g9n/ThEYT6om4qeqNCV9lWOhvGz/mkD48g1BdiAEKgVlIddXYzgiDUDjVvACSrRR8p3BKE+qKmDUA1FUQNFGplNyMIgjtxZQFVBMlqEQRBUFPTOwC/WS3iNhIEoR6o6R2An4IoKYYSBKFeqGkD4KcgStxGgiDUCzXtAvKT1SLFUIIg1As1bQAA71ktUgwlCEK9UNMuID/IFCtBEOqFmt8BeEWKoQRBqBfEANggxVCCINQD4gISBEGoU2QHUOVIUZogCFEhBqCKkV5GgiBEibiAqhgpShMEIUrEAFQxUpQmCEKUiAGoYio9TF4QhPpCDEAVI0VpgiBEiQSBqxgpShMEIUrEAFQ5UpQmCEJUiAtIEAShThEDIAiCUKeIARAEQahTxAAIgiDUKWIABEEQ6hRi5rhl0IaItgHYFLccIfJZAH+NW4iIqfVrrPXrA+Qaa4FDmbnF+uSAMgC1BhG9xMzT4pYjSmr9Gmv9+gC5xlpGXECCIAh1ihgAQRCEOkUMQLwsi1uAClDr11jr1wfINdYsEgMQBEGoU2QHIAiCUKeIARAEQahTxADEDBH9kIheIaJuInqciEbFLVOYENFSInqjcI2/JaKmuGUKGyI6g4jWE1E/EdVUKiERnUREG4jobSJaELc8YUNEPyeiD4no1bhliQMxAPGzlJknMXM7gN8DuDZmecLmCQBfZOZJAN4EcFXM8kTBqwDmA3gmbkHChIgSAH4C4O8AjAdwNhGNj1eq0PklgJPiFiIuxADEDDN/bHo4DEBNReWZ+XFm7is8XA3g4DjliQJmfp2ZN8QtRwQcCeBtZn6HmfcC+DWAU2KWKVSY+RkA2+OWIy5kIEwVQETXAzgPwE4AM2MWJ0q+AWB53EII2rQCeM/0eAuAo2KSRYgAMQAVgIieBHCgzUtXM/PvmPlqAFcT0VUAvgVgYUUFDIjb9RXeczWAPgD3VFK2sNC5RkEYaIgBqADMfILmW+8B8AgGmAFwuz4iOh/A3wM4ngdo4YmH32EtkQZwiOnxwYXnhBpBYgAxQ0SHmx6eAuCNuGSJAiI6CcB3Acxl5t645RE88SKAw4loDBENBvAPAB6KWSYhRKQSOGaI6AEAYwH0I9/q+hJmrplVFhG9DWAIgI8KT61m5ktiFCl0iOhUALcCaAHQA6CbmWfHKlRIENFXAdwCIAHg58x8fbwShQsR3Qvgy8i3g/4AwEJmvjNWoSqIGABBEIQ6RVxAgiAIdYoYAEEQhDpFDIAgCEKdIgZAEAShThEDIAiCUKeIARAEC0Q0j4iYiMa5vO9yImoMcJ7ziejHfj8vCEERAyAI5ZwN4E+Ff524HIBvAyAIcSMGQBBMENF+AP4WwAXIV76CiBJE9CMierUw1+BfiehSAKMArCKiVYX3fWo6zulE9MvC/+cQ0QtE1EVETxLR/6j0dQmCHdILSBBKOQXAo8z8JhF9RERTkW+LPBpAOzP3EdFIZt5ORN8GMJOZ/+p
yzD8BmM7MTEQXIt8a48ooL0IQdBADIAilnA3g3wv//3Xh8RgAtxlzDZjZa//4gwEsJ6KDAAwG8G5IsgpCIMQACEIBIhoJYBaAiUTEyPe/YeSboulg7qsy1PT/WwHcxMwPEdGXASwKLKwghIDEAARhH6cDuIuZD2Xm0cx8CPKr9bUAvklEg4CioQCATwAMN33+AyL6AhE1ADjV9PwI7Guj/PVIr0AQPCAGQBD2cTaA31qeewDAQQA2A3iFiNYC+FrhtWUAHjWCwAAWID/X+XkA75uOsQjAb4hoDQC3eIEgVAzpBioIglCnyA5AEAShThEDIAiCUKeIARAEQahTxAAIgiDUKWIABEEQ6hQxAIIgCHWKGABBEIQ65f8DjvuoYpTPeQcAAAAASUVORK5CYII=", "text/plain": [ "
# count syllables: https://stackoverflow.com/questions/46759492/syllable-count-in-python
def syllable_count(word):
    """Estimate the number of syllables in ``word`` by counting vowel groups.

    A "vowel group" is a maximal run of the letters ``aeiouy``; each group
    counts as one syllable, and a trailing ``e`` is treated as silent.
    The notebook also applies this function to whole excerpts, in which case
    spaces and punctuation simply act as consonants separating vowel groups.

    Parameters
    ----------
    word : str
        A word (or longer text). Matching is case-insensitive.

    Returns
    -------
    int
        0 for an empty string, otherwise an estimate that is at least 1.
    """
    # Upper-case vowels must count too ("Under" starts with a vowel).
    word = word.lower()

    # Guard: word[0] below would raise IndexError on empty input.
    if not word:
        return 0

    vowels = "aeiouy"
    count = 1 if word[0] in vowels else 0

    # A new syllable starts wherever a vowel follows a non-vowel.
    for previous, current in zip(word, word[1:]):
        if current in vowels and previous not in vowels:
            count += 1

    # Heuristic: a final "e" is usually silent.
    if word.endswith("e"):
        count -= 1

    # Every non-empty word has at least one syllable.
    return max(count, 1)
def punctsPerSentence(text, pipeline=None):
    """Return the average number of punctuation characters per sentence.

    The text is split into sentences by a spaCy-style pipeline (``doc.sents``),
    and the characters ``! , ' ; " . - ?`` are counted in each stripped
    sentence.

    Parameters
    ----------
    text : str
        The text to analyse.
    pipeline : callable, optional
        Callable that returns an object with a ``sents`` iterable whose items
        expose ``.text``. Defaults to the notebook-level ``nlp`` object, which
        is what the original implementation always used.

    Returns
    -------
    float
        ``punctuation count / sentence count``, or ``0.0`` when the pipeline
        yields no sentences (previously a ZeroDivisionError).
    """
    punctuation = frozenset("!,';\".-?")

    # Fall back to the module-level spaCy pipeline when none is supplied.
    doc = (nlp if pipeline is None else pipeline)(text)

    n_puncts = 0
    n_sentences = 0
    for sent in doc.sents:
        n_sentences += 1
        n_puncts += sum(1 for ch in sent.text.strip() if ch in punctuation)

    # Empty text produces no sentences; avoid dividing by zero.
    if n_sentences == 0:
        return 0.0

    return n_puncts / n_sentences
def _add_readability_features(df):
    """Add Flesch scores and lexical statistics to ``df`` (modified in place).

    Expects the columns ``excerpt``, ``nof_words``, ``nof_sentences`` and
    ``nof_syllables`` to exist already. Adds ``flesch_score``,
    ``flesch_score2``, ``nof_unique_words``, ``txt_diversity``,
    ``longest_word`` and ``avg_word``. Returns ``df`` for convenience.
    """
    # nof_sentences is a count of '.' and can be 0; treat such an excerpt as a
    # single sentence so the ratios below stay finite for the regression.
    sentences = df['nof_sentences'].clip(lower=1)
    words_per_sentence = df['nof_words'] / sentences
    syllables_per_word = df['nof_syllables'] / df['nof_words']

    # Flesch reading ease: https://blog.ung.edu/press/measure-readability/
    df['flesch_score'] = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
    # Second score: same two ratios with the Flesch-Kincaid grade-level constants.
    df['flesch_score2'] = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59

    # Lexical statistics on the same space-separated tokens used for nof_words.
    tokens = df['excerpt'].apply(lambda s: s.split(' '))
    df['nof_unique_words'] = tokens.apply(lambda toks: len(set(toks)))
    df['txt_diversity'] = df['nof_unique_words'] / df['nof_words']

    token_lengths = tokens.apply(lambda toks: [len(tok) for tok in toks])
    df['longest_word'] = token_lengths.apply(max)
    df['avg_word'] = token_lengths.apply(np.mean)
    return df


print("counting unique words")
# One helper for both frames keeps train and test features in sync.
_add_readability_features(xtrain)
_add_readability_features(xtest)


# Progress/sanity print: show every 100th training excerpt with its target.
for nText, (text, target) in enumerate(zip(xtrain.excerpt, xtrain.target), start=1):
    if nText % 100 == 0:
        print()
        print("--------------------------------------------------------------------------------")
        print("text", nText, text[:77], "target", target)
        print()


base_features = ['nof_words', 'nof_sentences', 'nof_syllables', 'flesch_score',
                 'txt_diversity', 'nof_unique_words', 'nof_char', 'w2c',
                 'flesch_score2', 'punctsPerSentence']

useSpaCyFeatures = True

if useSpaCyFeatures:

    print("getting spaCy features")

    # NOTE(review): disable_pipes() is called without pipe names, so presumably
    # no component is actually disabled -- confirm against the spaCy version in use.
    with nlp.disable_pipes():
        train_vectors = np.array([nlp(text).vector for text in xtrain.excerpt])
        test_vectors = np.array([nlp(text).vector for text in xtest.excerpt])

    # One column per embedding dimension: f0, f1, ...
    namelist = ['f' + str(ii) for ii in range(train_vectors.shape[1])]

    print("namelist", namelist)

    train_vectors = pd.DataFrame(train_vectors, columns=namelist)
    test_vectors = pd.DataFrame(test_vectors, columns=namelist)

    print("train_vectors", train_vectors.head())

    # combined: hand-crafted features + document vectors
    xtrain = pd.concat([xtrain, train_vectors], axis=1)
    xtest = pd.concat([xtest, test_vectors], axis=1)

    features = base_features + namelist

else:

    features = list(base_features)

print("xtrain", xtrain.head())
print()

# Hold out every 5th row for validation and train on the REST.
# The previous split trained on a random 80% sample that overlapped these
# held-out rows, which leaked validation data into training and made the
# reported error optimistic.
is_validation = np.zeros(len(xtrain), dtype=bool)
is_validation[4::5] = True

X = xtrain.loc[~is_validation, features]
y = xtrain.loc[~is_validation, 'target']

Xtest = xtrain.loc[is_validation, features]
ytest = xtrain.loc[is_validation, 'target']

testTexts = xtrain.loc[is_validation, 'excerpt']

print(len(X), len(Xtest), len(y), len(ytest))

print("X", X.head(7))
print()
print("y", y.head(7))
print()
print("Xtest", Xtest.head(7))
print()
print("ytest", ytest.head(7))
print()
print("testTexts", testTexts.head(7))
print()


# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training rows only
regr.fit(X, y)

# Make predictions for the held-out validation rows
y_pred = regr.predict(Xtest)

validation_mse = mean_squared_error(ytest, y_pred)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print('Mean squared error: %.10f' % validation_mse)
print('Root Mean squared error: %.10f' % math.sqrt(validation_mse))
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f'
      % r2_score(ytest, y_pred))
print("sklearn RMSE", np.sqrt(validation_mse))


def plot_prediction(x, y):
    """Scatter-plot predictions ``y`` against actual values ``x`` on the current axes."""
    plt.scatter(x, y, label='Predictions')
    plt.xlabel('Actual')
    plt.ylabel('Linear Regression Prediction')
    plt.legend()


plot_prediction(ytest, y_pred)


print("ytest is a", type(ytest), " y_pred is a", type(y_pred))


# Per-excerpt validation results; index is inherited from testTexts so the
# target Series aligns row-for-row with the prediction array.
testResults = testTexts.to_frame('excerpt').assign(predregr=y_pred, target=ytest)

print("testTexts is a", type(testTexts), " testResults is a", type(testResults))

print(testResults.head(7))

# Score the competition test set and write the submission file.
# Xtest / y_pred are rebound here, as in the original script.
Xtest = xtest[features]

y_pred = regr.predict(Xtest)

xsub = xtest[["id"]].copy()
xsub["target"] = y_pred
xsub.to_csv('regression.csv', index=False)
"\n", "\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Transformers XLNet" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "execution": { "iopub.execute_input": "2021-06-25T20:29:57.316067Z", "iopub.status.busy": "2021-06-25T20:29:57.315669Z", "iopub.status.idle": "2021-06-25T20:37:56.431985Z", "shell.execute_reply": "2021-06-25T20:37:56.430279Z", "shell.execute_reply.started": "2021-06-25T20:29:57.316036Z" }, "trusted": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "All model checkpoint layers were used when initializing TFXLNetForSequenceClassification.\n", "\n", "All the layers of TFXLNetForSequenceClassification were initialized from the model checkpoint at ./savedmodel/5.\n", "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLNetForSequenceClassification for predictions without further training.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "nr of testData 566\n", "\n", "Once upon a time there were Three Bears who lived\n", "One day he had gone beyond any point which he had\n", "Aunt Abigail was gone, Eleanor was gone. 
The room\n", "Father had been away in the country for three or \n", "One beautiful misummer night in 18— a large, heav\n", "Before Fred could complete the sentence his foot \n", "When Josie arrived at her destination she went to\n", "n 10 predictedScore -0.6352807\n", "n 20 predictedScore -0.57780975\n", "n 30 predictedScore -2.1973433\n", "n 40 predictedScore 0.3600465\n", "n 50 predictedScore -1.5519806\n", "n 60 predictedScore -0.85689175\n", "n 70 predictedScore -0.09774914\n", "n 80 predictedScore -0.66253567\n", "n 90 predictedScore 0.100836724\n", "n 100 predictedScore -1.6850315\n", "n 110 predictedScore -2.3307252\n", "n 120 predictedScore -1.9078956\n", "n 130 predictedScore -0.14376666\n", "n 140 predictedScore -0.5901213\n", "n 150 predictedScore -1.5638647\n", "n 160 predictedScore -0.7154779\n", "n 170 predictedScore -0.65485036\n", "n 180 predictedScore -0.527873\n", "n 190 predictedScore -2.7968028\n", "n 200 predictedScore -0.9472739\n", "n 210 predictedScore 0.5601345\n", "n 220 predictedScore 0.58908087\n", "n 230 predictedScore 0.029123785\n", "n 240 predictedScore -1.9193069\n", "n 250 predictedScore -1.1744077\n", "n 260 predictedScore -2.4476063\n", "n 270 predictedScore -2.6000533\n", "n 280 predictedScore -0.71867293\n", "n 290 predictedScore -2.2777648\n", "n 300 predictedScore -0.74542314\n", "n 310 predictedScore -0.2118968\n", "n 320 predictedScore -0.7084438\n", "n 330 predictedScore -3.1341808\n", "n 340 predictedScore -1.6591183\n", "n 350 predictedScore -2.861485\n", "n 360 predictedScore -0.39417243\n", "n 370 predictedScore -3.2452447\n", "n 380 predictedScore 0.39630795\n", "n 390 predictedScore 0.33230662\n", "n 400 predictedScore 0.089797206\n", "n 410 predictedScore -0.9605478\n", "n 420 predictedScore -2.0486407\n", "n 430 predictedScore -2.3067622\n", "n 440 predictedScore -0.46318254\n", "n 450 predictedScore -1.3052845\n", "n 460 predictedScore -1.2545989\n", "n 470 predictedScore -0.4134524\n", "n 480 predictedScore 
-1.9913256\n", "n 490 predictedScore -1.0554487\n", "n 500 predictedScore 0.115702085\n", "n 510 predictedScore -0.66827846\n", "n 520 predictedScore 0.11224969\n", "n 530 predictedScore -2.9705956\n", "n 540 predictedScore -1.5697746\n", "n 550 predictedScore -3.2411354\n", "n 560 predictedScore -0.817201\n", "prediction done\n" ] }, { "ename": "ValueError", "evalue": "Length of values (566) does not match length of index (7)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 506\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 507\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 508\u001b[0;31m \u001b[0mpredict_testdata\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtestData\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 509\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 510\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36mpredict_testdata\u001b[0;34m(texts)\u001b[0m\n\u001b[1;32m 420\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"prediction done\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 421\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 422\u001b[0;31m \u001b[0mtestResults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'target'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpredictedScores\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 423\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 424\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", 
"\u001b[0;32m/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 3161\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3162\u001b[0m \u001b[0;31m# set column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3163\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_set_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3164\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3165\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_setitem_slice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mslice\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_set_item\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 3240\u001b[0m \"\"\"\n\u001b[1;32m 3241\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ensure_valid_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3242\u001b[0;31m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sanitize_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3243\u001b[0m 
\u001b[0mNDFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_set_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3244\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_sanitize_column\u001b[0;34m(self, key, value, broadcast)\u001b[0m\n\u001b[1;32m 3897\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3898\u001b[0m \u001b[0;31m# turn me into an ndarray\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3899\u001b[0;31m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msanitize_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3900\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mIndex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3901\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/pandas/core/internals/construction.py\u001b[0m 
in \u001b[0;36msanitize_index\u001b[0;34m(data, index)\u001b[0m\n\u001b[1;32m 750\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 751\u001b[0m raise ValueError(\n\u001b[0;32m--> 752\u001b[0;31m \u001b[0;34m\"Length of values \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 753\u001b[0m \u001b[0;34mf\"({len(data)}) \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 754\u001b[0m \u001b[0;34m\"does not match length of index \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: Length of values (566) does not match length of index (7)" ] } ], "source": [ "# -*- coding: utf-8 -*-\n", "\"\"\"\n", "Created on Fri May 28 14:00:29 2021\n", "\n", "@author: Jacob\n", "\"\"\"\n", "\n", "from transformers import BertTokenizer\n", "from transformers import TFBertForSequenceClassification\n", "\n", "from transformers import XLNetTokenizer\n", "from transformers import TFXLNetForSequenceClassification\n", "\n", "# from transformers import AutoModelForSequenceClassification\n", "# from transformers import TrainingArguments\n", "# from transformers import Trainer\n", "\n", "import tensorflow_datasets as tfds\n", "import tensorflow as tf\n", "\n", "import csv\n", "import os\n", "import sys\n", "import gc\n", "\n", "import matplotlib.pyplot as plt\n", "\n", "import numpy as np\n", "import pandas as pd\n", "from pandas import DataFrame\n", "\n", "import codecs\n", "import math\n", "from sklearn.metrics import mean_squared_error \n", "\n", "\n", "# max_length_text = 7\n", "# batch_size = 100\n", "# number_of_epochs = 1\n", "\n", "# max_length_text = 100\n", "# batch_size = 50\n", "# number_of_epochs = 3\n", 
# Oversampling schedule for the training split. Each entry is
# (exclusive upper bound on the score, number of copies); a score at or above
# the last bound falls through to _OVERSAMPLE_TAIL_COPIES. Both ends of the
# score range are duplicated more heavily than the middle.
_OVERSAMPLE_BANDS = (
    (-3.1, 10),
    (-2.6, 4),
    (-2.1, 2),
    (0.1, 1),
    (0.6, 2),
    (1.2, 5),
)
_OVERSAMPLE_TAIL_COPIES = 10


def _oversample_copies(score):
    """Return how many copies of an excerpt with this score enter the training split."""
    for upper_bound, copies in _OVERSAMPLE_BANDS:
        if score < upper_bound:
            return copies
    return _OVERSAMPLE_TAIL_COPIES


def load_train_validate_lit_data(data_dir, train_csv='../input/commonlitreadabilityprize/train.csv'):
    """Load the CommonLit training CSV and split it into train/validate frames.

    The first CSV row is treated as a header. Data rows are numbered from 1;
    every row whose number is a multiple of 5 goes to the validation split and
    all other rows go to the training split, duplicated according to
    `_OVERSAMPLE_BANDS`. Both splits are shuffled with `np.random.shuffle`,
    so the row order depends on NumPy's global random state.

    Side effect: the text and score of every validation row are also appended
    to the module-level lists `testData` and `testScores`.

    Args:
        data_dir: Unused. Kept only so existing calls keep working.
        train_csv: Path of the CSV file to read. The text is taken from
            column index 3 and the score from column index 4 (presumably
            `excerpt` and `target` in the competition file - confirm against
            the header if the file layout changes).

    Returns:
        A `(train, validate)` tuple of pandas DataFrames, each with the
        columns `text` and `score`.

    Raises:
        IndexError: if a data row has fewer than five columns.
        ValueError: if a score cannot be parsed as a float.
    """
    train_rows = []
    validate_rows = []

    with open(train_csv, newline='', encoding='utf8') as trainfile:
        corpus = csv.reader(trainfile, delimiter=',', quotechar='"')
        next(corpus, None)  # skip the header row; tolerate an empty file

        for n, row in enumerate(corpus, start=1):
            text = row[3]
            score = float(row[4])

            if n % 5 == 0:
                validate_rows.append([text, score])
                # The validation rows double as the notebook's local test sample.
                testData.append(text)
                testScores.append(score)
            else:
                train_rows.extend([text, score] for _ in range(_oversample_copies(score)))

    np.random.shuffle(train_rows)
    np.random.shuffle(validate_rows)

    train = pd.DataFrame(train_rows, columns=['text', 'score'])
    validate = pd.DataFrame(validate_rows, columns=['text', 'score'])
    return train, validate
def convert_example_to_feature(text):
    """Tokenize one text into fixed-length model inputs.

    Relies on the module-level `tokenizer` and `max_length_text`.

    Args:
        text: The raw text to encode.

    Returns:
        The encoding returned by `tokenizer.encode_plus`. It is requested with
        `input_ids`, `token_type_ids` and `attention_mask`, each padded or
        truncated to `max_length_text` entries.
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,        # add the model's special tokens
        max_length=max_length_text,     # fixed sequence length fed to the model
        padding='max_length',           # replaces the deprecated pad_to_max_length=True
        truncation=True,                # cut longer texts explicitly instead of relying on the implicit fallback
        return_token_type_ids=True,     # encode_examples reads 'token_type_ids'
        return_attention_mask=True,     # lets the model ignore padding positions
    )
# Load the XLNet tokenizer. A failure here is allowed to propagate: catching it
# and printing a message only postponed the crash to the next statement (or,
# worse, let the cell continue with a stale `tokenizer` left in the kernel).
# NOTE(review): do_lower_case=True is combined with a directory named
# "xlnet-base-cased". It is kept as-is because the saved fine-tuned weights
# were presumably produced with this setting - confirm before changing it.
tokenizer = XLNetTokenizer.from_pretrained('../input/xlnet01/hf_xlnet-base-cased', do_lower_case=True)

# Smoke test: make sure the tokenizer pipeline works before loading any data.
test_sentence = 'Test tokenization sentence. Followed by another sentence'
transformer_input = convert_example_to_feature(test_sentence)

train_data, validate_data = load_train_validate_lit_data(data_dir="data/CommonLit/")

# Small labelled sample for eyeballing results.
test_data = validate_data.head(7)

# Each dataset element is (one-element array holding the text, score).
# The features stay two-dimensional (one column) because encode_examples
# reads the text as text[0].
ds_train = tf.data.Dataset.from_tensor_slices(
    (train_data[['text']].values, train_data['score'].values)
)
ds_validate = tf.data.Dataset.from_tensor_slices(
    (validate_data[['text']].values, validate_data['score'].values)
)
def encode_examples(ds, limit=-1):
    """Tokenize every (text, label) element of `ds` into a model-ready dataset.

    Each text is encoded with `convert_example_to_feature`; the resulting
    lists are turned into a `tf.data.Dataset` and mapped through
    `map_example_to_dict`. Every 300th example is printed as a progress
    sample, and the min/mean/max number of input ids is printed at the end.

    Args:
        ds: Dataset yielding `(text, label)` pairs. The text is read as
            `text[0].decode()`, i.e. a one-element sequence holding bytes.
        limit: When greater than 0, only the first `limit` elements are used.

    Returns:
        An unbatched `tf.data.Dataset` produced by `map_example_to_dict`.

    Raises:
        ValueError: if `ds` yields no elements (previously this surfaced as a
            ZeroDivisionError while computing the mean length).
    """
    if limit > 0:
        ds = ds.take(limit)

    # Parallel lists, later sliced into one dataset.
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []

    for n, (text, label) in enumerate(tfds.as_numpy(ds)):
        transformer_input = convert_example_to_feature(text[0].decode())

        if n % 300 == 0:
            print()
            print("******* text", text[0])
            print()
            print()
            print("nr input ids", len(transformer_input['input_ids']))
            print()

        input_ids_list.append(transformer_input['input_ids'])
        token_type_ids_list.append(transformer_input['token_type_ids'])
        attention_mask_list.append(transformer_input['attention_mask'])
        label_list.append([label])  # one-element list: labels end up with shape (N, 1)

    if not input_ids_list:
        raise ValueError("encode_examples: the dataset is empty, nothing to encode")

    lengths = [len(ids) for ids in input_ids_list]
    print("minNids", min(lengths), "meanNids", sum(lengths) / len(lengths), "maxNids", max(lengths))

    return tf.data.Dataset.from_tensor_slices(
        (input_ids_list, attention_mask_list, token_type_ids_list, label_list)
    ).map(map_example_to_dict)
def predict_testdata():
    """Predict a score for every excerpt in the competition test set.

    Reads `../input/commonlitreadabilityprize/test.csv`, encodes each excerpt
    with `convert_example_to_feature`, runs the module-level `model` on chunks
    of `batch_size` excerpts and writes the columns `id` and `target` to
    `xlnet.csv` in the working directory.

    The model receives `input_ids`, `token_type_ids` and `attention_mask`,
    the same three inputs `map_example_to_dict` supplies during training.
    Feeding only `input_ids` would let the model attend to padding positions
    at prediction time.

    Returns:
        None. The result is the `xlnet.csv` file.
    """
    xtest = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
    texts = xtest['excerpt']

    print("test texts", texts.head())

    feature_names = ("input_ids", "token_type_ids", "attention_mask")
    predictedScores = []

    # One predict() call per chunk instead of one per excerpt.
    for start in range(0, len(texts), batch_size):
        chunk = texts.iloc[start:start + batch_size]
        encoded = [convert_example_to_feature(text) for text in chunk]
        model_inputs = {
            name: np.array([features[name] for features in encoded])
            for name in feature_names
        }

        preds = model.predict(model_inputs, batch_size=batch_size, verbose=False)

        # logits has one column because the model is built with num_labels=1.
        predictedScores.extend(preds.logits[:, 0])
        print("n", start + len(chunk), "predictedScore", predictedScores[-1])

    print("prediction done")

    testResults = xtest[["id"]].copy()
    testResults['target'] = predictedScores
    testResults.to_csv('xlnet.csv', index = False)
print()

# True: fine-tune XLNet and write checkpoints. False: load the saved best
# model and write xlnet.csv for the ensemble cell.
# NOTE(review): xlnet.csv is only produced when training is False, yet the
# Ensemble cell reads it - confirm which mode a full run is meant to use.
training = True

# Number of fit()/checkpoint rounds; round k writes ./savedmodel/<k>/.
NUM_TRAINING_ROUNDS = 8

if training:

    print("training dataset")
    ds_train_encoded = encode_examples(ds_train)
    # Reshuffle on every new iterator. Each fit() call below starts a fresh
    # pass and consumes only steps_per_epoch batches, so without shuffling
    # every round would train on the same leading batches.
    ds_train_encoded_batched = (
        ds_train_encoded
        .shuffle(10000, reshuffle_each_iteration=True)
        .batch(batch_size)
    )

    print("validation dataset")
    ds_validate_encoded = encode_examples(ds_validate)
    ds_validate_encoded_batched = ds_validate_encoded.batch(batch_size)

    # recommended learning rate for Adam 5e-5, 3e-5, 2e-5
    learning_rate = 2e-5

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)
    loss = tf.keras.losses.MeanSquaredError()

    # Place tensors on the CPU
    with tf.device('/CPU:0'):

        # Load and compile once. The model keeps training across rounds, so
        # reloading the checkpoint that was just written is unnecessary and
        # the optimizer stays bound to a single set of variables.
        model = TFXLNetForSequenceClassification.from_pretrained('../input/xlnet01/hf_xlnet-base-cased/', num_labels=1)
        model.compile(optimizer=optimizer, loss=loss, metrics=[])

        for step in range(NUM_TRAINING_ROUNDS):

            print("start training with max_length_text", max_length_text, "batch_size", batch_size, "step", step)

            history = model.fit(ds_train_encoded_batched, epochs=number_of_epochs, steps_per_epoch=10, validation_data=ds_validate_encoded_batched)

            print("training done step " + str(step))

            checkpoint_dir = './savedmodel/' + str(step + 1) + '/'
            os.makedirs(checkpoint_dir, exist_ok=True)
            model.save_pretrained(checkpoint_dir, saved_model=False)

            gc.collect()

else:
    # use saved model for prediction
    with tf.device('/CPU:0'):

        model = TFXLNetForSequenceClassification.from_pretrained('../input/xlnet01/savedmodel150_2best', num_labels=1)

        print()
        print("nr of testData", len(testData))
        print()
        for testText in testData[:7]:
            print(testText[:49])

        predict_testdata()

        print()

# The former trailing sys.exit() and the statements after it were removed:
# sys.exit() raises SystemExit inside the cell, and the code behind it could
# never run (it also used ds_validate_encoded, which does not exist when
# training is False).
def sigmoid(x, L , theta, gamma, b):
    """Four-parameter logistic curve.

    Computes `L / (1 + exp(-gamma * (x - theta))) + b`.

    Args:
        x: Scalar or NumPy array of inputs.
        L: Height of the curve (upper asymptote minus lower asymptote).
        theta: Location of the midpoint on the x axis.
        gamma: Steepness of the curve.
        b: Vertical offset (lower asymptote when gamma is positive).

    Returns:
        The curve evaluated at `x`, with the same shape as `x`.
    """
    return L / (1 + np.exp(-gamma * (x - theta))) + b


def blend_predictions(regression_preds, xlnet_preds):
    """Average the regression and XLNet predictions, matched by `id`.

    Args:
        regression_preds: DataFrame with the columns `id` and `target`.
        xlnet_preds: DataFrame with the columns `id` and `target`.

    Returns:
        A copy of `regression_preds`, in its original row order, with two
        added columns: `predtrsf` (the XLNet prediction for the same `id`)
        and `avgred` (the mean of `target` and `predtrsf`).

    Raises:
        pandas.errors.MergeError: if an `id` occurs more than once in either
            frame.
        ValueError: if some `id` has no XLNet prediction.
    """
    xlnet_by_id = xlnet_preds[['id', 'target']].rename(columns={'target': 'predtrsf'})

    # Join on id instead of assigning by row position, so a different row
    # order in the two files cannot pair predictions for different excerpts.
    blended = regression_preds.merge(xlnet_by_id, on='id', how='left', validate='one_to_one')

    missing = blended.loc[blended['predtrsf'].isna(), 'id'].tolist()
    if missing:
        raise ValueError("no XLNet prediction for ids: " + ", ".join(map(str, missing)))

    blended['avgred'] = (blended['target'] + blended['predtrsf']) / 2
    return blended


resultsRegression = pd.read_csv('regression.csv')
resultsXLNet = pd.read_csv('xlnet.csv')

predictionData = blend_predictions(resultsRegression, resultsXLNet)

print("predictions")

print(predictionData)


# submission
# The ids come from the blended frame itself, so this cell does not depend on
# an `xtest` variable left in the kernel by another cell.
xsub = predictionData[["id"]].copy()
xsub["target"] = predictionData['avgred']
xsub.to_csv('submission.csv', index = False)