File size: 99,996 Bytes

ee857e8

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
    "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
    "execution": {
     "iopub.execute_input": "2021-06-23T10:14:01.220679Z",
     "iopub.status.busy": "2021-06-23T10:14:01.219714Z",
     "iopub.status.idle": "2021-06-23T10:14:01.359474Z",
     "shell.execute_reply": "2021-06-23T10:14:01.357018Z",
     "shell.execute_reply.started": "2021-06-23T10:14:01.220489Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# This Python 3 environment comes with many helpful analytics libraries installed\n",
    "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n",
    "# For example, here's several helpful packages to load\n",
    "\n",
    "import os\n",
    "\n",
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "\n",
    "# Input data files are available in the read-only \"../input/\" directory\n",
    "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n",
    "\n",
    "# Walk the input tree and print the full path of every file found.\n",
    "for dirname, _, filenames in os.walk('/kaggle/input'):\n",
    "    for filename in filenames:\n",
    "        print(os.path.join(dirname, filename))\n",
    "\n",
    "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n",
    "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session\n",
    "\n",
    "# List the current working directory. The explicit %ls line magic is used instead of a bare\n",
    "# `ls` so the cell does not depend on IPython automagic being enabled.\n",
    "%ls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.status.busy": "2021-06-23T10:14:01.360753Z",
     "iopub.status.idle": "2021-06-23T10:14:01.361256Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "import psutil\n",
    "import tensorflow as tf\n",
    "\n",
    "# Report the TensorFlow version and the number of GPUs TensorFlow can see.\n",
    "print(\"TensorFlow version\", tf.__version__)\n",
    "gpu_devices = tf.config.list_physical_devices('GPU')\n",
    "print(\"Num GPUs Available: \", len(gpu_devices))\n",
    "\n",
    "# Last expression of the cell: the memory statistics object is shown as the cell output.\n",
    "psutil.virtual_memory()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.status.busy": "2021-06-23T10:14:01.362996Z",
     "iopub.status.idle": "2021-06-23T10:14:01.363698Z"
    },
    "trusted": true
   },
   "outputs": [],
   "source": [
    "# spaCy is not imported by any earlier cell, so import it here;\n",
    "# otherwise this cell raises NameError on a fresh kernel (Restart & Run All).\n",
    "import spacy\n",
    "\n",
    "# Print whether \"en_core_web_lg\" is recognised by spaCy as an installed package.\n",
    "print(spacy.util.is_package(\"en_core_web_lg\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# **Readability Scores + spaCy**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-06-25T19:37:17.991312Z",
     "iopub.status.busy": "2021-06-25T19:37:17.990794Z",
     "iopub.status.idle": "2021-06-25T19:41:20.773700Z",
     "shell.execute_reply": "2021-06-25T19:41:20.772698Z",
     "shell.execute_reply.started": "2021-06-25T19:37:17.991258Z"
    },
    "trusted": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "working directory /kaggle/working\n",
      "\n",
      "spaCy features\n",
      "xtrain           id url_legal license  \\\n",
      "0  c12129c31       NaN     NaN   \n",
      "1  85aa80a4c       NaN     NaN   \n",
      "2  b69ac6792       NaN     NaN   \n",
      "3  dd1000b26       NaN     NaN   \n",
      "4  37c1b32fb       NaN     NaN   \n",
      "5  f9bf357fe       NaN     NaN   \n",
      "6  eaf8e7355       NaN     NaN   \n",
      "\n",
      "                                             excerpt    target  standard_error  \n",
      "0  When the young people returned to the ballroom... -0.340259        0.464009  \n",
      "1  All through dinner time, Mrs. Fayre was somewh... -0.315372        0.480805  \n",
      "2  As Roger had predicted, the snow departed as q... -0.580118        0.476676  \n",
      "3  And outside before the palace a great garden w... -1.054013        0.450007  \n",
      "4  Once upon a time there were Three Bears who li...  0.247197        0.510845  \n",
      "5  Hal and Chester found ample time to take an in... -0.861809        0.480936  \n",
      "6  Hal Paine and Chester Crawford were typical Am... -1.759061        0.476507  \n",
      "\n",
      "xtrain target 0   -0.340259\n",
      "1   -0.315372\n",
      "2   -0.580118\n",
      "3   -1.054013\n",
      "4    0.247197\n",
      "5   -0.861809\n",
      "6   -1.759061\n",
      "Name: target, dtype: float64\n",
      "\n",
      "calculating Flesch score\n",
      "counting unique words\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 100 When the two friends had thus become reconciled, they examined the cub, and s target -0.811519925\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 200 Maggie soon thought she had been hours in the attic, and it must be tea time, target 0.277737321\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 300 In Computer science, cloud computing describes a type of outsourcing of compu target -1.413698838\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 400 A habitat is an ecological or environmental area that is inhabited by a parti target -0.616799879\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 500 Nanotechnology has provided the possibility of delivering drugs to specific c target -1.6251413\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 600 Animal tissues are grouped into four basic types: connective, muscle, nervous target -2.601398503\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 700 A local area network (LAN) is a computer network in a small area like a home, target -1.10578373\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 800 You guessed it–science! Fluorescent colors are very special. You might have s target -0.101779002\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 900 Our immune systems work to protect us from illness by recognizing foreign mol target -0.986988435\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 1000 Fog is an accumulation of tiny water droplets or ice crystals suspended above target -0.691647347\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 1100 Chameli's mother had a lot of beautiful jewelry. \n",
      "One day, Chameli's mother s target -0.047198886\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 1200 December was an anxious month. Several German divisions were east of the Piav target -2.279783534\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 1300 The Government of the United States and the Imperial German Government are co target -1.532702197\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 1400 It was not, however, until the morning that we entered the harbor of Havre th target -0.841382898\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 1500 After a time the polished rocky sides of the shaft grew to be of a solemn sam target -1.826667527\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 1600 One morning, Grandma had two loaves of \"riz bread,\" and some election cakes,  target -0.698302533\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 1700 This beautiful Agave is now in blossom in the garden here, and I am happy to  target -2.162795917\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 1800 Quickly he opens the paper to its full extent, and places it on the floor car target -0.584273721\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 1900 There is a great park here, known as the Maidan, where dogs run with bones to target -0.583532619\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 2000 Mabel lives on a hill, quite near a beautiful lake, and is very fond of going target 0.902661245\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 2100 He fallow-deer — This is the domestic or park deer; and no two animals can ma target -2.57511146\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 2200 The sun is a star, just like the other millions of stars you see when you loo target -0.580630824\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 2300 This may be a useful lesson to you, dear Isabel,\" she said. \"It will teach yo target -0.223570721\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 2400 By looking at any map of Europe, it will be seen that England is separated fr target -1.188880582\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 2500 It was a beautiful place to play. There were trees for hide-and-seek, flat sp target -0.025405297\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 2600 When Doris got home she opened her paint-box. What do you think? Of course it target 0.198997768\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 2700 Forty years ago women were given no representation in conventions where polit target -1.291127806\n",
      "\n",
      "\n",
      "--------------------------------------------------------------------------------\n",
      "text 2800 The American Civil War (1861–1865) was a civil war in the United States of Am target 0.223365705\n",
      "\n",
      "getting spaCy features\n",
      "namelist ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35', 'f36', 'f37', 'f38', 'f39', 'f40', 'f41', 'f42', 'f43', 'f44', 'f45', 'f46', 'f47', 'f48', 'f49', 'f50', 'f51', 'f52', 'f53', 'f54', 'f55', 'f56', 'f57', 'f58', 'f59', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65', 'f66', 'f67', 'f68', 'f69', 'f70', 'f71', 'f72', 'f73', 'f74', 'f75', 'f76', 'f77', 'f78', 'f79', 'f80', 'f81', 'f82', 'f83', 'f84', 'f85', 'f86', 'f87', 'f88', 'f89', 'f90', 'f91', 'f92', 'f93', 'f94', 'f95', 'f96', 'f97', 'f98', 'f99', 'f100', 'f101', 'f102', 'f103', 'f104', 'f105', 'f106', 'f107', 'f108', 'f109', 'f110', 'f111', 'f112', 'f113', 'f114', 'f115', 'f116', 'f117', 'f118', 'f119', 'f120', 'f121', 'f122', 'f123', 'f124', 'f125', 'f126', 'f127', 'f128', 'f129', 'f130', 'f131', 'f132', 'f133', 'f134', 'f135', 'f136', 'f137', 'f138', 'f139', 'f140', 'f141', 'f142', 'f143', 'f144', 'f145', 'f146', 'f147', 'f148', 'f149', 'f150', 'f151', 'f152', 'f153', 'f154', 'f155', 'f156', 'f157', 'f158', 'f159', 'f160', 'f161', 'f162', 'f163', 'f164', 'f165', 'f166', 'f167', 'f168', 'f169', 'f170', 'f171', 'f172', 'f173', 'f174', 'f175', 'f176', 'f177', 'f178', 'f179', 'f180', 'f181', 'f182', 'f183', 'f184', 'f185', 'f186', 'f187', 'f188', 'f189', 'f190', 'f191', 'f192', 'f193', 'f194', 'f195', 'f196', 'f197', 'f198', 'f199', 'f200', 'f201', 'f202', 'f203', 'f204', 'f205', 'f206', 'f207', 'f208', 'f209', 'f210', 'f211', 'f212', 'f213', 'f214', 'f215', 'f216', 'f217', 'f218', 'f219', 'f220', 'f221', 'f222', 'f223', 'f224', 'f225', 'f226', 'f227', 'f228', 'f229', 'f230', 'f231', 'f232', 'f233', 'f234', 'f235', 'f236', 'f237', 'f238', 'f239', 'f240', 'f241', 'f242', 'f243', 'f244', 'f245', 'f246', 'f247', 'f248', 'f249', 'f250', 'f251', 'f252', 'f253', 'f254', 'f255', 'f256', 'f257', 'f258', 'f259', 'f260', 
'f261', 'f262', 'f263', 'f264', 'f265', 'f266', 'f267', 'f268', 'f269', 'f270', 'f271', 'f272', 'f273', 'f274', 'f275', 'f276', 'f277', 'f278', 'f279', 'f280', 'f281', 'f282', 'f283', 'f284', 'f285', 'f286', 'f287', 'f288', 'f289', 'f290', 'f291', 'f292', 'f293', 'f294', 'f295', 'f296', 'f297', 'f298', 'f299']\n",
      "train_vectors          f0        f1        f2        f3        f4        f5        f6  \\\n",
      "0  0.054942  0.104693 -0.129839 -0.077362  0.066479 -0.007780 -0.010820   \n",
      "1 -0.014731  0.213879 -0.183907 -0.048958  0.112992  0.026316  0.028143   \n",
      "2 -0.006671  0.217069 -0.125178 -0.073087  0.106584  0.016715  0.011751   \n",
      "3  0.040802  0.123908 -0.106365 -0.109637  0.090377  0.020302 -0.004274   \n",
      "4 -0.018610  0.109863 -0.161603 -0.035534  0.143024 -0.053699 -0.026999   \n",
      "\n",
      "         f7        f8        f9  ...      f290      f291      f292      f293  \\\n",
      "0 -0.054412 -0.069162  2.085747  ... -0.238603  0.031822  0.059434 -0.093304   \n",
      "1 -0.137892 -0.094396  2.170374  ... -0.105756  0.051830  0.006578 -0.007093   \n",
      "2 -0.115226 -0.071845  2.141033  ... -0.116192  0.020743  0.012045 -0.028818   \n",
      "3 -0.007752 -0.103293  2.044155  ... -0.201305  0.014579 -0.002538 -0.046069   \n",
      "4 -0.152680 -0.012829  2.205194  ... -0.284280  0.039023  0.082985 -0.052379   \n",
      "\n",
      "       f294      f295      f296      f297      f298      f299  \n",
      "0 -0.134071  0.053603  0.038264 -0.028437 -0.022459  0.068514  \n",
      "1  0.065427 -0.029093 -0.000948  0.012834  0.015731  0.002224  \n",
      "2  0.044761 -0.033804  0.015294 -0.019791  0.020420  0.016437  \n",
      "3 -0.155320  0.064200  0.079673 -0.035276  0.001438  0.066118  \n",
      "4 -0.135277  0.122106  0.057177 -0.104051 -0.100320  0.024026  \n",
      "\n",
      "[5 rows x 300 columns]\n",
      "xtrain           id url_legal license  \\\n",
      "0  c12129c31       NaN     NaN   \n",
      "1  85aa80a4c       NaN     NaN   \n",
      "2  b69ac6792       NaN     NaN   \n",
      "3  dd1000b26       NaN     NaN   \n",
      "4  37c1b32fb       NaN     NaN   \n",
      "\n",
      "                                             excerpt    target  \\\n",
      "0  When the young people returned to the ballroom... -0.340259   \n",
      "1  All through dinner time, Mrs. Fayre was somewh... -0.315372   \n",
      "2  As Roger had predicted, the snow departed as q... -0.580118   \n",
      "3  And outside before the palace a great garden w... -1.054013   \n",
      "4  Once upon a time there were Three Bears who li...  0.247197   \n",
      "\n",
      "   standard_error  nof_char  nof_words       w2c  nof_sentences  ...  \\\n",
      "0        0.464009       992        174  0.175403             11  ...   \n",
      "1        0.480805       937        164  0.175027             10  ...   \n",
      "2        0.476676       908        162  0.178414             11  ...   \n",
      "3        0.450007       909        163  0.179318              5  ...   \n",
      "4        0.510845       723        147  0.203320              5  ...   \n",
      "\n",
      "       f290      f291      f292      f293      f294      f295      f296  \\\n",
      "0 -0.238603  0.031822  0.059434 -0.093304 -0.134071  0.053603  0.038264   \n",
      "1 -0.105756  0.051830  0.006578 -0.007093  0.065427 -0.029093 -0.000948   \n",
      "2 -0.116192  0.020743  0.012045 -0.028818  0.044761 -0.033804  0.015294   \n",
      "3 -0.201305  0.014579 -0.002538 -0.046069 -0.155320  0.064200  0.079673   \n",
      "4 -0.284280  0.039023  0.082985 -0.052379 -0.135277  0.122106  0.057177   \n",
      "\n",
      "       f297      f298      f299  \n",
      "0 -0.028437 -0.022459  0.068514  \n",
      "1  0.012834  0.015731  0.002224  \n",
      "2 -0.019791  0.020420  0.016437  \n",
      "3 -0.035276  0.001438  0.066118  \n",
      "4 -0.104051 -0.100320  0.024026  \n",
      "\n",
      "[5 rows x 318 columns]\n",
      "\n",
      "2267 566 2267 566\n",
      "X       nof_words  nof_sentences  nof_syllables  flesch_score  txt_diversity  \\\n",
      "981         165             12            235     72.387841       0.709091   \n",
      "683         182              8            321     34.531662       0.631868   \n",
      "195         172              9            249     64.963966       0.703488   \n",
      "1158        151              5            264     28.272066       0.741722   \n",
      "1553        173              9            281     49.910572       0.722543   \n",
      "2273        169              9            273     51.114017       0.662722   \n",
      "2056        188             14            241     84.755000       0.632979   \n",
      "\n",
      "      nof_unique_words  nof_char       w2c  flesch_score2  punctsPerSentence  \\\n",
      "981                117       896  0.184152       6.578561           1.812500   \n",
      "683                115      1106  0.164557      14.094588           1.700000   \n",
      "195                121       988  0.174089       8.945891           4.000000   \n",
      "1158               112       927  0.162891      16.818464           4.833333   \n",
      "1553               125      1018  0.169941      11.073141           3.000000   \n",
      "2273               112       989  0.170880      10.794872           4.166667   \n",
      "2056               119       938  0.200426       4.773739           1.588235   \n",
      "\n",
      "      ...      f290      f291      f292      f293      f294      f295  \\\n",
      "981   ... -0.135388  0.006350  0.059477 -0.048736  0.037906 -0.004224   \n",
      "683   ... -0.209450  0.023985 -0.054420 -0.117216 -0.131639  0.101565   \n",
      "195   ... -0.155826  0.020981  0.004709  0.017356  0.054904 -0.006336   \n",
      "1158  ... -0.246229  0.019796 -0.079219 -0.012937 -0.049782  0.086845   \n",
      "1553  ... -0.115806  0.026836  0.069627 -0.061271 -0.058355  0.045644   \n",
      "2273  ... -0.201175  0.044129  0.031399 -0.086733 -0.077939  0.026245   \n",
      "2056  ... -0.159391  0.044991  0.042129 -0.041979 -0.002026  0.052522   \n",
      "\n",
      "          f296      f297      f298      f299  \n",
      "981  -0.000829  0.038497  0.012296  0.009338  \n",
      "683  -0.080587 -0.070543  0.008028  0.024660  \n",
      "195  -0.047379 -0.007128 -0.015000  0.042302  \n",
      "1158 -0.022204 -0.057120 -0.018331  0.023094  \n",
      "1553  0.062768 -0.013423 -0.016792  0.006240  \n",
      "2273 -0.007248 -0.011341 -0.099948  0.014829  \n",
      "2056  0.005720  0.018664  0.004293  0.039324  \n",
      "\n",
      "[7 rows x 310 columns]\n",
      "\n",
      "y 981     1.597870\n",
      "683    -0.743435\n",
      "195    -0.280994\n",
      "1158   -1.464792\n",
      "1553   -1.884352\n",
      "2273   -0.578085\n",
      "2056    0.666116\n",
      "Name: target, dtype: float64\n",
      "\n",
      "Xtest     nof_words  nof_sentences  nof_syllables  flesch_score  txt_diversity  \\\n",
      "4         147              5            183     71.675633       0.346939   \n",
      "9         191              8            274     61.238524       0.675393   \n",
      "14        181             11            259     69.076178       0.696133   \n",
      "19        176              8            256     61.450455       0.693182   \n",
      "24        169              6            273     41.584295       0.733728   \n",
      "29        167             10            256     60.198272       0.694611   \n",
      "34        146              5            230     43.923027       0.630137   \n",
      "\n",
      "    nof_unique_words  nof_char       w2c  flesch_score2  punctsPerSentence  \\\n",
      "4                 51       723  0.203320      10.565796           6.400000   \n",
      "9                129      1026  0.186160      10.648999           2.625000   \n",
      "14               126       967  0.187177       7.712356           2.230769   \n",
      "19               122       957  0.183908      10.153636           3.125000   \n",
      "24               124       993  0.170191      14.456538           4.000000   \n",
      "29               116       937  0.178228       9.011623           2.083333   \n",
      "34                92       802  0.182045      14.387041           1.500000   \n",
      "\n",
      "    ...      f290      f291      f292      f293      f294      f295      f296  \\\n",
      "4   ... -0.284280  0.039023  0.082985 -0.052379 -0.135277  0.122106  0.057177   \n",
      "9   ... -0.187329  0.043337  0.025266 -0.083779 -0.068758  0.022553  0.040310   \n",
      "14  ... -0.173686  0.034872  0.118947 -0.056733 -0.031695  0.045877  0.025237   \n",
      "19  ... -0.191329  0.022130  0.035867  0.000256  0.020506  0.012822  0.054820   \n",
      "24  ... -0.214793  0.006693  0.043532 -0.063686 -0.038540  0.034346  0.009910   \n",
      "29  ... -0.135137  0.031284  0.025826 -0.051649 -0.006029 -0.006497  0.011574   \n",
      "34  ... -0.135040  0.043721  0.088910 -0.033691  0.038809  0.023250 -0.022141   \n",
      "\n",
      "        f297      f298      f299  \n",
      "4  -0.104051 -0.100320  0.024026  \n",
      "9   0.023220 -0.014123  0.073939  \n",
      "14  0.017827 -0.013019  0.070423  \n",
      "19  0.012178  0.026124  0.014872  \n",
      "24 -0.043891 -0.028318  0.008500  \n",
      "29  0.073487  0.016896  0.012550  \n",
      "34 -0.017283  0.009402  0.041435  \n",
      "\n",
      "[7 rows x 310 columns]\n",
      "\n",
      "ytest 4     0.247197\n",
      "9    -1.238432\n",
      "14    0.245806\n",
      "19   -1.009999\n",
      "24   -1.483887\n",
      "29   -1.413744\n",
      "34    0.022598\n",
      "Name: target, dtype: float64\n",
      "\n",
      "testTexts 4     Once upon a time there were Three Bears who li...\n",
      "9     One day he had gone beyond any point which he ...\n",
      "14    Aunt Abigail was gone, Eleanor was gone. The r...\n",
      "19    Father had been away in the country for three ...\n",
      "24    One beautiful misummer night in 18— a large, h...\n",
      "29    Before Fred could complete the sentence his fo...\n",
      "34    When Josie arrived at her destination she went...\n",
      "Name: excerpt, dtype: object\n",
      "\n",
      "Coefficients: \n",
      " [ 1.48112616e-02 -1.55412549e-02 -7.73959046e-03 -2.88993529e-02\n",
      "  1.67597648e+00 -1.13562873e-02 -6.76790330e-04  2.31096998e-01\n",
      " -9.55646587e-02 -4.19500101e-02  1.36526000e-01 -4.64126500e-01\n",
      " -2.03473618e+00  9.43253633e-01 -2.02192115e-01 -1.34499713e+00\n",
      "  3.64497640e-01  6.57803691e-01  4.88818959e-01  2.15804127e+00\n",
      "  3.14023294e-01  1.67664341e+00  9.81973042e-01 -6.91690628e-01\n",
      " -1.01460977e+00 -2.02116650e+00  1.71898026e+00  1.74817617e-01\n",
      " -1.22450073e+00 -1.30771453e-01  2.11451440e+00  6.13313101e-01\n",
      "  6.78876769e-01 -2.25069093e-01  7.07877084e-01  2.04801343e-01\n",
      "  1.96560164e+00 -1.46212410e+00  1.31736764e+00 -1.05987346e+00\n",
      "  7.89324172e-01 -1.33164448e+00  9.95217519e-01 -2.49868056e+00\n",
      "  1.29056335e-01  1.49038129e+00 -5.42990407e-01 -8.39090134e-01\n",
      " -1.41186945e+00  7.54193366e-02  2.57674731e-01  1.30808569e-01\n",
      " -4.92950811e-01 -1.58826836e+00  5.96774858e-01  1.06580809e-01\n",
      " -1.99887650e+00  3.42666469e-01  6.91031626e-01 -1.36461958e+00\n",
      " -7.99921013e-01  1.06241437e+00  6.02372886e-01  1.34139155e-01\n",
      "  5.87312458e-01 -8.89612395e-01  2.31979457e-01 -2.72189030e-01\n",
      "  1.27046369e+00 -7.87424473e-01 -5.96105483e-01  1.24769951e+00\n",
      "  1.40308240e+00  1.27394400e+00  4.91412901e-01 -7.72385024e-01\n",
      " -5.05964685e-01  1.14412032e-01  8.54139656e-01  1.73251441e-01\n",
      "  5.39661393e-01 -1.52359023e+00  8.30725440e-01 -2.26389149e+00\n",
      " -4.30577198e-01 -2.44745684e-01  9.70829904e-01  9.66891873e-01\n",
      "  5.73260492e-01  2.39438422e+00  6.31846069e-01 -9.79336578e-01\n",
      " -7.77869900e-01 -8.86337073e-01  8.90635384e-01  1.50316880e+00\n",
      "  2.77835843e-01  3.36868622e+00 -6.44468462e-01  8.65056836e-01\n",
      " -7.77913999e-01  7.31843409e-01 -1.85950207e-01 -1.59319060e+00\n",
      "  1.45043870e+00 -8.12437286e-02 -2.70250697e+00  1.04160621e-02\n",
      "  6.05750580e-02 -6.64125045e-01  9.65134691e-01  1.08247921e+00\n",
      "  1.55597965e+00  2.14804900e+00 -1.78798007e+00  5.47404008e-01\n",
      "  5.62647384e-01 -7.78201603e-01  4.11461894e-02 -1.02381785e+00\n",
      "  4.75500088e-02  1.78260419e+00  4.47533117e-02  1.22368838e+00\n",
      "  7.79222434e-01  9.18839415e-01 -4.49509730e-01 -8.75912387e-01\n",
      " -6.25593418e-01 -9.89823633e-01 -2.26460072e-01  4.07520063e-01\n",
      "  1.01842585e+00 -1.75370348e+00 -1.88901046e+00  1.02996122e+00\n",
      "  6.23973502e-01  1.05602333e+00 -1.00035180e+00  8.30154535e-01\n",
      " -7.36426230e-01 -1.23791657e-01 -2.67737114e-01  4.15182059e-01\n",
      "  1.83684335e+00  2.83771232e-02 -1.23688909e+00  1.10544755e+00\n",
      " -1.28570229e+00  5.76877283e-02 -1.67473796e+00  5.93510889e-01\n",
      " -5.22998133e-01  8.88995961e-01 -6.45399566e-01  7.12948999e-01\n",
      "  1.14934601e+00 -8.13445538e-01 -5.62938695e-01 -3.20461496e-01\n",
      "  1.40390815e+00 -8.46474289e-01 -1.10234188e+00  1.14106878e+00\n",
      "  3.63046803e-01 -3.11109625e-01  4.11949925e-01  3.34908944e-02\n",
      " -8.05188696e-01  1.68705750e-01  9.73377722e-02  1.02001860e+00\n",
      " -3.23013411e-01  6.57740349e-01  8.48421459e-02 -8.48525076e-01\n",
      "  1.13746190e+00  1.34074006e+00  1.01844132e+00  9.41773743e-01\n",
      "  2.95959229e-01  3.40927881e-01  2.09716200e-01  1.03042481e+00\n",
      " -4.80037659e-01 -1.53610920e+00  6.90516961e-01  2.50788383e+00\n",
      " -1.13485462e+00  5.40351535e-01  1.53150011e+00  2.50065741e-01\n",
      " -1.66960225e+00 -1.46658813e+00  1.25153767e+00  1.38410771e-01\n",
      "  2.24230354e+00  1.77595626e+00  9.52513188e-01  3.93867629e-02\n",
      "  5.24358446e-01 -2.91415353e+00  9.89378191e-01 -9.35222678e-01\n",
      "  2.09256753e+00  5.85871842e-01  5.67877375e-01 -2.64329976e-01\n",
      "  5.24332810e-01  4.07220229e-01 -9.98309823e-01 -1.77984935e+00\n",
      " -3.05702491e-01 -5.94020140e-01  1.64474431e+00 -9.60539779e-01\n",
      "  5.73586279e-01 -8.04563719e-01  2.85977044e-01 -7.12660606e-01\n",
      "  1.16091398e+00  5.21863006e-01  6.36117494e-01 -1.54157975e+00\n",
      " -2.48319258e-02 -7.80990052e-01  1.93422206e+00 -1.81617141e-01\n",
      "  8.74368246e-01 -9.95851975e-01  1.20673851e+00  6.25174832e-01\n",
      " -4.14509525e-01  3.47246439e-01  1.24975180e+00 -1.24743672e-01\n",
      "  2.83320953e+00 -2.47267863e-01 -4.76491480e-01 -1.15920506e-01\n",
      " -6.13646729e-01 -2.02507388e+00 -1.17955649e+00 -4.96551385e-01\n",
      "  7.02227766e-01 -1.35640131e+00  1.49416852e-01  1.19430963e+00\n",
      " -9.06887201e-02 -6.62766712e-01 -2.14906303e+00 -3.19839067e-01\n",
      " -1.20248193e+00 -1.31185676e+00 -2.09510746e-01 -2.99065619e-01\n",
      " -3.96931126e-01 -2.74447079e-01 -7.14053736e-01 -2.31750076e+00\n",
      " -1.63908613e+00 -1.18412058e+00 -1.59452523e+00  2.56873125e-01\n",
      " -1.75285642e+00 -1.52126210e-03  4.38063031e-01  5.63524168e-01\n",
      " -1.86617995e+00  5.31489429e-02 -6.01337084e-01  1.37594875e-01\n",
      "  2.94007258e-01  1.44702658e+00 -2.10128714e+00 -1.73651395e+00\n",
      " -6.93153045e-01  1.19929792e+00 -1.30700577e+00  9.80201534e-01\n",
      "  1.27176203e+00 -3.31401047e-01 -8.54530472e-01 -8.68841038e-01\n",
      "  2.37037887e-01 -6.57746733e-01  1.09954468e-01 -1.09134992e-01\n",
      " -9.02331545e-01  1.42930236e+00 -1.97150272e+00 -2.42991619e+00\n",
      "  4.67394492e-01  6.05340704e-01  2.91821556e+00  7.67391194e-01\n",
      " -1.13204002e+00 -9.21798068e-01 -5.39630783e-01 -4.09563689e-01\n",
      " -1.44841898e+00  1.68235528e+00 -1.34297779e+00  9.36105670e-01\n",
      " -1.86417473e-02  2.78968737e+00 -4.39593949e-01  1.35258287e+00\n",
      " -7.39050533e-01  1.85308077e+00]\n",
      "Mean squared error: 0.3317574750\n",
      "Root Mean squared error: 0.5759839191\n",
      "Coefficient of determination: 0.69\n",
      "sklearn RMSE 0.5759839190527772\n",
      "ytest is a <class 'pandas.core.series.Series'>     y_pred is a <class 'numpy.ndarray'>\n",
      "testTexts is a <class 'pandas.core.series.Series'>     testResults is a <class 'pandas.core.frame.DataFrame'>\n",
      "                                              excerpt  predregr    target\n",
      "4   Once upon a time there were Three Bears who li...  0.303138  0.247197\n",
      "9   One day he had gone beyond any point which he ... -1.311287 -1.238432\n",
      "14  Aunt Abigail was gone, Eleanor was gone. The r...  0.211536  0.245806\n",
      "19  Father had been away in the country for three ... -1.139146 -1.009999\n",
      "24  One beautiful misummer night in 18— a large, h... -1.115843 -1.483887\n",
      "29  Before Fred could complete the sentence his fo... -1.260631 -1.413744\n",
      "34  When Josie arrived at her destination she went... -0.316249  0.022598\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEGCAYAAABsLkJ6AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAA7DElEQVR4nO2de5wU5Znvf880DfQgMkMyHmV0BLMGAgEGIYqHVQMaceOCiLc1usaNl3h2s15iJsHoEUj0yGeJlz0mGyWay6prMGomGoxXMBpdjOIMIipqFJDGo0QYVKaBnp7n/NFdTXV1vVVv3bp6up/v5zMf6FvVUzXTz/O+z5WYGYIgCEL90RC3AIIgCEI8iAEQBEGoU8QACIIg1CliAARBEOoUMQCCIAh1yqC4BfDCZz/7WR49enTcYgiCIAwo1qxZ81dmbrE+P6AMwOjRo/HSSy/FLYYgCMKAgog22T0vLiBBEIQ6RQyAIAhCnRKbASCiQ4hoFRG9RkTrieiyuGQRBEGoR+KMAfQBuJKZXyai4QDWENETzPxajDIJghAB2WwWW7Zswe7du+MWpaYZOnQoDj74YCSTSa33x2YAmPl9AO8X/v8JEb0OoBWAGABBqDG2bNmC4cOHY/To0SCiuMWpSZgZH330EbZs2YIxY8ZofaYqsoCIaDSAKQBeiFkUQRAC0NmVxtLHNmBrTwajmlLomD0W86a0Yvfu3aL8I4aI8JnPfAbbtm3T/kzsBoCI9gPwAIDLmfljm9cvBnAxALS1tVVYOkEQdOnsSuOqB9chk80BANI9GVz14DoAwNihEOVfAbze41izgIgoibzyv4eZH7R7DzMvY+ZpzDytpaWsjkEQhCph6WMbisrfIJPNYeljG2KSSHAjziwgAnAngNeZ+aa45BAEIRy29mQ8PV9pEokE2tvb8cUvfhFnnHEGent7fR/r/PPPx/333w8AuPDCC/Haa+rQ5dNPP43nn3+++Pi2227Df/7nf/o+d5jEuQOYAeAfAcwiou7Cz1djlEcQhACMakp5er7SpFIpdHd349VXX8XgwYNx2223lbze19fn67h33HEHxo8fr3zdagAuueQSnHfeeb7OFTaxGQBm/hMzEzNPYub2ws8jcckjCEIwOmaPRSqZKHkulUygY/ZYz8fq7EpjxpKVGLNgBWYsWYnOrnRYYgIAjjnmGLz99tt4+umnccwxx2Du3LkYP348crkcOjo68KUvfQmTJk3C7bffDiCfYfOtb30LY8eOxQknnIAPP/yweKwvf/nLxRY1jz76KI444ghMnjwZxx9/PDZu3IjbbrsNN998M9rb2/Hss89i0aJF+NGPfgQA6O7uxvTp0zFp0iSceuqp2LFjR/GY3/ve93DkkUfi85//PJ599lkAwPr163HkkUeivb0dkyZNwltvvRXoPsQeBBYEoTaYN6UVAGyzgF5/vSy/Q4lTMNk4RxD6+vrwhz/8ASeddBIA4OWXX8arr76KMWPGYNmyZRgxYgRefPFF7NmzBzNmzMCJJ56Irq4ubNiwAa+99ho++OADjB8/Ht/4xjdKjrtt2zZcdNFFeOaZZzBmzBhs374dI0eOxCWXXIL99tsP3/nOdwAATz31VPEz5513Hm699VYcd9xxuPbaa7F48WLccsstRTn//Oc/45FHHsHixYvx5JNP4rbbbsNll12Gc845B3v37kUuVxpz8YoYAEEQQmPelNbAStopmBzk2JlMBu3t7QDyO4ALLrgAzz//PI488shi3vzjjz+OV155pejf37lzJ9566y0888wzOPvss5FIJDBq1CjMmjWr7PirV6/GscceWzzWyJEjHeXZuXMnenp6cNxxxwEAvv71r+OMM84ovj5//nwAwNSpU7Fx40YAwNFHH43rr78eW7Zswfz583H44Yf7vh+AGABBEKqMqILJRgzAyrBhw4r/Z2bceuutmD17dsl7Hnmk8t7pIUOGAMgHr434xNe+9jUc
ddRRWLFiBb761a/i9ttvtzVGukgzOEEQqoo4g8mzZ8/GT3/6U2SzWQDAm2++iV27duHYY4/F8uXLkcvl8P7772PVqlVln50+fTqeeeYZvPvuuwCA7du3AwCGDx+OTz75pOz9I0aMQHNzc9G/f9dddxV3AyreeecdHHbYYbj00ktxyimn4JVXXgl0vbIDEAShquiYPbYkBgD4DyZ75cILL8TGjRtxxBFHgJnR0tKCzs5OnHrqqVi5ciXGjx+PtrY2HH300WWfbWlpwbJlyzB//nz09/fjgAMOwBNPPIE5c+bg9NNPx+9+9zvceuutJZ/51a9+hUsuuQS9vb047LDD8Itf/MJRvvvuuw933XUXkskkDjzwQHz/+98PdL3EzIEOUEmmTZvGMhBGEAYer7/+Or7whS9ov1/VUkJwx+5eE9EaZp5mfa/sAARBqDrCCCYL7kgMQBAEoU4RAyAIQkUYSO7mgYrXeywGQBCEyBk6dCg++ugjMQIRYswDGDp0qPZnJAYgCELkHHzwwdiyZYunXvWCd4yJYLqIARAEIXKSyaT2lCqhcogLSBAEoU4RAyAIglCniAEQBEGoU8QACIIg1CliAARBEOoUMQCCIAh1ihgAQRCEOkUMgCAIQp0iBkAQBKFOEQMgCIJQp4gBEARBqFOkF5AgCEIVE+V0NDEAgiAIVUpnV7pkPnK6J4OrHlwHAKEYATEAgiAIPol6dvHSxzYUlb9BJpvD0sc2iAEQBEGIi6hX5wCwtSfj6XmvxBoEJqKfE9GHRPRqnHIIgmBPZ1caM5asxJgFKzBjyUp0dqXjFqlqcFqdh8WoppSn573iagCIaAYRPUFEbxLRO0T0LhG9E8rZgV8COCmkYwmCECLGCjfdkwFj3wo3iBGoJYMS9eocADpmj0UqmSh5LpVMoGP22FCOr+MCuhPAFQDWAMi5vNcTzPwMEY0O85iCIIRDmP7nzq40Fj20Hj2ZbPG5KFwmlWRUUwppG2Uf1uoc2Hdf4swC2snMfwjlbD4goosBXAwAbW1tcYkhCFVLVIHIsFa4Vl+5mTADmpWmY/bYsusi5A3bjCUrQ/s9zJvSGtn90YkBrCKipUR0NBEdYfxEIo0NzLyMmacx87SWlpZKnVYQBgRRuGkMwvI/2+0kzITpMqkk86a04ob5E9FauB8EgAuvhfl7iBIdA3AUgGkA/g+AGws/P4pSKEEQ9IgyEBmW/9lNwYfpMqk086a04rkFs9DalCoqf4OwA8JR4OoCYuaZlRBEEATvRB2IHJpsKBqYplQSi+ZO8OyOUPnKgXACmlHn4utQiYBwFOhkAY0gopuI6KXCz41ENCKMkxPRvQD+G8BYItpCRBeEcVxBqBeiShM0XEs7evcFbff09fs6lt1OAgCaG5O4Yf7EQMo6SheYF6JO14wKHRfQzwF8AuDMws/HAH4RxsmZ+WxmPoiZk8x8MDPfGcZxBaFeiCpNMEzXktlXTgBam1K45ax2dF17YlH5+00PrUQuvg5Rp2tGhU4W0OeY+TTT48VE1B2RPIIg2KByc0SVJhi2S8MpkyVIRW21uF6iTteMCh0DkCGiv2XmPwH5wjAA1e3YEoQawk1BRpEmWIkcd4Mg9QYqORuI0NmV9n1f/MQVokzXjAodF9D/AvATItpIRJsA/BjAJdGKJQiCQRxujkq6NLyu4s3uot69fUg2UNl7csy+YwHVEleoBK4GgJm7mXkygEkAJjLzFGZeG71ogiAA8bg57Pz2QQO2KrwEUK3KeUdvFqB8Dr4Vv0ayWuIKlUDpAiKic5n5biL6tuV5AAAz3xSxbIIgoLLuGDNeXRpmt0lTYxLMwM5M1tWFoqqonTmuvPDTTjlnc9YM/H34MZLVEleoBE47gGGFf4fb/OwXsVyCIBSohDsmaJM2u5V5TybrwYXCZY8eWJMu+4xXJezHSA7UlE4/KA0AM99e+O+TzLzY/APgqcqIJwhC1O6YMHzebu0eVC6UfQHu8hoDu8+olHBz
YzI0IzlQUzr9oJMFdCsAa+8fu+cEQYiIKDNMwuj6qbMyt3uP1z5Bdu6iVDKBhXMmFI8XNA1zoKZ0+sEpBnA0gP8JoMUSB9gfQHlZnyAIA5IwfN5O7R7M7/FzDnM6p5tyDktJD8SUTj847QAGI+/rH4S839/gYwCnRymUIAjlRNXzJowgc8fssej4zVpk++0DsioXipvhYKCsKEylnKuhJ9BAwykG8MeCv3+6JQZwEzO/VUEZBaHuiTI3PQyf97wprdhvqP16MkGkjFmo+gSZ0UnBrKfc/TDRKQS7g4iajAdE1ExEj0UnkiAIVqLMTQ8ryNxjahxnJseMpY9tsFXGxrndcHMV1VPufpjoBIE/y8w9xgNm3kFEB0QnkiAIVqLOTQ/D5+3kznHq7zNvSite2rQdd6/e7HhsJ+opdz9MdAxAPxG1MfNmACCiQ2FN2hUEIVLiKgYzsPOvA6XB2JnjWvDAmrQyq8cps+i6eRPx4Jot6LVJByXA1R0V9/3xQzXELIjZWZcT0UkAlgH4I/K/i2MAXMzMFXcDTZs2jV966aVKn1YQYsdurm4qmYisPYPbuZMJAhglQd9UMoHTprZi1RvblDsBAvDukpOVBsWuIvic6W24bp6zm8jp/gDVl9JZ6d8nEa1h5mnW53V6AT2KfM7/cgC/BjA1DuUvCPVMJXvzWFG1X7Bm/GSyOax6Y1txRKIdo5pSyoAtANwwfyKaG5PF949IJTHt0JGuMqruD4CqDA5XS8zCqQ5gHDO/YRoAv7Xwb1vBJfRy9OIJgmAQV266Fz/61p4MOrvS2LWnr+w1I7PISfl1zB6L3SY3UE8mqz0bwHp/OrvSuPK+tchxuaHyUuSmIogLp1piFk4xgCsBXIT8EHgrDGBWJBIJglBxnJSZTpGXQePgRJlrA8i3alg4Jz9P+Irl3baf3dqTCVyVbFxHuicDgjpYGVTRBhliA1RPzEJpAJj5osK/MhReEGoEHd+7VZnZtV9QsWuv/XsaBw9yNSijmlKBVsZWpewU3TRcUYaxSBAhx4xWzZV8UEOlamlR6X5DTi6g+U4fZOYHwxdHEAQrYWWLqFatQwY1OCozc/sF3Z2AFfPnnJSf6hw6K2O3vkLmc80c11Iig+EmSvdkcPnybix+eH1xx2JHUBdOtfQbcnIBzSn8ewDyPYFWFh7PBPA8ADEAghAxQV0NZlSrVpXSNCszwxCMWbDCVw44AZixZGVR2RnZQnbKz+/KWEf5GlXJbsZiR69z7CEMF0419BvSSQN9HMDXmfn9wuODAPySmWdXQL4SJA1UqDdmLFlpq2ham1J4boG3MJxX5U0FJ7pZQavk8YpTyqOda6YplQRRvtpYVYfQu7cvPyFM45y690J1n+NMy/WD7zRQAIcYyr/ABwDaQpNMEAQlqlVtuifjeXBLkym90sywwYl8Xr8FZpSlTqp69zQ3JtGUsj++HU4pj0bcIZVMFF0zPZksdvTuGzDT8Zu16Lh/bUl656e7+2yvA8iv/E+b2loSh9BBdf/jTMsNE51K4KcKvX/uLTw+C8CT0YkkCIKB3/YKdqg2+73ZnPI1A0NhG6thO991Z1calysyfOxwctm4uWjsuo5m+/M7hWFDBpVlAeWY8cCaNKYdOtJTYNswFKo4zEBT+FZcXUAAQESnAji28PAZZv5tpFIpEBeQUK1EVdZv52qwwyl7xexSCYJRxevENZ3rHHv6WFHJHSTW8O6Sk7VcZ6o6AQNzJfFAcvfYEcQFBAAvA1jBzFcAeIyIhrt9QBDqhShbERuuhmaF+8ZAdU6zbEHRcZtcN28ibjmrvcQ1cu70NmXLZ5XcfvPhjc/pZOnMm9KKG8+crHRpGQo+aNVu0HnLUeLqAiKiiwBcDGAkgM8BaAVwG4DjoxVNEAYGYYxUtGLdUWhs1G3PqZsa6UaygVwzcTq70lj88PpiILYplSyu7qcdOlK5C7GT20vtQVHGxD4ZdbN0dNIx3YyJ0+4vzCyuKNCJAfwL
gCMBvAAAzPxWWO2gC43m/h35EZN3MPOSMI4rCJXELVDr1R1kpzT8yhLGyh9A3rfiQGdXGh33r0U2t89S9WSy6PjNWgD7Uh5HL1hh+3mrnFbFPMKUBdRQyAyyMsxUbOal0MrNl+9kTNwUfBSLgzDRMQB7mHkvUf4vgIgGIYR20ESUAPATAF8BsAXAi0T0EDO/FvTYghAFqpWel0CtTqwgyKrdvMLt7Eo7tkOwI9FAyNkFWHOMxQ+vV8q+9LENJcq/+Ll+LrpKnFwmCSq3MCrFPEZhRHZm9qWAhllo5Va45qTgq6XnjwodA/BHIvo+gBQRfQXAPwN4OIRzHwngbWZ+BwCI6NcATgEgBkCoOpxWem7uCrO/WMcdEEQ57NrTVxyivvSxDY7KP0GE6Yc1Y/U7O5BjRgPBVvkb7OjNFt07ZtmNxyqMtE3VvGAAykCsHV7cO2Gssp2MiVNfIy+yxoWOAfgegAsBrAPwTQCPALgjhHO3AnjP9HgLgKOsbyKii5GPQaCtTcoPhHhwWumZUyNVitBLo7OmxqRtQdOwwQk0NQ4uGcCy4pX3S95rdrs4GRKjd/8Da9JF5eugn23JZHNY9NB67OkrH+Jihsg+bdNMg4uLyYyueyfMzCyVMXFT8NXS80eFowEouGnWM/M4AD+rjEilMPMy5AfSYNq0aTKJTIgFt628oSBU6YcNRI7GwYxqMZxMNJRVpf5+7ftl78v2MxY9tF6pnHTbIejQkyk3VFZ0Fvf9nE8hVbWHMKPj3gkj+KpjQNwUfLX0/FHhaACYOUdEG8wjIUMkDeAQ0+ODC88JQtWhu5VXuYOcXBwjLBW0OxVK1e55lQLuyWTx95MPKhvRaM5fV7kv4uKe1ZuLLqt0TwYd96/FoofWY2cmW6Y43dw7YbSV1jEgOgq+mgvGdFxAzQDWE9GfAewynmTmuQHP/SKAw4loDPKK/x8AfC3gMQUhEnS38laFoMpYMfPJ7iw+d9UjyDEjQYRUssF2Nq5Xv/EDa9KOTde89Pm3g6B2V/nBepeyOS4auHRPBlcs78ZLm7a7jocEgnfrVBmQy5d3FwfX6BqjakbHAPzvKE7MzH1E9C0AjyGfBvpzZl4fxbkEIShetvJmhaDKWDGTT57hwv8Zvdlyg9FA+QDvmAUrSs7d7KCAzSMa7fCTa2+GAYw/aDie+8t2X5/3c757Vm8utnNwImjw1clQVFsufxCUrSCIaCiASwD8DfIB4DuZuXzOWwWRVhBCtWP1G7t1qPSLuU2BNf/ejF37BmvBll+8ppgaNDcm8enuPtfAsAqdTqhBu3XqdD3105E1Lvy0gvgVgGnIK/+/g/1oSEEQCti1hHDqUBkEsz976emTbfPogfIVb2dXGlf+Zm0oRsmr+k4lE7jlrHZ0XXsilp4xudguQiW7CnMFrqrFQtBunaqup3ZyDGScXEDjmXkiABDRnQD+XBmRBGFgYuc3tnaoDBOzAko0ADkbT07v3n11AQCw+OH1jrn+OjSQ95RRABgyaN9606ubzIxOBa71HF7RmYJWLbn8QXDaARSXCHG7fgRhIKBSFD2ZLLb2ZNDalMKMz40srngTRDj8gGG+z2cowm/f1429ChfQjt4srljejWs61xUfB0VH+aeSedViXtv3ZLKeGr8NG5wo60ChU4HrB7vdxLwprXhuwSzcclZ72W6gmnL5g+BkACYT0ceFn08ATDL+T0QfV0pAQYgTL50cnVwZhkvoub9sL2YF5ZixZcfuEqOgi1kRuilkI3hqGIEoIQDnTm/DDfMnoYHK3UR2StrO3ZJKJnD9qRNxs6WzqOHGCbPFgls311oZ/mKH0gXEzM4OMEGocbwWE3lpZ2CQyeaw8aMMbjxzsnbPfnMPfd1cfsMI6NKUSuKT3X3Ka7ILADc3JrFwzgQA+cC0yjBZlbRbhlVYM3lVhV06NQMDOdXTCZ00UEGoOa7pXId7X3ivRMFZh5OoFMMVy7uLk68MpTdvSitafebV6/TKMWPO
PPGSy+/FPLlV+DLy98tOYc9YslKZlQTYK2mvCtZriwUnY17tDduiRAyAUNPYrfpe2rTddmqVdYWvUgBm1bajN4uO+/O9d2aOaympZtWF4N4rx4w5qNsxeyy+fV+3r6BsEIz5vwxg684MLi8YRae6BANDSQfp1eO1xYLTKr/aG7ZFidZIyGpB6gAEL6hywXf3Oc/ANfK7dXLBDYYNTqB3b65M+fvNmHGCCLj5zPaSoSPff/AV2+phN5obk9id7fdcDOb3ulLJBowcNqRsZm/+tX21DWH3zlGNmCQAN5/VPuBHProRdCSkIAw4VKs+tzWPsfLXyQU32GWj/IF8nx+nAG/SxzeQOe9jNwcp50892PNxUskEFs6ZoDVy0opfo5bJ9heNql2AeNFD6yMZr6lazY9qStV0kNcN1z8/IppPRG8R0U7JAhIGEn59uIaysCoGr5k6QH6CVb+DxfGxaM9/rjCgxeDeF95zeHc5DQScNnWf3323X0Ec8GpUgHzsIcz0TgNVppG5a+dzC2bh3SUn47kFs+pC+QN6MYB/AzCHmV+PWhhBCBOVb7dR0WwNyCuFmeNaMGPJyjIXRGdX2lOwFoBWMzi/mH3tXs/Rz8DyF98rzuoNY26wlcbBg7BwzgTt7CYnggZkq70tc1zoGIAPRPkLAxFVs7OMYoBJa2HIirmFsl3q56KH1hezZJobk2BWZ81EpfytJHwYmmyOiwpRBVHeT+7H5WPcO13jkkomMDTZYBtEDiMgW6upnEHQMQAvEdFyAJ0A9hhPMvODUQklCAZhZIqYFTZQOqDEGuybsWSlaxvg7oUnlrx+Tec6X9k/QUmZAgjTD2v21ZXTGLiuMmDMQDJBykpjNzLZnKNxMgLBRgouAGV6Z5gTvoQ8OgZgfwC9AMx/9QxADIAQKWFMdTIKfVQKLpPN4cr71uKK5d2OitA4v5Hyac7AeWBNuuLKHwD6+rkYHH15805fx2hqTOLTPc6dXvbm8vOCme1rCYYNTiCZaHDcBaWSiTLD2pRKYtHcCba/S6uiB/TmKQvekDRQoWpRpWEmiHDjmZO1v/iqFMAgNDcm0dObjdTHr0NrwTXix8eeTBCGDR6kNdbRIJ82mkPGEkNxSq81fl9BVu+qv4WB1JI5TlRpoK47ACI6GMCtAGYUnnoWwGXMvCVcEQWhFJVvOsdsu/pTuQiCTr6yw/BTx6n8Af/BUaOC2etYyB292bIGbQAc/fw55sD+93qu1o0SnSzkXwB4CMCows/DhecEITTsmq45Bf6sqYFODb1mjmuxVVq1ACOfaaSLuSe/YRz9nNMLCSKtZnpOOOXxC/7RMQAtzPwLZu4r/PwSQEvEcgl1hEp5zxzX4liIZV7Vq4q+Fj+8PjYffaXQ3YUkiMoKnGaOC++r3NyYtP195ZgDF3W55fEL/tAxAB8R0blElCj8nAvgo6gFE+oHlfJe9cY23DB/orIAi4CiMlG5Anb0lhcW6ZJMUM2UyqeSCdu4yao3tvk6nl2ffqOq2KlwTlXU5dZ2u56rdaNEJwvoG8jHAG5Gfvf3PIB/ilIoob5Q+ee39mSKX/ArlneXreIZKLbsDcPP35RKgihfvWvOPrnco5+82rB2OTWj40O39v1JJRM4bWorVr2xTdm+ubMrrbxv1nPqZntJHn/4uBoAZt4EYG4FZBFqFKf87c6utHK4uLklg5syUbUHHjJInZ5ohgDHlMSwg8iVggDHLBknw2nu7+8lg8dQ6E7nNKPTj1+IBqUBIKLvMvO/EdGtsPl+MvOlkUomDBjcFLxqdadqywzkFZfZv6vqtW82EoB7/rgK827Cem29ewfuRFS3IKmqWhrQ7w9k/f3v2tOnvN92fnvJ8IkPpx2A0f5BEu9riLCrKd2276rV3VUPvlKWS26GUbr91xkA4uQi0HHjuLkm/NDalELv3r5QZvF6heAe5DXu15X3rS0LJhtBdHO7aOvv1+7374Sd3z6ufvxSWew8EvLhwr+/Mp4jogYA
+zGzdAMdgIRRWWvFbfuuWsU5KX9gX4GTQZBmXoYhclNOZoXT2ZW2VYpeMO9i4ogjMIAH1qQx7dCRtjsb4544tWqwM1zGVLTFD6/3ZNhaC62XrXid7hUGUXwXBiI67aD/i4j2J6JhAF4F8BoRdUQvmhA2TsraL27bdz+rONWXP0jLXrfe/uZzGsohqPI/Z3obAJS0ba40dr9fc9ot4K+YjWFvHFQ4KfQ4Mnyi+C4MRHSygMYz88dEdA6APwBYAGANgKWRSiaEjkpZp3syGLNgha9tsNv2XbW629OXU3aYNKZC2bVk9ot5B5HuyZRkthg9aYxzBg34GhO7gPzgFqf5uDo0NyZx8qSDSrJudu3p027hYP2967Z/9hJEt5O5cfAg7d9fpTN8JO6QR8cAJIkoCWAegB8zc5aIAv1FE9EZABYB+AKAI5lZ4gwVwCnjw1yoA+hvg9227yrXjSoAfG5h1ex1e24e8p4gwtlHHYLr5k0s8/Na2z0DwJ6+fry0aXvZ834ZVMh/X/rYBi3ln0yQ4/u6rj2x7Dkv8QnrLkxHySWIcNrUVkw7dKTnOIhRE1DNrpR6ngNsRscA3A5gI4C1AJ4hokMBBI0BvApgfuHYQoVwyvgw8Jp+p+Obt1vdGY/tlPaUHzzuKS3wms51JcYkx4y7V2/Gu9s+xcubd5YYEru2zZlsrihHGGT73fvsl7y/0G3TbkdkjYUYWHc0qlRaO9eLTs1EjrkYP7hh/kTtGIZTzUE1EUfcoRrx1Q2UiAYxc+DcOCJ6GsB3dHcA0g00OOYVseo3TwDeXXJyJcUq4lRABNgrmDFXrXCd8xsHTS7tpa1YWyZ7GUze2ZUum3ugarfsZfdgdNuc8oPHHX3+lRqiHmbmTj1lAQXpBnoZ8s3fPgFwB4ApyMcBHg9bSMX5LwZwMQC0tbVV4pQ1jXk1rvJ3h7EN9vvlcgvC2bmDqlH5A8AuD/UDBDhW1+qwxzLpzPrYwLp7cMoCMnYxC+dMUMYzKrXqDztzRyqLNVtBMPO/E9FsAM0A/hHAXXAxAET0JIADbV66mpl/pysgMy8DsAzI7wB0Pye4E9U2OMgXVcdtErRKVOUuCRsvwV9Gvi+P3972XqtprcrPbTFQDTN1pWI4fHQMgNHR6asA7mLm9UTu/WeZ+YRAkgmRE9WXWvVFvfK+fdO0gvbuD5KtMTTZ4FqHEAdBriloVkvQQrtKIJk74aNjANYQ0eMAxgC4ioiGA6i+b4/gy+0Sxpfael6VAjcGuVgzbtI9GVyxvBuXL+9Gc2NSGRA1Y3ZTpTwq9GpU/kDeMPklaFZLNazw3ZDMnfBxDQIXqn/bAbzDzD1E9BkArcz8iu+TEp2KfIfRFgA9ALqZebbb5yQIrMYusFeJwJzded1cLE4+Z13Mufth5NpXCzM+NxIbP8qU9TOyprJaYwWA/TD1WmqZHNffeC2gCgLrGAACcA6Aw5j5B0TUBuBAZv5zNKKqEQOgJq6ZqarzVsLPnkomMDTZoFWRWim/f9gkEwRwPrVUhXFtdu2sa00x1lPmTpj4zgIC8B/Iu3xmAfgB8tlADwD4UqgSCoGIyz+qOj5DvdIPSxlnsjntAiWd80VpJJIJQq6fXV1bVnR2NsY7ejJZpJIJ3HxWe80qxbjjELWGjtPxKGb+FwC7AYCZdwAYHKlUgmfimpmqOn5rUwo3njm5rP9OsoHgYYRtxYhC+SeIir1tlp4+GfsPTYZ8hnKMrBi3CVuCAOjtALJElEDh+0FELajTIHA1bz+9pHSGeR1O57ULLPb07sWuvd7aLbQ69L5pSiWxM5PVVt6qALNX5W/0unHKWOpnLimou6JCHUGNtNsoOl1W83dA8I6OAfi/AH4L4AAiuh7A6QCuiVSqKqTa28fqZnFEUUzjdF7rln30ghWez+EU5Fw0d4LjYBkr+w/N+8mNuEGyAfCTFLSjN4vGwYNw7vQ25bmtuyOnDCmi/O5obwjB7ARRJPny1f4dELzjGAQuZABN
B7AdwPHI75SfYubXlR+KkDiDwHEFWcPGy3XYNVLzW6lq7j/vlaZUEsOGDCqpWrVWn7q1KjCz0bQq/9xVjwTKSGqA83a41ZSlY9c/35iva9eIzsktZbSZsL7H2k7CerwgLT5q5TtQj/gKAjNzPxH9hJmnAHgjMukGALVShKJ7HXarPfNK1271p3IP6PSeccrl78lki+6fHHOZiwnItyrwM70raDqq2+Yh3ZNBx/1rbTN5jFRWVXtmJ8mGDRmE7oUn2t5zlaENGg+qle+AsA8dF9BTRHQagAfZT+e4GqFWilB0r0OnZ7zZrWBnMK5Y3o2XNm3Hqje2OR4r2UC4Yf6ksmZmTudd/PD6soriEamkVlpoZ1e6aDzCqElwQ5XJM2zIIMyb0uorNmAoXVVWTBQtPmrlOyDsQycL6JsAfgNgDxF9TESfEFHdjYS0myhVyfaxYWV16F6H7qrOeJ+dwWAA96ze7Oj2aW7Mu3cMJZhM6KUI7ejN4prOdcXJVoz8TmF3th+3nNWOhEOq0VUPrivev+mHNWudLwrcpqY1NyahugrVZwyDmMnmivcgrAlbcX8HhPBxNQDMPJyZG5h5MDPvX3i8fyWEqyZUY+sARJ5uZx7hZx7c4udcuuP3dFd1o5pS6OxKOw6aUSnj5sYkdmf70VPI4unJZAHep/ham1JoblSnTt69erMy2Hn2UYcoP2dOlXx5807H6yPkh9So+vIHwTw1zU6xLpwzAedMbyszAoT834D1781u1KOdu8wvcYxuFKJFpxL4CJundwLYFMZMAC9UWyVwpUrT4wi+6fnt1QFMu/da7xOB0Wvj9zdfl3XYiy63nNXuOFeAoDcYxZDHLt1Vt3bArprX+ndiN6TdHEBWDX4xH0eCtIIKVRBYxwX0HwBWA/hZ4Wc18i6hDURUPquujqjUYOk4gm92qz1jJWxe/bn594F9ve7Nnz1taqut8gf2XVdnVxoPrPG3o3KbYDWqKeXJzWV3P24+q125M7AWgS09Y7LjynnelNbiTsCISZgD7c8tmIXWppTtNLNFD613nGVszHyWgjDBik4QeCuAC5h5PQAQ0XjkW0J8F8CDqNBgmGqkUoo5ruCbTtm9TgDTrtf9jCUrle83rkt3eLlXDLeIblqquSe+bsDVbhfodi/d+t2r/q7MWVIq/M58FmobnR3A5w3lDwDM/BqAccz8TnRiDQwq1X4h6uCb3wBzZ1caDZp9HazKy8lIGtcV1Q7HUM4zx7W4vtftPofpF3dbUITxdxXFDlUYuOjsANYT0U8B/Lrw+CwArxHREAD6A09rkEoNlo6yV7vf6k7jc7oplFblNcJhXu4Vy7ux9LENju/xS3Njspg+6uZe0h11GLRBmeH/V93JEal8INzu780PkrcvGOgYgPMB/DOAywuPnwPwHeSV/8xIpBogVHKIRlRdEFVuh8UPry/Jy29uTGLhnAkl12yniIiAQQ1UkvtuZxSdNg6GuyKZICQbyLEVsheSCcLCORMc5TfjVhkdxu9aJ9hu3Cu7v7cdu/bYxlLM1dNWJG9fMHA1AMycIaL/APB7ZrbuHT+NRqyBw0BvT6taDVqLqXb0ZvMVrYCjP5oZGDZ4kGtf+h6Ntg3ZHKMhYOdQI2vGupp3WwUTSgvGouqDo2OIzPfK+vc25QeP2xoAosrtUIWBi6sBIKK5AJYi3wJ6DBG1A/gBM8+NWDYhRFR9fbysrbM5LgYknVIoezJZJBsITY1JbO3JFH3OZsWlm4JpXvx7bdmcIMKNZ062VdBu7iUGSpqnRTWQXMcd47RiVxnSnt7sgBjzKMSLjgtoIYAjATwNAMzcTURjohRKCEZnV9q28ZiBta+PFwyF5eaPzvZz8fx2q2U//mw75e9kFPqZbZVdZ1cau/a6l7AYxVYds8dqZXz5cRG5GUK3FbtbhthA36EK0aKTBZRlZmu5ZN32BKp2OrvS6Lh/rXZnTCtuHhcjIGnOftHBmn1ifN6pZYMOTpXGqpXz0sc2
aM8QNoyXcd2qc/it1rbL8DKuRiejSNozCEHQqQS+E8BTABYAOA3ApQCSzHxJ9OKVUm2VwNWIU0FQWFj96V7O2VoowDIPMw9jqLtTG2RrAHvMghW+BsB8uruvJCCdbCAsPWNy4CrcoMFlGdIiuBFkJvC/ArgawB4A9wJ4FMAPwxUvfuL6EoV93kqk+FldOrruHKOHjfkYN8yfiGGDBwVK9zQMksrtZQ1g68YfrMcoa1RnehikKDCom0bcPIJfdJrB9TLz1cz8pYIFuQvAj6MXrXKE2Wwt7vP6TfFLJRNoUrg57DC7dKzFUE2pZJmytPPVG8fYGUD5m90dux1GexkBbACOBWAqd1KCqGyXYj6m16JAmdkrVANKA0BEk4jocSJ6lYiuI6KDiOgB5N1Br1VOxOgJu6eP7pc7il5CHbPHardUBlBSvTph1HBP5zKvbudNacVzC2bh3SUno3vhiVh6emnvG5XLxdj56MhpJZVsQAPl+/5cvrzbdQeS7sm4FoAZHTRLz5NQFryZg+K6vvi4FhyCYMVpB/AzAP+FvN//rwC6AfwFwN8w883Ri1Y5wuzp4+XLHUUvoXlTWrH09MmObZQNWptSeHfJyUUf9fN/2e7pXE6K22wQjEZmqmPYKU8zyQbCOZZGdOdOb0NfP3saMJ8gcs27N4yhtbWDk/zG9eq2hKhUE0FBcMMpBjCEmX9Z+P8GIrqUmb9bAZkqTpjN1rzki0fZ5K1x8KBiIdbMcS22LZu39mSKQ9qJvKV2ec00cSpKMu7LlfettV1p7zd0EK6bN7HkuRlLVnoOHOeYHY2rWR4/U7Z0ffEyWlGoFpx2AEOJaAoRHVGYCbDH8rhmsFuBJhOEXXv6PPtovXy5o0jhs9uBPLAmjdOmtpb5+M3q08tURD8Nz9xWyPOmtKJfIYRdsZMfZZlKNqBJsTNqIDheU5hN3yrVRFAQ3HDaAbwP4CbT4/9neswAambChLVisqmQ8mdkpngp+/eyqo+iUlO1A/n92vfxye7g83sI5T1yDNwymtxWyKp710CEMQtWYEQqWWwx0eBjlq9q6DwA7D80GXnTNwNp0SBUC0oDwMyRNXojoqUA5gDYi3xc4Z+YuSeq8+lg/nLPWLKyLJ1Qt+zf65c77BQ+p57xYaBapV7TuQ73rN5c3FX46ZWjSic1FL35GlTK3615nMoIBMlEMtBN6ZUWDUK1oFMJHAVPAPgiM08C8CaAq2KSw5agOd1xzk0N6kYwJlnZpXI6ZbWYlb+B18Cm9d55rRJOJRuKk7e80kAUKAvHa2aPNUguyl+IA51CsNBhZvMUsdUATo9DDhV+grN+CrqiKD4L0jPeXNnqRT6nXvZbC6mXTsdRvT6mEKDWJ28wOmaPxRXLu21lMgbR2+0ygnT39BL8l8pdoVpwbAVBRATgYGZ+LzIBiB4GsJyZ71a8fjGAiwGgra1t6qZNm6ISpYjXYe9+hsNHOVDeqmB69/Zp9Qa65ax2X+d2aq0wbHACvXtzJa8nE4RhgwdhZyZbjLfYDUzXHdloxmi9YHVJmY8LqDOO/A5QV90DAvDukpOLj6P8vQuCCl9D4TlvHR7xecInC0Vk1p9TTO+5GkAfgHscZFjGzNOYeVpLi/sIvzDw6sbxk9cdZS641b2wcM4Exzx7IH+NfhWQ085ol0X5A/kK2p5MFox8iwWrzz6TzeHK+9Yi3ZNxbU5nxXDTXTdvYnFou/V36JRx5DcVUzezR2oAhGpCxwX0MhF9iZlf9HJgZj7B6XUiOh/A3wM4nt060sWAl+Csn5hBJXPBzUFHQ6laV8ZeM1Cu6VyHe194DznOD21pAKDOsfGOsTpn7Gsj0aSRBWRWuE6/w7BrMHSD/1IDIFQTOgbgKADnENEmALtQ+D4WAri+IKKTAHwXwHHM3Ov3ONWCH2USZRGYHWZlGNQHfU3nupJ5AsYCPtkAOGRa+saY6GV2zdi5Usy1G27XFXYqpm5mT6V/74LghI4BmB3BeX8MYAiA
J/JhBqyOo710WPhRJmErIJVSVz0fxN987wv2IaEolL+BdYUctHYjilRMnfsqNQBCNeE6D6D4RqIDAAw1HjOzv5FSAajmeQBxZgGpAounTW0tawERJOBoyBv2vAFzUFjl2nELzgbpx19pJAtIqDSqILDOQJi5AG4EMArAhwAOBfA6M0+IQlAnqtkARIWOslApv4RPZaqSw296qRPWub1+s2R0s3AEoR4JMhDmhwCmA3iSmacQ0UwA54YtoFCOVRmq3BqqAKKqWtZqLHTy9FVpk0Gxzu3165oR37ogeEfHAGSZ+SMiaiCiBmZeRUS3RC2YoF9cpFJ+qh0AIa/UjRiBk5ExXo9C+RuyW/EToxDfuiB4R8cA9BDRfgCeBXAPEX2IfDaQECGdXWmlrz3dk8GMJSuLK2RVu2eV0magaETcjIxb//wghKmgpb+OIHhHJwYwDEAG+VTvcwCMAHAPM38UvXil1EsMwM3fbpfHf0TbCDzncaBLUyqpbBJn+M79DFB3IkGEfmZR0IJQQXzHAJh5FxEdCuBwZv4VETUCcC4rFQLhtOpWzdb973e8KX/AuUOo4ZrxM0BdhZGZtOqNbdjakymZKRwUyawRBO+4dgMloosA3A/g9sJTrQA6I5Sp7nGqClWtxh06IHvG7JpxG9eoS2tTqpiWGvYsXJmxKwj+0GkH/S8AZgD4GACY+S0AB0QpVD1iHiTfoGiD3NqU8tXq2At2k7rMM3G99uYB8q6m5xbMwqo3tkXSB0f66wiCP3QMwB5m3ms8IKJB8DY+VnDBuoK1C94aq3LVGMlkCJMdEkS2vemN5nKtTSnHX3wyQWV/UMkGwqK5+ZKRqPrgSH8dQfCHThbQH4no+wBSRPQVAP8M4OFoxaovVD5/p4CpUZGbIEImm4PH2Sm2uKV6OinU1oKMhmx2vviocvWlBkAQ/KFjABYAuADAOgDfRL499B1RClXrWAOWqiBrjhkbC1WshotIlfqpk6afbCDsN3SQcjaAm3tJJau1srhSDdiiPq4g1Do6WUD9AH5W+BECYld4pcIYiWj3GbsRjCoIKGsQ50dhBlW0UeXqSw2AIPjD1QAQ0QwAi5DvATQI+9pBHxataLWJl8IqwyVj9xkvQRhrLxy/CjMMRRu0E2mljysItYyOC+hOAFcAWAMgmpLQOsJLYNJwyQQJZqrcOn4VpihaQagddAzATmb+Q+SS1DiG31+1cnea0qXyvdsVhZkxH8PcytnoEdTa5D43QBCE2kWnFcQS5Ct/HwSwx3iemV+OVrRyBmorCLfWDtYKWasCdur3b/7MzHEttsdwOr9qboBhXFrFGAjCgCdIO+ijCv+aP8wAqmvKRhXj5PdvdVDcBrq+986uNFa9sc3T+TPZXHG2rxnjkdtkLUEQBi46WUAzKyFILaPy4RPKM2tUCtfN9+7U1tkthuCW/2/XgloQhIGP0gAQ0bnMfDcRfdvudWa+KTqxagunQiXdnv9uOB3HraGbam6AGamqFYTaw6mBwLDCv8NtfvaLWK6qwtynZ8aSlZ6bjKnaN3TMHhtaGwOn4zg1dEslEzj7qENcG74x8qMnr+lcF+he6BD0fguCoIdyB8DMtxf+XWx9jYguj1CmqkJ3LKMTTj581ZB1r20MnHYZ5vOrsoCmHTqy+Loquyjdk8HdqzeXPA47PhDG/RYEQQ/XLCDbDxFtZua2CORxJI4sINXAdT+D1e3wOwQ9quMYx1IZJjvCuhdA9PdbEOqRIFlAtscLKM+AIepOk2G1MQizHYIRcNadBuZ0L7zWF0hnT0GoHH4NQN20g1a5VkakkqGdI6zq2rCrdHWngancVX7cOdLZUxAqhzIITESfENHHNj+fABhVQRljZea4FtvnP96drfngpM40MKdmcH4GtTgFzAVBCBelAWDm4cy8v83PcGb2u3MYcNgVVgH5EYy1PnHKPA2MkPfDnzu9reSxU4zBjzvH7px+4hiCILgTiyInoh8COAVAP4APAZzPzFvj
kMUNJ2VVD37pIG4lv+4caTgnCJUhhEGCvljKzJOYuR3A7wFcW4mT+skvd1JW4pd2Rtw5glDdxLIDYOaPTQ+HoQJBZb/55R2zx6Lj/rXI5kpFTDaQJ0VW6W6b1dDdUwa1CEJ1E5svn4iuB3AegJ0AIu835LflgvHa4ofXF0cpNqWSWDR3Qkm3TiclV+nipmoqphJ3jiBUL74KwbQOTPQkgANtXrqamX9net9VAIYy80LFcS4GcDEAtLW1Td20aZMveVQ57YTyiVle0CnAqnRxkxRTCYJgJuxCMFeY+QTNt96D/KB5WwPAzMsALAPylcB+5Ykqv1xnZ1Hp4qYg56sG15EgCJUhliAwER1uengKgDeiPmdUAUkdZasyMlEFkf2ez9jNpHsyYOxzHdV6vYMg1CtxZQEtIaJXiegVACcCuCzqE0aVX66jbCudDeP3fH4KtwRBGLjElQV0WhznjSIgaR3oApQr20pnw/g9n/ThEYT6om4qeqNCV9lWOhvGz/mkD48g1BdiAEKgVlIddXYzgiDUDjVvACSrRR8p3BKE+qKmDUA1FUQNFGplNyMIgjtxZQFVBMlqEQRBUFPTOwC/WS3iNhIEoR6o6R2An4IoKYYSBKFeqGkD4KcgStxGgiDUCzXtAvKT1SLFUIIg1As1bQAA71ktUgwlCEK9UNMuID/IFCtBEOqFmt8BeEWKoQRBqBfEANggxVCCINQD4gISBEGoU2QHUOVIUZogCFEhBqCKkV5GgiBEibiAqhgpShMEIUrEAFQxUpQmCEKUiAGoYio9TF4QhPpCDEAVI0VpgiBEiQSBqxgpShMEIUrEAFQ5UpQmCEJUiAtIEAShThEDIAiCUKeIARAEQahTxAAIgiDUKWIABEEQ6hRi5rhl0IaItgHYFLccIfJZAH+NW4iIqfVrrPXrA+Qaa4FDmbnF+uSAMgC1BhG9xMzT4pYjSmr9Gmv9+gC5xlpGXECCIAh1ihgAQRCEOkUMQLwsi1uAClDr11jr1wfINdYsEgMQBEGoU2QHIAiCUKeIARAEQahTxADEDBH9kIheIaJuInqciEbFLVOYENFSInqjcI2/JaKmuGUKGyI6g4jWE1E/EdVUKiERnUREG4jobSJaELc8YUNEPyeiD4no1bhliQMxAPGzlJknMXM7gN8DuDZmecLmCQBfZOZJAN4EcFXM8kTBqwDmA3gmbkHChIgSAH4C4O8AjAdwNhGNj1eq0PklgJPiFiIuxADEDDN/bHo4DEBNReWZ+XFm7is8XA3g4DjliQJmfp2ZN8QtRwQcCeBtZn6HmfcC+DWAU2KWKVSY+RkA2+OWIy5kIEwVQETXAzgPwE4AM2MWJ0q+AWB53EII2rQCeM/0eAuAo2KSRYgAMQAVgIieBHCgzUtXM/PvmPlqAFcT0VUAvgVgYUUFDIjb9RXeczWAPgD3VFK2sNC5RkEYaIgBqADMfILmW+8B8AgGmAFwuz4iOh/A3wM4ngdo4YmH32EtkQZwiOnxwYXnhBpBYgAxQ0SHmx6eAuCNuGSJAiI6CcB3Acxl5t645RE88SKAw4loDBENBvAPAB6KWSYhRKQSOGaI6AEAYwH0I9/q+hJmrplVFhG9DWAIgI8KT61m5ktiFCl0iOhUALcCaAHQA6CbmWfHKlRIENFXAdwCIAHg58x8fbwShQsR3Qvgy8i3g/4AwEJmvjNWoSqIGABBEIQ6RVxAgiAIdYoYAEEQhDpFDIAgCEKdIgZAEAShThEDIAiCUKeIARAEC0Q0j4iYiMa5vO9yImoMcJ7ziejHfj8vCEERAyAI5ZwN4E+Ff524HIBvAyAIcSMGQBBMENF+AP4WwAXIV76CiBJE9CMierUw1+BfiehSAKMArCKiVYX3fWo6zulE9MvC/+cQ0QtE1EVETxLR/6j0dQmCHdILSBBKOQXAo8z8JhF9RERTkW+LPBpAOzP3EdFIZt5ORN8GMJOZ/+pyzD8BmM7MTEQXIt8a48ooL0IQdBADIAil
nA3g3wv//3Xh8RgAtxlzDZjZa//4gwEsJ6KDAAwG8G5IsgpCIMQACEIBIhoJYBaAiUTEyPe/YeSboulg7qsy1PT/WwHcxMwPEdGXASwKLKwghIDEAARhH6cDuIuZD2Xm0cx8CPKr9bUAvklEg4CioQCATwAMN33+AyL6AhE1ADjV9PwI7Guj/PVIr0AQPCAGQBD2cTaA31qeewDAQQA2A3iFiNYC+FrhtWUAHjWCwAAWID/X+XkA75uOsQjAb4hoDQC3eIEgVAzpBioIglCnyA5AEAShThEDIAiCUKeIARAEQahTxAAIgiDUKWIABEEQ6hQxAIIgCHWKGABBEIQ65f8DjvuoYpTPeQcAAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "\"\"\"\n",
    "Created on Sun May 16 08:05:43 2021\n",
    "\n",
    "@author: Jacob\n",
    "\"\"\"\n",
    "import matplotlib.pyplot as plt\n",
    "import math\n",
    "import os\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd \n",
    "from pandas import DataFrame\n",
    "\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.linear_model import LinearRegression, Ridge, ElasticNet\n",
    "from sklearn.metrics import mean_squared_error as mse\n",
    "from sklearn.model_selection import KFold\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "from sklearn import datasets, linear_model\n",
    "from sklearn.metrics import mean_squared_error, r2_score\n",
    "\n",
    "import spacy\n",
    "from transformers import XLNetTokenizer\n",
    "\n",
    "import sys\n",
    "\n",
    "\n",
    "print(\"working directory\" , os.getcwd())\n",
    "\n",
    "useSpaCyParsing = True\n",
    "\n",
    "print()\n",
    "\n",
    "if useSpaCyParsing:\n",
    "    print(\"spaCy features\")\n",
    "    nlp = spacy.load('en_core_web_lg')\n",
    "\n",
    "\n",
    "\n",
    "nfolds =  10\n",
    " \n",
    " \n",
    "# count syllables: https://stackoverflow.com/questions/46759492/syllable-count-in-python\n",
    "def syllable_count(word):\n",
    "    \n",
    "    count = 0\n",
    "    vowels = \"aeiouy\"\n",
    "    \n",
    "    if word[0] in vowels:\n",
    "        count += 1\n",
    "        \n",
    "    for index in range(1, len(word)):\n",
    "        if word[index] in vowels and word[index - 1] not in vowels:\n",
    "            count += 1\n",
    "            if word.endswith(\"e\"):\n",
    "                count -= 1\n",
    "                \n",
    "    if count == 0:\n",
    "        count += 1\n",
    "        \n",
    "    return count\n",
    "\n",
    "\n",
    "\n",
    "def punctsPerSentence(text):\n",
    "    \n",
    "    doc = nlp(text)\n",
    "    \n",
    "    nPuncts = 0 \n",
    "    nSentences = 0\n",
    "    \n",
    "    for sent in doc.sents:\n",
    "        \n",
    "        sentence = sent.text.strip()\n",
    "        \n",
    "        nSentences += 1\n",
    "        \n",
    "        for i in range (0, len (sentence)):   \n",
    "            #Checks whether given character is a punctuation mark  \n",
    "            if sentence[i] in ('!', \",\" ,\"\\'\" ,\";\" ,\"\\\"\", \".\", \"-\" ,\"?\"):  \n",
    "                nPuncts += 1 \n",
    "                  \n",
    "#    print (\"Number of punctuation characters in text: \", nPuncts) \n",
    "    \n",
    "    punctsPerSentence = nPuncts / nSentences\n",
    "    \n",
    "    return punctsPerSentence\n",
    "    \n",
    "\n",
    "    \n",
    "xtrain = pd.read_csv('../input/commonlitreadabilityprize/train.csv')\n",
    "xtest = pd.read_csv('../input/commonlitreadabilityprize/test.csv')\n",
    "\n",
    "\n",
    "print(\"xtrain\", xtrain.head(7))\n",
    "print()\n",
    "\n",
    "print(\"xtrain target\", xtrain.target.head(7))\n",
    "print()\n",
    "\n",
    "\n",
    "\n",
    "print(\"calculating Flesch score\")\n",
    "# Flesch score: https://blog.ung.edu/press/measure-readability/\n",
    "\n",
    "# count the characters\n",
    "xtrain['nof_char'] = xtrain['excerpt'].apply(len)\n",
    "xtest['nof_char'] = xtest['excerpt'].apply(len)\n",
    "\n",
    "# count the words\n",
    "xtrain['nof_words'] = xtrain['excerpt'].apply(lambda s: len(s.split(' ')))\n",
    "xtest['nof_words'] = xtest['excerpt'].apply(lambda s: len(s.split(' ')))\n",
    "\n",
    "# words to characters\n",
    "xtrain['w2c'] = xtrain['nof_words'] / xtrain['nof_char']\n",
    "xtest['w2c'] = xtest['nof_words'] / xtest['nof_char']\n",
    "\n",
    "# nof sentences\n",
    "xtrain['nof_sentences'] =  xtrain['excerpt'].apply(lambda s: s.count('.'))\n",
    "xtest['nof_sentences'] =  xtest['excerpt'].apply(lambda s: s.count('.'))\n",
    "\n",
    "# nof syllables\n",
    "xtrain['nof_syllables'] =  xtrain['excerpt'].apply(lambda s: syllable_count(s))\n",
    "xtest['nof_syllables'] =  xtest['excerpt'].apply(lambda s: syllable_count(s))\n",
    "\n",
    "# nof punctuation characters per sentence\n",
    "xtrain['punctsPerSentence'] =  xtrain['excerpt'].apply(lambda s: punctsPerSentence(s))\n",
    "xtest['punctsPerSentence'] =  xtest['excerpt'].apply(lambda s: punctsPerSentence(s))\n",
    "\n",
    "\n",
    "# Flesch score\n",
    "a = 206.835 - 1.015 * (xtrain['nof_words'] / xtrain['nof_sentences'])\n",
    "b = -84.6 * (xtrain['nof_syllables'] / xtrain['nof_words'])\n",
    "xtrain['flesch_score'] = a + b\n",
    "\n",
    "a = 206.835 - 1.015 * (xtest['nof_words'] / xtest['nof_sentences'])\n",
    "b = -84.6 * (xtest['nof_syllables'] / xtest['nof_words'])\n",
    "xtest['flesch_score'] = a + b\n",
    "\n",
    "# Flesch score 2\n",
    "a = (xtrain['nof_words'] / xtrain['nof_sentences'])\n",
    "b = (xtrain['nof_syllables'] / xtrain['nof_words'])\n",
    "xtrain['flesch_score2'] = 0.39 * a + 11.8 * b - 15.59\n",
    "\n",
    "a = (xtest['nof_words'] / xtest['nof_sentences'])\n",
    "b = (xtest['nof_syllables'] / xtest['nof_words'])\n",
    "xtest['flesch_score2'] = 0.39 * a + 11.8 * b - 15.59\n",
    " \n",
    "    \n",
    "del a,b\n",
    "\n",
    "\n",
    "print(\"counting unique words\")\n",
    "# count the unique words\n",
    "xtrain['nof_unique_words'] = xtrain['excerpt'].apply(lambda s: len(set( s.split(' ') )))\n",
    "xtest['nof_unique_words'] = xtest['excerpt'].apply(lambda s: len(set( s.split(' ') )))\n",
    "\n",
    "# text diversity\n",
    "xtrain['txt_diversity'] = xtrain['nof_unique_words'] / xtrain['nof_words']\n",
    "xtest['txt_diversity'] = xtest['nof_unique_words'] / xtest['nof_words']\n",
    "\n",
    "# word lengths\n",
    "words = xtrain['excerpt'].apply(lambda s: s.split(' '))\n",
    "word_lengths = words.apply(lambda s: [len(f) for f in s ])\n",
    "xtrain['longest_word'] = word_lengths.apply(max)\n",
    "xtrain['avg_word'] = word_lengths.apply(np.mean)\n",
    "\n",
    "words = xtest['excerpt'].apply(lambda s: s.split(' '))\n",
    "word_lengths = words.apply(lambda s: [len(f) for f in s ])\n",
    "xtest['longest_word'] = word_lengths.apply(max)\n",
    "xtest['avg_word'] = word_lengths.apply(np.mean)\n",
    "\n",
    "xtrain['txt_diversity'] = xtrain['nof_unique_words'] / xtrain['nof_words']\n",
    "xtest['txt_diversity'] = xtest['nof_unique_words'] / xtest['nof_words']\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "nText = 0\n",
    "\n",
    "for text in xtrain.excerpt:\n",
    "    \n",
    "    target = xtrain.target.iloc[nText]\n",
    "\n",
    "    nText += 1\n",
    "    \n",
    "    if nText % 100 == 0:\n",
    "        print()\n",
    "        print(\"--------------------------------------------------------------------------------\")\n",
    "        print(\"text\", nText , text[:77], \"target\", target)\n",
    "        print()\n",
    "    \n",
    "\n",
    "\n",
    "\n",
    "                    \n",
    "useSpaCyFeatures = True\n",
    "\n",
    "if useSpaCyFeatures:\n",
    "    \n",
    "    print(\"getting spaCy features\")\n",
    "\n",
    "    with nlp.disable_pipes():\n",
    "        train_vectors = np.array([nlp(text).vector for text in xtrain.excerpt])\n",
    "        test_vectors = np.array([nlp(text).vector for text in xtest.excerpt])\n",
    "            \n",
    "    namelist = ['f' + str(ii) for ii in range(train_vectors.shape[1])]\n",
    "    \n",
    "    print(\"namelist\" , namelist)\n",
    "    \n",
    "    train_vectors = pd.DataFrame(train_vectors)\n",
    "    test_vectors = pd.DataFrame(test_vectors)\n",
    "    train_vectors.columns = namelist\n",
    "    test_vectors.columns = namelist\n",
    "    \n",
    "    print(\"train_vectors\" , train_vectors.head())\n",
    "    \n",
    "    # combined\n",
    "    xtrain = pd.concat([xtrain, train_vectors], axis = 1)\n",
    "    xtest = pd.concat([xtest, test_vectors], axis = 1)\n",
    "\n",
    "#features = namelist\n",
    "\n",
    "    features = ['nof_words', 'nof_sentences', 'nof_syllables', 'flesch_score',\n",
    "            'txt_diversity', 'nof_unique_words', 'nof_char', 'w2c', \n",
    "            'flesch_score2', 'punctsPerSentence'] + namelist\n",
    "\n",
    "else:\n",
    "\n",
    "    features = ['nof_words', 'nof_sentences', 'nof_syllables', 'flesch_score',\n",
    "            'txt_diversity', 'nof_unique_words', 'nof_char', 'w2c', \n",
    "            'flesch_score2', 'punctsPerSentence'] \n",
    "\n",
    "print(\"xtrain\", xtrain.head())\n",
    "print()\n",
    "\n",
    "X = xtrain[features].sample(frac=0.8, random_state=0)\n",
    "# Xtest = xtrain[features].drop(X.index)\n",
    "\n",
    "y = xtrain['target'].sample(frac=0.8, random_state=0)\n",
    "# ytest = xtrain['target'].drop(y.index)\n",
    "\n",
    "Xtest = xtrain[features].iloc[4::5]\n",
    "ytest =  xtrain['target'].iloc[4::5]\n",
    "\n",
    "testTexts = xtrain['excerpt'].iloc[4::5]\n",
    "        \n",
    "print(len(X), len(Xtest), len(y), len(ytest))\n",
    "\n",
    "print(\"X\", X.head(7))\n",
    "print()\n",
    "print(\"y\", y.head(7))\n",
    "print()\n",
    "print(\"Xtest\", Xtest.head(7))\n",
    "print()\n",
    "print(\"ytest\", ytest.head(7))\n",
    "print()\n",
    "print(\"testTexts\", testTexts.head(7))\n",
    "print()\n",
    "\n",
    "   \n",
    "# Create linear regression object\n",
    "regr = linear_model.LinearRegression()\n",
    "\n",
    "# Train the model using the training sets\n",
    "regr.fit(X, y)\n",
    "\n",
    "# Make predictions using the testing set\n",
    "y_pred = regr.predict(Xtest)\n",
    "\n",
    "# The coefficients\n",
    "print('Coefficients: \\n', regr.coef_)\n",
    "# The mean squared error\n",
    "print('Mean squared error: %.10f' % mean_squared_error(ytest, y_pred))\n",
    "print('Root Mean squared error: %.10f' % math.sqrt(mean_squared_error(ytest, y_pred)))\n",
    "# The coefficient of determination: 1 is perfect prediction\n",
    "print('Coefficient of determination: %.2f'\n",
    "      % r2_score(ytest, y_pred))\n",
    "print(\"sklearn RMSE\" ,  np.sqrt(mse(y_pred, ytest)))\n",
    "\n",
    "def plot_prediction(x, y):\n",
    "    plt.scatter(x, y, label='Predictions')\n",
    "#      plt.plot(x, y, color='k', label='Predictions')\n",
    "    plt.xlabel('Actual')\n",
    "    plt.ylabel('Linear Regression Prediction')\n",
    "    plt.legend()\n",
    "      \n",
    "plot_prediction(ytest, y_pred)\n",
    "\n",
    "\n",
    "print(\"ytest is a\",type(ytest), \"    y_pred is a\",type(y_pred))\n",
    "\n",
    "\n",
    "testResults = DataFrame()\n",
    "testResults['excerpt'] = testTexts \n",
    "testResults['predregr'] = y_pred \n",
    "testResults['target'] = ytest\n",
    "#testResults.loc[:,'target'] = ytest\n",
    "\n",
    "print(\"testTexts is a\",type(testTexts), \"    testResults is a\",type(testResults))\n",
    "\n",
    "print(testResults.head(7))\n",
    "\n",
    "# testResults.to_csv('regression.csv', index = False)\n",
    "\n",
    "#continuousNN.doMultipleRegression(X , y, Xtest , ytest)\n",
    "\n",
    "#continuousNN.doMultipleNN(X , y , Xtest , ytest)\n",
    "\n",
    "Xtest = xtest[features]\n",
    "\n",
    "y_pred = regr.predict(Xtest)\n",
    "\n",
    "xsub = xtest[[\"id\"]].copy()\n",
    "xsub[\"target\"] = y_pred\n",
    "xsub.to_csv('regression.csv', index = False)\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Transformers XLNet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-06-25T20:29:57.316067Z",
     "iopub.status.busy": "2021-06-25T20:29:57.315669Z",
     "iopub.status.idle": "2021-06-25T20:37:56.431985Z",
     "shell.execute_reply": "2021-06-25T20:37:56.430279Z",
     "shell.execute_reply.started": "2021-06-25T20:29:57.316036Z"
    },
    "trusted": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "All model checkpoint layers were used when initializing TFXLNetForSequenceClassification.\n",
      "\n",
      "All the layers of TFXLNetForSequenceClassification were initialized from the model checkpoint at ./savedmodel/5.\n",
      "If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLNetForSequenceClassification for predictions without further training.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "nr of testData 566\n",
      "\n",
      "Once upon a time there were Three Bears who lived\n",
      "One day he had gone beyond any point which he had\n",
      "Aunt Abigail was gone, Eleanor was gone. The room\n",
      "Father had been away in the country for three or \n",
      "One beautiful misummer night in 18— a large, heav\n",
      "Before Fred could complete the sentence his foot \n",
      "When Josie arrived at her destination she went to\n",
      "n 10 predictedScore -0.6352807\n",
      "n 20 predictedScore -0.57780975\n",
      "n 30 predictedScore -2.1973433\n",
      "n 40 predictedScore 0.3600465\n",
      "n 50 predictedScore -1.5519806\n",
      "n 60 predictedScore -0.85689175\n",
      "n 70 predictedScore -0.09774914\n",
      "n 80 predictedScore -0.66253567\n",
      "n 90 predictedScore 0.100836724\n",
      "n 100 predictedScore -1.6850315\n",
      "n 110 predictedScore -2.3307252\n",
      "n 120 predictedScore -1.9078956\n",
      "n 130 predictedScore -0.14376666\n",
      "n 140 predictedScore -0.5901213\n",
      "n 150 predictedScore -1.5638647\n",
      "n 160 predictedScore -0.7154779\n",
      "n 170 predictedScore -0.65485036\n",
      "n 180 predictedScore -0.527873\n",
      "n 190 predictedScore -2.7968028\n",
      "n 200 predictedScore -0.9472739\n",
      "n 210 predictedScore 0.5601345\n",
      "n 220 predictedScore 0.58908087\n",
      "n 230 predictedScore 0.029123785\n",
      "n 240 predictedScore -1.9193069\n",
      "n 250 predictedScore -1.1744077\n",
      "n 260 predictedScore -2.4476063\n",
      "n 270 predictedScore -2.6000533\n",
      "n 280 predictedScore -0.71867293\n",
      "n 290 predictedScore -2.2777648\n",
      "n 300 predictedScore -0.74542314\n",
      "n 310 predictedScore -0.2118968\n",
      "n 320 predictedScore -0.7084438\n",
      "n 330 predictedScore -3.1341808\n",
      "n 340 predictedScore -1.6591183\n",
      "n 350 predictedScore -2.861485\n",
      "n 360 predictedScore -0.39417243\n",
      "n 370 predictedScore -3.2452447\n",
      "n 380 predictedScore 0.39630795\n",
      "n 390 predictedScore 0.33230662\n",
      "n 400 predictedScore 0.089797206\n",
      "n 410 predictedScore -0.9605478\n",
      "n 420 predictedScore -2.0486407\n",
      "n 430 predictedScore -2.3067622\n",
      "n 440 predictedScore -0.46318254\n",
      "n 450 predictedScore -1.3052845\n",
      "n 460 predictedScore -1.2545989\n",
      "n 470 predictedScore -0.4134524\n",
      "n 480 predictedScore -1.9913256\n",
      "n 490 predictedScore -1.0554487\n",
      "n 500 predictedScore 0.115702085\n",
      "n 510 predictedScore -0.66827846\n",
      "n 520 predictedScore 0.11224969\n",
      "n 530 predictedScore -2.9705956\n",
      "n 540 predictedScore -1.5697746\n",
      "n 550 predictedScore -3.2411354\n",
      "n 560 predictedScore -0.817201\n",
      "prediction done\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "Length of values (566) does not match length of index (7)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-14-c07b27597ea6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m    506\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    507\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 508\u001b[0;31m         \u001b[0mpredict_testdata\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtestData\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    509\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    510\u001b[0m         \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-14-c07b27597ea6>\u001b[0m in \u001b[0;36mpredict_testdata\u001b[0;34m(texts)\u001b[0m\n\u001b[1;32m    420\u001b[0m     \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"prediction done\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    421\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 422\u001b[0;31m     \u001b[0mtestResults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'target'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpredictedScores\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    423\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    424\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m   3161\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3162\u001b[0m             \u001b[0;31m# set column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3163\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_set_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3164\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3165\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_setitem_slice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mslice\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_set_item\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m   3240\u001b[0m         \"\"\"\n\u001b[1;32m   3241\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ensure_valid_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3242\u001b[0;31m         \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sanitize_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3243\u001b[0m         \u001b[0mNDFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_set_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3244\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_sanitize_column\u001b[0;34m(self, key, value, broadcast)\u001b[0m\n\u001b[1;32m   3897\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3898\u001b[0m             \u001b[0;31m# turn me into an ndarray\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3899\u001b[0;31m             \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msanitize_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3900\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mIndex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3901\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/opt/conda/lib/python3.7/site-packages/pandas/core/internals/construction.py\u001b[0m in \u001b[0;36msanitize_index\u001b[0;34m(data, index)\u001b[0m\n\u001b[1;32m    750\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    751\u001b[0m         raise ValueError(\n\u001b[0;32m--> 752\u001b[0;31m             \u001b[0;34m\"Length of values \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    753\u001b[0m             \u001b[0;34mf\"({len(data)}) \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    754\u001b[0m             \u001b[0;34m\"does not match length of index \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Length of values (566) does not match length of index (7)"
     ]
    }
   ],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "\"\"\"\n",
    "Created on Fri May 28 14:00:29 2021\n",
    "\n",
    "@author: Jacob\n",
    "\"\"\"\n",
    "\n",
    "from transformers import BertTokenizer\n",
    "from transformers import TFBertForSequenceClassification\n",
    "\n",
    "from transformers import XLNetTokenizer\n",
    "from transformers import TFXLNetForSequenceClassification\n",
    "\n",
    "# from transformers import AutoModelForSequenceClassification\n",
    "# from transformers import TrainingArguments\n",
    "# from transformers import Trainer\n",
    "\n",
    "import tensorflow_datasets as tfds\n",
    "import tensorflow as tf\n",
    "\n",
    "import csv\n",
    "import os\n",
    "import sys\n",
    "import gc\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import DataFrame\n",
    "\n",
    "import codecs\n",
    "import math\n",
    "from sklearn.metrics import mean_squared_error \n",
    "\n",
    "\n",
    "# max_length_text = 7\n",
    "# batch_size = 100\n",
    "# number_of_epochs = 1\n",
    "\n",
    "# max_length_text = 100\n",
    "# batch_size = 50\n",
    "# number_of_epochs = 3\n",
    "\n",
    "max_length_text = 150\n",
    "batch_size = 10\n",
    "number_of_epochs = 1\n",
    "\n",
    "#max_length_text = 300\n",
    "#batch_size = 1\n",
    "#number_of_epochs = 1\n",
    "\n",
    "\n",
    "testScores = []\n",
    "testData = []\n",
    "\n",
    "def load_train_validate_lit_data(data_dir):\n",
    "    \"\"\"Build the CommonLit train/validate dataframes from train.csv.\n",
    "\n",
    "    Input:\n",
    "    data_dir: kept for call compatibility; it is not used, the CSV is read\n",
    "              from the fixed Kaggle input path below.\n",
    "\n",
    "    The first CSV row is skipped. Every fifth row after that becomes a\n",
    "    validation row and is also appended to the module-level testData and\n",
    "    testScores lists. All other rows go to the training split, repeated\n",
    "    1 to 10 times depending on their score.\n",
    "\n",
    "    Returns:\n",
    "    train/validate datasets as pandas dataframes with columns\n",
    "    'text' and 'score', each with its rows shuffled.\n",
    "    \"\"\"\n",
    "\n",
    "    # (exclusive upper score bound, copies in the training split). The first\n",
    "    # bound a score falls below decides; any other score gets default_copies.\n",
    "    copies_below = [(-3.1, 10), (-2.6, 4), (-2.1, 2), (0.1, 1), (0.6, 2), (1.2, 5)]\n",
    "    default_copies = 10\n",
    "\n",
    "    train_rows = []\n",
    "    validate_rows = []\n",
    "\n",
    "    with open('../input/commonlitreadabilityprize/train.csv', newline='', encoding='utf8') as trainfile:\n",
    "\n",
    "        reader = csv.reader(trainfile, delimiter=',', quotechar='\"')\n",
    "\n",
    "        for row_nr, row in enumerate(reader):\n",
    "\n",
    "            if row_nr == 0:\n",
    "                # first row holds no data\n",
    "                continue\n",
    "\n",
    "            text = row[3]\n",
    "            score = float(row[4])\n",
    "\n",
    "            if row_nr % 5 == 0:\n",
    "                validate_rows.append([text, score])\n",
    "                testData.append(text)\n",
    "                testScores.append(score)\n",
    "                continue\n",
    "\n",
    "            copies = next((count for bound, count in copies_below if score < bound), default_copies)\n",
    "            train_rows.extend([text, score] for _ in range(copies))\n",
    "\n",
    "    np.random.shuffle(train_rows)\n",
    "    train_frame = pd.DataFrame(train_rows, columns=['text', 'score'])\n",
    "\n",
    "    np.random.shuffle(validate_rows)\n",
    "    validate_frame = pd.DataFrame(validate_rows, columns=['text', 'score'])\n",
    "\n",
    "    return train_frame, validate_frame\n",
    "\n",
    "\n",
    "\n",
    "def load_train_test_imdb_data(data_dir):\n",
    "    \"\"\"Read a 1-in-20 sample of the IMDB reviews below a folder.\n",
    "\n",
    "    Input:\n",
    "    data_dir: path to the \"aclImdb\" folder; reviews are read from its\n",
    "              train/ and test/ subfolders, each with neg/ and pos/ inside.\n",
    "\n",
    "    Returns:\n",
    "    train/test datasets as pandas dataframes with columns 'text' and\n",
    "    'sentiment' (0 for neg, 1 for pos), each with its rows shuffled.\n",
    "    Both dataframes are also printed.\n",
    "    \"\"\"\n",
    "\n",
    "    label_of = {\"neg\": 0, \"pos\": 1}\n",
    "    rows = {\"train\": [], \"test\": []}\n",
    "\n",
    "    for split_name, split_rows in rows.items():\n",
    "        for sentiment, score in label_of.items():\n",
    "\n",
    "            folder = os.path.join(data_dir, split_name, sentiment)\n",
    "\n",
    "            # keep every 20th directory entry, starting with the first\n",
    "            for f_name in os.listdir(folder)[::20]:\n",
    "                with open(os.path.join(folder, f_name), \"r\", encoding=\"utf8\") as f:\n",
    "                    split_rows.append([f.read(), score])\n",
    "\n",
    "    np.random.shuffle(rows[\"train\"])\n",
    "    train_frame = pd.DataFrame(rows[\"train\"], columns=['text', 'sentiment'])\n",
    "    print(\"training data\")\n",
    "    print(train_frame)\n",
    "\n",
    "    np.random.shuffle(rows[\"test\"])\n",
    "    test_frame = pd.DataFrame(rows[\"test\"], columns=['text', 'sentiment'])\n",
    "    print(\"test data\")\n",
    "    print(test_frame)\n",
    "\n",
    "    return train_frame, test_frame\n",
    "\n",
    "\n",
    "\n",
    "def convert_example_to_feature(text):\n",
    "    \"\"\"Encode one text as fixed-length model inputs.\n",
    "\n",
    "    Uses the module-level tokenizer and max_length_text: the text is\n",
    "    tokenized with special tokens added, truncated to max_length_text\n",
    "    tokens if it is longer and padded up to max_length_text if shorter.\n",
    "\n",
    "    Input:\n",
    "    text: the text to encode, as a str.\n",
    "\n",
    "    Returns:\n",
    "    the tokenizer's encoding; callers read its 'input_ids',\n",
    "    'token_type_ids' and 'attention_mask' entries.\n",
    "    \"\"\"\n",
    "\n",
    "    features = tokenizer.encode_plus(\n",
    "                        text,\n",
    "                        add_special_tokens = True, # add the model's special tokens\n",
    "                        max_length = max_length_text, # max length of the text that can go to XLNet\n",
    "                        padding = 'max_length', # pad shorter texts up to max_length (replaces deprecated pad_to_max_length)\n",
    "                        truncation = True, # cut longer texts explicitly instead of relying on the warned-about default\n",
    "                        return_attention_mask = True, # add attention mask to not focus on pad tokens\n",
    "              )\n",
    "\n",
    "    return features\n",
    "\n",
    "# Load the XLNet tokenizer. A failure is reported and re-raised: everything\n",
    "# below needs `tokenizer`, so carrying on would only fail later with a\n",
    "# NameError that hides the real cause.\n",
    "# NOTE(review): do_lower_case=True is combined with a checkpoint directory\n",
    "# named \"cased\" - confirm this is intended.\n",
    "try:\n",
    "    tokenizer = XLNetTokenizer.from_pretrained('../input/xlnet01/hf_xlnet-base-cased', do_lower_case=True)\n",
    "except Exception as e:\n",
    "    print( \"Error: \" , e)\n",
    "    raise\n",
    "\n",
    "# Smoke-test the tokenizer on a short sentence.\n",
    "test_sentence = 'Test tokenization sentence. Followed by another sentence'\n",
    "\n",
    "transformer_input = convert_example_to_feature(test_sentence)\n",
    "\n",
    "train_data, validate_data = load_train_validate_lit_data(data_dir=\"data/CommonLit/\")\n",
    "\n",
    "# First seven validation rows, taken before 'score' is popped below.\n",
    "test_data = validate_data.head(7)\n",
    "\n",
    "# Separate the targets from the texts and wrap both splits as\n",
    "# (text row, score) datasets.\n",
    "target = train_data.pop('score')\n",
    "ds_train = tf.data.Dataset.from_tensor_slices((train_data.values, target.values))\n",
    "\n",
    "target = validate_data.pop('score')\n",
    "ds_validate = tf.data.Dataset.from_tensor_slices((validate_data.values, target.values))\n",
    "\n",
    "\n",
    "# for review, label in tfds.as_numpy(ds_train.take(5)):\n",
    "#     print('review', review[0:50], label)\n",
    "\n",
    "# (ds_train, ds_test), ds_info = tfds.load('imdb_reviews', \n",
    "#           split = (tfds.Split.TRAIN, tfds.Split.TEST),\n",
    "#           as_supervised=True,\n",
    "#           with_info=True)\n",
    "\n",
    "# print('info', ds_info)\n",
    "\n",
    "# Adapter for Dataset.map: regroups the four positional tensors built by\n",
    "# encode_examples into a (features dict, label) pair.\n",
    "def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):\n",
    "    \"\"\"Return ({input_ids, token_type_ids, attention_mask}, label).\"\"\"\n",
    "    features = dict(\n",
    "        input_ids=input_ids,\n",
    "        token_type_ids=token_type_ids,\n",
    "        attention_mask=attention_masks,\n",
    "    )\n",
    "    return features, label\n",
    "\n",
    "\n",
    "\n",
    "def encode_examples(ds, limit=-1):\n",
    "    \"\"\"Tokenize a dataset of (text, label) pairs for the model.\n",
    "\n",
    "    Input:\n",
    "    ds: tf.data.Dataset whose elements are (text, label), where text[0]\n",
    "        holds the text as UTF-8 bytes.\n",
    "    limit: if > 0, only the first `limit` elements are encoded.\n",
    "\n",
    "    Prints every 300th text with its token count, and the min/mean/max\n",
    "    token count over all texts. The counts exclude padding, because the\n",
    "    padded input_ids have the same length for every text.\n",
    "\n",
    "    Returns:\n",
    "    an unbatched tf.data.Dataset of\n",
    "    ({input_ids, token_type_ids, attention_mask}, label) elements.\n",
    "    \"\"\"\n",
    "    # prepare list, so that we can build up final TensorFlow dataset from slices.\n",
    "    input_ids_list = []\n",
    "    token_type_ids_list = []\n",
    "    attention_mask_list = []\n",
    "    label_list = []\n",
    "\n",
    "    if (limit > 0):\n",
    "        ds = ds.take(limit)\n",
    "\n",
    "    token_counts = []\n",
    "\n",
    "    for text, label in tfds.as_numpy(ds):\n",
    "\n",
    "        transformer_input = convert_example_to_feature(text[0].decode())\n",
    "\n",
    "        # the attention mask marks the non-padding positions, so its sum is\n",
    "        # the number of real tokens\n",
    "        nids = sum(transformer_input['attention_mask'])\n",
    "\n",
    "        if len(token_counts) % 300 == 0:\n",
    "\n",
    "            print()\n",
    "            print(\"******* text\",text[0])\n",
    "            print()\n",
    "            print()\n",
    "            print(\"nr input ids\", nids)\n",
    "            print()\n",
    "\n",
    "        token_counts.append(nids)\n",
    "\n",
    "        input_ids_list.append(transformer_input['input_ids'])\n",
    "        token_type_ids_list.append(transformer_input['token_type_ids'])\n",
    "        attention_mask_list.append(transformer_input['attention_mask'])\n",
    "        label_list.append([label])\n",
    "\n",
    "    if token_counts:\n",
    "        meanNids = sum(token_counts) / len(token_counts)\n",
    "        print(\"minNids\", min(token_counts), \"meanNids\", meanNids, \"maxNids\", max(token_counts))\n",
    "    else:\n",
    "        # nothing to average over; avoid dividing by zero\n",
    "        print(\"no examples to encode\")\n",
    "\n",
    "    return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)\n",
    "\n",
    "\n",
    "\n",
    "def predict_testdata():\n",
    "    \"\"\"Score the competition test excerpts and write them to xlnet.csv.\n",
    "\n",
    "    Reads test.csv from the Kaggle input folder, encodes every excerpt\n",
    "    with convert_example_to_feature and predicts it with the module-level\n",
    "    model, one excerpt per predict call. Every 10th score is printed.\n",
    "\n",
    "    Writes 'xlnet.csv' with the columns 'id' and 'target'. Returns nothing.\n",
    "    \"\"\"\n",
    "\n",
    "    xtest = pd.read_csv('../input/commonlitreadabilityprize/test.csv')\n",
    "\n",
    "    texts = xtest['excerpt']\n",
    "\n",
    "    print(\"test texts\", texts.head())\n",
    "\n",
    "    testResults = xtest[[\"id\"]].copy()\n",
    "\n",
    "    predictedScores = []\n",
    "\n",
    "    for n, text in enumerate(texts, start=1):\n",
    "\n",
    "        transformer_input = convert_example_to_feature(text)\n",
    "\n",
    "        # Pass the same three inputs that encode_examples feeds during\n",
    "        # training, each as a batch of one. With input_ids alone the model\n",
    "        # gets no attention mask for the padded positions.\n",
    "        model_inputs = {\n",
    "            \"input_ids\": np.array([transformer_input['input_ids']]),\n",
    "            \"token_type_ids\": np.array([transformer_input['token_type_ids']]),\n",
    "            \"attention_mask\": np.array([transformer_input['attention_mask']]),\n",
    "        }\n",
    "\n",
    "        preds = model.predict(model_inputs, batch_size=1, verbose=False)\n",
    "\n",
    "        # single text, single output value\n",
    "        predictedScore = preds.logits[0][0]\n",
    "\n",
    "        if n % 10 == 0:\n",
    "            print(\"n\", n, \"predictedScore\", predictedScore)\n",
    "\n",
    "        predictedScores.append(predictedScore)\n",
    "        gc.collect()\n",
    "\n",
    "    print(\"prediction done\")\n",
    "\n",
    "    testResults['target'] = predictedScores\n",
    "\n",
    "    testResults.to_csv('xlnet.csv', index = False)\n",
    "   \n",
    "\n",
    "#batch_size = 50\n",
    "\n",
    "\n",
    "# Either fine-tune the XLNet regressor in several short rounds, or load a\n",
    "# saved model and write the test predictions, depending on `training`.\n",
    "print()\n",
    "\n",
    "training = True\n",
    "\n",
    "if training:\n",
    "\n",
    "    print(\"training dataset\")\n",
    "    ds_train_encoded = encode_examples(ds_train)\n",
    "    # Every fit() call below reads the dataset from its beginning and stops\n",
    "    # after steps_per_epoch batches. Reshuffling on each pass gives every\n",
    "    # round different batches instead of the same leading ones.\n",
    "    ds_train_encoded_batched = ds_train_encoded.shuffle(10000).batch(batch_size)\n",
    "\n",
    "    print(\"validation dataset\")\n",
    "    ds_validate_encoded = encode_examples(ds_validate)\n",
    "    ds_validate_encoded_batched = ds_validate_encoded.batch(batch_size)\n",
    "\n",
    "    # recommended learning rate for Adam 5e-5, 3e-5, 2e-5\n",
    "    learning_rate = 2e-5\n",
    "\n",
    "    loss = tf.keras.losses.MeanSquaredError()\n",
    "\n",
    "    step = 0\n",
    "\n",
    "    while step <= 7:\n",
    "        # Place tensors on the CPU\n",
    "        with tf.device('/CPU:0'):\n",
    "\n",
    "            newDir = './savedmodel/' + str(step + 1)\n",
    "            os.makedirs(newDir, exist_ok=True)\n",
    "\n",
    "            # Round 0 starts from the pretrained checkpoint, later rounds\n",
    "            # from the model saved at the end of the previous round.\n",
    "            if step == 0:\n",
    "                model = TFXLNetForSequenceClassification.from_pretrained('../input/xlnet01/hf_xlnet-base-cased/', num_labels=1)\n",
    "            else:\n",
    "                model = TFXLNetForSequenceClassification.from_pretrained('./savedmodel/' + str(step) + '/', num_labels=1)\n",
    "\n",
    "            # A fresh optimizer for each reloaded model, so no optimizer\n",
    "            # state is carried over from a model object that was discarded.\n",
    "            optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=1e-08)\n",
    "            model.compile(optimizer=optimizer, loss=loss, metrics=[])\n",
    "\n",
    "            print(\"start training with max_length_text\", max_length_text, \"batch_size\", batch_size, \"step\", step)\n",
    "\n",
    "            history = model.fit(ds_train_encoded_batched, epochs=number_of_epochs, steps_per_epoch=10, validation_data=ds_validate_encoded_batched)\n",
    "\n",
    "            print(\"training done step \" + str(step))\n",
    "\n",
    "            step += 1\n",
    "\n",
    "            model.save_pretrained('./savedmodel/' + str(step) + '/', saved_model=False)\n",
    "\n",
    "            gc.collect()\n",
    "\n",
    "else:\n",
    "    # use saved model for prediction\n",
    "    with tf.device('/CPU:0'):\n",
    "\n",
    "        model = TFXLNetForSequenceClassification.from_pretrained('../input/xlnet01/savedmodel150_2best', num_labels=1)\n",
    "\n",
    "        print()\n",
    "        print(\"nr of testData\", len(testData))\n",
    "        print()\n",
    "        for testText in testData[:7]:\n",
    "            print(testText[:49])\n",
    "\n",
    "        predict_testdata()\n",
    "\n",
    "        print()\n",
    "\n",
    "# The cell ends here without sys.exit(): in a notebook that call raises\n",
    "# SystemExit, which can stop \"Run All\" before the following cells run.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<a href=\"./savedmodel/5/tf_model.h5\"> Download Model </a>\n",
    "<a href=\"./savedmodel/5/config.json\"> Download Config </a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ensemble"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2021-06-25T19:42:17.207547Z",
     "iopub.status.busy": "2021-06-25T19:42:17.207093Z",
     "iopub.status.idle": "2021-06-25T19:42:17.236517Z",
     "shell.execute_reply": "2021-06-25T19:42:17.235368Z",
     "shell.execute_reply.started": "2021-06-25T19:42:17.207503Z"
    },
    "trusted": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "predictions\n",
      "          id    target  predtrsf    avgred\n",
      "0  c0f722661 -0.786303 -0.877908 -0.832105\n",
      "1  f0953f0a5 -0.121382  0.276648  0.077633\n",
      "2  0df072751 -0.838939 -0.771869 -0.805404\n",
      "3  04caf4e0c -2.342430 -1.925204 -2.133817\n",
      "4  0e63f8bea -1.534724 -1.855052 -1.694888\n",
      "5  12537fe78 -0.725721 -0.330248 -0.527985\n",
      "6  965e592c0  0.418392  0.393256  0.405824\n"
     ]
    }
   ],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "\"\"\"\n",
    "Created on Thu Jun 17 14:37:27 2021\n",
    "\n",
    "@author: Jacob\n",
    "\"\"\"\n",
    "import pandas as pd \n",
    "from pandas import DataFrame\n",
    "\n",
    "from sklearn.preprocessing import PolynomialFeatures\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn import datasets\n",
    "from sklearn.metrics import mean_squared_error \n",
    "\n",
    "import numpy as np\n",
    "from scipy.optimize import curve_fit\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import math\n",
    "import sys\n",
    "\n",
    "\n",
    "\n",
    "def sigmoid(x, L, theta, gamma, b):\n",
    "    \"\"\"Evaluate a scaled and shifted logistic curve at ``x``.\n",
    "\n",
    "    Computes ``L / (1 + exp(-gamma * (x - theta))) + b``.\n",
    "\n",
    "    Args:\n",
    "        x: Point(s) at which the curve is evaluated.\n",
    "        L: Multiplier applied to the logistic term.\n",
    "        theta: Value subtracted from ``x`` before scaling.\n",
    "        gamma: Factor applied to ``x - theta`` inside the exponential.\n",
    "        b: Constant added to the scaled logistic term.\n",
    "\n",
    "    Returns:\n",
    "        The value of the expression above.\n",
    "    \"\"\"\n",
    "    exponent = -gamma * (x - theta)\n",
    "    denominator = 1 + np.exp(exponent)\n",
    "    return L / denominator + b\n",
    "\n",
    "\n",
    "# Blend the predictions stored in the two CSV files into one submission file.\n",
    "predictionData = pd.read_csv('regression.csv')\n",
    "resultsXLNet = pd.read_csv('xlnet.csv')\n",
    "\n",
    "if 'id' in resultsXLNet.columns:\n",
    "    # Pair rows by id rather than by position, so a different row order in\n",
    "    # the two files cannot average predictions that belong to different ids.\n",
    "    xlnetTargetById = resultsXLNet.set_index('id')['target']\n",
    "    predictionData['predtrsf'] = predictionData['id'].map(xlnetTargetById)\n",
    "else:\n",
    "    # No id column to join on: keep the positional pairing, but refuse files\n",
    "    # of different length instead of silently producing misaligned or NaN rows.\n",
    "    if len(resultsXLNet) != len(predictionData):\n",
    "        raise ValueError('regression.csv and xlnet.csv have different row counts')\n",
    "    predictionData['predtrsf'] = resultsXLNet['target']\n",
    "\n",
    "# A NaN here would propagate into the average and then into the submission.\n",
    "if predictionData['predtrsf'].isna().any():\n",
    "    raise ValueError('missing XLNet prediction for at least one row of regression.csv')\n",
    "\n",
    "# Ensemble value: unweighted mean of the two prediction columns.\n",
    "predictionData['avgred'] = (predictionData['target'] + predictionData['predtrsf']) / 2\n",
    "\n",
    "print(\"predictions\")\n",
    "print(predictionData)\n",
    "\n",
    "# Submission: take the ids from predictionData itself, so this cell does not\n",
    "# depend on an `xtest` frame left over from another cell and every id stays\n",
    "# on the same row as its own averaged prediction.\n",
    "xsub = predictionData[['id']].copy()\n",
    "xsub['target'] = predictionData['avgred']\n",
    "xsub.to_csv('submission.csv', index = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}