NLP model with transformers

Browse files

Files changed (5) hide show

Data_Scrapping.ipynb +226 -0
Meme_Transformer1.ipynb +0 -0
cleaning_and_conversion.ipynb +255 -0
nlp_model.pth +3 -0
read.txt +1 -0

Data_Scrapping.ipynb ADDED Viewed

	@@ -0,0 +1,226 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-03-09T19:40:31.635208Z",
+     "start_time": "2022-03-09T19:40:30.622979Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/imgflip_white_96.png\n",
+      "/imgflip-icon-transparent-192.svg\n",
+      "/s/meme/Monkey-Puppet.jpg\n",
+      "//i.imgflip.com/4zv2v9.jpg\n",
+      "//i.imgflip.com/67jy1i.jpg\n",
+      "//i.imgflip.com/67gu7u.jpg\n",
+      "//i.imgflip.com/67ezqb.jpg\n",
+      "//i.imgflip.com/63add8.jpg\n",
+      "//i.imgflip.com/67t59e.jpg\n",
+      "//i.imgflip.com/67n7t7.jpg\n",
+      "//i.imgflip.com/67t4xy.jpg\n",
+      "//i.imgflip.com/675rc7.jpg\n",
+      "//i.imgflip.com/674mrj.jpg\n",
+      "//i.imgflip.com/67rmxb.jpg\n",
+      "//i.imgflip.com/61m49m.jpg\n",
+      "//i.imgflip.com/66p3ul.jpg\n",
+      "//i.imgflip.com/64kb0c.jpg\n"
+     ]
+    }
+   ],
+   "source": [
+    "from urllib.request import urlopen\n",
+    "from bs4 import BeautifulSoup\n",
+    "import requests\n",
+    "# Fetch one listing page of the Monkey-Puppet template and print the src of every <img> on it.\n",
+    "htmldata = requests.get('https://imgflip.com/meme/Monkey-Puppet?page=2', proxies={'http':'50.207.31.221:80'})\n",
+    "soup = BeautifulSoup(htmldata.text, 'lxml')\n",
+    "images = soup.find_all('img')\n",
+    "\n",
+    "for item in images:\n",
+    "    # item['src'] raises KeyError for an <img> without a src attribute; skip those instead.\n",
+    "    src = item.get('src')\n",
+    "    if src is not None:\n",
+    "        print(src)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-02-25T12:24:56.533753Z",
+     "start_time": "2022-02-25T12:24:56.017684Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Download a single meme image and save it as image_name.jpg.\n",
+    "response = requests.get('http://i.imgflip.com/4zv2v9.jpg')\n",
+    "# Fail loudly on an HTTP error instead of saving the error page as a .jpg.\n",
+    "response.raise_for_status()\n",
+    "img_data = response.content\n",
+    "with open('image_name.jpg', 'wb') as handler:\n",
+    "    handler.write(img_data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-02-25T14:22:45.253019Z",
+     "start_time": "2022-02-25T14:22:44.273480Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Request a very large page number (104156000); the response is kept in htmldata and not inspected in this cell.\n",
+    "htmldata = requests.get('https://imgflip.com/meme/Monkey-Puppet?page=104156000', proxies={'http':'50.207.31.221:80'})\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-03-09T19:41:25.773009Z",
+     "start_time": "2022-03-09T19:41:25.766008Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2022-03-09T22:08:26.209Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "done printing image:10 in page 0\n",
+      "done printing image:10 in page 1\n",
+      "done printing image:10 in page 2\n",
+      "done printing image:10 in page 3\n",
+      "done printing image:10 in page 4\n",
+      "done printing image:10 in page 5\n",
+      "done printing image:10 in page 6\n",
+      "done printing image:10 in page 7\n",
+      "done printing image:10 in page 8\n",
+      "done printing image:10 in page 9\n",
+      "done printing image:10 in page 10\n",
+      "done printing image:10 in page 11\n",
+      "done printing image:10 in page 12\n",
+      "done printing image:10 in page 13\n",
+      "done printing image:10 in page 14\n",
+      "done printing image:10 in page 15\n",
+      "done printing image:10 in page 16\n",
+      "done printing image:10 in page 17\n",
+      "done printing image:10 in page 18\n",
+      "done printing image:10 in page 19\n",
+      "done printing image:10 in page 20\n",
+      "done printing image:10 in page 21\n",
+      "done printing image:10 in page 22\n",
+      "done printing image:10 in page 23\n",
+      "done printing image:10 in page 24\n",
+      "done printing image:10 in page 25\n",
+      "done printing image:10 in page 26\n",
+      "done printing image:10 in page 27\n",
+      "done printing image:10 in page 28\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Template slugs as they appear in the imgflip URL; one output folder is created per slug.\n",
+    "meme_templetes =  [\"Monkey-Puppet\",\"Surprised-Pikachu\",\"Well-Yes-But-Actually-No\",\"10-Guy\",\"Spiderman-Computer-Desk\", \"Kevin_Hart\", \"laughing-leo\", \"Lisa\", \"Roll-Safe-Think-About-It\",\"Batman-Slapping-Robin\", \"Change-My-Mind\", \"Futurama-Fry\", \"First-World-Problems\"]\n",
+    "for temp in meme_templetes:\n",
+    "    # Folder and file names use underscores instead of the dashes in the URL slug.\n",
+    "    st = temp.replace(\"-\", \"_\")\n",
+    "    # exist_ok lets the cell be re-run; os.mkdir raised FileExistsError on the second run.\n",
+    "    os.makedirs(\"D:/{}\".format(st), exist_ok=True)\n",
+    "    for i in range(100):\n",
+    "        # The URL requests page i+1, while file names and the progress message use i.\n",
+    "        htmldata = requests.get('https://imgflip.com/meme/{}?page={}'.format(temp,i+1), proxies={'http':'10.10.1.10:3128'})\n",
+    "        soup = BeautifulSoup(htmldata.text, 'lxml')\n",
+    "        images = soup.find_all('img')\n",
+    "        # enumerate gives every tag its own position. images.index(item) returned the first *equal* tag,\n",
+    "        # so identical <img> tags shared one index and overwrote each other's file.\n",
+    "        for idx, item in enumerate(images):\n",
+    "            # .get avoids a KeyError on <img> tags that have no src attribute.\n",
+    "            src = item.get('src', '')\n",
+    "            if not src.startswith('//i.imgflip.com/'):\n",
+    "                continue\n",
+    "            img_data = requests.get('http:{}'.format(src)).content\n",
+    "            with open('D:/{}/{}{}page{}.jpg'.format(st,st,idx,i), 'wb') as handler:\n",
+    "                handler.write(img_data)\n",
+    "            if idx%10 ==0:\n",
+    "                print(\"done printing image:{}\".format(idx), \"in page {}\".format(i))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-03-09T23:46:49.531228Z",
+     "start_time": "2022-03-09T23:46:49.510689Z"
+    }
+   },
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

Meme_Transformer1.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

cleaning_and_conversion.ipynb ADDED Viewed

	@@ -0,0 +1,255 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-03-10T00:01:02.682985Z",
+     "start_time": "2022-03-10T00:00:58.195800Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import easyocr\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from os import listdir\n",
+    "from difflib import SequenceMatcher\n",
+    "from autocorrect import Speller"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-03-10T00:01:10.249133Z",
+     "start_time": "2022-03-10T00:01:02.715992Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Image folder names; a folder's position in this list is used as its class label further down,\n",
+    "# so the order must not change.\n",
+    "all_files = [\n",
+    "    \"monkey_puppet\",\n",
+    "    \"surprised_pikachu\",\n",
+    "    \"well_yes_but_actually_no\",\n",
+    "    \"10_Guy\",\n",
+    "    \"Spiderman_Computer_Desk\",\n",
+    "    \"Kevin_Hart\",\n",
+    "    \"laughing_leo\",\n",
+    "    \"Lisa\",\n",
+    "    \"Roll_Safe_Think_About_It\",\n",
+    "    \"Change_My_Mind\",\n",
+    "    \"Futurama_Fry\",\n",
+    "    \"First_World_Problems\",\n",
+    "]\n",
+    "# OCR reader and spell checker, both configured for English.\n",
+    "reader = easyocr.Reader(['en'])\n",
+    "spell = Speller(lang='en')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-03-10T00:01:10.297143Z",
+     "start_time": "2022-03-10T00:01:10.282141Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def only_text(lis):\n",
+    "    \"\"\"Return element 1 (the text) of every entry in lis, in order.\n",
+    "\n",
+    "    lis is a sequence of indexable entries; the caller passes the output of reader.readtext.\n",
+    "    The argument is now actually used: the previous version looped over the notebook-global\n",
+    "    result and ignored lis.\n",
+    "    \"\"\"\n",
+    "    return [entry[1] for entry in lis]\n",
+    "def only_eval(lis):\n",
+    "    \"\"\"Return element 2 (the score) of every entry in lis as a float, in order.\n",
+    "\n",
+    "    lis is a sequence of indexable entries; the caller passes the output of reader.readtext.\n",
+    "    The argument is now actually used: the previous version looped over the notebook-global\n",
+    "    result and ignored lis.\n",
+    "    \"\"\"\n",
+    "    return [float(entry[2]) for entry in lis]\n",
+    "def purify(text):\n",
+    "    \"\"\"Clean one OCR caption and return it as a single space-joined string.\n",
+    "\n",
+    "    Steps: lower-case and spell-correct with the notebook-level spell, blank out the fixed\n",
+    "    phrases in waste, drop tokens that look like the imgflip watermark, and put a newline\n",
+    "    token in front of every token ending in a colon (or in front of the my/her/his/him\n",
+    "    that directly precedes it).\n",
+    "\n",
+    "    Returns \"no text\" when text is empty.\n",
+    "    \"\"\"\n",
+    "    if not text:\n",
+    "        return \"no text\"\n",
+    "    pu_text = spell((text.lower()))\n",
+    "    waste = [\"well yes but actually no\", \"change my mind\"]\n",
+    "    for i in waste:\n",
+    "        # str.replace returns a new string; the result used to be discarded, so nothing was removed.\n",
+    "        pu_text = pu_text.replace(i, \" \")\n",
+    "\n",
+    "    def is_watermark(token):\n",
+    "        # Fuzzy comparison, presumably because OCR and spell-correction can distort the watermark.\n",
+    "        return (\n",
+    "            SequenceMatcher(a=token, b=\"imgflib\").ratio() > .8\n",
+    "            or SequenceMatcher(a=token, b=\"imgflib.com\").ratio() > .8\n",
+    "            or token == \"com\"\n",
+    "        )\n",
+    "\n",
+    "    # Drop every watermark token; previously only the first match was deleted before the loop stopped.\n",
+    "    words = [w for w in pu_text.split(\" \") if not is_watermark(w)]\n",
+    "\n",
+    "    # Build a new list instead of inserting into the list being iterated: the in-place insert\n",
+    "    # pushed the same colon token one slot ahead each time, so the loop never finished.\n",
+    "    possessives = (\"my\", \"her\", \"his\", \"him\")\n",
+    "    sp_text = []\n",
+    "    for word in words:\n",
+    "        if word.endswith(\":\"):\n",
+    "            # Whole-word test; the old substring test against one string also matched fragments\n",
+    "            # such as \"hi\" or \"\", and at position 0 it looked at the last token.\n",
+    "            if sp_text and sp_text[-1].lower() in possessives:\n",
+    "                sp_text.insert(len(sp_text) - 1, \"\\n\")\n",
+    "            else:\n",
+    "                sp_text.append(\"\\n\")\n",
+    "        sp_text.append(word)\n",
+    "    return \" \".join(sp_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-03-10T00:43:29.097665Z",
+     "start_time": "2022-03-10T00:01:10.330151Z"
+    },
+    "code_folding": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\amrsh\\anaconda3\\lib\\site-packages\\numpy\\core\\fromnumeric.py:3372: RuntimeWarning: Mean of empty slice.\n",
+      "  return _methods._mean(a, axis=axis, dtype=dtype,\n",
+      "C:\\Users\\amrsh\\anaconda3\\lib\\site-packages\\numpy\\core\\_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars\n",
+      "  ret = ret.dtype.type(ret / rcount)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# OCR every image in every template folder and collect caption text with its class label.\n",
+    "text = []\n",
+    "all_class = []\n",
+    "for class_id, file in enumerate(all_files):\n",
+    "    joined_text = []\n",
+    "    for img_name in listdir(r\"D:\\{}\".format(file)):\n",
+    "        try:\n",
+    "            result = reader.readtext(r\"D:\\{}\\{}\".format(file, img_name), paragraph=False)\n",
+    "            x = only_text(result)\n",
+    "            y = only_eval(result)\n",
+    "        except Exception as err:\n",
+    "            # Still best-effort, but report the skipped file instead of hiding it behind a bare except.\n",
+    "            print(\"skipping {}/{}: {}\".format(file, img_name, err))\n",
+    "            continue\n",
+    "        # Keep fragments scoring at least .5. Filtering by position replaces list.remove(value),\n",
+    "        # which deleted the first equal value and could drop the wrong duplicate.\n",
+    "        kept = [(t, e) for t, e in zip(x, y) if e >= .5]\n",
+    "        # Drop the image when the remaining fragments average below .7. An image with no fragments\n",
+    "        # is kept, as before (its NaN mean never compared below .7), but without triggering the\n",
+    "        # 'Mean of empty slice' RuntimeWarning.\n",
+    "        if kept and np.mean([e for _, e in kept]) < .7:\n",
+    "            continue\n",
+    "        joined_text.append(\" \".join(t for t, _ in kept))\n",
+    "    # Labels stay float64 values equal to the folder's position in all_files.\n",
+    "    label = np.full(len(joined_text), float(class_id))\n",
+    "    text.extend(joined_text)\n",
+    "    all_class.extend(label)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2022-03-10T00:01:04.412Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Run purify over the str() of every entry in text, keeping the order.\n",
+    "purified_text = [purify(str(caption)) for caption in text]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2022-03-10T00:01:16.513Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Pair each cleaned caption with its class label.\n",
+    "data = {\"Text\": purified_text, \"Class\": all_class}\n",
+    "df = pd.DataFrame(data)\n",
+    "# reset_index returns a new frame; the result used to be discarded, so the index kept its gaps\n",
+    "# after drop_duplicates. drop=True avoids adding the old index as a column.\n",
+    "df = df.drop_duplicates().reset_index(drop=True)\n",
+    "df.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "start_time": "2022-03-10T00:02:25.881Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Write the frame to NLP_classes.xlsx, relative to the working directory; the index is written too (pandas default).\n",
+    "df.to_excel(\"NLP_classes.xlsx\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2022-03-09T23:19:05.044525Z",
+     "start_time": "2022-03-09T22:44:01.280Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Count how many rows each Class value has.\n",
+    "df.value_counts([\"Class\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.5"
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

nlp_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c1d62cf5c0b38322510143cbc6abb3d2929f697d91e4ec5f3bac859a83492a03
+size 433476137

read.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ This model is for personal use only, so there is a lack of comments and explanation. Sorry if it hurts your eyes :)