AmrSheta commited on
Commit
ba547ff
·
1 Parent(s): 8196bcb

nlp model with transformers

Browse files
Data_Scrapping.ipynb ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "ExecuteTime": {
8
+ "end_time": "2022-03-09T19:40:31.635208Z",
9
+ "start_time": "2022-03-09T19:40:30.622979Z"
10
+ }
11
+ },
12
+ "outputs": [
13
+ {
14
+ "name": "stdout",
15
+ "output_type": "stream",
16
+ "text": [
17
+ "/imgflip_white_96.png\n",
18
+ "/imgflip-icon-transparent-192.svg\n",
19
+ "/s/meme/Monkey-Puppet.jpg\n",
20
+ "//i.imgflip.com/4zv2v9.jpg\n",
21
+ "//i.imgflip.com/67jy1i.jpg\n",
22
+ "//i.imgflip.com/67gu7u.jpg\n",
23
+ "//i.imgflip.com/67ezqb.jpg\n",
24
+ "//i.imgflip.com/63add8.jpg\n",
25
+ "//i.imgflip.com/67t59e.jpg\n",
26
+ "//i.imgflip.com/67n7t7.jpg\n",
27
+ "//i.imgflip.com/67t4xy.jpg\n",
28
+ "//i.imgflip.com/675rc7.jpg\n",
29
+ "//i.imgflip.com/674mrj.jpg\n",
30
+ "//i.imgflip.com/67rmxb.jpg\n",
31
+ "//i.imgflip.com/61m49m.jpg\n",
32
+ "//i.imgflip.com/66p3ul.jpg\n",
33
+ "//i.imgflip.com/64kb0c.jpg\n"
34
+ ]
35
+ }
36
+ ],
37
+ "source": [
38
+ "from urllib.request import urlopen\n",
39
+ "from bs4 import BeautifulSoup\n",
40
+ "import requests\n",
41
+ "htmldata = requests.get('https://imgflip.com/meme/Monkey-Puppet?page=2', proxies={'http':'50.207.31.221:80'})\n",
42
+ "soup = BeautifulSoup(htmldata.text, 'lxml')\n",
43
+ "images = soup.find_all('img')\n",
44
+ " \n",
45
+ "for item in images:\n",
46
+ " print(item['src'])"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 12,
52
+ "metadata": {
53
+ "ExecuteTime": {
54
+ "end_time": "2022-02-25T12:24:56.533753Z",
55
+ "start_time": "2022-02-25T12:24:56.017684Z"
56
+ }
57
+ },
58
+ "outputs": [],
59
+ "source": [
60
+ "img_data = requests.get('http://i.imgflip.com/4zv2v9.jpg').content\n",
61
+ "with open('image_name.jpg', 'wb') as handler:\n",
62
+ " handler.write(img_data)"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": 20,
68
+ "metadata": {
69
+ "ExecuteTime": {
70
+ "end_time": "2022-02-25T14:22:45.253019Z",
71
+ "start_time": "2022-02-25T14:22:44.273480Z"
72
+ }
73
+ },
74
+ "outputs": [],
75
+ "source": [
76
+ "htmldata = requests.get('https://imgflip.com/meme/Monkey-Puppet?page=104156000', proxies={'http':'50.207.31.221:80'})\n"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": 5,
82
+ "metadata": {
83
+ "ExecuteTime": {
84
+ "end_time": "2022-03-09T19:41:25.773009Z",
85
+ "start_time": "2022-03-09T19:41:25.766008Z"
86
+ }
87
+ },
88
+ "outputs": [],
89
+ "source": [
90
+ "import os"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": null,
96
+ "metadata": {
97
+ "ExecuteTime": {
98
+ "start_time": "2022-03-09T22:08:26.209Z"
99
+ }
100
+ },
101
+ "outputs": [
102
+ {
103
+ "name": "stdout",
104
+ "output_type": "stream",
105
+ "text": [
106
+ "done printing image:10 in page 0\n",
107
+ "done printing image:10 in page 1\n",
108
+ "done printing image:10 in page 2\n",
109
+ "done printing image:10 in page 3\n",
110
+ "done printing image:10 in page 4\n",
111
+ "done printing image:10 in page 5\n",
112
+ "done printing image:10 in page 6\n",
113
+ "done printing image:10 in page 7\n",
114
+ "done printing image:10 in page 8\n",
115
+ "done printing image:10 in page 9\n",
116
+ "done printing image:10 in page 10\n",
117
+ "done printing image:10 in page 11\n",
118
+ "done printing image:10 in page 12\n",
119
+ "done printing image:10 in page 13\n",
120
+ "done printing image:10 in page 14\n",
121
+ "done printing image:10 in page 15\n",
122
+ "done printing image:10 in page 16\n",
123
+ "done printing image:10 in page 17\n",
124
+ "done printing image:10 in page 18\n",
125
+ "done printing image:10 in page 19\n",
126
+ "done printing image:10 in page 20\n",
127
+ "done printing image:10 in page 21\n",
128
+ "done printing image:10 in page 22\n",
129
+ "done printing image:10 in page 23\n",
130
+ "done printing image:10 in page 24\n",
131
+ "done printing image:10 in page 25\n",
132
+ "done printing image:10 in page 26\n",
133
+ "done printing image:10 in page 27\n",
134
+ "done printing image:10 in page 28\n"
135
+ ]
136
+ }
137
+ ],
138
+ "source": [
139
+ "meme_templetes = [\"Monkey-Puppet\",\"Surprised-Pikachu\",\"Well-Yes-But-Actually-No\",\"10-Guy\",\"Spiderman-Computer-Desk\", \"Kevin_Hart\", \"laughing-leo\", \"Lisa\", \"Roll-Safe-Think-About-It\",\"Batman-Slapping-Robin\", \"Change-My-Mind\", \"Futurama-Fry\", \"First-World-Problems\"]\n",
140
+ "for temp in meme_templetes:\n",
141
+ " st = temp.replace(\"-\", \"_\")\n",
142
+ " os.mkdir(\"D:/{}\".format(st))\n",
143
+ " for i in range(100):\n",
144
+ " htmldata = requests.get('https://imgflip.com/meme/{}?page={}'.format(temp,i+1), proxies={'http':'10.10.1.10:3128'})\n",
145
+ " soup = BeautifulSoup(htmldata.text, 'lxml')\n",
146
+ " images = soup.find_all('img')\n",
147
+ " for item in images:\n",
148
+ " if item['src'].startswith('//i.imgflip.com/'):\n",
149
+ " img_data = requests.get('http:{}'.format(item['src'])).content\n",
150
+ " with open('D:/{}/{}{}page{}.jpg'.format(st,st,images.index(item),i), 'wb') as handler:\n",
151
+ " handler.write(img_data)\n",
152
+ " if images.index(item)%10 ==0:\n",
153
+ " print(\"done printing image:{}\".format(images.index(item)), \"in page {}\".format(i))"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": null,
159
+ "metadata": {
160
+ "ExecuteTime": {
161
+ "end_time": "2022-03-09T23:46:49.531228Z",
162
+ "start_time": "2022-03-09T23:46:49.510689Z"
163
+ }
164
+ },
165
+ "outputs": [],
166
+ "source": []
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "metadata": {},
172
+ "outputs": [],
173
+ "source": []
174
+ }
175
+ ],
176
+ "metadata": {
177
+ "kernelspec": {
178
+ "display_name": "Python 3",
179
+ "language": "python",
180
+ "name": "python3"
181
+ },
182
+ "language_info": {
183
+ "codemirror_mode": {
184
+ "name": "ipython",
185
+ "version": 3
186
+ },
187
+ "file_extension": ".py",
188
+ "mimetype": "text/x-python",
189
+ "name": "python",
190
+ "nbconvert_exporter": "python",
191
+ "pygments_lexer": "ipython3",
192
+ "version": "3.8.5"
193
+ },
194
+ "varInspector": {
195
+ "cols": {
196
+ "lenName": 16,
197
+ "lenType": 16,
198
+ "lenVar": 40
199
+ },
200
+ "kernels_config": {
201
+ "python": {
202
+ "delete_cmd_postfix": "",
203
+ "delete_cmd_prefix": "del ",
204
+ "library": "var_list.py",
205
+ "varRefreshCmd": "print(var_dic_list())"
206
+ },
207
+ "r": {
208
+ "delete_cmd_postfix": ") ",
209
+ "delete_cmd_prefix": "rm(",
210
+ "library": "var_list.r",
211
+ "varRefreshCmd": "cat(var_dic_list()) "
212
+ }
213
+ },
214
+ "types_to_exclude": [
215
+ "module",
216
+ "function",
217
+ "builtin_function_or_method",
218
+ "instance",
219
+ "_Feature"
220
+ ],
221
+ "window_display": false
222
+ }
223
+ },
224
+ "nbformat": 4,
225
+ "nbformat_minor": 4
226
+ }
Meme_Transformer1.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
cleaning_and_conversion.ipynb ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {
7
+ "ExecuteTime": {
8
+ "end_time": "2022-03-10T00:01:02.682985Z",
9
+ "start_time": "2022-03-10T00:00:58.195800Z"
10
+ }
11
+ },
12
+ "outputs": [],
13
+ "source": [
14
+ "import easyocr\n",
15
+ "import pandas as pd\n",
16
+ "import numpy as np\n",
17
+ "from os import listdir\n",
18
+ "from difflib import SequenceMatcher\n",
19
+ "from autocorrect import Speller"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 2,
25
+ "metadata": {
26
+ "ExecuteTime": {
27
+ "end_time": "2022-03-10T00:01:10.249133Z",
28
+ "start_time": "2022-03-10T00:01:02.715992Z"
29
+ }
30
+ },
31
+ "outputs": [],
32
+ "source": [
33
+ "all_files = [\"monkey_puppet\",\"surprised_pikachu\",\"well_yes_but_actually_no\",\"10_Guy\",\"Spiderman_Computer_Desk\", \"Kevin_Hart\", \"laughing_leo\", \"Lisa\", \"Roll_Safe_Think_About_It\", \"Change_My_Mind\", \"Futurama_Fry\", \"First_World_Problems\"]\n",
34
+ "reader = easyocr.Reader(['en']) \n",
35
+ "spell = Speller(lang='en')"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 3,
41
+ "metadata": {
42
+ "ExecuteTime": {
43
+ "end_time": "2022-03-10T00:01:10.297143Z",
44
+ "start_time": "2022-03-10T00:01:10.282141Z"
45
+ }
46
+ },
47
+ "outputs": [],
48
+ "source": [
49
+ "def only_text(lis):\n",
50
+ " text = []\n",
51
+ " for i in result:\n",
52
+ " x = i[1]\n",
53
+ " text.append(x)\n",
54
+ " return text\n",
55
+ "def only_eval(lis):\n",
56
+ " evl = []\n",
57
+ " for i in result:\n",
58
+ " x = float(i[2])\n",
59
+ " evl.append(x)\n",
60
+ " return evl\n",
61
+ "def purify(text):\n",
62
+ " if not text:\n",
63
+ " return \"no text\"\n",
64
+ " pu_text = spell((text.lower()))\n",
65
+ " waste = [\"well yes but actually no\", \"change my mind\"]\n",
66
+ " for i in waste:\n",
67
+ " if i in pu_text:\n",
68
+ " pu_text.replace(i,\" \")\n",
69
+ " sp_text = pu_text.split(\" \")\n",
70
+ " for i in range(0, len(sp_text)):\n",
71
+ " if (SequenceMatcher(a=sp_text[i], b=\"imgflib\").ratio() > .8) or (SequenceMatcher(a=sp_text[i], b=\"imgflib.com\").ratio() > .8) or (sp_text[i] == \"com\"):\n",
72
+ " del sp_text[i]\n",
73
+ " break\n",
74
+ " for i, t in enumerate(sp_text):\n",
75
+ " if t.endswith(\":\") and ((sp_text[i-1]).lower() in \"my her his him\"):\n",
76
+ " sp_text.insert(i-1,\"\\n\")\n",
77
+ " elif t.endswith(\":\"):\n",
78
+ " sp_text.insert(i,\"\\n\")\n",
79
+ " return \" \".join(sp_text)"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": 4,
85
+ "metadata": {
86
+ "ExecuteTime": {
87
+ "end_time": "2022-03-10T00:43:29.097665Z",
88
+ "start_time": "2022-03-10T00:01:10.330151Z"
89
+ },
90
+ "code_folding": []
91
+ },
92
+ "outputs": [
93
+ {
94
+ "name": "stderr",
95
+ "output_type": "stream",
96
+ "text": [
97
+ "C:\\Users\\amrsh\\anaconda3\\lib\\site-packages\\numpy\\core\\fromnumeric.py:3372: RuntimeWarning: Mean of empty slice.\n",
98
+ " return _methods._mean(a, axis=axis, dtype=dtype,\n",
99
+ "C:\\Users\\amrsh\\anaconda3\\lib\\site-packages\\numpy\\core\\_methods.py:170: RuntimeWarning: invalid value encountered in double_scalars\n",
100
+ " ret = ret.dtype.type(ret / rcount)\n"
101
+ ]
102
+ }
103
+ ],
104
+ "source": [
105
+ "text = []\n",
106
+ "all_class = []\n",
107
+ "for file in all_files:\n",
108
+ " all_text = []\n",
109
+ " cors_eval = []\n",
110
+ " img = listdir(r\"D:\\{}\".format(file))\n",
111
+ " for i in range(len(img)):\n",
112
+ " try:\n",
113
+ " result = reader.readtext(r\"D:\\{}\\{}\".format(file, img[i]), paragraph=False)\n",
114
+ " x = only_text(result)\n",
115
+ " y = only_eval(result)\n",
116
+ " all_text.append(x)\n",
117
+ " cors_eval.append(y)\n",
118
+ " except:\n",
119
+ " pass\n",
120
+ " for i, j in zip(all_text[:], cors_eval[:]):\n",
121
+ " for t, e in zip(i[:], j[:]):\n",
122
+ " if e <.5:\n",
123
+ " i.remove(t)\n",
124
+ " j.remove(e)\n",
125
+ " if np.mean(j) < .7:\n",
126
+ " all_text.remove(i)\n",
127
+ " cors_eval.remove(j)\n",
128
+ " joined_text = []\n",
129
+ " for i in all_text:\n",
130
+ " joined_text.append(\" \".join(i))\n",
131
+ " label = np.ones(len(all_text))*(all_files.index(file)+1)\n",
132
+ " label-=1\n",
133
+ " text.extend(joined_text)\n",
134
+ " all_class.extend(label)"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": null,
140
+ "metadata": {
141
+ "ExecuteTime": {
142
+ "start_time": "2022-03-10T00:01:04.412Z"
143
+ }
144
+ },
145
+ "outputs": [],
146
+ "source": [
147
+ "purified_text = []\n",
148
+ "for i in text:\n",
149
+ " pu = purify(str(i))\n",
150
+ " purified_text.append(pu)"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": null,
156
+ "metadata": {
157
+ "ExecuteTime": {
158
+ "start_time": "2022-03-10T00:01:16.513Z"
159
+ }
160
+ },
161
+ "outputs": [],
162
+ "source": [
163
+ "data = {\"Text\": purified_text, \"Class\": all_class}\n",
164
+ "df = pd.DataFrame(data)\n",
165
+ "df = df.drop_duplicates()\n",
166
+ "df.reset_index()\n",
167
+ "df.head(5)"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "metadata": {
174
+ "ExecuteTime": {
175
+ "start_time": "2022-03-10T00:02:25.881Z"
176
+ }
177
+ },
178
+ "outputs": [],
179
+ "source": [
180
+ "df.to_excel(\"NLP_classes.xlsx\")"
181
+ ]
182
+ },
183
+ {
184
+ "cell_type": "code",
185
+ "execution_count": null,
186
+ "metadata": {
187
+ "ExecuteTime": {
188
+ "end_time": "2022-03-09T23:19:05.044525Z",
189
+ "start_time": "2022-03-09T22:44:01.280Z"
190
+ }
191
+ },
192
+ "outputs": [],
193
+ "source": [
194
+ "df.value_counts([\"Class\"])"
195
+ ]
196
+ },
197
+ {
198
+ "cell_type": "code",
199
+ "execution_count": null,
200
+ "metadata": {},
201
+ "outputs": [],
202
+ "source": []
203
+ }
204
+ ],
205
+ "metadata": {
206
+ "kernelspec": {
207
+ "display_name": "Python 3",
208
+ "language": "python",
209
+ "name": "python3"
210
+ },
211
+ "language_info": {
212
+ "codemirror_mode": {
213
+ "name": "ipython",
214
+ "version": 3
215
+ },
216
+ "file_extension": ".py",
217
+ "mimetype": "text/x-python",
218
+ "name": "python",
219
+ "nbconvert_exporter": "python",
220
+ "pygments_lexer": "ipython3",
221
+ "version": "3.8.5"
222
+ },
223
+ "varInspector": {
224
+ "cols": {
225
+ "lenName": 16,
226
+ "lenType": 16,
227
+ "lenVar": 40
228
+ },
229
+ "kernels_config": {
230
+ "python": {
231
+ "delete_cmd_postfix": "",
232
+ "delete_cmd_prefix": "del ",
233
+ "library": "var_list.py",
234
+ "varRefreshCmd": "print(var_dic_list())"
235
+ },
236
+ "r": {
237
+ "delete_cmd_postfix": ") ",
238
+ "delete_cmd_prefix": "rm(",
239
+ "library": "var_list.r",
240
+ "varRefreshCmd": "cat(var_dic_list()) "
241
+ }
242
+ },
243
+ "types_to_exclude": [
244
+ "module",
245
+ "function",
246
+ "builtin_function_or_method",
247
+ "instance",
248
+ "_Feature"
249
+ ],
250
+ "window_display": false
251
+ }
252
+ },
253
+ "nbformat": 4,
254
+ "nbformat_minor": 4
255
+ }
nlp_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1d62cf5c0b38322510143cbc6abb3d2929f697d91e4ec5f3bac859a83492a03
3
+ size 433476137
read.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ This model is for personal use only so there is a lack of comments and explanation. Sorry if it hurts your eyes :)