Yash008 committed on
Commit
199e999
·
verified ·
1 Parent(s): 60fed37

Update mode.py

Browse files
Files changed (1) hide show
  1. mode.py +279 -276
mode.py CHANGED
@@ -1,276 +1,279 @@
1
- import re
2
- from bs4 import BeautifulSoup
3
- import pickle
4
- from nltk.corpus import stopwords
5
- from fuzzywuzzy import fuzz
6
- import numpy as np
7
-
8
-
9
- with open('cv.pkl', 'rb') as file:
10
- cv = pickle.load(file)
11
-
12
-
13
- def common_words(q1, q2):
14
- w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
15
- w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
16
- return len(w1 & w2)
17
-
18
-
19
- def total_words(q1, q2):
20
- w1 = set(map(lambda word: word.lower().strip(), q1.split(" ")))
21
- w2 = set(map(lambda word: word.lower().strip(), q2.split(" ")))
22
- return len(w1) + len(w2)
23
-
24
-
25
- # features based on tokens
26
- def token_features(q1, q2):
27
-
28
- safe_div = 0.0001
29
-
30
- token_features = [0.0]*8
31
-
32
- q1_tokens = q1.split()
33
- q2_tokens = q2.split()
34
-
35
- if len(q1_tokens) == 0 or len(q2_tokens) == 0:
36
- return token_features
37
-
38
- stopword = stopwords.words('english')
39
-
40
- q1_non_stopwords = set([word for word in q1_tokens if word not in stopword])
41
- q2_non_stopwords = set([word for word in q2_tokens if word not in stopword])
42
-
43
- q1_stop_words = set([word for word in q1_tokens if word in stopword])
44
- q2_stop_words = set([word for word in q2_tokens if word in stopword])
45
-
46
- common_word_count = len(q1_non_stopwords.intersection(q2_non_stopwords))
47
- common_stop_word_count = len(q1_stop_words.intersection(q2_stop_words))
48
- common_token_count = len(set(q1_tokens).intersection(set(q2_tokens)))
49
-
50
- token_features[0] = common_word_count/(min(len(q1_non_stopwords), len(q2_non_stopwords)) + safe_div)
51
- token_features[1] = common_word_count/(max(len(q1_non_stopwords), len(q2_non_stopwords)) + safe_div)
52
- token_features[2] = common_stop_word_count/(min(len(q1_stop_words), len(q2_stop_words)) + safe_div)
53
- token_features[3] = common_stop_word_count/(max(len(q1_stop_words), len(q2_stop_words)) + safe_div)
54
- token_features[4] = common_token_count/(min(len(q1_tokens), len(q2_tokens)) + safe_div)
55
- token_features[5] = common_token_count/(max(len(q1_tokens), len(q2_tokens)) + safe_div)
56
- token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])
57
- token_features[7] = int(q1_tokens[0] == q2_tokens[0])
58
-
59
- return token_features
60
-
61
-
62
- # Fuzzy Features
63
- def fuzzy_features(q1, q2):
64
-
65
- fuzzy_features = [0.0]*4
66
-
67
- # fuzz_ratio
68
- fuzzy_features[0] = fuzz.QRatio(q1, q2)
69
-
70
- # fuzz_partial_ratio
71
- fuzzy_features[1] = fuzz.partial_ratio(q1, q2)
72
-
73
- # token_sort_ratio
74
- fuzzy_features[2] = fuzz.token_sort_ratio(q1, q2)
75
-
76
- # token_set_ratio
77
- fuzzy_features[3] = fuzz.token_set_ratio(q1, q2)
78
-
79
- return fuzzy_features
80
-
81
-
82
- # data preprocessing
83
- def preprocess(q):
84
-
85
- q = str(q).lower().strip()
86
-
87
- # Replace certain special characters with their string equivalents
88
- q = q.replace('%', ' percent')
89
- q = q.replace('$', ' dollar ')
90
- q = q.replace('₹', ' rupee ')
91
- q = q.replace('', ' euro ')
92
- q = q.replace('@', ' at ')
93
-
94
- # The pattern '[math]' appears around 900 times in the whole dataset.
95
- q = q.replace('[math]', '')
96
-
97
- # Replacing some numbers with string equivalents (not perfect, can be done better to account for more cases)
98
- q = q.replace(',000,000,000 ', 'b ')
99
- q = q.replace(',000,000 ', 'm ')
100
- q = q.replace(',000 ', 'k ')
101
- q = re.sub(r'([0-9]+)000000000', r'\1b', q)
102
- q = re.sub(r'([0-9]+)000000', r'\1m', q)
103
- q = re.sub(r'([0-9]+)000', r'\1k', q)
104
-
105
- # Decontracting words
106
- # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
107
- # https://stackoverflow.com/a/19794953
108
- contractions = {
109
- "ain't": "am not",
110
- "aren't": "are not",
111
- "can't": "can not",
112
- "can't've": "can not have",
113
- "'cause": "because",
114
- "could've": "could have",
115
- "couldn't": "could not",
116
- "couldn't've": "could not have",
117
- "didn't": "did not",
118
- "doesn't": "does not",
119
- "don't": "do not",
120
- "hadn't": "had not",
121
- "hadn't've": "had not have",
122
- "hasn't": "has not",
123
- "haven't": "have not",
124
- "he'd": "he would",
125
- "he'd've": "he would have",
126
- "he'll": "he will",
127
- "he'll've": "he will have",
128
- "he's": "he is",
129
- "how'd": "how did",
130
- "how'd'y": "how do you",
131
- "how'll": "how will",
132
- "how's": "how is",
133
- "i'd": "i would",
134
- "i'd've": "i would have",
135
- "i'll": "i will",
136
- "i'll've": "i will have",
137
- "i'm": "i am",
138
- "i've": "i have",
139
- "isn't": "is not",
140
- "it'd": "it would",
141
- "it'd've": "it would have",
142
- "it'll": "it will",
143
- "it'll've": "it will have",
144
- "it's": "it is",
145
- "let's": "let us",
146
- "ma'am": "madam",
147
- "mayn't": "may not",
148
- "might've": "might have",
149
- "mightn't": "might not",
150
- "mightn't've": "might not have",
151
- "must've": "must have",
152
- "mustn't": "must not",
153
- "mustn't've": "must not have",
154
- "needn't": "need not",
155
- "needn't've": "need not have",
156
- "o'clock": "of the clock",
157
- "oughtn't": "ought not",
158
- "oughtn't've": "ought not have",
159
- "shan't": "shall not",
160
- "sha'n't": "shall not",
161
- "shan't've": "shall not have",
162
- "she'd": "she would",
163
- "she'd've": "she would have",
164
- "she'll": "she will",
165
- "she'll've": "she will have",
166
- "she's": "she is",
167
- "should've": "should have",
168
- "shouldn't": "should not",
169
- "shouldn't've": "should not have",
170
- "so've": "so have",
171
- "so's": "so as",
172
- "that'd": "that would",
173
- "that'd've": "that would have",
174
- "that's": "that is",
175
- "there'd": "there would",
176
- "there'd've": "there would have",
177
- "there's": "there is",
178
- "they'd": "they would",
179
- "they'd've": "they would have",
180
- "they'll": "they will",
181
- "they'll've": "they will have",
182
- "they're": "they are",
183
- "they've": "they have",
184
- "to've": "to have",
185
- "wasn't": "was not",
186
- "we'd": "we would",
187
- "we'd've": "we would have",
188
- "we'll": "we will",
189
- "we'll've": "we will have",
190
- "we're": "we are",
191
- "we've": "we have",
192
- "weren't": "were not",
193
- "what'll": "what will",
194
- "what'll've": "what will have",
195
- "what're": "what are",
196
- "what's": "what is",
197
- "what've": "what have",
198
- "when's": "when is",
199
- "when've": "when have",
200
- "where'd": "where did",
201
- "where's": "where is",
202
- "where've": "where have",
203
- "who'll": "who will",
204
- "who'll've": "who will have",
205
- "who's": "who is",
206
- "who've": "who have",
207
- "why's": "why is",
208
- "why've": "why have",
209
- "will've": "will have",
210
- "won't": "will not",
211
- "won't've": "will not have",
212
- "would've": "would have",
213
- "wouldn't": "would not",
214
- "wouldn't've": "would not have",
215
- "y'all": "you all",
216
- "y'all'd": "you all would",
217
- "y'all'd've": "you all would have",
218
- "y'all're": "you all are",
219
- "y'all've": "you all have",
220
- "you'd": "you would",
221
- "you'd've": "you would have",
222
- "you'll": "you will",
223
- "you'll've": "you will have",
224
- "you're": "you are",
225
- "you've": "you have"
226
- }
227
-
228
- q_decontracted = []
229
-
230
- for word in q.split():
231
- if word in contractions:
232
- word = contractions[word]
233
-
234
- q_decontracted.append(word)
235
-
236
- q = ' '.join(q_decontracted)
237
- q = q.replace("'ve", " have")
238
- q = q.replace("n't", " not")
239
- q = q.replace("'re", " are")
240
- q = q.replace("'ll", " will")
241
-
242
- # Removing HTML tags
243
- q = BeautifulSoup(q)
244
- q = q.get_text()
245
-
246
- # Remove punctuations
247
- pattern = re.compile('\W')
248
- q = re.sub(pattern, ' ', q).strip()
249
-
250
- return q
251
-
252
-
253
- def preprocessing(q1, q2):
254
-
255
- features = []
256
-
257
- q1 = preprocess(q1)
258
- q2 = preprocess(q2)
259
-
260
- features.append(len(q1))
261
- features.append(len(q2))
262
-
263
- features.append(len(q1.split(" ")))
264
- features.append(len(q2.split(" ")))
265
-
266
- features.append(common_words(q1, q2))
267
- features.append(total_words(q1, q2))
268
- features.append(common_words(q1, q2)/(total_words(q1, q2) + 0.0001))
269
-
270
- features.extend(token_features(q1, q2))
271
- features.extend(fuzzy_features(q1, q2))
272
-
273
- q1_bow = cv.transform([q1]).toarray()
274
- q2_bow = cv.transform([q2]).toarray()
275
-
276
- return np.hstack((np.array(features).reshape(1, 19), q1_bow, q2_bow))
 
 
 
 
1
+ import re
2
+ from bs4 import BeautifulSoup
3
+ import pickle
4
+ from nltk.corpus import stopwords
5
+ from fuzzywuzzy import fuzz
6
+ import numpy as np
7
+
8
+ import nltk
9
+ nltk.download('stopwords')
10
+
11
+
12
# Load the object pickled in cv.pkl once, at import time; preprocessing()
# calls cv.transform() on it. The path is relative, so it is resolved
# against the current working directory of the process.
# NOTE(review): pickle.load() can execute arbitrary code while unpickling,
# so cv.pkl must only ever come from a trusted source.
with open('cv.pkl', 'rb') as file:
    cv = pickle.load(file)
14
+
15
+
16
def common_words(q1, q2):
    """Return the number of distinct tokens shared by two questions.

    Each string is split on single spaces and every token is lower-cased
    and stripped before the two token sets are intersected.

    Note: because the split is on ``" "`` (not on arbitrary whitespace),
    an empty string or a run of consecutive spaces produces an
    empty-string token that is counted like any other token. This is
    preserved as-is so existing feature values do not change.
    """
    w1 = {word.lower().strip() for word in q1.split(" ")}
    w2 = {word.lower().strip() for word in q2.split(" ")}
    return len(w1 & w2)
20
+
21
+
22
def total_words(q1, q2):
    """Return the sum of the distinct-token counts of two questions.

    Each string is split on single spaces and every token is lower-cased
    and stripped; the sizes of the two resulting sets are added. A token
    present in both questions is therefore counted twice.

    Note: splitting on ``" "`` means an empty string still contributes one
    (empty-string) token. This is preserved as-is so existing feature
    values do not change.
    """
    w1 = {word.lower().strip() for word in q1.split(" ")}
    w2 = {word.lower().strip() for word in q2.split(" ")}
    return len(w1) + len(w2)
26
+
27
+
28
+ # features based on tokens
29
def token_features(q1, q2):
    """Compute eight token-overlap features for a pair of questions.

    Both strings are split on whitespace. If either has no tokens, a list
    of eight ``0.0`` values is returned. Otherwise the list holds, in order:

    0. shared non-stop-words / smaller non-stop-word set size
    1. shared non-stop-words / larger non-stop-word set size
    2. shared stop-words / smaller stop-word set size
    3. shared stop-words / larger stop-word set size
    4. shared distinct tokens / smaller token count (duplicates included)
    5. shared distinct tokens / larger token count (duplicates included)
    6. 1 if the last tokens are equal, else 0
    7. 1 if the first tokens are equal, else 0
    """
    safe_div = 0.0001  # added to every denominator to avoid division by zero

    features = [0.0] * 8

    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if not q1_tokens or not q2_tokens:
        return features

    # A set gives O(1) membership tests; the corpus reader returns a list.
    stop_set = set(stopwords.words('english'))

    q1_unique = set(q1_tokens)
    q2_unique = set(q2_tokens)

    q1_non_stopwords = q1_unique - stop_set
    q2_non_stopwords = q2_unique - stop_set

    q1_stop_words = q1_unique & stop_set
    q2_stop_words = q2_unique & stop_set

    common_word_count = len(q1_non_stopwords & q2_non_stopwords)
    common_stop_word_count = len(q1_stop_words & q2_stop_words)
    common_token_count = len(q1_unique & q2_unique)

    features[0] = common_word_count / (min(len(q1_non_stopwords), len(q2_non_stopwords)) + safe_div)
    features[1] = common_word_count / (max(len(q1_non_stopwords), len(q2_non_stopwords)) + safe_div)
    features[2] = common_stop_word_count / (min(len(q1_stop_words), len(q2_stop_words)) + safe_div)
    features[3] = common_stop_word_count / (max(len(q1_stop_words), len(q2_stop_words)) + safe_div)
    # The token-count denominators use the raw token lists (duplicates kept).
    features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + safe_div)
    features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + safe_div)
    features[6] = int(q1_tokens[-1] == q2_tokens[-1])
    features[7] = int(q1_tokens[0] == q2_tokens[0])

    return features
63
+
64
+
65
+ # Fuzzy Features
66
def fuzzy_features(q1, q2):
    """Return four fuzzy string-similarity scores for a pair of questions.

    The scores come from the ``fuzz`` module and are returned as a list in
    this fixed order: ``QRatio``, ``partial_ratio``, ``token_sort_ratio``,
    ``token_set_ratio``.
    """
    return [
        fuzz.QRatio(q1, q2),            # fuzz_ratio
        fuzz.partial_ratio(q1, q2),     # fuzz_partial_ratio
        fuzz.token_sort_ratio(q1, q2),  # token_sort_ratio
        fuzz.token_set_ratio(q1, q2),   # token_set_ratio
    ]
83
+
84
+
85
+ # data preprocessing
86
# Contraction -> expansion table used by preprocess(). Built once at import
# time instead of on every call.
# https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
# https://stackoverflow.com/a/19794953
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

# Matches any single non-word character. A raw string is required: '\W' in
# a plain string literal is an invalid escape sequence.
_NON_WORD = re.compile(r'\W')


# data preprocessing
def preprocess(q):
    """Normalise one question string before feature extraction.

    Steps, in order: convert to ``str``, lower-case and strip; spell out a
    few symbols (``%``, ``$``, rupee, euro, ``@``); drop the ``[math]``
    marker; abbreviate round thousands/millions/billions to ``k``/``m``/
    ``b``; expand English contractions; strip HTML markup; replace every
    non-word character with a space and strip the result.

    Args:
        q: The raw question; any object is accepted and passed to ``str``.

    Returns:
        The cleaned question as a string.
    """
    q = str(q).lower().strip()

    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('₹', ' rupee ')
    # Euro sign (U+20AC), written as an escape so it cannot be lost to an
    # encoding mishap. An empty search string here would insert ' euro '
    # between every character of the question.
    q = q.replace('\u20ac', ' euro ')
    q = q.replace('@', ' at ')

    # The pattern '[math]' appears around 900 times in the whole dataset.
    q = q.replace('[math]', '')

    # Replacing some numbers with string equivalents (not perfect, can be
    # done better to account for more cases). Largest magnitude first so
    # that e.g. a billion is not rewritten as thousands.
    q = q.replace(',000,000,000 ', 'b ')
    q = q.replace(',000,000 ', 'm ')
    q = q.replace(',000 ', 'k ')
    q = re.sub(r'([0-9]+)000000000', r'\1b', q)
    q = re.sub(r'([0-9]+)000000', r'\1m', q)
    q = re.sub(r'([0-9]+)000', r'\1k', q)

    # Decontracting words: whole-token lookup first ...
    q = ' '.join(_CONTRACTIONS.get(word, word) for word in q.split())
    # ... then generic suffix rules for anything the table did not match.
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")

    # Removing HTML tags. The parser is named explicitly so the result does
    # not depend on which optional parsers happen to be installed.
    q = BeautifulSoup(q, 'html.parser').get_text()

    # Remove punctuations
    q = _NON_WORD.sub(' ', q).strip()

    return q
254
+
255
+
256
def preprocessing(q1, q2):
    """Build the feature row for a pair of questions.

    Both questions are cleaned with ``preprocess`` and turned into 19
    hand-crafted features:

    * 2 character lengths and 2 space-split token counts,
    * ``common_words``, ``total_words`` and their ratio,
    * the 8 values from ``token_features``,
    * the 4 values from ``fuzzy_features``.

    These are stacked horizontally with ``cv.transform`` output for each
    question (presumably bag-of-words vectors from a fitted vectorizer;
    confirm against how cv.pkl was produced).

    Returns:
        A 2-D NumPy array with a single row: the 19 features followed by
        the two ``cv`` vectors.
    """
    q1 = preprocess(q1)
    q2 = preprocess(q2)

    # Computed once and reused for both the raw counts and the ratio.
    shared = common_words(q1, q2)
    total = total_words(q1, q2)

    features = [
        len(q1),
        len(q2),
        len(q1.split(" ")),
        len(q2.split(" ")),
        shared,
        total,
        shared / (total + 0.0001),  # small constant avoids division by zero
    ]

    features.extend(token_features(q1, q2))
    features.extend(fuzzy_features(q1, q2))

    q1_bow = cv.transform([q1]).toarray()
    q2_bow = cv.transform([q2]).toarray()

    # 7 basic + 8 token + 4 fuzzy = 19; the explicit shape fails loudly if
    # the feature count ever drifts.
    return np.hstack((np.array(features).reshape(1, 19), q1_bow, q2_bow))