muryshev commited on
Commit
54585e2
·
1 Parent(s): 156d744

Changed search

Browse files
Files changed (3) hide show
  1. BasicSearchV7.py +1038 -0
  2. Dockerfile +2 -1
  3. app.py +8 -15
BasicSearchV7.py ADDED
@@ -0,0 +1,1038 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.feature_extraction.text import CountVectorizer
4
+ from sklearn.feature_extraction.text import TfidfTransformer
5
+ from scipy import sparse
6
+ import re
7
+ from xml.dom.minidom import parseString #, parse
8
+ import os
9
+ import sys
10
+ import json
11
+
12
# Previous tuning of the ranking hyper-parameters (kept for reference):
# alpha = 1.15
# beta = .2
# gamma = .4
# delta = .31
# epsilon = 0

# Module-level ranking weights consumed by BasicSearch.getScores.
alpha = 0        # exponent on log(doc TF-IDF norm); 0 disables the length factor
beta = .55       # multiplicative boost for tax-code (НКРФ) documents
gamma = .0       # additive boost for tax-code documents (currently disabled)
delta = .2       # multiplicative boost for other-law documents
epsilon = 0      # additive boost for other-law documents (currently disabled)
zeta = .65       # weight of the direct query score for НКРФ docs that got no KNN score
24
+
25
# Russian Porter (Snowball-style) stemmer built on precompiled regexes.
class Porter:
    """Stem Russian words by stripping inflectional endings in a fixed order."""

    PERFECTIVEGROUND = re.compile(u"((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$")
    REFLEXIVE = re.compile(u"(с[яь])$")
    ADJECTIVE = re.compile(u"(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$")
    PARTICIPLE = re.compile(u"((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$")
    VERB = re.compile(u"((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$")
    NOUN = re.compile(u"(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$")
    RVRE = re.compile(u"^(.*?[аеиоуыэюя])(.*)$")
    DERIVATIONAL = re.compile(u".*[^аеиоуыэюя]+[аеиоуыэюя].*ость?$")
    DER = re.compile(u"ость?$")
    SUPERLATIVE = re.compile(u"(ейше|ейш)$")
    I = re.compile(u"и$")
    P = re.compile(u"ь$")
    NN = re.compile(u"нн$")

    @staticmethod
    def stem(word):
        """Return the stem of a single (lower-case) Russian word."""
        # normalise ё -> е before any pattern matching
        word = word.replace(u'ё', u'е')
        match = re.match(Porter.RVRE, word)
        if not (match and match.groups()):
            # no vowel => nothing to strip; return the ё-normalised word
            return word

        prefix = match.group(1)     # everything up to and including the first vowel
        rv = match.group(2)         # the region where endings are removed

        # step 1: perfective gerund, else reflexive + adjective/participle or verb/noun
        candidate = Porter.PERFECTIVEGROUND.sub('', rv, 1)
        if candidate == rv:
            rv = Porter.REFLEXIVE.sub('', rv, 1)
            candidate = Porter.ADJECTIVE.sub('', rv, 1)
            if candidate != rv:
                rv = Porter.PARTICIPLE.sub('', candidate, 1)
            else:
                candidate = Porter.VERB.sub('', rv, 1)
                rv = Porter.NOUN.sub('', rv, 1) if candidate == rv else candidate
        else:
            rv = candidate

        # step 2: trailing 'и'
        rv = Porter.I.sub('', rv, 1)

        # step 3: derivational suffix 'ость'
        if re.match(Porter.DERIVATIONAL, rv):
            rv = Porter.DER.sub('', rv, 1)

        # step 4: soft sign, else superlative + doubled 'н'
        candidate = Porter.P.sub('', rv, 1)
        if candidate == rv:
            rv = Porter.SUPERLATIVE.sub('', rv, 1)
            rv = Porter.NN.sub(u'н', rv, 1)
        else:
            rv = candidate

        return prefix + rv
78
+
79
+
80
+
81
+ class BasicSearch:
82
    # Build the engine: store the document-type filter and eagerly load/index
    # all data (heavy I/O — see load_everything).
    def __init__(self, doctype='all-docs', data_directory='./'):
        # doctype selects which reference classes are searchable
        # (e.g. 'all-docs', 'taxcode', 'minfin-letters'; see create_filtered_refs)
        self.doctype = doctype
        self.load_everything(data_directory=data_directory)
86
+
87
+ def read_xml(self, path):
88
+ with open(path, "r", encoding="utf-8") as text_file:
89
+ data = text_file.read()
90
+
91
+ document = parseString('<data>' + data + '</data>')
92
+ return [
93
+ document.getElementsByTagName('title'),
94
+ document.getElementsByTagName('text')
95
+ ]
96
+
97
+
98
+ def getRefsNK(self, s) :
99
+ i = 0
100
+ refs = set()
101
+ x = 0
102
+ while x != -1 :
103
+ x = s.lower().find(' ст.', x)
104
+ if x != -1 :
105
+ # x += 1
106
+ y = s.lower().find('нк рф', x)
107
+ if y != -1 :
108
+ # print(i)
109
+ # print(x, y)
110
+ dx = 4
111
+ if s[x + dx] == ' ' :
112
+ dx = 5
113
+ if y - x <= 13 and y - x > 5 :
114
+ # print(s[x + 4: y + 5])
115
+ ref = 'Статья ' + s[x + dx: y - 1]
116
+ if ref in self.refid :
117
+ refs.add(ref)
118
+ x = y
119
+ else :
120
+ # print('error: ', s[x + 4: y + 5])
121
+ x += 1
122
+ i += 1
123
+ if i > 1000 :
124
+ break
125
+ return list(refs)
126
+
127
    def getRefsNK1(self, s, debug=False, altrefs=set()):
        """Extract 'Статья NN' tax-code references anchored on 'нк рф'.

        More permissive than getRefsNK: besides 'ст.' it also matches endings
        of declined forms of 'статья' ('-ьей', '-ьёй', '-ями', '-тьи', '-тье')
        found shortly before the 'нк рф' anchor, and strips brackets and
        punctuation first so the article number slices out cleanly.

        altrefs is only read (refs found by another extractor, used to limit
        debug output). NOTE(review): mutable default argument — harmless here
        because it is never mutated, but fragile if that ever changes.
        NOTE(review): original indentation was lost in transit; the layout
        below follows the sibling getRefsNK and the data-flow invariants.
        """
        i = 0                    # safety counter: stop after 1000 scan steps
        refs = set()
        x = 0
        slen = len(s)

        s0 = s                   # keep the original text for debug printing
        # punctuation would otherwise end up glued to the article number
        s = s.replace('(', ' ')
        s = s.replace(')', ' ')
        s = s.replace(';', ' ')
        s = s.replace(':', ' ')
        s = s.replace(',', ' ')

        while x != -1:
            # anchor: next occurrence of 'нк рф'
            x1 = s.lower().find('нк рф', x)
            if x1 == -1:
                break

            # look back up to 12 chars before the anchor for a 'стать*' form
            x2 = x1 - 12
            x2 = max(x2, 0)

            # candidate starts: 'ст.' plus endings of declined 'статья'
            x31 = s.lower().find('ст.', x2)
            x32 = s.lower().find('ьей', x2)
            x33 = s.lower().find('ьёй', x2)
            x34 = s.lower().find('ями', x2)
            x35 = s.lower().find('тьи', x2)
            x36 = s.lower().find('тье', x2)

            # replace misses with slen so min() below ignores them
            if x31 == -1:
                x31 = slen
            if x32 == -1:
                x32 = slen
            if x33 == -1:
                x33 = slen
            if x34 == -1:
                x34 = slen
            if x35 == -1:
                x35 = slen
            if x36 == -1:
                x36 = slen

            # earliest candidate start (slen means "not found anywhere")
            x3 = min(x31, x32, x33, x34, x35, x36)
            x = x3

            if x != -1:
                y = s.lower().find('нк рф', x)
                if y != -1:
                    dx = 3                   # skip the 3-char marker ('ст.' etc.)
                    if s[x + dx] == ' ':
                        dx += 1              # also skip a following space
                    if y - x <= 13 and y - x > 4:
                        ref = 'Статья ' + s[x + dx: y - 1]
                        if ref in self.refid:
                            refs.add(ref)
                            # show context for refs this pass found but altrefs missed
                            if debug and (ref not in altrefs):
                                print('...' + s0[y - 40: y + 5])
                        x = y + 1            # resume after the anchor
                    else:
                        x += 1               # implausible span: step past and retry

            i += 1
            if i > 1000:
                break
        return list(refs)
204
+
205
    def getRefsNK2(self, s, debug=False, altrefs=set()):
        """Extract 'Статья NN' references anchored on the glued spelling 'нкрф'.

        NOTE(review): this is a near copy of getRefsNK1 that (a) anchors on
        'нкрф' without a space and (b) only matches the 'ст.' marker — a
        candidate for merging with getRefsNK1 behind an anchor parameter.
        altrefs is only read; same mutable-default caveat as getRefsNK1.
        NOTE(review): original indentation was lost in transit; the layout
        below mirrors getRefsNK1.
        """
        i = 0                    # safety counter: stop after 1000 scan steps
        refs = set()
        x = 0
        slen = len(s)

        s0 = s                   # keep the original text for debug printing
        # punctuation would otherwise end up glued to the article number
        s = s.replace('(', ' ')
        s = s.replace(')', ' ')
        s = s.replace(';', ' ')
        s = s.replace(':', ' ')
        s = s.replace(',', ' ')

        while x != -1:
            # anchor: next occurrence of the glued 'нкрф'
            x1 = s.lower().find('нкрф', x)
            if x1 == -1:
                break

            # look back up to 12 chars before the anchor for 'ст.'
            x2 = x1 - 12
            x2 = max(x2, 0)

            x3 = s.lower().find('ст.', x2)
            x = x3

            if x != -1:
                y = s.lower().find('нкрф', x)
                if y != -1:
                    dx = 3                   # skip 'ст.'
                    if s[x + dx] == ' ':
                        dx += 1              # also skip a following space
                    if y - x <= 13 and y - x > 4:
                        ref = 'Статья ' + s[x + dx: y - 1]
                        if ref in self.refid:
                            refs.add(ref)
                            # show context for refs this pass found but altrefs missed
                            if debug and (ref not in altrefs):
                                print('...' + s0[y - 40: y + 5])
                        x = y + 1            # resume after the anchor
                    else:
                        x += 1               # implausible span: step past and retry

            i += 1
            if i > 1000:
                break
        return list(refs)
263
+
264
    # Read the raw corpora (tax code, consultations, reference lists, JSON dumps)
    # and build the cross-reference tables used by the rest of the pipeline.
    def load_basic_data(self, data_directory='data'):
        """Load XML/JSON source data and derive reference lookup structures.

        Populates: title/text (tax code), atitle/atext (answers),
        qtitle/qtext (questions), refid/titleref/idref (article lookups),
        nk_refs (per-question 'ст.N НКРФ' keys), and questions/answers/
        added_refs/missed_refs from the per-question JSON files.
        """
        self.title, self.text = self.read_xml(os.path.join(data_directory, 'taxcode.xml'))
        self.atitle, self.atext = self.read_xml(os.path.join(data_directory, 'K2-answer.xml'))
        self.qtitle, self.qtext = self.read_xml(os.path.join(data_directory, 'K2-question.xml'))

        _, reftext = self.read_xml(os.path.join(data_directory, 'references-04-12-2023.xml'))

        # NOTE(review): [set()] * n aliases one shared set across all slots;
        # safe only because every index is reassigned in the loop below.
        reflist = [set()] * len(self.qtitle)
        reflist1 = [set()] * len(self.qtitle)
        qreflist = [set()] * len(self.qtitle)

        # Truncate a tax-code title to its leading article id, e.g.
        # 'Статья 45. Исполнение...' -> 'Статья 45' (cut at '. ' or ' (').
        def getRefNK(s):
            x = s.find('. ')
            y = s.find(' (')
            if x == -1:
                x = sys.maxsize
            if y == -1:
                y = sys.maxsize
            x = min(x, y)
            # NOTE(review): 'id' shadows the builtin of the same name
            id = s[:x]
            return id

        # article-id <-> index lookups over the tax-code titles
        self.refid = {}              # 'Статья NN' -> title index
        self.titleref = {}           # full title -> 'Статья NN'
        self.idref = [0] * len(self.title)   # title index -> 'Статья NN'
        for i in range(len(self.title)):
            s = self.title[i].firstChild.nodeValue
            id = getRefNK(s)
            self.refid[id] = i
            self.titleref[s] = id
            self.idref[i] = id

        # collect article references from each answer, question and reference doc
        for i in range(len(self.qtext)):
            doctext = self.atext[i].firstChild.nodeValue
            qdoctext = self.qtext[i].firstChild.nodeValue
            refdoctext = reftext[i].firstChild.nodeValue
            refs = self.getRefsNK1(doctext)
            qrefs = self.getRefsNK1(qdoctext)
            refs1 = self.getRefsNK2(refdoctext)
            intrefs = []
            intrefs1 = []
            intqrefs = []
            for ref in refs:
                intrefs.append(self.refid[ref])
            for ref in refs1:
                intrefs1.append(self.refid[ref])
            for ref in qrefs:
                intqrefs.append(self.refid[ref])
            reflist[i] = set(intrefs)
            reflist1[i] = set(intrefs1)
            qreflist[i] = set(intqrefs)

        # merge answer-derived and reference-doc-derived article sets
        for i in range(len(reflist)):
            reflist[i] |= reflist1[i]

        # per-question list of canonical 'ст.N НКРФ' keys
        self.nk_refs = []
        for i in range(len(reflist)):
            refs = list(reflist[i])
            newrefs = []
            for j in range(len(refs)):
                ref = self.idref[refs[j]]
                # NOTE(review): non-raw string with \d — works but emits a
                # SyntaxWarning on modern Python; should be r'(\d+\.\d+|\d+)'
                m = re.search('(\d+\.\d+|\d+)', ref)
                s = ref[m.start(): m.end()]
                ref1 = 'ст.' + s + ' НКРФ'
                newrefs.append(ref1)

            self.nk_refs.append(newrefs)

        # read the per-question JSON dumps (files named '<index>.json')
        datadir = os.path.join(data_directory, 'data_jsons_20240119')
        filelist = os.listdir(datadir)
        filelist = [x for x in filelist if re.search(r'\d+.json', x)]
        filelist.sort()

        questions = [''] * len(filelist)
        answers = [''] * len(filelist)
        # NOTE(review): [[]] * n aliases one shared list; slots for files whose
        # stem is not numeric would keep the shared empty list.
        added_refs = [[]] * len(filelist)
        missed_refs = [[]] * len(filelist)
        count = 0
        for filename in filelist:
            x = filename.find('.')
            if x == -1:
                print('ERROR :', filename)
            if filename[:x].isnumeric():
                # the numeric file stem is the question index
                i = int(filename[:x])
                with open(os.path.join(datadir, filename), 'r', encoding='utf-8') as f:
                    d = json.load(f)
                    refs = set(d['added_refs'].keys())
                    refs -= {''}         # drop the empty-key placeholder
                    refs = list(refs)
                    questions[i] = d['question']
                    answers[i] = d['answer']
                    missed_refs[i] = d['refs']
                    added_refs[i] = refs
                    count += 1

        self.questions = questions
        self.answers = answers
        self.added_refs = added_refs
        self.missed_refs = missed_refs
383
+
384
    def load_text_processing(self):
        """Set up text normalisation: the Russian stop-word set and the stemmer.

        The stop-word list is inlined (originally from NLTK's Russian list) so
        the module has no NLTK download dependency; the stemmer is the local
        regex-based Porter implementation.
        """
        self.stop_words = {'а', 'без', 'более', 'больше', 'будет', 'будто', 'бы', 'был', 'была', 'были', 'было', 'быть', 'в', 'вам', 'вас', 'вдруг', 'ведь', 'во', 'вот', 'впрочем', 'все', 'всегда', 'всего', 'всех', 'всю', 'вы', 'где', 'да', 'даже', 'два', 'для', 'до', 'другой', 'его', 'ее', 'ей', 'ему', 'если', 'есть', 'еще', 'ж', 'же', 'за', 'зачем', 'здесь', 'и', 'из', 'или', 'им', 'иногда', 'их', 'к', 'как', 'какая', 'какой', 'когда', 'конечно', 'кто', 'куда', 'ли', 'лучше', 'между', 'меня', 'мне', 'много', 'может', 'можно', 'мой', 'моя', 'мы', 'на', 'над', 'надо', 'наконец', 'нас', 'не', 'него', 'нее', 'ней', 'нельзя', 'нет', 'ни', 'нибудь', 'никогда', 'ним', 'них', 'ничего', 'но', 'ну', 'о', 'об', 'один', 'он', 'она', 'они', 'опять', 'от', 'перед', 'по', 'под', 'после', 'потом', 'потому', 'почти', 'при', 'про', 'раз', 'разве', 'с', 'сам', 'свою', 'себе', 'себя', 'сейчас', 'со', 'совсем', 'так', 'такой', 'там', 'тебя', 'тем', 'теперь', 'то', 'тогда', 'того', 'тоже', 'только', 'том', 'тот', 'три', 'тут', 'ты', 'у', 'уж', 'уже', 'хорошо', 'хоть', 'чего', 'чем', 'через', 'что', 'чтоб', 'чтобы', 'чуть', 'эти', 'этого', 'этой', 'этом', 'этот', 'эту', 'я'}
        # regex-based Russian Porter stemmer (defined above in this module)
        self.stemmer = Porter()
395
+
396
+ def analyze(self, s) :
397
+ template = r'[\'\"\.\,\?\!\:\;\-\+\%\^\&\*\@\~\_\=/\\\>\<\#\$\(\)\|\n\r\d]'
398
+ s = re.sub(template, ' ', s)
399
+ # template = r'( \w |^\w | \w$)'
400
+ # s = re.sub(template, ' ', s)
401
+ # s = re.sub(' +', ' ', s)
402
+ s = ' '.join( [w for w in s.split() if len(w) > 1] )
403
+ # tokens = nlp(s)
404
+ # tokens = [str(t.lemma_) for t in tokens]
405
+ # tokens = word_tokenize(s)
406
+ tokens = s.strip().lower().split(' ')
407
+ # tokens = [t for t in tokens if t not in self.stop_words and t != ' ']
408
+ # tokens = [self.stemmer.stem(word) for word in tokens]
409
+ tokens = [self.stemmer.stem(word) for word in tokens if word not in self.stop_words]
410
+ newtext = ' '.join(tokens)
411
+ return newtext
412
+
413
    # load the 'medium' dataset: title -> document text, used as the searchable corpus
    def load_medium_dataset(self, path):
        """Load the medium JSON dataset and append the consultation documents.

        After loading the {title: text} mapping from `path`, each Q/A pair from
        the K2 XML corpora is added under a 'Консультация <title>' key so
        consultations are searchable alongside the other document types.
        """
        with open(path, 'r', encoding='utf-8') as infile:
            self.dataset_medium = json.load(infile)

        for i in range(len(self.atext)):
            question = self.qtext[i].firstChild.nodeValue.strip()
            answer = self.atext[i].firstChild.nodeValue.strip()
            title = self.atitle[i].firstChild.nodeValue.strip()
            title = 'Консультация ' + title
            text = 'Вопрос: ' + question + '\n' + 'Ответ: ' + answer
            self.dataset_medium[title] = text
457
+
458
    # Build per-question reference lists restricted to the configured doctype.
    def create_filtered_refs(self):
        """Filter added_refs by self.doctype and store the results.

        Sets self.doctype_template (regex selecting searchable dataset keys)
        and self.filtered_refs (per-question ground-truth reference keys that
        match the doctype and exist in dataset_medium). For doctypes covering
        the tax code, nk_refs extracted from the texts are merged in.
        """
        doctype = self.doctype
        added_refs = self.added_refs

        if doctype == 'court-decisions':
            doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд)' # courts' decisions
            ref_template = doctype_template
        elif doctype == 'minfin-letters':
            doctype_template = r'[Пп]исьмо [Мм]инфина' # Minfin letters
            ref_template = doctype_template
        elif doctype == 'fns-letters':
            doctype_template = r'[Пп]исьмо (ФНС|фнс)' # FNS letters
            ref_template = doctype_template
        elif doctype == 'all-letters':
            doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс))' # courts' decisions + Minfin letters + FNS letters
            ref_template = doctype_template
        elif doctype == 'taxcode':
            # keys are anchored ('^'); refs appear mid-string, hence two templates
            doctype_template = r'^ст.(\d+\.\d+|\d+) НКРФ'
            ref_template = r'ст.(\d+\.\d+|\d+) НКРФ' # taxcode ref format differs from doctype format
        elif doctype == 'other-laws':
            doctype_template = r'(^ст.(\d+\.\d+|\d+) [ГТ]КРФ|^Федеральный закон|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии)'
            ref_template = r'(ст.(\d+\.\d+|\d+) [ГТ]КРФ|Федеральный закон|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии)'
        elif doctype == 'consultations':
            doctype_template = 'Консультация'
            ref_template = 'Консультация'
        elif doctype == 'all-docs':
            # union of all the branches above
            doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс)|^ст.(\d+\.\d+|\d+) НКРФ|^ст.(\d+\.\d+|\d+) [ГТ]КРФ|^Федеральный закон|Приказ ФНС РФ|Постановление Правительства РФ|^Решение Коллегии Евразийской экономической комиссии|Консультация)'
            ref_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс)|ст.(\d+\.\d+|\d+) НКРФ|ст.(\d+\.\d+|\d+) [ГТ]КРФ|Федеральный закон|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии|Консультация)'
        else:
            # NOTE(review): templates stay unbound here, so the code below
            # raises NameError; consider raising ValueError instead.
            print('Error : wrong doctype "' + doctype + '"')

        filtered_refs = []
        nk_mask = []
        for i in range(len(added_refs)):
            refs = []
            for j in range(len(added_refs[i])):
                s = added_refs[i][j]
                if re.search(ref_template, s) != None:
                    # normalise the key to start at the recognised citation
                    m = re.search(r'(ст.(\d+\.\d+|\d+) [НГТ]КРФ|Федеральный закон|Постановление Правительства РФ|Приказ ФНС РФ|Решение Коллегии Евразийской экономической комиссии)', s)
                    if m != None:
                        s = s[m.start():]

                    # keep only refs that resolve to a searchable document
                    if s in self.dataset_medium:
                        refs.append(s)

            # merge text-extracted tax-code refs when the doctype covers НКРФ
            if doctype_template.find('НКРФ') != -1:
                refs += self.nk_refs[i]

            refs = list(set(refs))
            filtered_refs.append(refs)

        self.filtered_refs = filtered_refs
        self.doctype_template = doctype_template
519
+
520
    # Build the analysed text corpora fed into the TF-IDF vectorisers.
    def create_corpora(self):
        """Create qcorpus/acorpus (questions/answers) and pmfcorpus (documents).

        pmfcorpus, pmfrefs, nk_mask and laws_mask are appended in lock-step,
        so index k in each refers to the same searchable document. refids maps
        a document key back to its corpus index.
        NOTE(review): original indentation was lost in transit; the nesting
        below is reconstructed from the requirement that the four per-document
        lists stay the same length (getScores relies on this).
        """
        # questions: title + body, analysed
        self.qcorpus = []
        for i in range(len(self.qtext)):
            if not i % 100:
                print(i, end=' ')    # progress indicator
            s = self.qtitle[i].firstChild.nodeValue + ' ' + self.qtext[i].firstChild.nodeValue
            s = self.analyze(s)
            self.qcorpus.append(s)

        # answers, analysed
        self.acorpus = []
        for i in range(len(self.qtext)):
            s = self.atext[i].firstChild.nodeValue
            s = self.analyze(s)
            self.acorpus.append(s)

        self.pmfcorpus = []      # analysed searchable documents
        self.pmfrefs = []        # their (normalised) keys
        self.pmflengths = []     # unused here; kept for compatibility
        self.nk_mask = []        # 1 if the document is a tax-code article
        self.laws_mask = []      # 1 if the document is another law/decree

        i = 0
        self.items = []
        for key, value in self.dataset_medium.items():
            # only keys matching the configured doctype are indexed
            if re.search(self.doctype_template, key) != None:
                s = value
                ss = key
                # normalise the key to start at the recognised citation
                m = re.search(r'(ст.(\d+\.\d+|\d+) [НГТ]КРФ|Федеральный закон|Постановление Правительства РФ|Приказ ФНС РФ|Решение Коллегии Евразийской экономической комиссии)', ss)
                if m != None:
                    ss = ss[m.start():]

                if s != None:
                    s = s.replace('\n', ' ')
                # skip empty / single-word documents
                if s != None and s.count(' '):
                    if not i % 100:
                        print(i, end=' ')    # progress indicator
                    s = self.analyze(s)
                    if s.count(' '):
                        self.pmfcorpus.append(s)
                        self.pmfrefs.append(ss)

                        # masks are appended together with the corpus entry so
                        # all four lists stay aligned
                        if re.search(r'НКРФ', ss):
                            self.nk_mask.append(1)
                        else:
                            self.nk_mask.append(0)

                        if re.search(r'([ГТ]КРФ|Федеральный закон|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии)', ss):
                            self.laws_mask.append(1)
                        else:
                            self.laws_mask.append(0)

            i += 1    # counts dataset items; only used for the progress print

        # document key -> corpus index
        self.refids = {}
        for i in range(len(self.pmfrefs)):
            key = self.pmfrefs[i]
            self.refids[key] = i
594
+
595
    # Fit the TF-IDF representations for documents and answers.
    def create_TFIDF(self):
        """Fit two TF-IDF spaces and project the questions into both.

        Document space: unnormalised TF-IDF over pmfcorpus, then manually
        L2-normalised; the pre-normalisation row norms are kept in self.norm
        (used as a document-length signal in getScores).
        Answer space: standard L2-normalised TF-IDF over acorpus.
        """
        self.vectorizer = CountVectorizer()
        # norm=None so the raw row norms can be captured before normalising
        self.transformer = TfidfTransformer(smooth_idf=False, norm=None, sublinear_tf=True)

        X = self.vectorizer.fit_transform(self.pmfcorpus)
        QX = self.vectorizer.transform(self.qcorpus)
        self.TFIDF = self.transformer.fit_transform(X)
        self.QTFIDF = self.transformer.transform(QX)

        # manual L2 normalisation; row norms saved for the ranking formula
        # NOTE(review): a row with an all-zero vector would divide by zero here
        n = np.sqrt(self.TFIDF.multiply(self.TFIDF).sum(axis=1))

        self.TFIDF = self.TFIDF.multiply(sparse.csr_matrix(1 / n))
        self.norm = n.flatten().tolist()[0]
        n = np.sqrt(self.QTFIDF.multiply(self.QTFIDF).sum(axis=1))
        self.QTFIDF = self.QTFIDF.multiply(sparse.csr_matrix(1 / n))

        # separate vocabulary/space for the consultation answers
        self.avectorizer = CountVectorizer()
        self.atransformer = TfidfTransformer(smooth_idf=False, norm='l2', sublinear_tf=True)

        AX = self.avectorizer.fit_transform(self.acorpus)
        AQX = self.avectorizer.transform(self.qcorpus)
        self.ATFIDF = self.atransformer.fit_transform(AX)
        self.AQTFIDF = self.atransformer.transform(AQX)
632
+
633
    # Propagate answer-similarity scores to the tax-code articles those answers cite.
    def getKNNScores(self, v, i=-1):
        """Score tax-code articles via similar consultation answers.

        v is a query vector in the answer TF-IDF space. Each answer's cosine
        similarity is pushed onto every 'ст.N НКРФ' reference in that
        answer's filtered_refs (keeping the maximum over answers). Passing
        the question index as i excludes its own answer (leave-one-out).
        Returns a plain list aligned with self.refids / pmfcorpus.
        """
        vt = v.transpose()
        # cosine similarities query-vs-all-answers (both sides L2-normalised)
        ascores = self.ATFIDF.dot(vt)[:, 0].todense()
        ascores = np.squeeze(np.asarray(ascores))
        scores = [0] * len(self.refids)
        for j in range(len(self.filtered_refs)):
            score = ascores[j]
            refs = self.filtered_refs[j]
            for k in range(len(refs)):
                ref = refs[k]
                m = re.search(r'ст.(\d+\.\d+|\d+) НКРФ', ref)
                # i != j implements the leave-one-out exclusion
                if i != j and m != None:
                    key = ref[m.start():]
                    if key in self.refids:
                        id = self.refids[key]
                        # keep the best similarity seen for this article
                        if scores[id] < score:
                            scores[id] = score

        return scores
654
+
655
    def getScores(self, v1, v2, i=-1):
        """Combine direct and KNN similarities into one score per document.

        v1: query vector in the document TF-IDF space (direct similarity);
        v2: query vector in the answer space (KNN similarity, see
        getKNNScores); i: question index for leave-one-out.

        Combination (module-level weights alpha..zeta):
          non-НКРФ docs keep their direct score; НКРФ docs use the KNN score,
          falling back to zeta * direct score when no KNN score exists; then
          length factor (alpha), НКРФ boosts (beta, gamma) and other-law
          boosts (delta, epsilon) are applied.
        """
        vt = v1.transpose()
        # direct cosine similarity query-vs-all-documents
        scores = self.TFIDF.dot(vt)[:, 0].todense()
        scores = np.squeeze(np.asarray(scores))
        nk_scores = self.getKNNScores(v2, i)

        df = pd.DataFrame()
        df[0] = scores               # direct similarity
        df[1] = nk_scores            # KNN similarity (non-zero only for НКРФ)
        df[2] = self.norm            # pre-normalisation document TF-IDF norms
        df[3] = self.nk_mask         # 1 for tax-code articles
        df[4] = 1 - df[3]            # 1 for everything else
        df[5] = (1 - np.sign(df[1])) * df[3]   # НКРФ docs with zero KNN score

        # blend: direct score for non-НКРФ, KNN score for НКРФ,
        # zeta-weighted direct score for НКРФ docs the KNN pass missed
        df[0] = df[0] * df[4] + df[1] + df[5] * df[0] * zeta

        df[0] *= np.log(df[2]) ** alpha      # document-length factor
        df[0] *= (1 + df[3] * beta)          # multiplicative НКРФ boost
        df[0] += df[3] * gamma               # additive НКРФ boost

        df[4] = self.laws_mask               # column reused for the laws mask
        df[0] *= (1 + df[4] * delta)         # multiplicative other-law boost
        df[0] += df[4] * epsilon             # additive other-law boost

        return df[0].tolist()
682
+
683
+ def getTop(self, i, top) :
684
+ v1 = self.QTFIDF[i]
685
+ v2 = self.AQTFIDF[i]
686
+ df = pd.DataFrame()
687
+ df[0] = self.getScores(v1, v2, i)
688
+ # df[0] = self.getKNNScores(i)
689
+ df[1] = self.pmfrefs
690
+
691
+ df.sort_values(0, ascending = False, inplace = True)
692
+ # df.sort_values(0, ascending = True, inplace = True)
693
+
694
+ ids = df[1].tolist()
695
+ scores = df[0].tolist()
696
+ filtered_ids = []
697
+ for i in range(len(ids)) :
698
+ id = ids[i]
699
+ score = scores[i]
700
+ if id not in filtered_ids :
701
+ filtered_ids.append(id)
702
+
703
+ if len(filtered_ids) == top :
704
+ break
705
+
706
+ # return ids[:top].tolist()
707
+ return filtered_ids
708
+
709
    def test_TFIDF_top(self, top=40, metric=''):
        """Evaluate fixed-size top-k retrieval against filtered_refs.

        Prints overall hit ratio and (when computable) mean recall, precision
        and F1 over the questions. metric='corrected' also counts questions
        with zero true positives (scored 1 when there is nothing to find,
        else 0); otherwise such questions are skipped unless they have refs.
        """
        N = len(self.qtext)
        allhits = 0
        allrefs = 0
        recall = []
        precision = []
        f1 = []

        for i in range(N):
            refs = set(self.filtered_refs[i])    # ground truth for question i
            resp = self.getTop(i, top)
            serp = set(resp)
            hits = len(refs & serp)

            allhits += hits
            allrefs += len(refs)

            tp = hits
            fp = top - tp            # everything returned beyond the hits
            fn = len(refs) - hits    # ground-truth refs we missed

            if tp == 0 and metric == 'corrected':
                if fp == 0 and fn == 0:
                    # nothing to find and nothing returned: perfect
                    recall.append(1)
                    precision.append(1)
                    f1.append(1)
                else:
                    recall.append(0)
                    precision.append(0)
                    f1.append(0)

            elif tp + fn > 0:
                recall.append(tp / (tp + fn))
                precision.append(tp / (tp + fp))
                f1.append(2 * tp / (2 * tp + fp + fn))

        print('\ntotal: ', allhits, allrefs, allhits / (allrefs + .00001))
        if len(recall) > 0 and len(precision) > 0 and len(f1) > 0:
            print('mean recall:', sum(recall) / len(recall))
            print('mean precision:', sum(precision) / len(precision))
            print('mean F1:', sum(f1) / len(f1))
753
+
754
+ # get letters with TF-IDF cosine similarity score > value
755
+ def getTopByScoreValue(self, i, value) :
756
+ # v = self.QTFIDF[i]
757
+ # vt = v.transpose()
758
+ # scores = self.TFIDF.dot(vt)[:, 0].todense()
759
+ # scores = np.squeeze(np.asarray(scores))
760
+
761
+ # df = pd.DataFrame()
762
+ # df[0] = scores
763
+ # df[1] = self.pmfrefs
764
+
765
+ v1 = self.QTFIDF[i]
766
+ v2 = self.AQTFIDF[i]
767
+ df = pd.DataFrame()
768
+ df[0] = self.getScores(v1, v2, i)
769
+ df[1] = self.pmfrefs
770
+
771
+ df.sort_values(0, ascending = False, inplace = True)
772
+
773
+ df1 = df.loc[df[0] > value]
774
+ ids = df1[1]
775
+
776
+ return ids.tolist()
777
+
778
    # Evaluate threshold-based retrieval (score > value) against filtered_refs.

    def test_TFIDF_value(self, value=.4):
        """Print retrieval metrics for variable-size result sets above `value`.

        Besides recall/precision/F1 (questions with no hits count as 0, or 1
        when there was nothing to find), also reports the mean result-set size
        and how many questions returned a non-empty result.
        """
        N = len(self.qtext)
        allhits = 0
        allrefs = 0
        recall = []
        precision = []
        f1 = []
        topsize = []
        count = 0        # questions with at least one returned document

        for i in range(N):
            refs = set(self.filtered_refs[i])    # ground truth for question i
            resp = self.getTopByScoreValue(i, value)
            serp = set(resp)
            hits = len(refs & serp)
            top = len(resp)
            topsize.append(top)

            if top > 0:
                count += 1

            tp = hits
            fp = top - tp
            fn = len(refs) - hits

            if tp == 0:
                if fp == 0 and fn == 0:
                    # nothing to find and nothing returned: perfect
                    recall.append(1)
                    precision.append(1)
                    f1.append(1)
                else:
                    recall.append(0)
                    precision.append(0)
                    f1.append(0)

            else:
                recall.append(tp / (tp + fn))
                precision.append(tp / (tp + fp))
                f1.append(2 * tp / (2 * tp + fp + fn))

            allhits += hits
            allrefs += len(refs)

        print('\ntotal: ', allhits, allrefs, allhits / (allrefs + .00001))
        print('mean recall:', sum(recall) / len(recall))
        print('mean precision:', sum(precision) / len(precision))
        print('mean F1:', sum(f1) / len(f1))
        print('mean top size: ', sum(topsize) / len(topsize))
        print('non-empty top:', count)
        # NOTE(review): 517 is a hard-coded dataset size; should be N
        print('non-empty top share:', count / 517)
833
+
834
+ # get letters with TF-IDF cosine similarity score > top score * ratio
835
+ def getTopByScoreRelValue(self, i, ratio) :
836
+ # v = self.QTFIDF[i]
837
+ # vt = v.transpose()
838
+ # scores = self.TFIDF.dot(vt)[:, 0].todense()
839
+ # scores = np.squeeze(np.asarray(scores))
840
+ # df = pd.DataFrame()
841
+ # df[0] = scores
842
+ # df[1] = self.pmfrefs
843
+
844
+ v1 = self.QTFIDF[i]
845
+ v2 = self.AQTFIDF[i]
846
+ df = pd.DataFrame()
847
+ df[0] = self.getScores(v1, v2, i)
848
+ df[1] = self.pmfrefs
849
+
850
+ df.sort_values(0, ascending = False, inplace = True)
851
+ value = df.iloc[0, 0]
852
+ df1 = df.loc[df[0] > value * ratio]
853
+ ids = df1[1]
854
+
855
+ return ids.tolist()
856
+
857
    # Evaluate relative-threshold retrieval (score > best * ratio) against filtered_refs.
    def test_TFIDF_ratio(self, ratio=.9):
        """Print retrieval metrics for result sets above ratio * best score.

        Same scoring convention as test_TFIDF_value; additionally sanity-checks
        that each per-question F1 lies between its recall and precision.
        """
        N = len(self.qtext)
        allhits = 0
        allrefs = 0
        recall = []
        precision = []
        f1 = []
        topsize = []
        count = 0    # NOTE(review): incremented nowhere in this method (unused)

        for i in range(N):
            refs = set(self.filtered_refs[i])    # ground truth for question i
            resp = self.getTopByScoreRelValue(i, ratio)
            serp = set(resp)
            hits = len(refs & serp)
            top = len(resp)
            topsize.append(top)

            tp = hits
            fp = top - tp
            fn = len(refs) - hits

            r = 0
            p = 0
            f = 0

            if tp == 0:
                if fp == 0 and fn == 0:
                    # nothing to find and nothing returned: perfect
                    recall.append(1)
                    precision.append(1)
                    f1.append(1)
                    r = 1
                    p = 1
                    f = 1
                else:
                    recall.append(0)
                    precision.append(0)
                    f1.append(0)

            else:
                recall.append(tp / (tp + fn))
                precision.append(tp / (tp + fp))
                f1.append(2 * tp / (2 * tp + fp + fn))
                r = tp / (tp + fn)
                p = tp / (tp + fp)
                f = 2 * tp / (2 * tp + fp + fn)

            # sanity check: F1 is a mean of r and p, so it must lie between them
            if (f > r and f > p) or (f < r and f < p):
                print('ERROR :', i, r, p, f)

            allhits += hits
            allrefs += len(refs)

        print('\ntotal: ', allhits, allrefs, allhits / (allrefs + .00001))
        print('mean recall:', sum(recall) / len(recall))
        print('mean precision:', sum(precision) / len(precision))
        print('mean F1:', sum(f1) / len(f1))
        print('mean top size: ', sum(topsize) / len(topsize))
919
+
920
+ # def getTopForQuery(self, i, top, query) :
921
+ # v = QTFIDF[i]
922
+ # vt = v.transpose()
923
+ # scores = TFIDF.dot(vt)[:, 0].todense()
924
+ # scores = np.squeeze(np.asarray(scores))
925
+ # df = pd.DataFrame()
926
+ # df[0] = scores
927
+ # df[1] = pmfrefs
928
+
929
+ # df.sort_values(0, ascending = False, inplace = True)
930
+ # # df.sort_values(0, ascending = True, inplace = True)
931
+ # # ids = df.index
932
+ # ids = df[1]
933
+ # # print(df)
934
+
935
+ # return ids[:top].tolist()
936
+
937
def load_everything(self, data_directory = 'data') :
    """Load data and text processing, then build corpora and the TF-IDF index."""
    self.load_basic_data(data_directory=data_directory)
    self.load_text_processing()
    # Smoke-test the analyzer on a sample laced with punctuation noise.
    sample = '|()><.,!?:;=*-/\\8. Форма \n \r Cчета-фактуры и порядок его заполнения, формы и порядок ведения журнала учета полученных и выставленных счетов-фактур, книг покупок и книг продаж устанавливаются Правительством Российской Федерации.'
    print(self.analyze(sample))
    dataset_path = os.path.join(data_directory, 'search_data', 'medium_dataset.json')
    self.load_medium_dataset(path=dataset_path)
    self.create_filtered_refs()
    self.create_corpora()
    print(len(self.pmfcorpus))
    self.create_TFIDF()
948
def test_everything(self) :
    """Run every TF-IDF evaluation variant with its standard threshold."""
    checks = (
        (self.test_TFIDF_top, {'top': 40}),
        (self.test_TFIDF_value, {'value': .2}),
        (self.test_TFIDF_ratio, {'ratio': .9}),
    )
    for check, kwargs in checks :
        check(**kwargs)
953
def search(self, query, top = 10) :
    """Rank all documents against `query` and return the best matches.

    query -- raw query text; it is analyzed/normalized before vectorizing.
    top   -- maximum number of results to return.

    Returns (titles, docs, scores): document ids, document texts and
    similarity scores, best first, each of length <= top.
    """
    analyzed_query = self.analyze(query)

    # Word-level TF-IDF query vector, L2-normalized.
    query_TF = self.vectorizer.transform([analyzed_query])
    query_TFIDF = self.transformer.transform(query_TF)
    n = np.sqrt(query_TFIDF.multiply(query_TFIDF).sum(axis = 1))
    query_TFIDF = query_TFIDF.multiply(sparse.csr_matrix(1 / n))

    # Alternative-tokenization TF-IDF query vector.
    query_ATF = self.avectorizer.transform([analyzed_query])
    query_ATFIDF = self.atransformer.transform(query_ATF)

    v1 = query_TFIDF[0]
    v2 = query_ATFIDF[0]

    df = pd.DataFrame()
    df[0] = self.getScores(v1, v2)
    df[1] = self.pmfrefs

    df.sort_values(0, ascending = False, inplace = True)

    # Truncate BEFORE materializing results.  Previously titles and docs
    # covered the whole corpus while scores was sliced to `top`, which
    # returned inconsistent lengths and fetched every document per query.
    head = df.head(top)
    titles = head[1].tolist()
    docs = [self.dataset_medium[doc_id] for doc_id in titles]
    scores = head[0].tolist()

    return titles, docs, scores
1020
+ # bsearch = BasicSearch('taxcode')
1021
+ # bsearch = BasicSearch('minfin-letters')
1022
+ # bsearch = BasicSearch('fns-letters')
1023
+ # bsearch = BasicSearch('other-laws')
1024
+ # bsearch = BasicSearch('consultations')
1025
+ # bsearch = BasicSearch('all-docs')
1026
+
1027
+ # bsearch.test_TFIDF_top(40)
1028
+
1029
+ # top = 10
1030
+ # query = 'Форма счета-фактуры и порядок его заполнения'
1031
+ # titles, docs, scores = bsearch.search(query, top = top)
1032
+
1033
+ # for i in range(top) :
1034
+ # print()
1035
+ # # print(len(scores), len(titles))
1036
+ # print(i, scores[i])
1037
+ # print(titles[i], ':\n')
1038
+ # print(docs[i][:1000], '...')
Dockerfile CHANGED
@@ -15,7 +15,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y git && \
15
  RUN pip install -U "huggingface_hub[cli]"
16
 
17
  RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true \
18
- huggingface-cli download muryshev/nn-legal-search-data --repo-type dataset --token=$(cat /run/secrets/HF_TOKEN)
 
19
 
20
 
21
  RUN pip install -r requirements.txt
 
15
  RUN pip install -U "huggingface_hub[cli]"
16
 
17
  RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true \
18
+ hf auth login --token $(cat /run/secrets/HF_TOKEN) && \
19
+ hf repo clone muryshev/nn-legal-search-data /var/www/data
20
 
21
 
22
  RUN pip install -r requirements.txt
app.py CHANGED
@@ -1,16 +1,12 @@
1
  import json
2
  import os
3
  from flask import Flask, jsonify, request
4
- from BasicSearchV6 import BasicSearch as BasicSearchV6
5
- from BasicSearchV5 import BasicSearch as BasicSearchV5
6
 
7
- DATA_PATH = os.environ.get("DATA_PATH", "")
8
- DEFAULT_SEARCH_VERSION = os.environ.get("DEFAULT_SEARCH_VERSION", 6)
9
- search_v6 = BasicSearchV6(doctype='all-docs', data_directory=DATA_PATH)
10
- search_v6.test_everything()
11
-
12
- search_v5 = BasicSearchV5(doctype='all-docs', data_directory=DATA_PATH)
13
- search_v5.test_everything()
14
 
15
 
16
  app = Flask(__name__)
@@ -25,14 +21,11 @@ def search_route():
25
  data = request.get_json()
26
  query = data.get('query', '')
27
  top = data.get('top', 10)
28
- version = data.get('version', DEFAULT_SEARCH_VERSION)
29
- if str(version) == "6":
30
- titles, docs, scores = search_v6.search(query, top)
31
- else:
32
- titles, docs, scores = search_v5.search(query, top)
33
  result = [{'title': str(item1), 'text': str(item2), 'relevance': str(item3)} for item1, item2, item3 in zip(titles, docs, scores)]
34
  return jsonify(result)
35
 
36
  if __name__ == '__main__':
37
 
38
- app.run(debug=False, host='0.0.0.0', port=7860)
 
1
  import json
2
  import os
3
  from flask import Flask, jsonify, request
4
+ from BasicSearchV7 import BasicSearch as BasicSearchV7
 
5
 
6
# Resolve the dataset location once; override with the DATA_PATH env var.
DATA_PATH = os.environ.get("DATA_PATH", "./data")
# DEFAULT_SEARCH_VERSION = os.environ.get("DEFAULT_SEARCH_VERSION", 7)
# Build the V7 search index at startup and run its self-checks before serving.
# Pass DATA_PATH through (previously "./data" was hard-coded, so the env var was a no-op).
search_v7 = BasicSearchV7(doctype='all-docs', data_directory=DATA_PATH)
search_v7.test_everything()
 
 
 
10
 
11
 
12
  app = Flask(__name__)
 
21
  data = request.get_json()
22
  query = data.get('query', '')
23
  top = data.get('top', 10)
24
+ # version = data.get('version', DEFAULT_SEARCH_VERSION)
25
+ titles, docs, scores = search_v7.search(query, top)
 
 
 
26
  result = [{'title': str(item1), 'text': str(item2), 'relevance': str(item3)} for item1, item2, item3 in zip(titles, docs, scores)]
27
  return jsonify(result)
28
 
29
if __name__ == '__main__':
    # The previous version bound port 7860 explicitly (the port Hugging Face
    # Spaces routes to); without it Flask defaults to 5000 and the service
    # is unreachable in that environment.
    app.run(debug=False, host='0.0.0.0', port=7860)