muryshev commited on
Commit
936a3f8
·
1 Parent(s): 71dd912
Files changed (9) hide show
  1. .dockerignore +13 -0
  2. .gitignore +162 -0
  3. BasicSearch.py +484 -0
  4. BasicSearchV3.py +847 -0
  5. BasicSearchV5.py +878 -0
  6. BasicSearchV6.py +1025 -0
  7. Dockerfile +34 -0
  8. app.py +35 -0
  9. requirements.txt +6 -0
.dockerignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ *.db
6
+ *.sqlite
7
+ *.log
8
+ .DS_Store
9
+ .env
10
+ venv
11
+ *.bat
12
+ desktop.ini
13
+ data
.gitignore ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+
162
+ data
BasicSearch.py ADDED
@@ -0,0 +1,484 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.feature_extraction.text import CountVectorizer
4
+ from sklearn.feature_extraction.text import TfidfTransformer
5
+ import re
6
+ from xml.dom.minidom import parseString
7
+ import os
8
+ import json
9
+ import nltk
10
+ from nltk.tokenize import word_tokenize
11
+ from nltk.corpus import stopwords
12
+ from nltk.stem.snowball import SnowballStemmer
13
+
14
class BasicSearch:
    """TF-IDF search over a corpus of official tax letters.

    Loads the tax-code / question / answer XML corpora plus a JSON letter
    dataset, builds a TF-IDF index over the letters whose titles match
    ``doctype``, and answers cosine-similarity queries against that index.
    """

    # constructor function
    def __init__(self, doctype = 'minfin-letters') :
        # doctype: 'court-decisions', 'minfin-letters', 'fns-letters' or
        # 'all-letters' — see create_filtered_refs() for the exact patterns.
        self.doctype = doctype
        self.load_everything()

    # read data
    def load_basic_data(self, data_directory = 'data') :
        """Read the XML corpora and the per-question JSON files into memory."""

        def read_fragment(path):
            # The files hold bare XML fragments, so wrap them in a root tag.
            # BUG FIX: files were previously opened without being closed.
            with open(path, "r", encoding="utf-8") as fh:
                payload = fh.read()
            doc = parseString('<data>' + payload + '</data>')
            return doc.getElementsByTagName('title'), doc.getElementsByTagName('text')

        self.title, self.text = read_fragment(os.path.join(data_directory, 'taxcode.xml'))
        self.atitle, self.atext = read_fragment(os.path.join(data_directory, 'K2-answer.xml'))
        self.qtitle, self.qtext = read_fragment(os.path.join(data_directory, 'K2-question.xml'))
        # The two reference files were parsed but their contents never used
        # downstream; keep the reads so a missing file still fails loudly here.
        read_fragment(os.path.join(data_directory, 'references-04-12-2023.xml'))
        read_fragment(os.path.join(data_directory, 'references-Vlad-11-12-2023.xml'))

        # reading Vlad's json data: files named "<index>.json"
        datadir = os.path.join(data_directory, 'data_jsons_20240104')
        filelist = sorted(os.listdir(datadir))

        questions = [''] * len(filelist)
        answers = [''] * len(filelist)
        added_refs = [[] for _ in filelist]    # one list per slot (no aliasing)
        missed_refs = [[] for _ in filelist]
        count = 0
        for filename in filelist :
            x = filename.find('.')
            if x == -1 :
                print('ERROR :', filename)
            if filename[:x].isnumeric() :
                i = int(filename[:x])
                # BUG FIX: the JSON file handle was previously never closed.
                with open(os.path.join(datadir, filename), encoding="utf-8") as f:
                    d = json.load(f)
                refs = set(d['added_refs'].keys())
                refs -= {''}               # drop the empty-key artefact
                questions[i] = d['question']
                answers[i] = d['answer']
                missed_refs[i] = d['refs']
                added_refs[i] = list(refs)
                count += 1

        self.questions = questions[:count]
        self.answers = answers[:count]
        self.added_refs = added_refs[:count]
        self.missed_refs = missed_refs[:count]

    def load_text_processing(self) :
        """Initialise the Russian stop-word list and stemmer.

        Requires the NLTK 'punkt' and 'stopwords' data to already be
        downloaded (the nltk.download calls were commented out upstream).
        """
        self.stop_words = set(stopwords.words('russian'))
        self.stemmer = SnowballStemmer("russian")

    def analyze(self, s) :
        """Normalise text for indexing: strip punctuation/digits, tokenize,
        drop stop words, and stem every remaining token."""
        template = r'[\'\"\.\,\?\!\:\;\-\+\%\^\&\*\@\~\_\=/\\\>\<\#\$\(\)\|\n\r\d]'
        s = re.sub(template, ' ', s)
        s = re.sub(' +', ' ', s)
        tokens = word_tokenize(s)
        tokens = [t for t in tokens if t not in self.stop_words and t != ' ']
        return ' '.join(self.stemmer.stem(word) for word in tokens)

    # load medium dataset
    def load_medium_dataset(self) :
        """Load the title -> letter-text mapping used as the search corpus."""
        # BUG FIX: the file was previously opened without being closed.
        path = os.path.join('data', 'search_data', 'medium_dataset.json')
        with open(path, 'r', encoding="utf-8") as infile:
            self.dataset_medium = json.load(infile)

    # create a filtered list of references for Vlad's json data
    def create_filtered_refs(self) :
        """Keep, per question, only the labelled references matching doctype."""
        doctype = self.doctype
        if doctype == 'court-decisions' :
            doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд)' # courts' decisions
        elif doctype == 'minfin-letters' :
            doctype_template = r'[Пп]исьмо [Мм]инфина' # Minfin letters
        elif doctype == 'fns-letters' :
            doctype_template = r'[Пп]исьмо (ФНС|фнс)' # FNS letters
        elif doctype == 'all-letters' :
            doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс))' # courts' decisions + Minfin letters + FNS letters
        else :
            # NOTE: as in the original, an unknown doctype prints an error and
            # then raises NameError below because doctype_template is unbound.
            print('Error : wrong doctype')

        filtered_refs = []
        for doc_refs in self.added_refs :
            filtered_refs.append(
                [s for s in doc_refs if re.search(doctype_template, s) is not None])

        self.filtered_refs = filtered_refs
        self.doctype_template = doctype_template

    # creating corpora for TF-IDF embedding
    def create_corpora(self) :
        """Build the analysed corpora: questions, tax-code articles, letters."""
        self.qcorpus = []
        for i in range(len(self.qtext)) :
            if not i % 100 : print(i, end = ' ')   # coarse progress indicator
            self.qcorpus.append(self.analyze(self.qtext[i].firstChild.nodeValue))

        self.nkcorpus = []
        for i in range(len(self.text)) :
            if not i % 100 : print(i, end = ' ')
            self.nkcorpus.append(self.analyze(self.text[i].firstChild.nodeValue))

        self.pmfcorpus = []   # analysed letter texts
        self.pmfrefs = []     # letter titles, aligned with pmfcorpus
        self.pmfids = []      # sequential ids of kept letters
        self.items = []       # raw {'title', 'text'} records
        i = 0
        for key, value in self.dataset_medium.items() :
            if re.search(self.doctype_template, key) is not None :
                s = value
                if s is not None :
                    s = s.replace('\n', ' ')
                # Skip extremely long letters (> ~12k words).
                if s is not None and s.count(' ') < 12000 :
                    if not i % 100 : print(i, end = ' ')
                    self.pmfcorpus.append(self.analyze(s))
                    self.pmfrefs.append(key)
                    self.pmfids.append(i)
                    self.items.append({'title' : key, 'text' : value})
                    i += 1

    # build up TF-IDF representation
    def create_TFIDF(self) :
        """Fit the TF-IDF model on the letter corpus and embed the questions."""
        self.vectorizer = CountVectorizer()
        self.transformer = TfidfTransformer(smooth_idf = False, norm = 'l2', sublinear_tf = True)
        X = self.vectorizer.fit_transform(self.pmfcorpus)
        QX = self.vectorizer.transform(self.qcorpus)
        self.TFIDF = self.transformer.fit_transform(X)
        self.QTFIDF = self.transformer.transform(QX)

    def _ranked(self, v) :
        """Return a DataFrame (col 0 = cosine score, col 1 = letter title)
        sorted by descending score for one embedded query vector ``v``."""
        vt = v.transpose()
        scores = self.TFIDF.dot(vt)[:, 0].todense()
        scores = np.squeeze(np.asarray(scores))
        df = pd.DataFrame()
        df[0] = scores
        df[1] = self.pmfrefs
        df.sort_values(0, ascending = False, inplace = True)
        return df

    @staticmethod
    def _prf(tp, fp, fn) :
        """Recall/precision/F1 for one query; an empty result against an empty
        reference set counts as a perfect hit."""
        if tp == 0 :
            if fp == 0 and fn == 0 :
                return 1, 1, 1
            return 0, 0, 0
        return tp / (tp + fn), tp / (tp + fp), 2 * tp / (2 * tp + fp + fn)

    # get top letters sorted by TF-IDF cosine similarity
    def getTop(self, i, top) :
        """Titles of the ``top`` best-scoring letters for question ``i``."""
        df = self._ranked(self.QTFIDF[i])
        return df[1][:top].tolist()

    def test_TFIDF_top(self, top = 40) :
        """Print mean recall/precision/F1 of the fixed-size top search."""
        recall, precision, f1 = [], [], []
        for i in range(len(self.qtext)) :
            refs = set(self.filtered_refs[i])
            serp = set(self.getTop(i, top))
            tp = len(refs & serp)
            r, p, f = self._prf(tp, top - tp, len(refs) - tp)
            recall.append(r)
            precision.append(p)
            f1.append(f)
        print()
        print('mean recall:', sum(recall) / len(recall))
        print('mean precision:', sum(precision) / len(precision))
        print('mean F1:', sum(f1) / len(f1))

    # get letters with TF-IDF cosine similarity score > value
    def getTopByScoreValue(self, i, value) :
        """Titles of letters whose similarity to question ``i`` exceeds ``value``."""
        df = self._ranked(self.QTFIDF[i])
        return df.loc[df[0] > value][1].tolist()

    # calculate metrics for letters with TF-IDF cosine similarity score > value
    def test_TFIDF_value(self, value = .4) :
        """Print metrics for the absolute-threshold search, plus the mean
        result-set size."""
        recall, precision, f1, topsize = [], [], [], []
        count = 0   # queries returning at least one document
        for i in range(len(self.qtext)) :
            refs = set(self.filtered_refs[i])
            resp = self.getTopByScoreValue(i, value)
            top = len(resp)
            topsize.append(top)
            if top > 0 :
                count += 1
            tp = len(refs & set(resp))
            r, p, f = self._prf(tp, top - tp, len(refs) - tp)
            recall.append(r)
            precision.append(p)
            f1.append(f)
        print()
        print('mean recall:', sum(recall) / len(recall))
        print('mean precision:', sum(precision) / len(precision))
        print('mean F1:', sum(f1) / len(f1))
        print('mean top size: ', sum(topsize) / len(topsize))
        # (the original ended with the no-op expression "count, count / 517";
        # removed as dead code)

    # get letters with TF-IDF cosine similarity score > top score * ratio
    def getTopByScoreRelValue(self, i, ratio) :
        """Titles scoring above best-score * ``ratio`` for question ``i``."""
        df = self._ranked(self.QTFIDF[i])
        best = df.iloc[0, 0]
        return df.loc[df[0] > best * ratio][1].tolist()

    # calculate metrics for letters with TF-IDF cosine similarity score > top score * ratio
    def test_TFIDF_ratio(self, ratio = .9) :
        """Print metrics for the relative-threshold search."""
        recall, precision, f1, topsize = [], [], [], []
        for i in range(len(self.qtext)) :
            refs = set(self.filtered_refs[i])
            resp = self.getTopByScoreRelValue(i, ratio)
            top = len(resp)
            topsize.append(top)
            tp = len(refs & set(resp))
            r, p, f = self._prf(tp, top - tp, len(refs) - tp)
            recall.append(r)
            precision.append(p)
            f1.append(f)
            # Sanity check: F1 must lie between recall and precision.
            if (f > r and f > p) or (f < r and f < p) :
                print('ERROR :', i, r, p, f)
        print()
        print('mean recall:', sum(recall) / len(recall))
        print('mean precision:', sum(precision) / len(precision))
        print('mean F1:', sum(f1) / len(f1))
        print('mean top size: ', sum(topsize) / len(topsize))

    def load_everything(self) :
        """Load data, initialise NLP tools, and build the TF-IDF index."""
        self.load_basic_data()
        self.load_text_processing()
        # Smoke test of the analyzer on a punctuation-heavy sample.
        s = '|()><.,!?:;=*-/\\8. Форма \n \r Cчета-фактуры и порядок его заполнения, формы и порядок ведения журнала учета полученных и выставленных счетов-фактур, книг покупок и книг продаж устанавливаются Правительством Российской Федерации.'
        print(self.analyze(s))
        self.load_medium_dataset()
        self.create_filtered_refs()
        self.create_corpora()
        print(len(self.pmfcorpus))
        self.create_TFIDF()

    def test_everything(self) :
        """Run all three evaluation modes with their default settings."""
        self.test_TFIDF_top(top = 40)
        self.test_TFIDF_value(value = .4)
        self.test_TFIDF_ratio(ratio = .9)

    def search(self, query, top = 10) :
        """Search the letter index with a free-text query.

        Returns ``(titles, docs)``: the ``top`` best-matching letter titles
        and the corresponding letter texts.
        """
        analyzed_query = self.analyze(query)
        query_TF = self.vectorizer.transform([analyzed_query])
        query_TFIDF = self.transformer.transform(query_TF)
        df = self._ranked(query_TFIDF[0])
        titles = df[1][:top].tolist()
        # BUG FIX: docs used to be collected from *all* ranked ids (the whole
        # corpus was returned on every query); only the top hits belong here.
        docs = [self.dataset_medium[title] for title in titles]
        return titles, docs
BasicSearchV3.py ADDED
@@ -0,0 +1,847 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # import sklearn
3
+ # from sklearn import metrics
4
+ import pandas as pd
5
+ import numpy as np
6
+ from sklearn.feature_extraction.text import CountVectorizer
7
+ from sklearn.feature_extraction.text import TfidfTransformer
8
+ from scipy import sparse
9
+ import re
10
+ from xml.dom.minidom import parseString #, parse
11
+ import os
12
+ import sys
13
+ import json
14
+ # import nltk
15
+ # from nltk.tokenize import word_tokenize
16
+ # from nltk.corpus import stopwords
17
+ # from nltk.stem.snowball import SnowballStemmer
18
+
19
+ # stemmer class
20
class Porter:
    """Rule-based Russian Porter stemmer.

    All suffix patterns operate on the RV region of the word: everything
    after the first vowel (captured by ``RVRE``).
    """

    PERFECTIVEGROUND = re.compile(u"((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$")
    REFLEXIVE = re.compile(u"(с[яь])$")
    ADJECTIVE = re.compile(u"(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$")
    PARTICIPLE = re.compile(u"((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$")
    VERB = re.compile(u"((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$")
    NOUN = re.compile(u"(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$")
    RVRE = re.compile(u"^(.*?[аеиоуыэюя])(.*)$")
    DERIVATIONAL = re.compile(u".*[^аеиоуыэюя]+[аеиоуыэюя].*ость?$")
    DER = re.compile(u"ость?$")
    SUPERLATIVE = re.compile(u"(ейше|ейш)$")
    I = re.compile(u"и$")
    P = re.compile(u"ь$")
    NN = re.compile(u"нн$")

    @staticmethod
    def stem(word):
        """Return the stem of a lower-case Russian word."""
        word = word.replace(u'ё', u'е')
        m = re.match(Porter.RVRE, word)
        if not (m and m.groups()):
            # No vowel in the word: nothing can be stripped.
            return word
        head, rv = m.group(1), m.group(2)

        # Step 1: perfective gerund; otherwise reflexive suffix followed by
        # adjective(+participle), verb or noun endings.
        stripped = Porter.PERFECTIVEGROUND.sub('', rv, 1)
        if stripped != rv:
            rv = stripped
        else:
            rv = Porter.REFLEXIVE.sub('', rv, 1)
            stripped = Porter.ADJECTIVE.sub('', rv, 1)
            if stripped != rv:
                rv = Porter.PARTICIPLE.sub('', stripped, 1)
            else:
                stripped = Porter.VERB.sub('', rv, 1)
                rv = Porter.NOUN.sub('', rv, 1) if stripped == rv else stripped

        # Step 2: drop a trailing "и".
        rv = Porter.I.sub('', rv, 1)

        # Step 3: derivational suffix ("ость"/"ост").
        if re.match(Porter.DERIVATIONAL, rv):
            rv = Porter.DER.sub('', rv, 1)

        # Step 4: either remove a soft sign, or strip superlative endings and
        # collapse a double "н".
        stripped = Porter.P.sub('', rv, 1)
        if stripped == rv:
            rv = Porter.SUPERLATIVE.sub('', rv, 1)
            rv = Porter.NN.sub(u'н', rv, 1)
        else:
            rv = stripped
        return head + rv
72
+
73
+
74
+
75
+ class BasicSearch:
76
+ # constructor function
77
def __init__(self, doctype = 'minfin-letters', data_directory = 'data') :
    """Create a searcher for the given document type and load everything.

    doctype selects which reference class is indexed; data_directory is the
    root folder containing the XML and JSON corpora.
    """
    # Remember which reference class this instance indexes.
    self.doctype = doctype
    # Load corpora and build the search index up front.
    self.load_everything(data_directory=data_directory)
80
+
81
def read_xml(self, path):
    """Parse an XML fragment file and return ``[title nodes, text nodes]``."""
    with open(path, "r", encoding="utf-8") as handle:
        raw = handle.read()

    # The files contain bare fragments, so wrap them in a synthetic root tag
    # to obtain a well-formed document.
    dom = parseString('<data>' + raw + '</data>')
    return [dom.getElementsByTagName(tag) for tag in ('title', 'text')]
90
+
91
+
92
def getRefsNK(self, s) :
    """Extract tax-code article references of the form " ст.<num> ... нк рф"
    from free text ``s``.

    Returns a deduplicated list of keys "Статья <num>" that are present in
    ``self.refid`` (the article-title -> index map built in load_basic_data).
    NOTE(review): branch nesting reconstructed from a diff view — confirm
    against the original file.
    """
    i = 0            # iteration guard counter
    refs = set()     # collected article keys (deduplicated)
    x = 0            # current scan position in s
    while x != -1 :
        # Find the next " ст." (article abbreviation) from position x.
        x = s.lower().find(' ст.', x)
        if x != -1 :
            # x += 1
            # Look for the following "нк рф" (Tax Code marker).
            y = s.lower().find('нк рф', x)
            if y != -1 :
                # print(i)
                # print(x, y)
                dx = 4
                if s[x + dx] == ' ' :
                    dx = 5    # skip an extra space after " ст."
                # Accept only plausibly short article numbers between markers.
                if y - x <= 13 and y - x > 5 :
                    # print(s[x + 4: y + 5])
                    ref = 'Статья ' + s[x + dx: y - 1]
                    if ref in self.refid :
                        refs.add(ref)
                    x = y      # resume scanning after this match
                else :
                    # print('error: ', s[x + 4: y + 5])
                    x += 1     # markers too far apart: advance and retry
        i += 1
        if i > 1000 :
            # Safety valve against pathological inputs / non-advancing scans.
            break
    return list(refs)
120
+
121
def getRefsNK1(self, s, debug = False, altrefs = set()) :
    """Extract "Статья <num>" references that precede a "нк рф" marker.

    More tolerant than getRefsNK: besides "ст." it also recognises inflected
    forms of "статья"/"статьями" by their endings ("ьей", "ями", "тьи", ...).
    When ``debug`` is true, prints surrounding context for refs that are not
    in ``altrefs``. NOTE: the mutable default ``altrefs=set()`` is only read,
    never mutated, so the shared-default pitfall is not triggered here.
    """
    i = 0
    refs = set()
    x = 0
    slen = len(s)

    s0 = s   # keep the unmodified text for debug context printing
    # Neutralise punctuation that may sit between the number and "нк рф".
    s = s.replace('(',' ')
    s = s.replace(')',' ')
    s = s.replace(';',' ')
    s = s.replace(':',' ')
    s = s.replace(',',' ')

    while x != -1 :
        # print(x)
        # Anchor on the next Tax Code marker.
        x1 = s.lower().find('нк рф', x)
        if x1 == -1 :
            break

        # print(x)
        # Look for an article mention within the 12 characters before it.
        x2 = x1 - 12
        x2 = max(x2, 0)

        x31 = s.lower().find('ст.', x2)
        x32 = s.lower().find('ьей', x2)
        x33 = s.lower().find('ьёй', x2)
        x34 = s.lower().find('ями', x2)
        x35 = s.lower().find('тьи', x2)
        x36 = s.lower().find('тье', x2)

        # Map "not found" to end-of-string so min() picks the earliest hit.
        if x31 == -1 :
            x31 = slen
        if x32 == -1 :
            x32 = slen
        if x33 == -1 :
            x33 = slen
        if x34 == -1 :
            x34 = slen
        if x35 == -1 :
            x35 = slen
        if x36 == -1 :
            x36 = slen

        x3 = min(x31, x32, x33, x34, x35, x36)
        # print(x1, x2, x3)
        # if x3 > x1 :
        #     print('not found: ', s0[x2 : x1 + 5])

        x = x3
        # print(x)

        if x != -1 :
            # x += 1
            y = s.lower().find('нк рф', x)
            if y != -1 :
                # print(i)
                # print(y)
                # print(s)
                dx = 3
                if s[x + dx] == ' ' :
                    dx += 1    # skip an extra space after the abbreviation
                # Accept only plausibly short article numbers between markers.
                if y - x <= 13 and y - x > 4 :
                    # print(s[x + 4: y + 5])
                    ref = 'Статья ' + s[x + dx: y - 1]
                    if ref in self.refid :
                        refs.add(ref)
                        if debug and (ref not in altrefs):
                            print('...' + s0[y - 40 : y + 5])
                    x = y + 1   # resume after this marker
                else :
                    # print('error: ', s[x + 4: y + 5])
                    x += 1

        i += 1
        if i > 1000 :
            # Safety valve against non-advancing scans.
            break
    return list(refs)
198
+
199
def getRefsNK2(self, s, debug = False, altrefs = set()) :
    """Extract "Статья <num>" references preceding a compact "нкрф" marker.

    Variant of getRefsNK1 for texts where the Tax Code marker is written
    without a space ("нкрф") and only the "ст." abbreviation is used.
    When ``debug`` is true, prints context for refs not in ``altrefs``.
    NOTE: ``altrefs=set()`` is a mutable default but is only read here.
    """
    i = 0
    refs = set()
    x = 0
    slen = len(s)

    s0 = s   # keep the unmodified text for debug context printing
    # Neutralise punctuation that may sit between the number and "нкрф".
    s = s.replace('(',' ')
    s = s.replace(')',' ')
    s = s.replace(';',' ')
    s = s.replace(':',' ')
    s = s.replace(',',' ')

    while x != -1 :
        # print(x)
        # Anchor on the next compact Tax Code marker.
        x1 = s.lower().find('нкрф', x)
        if x1 == -1 :
            break

        # print(x)
        # Look for "ст." within the 12 characters before the marker.
        x2 = x1 - 12
        x2 = max(x2, 0)

        x3 = s.lower().find('ст.', x2)

        # print(x1, x2, x3)
        # if x3 > x1 :
        #     print('not found: ', s0[x2 : x1 + 5])

        x = x3
        # print(x)

        if x != -1 :
            # x += 1
            y = s.lower().find('нкрф', x)
            if y != -1 :
                # print(i)
                # print(y)
                # print(s)
                dx = 3
                if s[x + dx] == ' ' :
                    dx += 1    # skip an extra space after "ст."
                # Accept only plausibly short article numbers between markers.
                if y - x <= 13 and y - x > 4 :
                    # print(s[x + 4: y + 5])
                    ref = 'Статья ' + s[x + dx: y - 1]
                    if ref in self.refid :
                        refs.add(ref)
                        if debug and (ref not in altrefs):
                            print('...' + s0[y - 40 : y + 5])
                    x = y + 1   # resume after this marker
                else :
                    # print('error: ', s[x + 4: y + 5])
                    x += 1

        i += 1
        if i > 1000 :
            # Safety valve against non-advancing scans.
            break
    return list(refs)
257
+
258
+ # read data
259
def load_basic_data(self, data_directory = 'data') :
    """Read all corpora and build the article-reference lookup tables.

    Populates: title/text (tax code), atitle/atext (answers), qtitle/qtext
    (questions), refid/titleref/idref (article-key maps), nk_refs (per-question
    normalised tax-code refs), and questions/answers/added_refs/missed_refs
    from the per-question JSON files.
    """

    self.title, self.text = self.read_xml(os.path.join(data_directory, 'taxcode.xml'))
    self.atitle, self.atext = self.read_xml(os.path.join(data_directory, 'K2-answer.xml'))
    self.qtitle, self.qtext = self.read_xml(os.path.join(data_directory, 'K2-question.xml'))

    _, reftext = self.read_xml(os.path.join(data_directory, 'references-04-12-2023.xml'))
    _, reftext2 = self.read_xml(os.path.join(data_directory, 'references-Vlad-11-12-2023.xml')) # reftext2 is unused

    # One slot per question; every slot is reassigned below, so the shared
    # placeholder created by list multiplication is harmless here.
    reflist = [set()] * len(self.qtitle)
    reflist1 = [set()] * len(self.qtitle)
    qreflist = [set()] * len(self.qtitle)

    def getRefNK(s) :
        # Article key = title text up to the first ". " or " (".
        x = s.find('. ')
        y = s.find(' (')
        if x == -1 :
            x = sys.maxsize
        if y == -1 :
            y = sys.maxsize
        x = min(x, y)
        id = s[:x]
        return id

    # Build article lookup tables: key -> index, full title -> key, index -> key.
    self.refid = {}
    self.titleref = {}
    self.idref = [0] * len(self.title)
    for i in range(len(self.title)) :
        s = self.title[i].firstChild.nodeValue
        id = getRefNK(s)
        self.refid[id] = i
        self.titleref[s] = id
        self.idref[i] = id

    # Collect article references found in each answer, question and
    # reference document.
    for i in range(len(self.qtext)) :
        # for i in range(1,2) :
        doctext = self.atext[i].firstChild.nodeValue
        qdoctext = self.qtext[i].firstChild.nodeValue
        refdoctext = reftext[i].firstChild.nodeValue
        refs = self.getRefsNK1(doctext)
        qrefs = self.getRefsNK1(qdoctext)
        refs1 = self.getRefsNK2(refdoctext)
        # print(refs, qrefs)
        intrefs = []
        intrefs1 = []
        intqrefs = []
        for ref in refs :
            intrefs.append(self.refid[ref])
        for ref in refs1 :
            intrefs1.append(self.refid[ref])
        for ref in qrefs :
            intqrefs.append(self.refid[ref])
        reflist[i] = set(intrefs)
        reflist1[i] = set(intrefs1)
        qreflist[i] = set(intqrefs)

    # Merge answer-derived and reference-file-derived article sets.
    for i in range(len(reflist)) :
        reflist[i] |= reflist1[i]

    self.nk_refs = []

    # Normalise each article id into the compact "ст.<num> НКРФ" form.
    for i in range(len(reflist)) :
        refs = list(reflist[i])
        newrefs = []
        for j in range(len(refs)) :
            ref = self.idref[refs[j]]
            m = re.search('(\d+\.\d+|\d+)', ref)
            s = ref[m.start() : m.end()]
            ref1 = 'ст.' + s + ' НКРФ'
            newrefs.append(ref1)

        self.nk_refs.append(newrefs)

    # reading Vlad's json data: files named "<index>.json"
    datadir = os.path.join(data_directory, 'data_jsons_20240104')
    filelist = os.listdir(datadir)
    filelist = [x for x in filelist if re.search(r'\d+.json', x)]
    filelist.sort()

    questions = [''] * len(filelist)
    answers = [''] * len(filelist)
    added_refs = [[]] * len(filelist)    # slots reassigned below, aliasing harmless
    missed_refs = [[]] * len(filelist)
    count = 0
    for filename in filelist :
        x = filename.find('.')
        if x == -1 :
            print('ERROR :', filename)
        if filename[:x].isnumeric() :
            i = int(filename[:x])
            # print(i)
            with open(os.path.join(datadir, filename), 'r', encoding='utf-8') as f:
                d = json.load(f)
            refs = set(d['added_refs'].keys())
            refs -= {''}    # drop the empty-key artefact
            refs = list(refs)
            questions[i] = d['question']
            answers[i] = d['answer']
            missed_refs[i] = d['refs']
            added_refs[i] = refs
            count += 1

    # Unlike the V1 loader, the lists are intentionally NOT truncated to
    # ``count`` here (the [:count] slices are commented out).
    self.questions = questions#[:count]
    self.answers = answers#[:count]
    self.added_refs = added_refs#[:count]
    self.missed_refs = missed_refs#[:count]
378
+
379
+
380
+
381
+
382
+
383
    def load_text_processing(self) :
        """Initialise text-processing resources.

        Sets ``self.stop_words`` (hard-coded Russian stop-word list) and
        ``self.stemmer`` (rule-based Porter stemmer).  Earlier variants
        used nltk stopwords / spacy lemmatisation / SnowballStemmer; the
        hard-coded set avoids the nltk download at startup.
        """
        self.stop_words = {'а', 'без', 'более', 'больше', 'будет', 'будто', 'бы', 'был', 'была', 'были', 'было', 'быть', 'в', 'вам', 'вас', 'вдруг', 'ведь', 'во', 'вот', 'впрочем', 'все', 'всегда', 'всего', 'всех', 'всю', 'вы', 'где', 'да', 'даже', 'два', 'для', 'до', 'другой', 'его', 'ее', 'ей', 'ему', 'если', 'есть', 'еще', 'ж', 'же', 'за', 'зачем', 'здесь', 'и', 'из', 'или', 'им', 'иногда', 'их', 'к', 'как', 'какая', 'какой', 'когда', 'конечно', 'кто', 'куда', 'ли', 'лучше', 'между', 'меня', 'мне', 'много', 'может', 'можно', 'мой', 'моя', 'мы', 'на', 'над', 'надо', 'наконец', 'нас', 'не', 'него', 'нее', 'ней', 'нельзя', 'нет', 'ни', 'нибудь', 'никогда', 'ним', 'них', 'ничего', 'но', 'ну', 'о', 'об', 'один', 'он', 'она', 'они', 'опять', 'от', 'перед', 'по', 'под', 'после', 'потом', 'потому', 'почти', 'при', 'про', 'раз', 'разве', 'с', 'сам', 'свою', 'себе', 'себя', 'сейчас', 'со', 'совсем', 'так', 'такой', 'там', 'тебя', 'тем', 'теперь', 'то', 'тогда', 'того', 'тоже', 'только', 'том', 'тот', 'три', 'тут', 'ты', 'у', 'уж', 'уже', 'хорошо', 'хоть', 'чего', 'чем', 'через', 'что', 'чтоб', 'чтобы', 'чуть', 'эти', 'этого', 'этой', 'этом', 'этот', 'эту', 'я'}
        self.stemmer = Porter()
394
+
395
+ def analyze(self, s) :
396
+ template = r'[\'\"\.\,\?\!\:\;\-\+\%\^\&\*\@\~\_\=/\\\>\<\#\$\(\)\|\n\r\d]'
397
+ s = re.sub(template, ' ', s)
398
+ s = re.sub(' +', ' ', s)
399
+ # tokens = nlp(s)
400
+ # tokens = [str(t.lemma_) for t in tokens]
401
+ # tokens = word_tokenize(s)
402
+ tokens = s.strip().lower().split(' ')
403
+ # tokens = [t for t in tokens if t not in self.stop_words and t != ' ']
404
+ # tokens = [self.stemmer.stem(word) for word in tokens]
405
+ tokens = [self.stemmer.stem(word) for word in tokens if word not in self.stop_words]
406
+ newtext = ' '.join(tokens)
407
+ return newtext
408
+
409
+ # load medium dataset
410
+ def load_medium_dataset(self, path) :
411
+ # global dataset_medium
412
+ with open(path, 'r', encoding='utf-8') as infile:
413
+ self.dataset_medium = json.load(infile)
414
+
415
+ # create a filtered list of references for Vlad's json data
416
+ def create_filtered_refs(self) :
417
+ doctype = self.doctype
418
+ added_refs = self.added_refs
419
+ # global filtered_refs
420
+ # global doctype_template
421
+
422
+ # t = r'(НКРФ|ГКРФ|ТКРФ|ФЗ|[Зз]акон|Минфин|ФНС|Правительства|ФАС|АС|КС|ВС|[Сс]удебн|[Сс]уд)'
423
+ if doctype == 'court-decisions' :
424
+ doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд)' # courts' decisions
425
+ ref_template = doctype_template
426
+ elif doctype == 'minfin-letters' :
427
+ doctype_template = r'[Пп]исьмо [Мм]инфина' # Minfin letters
428
+ ref_template = doctype_template
429
+ elif doctype == 'fns-letters' :
430
+ doctype_template = r'[Пп]исьмо (ФНС|фнс)' # FNS letters
431
+ ref_template = doctype_template
432
+ elif doctype == 'all-letters' :
433
+ doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс))' # courts' decisions + Minfin letters + FNS letters
434
+ ref_template = doctype_template
435
+ elif doctype == 'taxcode' :
436
+ doctype_template = r'^ст.(\d+\.\d+|\d+) НКРФ'
437
+ ref_template = r'ст.(\d+\.\d+|\d+) НКРФ' # taxcode ref formst differs from doctype format
438
+ elif doctype == 'all-docs' :
439
+ doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс)|^ст.(\d+\.\d+|\d+) НКРФ)' # courts' decisions + Minfin letters + FNS letters + taxcode
440
+ ref_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс)|ст.(\d+\.\d+|\d+) НКРФ)' # taxcode ref formst differs from doctype format
441
+ else :
442
+ print('Error : wrong doctype "' + doctype + '"')
443
+
444
+ filtered_refs = []
445
+ nk_mask = []
446
+ for i in range(len(added_refs)) :
447
+ refs = []
448
+ for j in range(len(added_refs[i])) :
449
+ s = added_refs[i][j]
450
+ if re.search(ref_template, s) != None:
451
+ m = re.search(r'ст.(\d+\.\d+|\d+) НКРФ', s)
452
+ if m != None :
453
+ s = s[m.start() : m.end()]
454
+
455
+ if s in self.dataset_medium :
456
+ refs.append(s)
457
+ # print(i, j, s)
458
+
459
+ if doctype_template.find('НКРФ') != -1 :
460
+ refs += self.nk_refs[i]
461
+
462
+ refs = list(set(refs))
463
+ filtered_refs.append(refs)
464
+
465
+ self.filtered_refs = filtered_refs
466
+ self.doctype_template = doctype_template
467
+
468
    # creating corpora for TF-IDF embedding
    def create_corpora(self) :
        """Build the stemmed corpora used for the TF-IDF index.

        Populates:
          qcorpus    -- analyzed question texts (one string per question);
          pmfcorpus  -- analyzed texts of medium-dataset documents whose
                        title matches self.doctype_template;
          pmfrefs    -- titles of the kept documents;
          pmfids     -- sequential ids of the kept documents;
          pmflengths -- token-count proxy (space count) per analyzed text;
          nk_mask    -- 1 if the title is a tax-code article, else 0;
          items      -- [{'title':..., 'text':...}] for the kept documents.

        Prints a progress counter every 100 entries.
        """
        self.qcorpus = []
        for i in range(len(self.qtext)) :
            if not i % 100 : print(i, end = ' ')  # progress indicator
            s = self.qtext[i].firstChild.nodeValue
            s = self.analyze(s)
            self.qcorpus.append(s)

        # (a previous variant also built a corpus over the raw tax-code
        # texts here -- self.nkcorpus -- now disabled)

        self.pmfcorpus = []
        self.pmfrefs = []
        self.pmfids = []
        self.pmflengths = []
        self.nk_mask = []

        i = 0
        self.items = []
        for key, value in self.dataset_medium.items() :
            # keep only documents of the configured doctype
            if re.search(self.doctype_template, key) != None :
                s = value
                ss = key
                if s != None :
                    s = s.replace('\n', ' ')
                # skip empty or single-token documents
                if s != None and s.count(' ') :
                    if not i % 100 : print(i, end = ' ')  # progress indicator
                    s = self.analyze(s)
                    self.pmfcorpus.append(s)
                    self.pmfrefs.append(ss)
                    self.pmfids.append(i)
                    self.items.append({'title' : key, 'text' : value})
                    self.pmflengths.append(s.count(' '))
                    mask = 0
                    if ss.find('НКРФ') != -1 :
                        mask = 1
                    self.nk_mask.append(mask)
                    i += 1
522
+
523
    # build up TF-IDF representation
    def create_TFIDF(self) :
        """Fit the TF-IDF model on the document corpus and embed both the
        documents (``self.TFIDF``) and the questions (``self.QTFIDF``).

        Rows are L2-normalised by hand (``norm=None`` in the transformer)
        so that the pre-normalisation row norms of the document matrix can
        be kept in ``self.norm`` -- they serve as a document-length signal
        during re-ranking in getTop()/search().
        """
        self.vectorizer = CountVectorizer()
        # norm=None on purpose: normalisation is done manually below
        self.transformer = TfidfTransformer(smooth_idf = False, norm = None, sublinear_tf = True)

        X = self.vectorizer.fit_transform(self.pmfcorpus)
        QX = self.vectorizer.transform(self.qcorpus)
        self.TFIDF = self.transformer.fit_transform(X)
        self.QTFIDF = self.transformer.transform(QX)

        # row-wise L2 normalisation of both matrices.
        # NOTE(review): an all-zero row would cause a division by zero here.
        n = np.sqrt(self.TFIDF.multiply(self.TFIDF).sum(axis = 1))
        self.TFIDF = self.TFIDF.multiply(sparse.csr_matrix(1 / n))
        self.norm = n.flatten().tolist()[0]  # keep raw document norms
        n = np.sqrt(self.QTFIDF.multiply(self.QTFIDF).sum(axis = 1))
        self.QTFIDF = self.QTFIDF.multiply(sparse.csr_matrix(1 / n))
554
+
555
    # get top letters sorted by TF-IDF cosine similarity
    def getTop(self, i, top) :
        """Return the titles of the *top* best documents for question *i*,
        ranked by adjusted cosine similarity.

        The raw cosine score is boosted by document length
        (``log(norm) ** alpha``) and by the tax-code mask
        (``* (1 + beta)`` and ``+ gamma``) -- the same heuristic is
        duplicated in search(); keep the two in sync.
        """
        v = self.QTFIDF[i]
        vt = v.transpose()
        scores = self.TFIDF.dot(vt)[:, 0].todense()
        scores = np.squeeze(np.asarray(scores))
        df = pd.DataFrame()
        df[0] = scores
        df[1] = self.pmfrefs
        df[2] = self.norm      # pre-normalisation document vector lengths
        df[3] = self.nk_mask   # 1 for tax-code articles, 0 otherwise

        # empirically tuned re-ranking constants
        alpha = 1.15
        beta = .2
        gamma = .4
        df[0] *= np.log(df[2]) ** alpha
        df[0] *= (1 + df[3] * beta)
        df[0] += df[3] * gamma

        df.sort_values(0, ascending = False, inplace = True)
        ids = df[1]

        return ids[:top].tolist()
586
+
587
    def test_TFIDF_top(self, top = 40, metric = '') :
        """Evaluate top-*top* retrieval against the expected references.

        Prints the overall hit ratio and mean recall / precision / F1.
        With ``metric == 'corrected'`` questions with zero hits still
        contribute to the averages (scored 1.0 when nothing was expected
        and nothing relevant existed, 0.0 otherwise); otherwise questions
        with no expected references are simply skipped.
        """
        N = len(self.qtext)
        allhits = 0
        allrefs = 0
        recall = []
        precision = []
        f1 = []

        for i in range(N) :
            refs = set(self.filtered_refs[i])
            resp = self.getTop(i, top)
            serp = set(resp)
            hits = len(refs & serp)

            allhits += hits
            allrefs += len(refs)

            tp = hits
            fp = top - tp   # NOTE(review): assumes len(resp) == top
            fn = len(refs) - hits

            if tp == 0 and metric == 'corrected':
                if fp == 0 and fn == 0 :
                    # nothing expected, nothing possible: perfect score
                    recall.append(1)
                    precision.append(1)
                    f1.append(1)
                else :
                    recall.append(0)
                    precision.append(0)
                    f1.append(0)

            elif tp + fn > 0 :
                recall.append(tp / (tp + fn))
                precision.append(tp / (tp + fp))
                f1.append(2 * tp / (2 * tp + fp + fn))

        # the +.00001 guards against allrefs == 0
        print('\ntotal: ', allhits, allrefs, allhits / (allrefs + .00001))
        print('mean recall:', sum(recall) / len(recall))
        print('mean precision:', sum(precision) / len(precision))
        print('mean F1:', sum(f1) / len(f1))
630
+
631
+ # get letters with TF-IDF cosine similarity score > value
632
+ def getTopByScoreValue(self, i, value) :
633
+ v = self.QTFIDF[i]
634
+ vt = v.transpose()
635
+ scores = self.TFIDF.dot(vt)[:, 0].todense()
636
+ scores = np.squeeze(np.asarray(scores))
637
+
638
+ df = pd.DataFrame()
639
+ df[0] = scores
640
+ df[1] = self.pmfrefs
641
+
642
+ df.sort_values(0, ascending = False, inplace = True)
643
+
644
+ df1 = df.loc[df[0] > value]
645
+ ids = df1[1]
646
+
647
+ return ids.tolist()
648
+
649
+ # calculate metrics for letters with TF-IDF cosine similarity score > value
650
+
651
+ def test_TFIDF_value(self, value = .4) :
652
+ N = len(self.qtext)
653
+ allhits = 0
654
+ allrefs = 0
655
+ recall = []
656
+ precision = []
657
+ f1 = []
658
+ topsize = []
659
+ count = 0
660
+
661
+ for i in range(N) :
662
+ # if not i % 10 : print(i, end = ' ')
663
+ refs = set(self.filtered_refs[i])
664
+ resp = self.getTopByScoreValue(i, value)
665
+ serp = set(resp)
666
+ hits = len(refs & serp)
667
+ top = len(resp)
668
+ topsize.append(top)
669
+
670
+ if top > 0 :
671
+ count += 1
672
+
673
+ tp = hits
674
+ fp = top - tp
675
+ fn = len(refs) - hits
676
+
677
+ if tp == 0 :
678
+ if fp == 0 and fn == 0 :
679
+ recall.append(1)
680
+ precision.append(1)
681
+ f1.append(1)
682
+ else :
683
+ recall.append(0)
684
+ precision.append(0)
685
+ f1.append(0)
686
+
687
+ else :
688
+ recall.append(tp / (tp + fn))
689
+ precision.append(tp / (tp + fp))
690
+ f1.append(2 * tp / (2 * tp + fp + fn))
691
+
692
+ print()
693
+ print('mean recall:', sum(recall) / len(recall))
694
+ print('mean precision:', sum(precision) / len(precision))
695
+ print('mean F1:', sum(f1) / len(f1))
696
+ print('mean top size: ', sum(topsize) / len(topsize))
697
+ count, count / 517
698
+
699
+ # get letters with TF-IDF cosine similarity score > top score * ratio
700
+ def getTopByScoreRelValue(self, i, ratio) :
701
+ v = self.QTFIDF[i]
702
+ vt = v.transpose()
703
+ scores = self.TFIDF.dot(vt)[:, 0].todense()
704
+ scores = np.squeeze(np.asarray(scores))
705
+ df = pd.DataFrame()
706
+ df[0] = scores
707
+ df[1] = self.pmfrefs
708
+
709
+ df.sort_values(0, ascending = False, inplace = True)
710
+ value = df.iloc[0, 0]
711
+ df1 = df.loc[df[0] > value * ratio]
712
+ ids = df1[1]
713
+
714
+ return ids.tolist()
715
+
716
    # calculate metrics for letters with TF-IDF cosine similarity score > top score * ratio
    def test_TFIDF_ratio(self, ratio = .9) :
        """Evaluate retrieval when documents are selected relative to the
        best score (score > best * ratio).

        Prints mean recall / precision / F1 and the mean result-set size;
        also sanity-checks per question that F1 lies between recall and
        precision, printing 'ERROR' otherwise.
        """
        N = len(self.qtext)
        allhits = 0   # NOTE(review): never accumulated or printed; kept as-is
        allrefs = 0
        recall = []
        precision = []
        f1 = []
        topsize = []
        count = 0     # NOTE(review): unused in this variant; kept as-is

        for i in range(N) :
            refs = set(self.filtered_refs[i])
            resp = self.getTopByScoreRelValue(i, ratio)
            serp = set(resp)
            hits = len(refs & serp)
            top = len(resp)
            topsize.append(top)

            tp = hits
            fp = top - tp
            fn = len(refs) - hits

            r = 0
            p = 0
            f = 0

            if tp == 0 :
                if fp == 0 and fn == 0 :
                    # nothing expected, nothing returned: perfect score
                    recall.append(1)
                    precision.append(1)
                    f1.append(1)
                    r = 1
                    p = 1
                    f = 1
                else :
                    recall.append(0)
                    precision.append(0)
                    f1.append(0)

            else :
                recall.append(tp / (tp + fn))
                precision.append(tp / (tp + fp))
                f1.append(2 * tp / (2 * tp + fp + fn))
                r = tp / (tp + fn)
                p = tp / (tp + fp)
                f = 2 * tp / (2 * tp + fp + fn)

            # the harmonic mean must lie between recall and precision
            if (f > r and f > p) or (f < r and f < p) :
                print('ERROR :', i, r, p, f)

        print()
        print('mean recall:', sum(recall) / len(recall))
        print('mean precision:', sum(precision) / len(precision))
        print('mean F1:', sum(f1) / len(f1))
        print('mean top size: ', sum(topsize) / len(topsize))
774
+
775
+ # def getTopForQuery(self, i, top, query) :
776
+ # v = QTFIDF[i]
777
+ # vt = v.transpose()
778
+ # scores = TFIDF.dot(vt)[:, 0].todense()
779
+ # scores = np.squeeze(np.asarray(scores))
780
+ # df = pd.DataFrame()
781
+ # df[0] = scores
782
+ # df[1] = pmfrefs
783
+
784
+ # df.sort_values(0, ascending = False, inplace = True)
785
+ # # df.sort_values(0, ascending = True, inplace = True)
786
+ # # ids = df.index
787
+ # ids = df[1]
788
+ # # print(df)
789
+
790
+ # return ids[:top].tolist()
791
+
792
    def load_everything(self, data_directory = 'data') :
        """Run the full loading pipeline: raw corpora, text-processing
        resources, the medium dataset, filtered references, analyzed
        corpora and the TF-IDF index.

        Prints a sample of analyze() output and the corpus size as a
        quick smoke check.
        """
        self.load_basic_data(data_directory=data_directory)
        self.load_text_processing()
        # smoke test of the analyzer on a deliberately noisy sample string
        s = '|()><.,!?:;=*-/\\8. Форма \n \r Cчета-фактуры и порядок его заполнения, формы и порядок ведения журнала учета полученных и выставленных счетов-фактур, книг покупок и книг продаж устанавливаются Правительством Российской Федерации.'
        print(self.analyze(s))
        self.load_medium_dataset(path=os.path.join(data_directory, 'search_data', 'medium_dataset.json'))
        self.create_filtered_refs()
        self.create_corpora()
        print(len(self.pmfcorpus))
        self.create_TFIDF()
802
+
803
    def test_everything(self) :
        """Run all three evaluation strategies (absolute top-k, absolute
        threshold, relative threshold); prints metrics, returns nothing."""
        self.test_TFIDF_top(top = 40)
        self.test_TFIDF_value(value = .2)
        self.test_TFIDF_ratio(ratio = .9)
807
+
808
+ def search(self, query, top = 10) :
809
+ analyzed_query = self.analyze(query)
810
+ query_TF = self.vectorizer.transform([analyzed_query])
811
+ query_TFIDF = self.transformer.transform(query_TF)
812
+ v = query_TFIDF[0]
813
+ vt = v.transpose()
814
+ scores = self.TFIDF.dot(vt)[:, 0].todense()
815
+ scores = np.squeeze(np.asarray(scores))
816
+ df = pd.DataFrame()
817
+ df[0] = scores
818
+ df[1] = self.pmfrefs
819
+ df[2] = self.norm
820
+ df[3] = self.nk_mask
821
+ # alpha = 1.15
822
+ # beta = .43
823
+ # gamma = .2
824
+ alpha = 1.15 # for top 10
825
+ beta = .2 # for top 10
826
+ gamma = .4 # for top 10
827
+ df[0] *= np.log(df[2]) ** alpha
828
+ df[0] *= (1 + df[3] * beta)
829
+ df[0] += df[3] * gamma
830
+
831
+ df.sort_values(0, ascending = False, inplace = True)
832
+ # df.sort_values(0, ascending = True, inplace = True)
833
+ # ids = df.index
834
+ ids = df[1]
835
+ # print(df)
836
+ titles = ids[:top].tolist()
837
+ docs = []
838
+ for i in range(len(titles)) :
839
+ id = df.iloc[i, 1]
840
+ docs.append(self.dataset_medium[id])
841
+ # print()
842
+ # print (i, df.iloc[i, 0], id)
843
+ # print(self.dataset_medium[id])
844
+
845
+ scores = df[0][:top].tolist()
846
+
847
+ return titles, docs, scores
BasicSearchV5.py ADDED
@@ -0,0 +1,878 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.feature_extraction.text import CountVectorizer
5
+ from sklearn.feature_extraction.text import TfidfTransformer
6
+ from scipy import sparse
7
+ import re
8
+ from xml.dom.minidom import parseString #, parse
9
+ import os
10
+ import sys
11
+ import json
12
+
13
# Re-ranking hyper-parameter defaults.
# NOTE(review): the scoring methods below define their own local
# alpha/beta/gamma, so these module-level values appear unused in the
# visible code; delta and epsilon are not referenced here at all --
# confirm against the rest of the file before removing.
alpha = 1.15
beta = .2
gamma = .4
delta = .31
epsilon = 0
+
19
# stemmer class: rule-based Porter stemmer for Russian
class Porter:
    """Static rule-based stemmer for Russian words.

    The suffix patterns operate on the part of the word after the first
    vowel (the RV region of the Porter algorithm).  Use via
    ``Porter.stem(word)``; the input is expected to be lower-case.
    """
    PERFECTIVEGROUND = re.compile(u"((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$")
    REFLEXIVE = re.compile(u"(с[яь])$")
    ADJECTIVE = re.compile(u"(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$")
    PARTICIPLE = re.compile(u"((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$")
    VERB = re.compile(u"((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$")
    NOUN = re.compile(u"(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$")
    RVRE = re.compile(u"^(.*?[аеиоуыэюя])(.*)$")
    DERIVATIONAL = re.compile(u".*[^аеиоуыэюя]+[аеиоуыэюя].*ость?$")
    DER = re.compile(u"ость?$")
    SUPERLATIVE = re.compile(u"(ейше|ейш)$")
    I = re.compile(u"и$")
    P = re.compile(u"ь$")
    NN = re.compile(u"нн$")

    @staticmethod
    def stem(word):
        """Return the stem of *word*.

        'ё' is folded to 'е' first; a word without any vowel is returned
        unchanged.  (The original bound this with the pre-2.4
        ``stem = staticmethod(stem)`` idiom; behavior is identical.)
        """
        # word = word.lower()   # callers already lower-case their input
        word = word.replace(u'ё', u'е')
        m = re.match(Porter.RVRE, word)
        if m and m.groups():
            pre = m.group(1)   # prefix up to and including the first vowel
            rv = m.group(2)    # region the suffix rules operate on
            # step 1: perfective gerund, else reflexive + adjective /
            # participle / verb / noun endings
            temp = Porter.PERFECTIVEGROUND.sub('', rv, 1)
            if temp == rv:
                rv = Porter.REFLEXIVE.sub('', rv, 1)
                temp = Porter.ADJECTIVE.sub('', rv, 1)
                if temp != rv:
                    rv = temp
                    rv = Porter.PARTICIPLE.sub('', rv, 1)
                else:
                    temp = Porter.VERB.sub('', rv, 1)
                    if temp == rv:
                        rv = Porter.NOUN.sub('', rv, 1)
                    else:
                        rv = temp
            else:
                rv = temp

            # step 2: drop a trailing 'и'
            rv = Porter.I.sub('', rv, 1)

            # step 3: derivational suffix '-ость'
            if re.match(Porter.DERIVATIONAL, rv):
                rv = Porter.DER.sub('', rv, 1)

            # step 4: soft sign, else superlative and double 'н'
            temp = Porter.P.sub('', rv, 1)
            if temp == rv:
                rv = Porter.SUPERLATIVE.sub('', rv, 1)
                rv = Porter.NN.sub(u'н', rv, 1)
            else:
                rv = temp
            word = pre + rv
        return word
+
73
+
74
+
75
+ class BasicSearch:
76
    # constructor function
    def __init__(self, doctype = 'minfin-letters', data_directory = './') :
        """Build the search engine for the given *doctype*: loads the XML
        and JSON corpora from *data_directory* and fits the TF-IDF index
        (slow; performs file I/O and printing)."""
        self.doctype = doctype
        self.load_everything(data_directory=data_directory)
80
+
81
+ def read_xml(self, path):
82
+ with open(path, "r", encoding="utf-8") as text_file:
83
+ data = text_file.read()
84
+
85
+ document = parseString('<data>' + data + '</data>')
86
+ return [
87
+ document.getElementsByTagName('title'),
88
+ document.getElementsByTagName('text')
89
+ ]
90
+
91
+
92
    def getRefsNK(self, s) :
        """Scan *s* for references shaped like ' ст.<N> ... нк рф' and
        return the matching tax-code article titles ('Статья <N>') that
        exist in ``self.refid``.

        NOTE(review): when 'нк рф' never follows a found ' ст.' the scan
        position does not advance; the ``i > 1000`` guard below is what
        terminates the loop in that case.
        """
        i = 0
        refs = set()
        x = 0
        while x != -1 :
            x = s.lower().find(' ст.', x)
            if x != -1 :
                y = s.lower().find('нк рф', x)
                if y != -1 :
                    # skip an optional space after ' ст.'
                    dx = 4
                    if s[x + dx] == ' ' :
                        dx = 5
                    # plausible article numbers sit 6..13 chars before the marker
                    if y - x <= 13 and y - x > 5 :
                        ref = 'Статья ' + s[x + dx: y - 1]
                        if ref in self.refid :
                            refs.add(ref)
                        x = y   # continue scanning after this match
                    else :
                        x += 1  # spurious hit: advance one char and retry
            i += 1
            if i > 1000 :   # safety guard against a stuck scan position
                break
        return list(refs)
120
+
121
    def getRefsNK1(self, s, debug = False, altrefs = set()) :
        """Mine tax-code references from *s*, anchoring on the 'нк рф'
        marker and then searching a short window before it for several
        spellings/inflections of 'статья' ('ст.', 'ьей', 'ьёй', 'ями',
        'тьи', 'тье').  Returns the matching 'Статья <N>' titles known
        to ``self.refid``.

        Punctuation around references is blanked out before scanning so
        it does not end up inside the extracted article number.  With
        ``debug=True``, matches absent from *altrefs* print 45 chars of
        context from the original string.

        NOTE(review): ``altrefs=set()`` is a shared mutable default; it is
        only read here, never mutated, so it is safe as written.
        """
        i = 0
        refs = set()
        x = 0
        slen = len(s)

        # keep the original string for debug output, then blank punctuation
        s0 = s
        s = s.replace('(',' ')
        s = s.replace(')',' ')
        s = s.replace(';',' ')
        s = s.replace(':',' ')
        s = s.replace(',',' ')

        while x != -1 :
            x1 = s.lower().find('нк рф', x)
            if x1 == -1 :
                break

            # search window starts at most 12 chars before the marker
            x2 = x1 - 12
            x2 = max(x2, 0)

            # candidate starts for the different 'статья' spellings
            x31 = s.lower().find('ст.', x2)
            x32 = s.lower().find('ьей', x2)
            x33 = s.lower().find('ьёй', x2)
            x34 = s.lower().find('ями', x2)
            x35 = s.lower().find('тьи', x2)
            x36 = s.lower().find('тье', x2)

            # map 'not found' to end-of-string so min() ignores it
            if x31 == -1 :
                x31 = slen
            if x32 == -1 :
                x32 = slen
            if x33 == -1 :
                x33 = slen
            if x34 == -1 :
                x34 = slen
            if x35 == -1 :
                x35 = slen
            if x36 == -1 :
                x36 = slen

            x3 = min(x31, x32, x33, x34, x35, x36)

            x = x3

            if x != -1 :
                y = s.lower().find('нк рф', x)
                if y != -1 :
                    # skip the 3-char stem plus an optional space
                    dx = 3
                    if s[x + dx] == ' ' :
                        dx += 1
                    # plausible article numbers sit 5..13 chars before the marker
                    if y - x <= 13 and y - x > 4 :
                        ref = 'Статья ' + s[x + dx: y - 1]
                        if ref in self.refid :
                            refs.add(ref)
                            if debug and (ref not in altrefs):
                                print('...' + s0[y - 40 : y + 5])
                        x = y + 1   # continue after this marker
                    else :
                        x += 1      # spurious hit: advance and retry

            i += 1
            if i > 1000 :   # safety guard against a stuck scan position
                break
        return list(refs)
198
+
199
    def getRefsNK2(self, s, debug = False, altrefs = set()) :
        """Variant of getRefsNK1 for texts using the compact 'нкрф'
        marker; only the 'ст.' spelling is recognised before it.
        Returns the matching 'Статья <N>' titles known to ``self.refid``.

        NOTE(review): ``altrefs=set()`` is a shared mutable default; it is
        only read, never mutated, so it is safe as written.
        """
        i = 0
        refs = set()
        x = 0
        slen = len(s)

        # keep the original string for debug output, then blank punctuation
        s0 = s
        s = s.replace('(',' ')
        s = s.replace(')',' ')
        s = s.replace(';',' ')
        s = s.replace(':',' ')
        s = s.replace(',',' ')

        while x != -1 :
            x1 = s.lower().find('нкрф', x)
            if x1 == -1 :
                break

            # search window starts at most 12 chars before the marker
            x2 = x1 - 12
            x2 = max(x2, 0)

            x3 = s.lower().find('ст.', x2)

            x = x3   # may be -1, which ends the loop

            if x != -1 :
                y = s.lower().find('нкрф', x)
                if y != -1 :
                    # skip 'ст.' plus an optional space
                    dx = 3
                    if s[x + dx] == ' ' :
                        dx += 1
                    # plausible article numbers sit 5..13 chars before the marker
                    if y - x <= 13 and y - x > 4 :
                        ref = 'Статья ' + s[x + dx: y - 1]
                        if ref in self.refid :
                            refs.add(ref)
                            if debug and (ref not in altrefs):
                                print('...' + s0[y - 40 : y + 5])
                        x = y + 1   # continue after this marker
                    else :
                        x += 1      # spurious hit: advance and retry

            i += 1
            if i > 1000 :   # safety guard against a stuck scan position
                break
        return list(refs)
257
+
258
    # read data
    def load_basic_data(self, data_directory = 'data') :
        """Load the raw corpora from *data_directory*:

        - taxcode.xml          -> self.title / self.text
        - K2-answer.xml        -> self.atitle / self.atext
        - K2-question.xml      -> self.qtitle / self.qtext
        - references-*.xml     -> per-question reference texts
        - data_jsons_20240104/ -> questions/answers with added/missed refs

        Builds the article lookup tables (refid: article-id -> index,
        titleref: full title -> article-id, idref: index -> article-id)
        and per question the set of tax-code references mined from the
        answer and reference texts, stored canonically as 'ст.N НКРФ'
        strings in ``self.nk_refs``.
        """
        self.title, self.text = self.read_xml(os.path.join(data_directory, 'taxcode.xml'))
        self.atitle, self.atext = self.read_xml(os.path.join(data_directory, 'K2-answer.xml'))
        self.qtitle, self.qtext = self.read_xml(os.path.join(data_directory, 'K2-question.xml'))

        _, reftext = self.read_xml(os.path.join(data_directory, 'references-04-12-2023.xml'))
        _, reftext2 = self.read_xml(os.path.join(data_directory, 'references-Vlad-11-12-2023.xml')) # reftext2 is unused

        # NOTE(review): [set()] * n aliases ONE set n times; safe here only
        # because every slot below is re-bound, never mutated in place.
        reflist = [set()] * len(self.qtitle)
        reflist1 = [set()] * len(self.qtitle)
        qreflist = [set()] * len(self.qtitle)

        # article id = title text up to the first '. ' or ' ('
        def getRefNK(s) :
            x = s.find('. ')
            y = s.find(' (')
            if x == -1 :
                x = sys.maxsize
            if y == -1 :
                y = sys.maxsize
            x = min(x, y)
            id = s[:x]
            return id

        self.refid = {}                     # article id -> index in self.title
        self.titleref = {}                  # full title -> article id
        self.idref = [0] * len(self.title)  # index -> article id
        for i in range(len(self.title)) :
            s = self.title[i].firstChild.nodeValue
            id = getRefNK(s)
            self.refid[id] = i
            self.titleref[s] = id
            self.idref[i] = id

        # mine tax-code references from answers, questions and ref texts
        for i in range(len(self.qtext)) :
            doctext = self.atext[i].firstChild.nodeValue
            qdoctext = self.qtext[i].firstChild.nodeValue
            refdoctext = reftext[i].firstChild.nodeValue
            refs = self.getRefsNK1(doctext)
            qrefs = self.getRefsNK1(qdoctext)
            refs1 = self.getRefsNK2(refdoctext)
            intrefs = []
            intrefs1 = []
            intqrefs = []
            for ref in refs :
                intrefs.append(self.refid[ref])
            for ref in refs1 :
                intrefs1.append(self.refid[ref])
            for ref in qrefs :
                intqrefs.append(self.refid[ref])
            reflist[i] = set(intrefs)
            reflist1[i] = set(intrefs1)
            qreflist[i] = set(intqrefs)   # question refs: computed but unused below

        # merge answer-mined and reference-file-mined article sets
        for i in range(len(reflist)) :
            reflist[i] |= reflist1[i]

        self.nk_refs = []

        # convert article indices back to canonical 'ст.N НКРФ' strings
        for i in range(len(reflist)) :
            refs = list(reflist[i])
            newrefs = []
            for j in range(len(refs)) :
                ref = self.idref[refs[j]]
                m = re.search('(\d+\.\d+|\d+)', ref)   # NOTE(review): non-raw pattern string
                s = ref[m.start() : m.end()]
                ref1 = 'ст.' + s + ' НКРФ'
                newrefs.append(ref1)

            self.nk_refs.append(newrefs)

        # reading Vlad's json data
        datadir = os.path.join(data_directory, 'data_jsons_20240104')
        filelist = os.listdir(datadir)
        # NOTE(review): the '.' before 'json' is unescaped, so e.g. '1xjson' would match too
        filelist = [x for x in filelist if re.search(r'\d+.json', x)]
        filelist.sort()

        questions = [''] * len(filelist)
        answers = [''] * len(filelist)
        # slots are re-bound below, so the [[]] * n aliasing is harmless here
        added_refs = [[]] * len(filelist)
        missed_refs = [[]] * len(filelist)
        count = 0
        for filename in filelist :
            x = filename.find('.')
            if x == -1 :
                print('ERROR :', filename)
            if filename[:x].isnumeric() :
                i = int(filename[:x])   # the numeric file name is the question index
                with open(os.path.join(datadir, filename), 'r', encoding='utf-8') as f:
                    d = json.load(f)
                refs = set(d['added_refs'].keys())
                refs -= {''}   # drop the empty-string pseudo-reference
                refs = list(refs)
                questions[i] = d['question']
                answers[i] = d['answer']
                missed_refs[i] = d['refs']
                added_refs[i] = refs
                count += 1

        self.questions = questions#[:count]
        self.answers = answers#[:count]
        self.added_refs = added_refs#[:count]
        self.missed_refs = missed_refs#[:count]
378
+
379
    def load_text_processing(self) :
        """Initialise text-processing resources.

        Sets ``self.stop_words`` (hard-coded Russian stop-word list) and
        ``self.stemmer`` (rule-based Porter stemmer).  Earlier variants
        used nltk stopwords / spacy lemmatisation / SnowballStemmer; the
        hard-coded set avoids the nltk download at startup.
        """
        self.stop_words = {'а', 'без', 'более', 'больше', 'будет', 'будто', 'бы', 'был', 'была', 'были', 'было', 'быть', 'в', 'вам', 'вас', 'вдруг', 'ведь', 'во', 'вот', 'впрочем', 'все', 'всегда', 'всего', 'всех', 'всю', 'вы', 'где', 'да', 'даже', 'два', 'для', 'до', 'другой', 'его', 'ее', 'ей', 'ему', 'если', 'есть', 'еще', 'ж', 'же', 'за', 'зачем', 'здесь', 'и', 'из', 'или', 'им', 'иногда', 'их', 'к', 'как', 'какая', 'какой', 'когда', 'конечно', 'кто', 'куда', 'ли', 'лучше', 'между', 'меня', 'мне', 'много', 'может', 'можно', 'мой', 'моя', 'мы', 'на', 'над', 'надо', 'наконец', 'нас', 'не', 'него', 'нее', 'ней', 'нельзя', 'нет', 'ни', 'нибудь', 'никогда', 'ним', 'них', 'ничего', 'но', 'ну', 'о', 'об', 'один', 'он', 'она', 'они', 'опять', 'от', 'перед', 'по', 'под', 'после', 'потом', 'потому', 'почти', 'при', 'про', 'раз', 'разве', 'с', 'сам', 'свою', 'себе', 'себя', 'сейчас', 'со', 'совсем', 'так', 'такой', 'там', 'тебя', 'тем', 'теперь', 'то', 'тогда', 'того', 'тоже', 'только', 'том', 'тот', 'три', 'тут', 'ты', 'у', 'уж', 'уже', 'хорошо', 'хоть', 'чего', 'чем', 'через', 'что', 'чтоб', 'чтобы', 'чуть', 'эти', 'этого', 'этой', 'этом', 'этот', 'эту', 'я'}
        self.stemmer = Porter()
390
+
391
+ def analyze(self, s) :
392
+ template = r'[\'\"\.\,\?\!\:\;\-\+\%\^\&\*\@\~\_\=/\\\>\<\#\$\(\)\|\n\r\d]'
393
+ s = re.sub(template, ' ', s)
394
+ s = re.sub(' +', ' ', s)
395
+ # tokens = nlp(s)
396
+ # tokens = [str(t.lemma_) for t in tokens]
397
+ # tokens = word_tokenize(s)
398
+ tokens = s.strip().lower().split(' ')
399
+ # tokens = [t for t in tokens if t not in self.stop_words and t != ' ']
400
+ # tokens = [self.stemmer.stem(word) for word in tokens]
401
+ tokens = [self.stemmer.stem(word) for word in tokens if word not in self.stop_words]
402
+ newtext = ' '.join(tokens)
403
+ return newtext
404
+
405
+ # load medium dataset
406
+ def load_medium_dataset(self, path) :
407
+ # global dataset_medium
408
+ with open(path, 'r', encoding='utf-8') as infile:
409
+ self.dataset_medium = json.load(infile)
410
+
411
+ # create a filtered list of references for Vlad's json data
412
    def create_filtered_refs(self) :
        """Filter each question's added references down to those matching the
        configured doctype and actually present in the loaded corpus.

        Sets:
          self.filtered_refs   - per-question list of ground-truth titles
          self.doctype_template - regex later used by create_corpora() to
                                  select corpus entries
        """
        doctype = self.doctype
        added_refs = self.added_refs

        # Two regexes per doctype: `doctype_template` selects corpus titles,
        # `ref_template` matches reference strings (the tax-code reference
        # format differs from its corpus-title format, hence two patterns).
        if doctype == 'court-decisions' :
            doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд)' # courts' decisions
            ref_template = doctype_template
        elif doctype == 'minfin-letters' :
            doctype_template = r'[Пп]исьмо [Мм]инфина' # Minfin letters
            ref_template = doctype_template
        elif doctype == 'fns-letters' :
            doctype_template = r'[Пп]исьмо (ФНС|фнс)' # FNS letters
            ref_template = doctype_template
        elif doctype == 'all-letters' :
            # courts' decisions + Minfin letters + FNS letters
            doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс))'
            ref_template = doctype_template
        elif doctype == 'taxcode' :
            doctype_template = r'^ст.(\d+\.\d+|\d+) НКРФ'
            ref_template = r'ст.(\d+\.\d+|\d+) НКРФ' # taxcode ref format differs from doctype format
        elif doctype == 'other-laws' :
            # non-tax codes + federal laws + government/FNS/EEC acts
            doctype_template = r'(^ст.(\d+\.\d+|\d+) [ГТ]КРФ|([Зз]акон)|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии)'
            ref_template = r'(ст.(\d+\.\d+|\d+) [ГТ]КРФ|([Зз]акон)|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии)'
        elif doctype == 'all-docs' :
            # union of every supported document class
            doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс)|^ст.(\d+\.\d+|\d+) НКРФ|^ст.(\d+\.\d+|\d+) [ГТ]КРФ|(^Федеральный закон)|^Приказ ФНС РФ|^Постановление Правительства РФ|^Решение Коллегии Евразийской экономической комиссии)'
            ref_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс)|ст.(\d+\.\d+|\d+) НКРФ|ст.(\d+\.\d+|\d+) [ГТ]КРФ|(Федеральный закон)|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии)'
        else :
            # NOTE(review): an invalid doctype only prints here; the code below
            # then raises NameError because doctype_template is unbound.
            print('Error : wrong doctype "' + doctype + '"')

        filtered_refs = []
        nk_mask = []  # NOTE(review): never written below; appears vestigial
        for i in range(len(added_refs)) :
            refs = []
            for j in range(len(added_refs[i])) :
                s = added_refs[i][j]
                if re.search(ref_template, s) != None:
                    # Normalize law refs so the string starts at the canonical prefix.
                    m = re.search(r'(ст.(\d+\.\d+|\d+) [НГТ]КРФ|Федеральный закон|Постановление Правительства РФ|Приказ ФНС РФ|Решение Коллегии Евразийской экономической комиссии)', s)
                    if m != None :
                        s = s[m.start() : ]

                    # Keep only refs that exist as titles in the loaded corpus.
                    if s in self.dataset_medium :
                        refs.append(s)

            # For tax-code doctypes also merge refs mined from the answer texts.
            if doctype_template.find('НКРФ') != -1 :
                refs += self.nk_refs[i]

            refs = list(set(refs))  # dedupe
            filtered_refs.append(refs)

        self.filtered_refs = filtered_refs
        self.doctype_template = doctype_template
469
+
470
+ # creating corpora fo TF-IDF embedding
471
    def create_corpora(self) :
        """Tokenize/stem every text collection into the parallel corpora used
        by create_TFIDF().

        Sets:
          qcorpus/acorpus  - analyzed question and answer texts (parallel to qtext)
          pmfcorpus        - analyzed corpus documents matching doctype_template
          pmfrefs/pmfids/items/pmflengths - metadata parallel to pmfcorpus
          nk_mask/laws_mask - 1/0 flags marking tax-code / other-law entries,
                              used later for score boosting
        """
        self.qcorpus = []
        for i in range(len(self.qtext)) :
            if not i % 100 : print(i, end = ' ')  # progress marker
            s = self.qtext[i].firstChild.nodeValue
            s = self.analyze(s)
            self.qcorpus.append(s)

        self.acorpus = []
        for i in range(len(self.qtext)) :
            s = self.atext[i].firstChild.nodeValue
            s = self.analyze(s)
            self.acorpus.append(s)

        self.pmfcorpus = []
        self.pmfrefs = []
        self.pmfids = []
        self.pmflengths = []
        self.nk_mask = []
        self.laws_mask = []

        i = 0
        self.items = []
        for key, value in self.dataset_medium.items() :
            # Keep only corpus entries whose title matches the active doctype.
            if re.search(self.doctype_template, key) != None :
                s = value
                ss = key
                if s != None :
                    s = s.replace('\n', ' ')
                # count(' ') == 0 means fewer than two tokens: skip empty/trivial docs.
                if s != None and s.count(' ') :
                    if not i % 100 : print(i, end = ' ')
                    s = self.analyze(s)
                    if s.count(' ') :
                        self.pmfcorpus.append(s)
                        self.pmfrefs.append(ss)
                        self.pmfids.append(i)
                        self.items.append({'title' : key, 'text' : value})
                        self.pmflengths.append(s.count(' '))

                        # Flag tax-code articles (boosted by beta/gamma in getTop).
                        if re.search(r'ст.(\d+\.\d+|\d+) НКРФ', ss) :
                            self.nk_mask.append(1)
                        else:
                            self.nk_mask.append(0)

                        # Flag other legal acts (boosted by delta/epsilon).
                        if re.search(r'(ст.(\d+\.\d+|\d+) [ГТ]КРФ|([Зз]акон)|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии)', ss) :
                            self.laws_mask.append(1)
                        else:
                            self.laws_mask.append(0)

            # NOTE(review): i counts every dataset entry (matched or not), so
            # pmfids records positions within dataset_medium iteration order.
            i += 1
540
+
541
+ # build up TF-IDF representation
542
    def create_TFIDF(self) :
        """Fit the TF-IDF models for corpus documents and questions.

        Corpus side: TF-IDF is fitted with norm=None so the raw row magnitudes
        can be saved in self.norm (used later as a document-length prior), then
        rows are L2-normalized manually.  Query side: transformed with the same
        vocabulary and L2-normalized, so TFIDF.dot(QTFIDF[i].T) is a cosine
        similarity.  A second, independently fitted model (ATFIDF/AQTFIDF)
        embeds the answer texts with standard L2 normalization.
        """
        self.vectorizer = CountVectorizer()
        # norm=None keeps raw row magnitudes so self.norm can be extracted below.
        self.transformer = TfidfTransformer(smooth_idf = False, norm = None, sublinear_tf = True)

        X = self.vectorizer.fit_transform(self.pmfcorpus)
        QX = self.vectorizer.transform(self.qcorpus)
        self.TFIDF = self.transformer.fit_transform(X)
        self.QTFIDF = self.transformer.transform(QX)

        # Sparse row-wise L2 normalization; pre-normalization norms are kept
        # as a per-document "length" feature for score shaping in getTop().
        n = np.sqrt(self.TFIDF.multiply(self.TFIDF).sum(axis = 1))
        self.TFIDF = self.TFIDF.multiply(sparse.csr_matrix(1 / n))
        self.norm = n.flatten().tolist()[0]  # np.matrix -> flat Python list
        n = np.sqrt(self.QTFIDF.multiply(self.QTFIDF).sum(axis = 1))
        self.QTFIDF = self.QTFIDF.multiply(sparse.csr_matrix(1 / n))

        # Answer-text model (standard L2-normalized TF-IDF).
        self.avectorizer = CountVectorizer()
        self.atransformer = TfidfTransformer(smooth_idf = False, norm = 'l2', sublinear_tf = True)

        AX = self.avectorizer.fit_transform(self.acorpus)
        AQX = self.avectorizer.transform(self.qcorpus)
        self.ATFIDF = self.atransformer.fit_transform(AX)
        self.AQTFIDF = self.atransformer.transform(AQX)
581
+
582
+ # get top letters sorted by TF-IDF cosine similarity
583
    def getTop(self, i, top) :
        """Return the titles of the `top` best-scoring documents for question i.

        Base score is cosine similarity, shaped by the module-level
        hyperparameters: multiplied by log(doc norm)**alpha (length prior),
        then boosted multiplicatively/additively for tax-code documents
        (beta/gamma) and for other legal acts (delta/epsilon).
        """
        v = self.QTFIDF[i]
        vt = v.transpose()
        scores = self.TFIDF.dot(vt)[:, 0].todense()
        scores = np.squeeze(np.asarray(scores))
        df = pd.DataFrame()
        df[0] = scores
        df[1] = self.pmfrefs
        df[2] = self.norm       # pre-normalization TF-IDF row norms (doc length)
        df[3] = self.nk_mask    # 1 for tax-code (НКРФ) articles

        df[0] *= np.log(df[2]) ** alpha
        df[0] *= (1 + df[3] * beta)
        df[0] += df[3] * gamma

        df[4] = self.laws_mask  # 1 for other legal acts
        df[0] *= (1 + df[4] * delta)
        df[0] += df[4] * epsilon

        df.sort_values(0, ascending = False, inplace = True)
        ids = df[1]

        return ids[:top].tolist()
619
+
620
    def test_TFIDF_top(self, top = 40, metric = '') :
        """Print recall/precision/F1 of getTop(i, top) over all questions.

        With metric='corrected', zero-hit queries are also counted: a query with
        no expected refs and no returned docs scores 1.0, otherwise 0.0.  With
        the default metric, zero-hit queries whose ref set is empty are skipped.
        """
        N = len(self.qtext)
        allhits = 0
        allrefs = 0
        recall = []
        precision = []
        f1 = []

        for i in range(N) :
            refs = set(self.filtered_refs[i])
            resp = self.getTop(i, top)
            serp = set(resp)
            hits = len(refs & serp)

            allhits += hits
            allrefs += len(refs)

            tp = hits
            fp = top - tp
            fn = len(refs) - hits

            if tp == 0 and metric == 'corrected':
                if fp == 0 and fn == 0 :
                    # nothing expected, nothing returned: vacuously perfect
                    recall.append(1)
                    precision.append(1)
                    f1.append(1)
                else :
                    recall.append(0)
                    precision.append(0)
                    f1.append(0)

            elif tp + fn > 0 :
                recall.append(tp / (tp + fn))
                precision.append(tp / (tp + fp))
                f1.append(2 * tp / (2 * tp + fp + fn))

        # +1e-5 avoids division by zero when no refs exist at all.
        print('\ntotal: ', allhits, allrefs, allhits / (allrefs + .00001))
        print('mean recall:', sum(recall) / len(recall))
        print('mean precision:', sum(precision) / len(precision))
        print('mean F1:', sum(f1) / len(f1))
663
+
664
+ # get letters with TF-IDF cosine similarity score > value
665
    def getTopByScoreValue(self, i, value) :
        """Return the titles of all documents whose raw cosine score against
        question i exceeds `value`, in descending score order.

        Unlike getTop(), no score shaping or boosting is applied here.
        """
        v = self.QTFIDF[i]
        vt = v.transpose()
        scores = self.TFIDF.dot(vt)[:, 0].todense()
        scores = np.squeeze(np.asarray(scores))

        df = pd.DataFrame()
        df[0] = scores
        df[1] = self.pmfrefs

        df.sort_values(0, ascending = False, inplace = True)

        # Variable-size result set: everything above the absolute threshold.
        df1 = df.loc[df[0] > value]
        ids = df1[1]

        return ids.tolist()
681
+
682
+ # calculate metrics for letters with TF-IDF cosine similarity score > value
683
+
684
+ def test_TFIDF_value(self, value = .4) :
685
+ N = len(self.qtext)
686
+ allhits = 0
687
+ allrefs = 0
688
+ recall = []
689
+ precision = []
690
+ f1 = []
691
+ topsize = []
692
+ count = 0
693
+
694
+ for i in range(N) :
695
+ # if not i % 10 : print(i, end = ' ')
696
+ refs = set(self.filtered_refs[i])
697
+ resp = self.getTopByScoreValue(i, value)
698
+ serp = set(resp)
699
+ hits = len(refs & serp)
700
+ top = len(resp)
701
+ topsize.append(top)
702
+
703
+ if top > 0 :
704
+ count += 1
705
+
706
+ tp = hits
707
+ fp = top - tp
708
+ fn = len(refs) - hits
709
+
710
+ if tp == 0 :
711
+ if fp == 0 and fn == 0 :
712
+ recall.append(1)
713
+ precision.append(1)
714
+ f1.append(1)
715
+ else :
716
+ recall.append(0)
717
+ precision.append(0)
718
+ f1.append(0)
719
+
720
+ else :
721
+ recall.append(tp / (tp + fn))
722
+ precision.append(tp / (tp + fp))
723
+ f1.append(2 * tp / (2 * tp + fp + fn))
724
+
725
+ print()
726
+ print('mean recall:', sum(recall) / len(recall))
727
+ print('mean precision:', sum(precision) / len(precision))
728
+ print('mean F1:', sum(f1) / len(f1))
729
+ print('mean top size: ', sum(topsize) / len(topsize))
730
+ count, count / 517
731
+
732
+ # get letters with TF-IDF cosine similarity score > top score * ratio
733
    def getTopByScoreRelValue(self, i, ratio) :
        """Return the titles of all documents scoring above best_score * ratio
        for question i (raw cosine scores, descending order)."""
        v = self.QTFIDF[i]
        vt = v.transpose()
        scores = self.TFIDF.dot(vt)[:, 0].todense()
        scores = np.squeeze(np.asarray(scores))
        df = pd.DataFrame()
        df[0] = scores
        df[1] = self.pmfrefs

        df.sort_values(0, ascending = False, inplace = True)
        # Best score after sorting; assumes a non-empty corpus.
        value = df.iloc[0, 0]
        df1 = df.loc[df[0] > value * ratio]
        ids = df1[1]

        return ids.tolist()
748
+
749
+ # calculate metrics for letters with TF-IDF cosine similarity score > top score * ratio
750
    def test_TFIDF_ratio(self, ratio = .9) :
        """Evaluate retrieval when the cut-off is relative: keep every document
        scoring above (best score * ratio).

        Prints mean recall/precision/F1 and mean result-set size; also
        sanity-checks that each per-query F1 lies between its precision and
        recall, printing an ERROR line otherwise.
        """
        N = len(self.qtext)
        allhits = 0   # NOTE(review): never updated below; vestigial
        allrefs = 0   # NOTE(review): never updated below; vestigial
        recall = []
        precision = []
        f1 = []
        topsize = []
        count = 0     # NOTE(review): never updated below; vestigial

        for i in range(N) :
            refs = set(self.filtered_refs[i])
            resp = self.getTopByScoreRelValue(i, ratio)
            serp = set(resp)
            hits = len(refs & serp)
            top = len(resp)
            topsize.append(top)

            tp = hits
            fp = top - tp
            fn = len(refs) - hits

            r = 0
            p = 0
            f = 0

            if tp == 0 :
                if fp == 0 and fn == 0 :
                    # nothing expected, nothing returned: vacuously perfect
                    recall.append(1)
                    precision.append(1)
                    f1.append(1)
                    r = 1
                    p = 1
                    f = 1
                else :
                    recall.append(0)
                    precision.append(0)
                    f1.append(0)

            else :
                recall.append(tp / (tp + fn))
                precision.append(tp / (tp + fp))
                f1.append(2 * tp / (2 * tp + fp + fn))
                r = tp / (tp + fn)
                p = tp / (tp + fp)
                f = 2 * tp / (2 * tp + fp + fn)

            # F1 is a harmonic mean, so it must lie between precision and recall.
            if (f > r and f > p) or (f < r and f < p) :
                print('ERROR :', i, r, p, f)

        print()
        print('mean recall:', sum(recall) / len(recall))
        print('mean precision:', sum(precision) / len(precision))
        print('mean F1:', sum(f1) / len(f1))
        print('mean top size: ', sum(topsize) / len(topsize))
806
+
807
+ # def getTopForQuery(self, i, top, query) :
808
+ # v = QTFIDF[i]
809
+ # vt = v.transpose()
810
+ # scores = TFIDF.dot(vt)[:, 0].todense()
811
+ # scores = np.squeeze(np.asarray(scores))
812
+ # df = pd.DataFrame()
813
+ # df[0] = scores
814
+ # df[1] = pmfrefs
815
+
816
+ # df.sort_values(0, ascending = False, inplace = True)
817
+ # # df.sort_values(0, ascending = True, inplace = True)
818
+ # # ids = df.index
819
+ # ids = df[1]
820
+ # # print(df)
821
+
822
+ # return ids[:top].tolist()
823
+
824
    def load_everything(self, data_directory = 'data') :
        """One-shot pipeline: load raw data, set up text processing, load the
        corpus, then build ground-truth refs, corpora and the TF-IDF index."""
        self.load_basic_data(data_directory=data_directory)
        self.load_text_processing()
        # Smoke-test the analyzer on a known noisy string (debug output only).
        s = '|()><.,!?:;=*-/\\8. Форма \n \r Cчета-фактуры и порядок его заполнения, формы и порядок ведения журнала учета полученных и выставленных счетов-фактур, книг покупок и книг продаж устанавливаются Правительством Российской Федерации.'
        print(self.analyze(s))
        self.load_medium_dataset(path=os.path.join(data_directory, 'search_data', 'medium_dataset.json'))
        self.create_filtered_refs()
        self.create_corpora()
        print(len(self.pmfcorpus))  # corpus size after doctype filtering
        self.create_TFIDF()
834
+
835
+ def test_everything(self) :
836
+ self.test_TFIDF_top(top = 40)
837
+ self.test_TFIDF_value(value = .2)
838
+ self.test_TFIDF_ratio(ratio = .9)
839
+
840
    def search(self, query, top = 10) :
        """Search the corpus for a free-text `query`.

        Returns (titles, docs, scores) for the `top` best matches, scored with
        the same shaping as getTop() (module-level alpha..epsilon parameters).
        """
        analyzed_query = self.analyze(query)
        query_TF = self.vectorizer.transform([analyzed_query])
        query_TFIDF = self.transformer.transform(query_TF)
        # NOTE(review): unlike the precomputed QTFIDF rows, this ad-hoc query
        # vector is not L2-normalized; the ranking is unaffected because that
        # is a uniform positive scaling of all scores.
        v = query_TFIDF[0]
        vt = v.transpose()
        scores = self.TFIDF.dot(vt)[:, 0].todense()
        scores = np.squeeze(np.asarray(scores))
        df = pd.DataFrame()
        df[0] = scores
        df[1] = self.pmfrefs
        df[2] = self.norm      # document-length prior
        df[3] = self.nk_mask   # tax-code flag

        # Same score shaping as getTop().
        df[0] *= np.log(df[2]) ** alpha
        df[0] *= (1 + df[3] * beta)
        df[0] += df[3] * gamma

        df[4] = self.laws_mask # other-legal-act flag
        df[0] *= (1 + df[4] * delta)
        df[0] += df[4] * epsilon

        df.sort_values(0, ascending = False, inplace = True)
        ids = df[1]
        titles = ids[:top].tolist()
        docs = []
        for i in range(len(titles)) :
            # Row i of the sorted frame corresponds to titles[i].
            id = df.iloc[i, 1]
            docs.append(self.dataset_medium[id])

        scores = df[0][:top].tolist()

        return titles, docs, scores
BasicSearchV6.py ADDED
@@ -0,0 +1,1025 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.feature_extraction.text import CountVectorizer
5
+ from sklearn.feature_extraction.text import TfidfTransformer
6
+ from scipy import sparse
7
+ import re
8
+ from xml.dom.minidom import parseString #, parse
9
+ import os
10
+ import sys
11
+ import json
12
+
13
# Score-shaping hyperparameters used by getTop()/search().
# Previous tuning, kept for reference:
# alpha = 1.15
# beta = .2
# gamma = .4
# delta = .31
# epsilon = 0

alpha = 0       # exponent on log(document TF-IDF norm): length prior (0 = off)
beta = .55      # multiplicative boost for tax-code (НКРФ) articles
gamma = .0      # additive boost for tax-code articles
delta = .2      # multiplicative boost for other legal acts
epsilon = 0     # additive boost for other legal acts
zeta = .65      # NOTE(review): not referenced in the visible code — confirm use
25
+
26
+ # stemmer class
27
# Dependency-free, regex-based Porter stemmer for Russian.
class Porter:
    PERFECTIVEGROUND = re.compile(u"((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$")
    REFLEXIVE = re.compile(u"(с[яь])$")
    ADJECTIVE = re.compile(u"(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$")
    PARTICIPLE = re.compile(u"((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$")
    VERB = re.compile(u"((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$")
    NOUN = re.compile(u"(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$")
    RVRE = re.compile(u"^(.*?[аеиоуыэюя])(.*)$")
    DERIVATIONAL = re.compile(u".*[^аеиоуыэюя]+[аеиоуыэюя].*ость?$")
    DER = re.compile(u"ость?$")
    SUPERLATIVE = re.compile(u"(ейше|ейш)$")
    I = re.compile(u"и$")
    P = re.compile(u"ь$")
    NN = re.compile(u"нн$")

    @staticmethod
    def stem(word):
        """Return the stem of a lowercase Russian word.

        Words with no vowel (hence no stemmable region) pass through unchanged
        apart from the 'ё' -> 'е' normalization.
        """
        word = word.replace(u'ё', u'е')
        m = Porter.RVRE.match(word)
        if not (m and m.groups()):
            return word
        head, rv = m.group(1), m.group(2)

        # Step 1: strip a perfective-gerund ending; failing that, strip a
        # reflexive suffix and then an adjective/participle, verb, or noun
        # ending (first family that matches wins).
        reduced = Porter.PERFECTIVEGROUND.sub('', rv, 1)
        if reduced != rv:
            rv = reduced
        else:
            rv = Porter.REFLEXIVE.sub('', rv, 1)
            reduced = Porter.ADJECTIVE.sub('', rv, 1)
            if reduced != rv:
                rv = Porter.PARTICIPLE.sub('', reduced, 1)
            else:
                reduced = Porter.VERB.sub('', rv, 1)
                rv = Porter.NOUN.sub('', rv, 1) if reduced == rv else reduced

        # Step 2: drop a trailing 'и'.
        rv = Porter.I.sub('', rv, 1)

        # Step 3: drop the derivational suffix 'ость'.
        if Porter.DERIVATIONAL.match(rv):
            rv = Porter.DER.sub('', rv, 1)

        # Step 4: drop a soft sign; otherwise strip superlatives and collapse
        # a double 'нн'.
        reduced = Porter.P.sub('', rv, 1)
        if reduced == rv:
            rv = Porter.SUPERLATIVE.sub('', rv, 1)
            rv = Porter.NN.sub(u'н', rv, 1)
        else:
            rv = reduced
        return head + rv
79
+
80
+
81
+
82
+ class BasicSearch:
83
+ # constructor function
84
    def __init__(self, doctype = 'minfin-letters', data_directory = './') :
        """Build a ready-to-query index over *doctype* documents under *data_directory*."""
        # doctype must be set first: load_everything() -> create_filtered_refs() reads it.
        self.doctype = doctype
        self.load_everything(data_directory=data_directory)
87
+
88
+ def read_xml(self, path):
89
+ with open(path, "r", encoding="utf-8") as text_file:
90
+ data = text_file.read()
91
+
92
+ document = parseString('<data>' + data + '</data>')
93
+ return [
94
+ document.getElementsByTagName('title'),
95
+ document.getElementsByTagName('text')
96
+ ]
97
+
98
+
99
    def getRefsNK(self, s) :
        """Scan free text for ' ст.<num> ... НК РФ' tax-code citations.

        Returns the list of canonical titles ('Статья <num>') that are known
        keys of self.refid.  The i > 1000 counter bounds the scan: when
        ' ст.' appears with no following 'нк рф', x is not advanced and the
        loop would otherwise spin on the same position.
        """
        i = 0
        refs = set()
        x = 0
        while x != -1 :
            x = s.lower().find(' ст.', x)
            if x != -1 :
                y = s.lower().find('нк рф', x)
                if y != -1 :
                    dx = 4  # skip ' ст.' (5 when followed by a space)
                    if s[x + dx] == ' ' :
                        dx = 5
                    # Plausible citation only if the article-number span is short.
                    if y - x <= 13 and y - x > 5 :
                        ref = 'Статья ' + s[x + dx: y - 1]
                        if ref in self.refid :
                            refs.add(ref)
                        x = y
                    else :
                        x += 1
            i += 1
            if i > 1000 :
                break
        return list(refs)
127
+
128
    def getRefsNK1(self, s, debug = False, altrefs = set()) :
        """Find 'ст.<num> ... НК РФ' citations in *s*, including inflected
        forms of 'статья', and return the canonical titles found in self.refid.

        When debug is True, print surrounding context for any accepted ref not
        already in altrefs.  NOTE(review): the mutable default `altrefs=set()`
        is shared across calls; harmless here because it is only read.
        """
        i = 0
        refs = set()
        x = 0
        slen = len(s)

        s0 = s  # keep the original text for debug context printing
        # Flatten punctuation that may sit between the number and 'НК РФ'.
        s = s.replace('(',' ')
        s = s.replace(')',' ')
        s = s.replace(';',' ')
        s = s.replace(':',' ')
        s = s.replace(',',' ')

        while x != -1 :
            # Anchor on the next 'нк рф' occurrence...
            x1 = s.lower().find('нк рф', x)
            if x1 == -1 :
                break

            # ...then look back up to 12 characters for a form of 'статья'.
            x2 = x1 - 12
            x2 = max(x2, 0)

            # Abbreviation plus common inflected word endings.
            x31 = s.lower().find('ст.', x2)
            x32 = s.lower().find('ьей', x2)
            x33 = s.lower().find('ьёй', x2)
            x34 = s.lower().find('ями', x2)
            x35 = s.lower().find('тьи', x2)
            x36 = s.lower().find('тье', x2)

            # Misses are pushed to slen so min() picks the earliest real hit.
            if x31 == -1 :
                x31 = slen
            if x32 == -1 :
                x32 = slen
            if x33 == -1 :
                x33 = slen
            if x34 == -1 :
                x34 = slen
            if x35 == -1 :
                x35 = slen
            if x36 == -1 :
                x36 = slen

            x3 = min(x31, x32, x33, x34, x35, x36)

            x = x3

            if x != -1 :
                y = s.lower().find('нк рф', x)
                if y != -1 :
                    dx = 3  # skip 'ст.' (one more when followed by a space)
                    if s[x + dx] == ' ' :
                        dx += 1
                    if y - x <= 13 and y - x > 4 :
                        ref = 'Статья ' + s[x + dx: y - 1]
                        if ref in self.refid :
                            refs.add(ref)
                            if debug and (ref not in altrefs):
                                print('...' + s0[y - 40 : y + 5])
                        x = y + 1
                    else :
                        x += 1

            i += 1
            if i > 1000 :  # safety bound against non-advancing scans
                break
        return list(refs)
205
+
206
    def getRefsNK2(self, s, debug = False, altrefs = set()) :
        """Variant of getRefsNK1 for text where the code is written 'НКРФ'
        (no space, as in the references file); only the abbreviated 'ст.'
        form is recognized.  Same debug/altrefs semantics as getRefsNK1."""
        i = 0
        refs = set()
        x = 0
        slen = len(s)

        s0 = s  # keep the original text for debug context printing
        # Flatten punctuation that may sit between the number and 'НКРФ'.
        s = s.replace('(',' ')
        s = s.replace(')',' ')
        s = s.replace(';',' ')
        s = s.replace(':',' ')
        s = s.replace(',',' ')

        while x != -1 :
            # Anchor on the next 'нкрф' occurrence...
            x1 = s.lower().find('нкрф', x)
            if x1 == -1 :
                break

            # ...then look back up to 12 characters for the 'ст.' abbreviation.
            x2 = x1 - 12
            x2 = max(x2, 0)

            x3 = s.lower().find('ст.', x2)

            x = x3

            if x != -1 :
                y = s.lower().find('нкрф', x)
                if y != -1 :
                    dx = 3  # skip 'ст.' (one more when followed by a space)
                    if s[x + dx] == ' ' :
                        dx += 1
                    if y - x <= 13 and y - x > 4 :
                        ref = 'Статья ' + s[x + dx: y - 1]
                        if ref in self.refid :
                            refs.add(ref)
                            if debug and (ref not in altrefs):
                                print('...' + s0[y - 40 : y + 5])
                        x = y + 1
                    else :
                        x += 1

            i += 1
            if i > 1000 :  # safety bound against non-advancing scans
                break
        return list(refs)
264
+
265
+ # read data
266
    def load_basic_data(self, data_directory = 'data') :
        """Load raw XML corpora and the per-question JSON files.

        Attributes set:
          title/text         - tax-code article titles and bodies
          atitle/atext       - reference answers; qtitle/qtext - questions
          refid/titleref/idref - maps between article titles, short ids, indices
          nk_refs            - per-question tax-code refs mined from answers
          questions/answers/added_refs/missed_refs - per-question JSON payloads
        """
        self.title, self.text = self.read_xml(os.path.join(data_directory, 'taxcode.xml'))
        self.atitle, self.atext = self.read_xml(os.path.join(data_directory, 'K2-answer.xml'))
        self.qtitle, self.qtext = self.read_xml(os.path.join(data_directory, 'K2-question.xml'))

        _, reftext = self.read_xml(os.path.join(data_directory, 'references-04-12-2023.xml'))

        # NOTE(review): `[set()] * n` aliases one shared set n times; safe here
        # only because every slot is reassigned (never mutated) below.
        reflist = [set()] * len(self.qtitle)
        reflist1 = [set()] * len(self.qtitle)
        qreflist = [set()] * len(self.qtitle)


        def getRefNK(s) :
            # Short id of an article title: everything before '. ' or ' ('.
            x = s.find('. ')
            y = s.find(' (')
            if x == -1 :
                x = sys.maxsize
            if y == -1 :
                y = sys.maxsize
            x = min(x, y)
            id = s[:x]
            return id

        # Index tax-code article titles by short id and position.
        self.refid = {}
        self.titleref = {}
        self.idref = [0] * len(self.title)
        for i in range(len(self.title)) :
            s = self.title[i].firstChild.nodeValue
            id = getRefNK(s)
            self.refid[id] = i
            self.titleref[s] = id
            self.idref[i] = id

        # Mine tax-code citations from answers, questions and references.
        for i in range(len(self.qtext)) :
            doctext = self.atext[i].firstChild.nodeValue
            qdoctext = self.qtext[i].firstChild.nodeValue
            refdoctext = reftext[i].firstChild.nodeValue
            refs = self.getRefsNK1(doctext)
            qrefs = self.getRefsNK1(qdoctext)
            refs1 = self.getRefsNK2(refdoctext)
            intrefs = []
            intrefs1 = []
            intqrefs = []
            for ref in refs :
                intrefs.append(self.refid[ref])
            for ref in refs1 :
                intrefs1.append(self.refid[ref])
            for ref in qrefs :
                intqrefs.append(self.refid[ref])
            reflist[i] = set(intrefs)
            reflist1[i] = set(intrefs1)
            qreflist[i] = set(intqrefs)

        # Merge answer-mined and reference-file refs per question.
        for i in range(len(reflist)) :
            reflist[i] |= reflist1[i]

        self.nk_refs = []

        # Convert article indices to the 'ст.<num> НКРФ' corpus-title format.
        for i in range(len(reflist)) :
            refs = list(reflist[i])
            newrefs = []
            for j in range(len(refs)) :
                ref = self.idref[refs[j]]
                # NOTE(review): non-raw pattern string; works, but r'' is preferred.
                m = re.search('(\d+\.\d+|\d+)', ref)
                s = ref[m.start() : m.end()]
                ref1 = 'ст.' + s + ' НКРФ'
                newrefs.append(ref1)

            self.nk_refs.append(newrefs)

        # Read the per-question JSON files (<index>.json).
        datadir = os.path.join(data_directory, 'data_jsons_20240119')
        filelist = os.listdir(datadir)
        filelist = [x for x in filelist if re.search(r'\d+.json', x)]
        filelist.sort()


        questions = [''] * len(filelist)
        answers = [''] * len(filelist)
        added_refs = [[]] * len(filelist)
        missed_refs = [[]] * len(filelist)
        count = 0
        for filename in filelist :
            x = filename.find('.')
            if x == -1 :
                print('ERROR :', filename)
            if filename[:x].isnumeric() :
                # The numeric filename stem is the question index.
                i = int(filename[:x])
                with open(os.path.join(datadir, filename), 'r', encoding='utf-8') as f:
                    d = json.load(f)
                refs = set(d['added_refs'].keys())
                refs -= {''}  # drop the empty-key placeholder
                refs = list(refs)
                questions[i] = d['question']
                answers[i] = d['answer']
                missed_refs[i] = d['refs']
                added_refs[i] = refs
                count += 1

        self.questions = questions#[:count]
        self.answers = answers#[:count]
        self.added_refs = added_refs#[:count]
        self.missed_refs = missed_refs#[:count]
385
+
386
    def load_text_processing(self) :
        """Initialize the text pipeline: a hard-coded Russian stop-word list
        (a snapshot of NLTK's Russian stop words, inlined to avoid the nltk
        dependency) and the regex-based Porter stemmer."""
        self.stop_words = {'а', 'без', 'более', 'больше', 'будет', 'будто', 'бы', 'был', 'была', 'были', 'было', 'быть', 'в', 'вам', 'вас', 'вдруг', 'ведь', 'во', 'вот', 'впрочем', 'все', 'всегда', 'всего', 'всех', 'всю', 'вы', 'где', 'да', 'даже', 'два', 'для', 'до', 'другой', 'его', 'ее', 'ей', 'ему', 'если', 'есть', 'еще', 'ж', 'же', 'за', 'зачем', 'здесь', 'и', 'из', 'или', 'им', 'иногда', 'их', 'к', 'как', 'какая', 'какой', 'когда', 'конечно', 'кто', 'куда', 'ли', 'лучше', 'между', 'меня', 'мне', 'много', 'может', 'можно', 'мой', 'моя', 'мы', 'на', 'над', 'надо', 'наконец', 'нас', 'не', 'него', 'нее', 'ней', 'нельзя', 'нет', 'ни', 'нибудь', 'никогда', 'ним', 'них', 'ничего', 'но', 'ну', 'о', 'об', 'один', 'он', 'она', 'они', 'опять', 'от', 'перед', 'по', 'под', 'после', 'потом', 'потому', 'почти', 'при', 'про', 'раз', 'разве', 'с', 'сам', 'свою', 'себе', 'себя', 'сейчас', 'со', 'совсем', 'так', 'такой', 'там', 'тебя', 'тем', 'теперь', 'то', 'тогда', 'того', 'тоже', 'только', 'том', 'тот', 'три', 'тут', 'ты', 'у', 'уж', 'уже', 'хорошо', 'хоть', 'чего', 'чем', 'через', 'что', 'чтоб', 'чтобы', 'чуть', 'эти', 'этого', 'этой', 'этом', 'этот', 'эту', 'я'}
        self.stemmer = Porter()
397
+
398
+ def analyze(self, s) :
399
+ template = r'[\'\"\.\,\?\!\:\;\-\+\%\^\&\*\@\~\_\=/\\\>\<\#\$\(\)\|\n\r\d]'
400
+ s = re.sub(template, ' ', s)
401
+ # template = r'( \w |^\w | \w$)'
402
+ # s = re.sub(template, ' ', s)
403
+ # s = re.sub(' +', ' ', s)
404
+ s = ' '.join( [w for w in s.split() if len(w) > 1] )
405
+ # tokens = nlp(s)
406
+ # tokens = [str(t.lemma_) for t in tokens]
407
+ # tokens = word_tokenize(s)
408
+ tokens = s.strip().lower().split(' ')
409
+ # tokens = [t for t in tokens if t not in self.stop_words and t != ' ']
410
+ # tokens = [self.stemmer.stem(word) for word in tokens]
411
+ tokens = [self.stemmer.stem(word) for word in tokens if word not in self.stop_words]
412
+ newtext = ' '.join(tokens)
413
+ return newtext
414
+
415
+ # load medium dataset
416
+ def load_medium_dataset(self, path) :
417
+ # global dataset_medium
418
+ with open(path, 'r', encoding='utf-8') as infile:
419
+ self.dataset_medium = json.load(infile)
420
+
421
+ # data_path = "./legal_info_search_data/data_jsons_20240119"
422
+ # all_docs = {}
423
+ # for filename in os.listdir(data_path):
424
+ # with open(os.path.join(data_path, filename), "r", encoding="utf-8") as f:
425
+ # all_docs[int(filename.split(".")[0])] = json.load(f)
426
+
427
+ # # filter out docs with no added_refs
428
+ # dataset_small = {}
429
+ # for key, value in all_docs.items() :
430
+ # added_refs = value['added_refs']
431
+ # dataset_small.update(added_refs)
432
+
433
+ # # self.dataset_medium = dataset_small
434
+
435
+ # dataset_new = {}
436
+ # for key in dataset_small :
437
+ # m = re.search(r'(ст.(\d+\.\d+|\d+) [НГТ]КРФ|Федеральный закон|Постановление Правительства РФ|Приказ ФНС РФ|Решение Коллегии Евразийской экономической комиссии)', key)
438
+ # s = key
439
+ # if m != None :
440
+ # s = key[m.start() : ]
441
+
442
+ # if s in self.dataset_medium :
443
+ # dataset_new[s] = self.dataset_medium[s]
444
+ # elif s in dataset_small :
445
+ # dataset_new[s] = dataset_small[s]
446
+ # else :
447
+ # dataset_new[key] = dataset_small[key]
448
+ # # print(key, 'is absent')
449
+
450
+ # self.dataset_medium = dataset_new
451
+
452
    # create a filtered list of references for Vlad's json data
    def create_filtered_refs(self) :
        """Build self.filtered_refs: per-question gold citations restricted to self.doctype.

        For the configured self.doctype, picks a pair of regexes:
          doctype_template -- decides which dataset_medium KEYS belong to this doctype
                              (used later by create_corpora)
          ref_template     -- decides which of a question's added_refs count as gold
        Each matching reference string is normalised by stripping any prefix before
        the first recognised citation pattern, kept only if present in
        self.dataset_medium, optionally merged with self.nk_refs[i] (when the
        doctype covers НКРФ), and de-duplicated.

        Sets self.filtered_refs (list of lists, parallel to questions) and
        self.doctype_template.
        """
        doctype = self.doctype
        added_refs = self.added_refs

        if doctype == 'court-decisions' :
            doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд)' # courts' decisions
            ref_template = doctype_template
        elif doctype == 'minfin-letters' :
            doctype_template = r'[Пп]исьмо [Мм]инфина' # Minfin letters
            ref_template = doctype_template
        elif doctype == 'fns-letters' :
            doctype_template = r'[Пп]исьмо (ФНС|фнс)' # FNS letters
            ref_template = doctype_template
        elif doctype == 'all-letters' :
            doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс))' # courts' decisions + Minfin letters + FNS letters
            ref_template = doctype_template
        elif doctype == 'taxcode' :
            doctype_template = r'^ст.(\d+\.\d+|\d+) НКРФ'
            ref_template = r'ст.(\d+\.\d+|\d+) НКРФ' # taxcode ref format differs from doctype format (not anchored)
        elif doctype == 'other-laws' :
            doctype_template = r'(^ст.(\d+\.\d+|\d+) [ГТ]КРФ|^Федеральный закон|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии)' # civil/labour codes + federal laws + decrees
            ref_template = r'(ст.(\d+\.\d+|\d+) [ГТ]КРФ|Федеральный закон|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии)' # ref format differs from doctype format (not anchored)
        elif doctype == 'all-docs' :
            doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс)|^ст.(\d+\.\d+|\d+) НКРФ|^ст.(\d+\.\d+|\d+) [ГТ]КРФ|^Федеральный закон|Приказ ФНС РФ|Постановление Правительства РФ|^Решение Коллегии Евразийской экономической комиссии)' # courts' decisions + Minfin letters + FNS letters + taxcode + other laws
            ref_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс)|ст.(\d+\.\d+|\d+) НКРФ|ст.(\d+\.\d+|\d+) [ГТ]КРФ|Федеральный закон|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии)' # ref format differs from doctype format (not anchored)
        else :
            # NOTE(review): after this branch doctype_template / ref_template are
            # undefined and the code below raises NameError -- confirm intended.
            print('Error : wrong doctype "' + doctype + '"')

        filtered_refs = []
        nk_mask = []  # NOTE(review): written nowhere below -- dead local kept as-is
        for i in range(len(added_refs)) :
            refs = []
            for j in range(len(added_refs[i])) :
                s = added_refs[i][j]
                if re.search(ref_template, s) != None:
                    # normalise: strip any prefix before the first known citation pattern
                    m = re.search(r'(ст.(\d+\.\d+|\d+) [НГТ]КРФ|Федеральный закон|Постановление Правительства РФ|Приказ ФНС РФ|Решение Коллегии Евразийской экономической комиссии)', s)
                    if m != None :
                        s = s[m.start() : ]

                    # only keep citations we actually have a document for
                    if s in self.dataset_medium :
                        refs.append(s)

            # merge in the pre-extracted Tax-Code refs when НКРФ is in scope
            if doctype_template.find('НКРФ') != -1 :
                refs += self.nk_refs[i]

            refs = list(set(refs))  # de-duplicate
            filtered_refs.append(refs)

        self.filtered_refs = filtered_refs
        self.doctype_template = doctype_template
510
+
511
    # creating corpora for TF-IDF embedding
    def create_corpora(self) :
        """Tokenise/normalise all texts into the corpora used for TF-IDF.

        Builds:
          qcorpus    -- analysed question title + body, one string per question
          acorpus    -- analysed expert answers, parallel to qcorpus
          pmfcorpus  -- analysed reference documents from dataset_medium whose
                        key matches self.doctype_template
          pmfrefs    -- normalised citation key for each kept document
          pmfids     -- running index of each kept document
          items      -- original {'title', 'text'} pairs for the kept documents
          pmflengths -- token-separator (space) counts of the analysed documents
          nk_mask    -- 1 where the document is a Tax-Code (НКРФ) article
          laws_mask  -- 1 where the document is another law/decree/regulation
          refids     -- citation key -> row index in pmfcorpus
        """
        # NOTE(review): qtitle/qtext/atext appear to be XML DOM nodes
        # (firstChild.nodeValue) -- confirm against the loader.
        self.qcorpus = []
        for i in range(len(self.qtext)) :
            if not i % 100 : print(i, end = ' ')  # progress indicator
            s = self.qtitle[i].firstChild.nodeValue + ' ' + self.qtext[i].firstChild.nodeValue
            s = self.analyze(s)
            self.qcorpus.append(s)

        # answers are assumed parallel to questions (indexed by the same i)
        self.acorpus = []
        for i in range(len(self.qtext)) :
            s = self.atext[i].firstChild.nodeValue
            s = self.analyze(s)
            self.acorpus.append(s)

        self.pmfcorpus = []
        self.pmfrefs = []
        self.pmfids = []
        self.pmflengths = []
        self.nk_mask = []
        self.laws_mask = []

        i = 0
        self.items = []
        for key, value in self.dataset_medium.items() :
            # keep only documents that belong to the configured doctype
            if re.search(self.doctype_template, key) != None :
                s = value
                ss = key
                # normalise the key: strip any prefix before the first known citation pattern
                m = re.search(r'(ст.(\d+\.\d+|\d+) [НГТ]КРФ|Федеральный закон|Постановление Правительства РФ|Приказ ФНС РФ|Решение Коллегии Евразийской экономической комиссии)', ss)
                if m != None :
                    ss = ss[m.start() : ]

                if s != None :
                    s = s.replace('\n', ' ')
                # skip empty / single-word raw texts
                if s != None and s.count(' ') :
                    if not i % 100 : print(i, end = ' ')  # progress indicator
                    s = self.analyze(s)
                    # skip texts that analyse down to a single token
                    if s.count(' ') :
                        self.pmfcorpus.append(s)
                        self.pmfrefs.append(ss)
                        self.pmfids.append(i)
                        self.items.append({'title' : key, 'text' : value})
                        self.pmflengths.append(s.count(' '))

                        if re.search(r'НКРФ', ss) :
                            self.nk_mask.append(1)
                        else:
                            self.nk_mask.append(0)

                        if re.search(r'([ГТ]КРФ|Федеральный закон|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии)', ss) :
                            self.laws_mask.append(1)
                        else:
                            self.laws_mask.append(0)

                        i += 1

        # reverse index: citation key -> row number (later keys win on duplicates)
        self.refids = {}
        for i in range(len(self.pmfrefs)) :
            key = self.pmfrefs[i]
            self.refids[key] = i
585
+
586
    # build up TF-IDF representation
    def create_TFIDF(self) :
        """Fit TF-IDF models over the reference-document corpus and the answer corpus.

        The document transformer deliberately uses norm=None so the raw
        (pre-normalisation) row lengths can be captured into self.norm -- they
        later serve as a document-length prior in getScores() -- before the
        rows are L2-normalised by hand. The answer-side transformer uses the
        standard l2 norm since no length signal is needed there.
        """
        self.vectorizer = CountVectorizer()
        # norm=None on purpose: rows are normalised manually below
        self.transformer = TfidfTransformer(smooth_idf = False, norm = None, sublinear_tf = True)

        X = self.vectorizer.fit_transform(self.pmfcorpus)
        QX = self.vectorizer.transform(self.qcorpus)
        self.TFIDF = self.transformer.fit_transform(X)
        self.QTFIDF = self.transformer.transform(QX)

        # manual row-wise L2 normalisation, keeping the original row norms
        n = np.sqrt(self.TFIDF.multiply(self.TFIDF).sum(axis = 1))

        self.TFIDF = self.TFIDF.multiply(sparse.csr_matrix(1 / n))
        self.norm = n.flatten().tolist()[0]  # pre-normalisation lengths, one per document
        n = np.sqrt(self.QTFIDF.multiply(self.QTFIDF).sum(axis = 1))
        self.QTFIDF = self.QTFIDF.multiply(sparse.csr_matrix(1 / n))

        # separate model for the answer corpus (standard l2-normalised TF-IDF)
        self.avectorizer = CountVectorizer()
        self.atransformer = TfidfTransformer(smooth_idf = False, norm = 'l2', sublinear_tf = True)

        AX = self.avectorizer.fit_transform(self.acorpus)
        AQX = self.avectorizer.transform(self.qcorpus)
        self.ATFIDF = self.atransformer.fit_transform(AX)
        self.AQTFIDF = self.atransformer.transform(AQX)
623
+
624
    # get top letters sorted by TF-IDF cosine similarity
    def getKNNScores(self, v, i = -1) :
        """k-NN style scoring: propagate answer similarity onto Tax-Code articles.

        v -- query vector in the answer TF-IDF space (row of AQTFIDF or an
             externally transformed query)
        i -- row index of the query's own answer to exclude (self-match guard);
             -1 for external queries, which excludes nothing.

        Each training answer j gets a cosine similarity to v; every НКРФ
        article cited in self.filtered_refs[j] inherits the MAXIMUM such
        similarity over all answers citing it.

        Returns a dense list of scores indexed like self.refids.
        """
        vt = v.transpose()
        # cosine similarities against all (l2-normalised) answer vectors
        ascores = self.ATFIDF.dot(vt)[:, 0].todense()
        ascores = np.squeeze(np.asarray(ascores))
        scores = [0] * len(self.refids)
        for j in range(len(self.filtered_refs)) :
            score = ascores[j]
            refs = self.filtered_refs[j]
            for k in range(len(refs)) :
                ref = refs[k]
                # only Tax-Code citations receive propagated scores
                m = re.search(r'ст.(\d+\.\d+|\d+) НКРФ', ref)
                if i != j and m != None :
                    key = ref[m.start() : ]
                    if key in self.refids :
                        id = self.refids[key]
                        # keep the best similarity seen for this article
                        if scores[id] < score :
                            scores[id] = score

        return scores
645
+
646
    def getScores(self, v1, v2, i = -1) :
        """Combined relevance scores for one query over all reference documents.

        v1 -- query vector in the document TF-IDF space (cosine vs self.TFIDF)
        v2 -- query vector in the answer TF-IDF space (fed to getKNNScores)
        i  -- index of the query's own answer to exclude; -1 for external queries

        The direct document-cosine score and the answer-k-NN score are blended,
        then reweighted by document length and by the НКРФ / other-laws masks.
        NOTE(review): zeta, alpha, beta, gamma, delta and epsilon are
        module-level tuning constants defined elsewhere in this file -- confirm
        their values before retuning.

        Returns a plain list of scores, one per row of self.TFIDF.
        """
        vt = v1.transpose()
        scores = self.TFIDF.dot(vt)[:, 0].todense()
        scores = np.squeeze(np.asarray(scores))
        nk_scores = self.getKNNScores(v2, i)

        df = pd.DataFrame()
        df[0] = scores         # direct document cosine similarity
        df[1] = nk_scores      # answer-propagated НКРФ score
        df[2] = self.norm      # pre-normalisation TF-IDF length (size proxy)
        df[3] = self.nk_mask   # 1 for Tax-Code articles
        df[4] = 1 - df[3]      # 1 for everything else
        df[5] = (1 - np.sign(df[1])) * df[3]  # НКРФ articles with zero k-NN signal

        # non-НКРФ docs keep the cosine score; НКРФ docs use the k-NN score,
        # falling back to a zeta-damped cosine score when k-NN gave nothing
        df[0] = df[0] * df[4] + df[1] + df[5] * df[0] * zeta

        df[0] *= np.log(df[2]) ** alpha  # favour longer documents
        df[0] *= (1 + df[3] * beta)      # boost Tax-Code articles multiplicatively...
        df[0] += df[3] * gamma           # ...and additively

        df[4] = self.laws_mask
        df[0] *= (1 + df[4] * delta)     # same two-part boost for other laws
        df[0] += df[4] * epsilon

        return df[0].tolist()
673
+
674
+ def getTop(self, i, top) :
675
+ v1 = self.QTFIDF[i]
676
+ v2 = self.AQTFIDF[i]
677
+ df = pd.DataFrame()
678
+ df[0] = self.getScores(v1, v2, i)
679
+ # df[0] = self.getKNNScores(i)
680
+ df[1] = self.pmfrefs
681
+
682
+ df.sort_values(0, ascending = False, inplace = True)
683
+ # df.sort_values(0, ascending = True, inplace = True)
684
+
685
+ ids = df[1].tolist()
686
+ scores = df[0].tolist()
687
+ filtered_ids = []
688
+ for i in range(len(ids)) :
689
+ id = ids[i]
690
+ score = scores[i]
691
+ if id not in filtered_ids :
692
+ filtered_ids.append(id)
693
+
694
+ if len(filtered_ids) == top :
695
+ break
696
+
697
+ # return ids[:top].tolist()
698
+ return filtered_ids
699
+
700
    def test_TFIDF_top(self, top = 40, metric = '') :
        """Evaluate fixed-size top-`top` retrieval against the gold references.

        For each question, the top-`top` results from getTop() are compared
        with self.filtered_refs[i]. Prints the overall hit ratio and the mean
        per-question recall, precision and F1.

        metric='corrected' scores zero-hit questions explicitly (1.0 when there
        was nothing to find and nothing returned, else 0.0); otherwise
        questions with no gold references are skipped from the means.
        """
        N = len(self.qtext)
        allhits = 0
        allrefs = 0
        recall = []
        precision = []
        f1 = []

        for i in range(N) :
            refs = set(self.filtered_refs[i])  # gold citations for question i
            resp = self.getTop(i, top)
            serp = set(resp)
            hits = len(refs & serp)

            allhits += hits
            allrefs += len(refs)

            tp = hits
            fp = top - tp  # everything returned but not gold counts as a false positive
            fn = len(refs) - hits

            if tp == 0 and metric == 'corrected':
                if fp == 0 and fn == 0 :
                    # nothing to find and nothing returned -- perfect score
                    recall.append(1)
                    precision.append(1)
                    f1.append(1)
                else :
                    recall.append(0)
                    precision.append(0)
                    f1.append(0)

            elif tp + fn > 0 :
                recall.append(tp / (tp + fn))
                precision.append(tp / (tp + fp))
                f1.append(2 * tp / (2 * tp + fp + fn))

        # +.00001 guards against division by zero when there are no gold refs at all
        print('\ntotal: ', allhits, allrefs, allhits / (allrefs + .00001))
        print('mean recall:', sum(recall) / len(recall))
        print('mean precision:', sum(precision) / len(precision))
        print('mean F1:', sum(f1) / len(f1))
743
+
744
+ # get letters with TF-IDF cosine similarity score > value
745
+ def getTopByScoreValue(self, i, value) :
746
+ # v = self.QTFIDF[i]
747
+ # vt = v.transpose()
748
+ # scores = self.TFIDF.dot(vt)[:, 0].todense()
749
+ # scores = np.squeeze(np.asarray(scores))
750
+
751
+ # df = pd.DataFrame()
752
+ # df[0] = scores
753
+ # df[1] = self.pmfrefs
754
+
755
+ v1 = self.QTFIDF[i]
756
+ v2 = self.AQTFIDF[i]
757
+ df = pd.DataFrame()
758
+ df[0] = self.getScores(v1, v2, i)
759
+ df[1] = self.pmfrefs
760
+
761
+ df.sort_values(0, ascending = False, inplace = True)
762
+
763
+ df1 = df.loc[df[0] > value]
764
+ ids = df1[1]
765
+
766
+ return ids.tolist()
767
+
768
+ # calculate metrics for letters with TF-IDF cosine similarity score > value
769
+
770
+ def test_TFIDF_value(self, value = .4) :
771
+ N = len(self.qtext)
772
+ allhits = 0
773
+ allrefs = 0
774
+ recall = []
775
+ precision = []
776
+ f1 = []
777
+ topsize = []
778
+ count = 0
779
+
780
+ for i in range(N) :
781
+ # if not i % 10 : print(i, end = ' ')
782
+ refs = set(self.filtered_refs[i])
783
+ resp = self.getTopByScoreValue(i, value)
784
+ serp = set(resp)
785
+ hits = len(refs & serp)
786
+ top = len(resp)
787
+ topsize.append(top)
788
+
789
+ if top > 0 :
790
+ count += 1
791
+
792
+ tp = hits
793
+ fp = top - tp
794
+ fn = len(refs) - hits
795
+
796
+ if tp == 0 :
797
+ if fp == 0 and fn == 0 :
798
+ recall.append(1)
799
+ precision.append(1)
800
+ f1.append(1)
801
+ else :
802
+ recall.append(0)
803
+ precision.append(0)
804
+ f1.append(0)
805
+
806
+ else :
807
+ recall.append(tp / (tp + fn))
808
+ precision.append(tp / (tp + fp))
809
+ f1.append(2 * tp / (2 * tp + fp + fn))
810
+
811
+ allhits += hits
812
+ allrefs += len(refs)
813
+
814
+ print('\ntotal: ', allhits, allrefs, allhits / (allrefs + .00001))
815
+ print('mean recall:', sum(recall) / len(recall))
816
+ print('mean precision:', sum(precision) / len(precision))
817
+ print('mean F1:', sum(f1) / len(f1))
818
+ print('mean top size: ', sum(topsize) / len(topsize))
819
+ print('non-empty top:', count)
820
+ print('non-empty top share:', count / 517)
821
+
822
+ # return topsize
823
+
824
+ # get letters with TF-IDF cosine similarity score > top score * ratio
825
+ def getTopByScoreRelValue(self, i, ratio) :
826
+ # v = self.QTFIDF[i]
827
+ # vt = v.transpose()
828
+ # scores = self.TFIDF.dot(vt)[:, 0].todense()
829
+ # scores = np.squeeze(np.asarray(scores))
830
+ # df = pd.DataFrame()
831
+ # df[0] = scores
832
+ # df[1] = self.pmfrefs
833
+
834
+ v1 = self.QTFIDF[i]
835
+ v2 = self.AQTFIDF[i]
836
+ df = pd.DataFrame()
837
+ df[0] = self.getScores(v1, v2, i)
838
+ df[1] = self.pmfrefs
839
+
840
+ df.sort_values(0, ascending = False, inplace = True)
841
+ value = df.iloc[0, 0]
842
+ df1 = df.loc[df[0] > value * ratio]
843
+ ids = df1[1]
844
+
845
+ return ids.tolist()
846
+
847
    # calculate metrics for letters with TF-IDF cosine similarity score > top score * ratio
    def test_TFIDF_ratio(self, ratio = .9) :
        """Evaluate retrieval with a relative threshold: keep results scoring
        above `ratio` times the best score for that question.

        Prints the overall hit ratio, mean recall/precision/F1 and the mean
        result-list size. Additionally sanity-checks per question that F1 lies
        between recall and precision (it must, being their harmonic mean).
        """
        N = len(self.qtext)
        allhits = 0
        allrefs = 0
        recall = []
        precision = []
        f1 = []
        topsize = []
        count = 0  # NOTE(review): never incremented in this variant -- kept for symmetry with test_TFIDF_value

        for i in range(N) :
            refs = set(self.filtered_refs[i])  # gold citations for question i
            resp = self.getTopByScoreRelValue(i, ratio)
            serp = set(resp)
            hits = len(refs & serp)
            top = len(resp)
            topsize.append(top)

            tp = hits
            fp = top - tp
            fn = len(refs) - hits

            r = 0
            p = 0
            f = 0

            if tp == 0 :
                if fp == 0 and fn == 0 :
                    # nothing to find and nothing returned -- perfect score
                    recall.append(1)
                    precision.append(1)
                    f1.append(1)
                    r = 1
                    p = 1
                    f = 1
                else :
                    recall.append(0)
                    precision.append(0)
                    f1.append(0)

            else :
                recall.append(tp / (tp + fn))
                precision.append(tp / (tp + fp))
                f1.append(2 * tp / (2 * tp + fp + fn))
                r = tp / (tp + fn)
                p = tp / (tp + fp)
                f = 2 * tp / (2 * tp + fp + fn)

            # F1 is a harmonic mean, so it must lie between recall and precision
            if (f > r and f > p) or (f < r and f < p) :
                print('ERROR :', i, r, p, f)

            allhits += hits
            allrefs += len(refs)

        # +.00001 guards against division by zero when there are no gold refs at all
        print('\ntotal: ', allhits, allrefs, allhits / (allrefs + .00001))
        print('mean recall:', sum(recall) / len(recall))
        print('mean precision:', sum(precision) / len(precision))
        print('mean F1:', sum(f1) / len(f1))
        print('mean top size: ', sum(topsize) / len(topsize))
909
+
910
+ # def getTopForQuery(self, i, top, query) :
911
+ # v = QTFIDF[i]
912
+ # vt = v.transpose()
913
+ # scores = TFIDF.dot(vt)[:, 0].todense()
914
+ # scores = np.squeeze(np.asarray(scores))
915
+ # df = pd.DataFrame()
916
+ # df[0] = scores
917
+ # df[1] = pmfrefs
918
+
919
+ # df.sort_values(0, ascending = False, inplace = True)
920
+ # # df.sort_values(0, ascending = True, inplace = True)
921
+ # # ids = df.index
922
+ # ids = df[1]
923
+ # # print(df)
924
+
925
+ # return ids[:top].tolist()
926
+
927
+ def load_everything(self, data_directory = 'data') :
928
+ self.load_basic_data(data_directory=data_directory)
929
+ self.load_text_processing()
930
+ s = '|()><.,!?:;=*-/\\8. Форма \n \r Cчета-фактуры и порядок его заполнения, формы и порядок ведения журнала учета полученных и выставленных счетов-фактур, книг покупок и книг продаж устанавливаются Правительством Российской Федерации.'
931
+ print(self.analyze(s))
932
+ self.load_medium_dataset(path=os.path.join(data_directory, 'search_data', 'medium_dataset.json'))
933
+ self.create_filtered_refs()
934
+ self.create_corpora()
935
+ print(len(self.pmfcorpus))
936
+ self.create_TFIDF()
937
+
938
+ def test_everything(self) :
939
+ self.test_TFIDF_top(top = 40)
940
+ self.test_TFIDF_value(value = .2)
941
+ self.test_TFIDF_ratio(ratio = .9)
942
+
943
+ def search(self, query, top = 10) :
944
+ analyzed_query = self.analyze(query)
945
+
946
+ query_TF = self.vectorizer.transform([analyzed_query])
947
+ query_TFIDF = self.transformer.transform(query_TF)
948
+ n = np.sqrt(query_TFIDF.multiply(query_TFIDF).sum(axis = 1))
949
+ query_TFIDF = query_TFIDF.multiply(sparse.csr_matrix(1 / n))
950
+
951
+ query_ATF = self.avectorizer.transform([analyzed_query])
952
+ query_ATFIDF = self.atransformer.transform(query_ATF)
953
+
954
+ v1 = query_TFIDF[0]
955
+ v2 = query_ATFIDF[0]
956
+
957
+ # vt = v.transpose()
958
+ # scores = self.TFIDF.dot(vt)[:, 0].todense()
959
+ # scores = np.squeeze(np.asarray(scores))
960
+ # df = pd.DataFrame()
961
+ # df[0] = scores
962
+ # df[1] = self.pmfrefs
963
+ # df[2] = self.norm
964
+ # df[3] = self.nk_mask
965
+
966
+ # df[0] *= np.log(df[2]) ** alpha
967
+ # df[0] *= (1 + df[3] * beta)
968
+ # df[0] += df[3] * gamma
969
+
970
+ # df[4] = self.laws_mask
971
+ # df[0] *= (1 + df[4] * delta)
972
+ # df[0] += df[4] * epsilon
973
+
974
+ # df.sort_values(0, ascending = False, inplace = True)
975
+ # # df.sort_values(0, ascending = True, inplace = True)
976
+
977
+ # if top == 'auto' :
978
+ # value = df.iloc[0, 0]
979
+ # ratio = 0.81
980
+ # df1 = df.loc[df[0] > value * ratio]
981
+ # ids = df1[1]
982
+ # top = len(ids)
983
+ # else :
984
+ # ids = df[1][:top]
985
+
986
+ # # print(df)
987
+
988
+ df = pd.DataFrame()
989
+ df[0] = self.getScores(v1, v2)
990
+ # df[0] = self.getKNNScores(i)
991
+ df[1] = self.pmfrefs
992
+
993
+ df.sort_values(0, ascending = False, inplace = True)
994
+ # df.sort_values(0, ascending = True, inplace = True)
995
+
996
+ titles = df[1].tolist()
997
+ # titles = ids.tolist()
998
+ docs = []
999
+ for i in range(len(titles)) :
1000
+ id = df.iloc[i, 1]
1001
+ docs.append(self.dataset_medium[id])
1002
+ # print()
1003
+ # print (i, df.iloc[i, 0], id)
1004
+ # print(self.dataset_medium[id])
1005
+
1006
+ scores = df[0][:top].tolist()
1007
+
1008
+ return titles, docs, scores
1009
+
1010
+ # bsearch = BasicSearch('taxcode')
1011
+ # bsearch = BasicSearch('minfin-letters')
1012
+ # bsearch = BasicSearch('fns-letters')
1013
+ # bsearch = BasicSearch('other-laws')
1014
+ # bsearch = BasicSearch('all-docs', './data')
1015
+
1016
+ # bsearch.test_TFIDF_top(40)
1017
+
1018
+ # query = 'Форма счета-фактуры и порядок его заполнения'
1019
+ # titles, docs, scores = bsearch.search(query, top = 40)
1020
+
1021
+ # print()
1022
+ # print('top size', len(scores))
1023
+ # print('top score', scores[0])
1024
+ # print('\n', titles[0], ':\n')
1025
+ # print(docs[0])
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Flask search service image: Python 3.8 base, data pulled from a private
# Hugging Face dataset repo at build time via a build secret.
FROM python:3.8.10-slim
# Set Python to use unbuffered mode so logs appear immediately
ENV PYTHONUNBUFFERED 1
# Set the working directory in the container
RUN mkdir /var/www
ENV HOME /var/www
WORKDIR /var/www

# Copy the current directory contents into the container at /var/www
COPY . /var/www

# git is required to clone the data repository
RUN apt-get update && apt-get install --no-install-recommends -y git && \
    rm -rf /var/lib/apt/lists/*
# NOTE(review): the pip package providing the Hugging Face CLI is
# `huggingface_hub`, not `huggingface-cli` -- confirm this install succeeds.
RUN pip install huggingface-cli

# NOTE(review): `hf secrets login` does not look like a login command
# (token login is usually `huggingface-cli login --token ...`) -- verify.
RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true \
    hf secrets login $(cat /run/secrets/HF_TOKEN) && \
    hf repo clone myrushev/nn-legal-search-data /var/www/data


RUN pip install -r requirements.txt
# Pre-download the NLTK resources the analyzer needs into $HOME/nltk_data
RUN python -m nltk.downloader -d $HOME/nltk_data punkt stopwords

# Make port 7866 available to the world outside this container
EXPOSE 7866

# Tell Flask which module to serve
ENV FLASK_APP app.py

# Run app.py when the container launches
CMD flask run --host=0.0.0.0 --port=7866
app.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Flask entry point: builds two versions of the legal-document search engine
# at import time and exposes them over a small JSON HTTP API.
import json
from flask import Flask, jsonify, request
from BasicSearchV6 import BasicSearch as BasicSearchV6
from BasicSearchV5 import BasicSearch as BasicSearchV5

# test_everything() prints retrieval-quality metrics, so the startup log
# doubles as a smoke test for each engine.
search_v6 = BasicSearchV6(doctype='all-docs', data_directory='./data')
search_v6.test_everything()

search_v5 = BasicSearchV5(doctype='all-docs', data_directory='./data')
search_v5.test_everything()


app = Flask(__name__)
# Fix: flask==3.0.0 (pinned in requirements.txt) ignores the removed
# JSON_AS_ASCII config key; configure the JSON provider directly so
# Cyrillic text is returned verbatim instead of \u-escaped.
app.json.ensure_ascii = False

@app.route('/health', methods=['GET'])
def health():
    """Liveness probe."""
    return jsonify({"status": "ok"})

@app.route('/search', methods=['POST'])
def search_route():
    """Search endpoint.

    JSON body: {"query": str, "top": int (default 10), "version": 5|6 (default 6)}.
    Returns a JSON list of {"title", "text", "relevance"} objects.
    """
    # silent=True yields None instead of raising on a missing/invalid body
    data = request.get_json(silent=True) or {}
    query = data.get('query', '')
    top = data.get('top', 10)
    version = data.get('version', 6)

    engine = search_v6 if version == 6 else search_v5
    titles, docs, scores = engine.search(query, top)

    result = [{'title': str(title), 'text': str(doc), 'relevance': str(score)}
              for title, doc, score in zip(titles, docs, scores)]
    return jsonify(result)

if __name__ == '__main__':

    app.run(debug=False, host='0.0.0.0', port=7866)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ scikit-learn==1.3.2
2
+ pandas==2.0.3
3
+ numpy==1.24.4
4
+ regex==2023.10.3
5
+ nltk==3.8.1
6
+ flask==3.0.0