Spaces:
Build error
Build error
Changed search
Browse files- BasicSearchV7.py +1038 -0
- Dockerfile +2 -1
- app.py +8 -15
BasicSearchV7.py
ADDED
|
@@ -0,0 +1,1038 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 4 |
+
from sklearn.feature_extraction.text import TfidfTransformer
|
| 5 |
+
from scipy import sparse
|
| 6 |
+
import re
|
| 7 |
+
from xml.dom.minidom import parseString #, parse
|
| 8 |
+
import os
|
| 9 |
+
import sys
|
| 10 |
+
import json
|
| 11 |
+
|
| 12 |
+
# Previous hand-tuned weight set (kept for reference):
# alpha = 1.15
# beta = .2
# gamma = .4
# delta = .31
# epsilon = 0

# Scoring hyper-parameters consumed by BasicSearch.getScores():
alpha = 0        # exponent on log(document TF-IDF norm); 0 disables the length factor
beta = .55       # multiplicative boost for tax-code (НКРФ) documents
gamma = .0       # additive bonus for tax-code documents (disabled)
delta = .2       # multiplicative boost for other-law documents (laws_mask)
epsilon = 0      # additive bonus for other-law documents (disabled)
zeta = .65       # weight of the direct TF-IDF score for НКРФ docs with no KNN signal
|
| 24 |
+
|
| 25 |
+
# stemmer class
class Porter:
    """Porter (Snowball) stemmer for Russian.

    Every suffix pattern operates on the RV region of a word — everything
    after the first vowel.  The class is stateless: ``stem`` is a static
    method, so both ``Porter.stem(w)`` and ``Porter().stem(w)`` work
    (callers in this file use an instance created by
    ``load_text_processing``).
    """

    PERFECTIVEGROUND = re.compile("((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$")
    REFLEXIVE = re.compile("(с[яь])$")
    ADJECTIVE = re.compile("(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$")
    PARTICIPLE = re.compile("((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$")
    VERB = re.compile("((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$")
    NOUN = re.compile("(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$")
    RVRE = re.compile("^(.*?[аеиоуыэюя])(.*)$")
    DERIVATIONAL = re.compile(".*[^аеиоуыэюя]+[аеиоуыэюя].*ость?$")
    DER = re.compile("ость?$")
    SUPERLATIVE = re.compile("(ейше|ейш)$")
    I = re.compile("и$")
    P = re.compile("ь$")
    NN = re.compile("нн$")

    @staticmethod
    def stem(word):
        """Return the stem of *word* (expected to already be lower-case)."""
        word = word.replace('ё', 'е')
        m = re.match(Porter.RVRE, word)
        if m and m.groups():
            pre = m.group(1)   # prefix up to and including the first vowel
            rv = m.group(2)    # RV region: all suffix stripping happens here
            # Step 1: perfective gerund; otherwise drop a reflexive ending
            # and try adjective (+ participle), then verb, then noun endings.
            temp = Porter.PERFECTIVEGROUND.sub('', rv, 1)
            if temp == rv:
                rv = Porter.REFLEXIVE.sub('', rv, 1)
                temp = Porter.ADJECTIVE.sub('', rv, 1)
                if temp != rv:
                    rv = temp
                    rv = Porter.PARTICIPLE.sub('', rv, 1)
                else:
                    temp = Porter.VERB.sub('', rv, 1)
                    if temp == rv:
                        rv = Porter.NOUN.sub('', rv, 1)
                    else:
                        rv = temp
            else:
                rv = temp

            # Step 2: drop a trailing 'и'.
            rv = Porter.I.sub('', rv, 1)

            # Step 3: strip the derivational suffix 'ость'/'ост'.
            if re.match(Porter.DERIVATIONAL, rv):
                rv = Porter.DER.sub('', rv, 1)

            # Step 4: final soft sign, superlative, and double-н cleanup.
            temp = Porter.P.sub('', rv, 1)
            if temp == rv:
                rv = Porter.SUPERLATIVE.sub('', rv, 1)
                rv = Porter.NN.sub('н', rv, 1)
            else:
                rv = temp
            word = pre + rv
        return word
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
class BasicSearch:
|
| 82 |
+
# constructor function
|
| 83 |
+
def __init__(self, doctype = 'all-docs', data_directory = './') :
|
| 84 |
+
self.doctype = doctype
|
| 85 |
+
self.load_everything(data_directory=data_directory)
|
| 86 |
+
|
| 87 |
+
def read_xml(self, path):
|
| 88 |
+
with open(path, "r", encoding="utf-8") as text_file:
|
| 89 |
+
data = text_file.read()
|
| 90 |
+
|
| 91 |
+
document = parseString('<data>' + data + '</data>')
|
| 92 |
+
return [
|
| 93 |
+
document.getElementsByTagName('title'),
|
| 94 |
+
document.getElementsByTagName('text')
|
| 95 |
+
]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
    def getRefsNK(self, s) :
        """Extract tax-code article references written as ' ст. N ... нк рф'.

        Scans s for a ' ст.' marker followed within 13 characters by
        'нк рф', normalizes each hit to the canonical 'Статья N' key and
        keeps only keys present in self.refid.  Returns the matches as a
        de-duplicated list (order unspecified).
        """
        i = 0
        refs = set()
        x = 0
        while x != -1 :
            # Next ' ст.' marker (case-insensitive).
            x = s.lower().find(' ст.', x)
            if x != -1 :
                y = s.lower().find('нк рф', x)
                if y != -1 :
                    dx = 4
                    # Skip an optional space between 'ст.' and the number.
                    if s[x + dx] == ' ' :
                        dx = 5
                    # Plausible marker-to-anchor distance for an article number.
                    if y - x <= 13 and y - x > 5 :
                        ref = 'Статья ' + s[x + dx: y - 1]
                        if ref in self.refid :
                            refs.add(ref)
                        x = y
                    else :
                        # Implausible span; step forward and rescan.
                        x += 1
            i += 1
            # Safety valve: the scan cannot loop more than 1000 iterations
            # (x does not advance when 'нк рф' is missing after a marker).
            if i > 1000 :
                break
        return list(refs)
|
| 126 |
+
|
| 127 |
+
def getRefsNK1(self, s, debug = False, altrefs = set()) :
|
| 128 |
+
i = 0
|
| 129 |
+
refs = set()
|
| 130 |
+
x = 0
|
| 131 |
+
slen = len(s)
|
| 132 |
+
|
| 133 |
+
s0 = s
|
| 134 |
+
s = s.replace('(',' ')
|
| 135 |
+
s = s.replace(')',' ')
|
| 136 |
+
s = s.replace(';',' ')
|
| 137 |
+
s = s.replace(':',' ')
|
| 138 |
+
s = s.replace(',',' ')
|
| 139 |
+
|
| 140 |
+
while x != -1 :
|
| 141 |
+
# print(x)
|
| 142 |
+
x1 = s.lower().find('нк рф', x)
|
| 143 |
+
if x1 == -1 :
|
| 144 |
+
break
|
| 145 |
+
|
| 146 |
+
# print(x)
|
| 147 |
+
x2 = x1 - 12
|
| 148 |
+
x2 = max(x2, 0)
|
| 149 |
+
|
| 150 |
+
x31 = s.lower().find('ст.', x2)
|
| 151 |
+
x32 = s.lower().find('ьей', x2)
|
| 152 |
+
x33 = s.lower().find('ьёй', x2)
|
| 153 |
+
x34 = s.lower().find('ями', x2)
|
| 154 |
+
x35 = s.lower().find('тьи', x2)
|
| 155 |
+
x36 = s.lower().find('тье', x2)
|
| 156 |
+
|
| 157 |
+
if x31 == -1 :
|
| 158 |
+
x31 = slen
|
| 159 |
+
if x32 == -1 :
|
| 160 |
+
x32 = slen
|
| 161 |
+
if x33 == -1 :
|
| 162 |
+
x33 = slen
|
| 163 |
+
if x34 == -1 :
|
| 164 |
+
x34 = slen
|
| 165 |
+
if x35 == -1 :
|
| 166 |
+
x35 = slen
|
| 167 |
+
if x36 == -1 :
|
| 168 |
+
x36 = slen
|
| 169 |
+
|
| 170 |
+
x3 = min(x31, x32, x33, x34, x35, x36)
|
| 171 |
+
# print(x1, x2, x3)
|
| 172 |
+
# if x3 > x1 :
|
| 173 |
+
# print('not found: ', s0[x2 : x1 + 5])
|
| 174 |
+
|
| 175 |
+
x = x3
|
| 176 |
+
# print(x)
|
| 177 |
+
|
| 178 |
+
if x != -1 :
|
| 179 |
+
# x += 1
|
| 180 |
+
y = s.lower().find('нк рф', x)
|
| 181 |
+
if y != -1 :
|
| 182 |
+
# print(i)
|
| 183 |
+
# print(y)
|
| 184 |
+
# print(s)
|
| 185 |
+
dx = 3
|
| 186 |
+
if s[x + dx] == ' ' :
|
| 187 |
+
dx += 1
|
| 188 |
+
if y - x <= 13 and y - x > 4 :
|
| 189 |
+
# print(s[x + 4: y + 5])
|
| 190 |
+
ref = 'Статья ' + s[x + dx: y - 1]
|
| 191 |
+
if ref in self.refid :
|
| 192 |
+
refs.add(ref)
|
| 193 |
+
if debug and (ref not in altrefs):
|
| 194 |
+
print('...' + s0[y - 40 : y + 5])
|
| 195 |
+
x = y + 1
|
| 196 |
+
else :
|
| 197 |
+
# print('error: ', s[x + 4: y + 5])
|
| 198 |
+
x += 1
|
| 199 |
+
|
| 200 |
+
i += 1
|
| 201 |
+
if i > 1000 :
|
| 202 |
+
break
|
| 203 |
+
return list(refs)
|
| 204 |
+
|
| 205 |
+
    def getRefsNK2(self, s, debug = False, altrefs = set()) :
        """Like getRefsNK1 but for the compact spelling 'нкрф' (no space),
        as used in the references file; only the 'ст.' marker is recognized.

        NOTE(review): the mutable default ``altrefs = set()`` is shared
        across calls; it is only read here, so this is a latent hazard
        rather than an active bug.
        """
        i = 0
        refs = set()
        x = 0
        slen = len(s)

        # Keep the untouched text for debug context; blank out punctuation
        # that may sit between the article number and 'нкрф'.
        s0 = s
        s = s.replace('(',' ')
        s = s.replace(')',' ')
        s = s.replace(';',' ')
        s = s.replace(':',' ')
        s = s.replace(',',' ')

        while x != -1 :
            # Anchor on the next 'нкрф'; stop when there are no more.
            x1 = s.lower().find('нкрф', x)
            if x1 == -1 :
                break

            # The article marker should begin at most 12 chars before the anchor.
            x2 = x1 - 12
            x2 = max(x2, 0)

            x3 = s.lower().find('ст.', x2)

            # x3 may be -1 (no marker anywhere after x2), which ends the loop.
            x = x3

            if x != -1 :
                y = s.lower().find('нкрф', x)
                if y != -1 :
                    dx = 3
                    # Skip an optional space after 'ст.'.
                    if s[x + dx] == ' ' :
                        dx += 1
                    # Plausible marker-to-anchor distance.
                    if y - x <= 13 and y - x > 4 :
                        ref = 'Статья ' + s[x + dx: y - 1]
                        if ref in self.refid :
                            refs.add(ref)
                            if debug and (ref not in altrefs):
                                print('...' + s0[y - 40 : y + 5])
                        x = y + 1
                    else :
                        # Implausible span; step past the marker and rescan.
                        x += 1

            i += 1
            # Safety valve: never loop more than 1000 iterations.
            if i > 1000 :
                break
        return list(refs)
|
| 263 |
+
|
| 264 |
+
    # read data
    def load_basic_data(self, data_directory = 'data') :
        """Read the tax code, the K2 question/answer collections and the
        reference annotations, then build the reference indexes.

        Populates: title/text (tax-code articles), atitle/atext (answers),
        qtitle/qtext (questions), refid/titleref/idref (article indexes),
        nk_refs (per-question canonical 'ст.N НКРФ' references) and
        questions/answers/added_refs/missed_refs from the JSON dumps.
        """
        self.title, self.text = self.read_xml(os.path.join(data_directory, 'taxcode.xml'))
        self.atitle, self.atext = self.read_xml(os.path.join(data_directory, 'K2-answer.xml'))
        self.qtitle, self.qtext = self.read_xml(os.path.join(data_directory, 'K2-question.xml'))

        _, reftext = self.read_xml(os.path.join(data_directory, 'references-04-12-2023.xml'))

        # NOTE(review): [set()] * n puts the SAME set object at every index.
        # Harmless here only because every slot is reassigned in the loop
        # below before the in-place |= updates — confirm qtitle and qtext
        # always have equal length.
        reflist = [set()] * len(self.qtitle)
        reflist1 = [set()] * len(self.qtitle)
        qreflist = [set()] * len(self.qtitle)

        # 'Статья N. Title (...)' -> 'Статья N'
        def getRefNK(s) :
            x = s.find('. ')
            y = s.find(' (')
            if x == -1 :
                x = sys.maxsize
            if y == -1 :
                y = sys.maxsize
            x = min(x, y)
            id = s[:x]
            return id

        # Index tax-code articles: canonical id <-> position <-> full title.
        self.refid = {}
        self.titleref = {}
        self.idref = [0] * len(self.title)
        for i in range(len(self.title)) :
            s = self.title[i].firstChild.nodeValue
            id = getRefNK(s)
            self.refid[id] = i
            self.titleref[s] = id
            self.idref[i] = id

        # Collect article references cited in answers, questions and the
        # hand-made references file.
        for i in range(len(self.qtext)) :
            doctext = self.atext[i].firstChild.nodeValue
            qdoctext = self.qtext[i].firstChild.nodeValue
            refdoctext = reftext[i].firstChild.nodeValue
            refs = self.getRefsNK1(doctext)
            qrefs = self.getRefsNK1(qdoctext)
            refs1 = self.getRefsNK2(refdoctext)
            intrefs = []
            intrefs1 = []
            intqrefs = []
            for ref in refs :
                intrefs.append(self.refid[ref])
            for ref in refs1 :
                intrefs1.append(self.refid[ref])
            for ref in qrefs :
                intqrefs.append(self.refid[ref])
            reflist[i] = set(intrefs)
            reflist1[i] = set(intrefs1)
            # NOTE(review): qreflist is computed but never stored on self
            # and not used below — possibly vestigial.
            qreflist[i] = set(intqrefs)

        # Merge answer-derived and file-derived references.
        for i in range(len(reflist)) :
            reflist[i] |= reflist1[i]

        # Convert indexes back to the canonical short form 'ст.N НКРФ'
        # used by the rest of the pipeline.
        self.nk_refs = []

        for i in range(len(reflist)) :
            refs = list(reflist[i])
            newrefs = []
            for j in range(len(refs)) :
                ref = self.idref[refs[j]]
                # NOTE(review): non-raw pattern — '\d' still works but emits
                # an invalid-escape warning on recent Pythons; prefer r'...'.
                m = re.search('(\d+\.\d+|\d+)', ref)
                s = ref[m.start() : m.end()]
                ref1 = 'ст.' + s + ' НКРФ'
                newrefs.append(ref1)

            self.nk_refs.append(newrefs)

        # reading Vlad's json data
        datadir = os.path.join(data_directory, 'data_jsons_20240119')
        filelist = os.listdir(datadir)
        filelist = [x for x in filelist if re.search(r'\d+.json', x)]
        filelist.sort()

        questions = [''] * len(filelist)
        answers = [''] * len(filelist)
        # NOTE(review): [[]] * n shares one list object across all slots;
        # only safe because slots are reassigned, never mutated in place.
        added_refs = [[]] * len(filelist)
        missed_refs = [[]] * len(filelist)
        count = 0
        for filename in filelist :
            x = filename.find('.')
            if x == -1 :
                print('ERROR :', filename)
            # File names are expected to be '<index>.json'; the numeric
            # stem doubles as the slot index.
            if filename[:x].isnumeric() :
                i = int(filename[:x])
                with open(os.path.join(datadir, filename), 'r', encoding='utf-8') as f:
                    d = json.load(f)
                    refs = set(d['added_refs'].keys())
                    refs -= {''}
                    refs = list(refs)
                    questions[i] = d['question']
                    answers[i] = d['answer']
                    missed_refs[i] = d['refs']
                    added_refs[i] = refs
                    count += 1

        self.questions = questions#[:count]
        self.answers = answers#[:count]
        self.added_refs = added_refs#[:count]
        self.missed_refs = missed_refs#[:count]
|
| 383 |
+
|
| 384 |
+
    def load_text_processing(self) :
        """Prepare tokenization helpers: a Russian stop-word set and a stemmer."""
        # Hard-coded copy of a Russian stop-word list (originally
        # stopwords.words('russian')) so no nltk download is needed at runtime.
        self.stop_words = {'а', 'без', 'более', 'больше', 'будет', 'будто', 'бы', 'был', 'была', 'были', 'было', 'быть', 'в', 'вам', 'вас', 'вдруг', 'ведь', 'во', 'вот', 'впрочем', 'все', 'всегда', 'всего', 'всех', 'всю', 'вы', 'где', 'да', 'даже', 'два', 'для', 'до', 'другой', 'его', 'ее', 'ей', 'ему', 'если', 'есть', 'еще', 'ж', 'же', 'за', 'зачем', 'здесь', 'и', 'из', 'или', 'им', 'иногда', 'их', 'к', 'как', 'какая', 'какой', 'когда', 'конечно', 'кто', 'куда', 'ли', 'лучше', 'между', 'меня', 'мне', 'много', 'может', 'можно', 'мой', 'моя', 'мы', 'на', 'над', 'надо', 'наконец', 'нас', 'не', 'него', 'нее', 'ней', 'нельзя', 'нет', 'ни', 'нибудь', 'никогда', 'ним', 'них', 'ничего', 'но', 'ну', 'о', 'об', 'один', 'он', 'она', 'они', 'опять', 'от', 'перед', 'по', 'под', 'после', 'потом', 'потому', 'почти', 'при', 'про', 'раз', 'разве', 'с', 'сам', 'свою', 'себе', 'себя', 'сейчас', 'со', 'совсем', 'так', 'такой', 'там', 'тебя', 'тем', 'теперь', 'то', 'тогда', 'того', 'тоже', 'только', 'том', 'тот', 'три', 'тут', 'ты', 'у', 'уж', 'уже', 'хорошо', 'хоть', 'чего', 'чем', 'через', 'что', 'чтоб', 'чтобы', 'чуть', 'эти', 'этого', 'этой', 'этом', 'этот', 'эту', 'я'}
        # Pure-Python Porter stemmer (class defined above); replaces the
        # earlier SnowballStemmer("russian") dependency.
        self.stemmer = Porter()
|
| 395 |
+
|
| 396 |
+
def analyze(self, s) :
|
| 397 |
+
template = r'[\'\"\.\,\?\!\:\;\-\+\%\^\&\*\@\~\_\=/\\\>\<\#\$\(\)\|\n\r\d]'
|
| 398 |
+
s = re.sub(template, ' ', s)
|
| 399 |
+
# template = r'( \w |^\w | \w$)'
|
| 400 |
+
# s = re.sub(template, ' ', s)
|
| 401 |
+
# s = re.sub(' +', ' ', s)
|
| 402 |
+
s = ' '.join( [w for w in s.split() if len(w) > 1] )
|
| 403 |
+
# tokens = nlp(s)
|
| 404 |
+
# tokens = [str(t.lemma_) for t in tokens]
|
| 405 |
+
# tokens = word_tokenize(s)
|
| 406 |
+
tokens = s.strip().lower().split(' ')
|
| 407 |
+
# tokens = [t for t in tokens if t not in self.stop_words and t != ' ']
|
| 408 |
+
# tokens = [self.stemmer.stem(word) for word in tokens]
|
| 409 |
+
tokens = [self.stemmer.stem(word) for word in tokens if word not in self.stop_words]
|
| 410 |
+
newtext = ' '.join(tokens)
|
| 411 |
+
return newtext
|
| 412 |
+
|
| 413 |
+
# load medium dataset
|
| 414 |
+
def load_medium_dataset(self, path) :
|
| 415 |
+
# global dataset_medium
|
| 416 |
+
with open(path, 'r', encoding='utf-8') as infile:
|
| 417 |
+
self.dataset_medium = json.load(infile)
|
| 418 |
+
|
| 419 |
+
for i in range(len(self.atext)) :
|
| 420 |
+
question = self.qtext[i].firstChild.nodeValue.strip()
|
| 421 |
+
answer = self.atext[i].firstChild.nodeValue.strip()
|
| 422 |
+
title = self.atitle[i].firstChild.nodeValue.strip()
|
| 423 |
+
title = 'Консультация ' + title
|
| 424 |
+
text = 'Вопрос: ' + question + '\n' + 'Ответ: ' + answer
|
| 425 |
+
self.dataset_medium[title] = text
|
| 426 |
+
|
| 427 |
+
# data_path = "./legal_info_search_data/data_jsons_20240119"
|
| 428 |
+
# all_docs = {}
|
| 429 |
+
# for filename in os.listdir(data_path):
|
| 430 |
+
# with open(os.path.join(data_path, filename), "r", encoding="utf-8") as f:
|
| 431 |
+
# all_docs[int(filename.split(".")[0])] = json.load(f)
|
| 432 |
+
|
| 433 |
+
# # filter out docs with no added_refs
|
| 434 |
+
# dataset_small = {}
|
| 435 |
+
# for key, value in all_docs.items() :
|
| 436 |
+
# added_refs = value['added_refs']
|
| 437 |
+
# dataset_small.update(added_refs)
|
| 438 |
+
|
| 439 |
+
# # self.dataset_medium = dataset_small
|
| 440 |
+
|
| 441 |
+
# dataset_new = {}
|
| 442 |
+
# for key in dataset_small :
|
| 443 |
+
# m = re.search(r'(ст.(\d+\.\d+|\d+) [НГТ]КРФ|Федеральный закон|Постановление Правительства РФ|Приказ ФНС РФ|Решение Коллегии Евразийской экономической комиссии)', key)
|
| 444 |
+
# s = key
|
| 445 |
+
# if m != None :
|
| 446 |
+
# s = key[m.start() : ]
|
| 447 |
+
|
| 448 |
+
# if s in self.dataset_medium :
|
| 449 |
+
# dataset_new[s] = self.dataset_medium[s]
|
| 450 |
+
# elif s in dataset_small :
|
| 451 |
+
# dataset_new[s] = dataset_small[s]
|
| 452 |
+
# else :
|
| 453 |
+
# dataset_new[key] = dataset_small[key]
|
| 454 |
+
# # print(key, 'is absent')
|
| 455 |
+
|
| 456 |
+
# self.dataset_medium = dataset_new
|
| 457 |
+
|
| 458 |
+
    # create a filtered list of references for Vlad's json data
    def create_filtered_refs(self) :
        """Keep, per question, only the added_refs that match self.doctype.

        Sets self.filtered_refs (list of reference lists aligned with
        self.added_refs) and self.doctype_template (reused later by
        create_corpora to filter the medium corpus titles).
        """
        doctype = self.doctype
        added_refs = self.added_refs

        # doctype_template filters document titles; ref_template filters
        # reference strings (tax-code refs are anchored differently).
        if doctype == 'court-decisions' :
            doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд)' # courts' decisions
            ref_template = doctype_template
        elif doctype == 'minfin-letters' :
            doctype_template = r'[Пп]исьмо [Мм]инфина' # Minfin letters
            ref_template = doctype_template
        elif doctype == 'fns-letters' :
            doctype_template = r'[Пп]исьмо (ФНС|фнс)' # FNS letters
            ref_template = doctype_template
        elif doctype == 'all-letters' :
            doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс))' # courts' decisions + Minfin letters + FNS letters
            ref_template = doctype_template
        elif doctype == 'taxcode' :
            doctype_template = r'^ст.(\d+\.\d+|\d+) НКРФ'
            ref_template = r'ст.(\d+\.\d+|\d+) НКРФ' # taxcode ref format differs from doctype format
        elif doctype == 'other-laws' :
            doctype_template = r'(^ст.(\d+\.\d+|\d+) [ГТ]КРФ|^Федеральный закон|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии)' # other codes, laws, decrees
            ref_template = r'(ст.(\d+\.\d+|\d+) [ГТ]КРФ|Федеральный закон|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии)' # ref format differs from doctype format
        elif doctype == 'consultations' :
            doctype_template = 'Консультация'
            ref_template = 'Консультация'
        elif doctype == 'all-docs' :
            doctype_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс)|^ст.(\d+\.\d+|\d+) НКРФ|^ст.(\d+\.\d+|\d+) [ГТ]КРФ|^Федеральный закон|Приказ ФНС РФ|Постановление Правительства РФ|^Решение Коллегии Евразийской экономической комиссии|Консультация)' # everything above combined
            ref_template = r'(ФАС |АС |КС |ВС |[Сс]удебн|[Сс]уд|[Пп]исьмо [Мм]инфина|[Пп]исьмо (ФНС|фнс)|ст.(\d+\.\d+|\d+) НКРФ|ст.(\d+\.\d+|\d+) [ГТ]КРФ|Федеральный закон|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии|Консультация)' # ref format differs from doctype format
        else :
            # NOTE(review): on an unknown doctype this only prints; both
            # templates stay unbound and the loop below raises NameError.
            # Raising ValueError here would fail faster and clearer.
            print('Error : wrong doctype "' + doctype + '"')

        filtered_refs = []
        # NOTE(review): nk_mask here is never filled or used — vestigial.
        nk_mask = []
        for i in range(len(added_refs)) :
            refs = []
            for j in range(len(added_refs[i])) :
                s = added_refs[i][j]
                if re.search(ref_template, s) != None:
                    # Trim any prefix before the canonical reference start.
                    m = re.search(r'(ст.(\d+\.\d+|\d+) [НГТ]КРФ|Федеральный закон|Постановление Правительства РФ|Приказ ФНС РФ|Решение Коллегии Евразийской экономической комиссии)', s)
                    if m != None :
                        s = s[m.start() : ]

                    # Keep only references that resolve to a known document.
                    if s in self.dataset_medium :
                        refs.append(s)

            # For tax-code doctypes also merge the refs mined from texts.
            if doctype_template.find('НКРФ') != -1 :
                refs += self.nk_refs[i]

            refs = list(set(refs))
            filtered_refs.append(refs)

        self.filtered_refs = filtered_refs
        self.doctype_template = doctype_template
|
| 519 |
+
|
| 520 |
+
    # creating corpora for TF-IDF embedding
    def create_corpora(self) :
        """Tokenize/stem all texts into three corpora: questions (qcorpus),
        answers (acorpus) and medium documents (pmfcorpus), plus the
        index-aligned pmfrefs / nk_mask / laws_mask bookkeeping lists."""
        # Question corpus: title + body, normalized by analyze().
        self.qcorpus = []
        for i in range(len(self.qtext)) :
            if not i % 100 : print(i, end = ' ')   # progress indicator
            s = self.qtitle[i].firstChild.nodeValue + ' ' + self.qtext[i].firstChild.nodeValue
            s = self.analyze(s)
            self.qcorpus.append(s)

        # Answer corpus (same length as the question corpus).
        self.acorpus = []
        for i in range(len(self.qtext)) :
            s = self.atext[i].firstChild.nodeValue
            s = self.analyze(s)
            self.acorpus.append(s)

        # Medium corpus: every dataset_medium entry whose title matches the
        # configured doctype filter.  The lists below stay index-aligned
        # with pmfcorpus.
        self.pmfcorpus = []
        self.pmfrefs = []
        self.pmflengths = []
        self.nk_mask = []
        self.laws_mask = []

        i = 0
        self.items = []
        for key, value in self.dataset_medium.items() :
            if re.search(self.doctype_template, key) != None :
                s = value
                ss = key
                # Normalize the title to its canonical reference form.
                m = re.search(r'(ст.(\d+\.\d+|\d+) [НГТ]КРФ|Федеральный закон|Постановление Правительства РФ|Приказ ФНС РФ|Решение Коллегии Евразийской экономической комиссии)', ss)
                if m != None :
                    ss = ss[m.start() : ]

                if s != None :
                    s = s.replace('\n', ' ')
                # Skip empty / single-token documents.
                if s != None and s.count(' ') :
                    if not i % 100 : print(i, end = ' ')   # progress indicator
                    s = self.analyze(s)
                    if s.count(' ') :
                        self.pmfcorpus.append(s)
                        self.pmfrefs.append(ss)

                        # 1 when the document is a tax-code article.
                        if re.search(r'НКРФ', ss) :
                            self.nk_mask.append(1)
                        else:
                            self.nk_mask.append(0)

                        # 1 when the document is another law/regulation.
                        if re.search(r'([ГТ]КРФ|Федеральный закон|Приказ ФНС РФ|Постановление Правительства РФ|Решение Коллегии Евразийской экономической комиссии)', ss) :
                            self.laws_mask.append(1)
                        else:
                            self.laws_mask.append(0)

                i += 1

        # Reverse index: canonical reference -> row in the medium corpus.
        self.refids = {}
        for i in range(len(self.pmfrefs)) :
            key = self.pmfrefs[i]
            self.refids[key] = i
|
| 594 |
+
|
| 595 |
+
    # build up TF-IDF representation
    def create_TFIDF(self) :
        """Fit two TF-IDF models: one over the medium corpus (manually
        L2-normalized so the raw norms survive in self.norm) and one over
        the answers corpus (built-in L2 normalization)."""
        self.vectorizer = CountVectorizer()
        # norm=None keeps raw TF-IDF rows; normalization is done by hand below.
        self.transformer = TfidfTransformer(smooth_idf = False, norm = None, sublinear_tf = True)

        X = self.vectorizer.fit_transform(self.pmfcorpus)
        QX = self.vectorizer.transform(self.qcorpus)
        self.TFIDF = self.transformer.fit_transform(X)
        self.QTFIDF = self.transformer.transform(QX)

        # Row-wise L2 norms of the document matrix.
        # NOTE(review): an all-zero row gives n == 0 and a division by zero
        # below — confirm every corpus entry has at least one in-vocabulary
        # token (create_corpora drops near-empty documents).
        n = np.sqrt(self.TFIDF.multiply(self.TFIDF).sum(axis = 1))

        self.TFIDF = self.TFIDF.multiply(sparse.csr_matrix(1 / n))
        # Keep the pre-normalization norms: getScores uses them as a
        # document-length proxy.
        self.norm = n.flatten().tolist()[0]
        n = np.sqrt(self.QTFIDF.multiply(self.QTFIDF).sum(axis = 1))
        self.QTFIDF = self.QTFIDF.multiply(sparse.csr_matrix(1 / n))

        # Second model over answers only, with built-in L2 normalization.
        self.avectorizer = CountVectorizer()
        self.atransformer = TfidfTransformer(smooth_idf = False, norm = 'l2', sublinear_tf = True)

        AX = self.avectorizer.fit_transform(self.acorpus)
        AQX = self.avectorizer.transform(self.qcorpus)
        self.ATFIDF = self.atransformer.fit_transform(AX)
        self.AQTFIDF = self.atransformer.transform(AQX)
|
| 632 |
+
|
| 633 |
+
    # get top letters sorted by TF-IDF cosine similarity
    def getKNNScores(self, v, i = -1) :
        """Propagate answer-space similarity onto tax-code articles.

        v -- query vector in the answers TF-IDF space (row of self.AQTFIDF)
        i -- index of the query consultation itself; j == i is skipped so a
             query never votes through its own references (leave-one-out)

        For every consultation j, its similarity to the query is pushed to
        each 'ст.N НКРФ' reference it cites; an article keeps the maximum
        score pushed to it.  Returns a list aligned with self.refids.
        """
        vt = v.transpose()
        # Cosine similarity of the query against every consultation answer.
        ascores = self.ATFIDF.dot(vt)[:, 0].todense()
        ascores = np.squeeze(np.asarray(ascores))
        scores = [0] * len(self.refids)
        for j in range(len(self.filtered_refs)) :
            score = ascores[j]
            refs = self.filtered_refs[j]
            for k in range(len(refs)) :
                ref = refs[k]
                # Only tax-code references participate in the propagation.
                m = re.search(r'ст.(\d+\.\d+|\d+) НКРФ', ref)
                if i != j and m != None :
                    key = ref[m.start() : ]
                    if key in self.refids :
                        id = self.refids[key]
                        # Max-pool over all consultations citing this article.
                        if scores[id] < score :
                            scores[id] = score

        return scores
|
| 654 |
+
|
| 655 |
+
def getScores(self, v1, v2, i = -1) :
    """Combined relevance score for every indexed document.

    v1 : query vector in the main TF-IDF space (matches ``self.TFIDF``)
    v2 : query vector in the analyzed space (forwarded to getKNNScores)
    i  : index of the query document, excluded from reference propagation
         (-1 when the query is external)

    Blends plain cosine similarity with reference-propagated scores for
    tax-code articles (``self.nk_mask``), then applies the document-norm
    and law-document (``self.laws_mask``) weightings.  zeta, alpha, beta,
    gamma, delta and epsilon are module-level tuning constants defined
    elsewhere in this file.
    """
    text_sim = np.squeeze(np.asarray(self.TFIDF.dot(v1.transpose())[:, 0].todense()))
    ref_sim = np.asarray(self.getKNNScores(v2, i), dtype = float)

    nk = np.asarray(self.nk_mask)
    not_nk = 1 - nk
    # NK articles that received no reference score fall back to scaled text similarity
    no_ref = (1 - np.sign(ref_sim)) * nk

    combined = text_sim * not_nk + ref_sim + no_ref * text_sim * zeta

    combined *= np.log(np.asarray(self.norm)) ** alpha
    combined *= (1 + nk * beta)
    combined += nk * gamma

    laws = np.asarray(self.laws_mask)
    combined *= (1 + laws * delta)
    combined += laws * epsilon

    return combined.tolist()
|
| 682 |
+
|
| 683 |
+
def getTop(self, i, top) :
    """Return the titles of the ``top`` best-scoring documents for query i.

    Several index rows can share one title in ``self.pmfrefs``; duplicates
    are collapsed keeping the best-scoring occurrence, so fewer than
    ``top`` titles may come back.
    """
    df = pd.DataFrame()
    df[0] = self.getScores(self.QTFIDF[i], self.AQTFIDF[i], i)
    df[1] = self.pmfrefs

    df.sort_values(0, ascending = False, inplace = True)

    # dict.fromkeys de-duplicates while preserving the score order, keeping
    # the first (highest-scoring) occurrence of each title.  This replaces
    # the old O(n * top) `id not in filtered_ids` loop, its dead `score`
    # variable, and the loop index that shadowed the parameter `i`.
    return list(dict.fromkeys(df[1]))[:top]
|
| 708 |
+
|
| 709 |
+
def test_TFIDF_top(self, top = 40, metric = '') :
    # Evaluate retrieval quality when always taking the `top` highest-ranked
    # documents for every query.  Prints micro (pooled) recall plus mean
    # per-query recall / precision / F1.
    #
    # top    : size of the result list requested from getTop()
    # metric : 'corrected' also scores zero-hit queries (1.0 when there was
    #          nothing to find and nothing returned -- which cannot happen
    #          here since fp == top > 0 -- else 0.0); otherwise zero-hit
    #          queries with no true references are skipped entirely
    N = len(self.qtext)
    allhits = 0
    allrefs = 0
    recall = []
    precision = []
    f1 = []

    for i in range(N) :
        # if not i % 10 : print(i, end = ' ')
        refs = set(self.filtered_refs[i])
        resp = self.getTop(i, top)
        serp = set(resp)
        hits = len(refs & serp)

        allhits += hits
        allrefs += len(refs)

        tp = hits
        fp = top - tp
        fn = len(refs) - hits

        if tp == 0 and metric == 'corrected':
            if fp == 0 and fn == 0 :
                # print(i, len(refs), fp, fn)
                recall.append(1)
                precision.append(1)
                f1.append(1)
            else :
                # print(i, len(refs), fp, fn)
                recall.append(0)
                precision.append(0)
                f1.append(0)

        elif tp + fn > 0 :
            recall.append(tp / (tp + fn))
            precision.append(tp / (tp + fp))
            f1.append(2 * tp / (2 * tp + fp + fn))

    # the +.00001 guards against division by zero when there are no references
    print('\ntotal: ', allhits, allrefs, allhits / (allrefs + .00001))
    if len(recall)> 0 and len(precision) > 0 and len(f1) > 0 :
        print('mean recall:', sum(recall) / len(recall))
        print('mean precision:', sum(precision) / len(precision))
        print('mean F1:', sum(f1) / len(f1))
|
| 753 |
+
|
| 754 |
+
# get letters with TF-IDF cosine similarity score > value
def getTopByScoreValue(self, i, value) :
    """Titles of all documents whose combined score strictly exceeds
    ``value`` for query i, ordered from highest to lowest score."""
    ranking = pd.DataFrame({
        'score' : self.getScores(self.QTFIDF[i], self.AQTFIDF[i], i),
        'title' : self.pmfrefs,
    })
    ranking = ranking.sort_values('score', ascending = False)
    return ranking.loc[ranking['score'] > value, 'title'].tolist()
|
| 777 |
+
|
| 778 |
+
# calculate metrics for letters with TF-IDF cosine similarity score > value

def test_TFIDF_value(self, value = .4) :
    # Evaluate retrieval with an absolute score threshold: for each query,
    # every document scoring above `value` forms the result set.  Prints
    # micro recall, mean per-query recall / precision / F1, the mean
    # result-set size and how many queries returned anything at all.
    N = len(self.qtext)
    allhits = 0
    allrefs = 0
    recall = []
    precision = []
    f1 = []
    topsize = []   # result-set size per query
    count = 0      # queries with a non-empty result set

    for i in range(N) :
        # if not i % 10 : print(i, end = ' ')
        refs = set(self.filtered_refs[i])
        resp = self.getTopByScoreValue(i, value)
        serp = set(resp)
        hits = len(refs & serp)
        top = len(resp)
        topsize.append(top)

        if top > 0 :
            count += 1

        tp = hits
        fp = top - tp
        fn = len(refs) - hits

        if tp == 0 :
            if fp == 0 and fn == 0 :
                # nothing to find and nothing returned: counted as perfect
                recall.append(1)
                precision.append(1)
                f1.append(1)
            else :
                recall.append(0)
                precision.append(0)
                f1.append(0)

        else :
            recall.append(tp / (tp + fn))
            precision.append(tp / (tp + fp))
            f1.append(2 * tp / (2 * tp + fp + fn))

        allhits += hits
        allrefs += len(refs)

    # the +.00001 guards against division by zero when there are no references
    print('\ntotal: ', allhits, allrefs, allhits / (allrefs + .00001))
    print('mean recall:', sum(recall) / len(recall))
    print('mean precision:', sum(precision) / len(precision))
    print('mean F1:', sum(f1) / len(f1))
    print('mean top size: ', sum(topsize) / len(topsize))
    print('non-empty top:', count)
    # NOTE(review): 517 looks like a hard-coded dataset size -- it should
    # probably be N; verify before reusing on a different corpus
    print('non-empty top share:', count / 517)
|
| 831 |
+
|
| 832 |
+
# return topsize
|
| 833 |
+
|
| 834 |
+
# get letters with TF-IDF cosine similarity score > top score * ratio
def getTopByScoreRelValue(self, i, ratio) :
    """Titles of all documents whose combined score strictly exceeds
    ``ratio`` times the best score obtained for query i, best first."""
    ranking = pd.DataFrame({
        'score' : self.getScores(self.QTFIDF[i], self.AQTFIDF[i], i),
        'title' : self.pmfrefs,
    })
    ranking = ranking.sort_values('score', ascending = False)
    best = ranking.iloc[0, 0]
    return ranking.loc[ranking['score'] > best * ratio, 'title'].tolist()
|
| 856 |
+
|
| 857 |
+
# calculate metrics for letters with TF-IDF cosine similarity score > top score * ratio
def test_TFIDF_ratio(self, ratio = .9) :
    # Same evaluation as test_TFIDF_value, but with a per-query relative
    # threshold (score > best score * ratio).  Additionally sanity-checks
    # that each per-query F1 lies between its recall and precision.
    N = len(self.qtext)
    allhits = 0
    allrefs = 0
    recall = []
    precision = []
    f1 = []
    topsize = []   # result-set size per query
    count = 0      # NOTE(review): never incremented in this variant

    for i in range(N) :
        # if not i % 10 : print(i, end = ' ')
        refs = set(self.filtered_refs[i])
        resp = self.getTopByScoreRelValue(i, ratio)
        serp = set(resp)
        hits = len(refs & serp)
        top = len(resp)
        topsize.append(top)

        tp = hits
        fp = top - tp
        fn = len(refs) - hits

        r = 0
        p = 0
        f = 0

        if tp == 0 :
            if fp == 0 and fn == 0 :
                # nothing to find and nothing returned: counted as perfect
                recall.append(1)
                precision.append(1)
                f1.append(1)
                r = 1
                p = 1
                f = 1
            else :
                recall.append(0)
                precision.append(0)
                f1.append(0)

        else :
            recall.append(tp / (tp + fn))
            precision.append(tp / (tp + fp))
            f1.append(2 * tp / (2 * tp + fp + fn))
            r = tp / (tp + fn)
            p = tp / (tp + fp)
            f = 2 * tp / (2 * tp + fp + fn)

        # F1 is a harmonic mean, so it must lie between recall and precision
        if (f > r and f > p) or (f < r and f < p) :
            print('ERROR :', i, r, p, f)

        allhits += hits
        allrefs += len(refs)

    # the +.00001 guards against division by zero when there are no references
    print('\ntotal: ', allhits, allrefs, allhits / (allrefs + .00001))
    print('mean recall:', sum(recall) / len(recall))
    print('mean precision:', sum(precision) / len(precision))
    print('mean F1:', sum(f1) / len(f1))
    print('mean top size: ', sum(topsize) / len(topsize))
|
| 917 |
+
|
| 918 |
+
# return topsize
|
| 919 |
+
|
| 920 |
+
# def getTopForQuery(self, i, top, query) :
|
| 921 |
+
# v = QTFIDF[i]
|
| 922 |
+
# vt = v.transpose()
|
| 923 |
+
# scores = TFIDF.dot(vt)[:, 0].todense()
|
| 924 |
+
# scores = np.squeeze(np.asarray(scores))
|
| 925 |
+
# df = pd.DataFrame()
|
| 926 |
+
# df[0] = scores
|
| 927 |
+
# df[1] = pmfrefs
|
| 928 |
+
|
| 929 |
+
# df.sort_values(0, ascending = False, inplace = True)
|
| 930 |
+
# # df.sort_values(0, ascending = True, inplace = True)
|
| 931 |
+
# # ids = df.index
|
| 932 |
+
# ids = df[1]
|
| 933 |
+
# # print(df)
|
| 934 |
+
|
| 935 |
+
# return ids[:top].tolist()
|
| 936 |
+
|
| 937 |
+
def load_everything(self, data_directory = 'data') :
    # Full initialisation pipeline: load raw data and text-processing
    # resources, smoke-test the analyzer on a tricky string, load the
    # medium dataset, then build reference lists, corpora and TF-IDF indexes.
    self.load_basic_data(data_directory=data_directory)
    self.load_text_processing()
    # analyzer smoke test (punctuation, escape characters, Latin "C" inside
    # a Cyrillic word); the analyzed result is printed for visual inspection
    s = '|()><.,!?:;=*-/\\8. Форма \n \r Cчета-фактуры и порядок его заполнения, формы и порядок ведения журнала учета полученных и выставленных счетов-фактур, книг покупок и книг продаж устанавливаются Правительством Российской Федерации.'
    print(self.analyze(s))
    self.load_medium_dataset(path=os.path.join(data_directory, 'search_data', 'medium_dataset.json'))
    self.create_filtered_refs()
    self.create_corpora()
    # corpus size printed as a quick load sanity check
    print(len(self.pmfcorpus))
    self.create_TFIDF()
|
| 947 |
+
|
| 948 |
+
def test_everything(self) :
    # Run all three evaluation modes (fixed top, absolute threshold,
    # relative threshold) and print their metrics.
    self.test_TFIDF_top(top = 40)
    self.test_TFIDF_value(value = .2)
    self.test_TFIDF_ratio(ratio = .9)
|
| 952 |
+
|
| 953 |
+
def search(self, query, top = 10) :
    """Rank the indexed documents against a free-text query.

    query : raw query string, normalised with self.analyze() first
    top   : number of results to return

    Returns (titles, docs, scores): three parallel lists of length <= top,
    best match first.
    """
    analyzed_query = self.analyze(query)

    # main TF-IDF vector; the main transformer uses norm=None, so L2-normalise
    # by hand exactly as create_TFIDF does for the corpus rows
    query_TF = self.vectorizer.transform([analyzed_query])
    query_TFIDF = self.transformer.transform(query_TF)
    n = np.sqrt(query_TFIDF.multiply(query_TFIDF).sum(axis = 1))
    # BUGFIX: a query with no in-vocabulary terms has norm 0; dividing by it
    # produced NaN scores -- treat such a vector as already normalised
    n[n == 0] = 1
    query_TFIDF = query_TFIDF.multiply(sparse.csr_matrix(1 / n))

    # secondary vector in the analyzed space (atransformer L2-normalises itself)
    query_ATF = self.avectorizer.transform([analyzed_query])
    query_ATFIDF = self.atransformer.transform(query_ATF)

    df = pd.DataFrame()
    df[0] = self.getScores(query_TFIDF[0], query_ATFIDF[0])
    df[1] = self.pmfrefs

    df.sort_values(0, ascending = False, inplace = True)

    # BUGFIX: titles and docs previously covered the WHOLE corpus while
    # scores were cut to `top`; now all three lists are truncated
    # consistently, and document bodies are looked up only for the
    # results actually returned
    head = df.head(top)
    titles = head[1].tolist()
    docs = [self.dataset_medium[title] for title in titles]
    scores = head[0].tolist()

    return titles, docs, scores
|
| 1019 |
+
|
| 1020 |
+
# bsearch = BasicSearch('taxcode')
|
| 1021 |
+
# bsearch = BasicSearch('minfin-letters')
|
| 1022 |
+
# bsearch = BasicSearch('fns-letters')
|
| 1023 |
+
# bsearch = BasicSearch('other-laws')
|
| 1024 |
+
# bsearch = BasicSearch('consultations')
|
| 1025 |
+
# bsearch = BasicSearch('all-docs')
|
| 1026 |
+
|
| 1027 |
+
# bsearch.test_TFIDF_top(40)
|
| 1028 |
+
|
| 1029 |
+
# top = 10
|
| 1030 |
+
# query = 'Форма счета-фактуры и порядок его заполнения'
|
| 1031 |
+
# titles, docs, scores = bsearch.search(query, top = top)
|
| 1032 |
+
|
| 1033 |
+
# for i in range(top) :
|
| 1034 |
+
# print()
|
| 1035 |
+
# # print(len(scores), len(titles))
|
| 1036 |
+
# print(i, scores[i])
|
| 1037 |
+
# print(titles[i], ':\n')
|
| 1038 |
+
# print(docs[i][:1000], '...')
|
Dockerfile
CHANGED
|
@@ -15,7 +15,8 @@ RUN apt-get update && apt-get install --no-install-recommends -y git && \
|
|
| 15 |
RUN pip install -U "huggingface_hub[cli]"
|
| 16 |
|
| 17 |
RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true \
|
| 18 |
-
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
RUN pip install -r requirements.txt
|
|
|
|
| 15 |
RUN pip install -U "huggingface_hub[cli]"
|
| 16 |
|
| 17 |
RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=true \
|
| 18 |
+
hf secrets login $(cat /run/secrets/HF_TOKEN) && \
|
| 19 |
+
hf repo clone myrushev/nn-legal-search-data /var/www/data
|
| 20 |
|
| 21 |
|
| 22 |
RUN pip install -r requirements.txt
|
app.py
CHANGED
|
@@ -1,16 +1,12 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
from flask import Flask, jsonify, request
|
| 4 |
-
from
|
| 5 |
-
from BasicSearchV5 import BasicSearch as BasicSearchV5
|
| 6 |
|
| 7 |
-
DATA_PATH = os.environ.get("DATA_PATH", "")
|
| 8 |
-
DEFAULT_SEARCH_VERSION = os.environ.get("DEFAULT_SEARCH_VERSION",
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
search_v5 = BasicSearchV5(doctype='all-docs', data_directory=DATA_PATH)
|
| 13 |
-
search_v5.test_everything()
|
| 14 |
|
| 15 |
|
| 16 |
app = Flask(__name__)
|
|
@@ -25,14 +21,11 @@ def search_route():
|
|
| 25 |
data = request.get_json()
|
| 26 |
query = data.get('query', '')
|
| 27 |
top = data.get('top', 10)
|
| 28 |
-
version = data.get('version', DEFAULT_SEARCH_VERSION)
|
| 29 |
-
|
| 30 |
-
titles, docs, scores = search_v6.search(query, top)
|
| 31 |
-
else:
|
| 32 |
-
titles, docs, scores = search_v5.search(query, top)
|
| 33 |
result = [{'title': str(item1), 'text': str(item2), 'relevance': str(item3)} for item1, item2, item3 in zip(titles, docs, scores)]
|
| 34 |
return jsonify(result)
|
| 35 |
|
| 36 |
if __name__ == '__main__':
|
| 37 |
|
| 38 |
-
app.run(debug=False, host='0.0.0.0'
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
from flask import Flask, jsonify, request
|
| 4 |
+
from BasicSearchV7 import BasicSearch as BasicSearchV7
|
|
|
|
| 5 |
|
| 6 |
+
DATA_PATH = os.environ.get("DATA_PATH", "./data")
|
| 7 |
+
# DEFAULT_SEARCH_VERSION = os.environ.get("DEFAULT_SEARCH_VERSION", 7)
|
| 8 |
+
search_v7 = BasicSearchV7(doctype='all-docs', data_directory="./data")
|
| 9 |
+
search_v7.test_everything()
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
app = Flask(__name__)
|
|
|
|
| 21 |
data = request.get_json()
|
| 22 |
query = data.get('query', '')
|
| 23 |
top = data.get('top', 10)
|
| 24 |
+
# version = data.get('version', DEFAULT_SEARCH_VERSION)
|
| 25 |
+
titles, docs, scores = search_v7.search(query, top)
|
|
|
|
|
|
|
|
|
|
| 26 |
result = [{'title': str(item1), 'text': str(item2), 'relevance': str(item3)} for item1, item2, item3 in zip(titles, docs, scores)]
|
| 27 |
return jsonify(result)
|
| 28 |
|
| 29 |
if __name__ == '__main__':
|
| 30 |
|
| 31 |
+
app.run(debug=False, host='0.0.0.0')
|