jellyhater committed on
Commit
85ac9d4
·
1 Parent(s): 61f200c
Files changed (6) hide show
  1. app.py +25 -0
  2. models/.DS_Store +0 -0
  3. models/tags_encoder.pkl +3 -0
  4. models/tags_model.pkl +3 -0
  5. tags.py +17 -0
  6. utils.py +29 -0
app.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, jsonify, request
2
+ import pickle
3
+ import pandas as pd
4
+
5
+ from utils import predict
6
+
7
+
8
+ app = Flask(__name__)
9
+
10
+
11
@app.route('/')
def index():
    """Serve the root URL with a one-line description of the API."""
    usage = "/model/text – predicts tag for string sample"
    return usage
14
+
15
+
16
@app.route('/model/text', methods=['POST'])
def parse_string():
    """Predict a tag for the body of a POST request to ``/model/text``.

    The raw request body is handed to ``predict`` together with the model
    and label encoder unpickled from the ``models/`` directory.

    Returns:
        Whatever ``predict`` returns for a single sample.
    """
    sample = request.get_data()
    # Context managers guarantee the file handles are closed after loading;
    # the bare ``open()`` calls used previously leaked one handle per file on
    # every request.
    # NOTE(review): both pickles are re-read on every request; consider
    # loading them once at start-up if request latency matters.
    # NOTE: pickle can execute arbitrary code while loading, so these files
    # must come from a trusted source.
    with open('models/tags_model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    with open('models/tags_encoder.pkl', 'rb') as encoder_file:
        labelencoder = pickle.load(encoder_file)
    return predict(sample, model, labelencoder)
22
+
23
+
24
if __name__ == '__main__':
    # Run as a script: start the Flask application with debug mode enabled.
    run_options = {'debug': True}
    app.run(**run_options)
models/.DS_Store ADDED
Binary file (6.15 kB). View file
 
models/tags_encoder.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3d16ff00b7f5cf88af997b48086fe4599ac4b22a841d2e468f4a95b6249cc65
3
+ size 1925
models/tags_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3848747f6fc040cc4c3cad14cb0e9d73ea64805e86b9affd04949265819a75a2
3
+ size 23629883
tags.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import numpy as np
3
+ from utils import predict
4
+
5
+
6
if __name__ == '__main__':
    # Batch script: load the pickled data set, model and label encoder,
    # store the predictions for the 'orig_text' column in a new 'prediction'
    # column, then print the first row's original text.
    # NOTE: pickle can execute arbitrary code while loading, so these files
    # must come from a trusted source.
    # Context managers close each file as soon as it has been read.
    with open('data/tags_data.pkl', 'rb') as data_file:
        data = pickle.load(data_file)
    sample = data['orig_text']
    with open('models/tags_model.pkl', 'rb') as model_file:
        model = pickle.load(model_file)
    with open('models/tags_encoder.pkl', 'rb') as encoder_file:
        labelencoder = pickle.load(encoder_file)

    # NOTE(review): preproc=False is passed although the column is named
    # 'orig_text' -- confirm the texts are already preprocessed.
    data['prediction'] = predict(sample,
                                 model,
                                 labelencoder,
                                 preproc=False)

    # Prints the first row's input text (not its prediction).
    print(data.iloc[0]["orig_text"])
utils.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ from nltk.corpus import stopwords
3
+ from nltk.tokenize import RegexpTokenizer
4
+ nltk.download('stopwords')
5
+ from pymystem3 import Mystem
6
+ import pandas as pd
7
+ from pandarallel import pandarallel
8
+ pandarallel.initialize(progress_bar=True)
9
+
10
+
11
def preproc_text(x):
    """Lemmatize a text and drop stop words.

    The text is lemmatized with Mystem, the lemmas are concatenated, the
    result is split into runs of word characters, and every token found in
    the stop list (NLTK's Russian stop words plus a few extra entries) is
    removed.

    Args:
        x: The text to normalise.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # NOTE(review): a new Mystem object is created on every call; consider
    # reusing one if this turns out to be a bottleneck.
    mystem = Mystem()
    # A set makes each membership test below O(1) instead of a linear scan
    # over the whole stop list.
    stop_words = set(stopwords.words('russian'))
    stop_words.update([' ', ' \n', ' ', 'также', 'который', 'весь', 'заявлять', 'сообщать', 'риа'])
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatized = ''.join(mystem.lemmatize(x))
    tokens = tokenizer.tokenize(lemmatized)
    return ' '.join(word for word in tokens if word not in stop_words)
19
+
20
+
21
def predict(sample, model, label_encoder, preproc=True):
    """Predict tag label(s) for one text or for a pandas Series of texts.

    Args:
        sample: A single text (``str`` or ``bytes``) or a ``pd.Series`` of
            texts.
        model: Object whose ``predict`` method accepts a sequence of texts.
        label_encoder: Object whose ``inverse_transform`` maps the model
            output back to labels.
        preproc: When true, ``preproc_text`` is applied to the input before
            prediction; when false the input is passed to the model as is.

    Returns:
        For a single text, the first element of the decoded predictions.
        For a Series, the full result of ``label_encoder.inverse_transform``.

    Raises:
        TypeError: If ``sample`` is neither ``str``/``bytes`` nor a
            ``pd.Series`` (previously such input silently produced ``None``).
    """
    if isinstance(sample, (bytes, str)):
        # The model expects a sequence, so wrap the single text in a list
        # and unwrap the single prediction afterwards.
        texts = [preproc_text(sample) if preproc else sample]
        return label_encoder.inverse_transform(model.predict(texts))[0]

    if isinstance(sample, pd.Series):
        if preproc:
            # ``parallel_apply`` is provided by pandarallel, which the
            # module initialises at import time.
            sample = sample.parallel_apply(preproc_text)
        return label_encoder.inverse_transform(model.predict(sample))

    raise TypeError(
        "sample must be str, bytes or pandas.Series, got "
        + type(sample).__name__
    )