Commit ·
85ac9d4
1
Parent(s): 61f200c
model
Browse files- app.py +25 -0
- models/.DS_Store +0 -0
- models/tags_encoder.pkl +3 -0
- models/tags_model.pkl +3 -0
- tags.py +17 -0
- utils.py +29 -0
app.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from flask import Flask, jsonify, request
|
| 2 |
+
import pickle
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
from utils import predict
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# Flask application object; the route decorators below register handlers on it.
app = Flask(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@app.route('/')
def index():
    """Return a one-line plain-text description of the prediction endpoint."""
    usage = "/model/text – predicts tag for string sample"
    return usage
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# Unpickled artifacts keyed by file path, so each file is read from disk at
# most once per process instead of on every request.
_PICKLE_CACHE = {}


def _load_pickle(path):
    """Return the object pickled at *path*, loading it on first use only."""
    if path not in _PICKLE_CACHE:
        # NOTE(security): unpickling executes arbitrary code; these files must
        # come from a trusted source.
        with open(path, 'rb') as fh:
            _PICKLE_CACHE[path] = pickle.load(fh)
    return _PICKLE_CACHE[path]


@app.route('/model/text', methods=['POST'])
def parse_string():
    """Predict a tag for the text sent as the raw POST body.

    The request body is passed to ``predict`` as bytes together with the
    model and label encoder loaded from ``models/``. The artifacts are cached
    after the first request, so replacing the files on disk requires a
    process restart to take effect.

    Returns:
        Whatever ``predict`` returns for a single sample (the decoded label).
    """
    sample = request.get_data()
    model = _load_pickle('models/tags_model.pkl')
    labelencoder = _load_pickle('models/tags_encoder.pkl')
    return predict(sample, model, labelencoder)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
if __name__ == '__main__':
    import os

    # Debug mode enables the Werkzeug interactive debugger, which lets anyone
    # who can reach the server execute arbitrary code. Keep it opt-in: set
    # FLASK_DEBUG=1 (or "true") for local development.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true')
    app.run(debug=debug_enabled)
|
models/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
models/tags_encoder.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e3d16ff00b7f5cf88af997b48086fe4599ac4b22a841d2e468f4a95b6249cc65
|
| 3 |
+
size 1925
|
models/tags_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3848747f6fc040cc4c3cad14cb0e9d73ea64805e86b9affd04949265819a75a2
|
| 3 |
+
size 23629883
|
tags.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pickle
|
| 2 |
+
import numpy as np
|
| 3 |
+
from utils import predict
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
if __name__ == '__main__':
    # Batch script: load a pickled dataset plus the trained model and label
    # encoder, predict a tag for every row, and print the first row's text.
    # NOTE(security): pickle.load executes arbitrary code; only use trusted files.
    with open('data/tags_data.pkl', 'rb') as fh:
        data = pickle.load(fh)
    with open('models/tags_model.pkl', 'rb') as fh:
        model = pickle.load(fh)
    with open('models/tags_encoder.pkl', 'rb') as fh:
        labelencoder = pickle.load(fh)

    # `data` is indexed with .iloc below, so it is expected to be a pandas
    # DataFrame with an 'orig_text' column.
    sample = data['orig_text']

    # preproc=False: the text is handed to the model without lemmatization.
    # NOTE(review): presumably 'orig_text' is already preprocessed, or the
    # raw text is intended here -- confirm.
    data['prediction'] = predict(sample,
                                 model,
                                 labelencoder,
                                 preproc=False)

    print(data.iloc[0]["orig_text"])
|
utils.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from pandarallel import pandarallel
from pymystem3 import Mystem

# Import-time setup (runs once, when this module is first imported):
# fetch the NLTK stop-word corpus used by preproc_text ...
nltk.download('stopwords')
# ... and initialise pandarallel so Series.parallel_apply is available.
pandarallel.initialize(progress_bar=True)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def preproc_text(x):
    """Lemmatize a text and drop stop words.

    The text is lemmatized with Mystem, split into ``\\w+`` tokens, and
    filtered against NLTK's Russian stop-word list extended with a few
    project-specific words.

    Args:
        x: The text to normalise.

    Returns:
        The remaining lemmas joined by single spaces.
    """
    # A set gives O(1) membership tests in the filter below; the original
    # list was scanned once per token.
    stop_words = set(stopwords.words('russian'))
    stop_words.update([' ', ' \n', ' ', 'также', 'который', 'весь', 'заявлять', 'сообщать', 'риа'])

    # NOTE(review): a new Mystem wrapper is created on every call; if this
    # becomes a bottleneck, consider reusing one instance per process.
    lemmas = Mystem().lemmatize(x)
    tokens = RegexpTokenizer(r'\w+').tokenize(''.join(lemmas))
    return ' '.join(word for word in tokens if word not in stop_words)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def predict(sample, model, label_encoder, preproc=True):
    """Predict decoded label(s) for one text or a collection of texts.

    Args:
        sample: A single text (``str`` or ``bytes``), a ``pandas.Series`` of
            texts, or another iterable of texts.
        model: Object exposing ``predict(texts)`` that returns encoded labels.
        label_encoder: Object exposing ``inverse_transform(encoded)`` that
            maps encoded labels back to their original values.
        preproc: When true, run ``preproc_text`` on every text before
            predicting; when false, pass the text(s) to the model unchanged.

    Returns:
        A single decoded label for ``str``/``bytes`` input, otherwise the
        sequence returned by ``label_encoder.inverse_transform``.
    """
    # Single sample: wrap it in a list for the model and unwrap the result.
    if isinstance(sample, (bytes, str)):
        text = preproc_text(sample) if preproc else sample
        return label_encoder.inverse_transform(model.predict([text]))[0]

    if preproc:
        if isinstance(sample, pd.Series):
            # Series are preprocessed in parallel via pandarallel.
            sample = sample.parallel_apply(preproc_text)
        else:
            sample = [preproc_text(text) for text in sample]

    # Previously this path was missing: preproc=False (and non-Series
    # collections) fell through and returned None.
    return label_encoder.inverse_transform(model.predict(sample))
|