Spaces:
Runtime error
Runtime error
lyangas committed on
Commit ·
da7535b
1
Parent(s): afc3da6
add voting
Browse files- Dockerfile +2 -0
- app.py +71 -18
- classifiers/.DS_Store +0 -0
- classifiers/codes/.DS_Store +0 -0
- model_finetuned_clear.pkl → classifiers/codes/mlp_codes.pkl +2 -2
- classifiers/codes/svc_codes.pkl +3 -0
- classifiers/codes/xgboost_codes.pkl +3 -0
- classifiers/groups/.DS_Store +0 -0
- classifiers/groups/mlp_groups.pkl +3 -0
- classifiers/groups/svc_groups.pkl +3 -0
- classifiers/groups/xgboost_groups.pkl +3 -0
- embedder/embedder.pkl +3 -0
- required_classes.py +1 -1
- requirements.txt +2 -1
Dockerfile
CHANGED
|
@@ -6,6 +6,8 @@ COPY ./requirements.txt /code/requirements.txt
|
|
| 6 |
|
| 7 |
RUN pip install --upgrade -r /code/requirements.txt
|
| 8 |
|
|
|
|
|
|
|
| 9 |
COPY ./model_finetuned_clear.pkl ./model_finetuned_clear.pkl
|
| 10 |
COPY ./required_classes.py ./required_classes.py
|
| 11 |
COPY ./app.py ./app.py
|
|
|
|
| 6 |
|
| 7 |
RUN pip install --upgrade -r /code/requirements.txt
|
| 8 |
|
| 9 |
+
# Ship the pickled embedder and every classifier directory into the image.
COPY ./embedder ./embedder
COPY ./classifiers ./classifiers
# NOTE: the former top-level model_finetuned_clear.pkl was renamed to
# classifiers/codes/mlp_codes.pkl, which the COPY above already includes.
# Copying the removed path would make the image build fail, so it is dropped.
COPY ./required_classes.py ./required_classes.py
COPY ./app.py ./app.py
|
app.py
CHANGED
|
@@ -4,33 +4,86 @@ from flask import Flask, request
|
|
| 4 |
import json
|
| 5 |
import pickle
|
| 6 |
import numpy as np
|
|
|
|
| 7 |
|
| 8 |
from required_classes import BertEmbedder, PredictModel
|
| 9 |
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
try:
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
| 17 |
except Exception as e:
|
| 18 |
-
print(f"ERROR: loading
|
|
|
|
| 19 |
|
| 20 |
def classify_code(text, top_n):
|
| 21 |
-
embed =
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
return preds
|
| 26 |
|
|
|
|
| 27 |
def classify_group(text, top_n):
|
| 28 |
-
embed =
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
return preds
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
app = Flask(__name__)
|
| 36 |
|
|
@@ -49,8 +102,6 @@ def read_root():
|
|
| 49 |
base64_bytes = str(data['textB64']).encode("ascii")
|
| 50 |
sample_string_bytes = base64.b64decode(base64_bytes)
|
| 51 |
text = sample_string_bytes.decode("ascii")
|
| 52 |
-
print(text)
|
| 53 |
-
# text = str(data['text'])
|
| 54 |
top_n = int(data['top_n'])
|
| 55 |
|
| 56 |
if top_n < 1:
|
|
@@ -60,11 +111,13 @@ def read_root():
|
|
| 60 |
|
| 61 |
pred_codes = classify_code(text, top_n)
|
| 62 |
pred_groups = classify_group(text, top_n)
|
|
|
|
|
|
|
| 63 |
result = {
|
| 64 |
"icd10":
|
| 65 |
-
{'result':
|
| 66 |
"dx_group":
|
| 67 |
-
{'result':
|
| 68 |
}
|
| 69 |
return result
|
| 70 |
|
|
|
|
| 4 |
import json
|
| 5 |
import pickle
|
| 6 |
import numpy as np
|
| 7 |
+
import os
|
| 8 |
|
| 9 |
from required_classes import BertEmbedder, PredictModel
|
| 10 |
|
| 11 |
|
| 12 |
+
CLS_WEIGHTS = {'mlp': 0.3, 'svc': 0.4, 'xgboost': 0.3}
|
| 13 |
+
|
| 14 |
+
# Startup: unpickle the text embedder once so request handlers can reuse it.
# NOTE(review): pickle.load executes code from the file; only ship trusted
# artifacts in embedder/.
print('INFO: loading models')
try:
    with open('embedder/embedder.pkl', 'rb') as f:
        embedder = pickle.load(f)
except Exception as err:
    # Failure is only reported; `embedder` stays undefined and later calls
    # that use it will fail.
    print(f"ERROR: loading embedder failed with: {str(err)}")
else:
    print('INFO: embedder loaded')
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Registry of code-level classifiers, keyed by file stem (e.g. "svc_codes").
classifiers_codes = {}
try:
    for clf_name in os.listdir('classifiers/codes'):
        # Hidden entries such as .DS_Store are not pickled models.
        if not clf_name.startswith('.'):
            with open(f'classifiers/codes/{clf_name}', 'rb') as f:
                model = pickle.load(f)
            classifiers_codes[clf_name.partition('.')[0]] = model
            print(f'INFO: classifier {clf_name} loaded')
except Exception as err:
    # The first failure stops the loop; classifiers loaded so far are kept.
    print(f"ERROR: loading classifiers failed with: {str(err)}")
|
| 34 |
+
|
| 35 |
+
# Registry of group-level classifiers, keyed by file stem (e.g. "svc_groups").
classifiers_groups = {}
try:
    for clf_name in os.listdir('classifiers/groups'):
        # Hidden entries such as .DS_Store are not pickled models.
        if not clf_name.startswith('.'):
            with open(f'classifiers/groups/{clf_name}', 'rb') as f:
                model = pickle.load(f)
            classifiers_groups[clf_name.partition('.')[0]] = model
            print(f'INFO: classifier {clf_name} loaded')
except Exception as err:
    # The first failure stops the loop; classifiers loaded so far are kept.
    print(f"ERROR: loading classifiers failed with: {str(err)}")
|
| 46 |
+
|
| 47 |
|
| 48 |
def classify_code(text, top_n):
    """Return each code classifier's ``top_n`` most probable classes for ``text``.

    The text is embedded once with the module-level ``embedder`` and the
    embedding is passed to every model in ``classifiers_codes``.

    Returns:
        ``{classifier_name: {class_label: probability}}`` where each inner
        dict is ordered from most to least probable.
    """
    features = [embedder(text)]
    results = {}
    for name, clf in classifiers_codes.items():
        probabilities = clf.predict_proba(features)
        # argsort is ascending: keep the last top_n indices, then reverse them.
        ranked = np.argsort(probabilities, axis=1)[0][-top_n:][::-1]
        results[name] = {
            str(clf.classes_[idx]): float(probabilities[0][idx])
            for idx in ranked
        }
    return results
|
| 58 |
|
| 59 |
+
|
| 60 |
def classify_group(text, top_n):
    """Return each group classifier's ``top_n`` most probable classes for ``text``.

    The text is embedded once with the module-level ``embedder`` and the
    embedding is passed to every model in ``classifiers_groups``.

    Returns:
        ``{classifier_name: {class_label: probability}}`` where each inner
        dict is ordered from most to least probable.
    """
    features = [embedder(text)]
    results = {}
    for name, clf in classifiers_groups.items():
        probabilities = clf.predict_proba(features)
        # argsort is ascending: keep the last top_n indices, then reverse them.
        ranked = np.argsort(probabilities, axis=1)[0][-top_n:][::-1]
        results[name] = {
            str(clf.classes_[idx]): float(probabilities[0][idx])
            for idx in ranked
        }
    return results
|
| 70 |
|
| 71 |
+
def get_top_result(preds, weights=None, threshold=0.5):
    """Pick the single best class by weighted voting across classifiers.

    Every classifier adds ``weight * probability`` to each class it reported.
    The contributions are summed per class, and the class with the highest
    total wins if that total is strictly greater than ``threshold``.

    Args:
        preds: Mapping ``{classifier_name: {class_name: probability}}``.
            Any ``_codes`` / ``_groups`` part of a classifier name is removed
            before the weight lookup.
        weights: Mapping ``{classifier_name: weight}``. Defaults to the
            module-level ``CLS_WEIGHTS``.
        threshold: Minimum total score (exclusive) the winner must exceed.

    Returns:
        The winning class name, or ``None`` when ``preds`` yields no scores
        or the best total does not exceed ``threshold``.

    Raises:
        KeyError: If a classifier name has no entry in ``weights``.
    """
    if weights is None:
        weights = CLS_WEIGHTS

    total_scores = {}
    for clf_name, scores in preds.items():
        # Keys look like "svc_codes" / "svc_groups"; weights are keyed by "svc".
        weight = weights[clf_name.replace('_codes', '').replace('_groups', '')]
        for class_name, score in scores.items():
            total_scores[class_name] = total_scores.get(class_name, 0.0) + weight * score

    # Nothing to vote on (no classifiers loaded or no predictions).
    if not total_scores:
        return None

    # Take the maximum over the dict directly: np.array(dict.values()) wraps
    # the view in a 0-d object array, so its argmax() is always 0.
    best_class = max(total_scores, key=total_scores.get)
    return best_class if total_scores[best_class] > threshold else None
|
| 86 |
+
|
| 87 |
|
| 88 |
app = Flask(__name__)
|
| 89 |
|
|
|
|
| 102 |
base64_bytes = str(data['textB64']).encode("ascii")
|
| 103 |
sample_string_bytes = base64.b64decode(base64_bytes)
|
| 104 |
text = sample_string_bytes.decode("ascii")
|
|
|
|
|
|
|
| 105 |
top_n = int(data['top_n'])
|
| 106 |
|
| 107 |
if top_n < 1:
|
|
|
|
| 111 |
|
| 112 |
pred_codes = classify_code(text, top_n)
|
| 113 |
pred_groups = classify_group(text, top_n)
|
| 114 |
+
pred_codes_top = get_top_result(pred_codes)
|
| 115 |
+
pred_groups_top = get_top_result(pred_groups)
|
| 116 |
result = {
|
| 117 |
"icd10":
|
| 118 |
+
{'result': pred_codes_top, 'details': pred_codes},
|
| 119 |
"dx_group":
|
| 120 |
+
{'result': pred_groups_top, 'details': pred_groups}
|
| 121 |
}
|
| 122 |
return result
|
| 123 |
|
classifiers/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
classifiers/codes/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
model_finetuned_clear.pkl → classifiers/codes/mlp_codes.pkl
RENAMED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:39eda1dc2f583525dd689e048b396f476835c42ed2a8fead4884b6c87bad639d
|
| 3 |
+
size 5185392
|
classifiers/codes/svc_codes.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:669ddbb9dea179f38dcc5e94282074240eac2cb361e839416a4fc74ea718050d
|
| 3 |
+
size 865684575
|
classifiers/codes/xgboost_codes.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:49a24f3740770aca1f546ccf44e2d2346ba18105920691255b587026cba962d7
|
| 3 |
+
size 17972293
|
classifiers/groups/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
classifiers/groups/mlp_groups.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f494f298b214d3239be0a53ae986b6eadf6822ccb77fcfdf8149b1aad80c87f2
|
| 3 |
+
size 4924655
|
classifiers/groups/svc_groups.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:85d1f0874ebdc3986d412e6d8508d979a324c278d70b29dc4491477b209e69fa
|
| 3 |
+
size 574657606
|
classifiers/groups/xgboost_groups.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b3f1d2fd1f46c18e3bf6b0b67ae8a6c6b52c8b4e06961eff4568cebee7f68989
|
| 3 |
+
size 8361773
|
embedder/embedder.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b27f378ed88ce2ae9e01e9fd4e5706fc2072957b665c4d12e02a84bbe4f4c4d8
|
| 3 |
+
size 434228972
|
required_classes.py
CHANGED
|
@@ -25,7 +25,7 @@ class BertEmbedder:
|
|
| 25 |
truncation=True).to(self.device)
|
| 26 |
model_output = self.embedder(**encoded_input)
|
| 27 |
text_embed = model_output.pooler_output[0].cpu()
|
| 28 |
-
return text_embed
|
| 29 |
|
| 30 |
def batch_predict(self, texts: List[str]):
|
| 31 |
encoded_input = self.tokenizer(texts,
|
|
|
|
| 25 |
truncation=True).to(self.device)
|
| 26 |
model_output = self.embedder(**encoded_input)
|
| 27 |
text_embed = model_output.pooler_output[0].cpu()
|
| 28 |
+
return text_embed.tolist()
|
| 29 |
|
| 30 |
def batch_predict(self, texts: List[str]):
|
| 31 |
encoded_input = self.tokenizer(texts,
|
requirements.txt
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
numpy==1.22.4
|
| 2 |
torch==2.0.1
|
| 3 |
-
scikit-learn==1.
|
| 4 |
transformers==4.29.2
|
| 5 |
flask==2.0.3
|
|
|
|
|
|
| 1 |
numpy==1.22.4
|
| 2 |
torch==2.0.1
|
| 3 |
+
scikit-learn==1.3.0
|
| 4 |
transformers==4.29.2
|
| 5 |
flask==2.0.3
|
| 6 |
+
xgboost==1.7.6
|