| | import streamlit as st |
| | import torch |
| | from transformers import AutoTokenizer, AutoModel, pipeline |
| | from torch import nn |
| |
|
# Page heading, rendered as a level-3 markdown header.
st.markdown("### Articles classificator.")
| |
|
@st.cache(allow_output_mutation=True)
def get_tokenizer():
    """Return the tokenizer for the 'microsoft/deberta-v3-small' checkpoint.

    The function is wrapped in ``st.cache`` with ``allow_output_mutation=True``
    so the returned object is reused without Streamlit checking it for
    mutation.
    """
    return AutoTokenizer.from_pretrained('microsoft/deberta-v3-small')
| |
|
# Module-level tokenizer used by process() to encode the user's text.
tokenizer = get_tokenizer()
| |
|
class devops_model(nn.Module):
    """Encoder plus a fully connected head producing 5 log-probabilities.

    ``berta`` is initialised to ``None`` and must hold a callable encoder
    before ``forward`` is used -- presumably it is restored when the complete
    pickled model is loaded; TODO confirm against the training code.
    The class name and the layer order inside ``fc`` are kept as-is because a
    serialized model may reference them.
    """

    def __init__(self):
        super().__init__()
        # Placeholder for the encoder; forward() calls it with the batch.
        self.berta = None
        head_layers = [
            nn.Linear(768, 768),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.BatchNorm1d(768),
            nn.Linear(768, 5),
            nn.LogSoftmax(dim=-1),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, train_batch):
        """Return log-probabilities for a batch.

        ``train_batch`` is unpacked as keyword arguments into the encoder.
        The encoder's ``'last_hidden_state'`` is averaged over axis 1 and the
        result is passed through the classification head.
        """
        encoder_output = self.berta(**train_batch)
        token_states = encoder_output['last_hidden_state']
        pooled = token_states.mean(axis=1)
        return self.fc(pooled)
| |
|
@st.cache(allow_output_mutation=True)
def LoadModel():
    """Load the fully serialized model from 'model_full.pt' onto the CPU.

    ``allow_output_mutation=True`` matches ``get_tokenizer``: the returned
    module is mutated by callers (``process`` calls ``model.eval()``), and
    without this flag Streamlit hashes the whole model on every rerun and
    warns about the mutation of a cached object.

    Returns:
        The object stored in 'model_full.pt', mapped to the CPU device.
    """
    # SECURITY NOTE: torch.load unpickles the file, which can execute
    # arbitrary code -- only ship a 'model_full.pt' from a trusted source.
    # NOTE(review): recent PyTorch releases default to weights_only=True,
    # which rejects full-module pickles -- confirm the pinned torch version.
    return torch.load('model_full.pt', map_location=torch.device('cpu'))
| |
|
# Classifier instance used by process(); LoadModel is wrapped in st.cache.
model = LoadModel()

# Display labels; process() indexes this list with the model's output index.
classes = [
    'Computer Science',
    'Mathematics',
    'Physics',
    'Quantitative Biology',
    'Statistics',
]
| |
|
def process(title, summary, threshold=0.95):
    """Classify an article and list the most probable classes.

    ``title`` and ``summary`` are concatenated (with no separator), tokenized
    and passed through the module-level ``model``. Classes are then listed in
    order of decreasing probability until their cumulative probability
    reaches ``threshold``.

    Args:
        title: Article title text.
        summary: Article summary text.
        threshold: Cumulative probability at which listing stops
            (default 0.95, the previously hard-coded value).

    Returns:
        A list of ``'<class name>: <probability>'`` strings, with the
        probability formatted to three decimals. The list is empty when the
        combined text is blank.
    """
    text = title + summary
    if not text.strip():
        # Always return a list so callers get one consistent type.
        return []

    model.eval()
    encoded = tokenizer([text], padding=True, truncation=True, return_tensors="pt")
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        log_probs = model(encoded)[0]

    # The model emits log-probabilities; exponentiate to get probabilities.
    probs = torch.exp(log_probs)
    ranked = torch.argsort(probs, descending=True).tolist()
    probs = probs.tolist()

    res = []
    cumulative = 0.0
    # Iterating over `ranked` bounds the loop by the number of model outputs,
    # so it cannot index past the end if the sum never reaches the threshold.
    for class_idx in ranked:
        if cumulative >= threshold:
            break
        prob = probs[class_idx]
        res.append(f'{classes[class_idx]}: {prob:.3f}')
        cumulative += prob
    return res
| | |
# Free-text inputs for the article to classify.
title = st.text_area("Title", height=30)
summary = st.text_area("Summary", height=180)

# Render each result string returned by process() as its own markdown element.
for prediction_line in process(title, summary):
    st.markdown(prediction_line)