# NOTE(review): The three lines above the code in the original capture were
# Hugging Face Spaces page chrome ("Spaces: Runtime error") — a scrape
# artifact, not part of the program. The Space was in a failed state when
# this file was captured.
import os

import numpy as np
import pandas as pd
import sentencepiece  # backend required by the XLM-R (sentencepiece) tokenizer
import streamlit as st
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer
| # [theme] | |
| # base="dark" | |
| # primaryColor="purple" | |
| # 제목 입력 | |
| st.header('한국표준산업분류 자동코딩 서비스') | |
# Cache the heavy model load so Streamlit does not re-run it on every widget
# interaction. The original comment ("재로드 안하도록" = "so it doesn't
# reload") stated this intent, but no cache decorator was actually applied,
# so the model was reloaded on every rerun of the script.
try:
    _cache = st.cache_resource  # Streamlit >= 1.18
except AttributeError:
    _cache = st.cache(allow_output_mutation=True)  # legacy Streamlit API


@_cache
def md_loading():
    """Load the tokenizer, fine-tuned classifier, and lookup tables (CPU only).

    Returns:
        tokenizer: XLM-RoBERTa tokenizer ('xlm-roberta-base').
        model: XLMRobertaForSequenceClassification with 493 labels, weights
            restored from 'en_ko_4mix_proto.bin' next to the app.
        label_tbl: numpy array mapping model class index -> industry code.
        loc_tbl: DataFrame mapping code -> item name (columns '코드', '항목명').
        device: torch.device('cpu').
    """
    device = torch.device("cpu")

    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
    model = XLMRobertaForSequenceClassification.from_pretrained(
        'xlm-roberta-base', num_labels=493)

    # Fine-tuned checkpoint stored alongside the app; force CPU mapping so a
    # GPU-saved checkpoint still loads on this CPU-only runtime.
    output_model_file = os.path.join('./', 'en_ko_4mix_proto.bin')
    model.load_state_dict(
        torch.load(output_model_file, map_location=torch.device('cpu')))
    model.to(device)

    # Lookup tables: class index -> code, and code -> human-readable name.
    label_tbl = np.load('./label_table.npy')
    loc_tbl = pd.read_csv('./kisc_table.csv', encoding='utf-8')

    print('ready')
    return tokenizer, model, label_tbl, loc_tbl, device
| # 모델 로드 | |
| tokenizer, model, label_tbl, loc_tbl, device = md_loading() | |
| # 데이터 셋 준비용 | |
| max_len = 64 # 64 | |
class TVT_Dataset(Dataset):
    """Wrap a preprocessed DataFrame so a DataLoader can feed the model.

    Each item is an (input_ids, attention_mask) pair for one row: the five
    text columns are joined with the XLM-R sentence separator and tokenized
    to a fixed length (module-level ``max_len``) using the module-level
    ``tokenizer``.
    """

    def __init__(self, df):
        self.df_data = df

    def __getitem__(self, index):
        # Pull the five text fields for this row (requires a 0..n-1 index,
        # which preprocess_dataset guarantees) and join into one sentence.
        fields = self.df_data.loc[
            index, ['CMPNY_NM', 'MAJ_ACT', 'WORK_TYPE', 'POSITION', 'DEPT_NM']]
        joined = ' <s> '.join(fields.to_list())

        enc = tokenizer(
            joined,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a batch dimension; index [0] drops it so
        # the DataLoader can re-batch the samples itself.
        return enc['input_ids'][0], enc['attention_mask'][0]

    def __len__(self):
        return len(self.df_data)
| # 텍스트 input 박스 | |
| business = st.text_input('사업체명') | |
| business_work = st.text_input('사업체 하는일') | |
| work_department = st.text_input('근무부서') | |
| work_position = st.text_input('직책') | |
| what_do_i = st.text_input('내가 하는 일') | |
| # business_work = '' | |
| # work_department = '' | |
| # work_position = '' | |
| # what_do_i = '' | |
# Data preparation: model input columns, in the order TVT_Dataset reads them.
input_col_type = ['CMPNY_NM', 'MAJ_ACT', 'WORK_TYPE', 'POSITION', 'DEPT_NM']


def preprocess_dataset(dataset):
    """Normalize an input DataFrame for inference.

    Resets the index to 0..n-1 (TVT_Dataset.__getitem__ uses positional
    .loc access), replaces missing values with empty strings, and keeps
    only the model input columns in the expected order.

    Args:
        dataset: DataFrame containing at least the input_col_type columns.

    Returns:
        A new DataFrame with exactly the input_col_type columns and no NaNs.
    """
    dataset = dataset.reset_index(drop=True)
    # BUG FIX: fillna() is not in-place — the original discarded its result,
    # so NaN floats reached ' <s> '.join(...) in the Dataset and raised a
    # TypeError at inference time.
    dataset = dataset.fillna('')
    return dataset[input_col_type]
# Run classification when the user clicks the button.
if st.button('확인'):
    # --- Build a one-row DataFrame from the five text inputs ---
    # Value order must line up with input_col_type:
    # CMPNY_NM, MAJ_ACT, WORK_TYPE, POSITION, DEPT_NM.
    md_input = [str(business), str(business_work), str(what_do_i),
                str(work_position), str(work_department)]
    test_dataset = pd.DataFrame(
        {col: [value] for col, value in zip(input_col_type, md_input)},
        index=[0])
    test_dataset.reset_index(inplace=True)
    test_dataset = preprocess_dataset(test_dataset)

    test_data = TVT_Dataset(test_dataset)
    # Single row -> one batch; shuffle must stay off for inference.
    test_dataloader = DataLoader(test_data, batch_size=48, shuffle=False)

    # --- Inference ---
    # Cleanups vs. original: removed model.zero_grad() (pointless under
    # no_grad), the unused predictions/true_labels lists, console prints,
    # and a debug st.write() of the total probability mass.
    model.eval()
    probs = None
    with torch.no_grad():
        for batch in test_dataloader:
            batch = tuple(t.to(device) for t in batch)
            test_input_ids, test_attention_mask = batch
            outputs = model(test_input_ids, token_type_ids=None,
                            attention_mask=test_attention_mask)
            # Softmax over the 493 classes -> probabilities.
            probs = torch.nn.Softmax(dim=1)(outputs.logits)

    # --- Top-k decoding: the k most probable codes ---
    k = 10
    # torch.topk is computed once (the original called it twice), and the
    # lookup comprehension no longer shadows k as the original did.
    topk = torch.topk(probs.flatten(), k)
    num_ans_topk = label_tbl[topk.indices]
    # Map each predicted code to its human-readable name.
    str_ans_topk_list = [
        loc_tbl['항목명'][loc_tbl['코드'] == code].iloc[0]
        for code in num_ans_topk
    ]
    percent_ans_topk_list = [p * 100 for p in topk.values.numpy()]

    # --- Render the ranked result table ---
    ans_topk_df = pd.DataFrame({
        'NO': range(1, k + 1),
        '세분류 코드': num_ans_topk,
        '세분류 명칭': str_ans_topk_list,
        '확률': percent_ans_topk_list,
    }).set_index('NO')
    st.write(ans_topk_df.style.bar(subset='확률', align='left', color='blue'))