# Spaces: Runtime error
| import os | |
| import random | |
| import json | |
| import numpy as np | |
| import torch | |
| import heapq | |
| import pandas as pd | |
| from tqdm import tqdm | |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer | |
| from torch.utils.data import TensorDataset, DataLoader | |
| import os | |
| import random | |
| import json | |
| import numpy as np | |
| import torch | |
| import heapq | |
| import pandas as pd | |
| from tqdm import tqdm | |
| from transformers import AutoModelForSequenceClassification, AutoTokenizer | |
| from torch.utils.data import TensorDataset, DataLoader | |
class Preprocess:
    """Clean a free-text sentence and tokenize it for the classifier.

    The workflow is: lower-case the text, drop stopwords, run the Hugging
    Face tokenizer, and wrap the resulting tensors in a ``TensorDataset``.
    """

    # Default stopwords removed by ``clean_text``. Duplicates from the original
    # list are kept so ``self.stopwords`` has exactly the same content.
    # NOTE(review): "lindam ama" contains a space, so it can never equal a
    # whitespace-split token and currently filters nothing — confirm intent.
    DEFAULT_STOPWORDS = (
        "i", "was", "transferred",
        "from", "to", "nilienda", "kituo",
        "cha", "lakini", "saa", "hii", "niko",
        "at", "nilienda", "nikahudumiwa", "pole",
        "deliver", "na", "ni", "baada", "ya",
        "kutumwa", "kutoka", "nilienda",
        "ndipo", "nikapewa", "hiyo", "lindam ama", "nikawa",
        "mgonjwa", "nikatibiwa", "in", "had", "a",
        "visit", "gynaecologist", "ndio",
        "karibu", "mimi", "niko", "sehemu", "hospitali",
        "serikali", "delivered", "katika", "kaunti", "kujifungua",
        "katika", "huko", "nilipoenda", "kwa", "bado", "naedelea",
        "sija", "maliza", "mwisho",
        "nilianza", "kliniki", "yangu",
        "nilianzia", "nilijifungua",
    )

    def __init__(self, tokenizer_vocab_path, tokenizer_max_len, auth_token=None):
        """Load the tokenizer and set up the stopword list.

        Args:
            tokenizer_vocab_path: Model id or path passed to
                ``AutoTokenizer.from_pretrained``.
            tokenizer_max_len: ``max_length`` used when tokenizing.
            auth_token: Optional Hugging Face access token. When omitted, the
                ``HF_TOKEN`` environment variable is used (``None`` if unset).
        """
        # Kept as a mutable list so existing callers can still extend it.
        self.stopwords = list(self.DEFAULT_STOPWORDS)

        # SECURITY: the access token used to be hard-coded here. Secrets must
        # not live in source; the previously committed token should be revoked.
        if auth_token is None:
            auth_token = os.environ.get("HF_TOKEN")
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path,
                                                       use_auth_token=auth_token)
        self.max_len = tokenizer_max_len
        # Defined up front so ``encode_fn()`` never hits a missing attribute.
        self.text_single = ""

    def clean_text(self, text):
        """Lower-case ``text`` and drop every whitespace-separated stopword.

        The cleaned string is returned and also stored on ``self.text_single``.
        """
        # A set gives O(1) membership tests; it is rebuilt per call so that
        # changes made to ``self.stopwords`` are honoured.
        stopwords = set(self.stopwords)
        kept_words = [word for word in text.lower().split() if word not in stopwords]
        self.text_single = ' '.join(kept_words)
        return self.text_single

    def encode_fn(self, text=None):
        """Tokenize one sentence, e.g. ``'Nairobi Hospital'``.

        Args:
            text: Sentence to encode. Defaults to the result of the most recent
                ``clean_text`` call (``self.text_single``).

        Returns:
            Tuple ``(input_ids, attention_mask)`` taken from the tokenizer
            output, requested as PyTorch tensors.
        """
        if text is None:
            text = self.text_single
        encoded = self.tokenizer(text,
                                 padding=True,
                                 truncation=True,
                                 max_length=self.max_len,
                                 return_tensors='pt')
        return encoded['input_ids'], encoded['attention_mask']

    def process_tokenizer(self, data):
        """Clean and tokenize a single new sentence.

        Returns:
            ``TensorDataset`` built from ``(input_ids, attention_mask)``.
        """
        cleaned = self.clean_text(data)
        input_ids, attention_mask = self.encode_fn(cleaned)
        return TensorDataset(input_ids, attention_mask)
class Facility_Model:
    """Sequence-classification model wrapper that ranks facility labels.

    Loads the model with ``AutoModelForSequenceClassification``, runs it on a
    prepared dataset, maps class indices to label names through a CSV
    relation table, and returns the best-scoring labels as a JSON string.
    """

    # CSV with a "label" column and a "corresponding_label" column; the latter
    # is looked up with the integer class index produced by the model.
    DEFAULT_LABEL_TABLE = 'dhis_label_relation_14357.csv'
    # Labels containing any of these substrings (case-insensitive) are dropped.
    EXCLUDED_KEYWORDS = ("dental", "optical", "eye")

    def __init__(self, facility_model_path: str,
                 max_len: int,
                 use_gpu: bool = False,
                 auth_token=None,
                 label_table_path=None):
        """Load the model and choose the device.

        Args:
            facility_model_path: Model id or path passed to ``from_pretrained``.
            max_len: Stored on the instance; not otherwise used by this class.
            use_gpu: Run on CUDA when it is available. Defaults to ``False``,
                which matches the previous CPU-only behaviour.
            auth_token: Optional Hugging Face access token. When omitted, the
                ``HF_TOKEN`` environment variable is used (``None`` if unset).
            label_table_path: CSV relation table; defaults to
                ``DEFAULT_LABEL_TABLE``.
        """
        self.max_len = max_len
        self.softmax = torch.nn.Softmax(dim=1)
        self.gpu = bool(use_gpu) and torch.cuda.is_available()
        self.label_table_path = label_table_path or self.DEFAULT_LABEL_TABLE
        self._label_map = None  # filled lazily by _load_label_map()

        # SECURITY: the access token used to be hard-coded here. Secrets must
        # not live in source; the previously committed token should be revoked.
        if auth_token is None:
            auth_token = os.environ.get("HF_TOKEN")
        self.model = AutoModelForSequenceClassification.from_pretrained(
            facility_model_path, use_auth_token=auth_token)
        self.model.eval()  # set pytorch model for inference mode

        if self.gpu:
            seed = 42
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            torch.backends.cudnn.deterministic = True
            self.device = torch.device('cuda')
        else:
            self.device = 'cpu'
        self.model = self.model.to(self.device)
        # Wrap for multi-GPU only when actually running on GPU; previously the
        # model was wrapped whenever several GPUs were visible and then moved
        # to the CPU.
        if self.gpu and torch.cuda.device_count() > 1:
            self.model = torch.nn.DataParallel(self.model)

    def _load_label_map(self) -> dict:
        """Return the ``corresponding_label -> label`` mapping, read once."""
        if self._label_map is None:
            table = pd.read_csv(self.label_table_path)
            self._label_map = table.set_index("corresponding_label")["label"].to_dict()
        return self._label_map

    def predict_single(self, model, pred_data):
        """Run ``model`` over ``pred_data`` and return softmax probabilities.

        Args:
            model: Callable accepting ``input_ids`` and ``attention_mask``
                keyword arguments and returning an object with ``.logits``.
            pred_data: Dataset whose items are ``(input_ids, attention_mask)``.

        Returns:
            NumPy array with one row of probabilities per input row. Rows from
            every batch are included (previously only the last batch was
            returned). An empty dataset yields an empty ``(0, 0)`` array.
        """
        pred_dataloader = DataLoader(pred_data, batch_size=10, shuffle=False)
        batch_probabilities = []
        with torch.no_grad():
            for batch in pred_dataloader:
                outputs = model(input_ids=batch[0].to(self.device),
                                attention_mask=batch[1].to(self.device))
                probability = self.softmax(outputs.logits)
                batch_probabilities.append(probability.detach().cpu().numpy())
        if not batch_probabilities:
            return np.empty((0, 0), dtype=np.float32)
        return np.concatenate(batch_probabilities, axis=0)

    def output_intent_probability(self, pred: np.ndarray) -> dict:
        """Map the first row of ``pred`` to ``{label name: probability}``.

        Raises:
            KeyError: If a class index has no entry in the relation table.
        """
        label_intent_dict = self._load_label_map()
        return {label_intent_dict[intent]: pred[0][intent]
                for intent in range(pred.shape[1])}

    def inference(self, prepared_data, top_k: int = 4):
        """Predict for one sentence and return the best labels as JSON.

        Labels containing "dental", "optical" or "eye" are removed, the rest
        are sorted by descending probability, and the first ``top_k`` (default
        4) are kept.

        Returns:
            JSON string holding a one-element list with a
            ``{label: probability}`` object.
        """
        prob_distribution = self.predict_single(self.model, prepared_data)
        prediction_results = self.output_intent_probability(prob_distribution.astype(float))
        filtered_results = {
            intent: prob for intent, prob in prediction_results.items()
            if not any(keyword in intent.lower() for keyword in self.EXCLUDED_KEYWORDS)
        }
        ranked = sorted(filtered_results.items(), key=lambda item: item[1], reverse=True)
        top_results = dict(ranked[:top_k])
        return json.dumps([top_results])
# Model identifier shared by the classifier and its tokenizer.
jacaranda_hugging_face_model = "Jacaranda/dhis_14000_600k_Test_Model"

# Both objects are created at import time with a maximum length of 128.
obj_Facility_Model = Facility_Model(jacaranda_hugging_face_model, 128)
processor = Preprocess(jacaranda_hugging_face_model, 128)