Spaces:

Jacaranda
/

Facility_Predict

Runtime error

App Files Community

stanslausmwongela committed on Jun 7, 2023

Commit

57373b0

1 Parent(s): b0ed1a9

Updated Cleaning Text Function

Browse files

Files changed (1) hide show

predict.py +36 -24

predict.py CHANGED Viewed

@@ -10,38 +10,50 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
 from torch.utils.data import TensorDataset, DataLoader
 class Preprocess:
     def __init__(self, tokenizer_vocab_path, tokenizer_max_len):
         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path,
                                                        use_auth_token='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')
         self.max_len = tokenizer_max_len
     def clean_text(self, text):
         text = text.lower()
-        stopwords = ["i", "was", "transferred",
-                     "from", "to", "nilienda", "kituo",
-                     "cha", "lakini", "saa", "hii", "niko",
-                     "at", "nilienda", "nikahudumiwa", "pole",
-                     "deliver", "na", "ni", "baada", "ya",
-                     "kutumwa", "kutoka", "nilienda",
-                     "ndipo", "nikapewa", "hiyo", "lindam ama", "nikawa",
-                     "mgonjwa", "nikatibiwa", "in", "had", "a",
-                     "visit", "gynaecologist", "ndio",
-                     "karibu", "mimi", "niko", "sehemu", "hospitali",
-                     "serikali", "delivered", "katika", "kaunti", "kujifungua",
-                     "katika", "huko", "nilipoenda", "kwa", "bado", "naedelea",
-                     "sija", "maliza", "mwisho",
-                     "nilianza", "kliniki", "yangu",
-                     "nilianzia", "nilijifungua"]
-        text_single = ' '.join(word for word in text.split() if word not in stopwords)
-        return text_single
-    def encode_fn(self, text_single):
         """
         Using tokenizer to preprocess the text
         example of text_single:'Nairobi Hospital'
         """
-        tokenizer = self.tokenizer(text_single,
                                    padding=True,
                                    truncation=True,
                                    max_length=self.max_len,
@@ -51,15 +63,15 @@ class Preprocess:
         attention_mask = tokenizer['attention_mask']
         return input_ids, attention_mask
-    def process_tokenizer(self, text_single):
         """
         Preprocess text and prepare dataloader for a single new sentence
         """
-        input_ids, attention_mask = self.encode_fn(text_single)
         data = TensorDataset(input_ids, attention_mask)
         return data
 class Facility_Model:
     def __init__(self, facility_model_path: any,
                  max_len: int):
@@ -107,7 +119,7 @@ class Facility_Model:
         """
         output_dict = {}
         # transform the relation table(between label and intent)
-        path_table = pd.read_csv('dhis_label_relation_14357.csv')
         label_intent_dict = path_table[["label", "corresponding_label"]].set_index("corresponding_label").to_dict()[
             'label']

 from torch.utils.data import TensorDataset, DataLoader
+import os
+import random
+import json
+import numpy as np
+import torch
+import heapq
+import pandas as pd
+from tqdm import tqdm
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from torch.utils.data import TensorDataset, DataLoader
 class Preprocess:
     def __init__(self, tokenizer_vocab_path, tokenizer_max_len):
+        self.stopwords = ["i", "was", "transferred",
+                          "from", "to", "nilienda", "kituo",
+                          "cha", "lakini", "saa", "hii", "niko",
+                          "at", "nilienda", "nikahudumiwa", "pole",
+                          "deliver", "na", "ni", "baada", "ya",
+                          "kutumwa", "kutoka", "nilienda",
+                          "ndipo", "nikapewa", "hiyo", "lindam ama", "nikawa",
+                          "mgonjwa", "nikatibiwa", "in", "had", "a",
+                          "visit", "gynaecologist", "ndio",
+                          "karibu", "mimi", "niko", "sehemu", "hospitali",
+                          "serikali", "delivered", "katika", "kaunti", "kujifungua",
+                          "katika", "huko", "nilipoenda", "kwa", "bado", "naedelea",
+                          "sija", "maliza", "mwisho",
+                          "nilianza", "kliniki", "yangu",
+                          "nilianzia", "nilijifungua"]
         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path,
                                                        use_auth_token='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')
         self.max_len = tokenizer_max_len
     def clean_text(self, text):
         text = text.lower()
+        self.text_single = ' '.join(word for word in text.split() if word not in self.stopwords)
+        return self.text_single
+    def encode_fn(self):
         """
         Using tokenizer to preprocess the text
         example of text_single:'Nairobi Hospital'
         """
+        tokenizer = self.tokenizer(self.text_single,
                                    padding=True,
                                    truncation=True,
                                    max_length=self.max_len,
         attention_mask = tokenizer['attention_mask']
         return input_ids, attention_mask
+    def process_tokenizer(self, data):
         """
         Preprocess text and prepare dataloader for a single new sentence
         """
+        self.clean_text(data)
+        input_ids, attention_mask = self.encode_fn()
         data = TensorDataset(input_ids, attention_mask)
         return data
 class Facility_Model:
     def __init__(self, facility_model_path: any,
                  max_len: int):
         """
         output_dict = {}
         # transform the relation table(between label and intent)
+        path_table = pd.read_csv('/content/drive/MyDrive/dhis14000/dhis_label_relation_14357.csv')
         label_intent_dict = path_table[["label", "corresponding_label"]].set_index("corresponding_label").to_dict()[
             'label']