Spaces:
Runtime error
Runtime error
Upload facility_predict.py
Browse files- facility_predict.py +26 -9
facility_predict.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
import random
|
| 3 |
-
import gradio as gr
|
| 4 |
import json
|
| 5 |
import numpy as np
|
| 6 |
import torch
|
|
@@ -13,7 +13,8 @@ from torch.utils.data import TensorDataset, DataLoader
|
|
| 13 |
|
| 14 |
class Preprocess:
|
| 15 |
def __init__(self, tokenizer_vocab_path, tokenizer_max_len):
|
| 16 |
-
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path,
|
|
|
|
| 17 |
self.max_len = tokenizer_max_len
|
| 18 |
|
| 19 |
def clean_text(self, text):
|
|
@@ -59,13 +60,15 @@ class Preprocess:
|
|
| 59 |
data = TensorDataset(input_ids, attention_mask)
|
| 60 |
return data
|
| 61 |
|
|
|
|
| 62 |
class Facility_Model:
|
| 63 |
def __init__(self, facility_model_path: any,
|
| 64 |
max_len: int):
|
| 65 |
self.max_len = max_len
|
| 66 |
self.softmax = torch.nn.Softmax(dim=1)
|
| 67 |
self.gpu = False
|
| 68 |
-
self.model = AutoModelForSequenceClassification.from_pretrained(facility_model_path,
|
|
|
|
| 69 |
self.model.eval() # set pytorch model for inference mode
|
| 70 |
|
| 71 |
if torch.cuda.device_count() > 1:
|
|
@@ -105,9 +108,10 @@ class Facility_Model:
|
|
| 105 |
"""
|
| 106 |
output_dict = {}
|
| 107 |
# transform the relation table(between label and intent)
|
| 108 |
-
path_table = pd.read_csv('dhis_label_relation_14357.csv')
|
| 109 |
|
| 110 |
-
label_intent_dict = path_table[["label", "corresponding_label"]].set_index("corresponding_label").to_dict()[
|
|
|
|
| 111 |
|
| 112 |
# transform the output into dictionary(between intent and probability)
|
| 113 |
for intent in range(pred.shape[1]):
|
|
@@ -157,17 +161,30 @@ def predict_batch_from_csv(input_file, output_file):
|
|
| 157 |
# Initialize predictions list
|
| 158 |
predictions = []
|
| 159 |
|
|
|
|
| 160 |
# Iterate over rows with tqdm for progress tracking
|
| 161 |
for _, row in tqdm(batch_data.iterrows(), total=len(batch_data)):
|
| 162 |
-
text = row['
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
prepared_data = processor.process_tokenizer(cleaned_text)
|
| 165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
predictions.append(prediction)
|
| 167 |
|
| 168 |
# Create DataFrame for predictions
|
| 169 |
output_data = pd.DataFrame({'prediction': predictions})
|
|
|
|
| 170 |
# Merge with input DataFrame
|
| 171 |
-
pred_output_df = pd.concat([batch_data, output_data], axis=1)
|
|
|
|
| 172 |
# Save predictions to CSV
|
| 173 |
pred_output_df.to_csv(output_file, index=False)
|
|
|
|
| 1 |
+
# writefile facility_predict.py
|
| 2 |
import os
|
| 3 |
import random
|
|
|
|
| 4 |
import json
|
| 5 |
import numpy as np
|
| 6 |
import torch
|
|
|
|
| 13 |
|
| 14 |
class Preprocess:
|
| 15 |
def __init__(self, tokenizer_vocab_path, tokenizer_max_len):
|
| 16 |
+
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_vocab_path,
|
| 17 |
+
use_auth_token='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')
|
| 18 |
self.max_len = tokenizer_max_len
|
| 19 |
|
| 20 |
def clean_text(self, text):
|
|
|
|
| 60 |
data = TensorDataset(input_ids, attention_mask)
|
| 61 |
return data
|
| 62 |
|
| 63 |
+
|
| 64 |
class Facility_Model:
|
| 65 |
def __init__(self, facility_model_path: any,
|
| 66 |
max_len: int):
|
| 67 |
self.max_len = max_len
|
| 68 |
self.softmax = torch.nn.Softmax(dim=1)
|
| 69 |
self.gpu = False
|
| 70 |
+
self.model = AutoModelForSequenceClassification.from_pretrained(facility_model_path,
|
| 71 |
+
use_auth_token='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')
|
| 72 |
self.model.eval() # set pytorch model for inference mode
|
| 73 |
|
| 74 |
if torch.cuda.device_count() > 1:
|
|
|
|
| 108 |
"""
|
| 109 |
output_dict = {}
|
| 110 |
# transform the relation table(between label and intent)
|
| 111 |
+
path_table = pd.read_csv('/content/drive/MyDrive/dhis14000/dhis_label_relation_14357.csv')
|
| 112 |
|
| 113 |
+
label_intent_dict = path_table[["label", "corresponding_label"]].set_index("corresponding_label").to_dict()[
|
| 114 |
+
'label']
|
| 115 |
|
| 116 |
# transform the output into dictionary(between intent and probability)
|
| 117 |
for intent in range(pred.shape[1]):
|
|
|
|
| 161 |
# Initialize predictions list
|
| 162 |
predictions = []
|
| 163 |
|
| 164 |
+
# For each row: clean the text, tokenize it, and run inference (rows with missing text get an empty prediction)
|
| 165 |
# Iterate over rows with tqdm for progress tracking
|
| 166 |
for _, row in tqdm(batch_data.iterrows(), total=len(batch_data)):
|
| 167 |
+
text = row['pnc_fac_name'] # Column containing the text data; change 'pnc_fac_name' if your CSV uses a different column name
|
| 168 |
+
|
| 169 |
+
if pd.isnull(text):
|
| 170 |
+
cleaned_text = ""
|
| 171 |
+
else:
|
| 172 |
+
cleaned_text = processor.clean_text(text)
|
| 173 |
+
|
| 174 |
prepared_data = processor.process_tokenizer(cleaned_text)
|
| 175 |
+
|
| 176 |
+
if cleaned_text == "":
|
| 177 |
+
prediction = "" # Set prediction as empty string
|
| 178 |
+
else:
|
| 179 |
+
prediction = obj_Facility_Model.inference(prepared_data)
|
| 180 |
+
|
| 181 |
predictions.append(prediction)
|
| 182 |
|
| 183 |
# Create DataFrame for predictions
|
| 184 |
output_data = pd.DataFrame({'prediction': predictions})
|
| 185 |
+
|
| 186 |
# Merge with input DataFrame
|
| 187 |
+
pred_output_df = pd.concat([batch_data.reset_index(drop=True), output_data], axis=1)
|
| 188 |
+
|
| 189 |
# Save predictions to CSV
|
| 190 |
pred_output_df.to_csv(output_file, index=False)
|