Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import torch | |
| from torch.utils.data import Dataset, DataLoader | |
| from sklearn.preprocessing import LabelEncoder | |
| from transformers import BertTokenizer, RobertaTokenizer, DebertaTokenizer | |
| import pickle | |
| import os | |
| from config import TEXT_COLUMN, LABEL_COLUMNS, MAX_LEN, TOKENIZER_PATH, LABEL_ENCODERS_PATH, METADATA_COLUMNS | |
class ComplianceDataset(Dataset):
    """Text-only dataset: tokenizes one example per index and pairs it with its label tensor."""

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # return_tensors="pt" adds a batch dimension of 1; drop it so the
        # DataLoader's collate can stack per-example tensors itself.
        features = {name: tensor.squeeze(0) for name, tensor in encoded.items()}
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, target
class ComplianceDatasetWithMetadata(Dataset):
    """Dataset variant that also yields a float tensor of numeric metadata per example."""

    def __init__(self, texts, metadata, labels, tokenizer, max_len):
        self.texts = texts
        self.metadata = metadata
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            str(self.texts[idx]),
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # Strip the singleton batch dimension added by return_tensors="pt".
        features = {name: tensor.squeeze(0) for name, tensor in encoded.items()}
        meta = torch.tensor(self.metadata[idx], dtype=torch.float)
        target = torch.tensor(self.labels[idx], dtype=torch.long)
        return features, meta, target
def load_and_preprocess_data(data_path):
    """Load the CSV at *data_path* and prepare it for training.

    Steps:
      1. Columns listed in METADATA_COLUMNS (when present) are coerced to
         numeric; unparseable or missing values become 0.
      2. Remaining missing values are filled with the placeholder "Unknown".
      3. Each column in LABEL_COLUMNS is integer-encoded with its own
         freshly fitted LabelEncoder.

    Args:
        data_path: Path to the input CSV file.

    Returns:
        (data, label_encoders): the preprocessed DataFrame and a dict mapping
        each label column name to its fitted LabelEncoder.
    """
    data = pd.read_csv(data_path)

    # Coerce metadata columns BEFORE the global "Unknown" fill. The previous
    # order filled NaNs in these columns with the string "Unknown" (forcing
    # object dtype) only to coerce those strings straight back to NaN -> 0.
    # Coercing first produces identical values without the dtype churn.
    for col in METADATA_COLUMNS:
        if col in data.columns:
            data[col] = pd.to_numeric(data[col], errors='coerce').fillna(0)

    # Fill the remaining (non-metadata) missing values with a sentinel
    # category so label/text columns never carry NaN. Avoids inplace=True,
    # which pandas discourages.
    data = data.fillna("Unknown")

    label_encoders = {col: LabelEncoder() for col in LABEL_COLUMNS}
    for col in LABEL_COLUMNS:
        data[col] = label_encoders[col].fit_transform(data[col])
    return data, label_encoders
def get_tokenizer(model_name):
    """Return the tokenizer class matching *model_name*, loaded via from_pretrained.

    The checks run from most to least specific on purpose: both "deberta"
    and "roberta" contain the substring "bert", so testing "bert" first
    would misroute those model families.

    Raises:
        ValueError: if the model name matches no supported family.
    """
    lowered = model_name.lower()
    if "deberta" in lowered:
        return DebertaTokenizer.from_pretrained(model_name)
    if "roberta" in lowered:
        return RobertaTokenizer.from_pretrained(model_name)
    if "bert" in lowered:
        return BertTokenizer.from_pretrained(model_name)
    raise ValueError(f"Unsupported tokenizer for model: {model_name}")
def save_label_encoders(label_encoders):
    """Pickle the dict of fitted LabelEncoders to LABEL_ENCODERS_PATH for reuse at inference."""
    with open(LABEL_ENCODERS_PATH, "wb") as handle:
        pickle.dump(label_encoders, handle)
    print(f"Label encoders saved to {LABEL_ENCODERS_PATH}")
def load_label_encoders():
    """Unpickle and return the label-encoder dict written by save_label_encoders()."""
    with open(LABEL_ENCODERS_PATH, "rb") as handle:
        encoders = pickle.load(handle)
    return encoders
def get_num_labels(label_encoders):
    """Return, per label column, the number of distinct classes its encoder learned."""
    sizes = []
    for col in LABEL_COLUMNS:
        sizes.append(len(label_encoders[col].classes_))
    return sizes