Spaces:
Build error
Build error
| import torch | |
| import pandas as pd | |
| from torch.utils.data import Dataset | |
| from sklearn.model_selection import train_test_split | |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification | |
# Checkpoint identifier handed to
# AutoModelForSequenceClassification.from_pretrained in load_model().
MODEL_NAME = "bert-base-uncased"

# File that save_model() writes the state dict to and load_model() reads it
# back from.
MODEL_PATH = "app/model.pth"
class TextDataset(Dataset):
    """Dataset that pairs tokenizer encodings with one label per example.

    ``encodings`` must support ``["input_ids"]`` and ``["attention_mask"]``
    lookups whose results are indexable by example position. ``labels`` is
    indexed with the same position. The two are assumed (not checked) to be
    aligned and of equal length.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``label`` at ``idx``."""
        sample = {
            field: self.encodings[field][idx]
            for field in ("input_ids", "attention_mask")
        }
        # The label is wrapped in a tensor on every access; the encodings are
        # returned exactly as stored.
        sample["label"] = torch.tensor(self.labels[idx])
        return sample
def load_data(tokenizer, data_path="app/data.csv"):
    """Read the labelled CSV and build train/validation/test datasets.

    The file is read with pandas and must contain a ``text`` and a ``label``
    column. Examples are split roughly 60/20/20 into train, validation and
    test using a fixed ``random_state`` of 42, so the same input file always
    yields the same split.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding=True, return_tensors="pt")``. Its result must be
            indexable by ``"input_ids"`` and ``"attention_mask"``.
        data_path: Location of the CSV file. Defaults to ``"app/data.csv"``,
            the path that was previously hard-coded.

    Returns:
        A ``(train, val, test)`` tuple of ``TextDataset`` instances.

    Raises:
        KeyError: If the CSV lacks a ``text`` or ``label`` column.
    """

    def encode(batch):
        # Each split is tokenized on its own, so padding=True pads relative
        # to that split only and sequence lengths may differ between splits.
        return tokenizer(batch, truncation=True, padding=True, return_tensors="pt")

    df = pd.read_csv(data_path)
    texts = df["text"].tolist()
    labels = df["label"].tolist()

    # First hold out 40% of the data, then halve that hold-out into
    # validation and test.
    X_train, X_temp, y_train, y_temp = train_test_split(
        texts, labels, test_size=0.4, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42
    )

    return (
        TextDataset(encode(X_train), y_train),
        TextDataset(encode(X_val), y_val),
        TextDataset(encode(X_test), y_test),
    )
def save_model(model, tokenizer):
    """Write the model weights and the tokenizer files to disk.

    The model's ``state_dict()`` is saved to ``MODEL_PATH`` with
    ``torch.save``, and ``tokenizer.save_pretrained`` is called with the
    ``"app/tokenizer/"`` directory.

    Args:
        model: Object exposing ``state_dict()``.
        tokenizer: Object exposing ``save_pretrained(path)``.
    """
    tokenizer_dir = "app/tokenizer/"
    weights = model.state_dict()
    torch.save(weights, MODEL_PATH)
    tokenizer.save_pretrained(tokenizer_dir)
def load_model():
    """Rebuild the saved classifier and tokenizer for inference on CPU.

    The tokenizer is loaded from ``"app/tokenizer/"``. The model is created
    from ``MODEL_NAME`` with two output labels, and its weights are then
    replaced by the state dict stored at ``MODEL_PATH``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been switched to
        evaluation mode.
    """
    tokenizer = AutoTokenizer.from_pretrained("app/tokenizer/")
    classifier = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=2
    )

    # NOTE(review): torch.load unpickles the checkpoint file. If MODEL_PATH
    # could ever come from an untrusted source, consider weights_only=True.
    cpu = torch.device("cpu")
    state = torch.load(MODEL_PATH, map_location=cpu)
    classifier.load_state_dict(state)

    classifier.eval()
    return classifier, tokenizer