import torch
import pandas as pd
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = "bert-base-uncased"
MODEL_PATH = "app/model.pth"


class TextDataset(Dataset):
    """Wraps pre-tokenized encodings and labels for use with a DataLoader/Trainer."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "label": torch.tensor(self.labels[idx]),
        }

    def __len__(self):
        return len(self.labels)


def load_data(tokenizer):
    """Read the CSV, split 60/20/20 into train/val/test, and tokenize each split."""
    df = pd.read_csv("app/data.csv")
    texts = df["text"].tolist()
    labels = df["label"].tolist()

    # First carve off 40% for validation + test, then split that portion in half.
    X_train, X_temp, y_train, y_temp = train_test_split(
        texts, labels, test_size=0.4, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42
    )

    enc_train = tokenizer(X_train, truncation=True, padding=True, return_tensors="pt")
    enc_val = tokenizer(X_val, truncation=True, padding=True, return_tensors="pt")
    enc_test = tokenizer(X_test, truncation=True, padding=True, return_tensors="pt")

    return (
        TextDataset(enc_train, y_train),
        TextDataset(enc_val, y_val),
        TextDataset(enc_test, y_test),
    )


def save_model(model, tokenizer):
    """Persist the fine-tuned weights and the tokenizer files."""
    torch.save(model.state_dict(), MODEL_PATH)
    tokenizer.save_pretrained("app/tokenizer/")


def load_model():
    """Rebuild the architecture, load the saved weights, and switch to eval mode."""
    tokenizer = AutoTokenizer.from_pretrained("app/tokenizer/")
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device("cpu")))
    model.eval()
    return model, tokenizer
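

# Usage sketch (an assumption, not part of the original module): a minimal
# fine-tuning pass with the Hugging Face Trainer, wiring together load_data(),
# save_model(), and load_model(). The TrainingArguments values and the
# "app/checkpoints" directory are illustrative placeholders.
if __name__ == "__main__":
    from transformers import Trainer, TrainingArguments

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

    train_ds, val_ds, test_ds = load_data(tokenizer)

    args = TrainingArguments(
        output_dir="app/checkpoints",
        num_train_epochs=1,
        per_device_train_batch_size=8,
    )
    trainer = Trainer(model=model, args=args, train_dataset=train_ds, eval_dataset=val_ds)
    trainer.train()

    save_model(model, tokenizer)

    # Reload for inference and run a quick sanity check on a single example.
    model, tokenizer = load_model()
    inputs = tokenizer("example text", return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        pred = model(**inputs).logits.argmax(dim=-1).item()
    print(pred)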