File size: 1,850 Bytes
8505a58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import torch
import pandas as pd
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Model identifier handed to AutoModelForSequenceClassification.from_pretrained()
# in load_model().
MODEL_NAME = "bert-base-uncased"
# File that save_model() writes the model state dict to and load_model() reads
# it back from.
MODEL_PATH = "app/model.pth"

class TextDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` must support ``encodings["input_ids"][idx]`` and
    ``encodings["attention_mask"][idx]``; ``labels`` must support ``len()``
    and integer indexing.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The number of samples is defined by the labels, not the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``label`` of sample *idx*."""
        sample = {
            field: self.encodings[field][idx]
            for field in ("input_ids", "attention_mask")
        }
        # The label is wrapped in a fresh tensor on every access.
        sample["label"] = torch.tensor(self.labels[idx])
        return sample

def load_data(tokenizer, data_path="app/data.csv"):
    """Read a labelled CSV, split it into train/val/test and tokenize each split.

    The rows are split 60% / 20% / 20% with a fixed ``random_state`` of 42, so
    repeated calls on the same file produce the same partition.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, truncation=True,
            padding=True, return_tensors="pt")``. Its result is stored in a
            ``TextDataset``, which indexes it by ``"input_ids"`` and
            ``"attention_mask"``.
        data_path: Path of the CSV file to read. It must contain a ``text``
            and a ``label`` column. Defaults to ``"app/data.csv"``, the
            location that used to be hard-coded.

    Returns:
        A ``(train, val, test)`` tuple of ``TextDataset`` objects.
    """
    df = pd.read_csv(data_path)
    texts = df["text"].tolist()
    labels = df["label"].tolist()

    # Hold out 40% of the rows, then cut that hold-out in half to obtain
    # validation and test sets of equal size.
    X_train, X_temp, y_train, y_temp = train_test_split(
        texts, labels, test_size=0.4, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42
    )

    def _encode(split_texts):
        # One place for the tokenizer settings so all splits are encoded
        # identically. Each split is still tokenized in its own call, so the
        # padded length may differ between splits.
        return tokenizer(split_texts, truncation=True, padding=True, return_tensors="pt")

    return tuple(
        TextDataset(_encode(split_texts), split_labels)
        for split_texts, split_labels in (
            (X_train, y_train),
            (X_val, y_val),
            (X_test, y_test),
        )
    )

def save_model(model, tokenizer):
    """Write the model weights to ``MODEL_PATH`` and the tokenizer to ``app/tokenizer/``.

    Args:
        model: Object exposing ``state_dict()``; the returned state dict is
            serialized with ``torch.save``.
        tokenizer: Object exposing ``save_pretrained(path)``.
    """
    from pathlib import Path

    # torch.save opens the target file directly and does not create missing
    # parent directories, so make sure the destination folder exists first.
    Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), MODEL_PATH)
    tokenizer.save_pretrained("app/tokenizer/")

def load_model():
    """Rebuild the classifier and tokenizer from the files written by ``save_model``.

    The tokenizer is loaded from ``app/tokenizer/``. The model is instantiated
    from ``MODEL_NAME`` with two output labels, then its weights are replaced
    by the state dict stored at ``MODEL_PATH``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been switched to
        evaluation mode.
    """
    tokenizer = AutoTokenizer.from_pretrained("app/tokenizer/")
    classifier = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=2,
    )

    # Map every stored tensor onto the CPU.
    # NOTE(review): torch.load may unpickle arbitrary objects depending on the
    # installed torch version -- only point MODEL_PATH at trusted files.
    cpu = torch.device("cpu")
    weights = torch.load(MODEL_PATH, map_location=cpu)
    classifier.load_state_dict(weights)

    classifier.eval()
    return classifier, tokenizer