File size: 2,046 Bytes
fc1c2b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0797029
 
 
 
 
fc1c2b8
 
 
 
 
 
 
 
 
 
 
 
 
 
0797029
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import numpy as np
np.random.seed(42)
import random
random.seed(42)

import pandas as pd
from sklearn.metrics import classification_report

import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional
import json

from copy import deepcopy

import torch

import transformers as transformers

from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed
)
from transformers.file_utils import is_offline_mode
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version

from src.datasets import ContrastiveClassificationDataset
from src.data_collators import DataCollatorContrastiveClassification
from src.modeling import ContrastiveClassifierModel

from src.metrics import compute_metrics_bce

from transformers import EarlyStoppingCallback

from transformers.utils.hp_naming import TrialShortNamer

from pdb import set_trace

import json

def model_fn(model_dir):
    """Build the classifier and its tokenizer from a checkpoint location.

    Args:
        model_dir: Passed to ``ContrastiveClassifierModel`` as
            ``checkpoint_path``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # AutoTokenizer is not part of this file's top-level imports, so it is
    # imported here; without it the call below raises NameError.
    from transformers import AutoTokenizer

    # The tokenizer is extended with the two marker tokens; the model is told
    # the resulting vocabulary size via len_tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(
        'roberta-base',
        additional_special_tokens=('[COL]', '[VAL]'),
    )
    model = ContrastiveClassifierModel(
        checkpoint_path=model_dir,
        len_tokenizer=len(tokenizer),
        model='roberta-base',
        frozen=False,
    )
    return model, tokenizer


def predict_fn(data, model_and_tokenizer):
    """Score ``data["inputs"]`` and return the rows whose prediction equals 1.

    Args:
        data: Mapping with an ``"inputs"`` entry, handed to
            ``ContrastiveClassificationDataset`` as its first argument.
        model_and_tokenizer: ``(model, tokenizer)`` pair, as returned by
            ``model_fn``.

    Returns:
        ``{"values": rows}`` where ``rows`` is a list of lists: the rows of
        the dataset's ``data`` frame (including the added ``prediction``
        column) for which ``prediction == 1``.
    """
    model, tokenizer = model_and_tokenizer

    # The dataset receives the tokenizer *name*; the collator receives the
    # tokenizer object itself.
    dataset = ContrastiveClassificationDataset(
        data["inputs"],
        dataset_type='test',
        size=512,
        tokenizer='roberta-base',
        dataset='serialized',
    )
    collator = DataCollatorContrastiveClassification(tokenizer)

    runner = Trainer(
        model=model,
        data_collator=collator,
        compute_metrics=compute_metrics_bce,
    )
    output = runner.predict(dataset, metric_key_prefix="predict")

    # The prediction column is written onto the dataset's own frame.
    frame = dataset.data
    frame['prediction'] = output.predictions

    # NOTE(review): the exact-equality filter presumes predictions are hard
    # 0/1 labels rather than scores -- confirm against the model/metric code.
    is_positive = frame['prediction'] == 1
    return {"values": frame[is_positive].values.tolist()}