import pickle
import random
from collections import Counter

import numpy as np
import torch

from src.index import Indexer  # unused here; presumably the type of `index` below
from src.text_embedding import TextEmbeddingModel  # unused here; presumably the type of `model`


def softmax_weights(scores, temperature=1.0):
    """Turn similarity scores into softmax weights; lower temperature sharpens them."""
    scores = np.array(scores) / temperature
    e_scores = np.exp(scores - np.max(scores))  # subtract max for numerical stability
    return e_scores / np.sum(e_scores)
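
# Quick sanity check (rounded): softmax_weights([2.0, 1.0]) is roughly
# [0.73, 0.27], while softmax_weights([2.0, 1.0], temperature=0.1) is
# ~[1.0, 0.0]; the low temperature used below puts almost all of the weight
# on the nearest neighbours.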

def normalize_fuzzy_cnt(fuzzy_cnt):
    """Normalize the values of fuzzy_cnt so they sum to 1 (mutates the dict in place)."""
    total = sum(fuzzy_cnt.values())
    if total == 0:
        return fuzzy_cnt
    for key in fuzzy_cnt:
        fuzzy_cnt[key] /= total
    return fuzzy_cnt
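
# Example: normalize_fuzzy_cnt({"a": 2.0, "b": 2.0}) returns {"a": 0.5, "b": 0.5}
# (and mutates its argument in place).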

def class_type_boost(query_type, candidate_type):
    """Weight a neighbour's vote by how closely its mixed-type code agrees
    with the majority type (symmetric in its arguments)."""
    if query_type == candidate_type:
        return 1.3
    elif abs(query_type - candidate_type) == 1:
        return 1.1
    elif abs(query_type - candidate_type) == 2:
        return 0.9
    else:
        return 0.8
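
# With the pure-AI code being 9 (written 10 ^ 3 in the label tuples below),
# an AI neighbour voting under a human majority falls through to the last
# branch: class_type_boost(9, 0) == 0.8, while a mixed neighbour gets
# class_type_boost(1, 0) == 1.1.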

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    np.random.seed(seed)  # Numpy module.
    random.seed(seed)  # Python random module.

def load_pkl(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def infer_3_class(model, tokenizer, index, label_dict, is_mixed_dict, text_list, K):
    encoded_text = tokenizer.batch_encode_plus(
        text_list,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    # Convert the BatchEncoding to a plain dict; if the model runs on GPU,
    # each tensor would additionally need a .to(device) here.
    encoded_text = {k: v for k, v in encoded_text.items()}
    with torch.no_grad():
        embeddings = model(encoded_text).cpu().numpy()
    top_ids_and_scores = index.search_knn(embeddings, K)
    pred = []
    for ids, scores in top_ids_and_scores:
        # Neighbour indices sorted by descending similarity.
        order = np.argsort(scores)[::-1]
        topk_ids = [ids[j] for j in order]
        topk_scores = [scores[j] for j in order]
        weights = softmax_weights(topk_scores, temperature=0.1)

        # Majority mixed-type code among the neighbours.
        candidate_types = [is_mixed_dict[int(_id)] for _id in topk_ids]
        initial_pred = Counter(candidate_types).most_common(1)[0][0]

        # NB: 10 ^ 3 is Python's bitwise XOR (== 9), not 10**3; it is kept
        # as-is since it must match the pure-AI code stored in is_mixed_dict.
        fuzzy_cnt = {(1, 0): 0.0, (0, 10 ^ 3): 0.0, (1, 1): 0.0}
        for _id, weight in zip(topk_ids, weights):
            label = (label_dict[int(_id)], is_mixed_dict[int(_id)])
            boost = class_type_boost(is_mixed_dict[int(_id)], initial_pred)
            fuzzy_cnt[label] += weight * boost

        total_score = sum(fuzzy_cnt.values())
        # 3-class probabilities as percentages: 0 = human, 1 = AI, 2 = human + AI.
        final = {
            0: round(fuzzy_cnt[(1, 0)] / total_score * 100, 2),
            1: round(fuzzy_cnt[(0, 10 ^ 3)] / total_score * 100, 2),
            2: round(fuzzy_cnt[(1, 1)] / total_score * 100, 2),
        }
        pred.append(final)
    return pred
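
# Hypothetical shape of the output for one input text (numbers made up):
#   [{0: 12.3, 1: 80.1, 2: 7.6}]
# read as 12.3% human, 80.1% AI, 7.6% human + AI.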

def infer_model_specific(model, tokenizer, index, label_dict, is_mixed_dict, write_model_dict, text_list, K, K_model):
    encoded_text = tokenizer.batch_encode_plus(
        text_list,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    # Convert the BatchEncoding to a plain dict; if the model runs on GPU,
    # each tensor would additionally need a .to(device) here.
    encoded_text = {k: v for k, v in encoded_text.items()}
    with torch.no_grad():
        embeddings = model(encoded_text).cpu().numpy()

    # Retrieve the K nearest neighbours once; the model-specific vote reuses
    # the top K_model of them.
    top_ids_and_scores = index.search_knn(embeddings, K)
    pred = []

    for ids, scores in top_ids_and_scores:
        order = np.argsort(scores)[::-1]

        # All K results, used for the 3-class prediction.
        topk_ids = [ids[j] for j in order]
        topk_scores = [scores[j] for j in order]

        # Top K_model results, used for the model-specific prediction.
        topk_ids_model = topk_ids[:K_model]
        topk_scores_model = topk_scores[:K_model]

        # --- 3-class prediction (all K neighbours) ---
        weights_3class = softmax_weights(topk_scores, temperature=0.1)
        candidate_types = [is_mixed_dict[int(_id)] for _id in topk_ids]
        initial_pred = Counter(candidate_types).most_common(1)[0][0]

        # NB: 10 ^ 3 is bitwise XOR (== 9), matching the pure-AI code in the
        # label dictionaries.
        fuzzy_cnt_3class = {(1, 0): 0.0, (0, 10 ^ 3): 0.0, (1, 1): 0.0}
        for _id, weight in zip(topk_ids, weights_3class):
            label_3class = (label_dict[int(_id)], is_mixed_dict[int(_id)])
            boost_3class = class_type_boost(is_mixed_dict[int(_id)], initial_pred)
            fuzzy_cnt_3class[label_3class] += weight * boost_3class

        # --- Model-specific prediction (top K_model neighbours) ---
        weights_model = softmax_weights(topk_scores_model, temperature=0.4)
        candidate_types_model = [is_mixed_dict[int(_id)] for _id in topk_ids_model]
        initial_pred_model = Counter(candidate_types_model).most_common(1)[0][0]

        fuzzy_cnt_model = {
            (1, 0, 0): 0.0,  # Human
            (0, 10 ^ 3, 1): 0.0, (0, 10 ^ 3, 2): 0.0, (0, 10 ^ 3, 3): 0.0, (0, 10 ^ 3, 4): 0.0,  # AI
            (1, 1, 1): 0.0, (1, 1, 2): 0.0, (1, 1, 3): 0.0, (1, 1, 4): 0.0,  # Human + AI
        }

        for _id, weight in zip(topk_ids_model, weights_model):
            label_model = (label_dict[int(_id)], is_mixed_dict[int(_id)], write_model_dict[int(_id)])
            boost_model = class_type_boost(is_mixed_dict[int(_id)], initial_pred_model)
            fuzzy_cnt_model[label_model] += weight * boost_model

        # 3-class probabilities as percentages: 0 = human, 1 = AI, 2 = human + AI.
        total_score_3class = sum(fuzzy_cnt_3class.values())
        final_3class = {
            0: round(fuzzy_cnt_3class[(1, 0)] / total_score_3class * 100, 2),
            1: round(fuzzy_cnt_3class[(0, 10 ^ 3)] / total_score_3class * 100, 2),
            2: round(fuzzy_cnt_3class[(1, 1)] / total_score_3class * 100, 2),
        }

        # Model-specific prediction: the label tuple with the highest vote.
        final_model = max(fuzzy_cnt_model, key=fuzzy_cnt_model.get)

        # Combine both predictions.
        pred.append({
            "score": final_3class,
            "model": final_model,
        })

    return pred
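

if __name__ == "__main__":
    # Hypothetical demo, not part of the original file: it replays the fuzzy
    # voting above on synthetic neighbours, so it runs without the real
    # embedding model or nearest-neighbour index. All ids, scores, and labels
    # here are made up.
    set_seed(0)
    toy_ids = [101, 102, 103]
    toy_scores = [0.9, 0.8, 0.2]  # similarity of each neighbour to the query
    toy_label_dict = {101: 1, 102: 0, 103: 1}
    toy_is_mixed = {101: 0, 102: 10 ^ 3, 103: 1}  # 0 = human, 9 = AI, 1 = mixed

    weights = softmax_weights(toy_scores, temperature=0.1)
    initial_pred = Counter(toy_is_mixed[i] for i in toy_ids).most_common(1)[0][0]

    fuzzy_cnt = {(1, 0): 0.0, (0, 10 ^ 3): 0.0, (1, 1): 0.0}
    for _id, weight in zip(toy_ids, weights):
        label = (toy_label_dict[_id], toy_is_mixed[_id])
        fuzzy_cnt[label] += weight * class_type_boost(toy_is_mixed[_id], initial_pred)

    print(normalize_fuzzy_cnt(fuzzy_cnt))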