Spaces:
Sleeping
Sleeping
| import os | |
| import pickle | |
| import numpy as np | |
| from src.index import Indexer | |
| import torch | |
| import argparse | |
| from src.text_embedding import TextEmbeddingModel | |
| import random | |
| from collections import Counter | |
def softmax_weights(scores, temperature=1.0):
    """Convert raw scores into softmax weights that sum to 1.

    Args:
        scores: Sequence (or array) of numeric scores.
        temperature: Divisor applied to the scores before exponentiation.
            Values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        A NumPy array of the same shape as ``scores`` holding the weights.
    """
    scaled = np.array(scores) / temperature
    # Subtracting the maximum does not change the softmax result but keeps
    # the exponentials from overflowing for large scores.
    exponentials = np.exp(scaled - scaled.max())
    return exponentials / exponentials.sum()
def normalize_fuzzy_cnt(fuzzy_cnt):
    """Scale the values of ``fuzzy_cnt`` in place so that they sum to 1.

    Args:
        fuzzy_cnt: Mapping from label to numeric score. It is modified in
            place.

    Returns:
        The same mapping object that was passed in. When the values sum to
        zero (including the empty mapping) it is returned unchanged.
    """
    denominator = sum(fuzzy_cnt.values())
    if denominator != 0:
        for label in fuzzy_cnt:
            fuzzy_cnt[label] /= denominator
    return fuzzy_cnt
def class_type_boost(query_type, candidate_type):
    """Return a multiplicative boost based on how far apart two type codes are.

    Args:
        query_type: Numeric type code of the query side.
        candidate_type: Numeric type code of the candidate side.

    Returns:
        1.3 when the codes are equal, 1.1 when they differ by exactly 1,
        0.9 when they differ by exactly 2, and 0.8 otherwise.
    """
    if query_type == candidate_type:
        return 1.3

    distance = abs(query_type - candidate_type)
    if distance == 1:
        return 1.1
    if distance == 2:
        return 0.9
    return 0.8
def set_seed(seed):
    """Seed every random number generator this module relies on.

    Seeds, in order: PyTorch (CPU), PyTorch CUDA for the current device,
    PyTorch CUDA for all devices (multi-GPU), NumPy, and Python's ``random``.

    Args:
        seed: Integer seed passed unchanged to each generator.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers the multi-GPU case
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
def load_pkl(path):
    """Load and return the object stored in the pickle file at ``path``.

    NOTE: ``pickle.load`` can execute arbitrary code while deserializing, so
    this must only be used on files from a trusted source.

    Args:
        path: Filesystem path of the pickle file.

    Returns:
        The unpickled Python object.
    """
    with open(path, 'rb') as stream:
        payload = pickle.load(stream)
    return payload
def infer_3_class(model, tokenizer, index, label_dict, is_mixed_dict, text_list, K):
    """Score each text against three classes from its K nearest neighbours.

    Each text is embedded, its K nearest neighbours are fetched from
    ``index``, and every neighbour votes for its ``(label, is_mixed)`` pair
    with a weight of ``softmax(score / 0.1) * class_type_boost(...)``. The
    boost compares the neighbour's ``is_mixed`` value with the most common
    ``is_mixed`` value among the neighbours.

    Args:
        model: Callable that maps the tokenized batch (a plain dict) to a
            tensor of embeddings.
        tokenizer: Object exposing ``batch_encode_plus``.
        index: Object exposing ``search_knn(embeddings, K)``; its result is
            iterated as ``(ids, scores)`` pairs (presumably one pair per
            input text -- confirm against the index implementation).
        label_dict: Maps an integer neighbour id to its label value.
        is_mixed_dict: Maps an integer neighbour id to its ``is_mixed`` value.
        text_list: List of input strings.
        K: Number of neighbours requested from the index.

    Returns:
        A list with one dict per ``(ids, scores)`` pair. Keys 0, 1 and 2 hold
        the percentage (rounded to 2 decimals) of the total vote received by
        the pairs ``(1, 0)``, ``(0, 10 ^ 3)`` and ``(1, 1)`` respectively
        (presumably human / AI / human+AI -- confirm with the data owner).

    Raises:
        KeyError: If a neighbour's ``(label, is_mixed)`` pair is not one of
            the three pairs above, or its id is missing from a lookup dict.
    """
    # NOTE(review): `10 ^ 3` is bitwise XOR in Python and evaluates to 9, not
    # 1000. The value is kept exactly as written because the keys below must
    # match whatever is stored in `is_mixed_dict`; confirm which was intended.
    ai_mixed_flag = 10 ^ 3
    human_key = (1, 0)
    ai_key = (0, ai_mixed_flag)
    mixed_key = (1, 1)

    encoded_text = tokenizer.batch_encode_plus(
        text_list,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    # The model is handed a plain dict copy of the tokenizer output.
    encoded_text = dict(encoded_text.items())

    # Inference only: skip building the autograd graph to save memory.
    with torch.no_grad():
        embeddings = model(encoded_text).cpu().detach().numpy()

    pred = []
    for ids, scores in index.search_knn(embeddings, K):
        # Order neighbours from highest to lowest score.
        order = np.argsort(scores)[::-1]
        neighbour_ids = [int(ids[j]) for j in order]
        neighbour_scores = [scores[j] for j in order]

        weights = softmax_weights(neighbour_scores, temperature=0.1)
        mixed_flags = [is_mixed_dict[nid] for nid in neighbour_ids]
        # Majority `is_mixed` value among the neighbours; neighbours that
        # agree with it (or are close to it) get a larger boost.
        initial_pred = Counter(mixed_flags).most_common(1)[0][0]

        fuzzy_cnt = {human_key: 0.0, ai_key: 0.0, mixed_key: 0.0}
        for nid, flag, weight in zip(neighbour_ids, mixed_flags, weights):
            boost = class_type_boost(flag, initial_pred)
            fuzzy_cnt[(label_dict[nid], flag)] += weight * boost

        total_score = sum(fuzzy_cnt.values())
        pred.append({
            0: round(fuzzy_cnt[human_key] / total_score * 100, 2),
            1: round(fuzzy_cnt[ai_key] / total_score * 100, 2),
            2: round(fuzzy_cnt[mixed_key] / total_score * 100, 2),
        })
    return pred
def infer_model_specific(model, tokenizer, index, label_dict, is_mixed_dict,
                         write_model_dict, text_list, K, K_model):
    """Produce 3-class percentages plus a model-specific label for each text.

    Each text is embedded and its K nearest neighbours are fetched from
    ``index``. Two weighted votes are then run over those neighbours:

    * 3-class vote: all K neighbours, softmax temperature 0.1, keyed by
      ``(label, is_mixed)``.
    * Model-specific vote: only the ``K_model`` highest-scoring neighbours,
      softmax temperature 0.4, keyed by ``(label, is_mixed, write_model)``.

    In both votes a neighbour's weight is multiplied by
    ``class_type_boost(is_mixed, majority_is_mixed)``, where the majority is
    taken over the neighbours taking part in that vote.

    Args:
        model: Callable that maps the tokenized batch (a plain dict) to a
            tensor of embeddings.
        tokenizer: Object exposing ``batch_encode_plus``.
        index: Object exposing ``search_knn(embeddings, K)``; its result is
            iterated as ``(ids, scores)`` pairs (presumably one pair per
            input text -- confirm against the index implementation).
        label_dict: Maps an integer neighbour id to its label value.
        is_mixed_dict: Maps an integer neighbour id to its ``is_mixed`` value.
        write_model_dict: Maps an integer neighbour id to its writer-model
            code (0 for the human key, 1-4 for the AI and human+AI keys).
        text_list: List of input strings.
        K: Number of neighbours requested from the index (3-class vote).
        K_model: Number of top neighbours used for the model-specific vote.

    Returns:
        A list with one dict per ``(ids, scores)`` pair:
        ``{"score": {0: pct, 1: pct, 2: pct}, "model": key}``. The
        percentages are the shares of the 3-class vote for ``(1, 0)``,
        ``(0, 10 ^ 3)`` and ``(1, 1)``, rounded to 2 decimals. ``"model"`` is
        the ``(label, is_mixed, write_model)`` tuple with the highest
        model-specific vote (first key wins on ties).

    Raises:
        KeyError: If a neighbour maps to a key that is not pre-declared in
            the vote tables, or its id is missing from a lookup dict.
    """
    # NOTE(review): `10 ^ 3` is bitwise XOR in Python and evaluates to 9, not
    # 1000. The value is kept exactly as written because the keys below must
    # match whatever is stored in `is_mixed_dict`; confirm which was intended.
    ai_mixed_flag = 10 ^ 3
    human_key = (1, 0)
    ai_key = (0, ai_mixed_flag)
    mixed_key = (1, 1)

    encoded_text = tokenizer.batch_encode_plus(
        text_list,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    # The model is handed a plain dict copy of the tokenizer output.
    encoded_text = dict(encoded_text.items())

    # Inference only: skip building the autograd graph to save memory.
    with torch.no_grad():
        embeddings = model(encoded_text).cpu().detach().numpy()

    pred = []
    for ids, scores in index.search_knn(embeddings, K):
        # Order all K neighbours from highest to lowest score.
        order = np.argsort(scores)[::-1]
        neighbour_ids = [int(ids[j]) for j in order]
        neighbour_scores = [scores[j] for j in order]
        mixed_flags = [is_mixed_dict[nid] for nid in neighbour_ids]

        # --- 3-class vote over all K neighbours ---
        weights_3class = softmax_weights(neighbour_scores, temperature=0.1)
        initial_pred = Counter(mixed_flags).most_common(1)[0][0]
        fuzzy_cnt_3class = {human_key: 0.0, ai_key: 0.0, mixed_key: 0.0}
        for nid, flag, weight in zip(neighbour_ids, mixed_flags, weights_3class):
            boost_3class = class_type_boost(flag, initial_pred)
            fuzzy_cnt_3class[(label_dict[nid], flag)] += weight * boost_3class

        # --- Model-specific vote over the top K_model neighbours ---
        model_ids = neighbour_ids[:K_model]
        model_flags = mixed_flags[:K_model]
        weights_model = softmax_weights(neighbour_scores[:K_model], temperature=0.4)
        initial_pred_model = Counter(model_flags).most_common(1)[0][0]
        # Insertion order matters: `max` below returns the first key on ties.
        fuzzy_cnt_model = {
            (1, 0, 0): 0.0,  # Human
            (0, ai_mixed_flag, 1): 0.0,  # AI
            (0, ai_mixed_flag, 2): 0.0,
            (0, ai_mixed_flag, 3): 0.0,
            (0, ai_mixed_flag, 4): 0.0,
            (1, 1, 1): 0.0,  # Human+AI
            (1, 1, 2): 0.0,
            (1, 1, 3): 0.0,
            (1, 1, 4): 0.0,
        }
        for nid, flag, weight in zip(model_ids, model_flags, weights_model):
            label_model = (label_dict[nid], flag, write_model_dict[nid])
            boost_model = class_type_boost(flag, initial_pred_model)
            fuzzy_cnt_model[label_model] += weight * boost_model

        # Convert the 3-class vote into percentages.
        total_score_3class = sum(fuzzy_cnt_3class.values())
        final_3class = {
            0: round(fuzzy_cnt_3class[human_key] / total_score_3class * 100, 2),
            1: round(fuzzy_cnt_3class[ai_key] / total_score_3class * 100, 2),
            2: round(fuzzy_cnt_3class[mixed_key] / total_score_3class * 100, 2),
        }

        # Winning (label, is_mixed, write_model) key of the model-specific vote.
        final_model = max(fuzzy_cnt_model, key=fuzzy_cnt_model.get)

        pred.append({
            "score": final_3class,
            "model": final_model,
        })
    return pred