import pickle
import random
from collections import Counter

import numpy as np
import torch

from src.index import Indexer  # unused here; presumably the type of `index` below
from src.text_embedding import TextEmbeddingModel  # unused here; presumably the type of `model`


def softmax_weights(scores, temperature=1.0):
    """Turn similarity scores into softmax weights; lower temperature sharpens them."""
    scores = np.array(scores) / temperature
    e_scores = np.exp(scores - np.max(scores))  # subtract max for numerical stability
    return e_scores / np.sum(e_scores)
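
# Quick sanity check (rounded): softmax_weights([2.0, 1.0]) is roughly
# [0.73, 0.27], while softmax_weights([2.0, 1.0], temperature=0.1) is
# ~[1.0, 0.0]; the low temperature used below puts almost all of the weight
# on the nearest neighbours.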

def normalize_fuzzy_cnt(fuzzy_cnt):
    """Normalize the values of fuzzy_cnt so they sum to 1 (mutates the dict in place)."""
    total = sum(fuzzy_cnt.values())
    if total == 0:
        return fuzzy_cnt
    for key in fuzzy_cnt:
        fuzzy_cnt[key] /= total
    return fuzzy_cnt
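
# Example: normalize_fuzzy_cnt({"a": 2.0, "b": 2.0}) returns {"a": 0.5, "b": 0.5}
# (and mutates its argument in place).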

def class_type_boost(query_type, candidate_type):
    """Weight a neighbour's vote by how closely its mixed-type code agrees
    with the majority type (symmetric in its arguments)."""
    if query_type == candidate_type:
        return 1.3
    elif abs(query_type - candidate_type) == 1:
        return 1.1
    elif abs(query_type - candidate_type) == 2:
        return 0.9
    else:
        return 0.8
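
# With the pure-AI code being 9 (written 10 ^ 3 in the label tuples below),
# an AI neighbour voting under a human majority falls through to the last
# branch: class_type_boost(9, 0) == 0.8, while a mixed neighbour gets
# class_type_boost(1, 0) == 1.1.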

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    np.random.seed(seed)  # Numpy module.
    random.seed(seed)  # Python random module.

def load_pkl(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

def infer_3_class(model, tokenizer, index, label_dict, is_mixed_dict, text_list, K):
    encoded_text = tokenizer.batch_encode_plus(
        text_list,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    # Convert the BatchEncoding to a plain dict; if the model runs on GPU,
    # each tensor would additionally need a .to(device) here.
    encoded_text = {k: v for k, v in encoded_text.items()}
    with torch.no_grad():
        embeddings = model(encoded_text).cpu().numpy()
    top_ids_and_scores = index.search_knn(embeddings, K)
    pred = []
    for ids, scores in top_ids_and_scores:
        # Neighbour indices sorted by descending similarity.
        order = np.argsort(scores)[::-1]
        topk_ids = [ids[j] for j in order]
        topk_scores = [scores[j] for j in order]
        weights = softmax_weights(topk_scores, temperature=0.1)

        # Majority mixed-type code among the neighbours.
        candidate_types = [is_mixed_dict[int(_id)] for _id in topk_ids]
        initial_pred = Counter(candidate_types).most_common(1)[0][0]

        # NB: 10 ^ 3 is Python's bitwise XOR (== 9), not 10**3; it is kept
        # as-is since it must match the pure-AI code stored in is_mixed_dict.
        fuzzy_cnt = {(1, 0): 0.0, (0, 10 ^ 3): 0.0, (1, 1): 0.0}
        for _id, weight in zip(topk_ids, weights):
            label = (label_dict[int(_id)], is_mixed_dict[int(_id)])
            boost = class_type_boost(is_mixed_dict[int(_id)], initial_pred)
            fuzzy_cnt[label] += weight * boost

        total_score = sum(fuzzy_cnt.values())
        # 3-class probabilities as percentages: 0 = human, 1 = AI, 2 = human + AI.
        final = {
            0: round(fuzzy_cnt[(1, 0)] / total_score * 100, 2),
            1: round(fuzzy_cnt[(0, 10 ^ 3)] / total_score * 100, 2),
            2: round(fuzzy_cnt[(1, 1)] / total_score * 100, 2),
        }
        pred.append(final)
    return pred
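
# Hypothetical shape of the output for one input text (numbers made up):
#   [{0: 12.3, 1: 80.1, 2: 7.6}]
# read as 12.3% human, 80.1% AI, 7.6% human + AI.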

def infer_model_specific(model, tokenizer, index, label_dict, is_mixed_dict, write_model_dict, text_list, K, K_model):
    encoded_text = tokenizer.batch_encode_plus(
        text_list,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    # Convert the BatchEncoding to a plain dict; if the model runs on GPU,
    # each tensor would additionally need a .to(device) here.
    encoded_text = {k: v for k, v in encoded_text.items()}
    with torch.no_grad():
        embeddings = model(encoded_text).cpu().numpy()

    # Retrieve the K nearest neighbours once; the model-specific vote reuses
    # the top K_model of them.
    top_ids_and_scores = index.search_knn(embeddings, K)
    pred = []

    for ids, scores in top_ids_and_scores:
        order = np.argsort(scores)[::-1]

        # All K results, used for the 3-class prediction.
        topk_ids = [ids[j] for j in order]
        topk_scores = [scores[j] for j in order]

        # Top K_model results, used for the model-specific prediction.
        topk_ids_model = topk_ids[:K_model]
        topk_scores_model = topk_scores[:K_model]

        # --- 3-class prediction (all K neighbours) ---
        weights_3class = softmax_weights(topk_scores, temperature=0.1)
        candidate_types = [is_mixed_dict[int(_id)] for _id in topk_ids]
        initial_pred = Counter(candidate_types).most_common(1)[0][0]

        # NB: 10 ^ 3 is bitwise XOR (== 9), matching the pure-AI code in the
        # label dictionaries.
        fuzzy_cnt_3class = {(1, 0): 0.0, (0, 10 ^ 3): 0.0, (1, 1): 0.0}
        for _id, weight in zip(topk_ids, weights_3class):
            label_3class = (label_dict[int(_id)], is_mixed_dict[int(_id)])
            boost_3class = class_type_boost(is_mixed_dict[int(_id)], initial_pred)
            fuzzy_cnt_3class[label_3class] += weight * boost_3class

        # --- Model-specific prediction (top K_model neighbours) ---
        weights_model = softmax_weights(topk_scores_model, temperature=0.4)
        candidate_types_model = [is_mixed_dict[int(_id)] for _id in topk_ids_model]
        initial_pred_model = Counter(candidate_types_model).most_common(1)[0][0]

        fuzzy_cnt_model = {
            (1, 0, 0): 0.0,  # Human
            (0, 10 ^ 3, 1): 0.0, (0, 10 ^ 3, 2): 0.0, (0, 10 ^ 3, 3): 0.0, (0, 10 ^ 3, 4): 0.0,  # AI
            (1, 1, 1): 0.0, (1, 1, 2): 0.0, (1, 1, 3): 0.0, (1, 1, 4): 0.0,  # Human + AI
        }

        for _id, weight in zip(topk_ids_model, weights_model):
            label_model = (label_dict[int(_id)], is_mixed_dict[int(_id)], write_model_dict[int(_id)])
            boost_model = class_type_boost(is_mixed_dict[int(_id)], initial_pred_model)
            fuzzy_cnt_model[label_model] += weight * boost_model

        # 3-class probabilities as percentages: 0 = human, 1 = AI, 2 = human + AI.
        total_score_3class = sum(fuzzy_cnt_3class.values())
        final_3class = {
            0: round(fuzzy_cnt_3class[(1, 0)] / total_score_3class * 100, 2),
            1: round(fuzzy_cnt_3class[(0, 10 ^ 3)] / total_score_3class * 100, 2),
            2: round(fuzzy_cnt_3class[(1, 1)] / total_score_3class * 100, 2),
        }

        # Model-specific prediction: the label tuple with the highest vote.
        final_model = max(fuzzy_cnt_model, key=fuzzy_cnt_model.get)

        # Combine both predictions.
        pred.append({
            "score": final_3class,
            "model": final_model,
        })

    return pred
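

if __name__ == "__main__":
    # Hypothetical demo, not part of the original file: it replays the fuzzy
    # voting above on synthetic neighbours, so it runs without the real
    # embedding model or nearest-neighbour index. All ids, scores, and labels
    # here are made up.
    set_seed(0)
    toy_ids = [101, 102, 103]
    toy_scores = [0.9, 0.8, 0.2]  # similarity of each neighbour to the query
    toy_label_dict = {101: 1, 102: 0, 103: 1}
    toy_is_mixed = {101: 0, 102: 10 ^ 3, 103: 1}  # 0 = human, 9 = AI, 1 = mixed

    weights = softmax_weights(toy_scores, temperature=0.1)
    initial_pred = Counter(toy_is_mixed[i] for i in toy_ids).most_common(1)[0][0]

    fuzzy_cnt = {(1, 0): 0.0, (0, 10 ^ 3): 0.0, (1, 1): 0.0}
    for _id, weight in zip(toy_ids, weights):
        label = (toy_label_dict[_id], toy_is_mixed[_id])
        fuzzy_cnt[label] += weight * class_type_boost(toy_is_mixed[_id], initial_pred)

    print(normalize_fuzzy_cnt(fuzzy_cnt))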