Spaces:

ngocminhta
/

falcon-api

Sleeping

falcon-api / infer.py

ngocminhta

update model search

2fffdc8 8 months ago

6.11 kB

	import os
	import pickle
	import numpy as np
	from src.index import Indexer
	import torch
	import argparse
	from src.text_embedding import TextEmbeddingModel
	import random
	from collections import Counter


	def softmax_weights(scores, temperature=1.0):
	    """Turn raw scores into softmax weights that sum to 1.

	    Args:
	        scores: Sequence (or array) of numeric scores.
	        temperature: Divisor applied to the scores before exponentiation;
	            values below 1 sharpen the distribution, values above 1
	            flatten it.

	    Returns:
	        A NumPy array of the same length as ``scores``.
	    """
	    scaled = np.asarray(scores) / temperature
	    # Subtracting the maximum keeps exp() from overflowing; the result of
	    # the normalisation below is unaffected by this constant shift.
	    exponentials = np.exp(scaled - scaled.max())
	    return exponentials / exponentials.sum()

	def normalize_fuzzy_cnt(fuzzy_cnt):
	    """Rescale the values of ``fuzzy_cnt`` in place so they sum to 1.

	    Args:
	        fuzzy_cnt: Dict mapping labels to numeric scores. It is modified
	            in place.

	    Returns:
	        The same dict object. When the values sum to zero the dict is
	        returned untouched, avoiding a division by zero.
	    """
	    denominator = sum(fuzzy_cnt.values())
	    if denominator != 0:
	        for label in fuzzy_cnt:
	            fuzzy_cnt[label] /= denominator
	    return fuzzy_cnt

	def class_type_boost(query_type, candidate_type):
	    """Return a multiplicative boost based on how far apart two types are.

	    Args:
	        query_type: Numeric type code of the query side.
	        candidate_type: Numeric type code of the candidate side.

	    Returns:
	        1.3 when the codes are equal, 1.1 when they differ by exactly 1,
	        0.9 when they differ by exactly 2, and 0.8 otherwise.
	    """
	    if query_type == candidate_type:
	        return 1.3
	    distance = abs(query_type - candidate_type)
	    boost_by_distance = {1: 1.1, 2: 0.9}
	    return boost_by_distance.get(distance, 0.8)

	def set_seed(seed):
	    """Seed every random number generator this module relies on.

	    Seeds Python's ``random`` module, NumPy's global generator, and
	    PyTorch (the default generator, the current CUDA device, and all CUDA
	    devices, the last one covering multi-GPU setups).

	    Args:
	        seed: Integer seed passed unchanged to each generator.
	    """
	    seeders = (
	        torch.manual_seed,
	        torch.cuda.manual_seed,
	        torch.cuda.manual_seed_all,  # covers multi-GPU setups
	        np.random.seed,  # NumPy global generator
	        random.seed,  # Python random module
	    )
	    for apply_seed in seeders:
	        apply_seed(seed)

	def load_pkl(path):
	    """Open ``path`` in binary mode and return the object unpickled from it.

	    Unpickling can execute arbitrary code, so only pass paths to files
	    from a trusted source.

	    Args:
	        path: Location of the pickle file.

	    Returns:
	        Whatever object the file contains.
	    """
	    with open(path, mode='rb') as stream:
	        payload = pickle.load(stream)
	    return payload

	def infer_3_class(model, tokenizer, index, label_dict, is_mixed_dict, text_list, K):
	    """Score each text over three classes by weighted nearest-neighbour voting.

	    Each text is embedded, its ``K`` nearest neighbours are fetched from
	    ``index``, and every neighbour votes for its ``(label, is_mixed)`` pair
	    with a softmax weight (temperature 0.1) multiplied by
	    ``class_type_boost``.

	    Args:
	        model: Callable taking the tokenized batch (a dict of tensors) and
	            returning a tensor of embeddings.
	        tokenizer: Object exposing ``batch_encode_plus``.
	        index: Object exposing ``search_knn(embeddings, K)``, yielding one
	            ``(ids, scores)`` pair per input text.
	        label_dict: Maps a neighbour id (int) to the first label component.
	        is_mixed_dict: Maps a neighbour id (int) to the second label
	            component.
	        text_list: Texts to classify.
	        K: Number of neighbours to retrieve per text.

	    Returns:
	        One dict per input text with keys ``0``, ``1`` and ``2`` holding the
	        percentage (rounded to two decimals) for the ``(1, 0)``,
	        ``(0, 10 ^ 3)`` and ``(1, 1)`` label pairs respectively. An empty
	        ``text_list`` yields an empty list.

	    Raises:
	        KeyError: If a neighbour id is missing from the dictionaries or its
	            label pair is not one of the three expected pairs.
	    """
	    # Nothing to classify: skip the tokenizer/model round trip entirely.
	    if len(text_list) == 0:
	        return []

	    encoded_text = tokenizer.batch_encode_plus(
	        text_list,
	        return_tensors="pt",
	        max_length=512,
	        padding="max_length",
	        truncation=True,
	    )
	    # The model is handed a plain dict rather than the tokenizer's own
	    # return object, as in the original code.
	    encoded_text = dict(encoded_text.items())
	    # Inference only: no autograd bookkeeping is needed for the forward pass.
	    with torch.no_grad():
	        embeddings = model(encoded_text).cpu().detach().numpy()
	    top_ids_and_scores = index.search_knn(embeddings, K)

	    # NOTE(review): `10 ^ 3` is bitwise XOR in Python and evaluates to 9,
	    # not 1000. The value is kept as-is because the stored dictionaries may
	    # have been built with the same expression -- confirm before changing.
	    key_class_0 = (1, 0)
	    key_class_1 = (0, 10 ^ 3)
	    key_class_2 = (1, 1)

	    pred = []
	    for ids, scores in top_ids_and_scores:
	        # Highest score first; higher scores receive larger softmax weights.
	        order = np.argsort(scores)[::-1]
	        topk_ids = [int(ids[j]) for j in order]
	        topk_scores = [scores[j] for j in order]
	        weights = softmax_weights(topk_scores, temperature=0.1)

	        # Majority `is_mixed` value among the neighbours drives the boost.
	        mixed_flags = [is_mixed_dict[neighbor_id] for neighbor_id in topk_ids]
	        initial_pred = Counter(mixed_flags).most_common(1)[0][0]

	        fuzzy_cnt = {key_class_0: 0.0, key_class_1: 0.0, key_class_2: 0.0}
	        for neighbor_id, mixed_flag, weight in zip(topk_ids, mixed_flags, weights):
	            label = (label_dict[neighbor_id], mixed_flag)
	            boost = class_type_boost(mixed_flag, initial_pred)
	            fuzzy_cnt[label] += weight * boost

	        total_score = sum(fuzzy_cnt.values())
	        pred.append({
	            0: round(fuzzy_cnt[key_class_0] / total_score * 100, 2),
	            1: round(fuzzy_cnt[key_class_1] / total_score * 100, 2),
	            2: round(fuzzy_cnt[key_class_2] / total_score * 100, 2),
	        })
	    return pred

	def infer_model_specific(model, tokenizer, index, label_dict, is_mixed_dict, write_model_dict, text_list, K, K_model):
	    """Produce 3-class scores plus a model-specific label for each text.

	    Each text is embedded and its ``K`` nearest neighbours are fetched from
	    ``index``. All ``K`` neighbours vote for the 3-class scores (softmax
	    temperature 0.1); only the best ``K_model`` neighbours vote for the
	    model-specific label (softmax temperature 0.4). Every vote is
	    multiplied by ``class_type_boost``.

	    Args:
	        model: Callable taking the tokenized batch (a dict of tensors) and
	            returning a tensor of embeddings.
	        tokenizer: Object exposing ``batch_encode_plus``.
	        index: Object exposing ``search_knn(embeddings, K)``, yielding one
	            ``(ids, scores)`` pair per input text.
	        label_dict: Maps a neighbour id (int) to the first label component.
	        is_mixed_dict: Maps a neighbour id (int) to the second label
	            component.
	        write_model_dict: Maps a neighbour id (int) to the third label
	            component used for the model-specific vote.
	        text_list: Texts to classify.
	        K: Number of neighbours to retrieve per text.
	        K_model: Number of top-ranked neighbours used for the
	            model-specific vote.

	    Returns:
	        One dict per input text with two entries: ``"score"`` maps ``0``,
	        ``1`` and ``2`` to rounded percentages for the ``(1, 0)``,
	        ``(0, 10 ^ 3)`` and ``(1, 1)`` label pairs, and ``"model"`` is the
	        ``(label, is_mixed, write_model)`` tuple with the highest vote. An
	        empty ``text_list`` yields an empty list.

	    Raises:
	        KeyError: If a neighbour id is missing from the dictionaries or its
	            label tuple is not one of the expected keys.
	    """
	    # Nothing to classify: skip the tokenizer/model round trip entirely.
	    if len(text_list) == 0:
	        return []

	    encoded_text = tokenizer.batch_encode_plus(
	        text_list,
	        return_tensors="pt",
	        max_length=512,
	        padding="max_length",
	        truncation=True,
	    )
	    # The model is handed a plain dict rather than the tokenizer's own
	    # return object, as in the original code.
	    encoded_text = dict(encoded_text.items())
	    # Inference only: no autograd bookkeeping is needed for the forward pass.
	    with torch.no_grad():
	        embeddings = model(encoded_text).cpu().detach().numpy()

	    # Retrieve the K nearest neighbours for every text.
	    top_ids_and_scores = index.search_knn(embeddings, K)

	    # NOTE(review): `10 ^ 3` is bitwise XOR in Python and evaluates to 9,
	    # not 1000. The value is kept as-is because the stored dictionaries may
	    # have been built with the same expression -- confirm before changing.
	    ai_flag = 10 ^ 3
	    key_class_0 = (1, 0)
	    key_class_1 = (0, ai_flag)
	    key_class_2 = (1, 1)

	    pred = []
	    for ids, scores in top_ids_and_scores:
	        # Highest score first; higher scores receive larger softmax weights.
	        order = np.argsort(scores)[::-1]

	        # All K results feed the 3-class prediction.
	        topk_ids = [int(ids[j]) for j in order]
	        topk_scores = [scores[j] for j in order]
	        mixed_flags = [is_mixed_dict[neighbor_id] for neighbor_id in topk_ids]

	        # Only the best K_model results feed the model-specific prediction.
	        topk_ids_model = topk_ids[:K_model]
	        topk_scores_model = topk_scores[:K_model]
	        mixed_flags_model = mixed_flags[:K_model]

	        # 3-class vote (all K neighbours).
	        weights_3class = softmax_weights(topk_scores, temperature=0.1)
	        initial_pred = Counter(mixed_flags).most_common(1)[0][0]

	        fuzzy_cnt_3class = {key_class_0: 0.0, key_class_1: 0.0, key_class_2: 0.0}
	        for neighbor_id, mixed_flag, weight in zip(topk_ids, mixed_flags, weights_3class):
	            label_3class = (label_dict[neighbor_id], mixed_flag)
	            boost_3class = class_type_boost(mixed_flag, initial_pred)
	            fuzzy_cnt_3class[label_3class] += weight * boost_3class

	        # Model-specific vote (top K_model neighbours).
	        weights_model = softmax_weights(topk_scores_model, temperature=0.4)
	        initial_pred_model = Counter(mixed_flags_model).most_common(1)[0][0]

	        # Insertion order matters: max() below returns the first key among
	        # equally-scored entries.
	        fuzzy_cnt_model = {
	            (1, 0, 0): 0.0,  # Human
	            (0, ai_flag, 1): 0.0, (0, ai_flag, 2): 0.0, (0, ai_flag, 3): 0.0, (0, ai_flag, 4): 0.0,  # AI
	            (1, 1, 1): 0.0, (1, 1, 2): 0.0, (1, 1, 3): 0.0, (1, 1, 4): 0.0,  # Human+AI
	        }

	        for neighbor_id, mixed_flag, weight in zip(topk_ids_model, mixed_flags_model, weights_model):
	            label_model = (label_dict[neighbor_id], mixed_flag, write_model_dict[neighbor_id])
	            boost_model = class_type_boost(mixed_flag, initial_pred_model)
	            fuzzy_cnt_model[label_model] += weight * boost_model

	        # Convert the 3-class votes to percentages.
	        total_score_3class = sum(fuzzy_cnt_3class.values())
	        final_3class = {
	            0: round(fuzzy_cnt_3class[key_class_0] / total_score_3class * 100, 2),
	            1: round(fuzzy_cnt_3class[key_class_1] / total_score_3class * 100, 2),
	            2: round(fuzzy_cnt_3class[key_class_2] / total_score_3class * 100, 2),
	        }

	        # Highest-voted model-specific label tuple.
	        final_model = max(fuzzy_cnt_model, key=fuzzy_cnt_model.get)

	        # Combine both predictions.
	        pred.append({
	            "score": final_3class,
	            "model": final_model,
	        })

	    return pred