Spaces:

DazKha
/

phobert-multi-intent-classifier

Sleeping

App Files Files Community

phobert-multi-intent-classifier / predict.py

DazKha

fixed all...

ac5adff 27 days ago

raw

history blame contribute delete

9.73 kB

	import os
	import logging
	import torch
	import torch.nn as nn
	import numpy as np
	from seqeval.metrics.sequence_labeling import get_entities

	class VSLIMPredictor:
	    """Single-utterance inference wrapper around a joint intent/slot model.

	    The predictor turns a list of pre-tokenized words into model inputs, runs
	    the model, and decodes utterance-level intents, per-word slot tags and
	    per-word token intents. It also offers helpers that build entity masks
	    from predicted slot tags and re-index word-level masks to sub-word
	    positions.
	    """

	    def __init__(self, model, tokenizer, mappings, device, args):
	        """Store the collaborators and unpack the label mappings.

	        Args:
	            model: Callable model; must expose ``eval()`` and an ``encoder``
	                attribute (both are used by ``predict_single``).
	            tokenizer: Tokenizer providing ``tokenize``,
	                ``convert_tokens_to_ids``, ``convert_ids_to_tokens``,
	                ``build_inputs_with_special_tokens``, ``unk_token`` and
	                ``pad_token_id``.
	            mappings: Mapping with the keys ``'INTENT_LABELS'``,
	                ``'ID2SLOT'``, ``'ID2TOKINT'`` and ``'ID2TAGINT'``.
	            device: Device every input tensor is moved to.
	            args: Configuration object; ``max_seq_len``, ``num_mask`` and
	                ``ignore_index`` are read from it.
	        """
	        self.model = model
	        self.tokenizer = tokenizer
	        self.mappings = mappings
	        self.args = args
	        self.device = device

	        # Label lookups used while decoding predictions.
	        self.intent_labels = mappings['INTENT_LABELS']
	        self.id2slot = mappings['ID2SLOT']
	        self.id2tokint = mappings['ID2TOKINT']
	        self.id2tagint = mappings['ID2TAGINT']

	    def align_tokens_for_inference(self, tokens, max_len=None):
	        """Convert pre-tokenized words into padded model inputs.

	        Each word is split into sub-words (falling back to the tokenizer's
	        unknown token when a word produces no pieces) and the position of its
	        first sub-word is recorded. Sequences that do not fit are truncated
	        *before* the special tokens are added, so the closing special token
	        is preserved; words whose first sub-word was cut are dropped from the
	        returned map.

	        Args:
	            tokens: Iterable of word strings.
	            max_len: Target sequence length; defaults to
	                ``self.args.max_seq_len``.

	        Returns:
	            Tuple ``(input_ids, attention_mask, token_type_ids,
	            word_to_subword_map)``. The first three are ``long`` tensors with
	            a leading batch dimension of 1 on ``self.device``; the last is a
	            list holding, per kept word, the index of its first sub-word in
	            ``input_ids``.
	        """
	        if max_len is None:
	            max_len = self.args.max_seq_len

	        tokenizer = self.tokenizer
	        subword_tokens = []
	        word_starts = []  # index of each word's first piece, before special tokens

	        for token in tokens:
	            word_starts.append(len(subword_tokens))
	            pieces = tokenizer.tokenize(token) or [tokenizer.unk_token]
	            subword_tokens.extend(pieces)

	        subword_ids = tokenizer.convert_tokens_to_ids(subword_tokens)

	        # Reserve room for the special tokens so truncation never cuts them off.
	        num_special = len(tokenizer.build_inputs_with_special_tokens([]))
	        budget = max(max_len - num_special, 0)
	        if len(subword_ids) > budget:
	            subword_ids = subword_ids[:budget]
	            word_starts = [start for start in word_starts if start < budget]

	        input_ids = list(tokenizer.build_inputs_with_special_tokens(subword_ids))
	        # Last-resort clip for a max_len smaller than the special tokens alone.
	        input_ids = input_ids[:max_len]

	        # Shift by one for the leading special token.
	        # NOTE(review): assumes exactly one special token is prepended - confirm
	        # for the tokenizer in use.
	        word_to_subword_map = [start + 1 for start in word_starts]

	        attention_mask = [1] * len(input_ids)
	        token_type_ids = [0] * len(input_ids)

	        pad_len = max_len - len(input_ids)
	        if pad_len > 0:
	            input_ids.extend([tokenizer.pad_token_id] * pad_len)
	            attention_mask.extend([0] * pad_len)
	            token_type_ids.extend([0] * pad_len)

	        return (
	            torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(self.device),
	            torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0).to(self.device),
	            torch.tensor(token_type_ids, dtype=torch.long).unsqueeze(0).to(self.device),
	            word_to_subword_map,
	        )

	    def predict_single(self, tokens, threshold=0.5):
	        """Run full inference on one utterance and collect debug info for the UI.

	        Args:
	            tokens: List of pre-tokenized words.
	            threshold: Minimum utterance-intent score for an intent to be
	                reported. When no intent reaches it, the highest-scoring
	                intent is returned instead.

	        Returns:
	            Dict with the keys ``utterance_intents``, ``slot_tags``,
	            ``token_intents``, ``final_intents``, ``intent_probabilities``,
	            ``tokenized_text``, ``bpe_tokens`` and ``h_cls_vector``.
	            ``slot_tags`` and ``token_intents`` always have one entry per
	            input token.
	        """
	        self.model.eval()

	        with torch.no_grad():
	            # 1) Align words to sub-words.
	            input_ids, attention_mask, token_type_ids, word_positions = (
	                self.align_tokens_for_inference(tokens)
	            )

	            # 2) Forward pass of the main model with empty tag masks and
	            #    ignore-index tag-intent labels.
	            batch_size, seq_len = input_ids.shape
	            num_mask = self.args.num_mask
	            B_tag_mask = torch.zeros(
	                batch_size, num_mask, seq_len, dtype=torch.long, device=self.device
	            )
	            BI_tag_mask = torch.zeros(
	                batch_size, num_mask, seq_len, dtype=torch.float, device=self.device
	            )
	            tag_intent_label = torch.full(
	                (batch_size, num_mask),
	                self.args.ignore_index,
	                dtype=torch.long,
	                device=self.device,
	            )

	            outputs = self.model(
	                input_ids=input_ids,
	                attention_mask=attention_mask,
	                token_type_ids=token_type_ids,
	                intent_label_ids=None,
	                slot_labels_ids=None,
	                intent_token_ids=None,
	                B_tag_mask=B_tag_mask,
	                BI_tag_mask=BI_tag_mask,
	                tag_intent_label=tag_intent_label,
	            )

	            slot_logits = outputs["slot_logits"][0]
	            raw_tokint_logits = outputs["intent_token_logits"]
	            tokint_logits = raw_tokint_logits[0] if raw_tokint_logits is not None else None
	            uttint_logits = outputs["intent_logits"][0]

	            # 3) Utterance-level intents and their scores.
	            # NOTE(review): the values are compared to `threshold` as-is, which
	            # presumes the model already outputs probabilities under
	            # "intent_logits" - confirm against the model definition.
	            uttint_probs = uttint_logits.cpu().numpy()
	            predicted_intents = [
	                self.intent_labels[i]
	                for i, prob in enumerate(uttint_probs)
	                if prob >= threshold
	            ]
	            if not predicted_intents:
	                # Nothing cleared the threshold: fall back to the top intent.
	                best_idx = int(np.argmax(uttint_probs))
	                predicted_intents = [self.intent_labels[best_idx]]

	            intent_probabilities = {
	                self.intent_labels[i]: float(uttint_probs[i])
	                for i in range(len(self.intent_labels))
	            }

	            # 4) Word-level predictions, read at each word's first sub-word.
	            slot_predictions = []
	            tokint_predictions = []

	            for subword_pos in word_positions:
	                if subword_pos >= slot_logits.size(0):
	                    slot_predictions.append("O")
	                    tokint_predictions.append("O")
	                    continue

	                slot_id = torch.argmax(slot_logits[subword_pos]).item()
	                slot_tag = self.id2slot[slot_id]
	                slot_predictions.append(slot_tag)

	                # A token intent is only decoded for words carrying a slot.
	                if tokint_logits is None or slot_tag == "O":
	                    tokint_predictions.append("O")
	                else:
	                    tokint_id = torch.argmax(tokint_logits[subword_pos]).item()
	                    tokint_predictions.append(self.id2tokint[tokint_id])

	            # Make sure the outputs line up with the original tokens (words
	            # lost to truncation are reported as "O").
	            num_tokens = len(tokens)
	            missing = num_tokens - len(slot_predictions)
	            if missing > 0:
	                slot_predictions.extend(["O"] * missing)
	                tokint_predictions.extend(["O"] * missing)
	            elif missing < 0:
	                slot_predictions = slot_predictions[:num_tokens]
	                tokint_predictions = tokint_predictions[:num_tokens]

	            # 5) Debug info for the UI (tokenized text, BPE pieces, CLS vector).
	            tokenized_text = tokens

	            # Sub-word pieces of the non-padding positions.
	            input_ids_cpu = input_ids[0].cpu()
	            attn_cpu = attention_mask[0].cpu()
	            valid_ids = input_ids_cpu[attn_cpu == 1].tolist()
	            bpe_tokens = self.tokenizer.convert_ids_to_tokens(valid_ids)

	            # First 10 dimensions of the encoder's pooled output. This is a
	            # second encoder forward pass, done only for display purposes.
	            encoder_outputs = self.model.encoder(
	                input_ids=input_ids, attention_mask=attention_mask
	            )
	            h_cls = encoder_outputs.pooler_output[0].cpu().numpy()
	            h_cls_sample = h_cls[:10].tolist()

	        return {
	            # Main results.
	            "utterance_intents": predicted_intents,
	            "slot_tags": slot_predictions,
	            "token_intents": tokint_predictions,
	            # Fields consumed by the response schema / UI.
	            "final_intents": predicted_intents,
	            "intent_probabilities": intent_probabilities,
	            "tokenized_text": tokenized_text,
	            "bpe_tokens": bpe_tokens,
	            "h_cls_vector": h_cls_sample,
	        }

	    def generate_predicted_masks_from_slots(self, slot_preds_list, max_seq_len):
	        """Build per-entity B and BI masks from predicted slot tag sequences.

	        For every sequence, entities whose first tag starts with ``'B'`` are
	        kept (at most ``self.args.num_mask`` of them). Each entity yields a
	        B mask with a 1 at its first position and a BI mask spreading a total
	        weight of 1 uniformly over its span. Entities starting at or beyond
	        ``max_seq_len`` are skipped and spans are clipped to ``max_seq_len``,
	        so over-long predictions no longer raise ``IndexError``. Missing
	        entities are padded with all-zero masks.

	        Args:
	            slot_preds_list: List of slot tag sequences (lists of strings).
	            max_seq_len: Length of every generated mask.

	        Returns:
	            Tuple ``(B_masks, BI_masks)``: a ``LongTensor`` and a
	            ``FloatTensor`` built from nested lists of shape
	            ``[len(slot_preds_list)][num_mask][max_seq_len]``.
	        """
	        num_mask = self.args.num_mask
	        B_tag_mask_pred = []
	        BI_tag_mask_pred = []

	        for slot_preds in slot_preds_list:
	            # Entities are (type, start, end) tuples with an inclusive end.
	            entities = [
	                entity
	                for entity in get_entities(slot_preds)
	                if slot_preds[entity[1]].startswith('B') and entity[1] < max_seq_len
	            ]
	            entities = entities[:num_mask]

	            B_entity_masks = []
	            BI_entity_masks = []

	            for _, start_idx, last_idx in entities:
	                # B mask: mark only the beginning of the entity.
	                B_mask = [0] * max_seq_len
	                B_mask[start_idx] = 1
	                B_entity_masks.append(B_mask)

	                # BI mask: uniform weight over the part of the span that fits.
	                end_idx = min(last_idx + 1, max_seq_len, len(slot_preds))
	                BI_mask = [0.0] * max_seq_len
	                weight = 1.0 / (end_idx - start_idx)
	                for pos in range(start_idx, end_idx):
	                    BI_mask[pos] = weight
	                BI_entity_masks.append(BI_mask)

	            # Pad with empty masks up to num_mask entries.
	            for _ in range(num_mask - len(B_entity_masks)):
	                B_entity_masks.append([0] * max_seq_len)
	                BI_entity_masks.append([0.0] * max_seq_len)

	            B_tag_mask_pred.append(B_entity_masks)
	            BI_tag_mask_pred.append(BI_entity_masks)

	        return torch.LongTensor(B_tag_mask_pred), torch.FloatTensor(BI_tag_mask_pred)

	    def align_masks_to_subwords(self, masks, word_to_subword_map, max_len):
	        """Re-index word-level masks to sub-word positions.

	        The value of each word is copied to the sub-word index listed for that
	        word in ``word_to_subword_map``; every other position stays zero.

	        Args:
	            masks: Sequence of word-level masks (one indexable row per mask).
	            word_to_subword_map: Per word, the target sub-word index.
	            max_len: Length of each aligned mask.

	        Returns:
	            Float tensor of shape ``(len(masks), max_len)``.
	        """
	        num_masks = len(masks)
	        aligned_masks = torch.zeros(num_masks, max_len, dtype=torch.float)

	        for mask_idx in range(num_masks):
	            word_mask = masks[mask_idx]
	            for word_idx, subword_idx in enumerate(word_to_subword_map):
	                # Skip words without a mask value or mapped past the sequence.
	                if word_idx < len(word_mask) and subword_idx < max_len:
	                    aligned_masks[mask_idx, subword_idx] = word_mask[word_idx]

	        return aligned_masks