"""
Word-Level Token Selection for Saliency Maps

This module provides utilities for selecting specific words from a query
to focus saliency map computation on those words only.
"""
import logging
from typing import List, Dict, Tuple, Optional

import torch
import numpy as np

logger = logging.getLogger(__name__)
def tokenize_query_with_word_mapping(
    processor,
    query_text: str
) -> Tuple[List[int], Dict[int, Tuple[int, int]]]:
    """
    Tokenize query and create mapping from word indices to token indices.

    Words are the whitespace-separated pieces of ``query_text``. A token is
    assigned to a word when its character offsets overlap the word's character
    span, so multi-token (subword-split) words map to a contiguous range.

    Args:
        processor: ColPali processor with tokenizer (accessed at
            ``processor.processor.tokenizer``).
        query_text: Query text string.

    Returns:
        Tuple of:
            - token_ids: List of token IDs
            - word_to_tokens: Dict mapping word_index -> (start_token_idx, end_token_idx),
              both indices inclusive. Words with no overlapping token are omitted.
    """
    tokenizer = processor.processor.tokenizer

    # Tokenize with character offsets so tokens can be matched back to words.
    tokens = tokenizer(query_text, return_offsets_mapping=True, add_special_tokens=False)
    token_ids = tokens['input_ids']
    offsets = tokens['offset_mapping']

    # Split query into words (simple whitespace-based splitting).
    words = query_text.split()

    word_to_tokens: Dict[int, Tuple[int, int]] = {}
    char_pos = 0
    for word_idx, word in enumerate(words):
        # Locate this occurrence of the word; searching from char_pos keeps
        # repeated words anchored to their own positions.
        word_start = query_text.find(word, char_pos)
        if word_start == -1:
            continue
        word_end = word_start + len(word)
        char_pos = word_end

        # A token belongs to the word when its [start, end) span overlaps
        # the word's [word_start, word_end) span.
        for token_idx, (tok_start, tok_end) in enumerate(offsets):
            if tok_end > word_start and tok_start < word_end:
                if word_idx not in word_to_tokens:
                    word_to_tokens[word_idx] = (token_idx, token_idx)
                else:
                    # Extend the inclusive token range for this word.
                    word_to_tokens[word_idx] = (word_to_tokens[word_idx][0], token_idx)

    return token_ids, word_to_tokens
def get_token_indices_for_words(
    processor,
    query_text: str,
    selected_word_indices: List[int],
    input_ids: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None
) -> List[int]:
    """
    Get token indices corresponding to selected words.

    SIMPLE APPROACH: Tokenize the query with character offsets, map each
    whitespace-separated word to the tokens whose spans overlap it, and return
    the token indices for the selected words. These indices can be used
    directly to filter the query embedding.

    Args:
        processor: ColPali processor. The tokenizer is looked up at
            ``processor.processor.tokenizer``, falling back to
            ``processor.tokenizer``.
        query_text: Query text string.
        selected_word_indices: List of word indices (0-based) to select.
            Indices outside ``[0, num_words)`` are ignored.
        input_ids: Optional (currently unused, for future compatibility).
        attention_mask: Optional (currently unused).

    Returns:
        Sorted list of unique token indices corresponding to selected words.
        Empty list if the tokenizer is unavailable or mapping fails.
    """
    try:
        tokenizer = processor.processor.tokenizer
    except AttributeError:
        try:
            tokenizer = processor.tokenizer
        except AttributeError:
            logger.error("Could not access tokenizer from processor")
            return []

    # Split query into words.
    words = query_text.split()
    # Drop out-of-range indices up front. The previous `i < len(words)` check
    # let negative indices through, and words[i] raised IndexError for
    # i < -len(words), crashing before the mapping ran.
    valid_selected = [i for i in selected_word_indices if 0 <= i < len(words)]

    # Lazy %-style args avoid formatting cost when INFO is disabled.
    logger.info("🔤 Word selection: query has %d words", len(words))
    logger.info(" Selected word indices: %s", selected_word_indices)
    logger.info(" Selected words: %s", [words[i] for i in valid_selected])

    # Simple approach: tokenize and map words to tokens using offsets.
    try:
        tokens = tokenizer(query_text, return_offsets_mapping=True, add_special_tokens=False)
        offsets = tokens['offset_mapping']
        token_ids = tokens['input_ids']
        logger.info(" Tokenization: %d tokens", len(token_ids))

        # Build word-to-token mapping: a token belongs to a word when their
        # character spans overlap.
        word_to_tokens: Dict[int, List[int]] = {}
        current_char = 0
        for word_idx, word in enumerate(words):
            # Find where this word starts in the text; searching from
            # current_char anchors repeated words to their own occurrence.
            word_start = query_text.find(word, current_char)
            if word_start == -1:
                continue
            word_end = word_start + len(word)
            word_tokens = [
                tok_idx
                for tok_idx, (tok_start, tok_end) in enumerate(offsets)
                if tok_end > word_start and tok_start < word_end
            ]
            if word_tokens:
                word_to_tokens[word_idx] = word_tokens
            current_char = word_end

        logger.info(" Word-to-token mapping: %s", word_to_tokens)

        # Collect unique, sorted token indices for the selected words.
        result_indices = sorted({
            tok_idx
            for word_idx in valid_selected
            if word_idx in word_to_tokens
            for tok_idx in word_to_tokens[word_idx]
        })
        logger.info(" Result token indices: %s", result_indices)
        return result_indices
    except Exception as e:
        # Best-effort: callers treat an empty list as "no word filtering".
        logger.error("Error in word-to-token mapping: %s", e)
        import traceback
        logger.debug(traceback.format_exc())
        return []
def get_word_boundaries(query_text: str) -> List[Tuple[int, int, str]]:
    """
    Compute the character span of each whitespace-separated word.

    Args:
        query_text: Query text string.

    Returns:
        List of tuples (start_char, end_char, word), one per word found,
        in order of appearance.
    """
    boundaries: List[Tuple[int, int, str]] = []
    cursor = 0
    for word in query_text.split():
        # Search from the cursor so repeated words map to their own occurrence.
        start = query_text.find(word, cursor)
        if start == -1:
            continue
        end = start + len(word)
        boundaries.append((start, end, word))
        cursor = end
    return boundaries