llm2vec4cxr / usage_example.py

Upload LLM2Vec4CXR fine-tuned model

6c19590 verified 6 months ago

7.98 kB

	"""
	Example usage script for LLM2Vec4CXR model.
	This demonstrates how to load and use the model for chest X-ray report analysis.

	Prerequisites:
	1. Install the LLM2Vec4CXR package:
	pip install git+https://github.com/lukeingawesome/llm2vec4cxr.git

	Or clone and install in development mode:
	git clone https://github.com/lukeingawesome/llm2vec4cxr.git
	cd llm2vec4cxr
	pip install -e .

	2. The model will be automatically downloaded from Hugging Face when first used.
	"""

	import torch
	import torch.nn.functional as F
	from llm2vec_wrapper import LLM2VecWrapper as LLM2Vec

	def load_llm2vec4cxr_model(model_name_or_path="lukeingawesome/llm2vec4cxr"):
	    """
	    Build the LLM2Vec4CXR encoder and return it together with its tokenizer.

	    The wrapper is created with bidirectional attention enabled,
	    "latent_attention" pooling, a maximum length of 512 and torch.bfloat16
	    requested as the dtype. The tokenizer attached to the model is switched
	    to left padding before being returned.

	    Args:
	        model_name_or_path (str): Hugging Face model path or local path

	    Returns:
	        tuple: (model, tokenizer)
	    """
	    # Settings specific to LLM2Vec4CXR; the latent-attention pooling mode is
	    # the key modification.
	    load_options = dict(
	        base_model_name_or_path=model_name_or_path,
	        enable_bidirectional=True,
	        pooling_mode="latent_attention",
	        max_length=512,
	        torch_dtype=torch.bfloat16,
	    )
	    encoder = LLM2Vec.from_pretrained(**load_options)

	    # Pad on the left so the real tokens sit at the end of every row.
	    encoder_tokenizer = encoder.tokenizer
	    encoder_tokenizer.padding_side = 'left'

	    return encoder, encoder_tokenizer

	def tokenize_with_separator(texts, tokenizer, max_length=512):
	    """
	    Tokenize texts with special handling for separator-based splitting.
	    This is useful for instruction-following tasks.

	    A text may have the form ``instruction + '!@#$%^&*()' + content``. The
	    separator is removed before the batch is tokenized. In addition, an
	    ``embed_mask`` is built: for every row, the last ``k`` positions are set
	    to 1, where ``k`` is the number of tokens the content part yields when
	    tokenized on its own without special tokens. Texts without a separator
	    get an all-zero mask row.

	    The mask is written at the END of each row, so it lines up with the
	    content tokens only when the tokenizer pads on the left (which is how
	    ``load_llm2vec4cxr_model`` configures it).

	    Args:
	        texts (list): List of texts to tokenize
	        tokenizer: The tokenizer to use
	        max_length (int): Maximum sequence length

	    Returns:
	        dict: Tokenized inputs with attention masks and embed masks. The
	        ``embed_mask`` entry has the same shape and dtype as
	        ``attention_mask`` (also for an empty batch).
	    """
	    separator = '!@#$%^&*()'

	    content_parts = []
	    joined_texts = []
	    for text in texts:
	        parts = text.split(separator)
	        content_parts.append(parts[1] if len(parts) > 1 else "")
	        joined_texts.append("".join(parts))

	    # Tokenize the separator-free texts as one padded batch.
	    tokenized = tokenizer(
	        joined_texts,
	        return_tensors="pt",
	        padding=True,
	        truncation=True,
	        max_length=max_length,
	    )

	    # Allocate the whole mask once instead of concatenating one row per text;
	    # this is linear in the batch size and never leaves the mask as None.
	    embed_mask = torch.zeros_like(tokenized["attention_mask"])
	    for row, part in enumerate(content_parts):
	        ids = tokenizer(
	            [part],
	            return_tensors="pt",
	            padding=True,
	            truncation=True,
	            max_length=max_length,
	            add_special_tokens=False,
	        )
	        n_content_tokens = len(ids["input_ids"][0])
	        if n_content_tokens > 0:
	            # Scalar fill: if the content alone tokenizes to more tokens than
	            # the padded row holds, the slice simply covers the whole row
	            # rather than failing on a shape mismatch.
	            embed_mask[row, -n_content_tokens:] = 1

	    tokenized["embed_mask"] = embed_mask
	    return tokenized

	def compute_similarities(model, tokenizer, texts, device):
	    """
	    Compute similarity scores between the first text and all other texts.

	    The texts are tokenized (through ``tokenize_with_separator`` when any of
	    them contains the '!@#$%^&*()' separator, otherwise directly with the
	    tokenizer), moved to ``device``, and passed to ``model`` as one batch.
	    Everything runs under ``torch.no_grad()``.

	    Args:
	        model: The LLM2Vec model
	        tokenizer: The tokenizer
	        texts (list): List of texts to compare (first text is the reference)
	        device: The device to run computations on

	    Returns:
	        tuple: (embeddings, similarities), where ``embeddings`` is whatever
	        ``model`` returns for the batch and ``similarities`` holds the cosine
	        similarity of row 0 against each of the remaining rows.
	    """
	    separator = '!@#$%^&*()'
	    max_length = 512

	    with torch.no_grad():
	        # Use separator-based tokenization if texts contain the separator
	        if any(separator in text for text in texts):
	            tokenized = tokenize_with_separator(texts, tokenizer, max_length)
	        else:
	            tokenized = tokenizer(
	                texts,
	                return_tensors="pt",
	                padding=True,
	                truncation=True,
	                max_length=max_length,
	            )

	        # Move the batch to the target device only. The dtype is deliberately
	        # left alone: token ids and masks are index tensors and must not be
	        # cast to bfloat16 (the model's own dtype is set on the model).
	        if hasattr(tokenized, 'to'):
	            tokenized = tokenized.to(device)
	        else:
	            # Plain dict of tensors: move each tensor individually.
	            for key in tokenized:
	                if torch.is_tensor(tokenized[key]):
	                    tokenized[key] = tokenized[key].to(device)

	        embeddings = model(tokenized)

	        # Compare the reference (row 0) with every other row; the explicit
	        # leading axis makes the broadcast against the remaining rows obvious.
	        reference = embeddings[0].unsqueeze(0)
	        similarities = F.cosine_similarity(reference, embeddings[1:], dim=1)

	    return embeddings, similarities

	def _print_banner(title):
	    """Print a blank line, then the title framed by two 60-character rules."""
	    rule = "=" * 60
	    print("\n" + rule)
	    print(title)
	    print(rule)


	def main():
	    """
	    Example usage of the LLM2Vec4CXR model for chest X-ray report analysis.

	    Loads the model onto the available device and prints the results of three
	    demos: embedding a single report, ranking candidate statements against an
	    instruction + report pair, and a pairwise similarity table for several
	    reports.
	    """
	    # Set device
	    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	    print(f"Using device: {device}")

	    # Load the model (the tokenizer is not needed by the built-in encode methods)
	    print("Loading LLM2Vec4CXR model...")
	    model, _ = load_llm2vec4cxr_model()
	    model = model.to(device).to(torch.bfloat16)
	    model.eval()

	    # Example 1: Basic text embedding using built-in method
	    _print_banner("Example 1: Basic Text Embedding (Built-in Method)")

	    report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."

	    # Use the convenient built-in method
	    embedding = model.encode_text(report)

	    print(f"Report: {report}")
	    print(f"Embedding shape: {embedding.shape}")
	    print(f"Embedding norm: {torch.norm(embedding).item():.4f}")

	    # Example 2: Instruction-based similarity comparison
	    _print_banner("Example 2: Instruction-based Similarity Comparison")

	    separator = '!@#$%^&*()'
	    instruction = 'Determine the change or the status of the pleural effusion.'
	    report = 'There is a small increase in the left-sided effusion. There continues to be volume loss at both bases.'
	    text = instruction + separator + report

	    comparison_options = [
	        'No pleural effusion',
	        'Pleural effusion',
	        'Effusion is seen in the right',
	        'Effusion is seen in the left',
	        'Pleural effusion is improving',
	        'Pleural effusion is stable',
	        'Pleural effusion is worsening',
	    ]

	    # The query goes first; every option is scored against it.
	    all_texts = [text] + comparison_options

	    # Use built-in method for instruction-based encoding
	    embeddings = model.encode_with_instruction(all_texts)
	    similarities = F.cosine_similarity(embeddings[0], embeddings[1:], dim=1)

	    print(f"Original text: {report}")
	    print(f"Instruction: {instruction}")
	    print("\nSimilarity Scores:")
	    print("-" * 50)

	    for option, score in zip(comparison_options, similarities):
	        # Plain "|" column divider; "\|" is not a valid escape sequence and
	        # would print a stray backslash.
	        print(f"{option:<35} | {score.item():.4f}")

	    # Find the most similar option
	    best_match_idx = torch.argmax(similarities).item()
	    print(f"\nBest match: {comparison_options[best_match_idx]} (score: {similarities[best_match_idx].item():.4f})")

	    # Example 3: Multiple report comparison
	    _print_banner("Example 3: Multiple Report Comparison")

	    reports = [
	        "No acute cardiopulmonary abnormality.",
	        "Small bilateral pleural effusions.",
	        "Large left pleural effusion with compressive atelectasis.",
	        "Interval improvement in bilateral pleural effusions.",
	        "Worsening bilateral pleural effusions.",
	    ]

	    print("Computing embeddings for multiple reports...")
	    # Use built-in method for multiple texts
	    embeddings = model.encode_text(reports)

	    # Compute pairwise similarities: broadcasting the two unsqueezed views
	    # against each other compares every report with every other report.
	    similarity_matrix = F.cosine_similarity(
	        embeddings.unsqueeze(1),
	        embeddings.unsqueeze(0),
	        dim=2,
	    )

	    print("\nPairwise Similarity Matrix:")
	    print("-" * 30)
	    for i, (report_text, row) in enumerate(zip(reports, similarity_matrix), start=1):
	        print(f"Report {i}: {report_text[:30]}...")
	        for j, score in enumerate(row, start=1):
	            print(f" vs Report {j}: {score.item():.4f}")
	        print()

	# Run the demo only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()