"""
Example usage script for the LLM2Vec4CXR model.

This demonstrates how to load and use the model for chest X-ray report analysis.

Prerequisites:
1. Install the LLM2Vec4CXR package:
       pip install git+https://github.com/lukeingawesome/llm2vec4cxr.git

   Or clone and install in development mode:
       git clone https://github.com/lukeingawesome/llm2vec4cxr.git
       cd llm2vec4cxr
       pip install -e .

2. The model will be automatically downloaded from Hugging Face when first used.
"""

import torch
import torch.nn.functional as F

from llm2vec_wrapper import LLM2VecWrapper as LLM2Vec


def load_llm2vec4cxr_model(model_name_or_path="lukeingawesome/llm2vec4cxr"):
    """
    Load the LLM2Vec4CXR model together with its tokenizer.

    The model is created through ``LLM2Vec.from_pretrained`` with
    ``enable_bidirectional=True``, ``pooling_mode="latent_attention"``,
    ``max_length=512`` and ``torch_dtype=torch.bfloat16``.

    Args:
        model_name_or_path (str): Hugging Face model path or local path.

    Returns:
        tuple: ``(model, tokenizer)``, where ``tokenizer`` is
        ``model.tokenizer`` with ``padding_side`` set to ``'left'``.
    """
    model = LLM2Vec.from_pretrained(
        base_model_name_or_path=model_name_or_path,
        enable_bidirectional=True,
        pooling_mode="latent_attention",
        max_length=512,
        torch_dtype=torch.bfloat16,
    )

    # Left padding keeps the real tokens at the end of every row;
    # tokenize_with_separator() marks *trailing* positions in its embed mask,
    # so it depends on this setting.
    tokenizer = model.tokenizer
    tokenizer.padding_side = 'left'

    return model, tokenizer


def tokenize_with_separator(texts, tokenizer, max_length=512, separator='!@#$%^&*()'):
    """
    Tokenize texts and build an ``embed_mask`` for separator-delimited inputs.

    Each text is split on ``separator``. The pieces are concatenated (the
    separator itself is dropped) and tokenized together as one padded batch.
    The second piece is additionally tokenized on its own, without special
    tokens, only to count its tokens: that many trailing positions of the
    corresponding row are set to 1 in ``embed_mask``. A text without a
    separator gets an all-zero mask row.

    NOTE(review): marking trailing positions lines up with the second piece
    only if the tokenizer pads on the left (``load_llm2vec4cxr_model`` sets
    ``padding_side = 'left'``) and appends nothing after the text - confirm
    for any other tokenizer.

    Args:
        texts (list): List of texts to tokenize.
        tokenizer: Callable tokenizer; must accept the keyword arguments used
            below and return a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        max_length (int): Maximum sequence length passed to the tokenizer.
        separator (str): Marker between the instruction part and the part
            to embed. Defaults to the marker used throughout this script.

    Returns:
        dict: The tokenizer output for the joined texts, with an extra
        ``"embed_mask"`` entry that has one row per text, built with
        ``zeros_like`` from the matching ``"attention_mask"`` row.
    """
    suffixes = []
    joined_texts = []
    for text in texts:
        parts = text.split(separator)
        suffixes.append(parts[1] if len(parts) > 1 else "")
        joined_texts.append("".join(parts))

    tokenized = tokenizer(
        joined_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length,
    )

    mask_rows = []
    for row_idx, suffix in enumerate(suffixes):
        suffix_ids = tokenizer(
            [suffix],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
            add_special_tokens=False,
        )
        suffix_len = len(suffix_ids["input_ids"][0])

        row_mask = torch.zeros_like(tokenized["attention_mask"][row_idx])
        # The guard matters: with suffix_len == 0 the slice [-0:] would
        # select (and mark) the entire row.
        if suffix_len > 0:
            row_mask[-suffix_len:] = 1
        mask_rows.append(row_mask)

    # Collect rows first and stack once instead of re-concatenating the
    # growing mask on every iteration.
    tokenized["embed_mask"] = torch.stack(mask_rows) if mask_rows else None
    return tokenized


def compute_similarities(model, tokenizer, texts, device):
    """
    Compute similarity scores between the first text and all other texts.

    If any text contains the instruction separator ``'!@#$%^&*()'`` the batch
    is tokenized with ``tokenize_with_separator``; otherwise the tokenizer is
    called directly. The tokenized batch is moved to ``device``, passed to
    ``model`` under ``torch.no_grad()``, and the first embedding is compared
    with every remaining one by cosine similarity.

    Args:
        model: The LLM2Vec model; called as ``model(tokenized)``.
        tokenizer: The tokenizer.
        texts (list): List of texts to compare (first text is the reference).
        device: The device to run computations on.

    Returns:
        tuple: ``(embeddings, similarities)``. ``similarities`` has one entry
        per text after the first (assumes ``model`` returns a 2-D tensor with
        one row per text - TODO confirm).
    """
    separator = '!@#$%^&*()'

    with torch.no_grad():
        if any(separator in text for text in texts):
            tokenized = tokenize_with_separator(texts, tokenizer, 512)
        else:
            tokenized = tokenizer(
                texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=512,
            )

        # Only move the batch to the target device. Token ids and masks are
        # integer tensors and must not be cast to bfloat16; presumably the
        # tokenizer returns a Hugging Face BatchEncoding, whose .to() does not
        # support dtype casts anyway (it just logs a warning).
        if hasattr(tokenized, "to"):
            tokenized = tokenized.to(device)
        else:
            # Plain mapping without a .to() method: move each tensor itself.
            tokenized = {
                key: value.to(device) if torch.is_tensor(value) else value
                for key, value in tokenized.items()
            }

        embeddings = model(tokenized)

    similarities = F.cosine_similarity(embeddings[0], embeddings[1:], dim=1)

    return embeddings, similarities


def _print_banner(title):
    """Print a section title framed by two 60-character ``=`` rules."""
    print("\n" + "="*60)
    print(title)
    print("="*60)


def _example_basic_embedding(model):
    """Example 1: embed one report with ``model.encode_text`` and print its shape and norm."""
    _print_banner("Example 1: Basic Text Embedding (Built-in Method)")

    report = "There is a small increase in the left-sided effusion. There continues to be volume loss at both bases."

    embedding = model.encode_text(report)

    print(f"Report: {report}")
    print(f"Embedding shape: {embedding.shape}")
    print(f"Embedding norm: {torch.norm(embedding).item():.4f}")


def _example_instruction_similarity(model):
    """Example 2: rank candidate statements against an instruction + report query."""
    _print_banner("Example 2: Instruction-based Similarity Comparison")

    separator = '!@#$%^&*()'
    instruction = 'Determine the change or the status of the pleural effusion.'
    report = 'There is a small increase in the left-sided effusion. There continues to be volume loss at both bases.'
    text = instruction + separator + report

    comparison_options = [
        'No pleural effusion',
        'Pleural effusion',
        'Effusion is seen in the right',
        'Effusion is seen in the left',
        'Pleural effusion is improving',
        'Pleural effusion is stable',
        'Pleural effusion is worsening',
    ]

    all_texts = [text] + comparison_options

    # Row 0 is the query; the remaining rows are the candidate statements
    # (assumes one embedding row per input text - TODO confirm).
    embeddings = model.encode_with_instruction(all_texts)
    similarities = F.cosine_similarity(embeddings[0], embeddings[1:], dim=1)

    print(f"Original text: {report}")
    print(f"Instruction: {instruction}")
    print("\nSimilarity Scores:")
    print("-" * 50)

    for option, score in zip(comparison_options, similarities):
        print(f"{option:<35} | {score.item():.4f}")

    best_match_idx = torch.argmax(similarities).item()
    print(f"\nBest match: {comparison_options[best_match_idx]} (score: {similarities[best_match_idx].item():.4f})")


def _example_report_comparison(model):
    """Example 3: print the pairwise cosine-similarity matrix of several reports."""
    _print_banner("Example 3: Multiple Report Comparison")

    reports = [
        "No acute cardiopulmonary abnormality.",
        "Small bilateral pleural effusions.",
        "Large left pleural effusion with compressive atelectasis.",
        "Interval improvement in bilateral pleural effusions.",
        "Worsening bilateral pleural effusions.",
    ]

    print("Computing embeddings for multiple reports...")

    embeddings = model.encode_text(reports)

    # Broadcasting an inserted axis 1 against an inserted axis 0 compares
    # every row with every other row (assumes embeddings is 2-D - TODO confirm).
    similarity_matrix = F.cosine_similarity(
        embeddings.unsqueeze(1),
        embeddings.unsqueeze(0),
        dim=2,
    )

    print("\nPairwise Similarity Matrix:")
    print("-" * 30)
    for i, (report_text, row) in enumerate(zip(reports, similarity_matrix), start=1):
        print(f"Report {i}: {report_text[:30]}...")
        for j, score in enumerate(row, start=1):
            print(f" vs Report {j}: {score.item():.4f}")
        print()


def main():
    """
    Example usage of the LLM2Vec4CXR model for chest X-ray report analysis.

    Picks CUDA when available (CPU otherwise), loads the model, moves it to
    that device in bfloat16, switches it to eval mode and then runs the three
    printed examples in order.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    print("Loading LLM2Vec4CXR model...")
    # The examples only use the model's own encode_* helpers, so the returned
    # tokenizer is not needed here.
    model, _tokenizer = load_llm2vec4cxr_model()
    model = model.to(device).to(torch.bfloat16)
    model.eval()

    _example_basic_embedding(model)
    _example_instruction_similarity(model)
    _example_report_comparison(model)


# Run the examples only when executed as a script, not when imported.
if __name__ == "__main__":
    main()