Fix adapter to copy utility files to cached directory when loaded from Hugging Face

Browse files

Files changed (2) hide show

adapter.py +74 -12
test_adapter_fix.py +65 -0

adapter.py CHANGED Viewed

@@ -2,12 +2,74 @@ import os
 import sys
 import torch
 import numpy as np
 # Get the directory where this adapter.py file is located
 current_dir = os.path.dirname(os.path.abspath(__file__))
 if current_dir not in sys.path:
     sys.path.insert(0, current_dir)
 # Import utility modules
 from restoration import AbRestore
 from ablang_encodings import AbEncoding
@@ -131,7 +193,7 @@ class AbLang2PairedHuggingFaceAdapter(AbEncoding, AbRestore, AbAlignment, AbScor
             if fragmented:
                 # For fragmented sequences, assume they're already in the right format
                 return seqs, 'HL'
             # For paired sequences, format them as VH|VL
             formatted_seqs = []
             for seq in seqs:
@@ -151,7 +213,7 @@ class AbLang2PairedHuggingFaceAdapter(AbEncoding, AbRestore, AbAlignment, AbScor
                         formatted_seqs.append(seq[0] if seq else "")
                 else:
                     formatted_seqs.append(seq)
             return formatted_seqs, 'HL'
         valid_modes = [
@@ -245,34 +307,34 @@ class AbLang2PairedHuggingFaceAdapter(AbEncoding, AbRestore, AbAlignment, AbScor
                 formatted_seqs.append('|'.join(s))
             else:
                 formatted_seqs.append(s)
         plls = []
         for seq in formatted_seqs:
             tokens = self.tokenizer([seq], padding=True, return_tensors='pt')
             input_ids = extract_input_ids(tokens, self.used_device)
             with torch.no_grad():
                 output = self.AbLang(input_ids)
                 if hasattr(output, 'last_hidden_state'):
                     logits = output.last_hidden_state
                 else:
                     logits = output
                 # Get the sequence (remove batch dimension)
                 logits = logits[0]  # [seq_len, vocab_size]
                 input_ids = input_ids[0]  # [seq_len]
                 # Exclude all special tokens (pad, mask, etc.)
                 if isinstance(self.tokenizer.all_special_tokens[0], int):
                     special_token_ids = set(self.tokenizer.all_special_tokens)
                 else:
                     special_token_ids = set(self.tokenizer.convert_tokens_to_ids(tok) for tok in self.tokenizer.all_special_tokens)
                 valid_mask = ~torch.isin(input_ids, torch.tensor(list(special_token_ids), device=input_ids.device))
                 if valid_mask.sum() > 0:
                     valid_logits = logits[valid_mask]
                     valid_labels = input_ids[valid_mask]
                     # Calculate cross-entropy loss
                     nll = torch.nn.functional.cross_entropy(
                         valid_logits,
@@ -282,9 +344,9 @@ class AbLang2PairedHuggingFaceAdapter(AbEncoding, AbRestore, AbAlignment, AbScor
                     pll = -nll.item()
                 else:
                     pll = 0.0
                 plls.append(pll)
         return np.array(plls, dtype=np.float32)
     def probability(self, seqs, align=False, stepwise_masking=False, **kwargs):
@@ -306,10 +368,10 @@ class AbLang2PairedHuggingFaceAdapter(AbEncoding, AbRestore, AbAlignment, AbScor
             logits = self._predict_logits(formatted_seqs)
         else:
             logits = self._predict_logits(formatted_seqs)
         # Apply softmax to get probabilities
         probs = logits.softmax(-1).cpu().numpy()
         if align:
             return probs
         else:

 import sys
 import torch
 import numpy as np
+import shutil
 # Get the directory where this adapter.py file is located
 current_dir = os.path.dirname(os.path.abspath(__file__))
 if current_dir not in sys.path:
     sys.path.insert(0, current_dir)
+# Utility modules that must sit alongside adapter.py.
+# ensure_utility_files_available() checks for each of these names and copies
+# any that are missing before the imports further down are executed.
+UTILITY_FILES = [
+    'restoration.py',
+    'ablang_encodings.py',
+    'alignment.py',
+    'scores.py',
+    'extra_utils.py',
+    'ablang.py',
+    'encoderblock.py'
+]
+def ensure_utility_files_available():
+    """
+    Ensure all utility files are available in the current directory.
+    If any are missing, try to copy them from the repository root.
+    """
+    missing_files = []
+    for file in UTILITY_FILES:
+        if not os.path.exists(file):
+            missing_files.append(file)
+    if missing_files:
+        # Try to find the repository root (where all utility files are)
+        # Look for common parent directories that might contain the files
+        possible_paths = [
+            os.path.join(current_dir, '..'),  # Parent directory
+            os.path.join(current_dir, '..', '..'),  # Grandparent directory
+            os.path.join(os.path.expanduser('~'), 'ablang2'),  # Home directory
+            '/data/hn533621/ablang2',  # Known repository location
+        ]
+        for path in possible_paths:
+            if os.path.exists(path):
+                # Check if all missing files exist in this path
+                all_found = True
+                for file in missing_files:
+                    if not os.path.exists(os.path.join(path, file)):
+                        all_found = False
+                        break
+                if all_found:
+                    # Copy all missing files
+                    for file in missing_files:
+                        src = os.path.join(path, file)
+                        dst = os.path.join(current_dir, file)
+                        shutil.copy2(src, dst)
+                        print(f"✅ Copied {file} to cached directory")
+                    return True
+        # If we get here, we couldn't find the files
+        raise FileNotFoundError(
+            f"Missing utility files: {missing_files}. "
+            "These files are required for the adapter to work. "
+            "Please ensure the repository is properly set up."
+        )
+    return True
+# Ensure utility files are available before importing.
+# This runs at import time: it may copy files into current_dir and raises
+# FileNotFoundError when the missing files cannot be located.
+ensure_utility_files_available()
 # Import utility modules
 from restoration import AbRestore
 from ablang_encodings import AbEncoding
             if fragmented:
                 # For fragmented sequences, assume they're already in the right format
                 return seqs, 'HL'
             # For paired sequences, format them as VH|VL
             formatted_seqs = []
             for seq in seqs:
                         formatted_seqs.append(seq[0] if seq else "")
                 else:
                     formatted_seqs.append(seq)
             return formatted_seqs, 'HL'
         valid_modes = [
                 formatted_seqs.append('|'.join(s))
             else:
                 formatted_seqs.append(s)
         plls = []
         for seq in formatted_seqs:
             tokens = self.tokenizer([seq], padding=True, return_tensors='pt')
             input_ids = extract_input_ids(tokens, self.used_device)
             with torch.no_grad():
                 output = self.AbLang(input_ids)
                 if hasattr(output, 'last_hidden_state'):
                     logits = output.last_hidden_state
                 else:
                     logits = output
                 # Get the sequence (remove batch dimension)
                 logits = logits[0]  # [seq_len, vocab_size]
                 input_ids = input_ids[0]  # [seq_len]
                 # Exclude all special tokens (pad, mask, etc.)
                 if isinstance(self.tokenizer.all_special_tokens[0], int):
                     special_token_ids = set(self.tokenizer.all_special_tokens)
                 else:
                     special_token_ids = set(self.tokenizer.convert_tokens_to_ids(tok) for tok in self.tokenizer.all_special_tokens)
                 valid_mask = ~torch.isin(input_ids, torch.tensor(list(special_token_ids), device=input_ids.device))
                 if valid_mask.sum() > 0:
                     valid_logits = logits[valid_mask]
                     valid_labels = input_ids[valid_mask]
                     # Calculate cross-entropy loss
                     nll = torch.nn.functional.cross_entropy(
                         valid_logits,
                     pll = -nll.item()
                 else:
                     pll = 0.0
                 plls.append(pll)
         return np.array(plls, dtype=np.float32)
     def probability(self, seqs, align=False, stepwise_masking=False, **kwargs):
             logits = self._predict_logits(formatted_seqs)
         else:
             logits = self._predict_logits(formatted_seqs)
         # Apply softmax to get probabilities
         probs = logits.softmax(-1).cpu().numpy()
         if align:
             return probs
         else:

test_adapter_fix.py ADDED Viewed

	@@ -0,0 +1,65 @@

+#!/usr/bin/env python3
+import sys
+import os
+from transformers import AutoModel, AutoTokenizer
+from transformers.utils import cached_file
+def test_adapter_from_outside():
+    """Test loading the adapter from outside the repository"""
+    print("🧪 Testing adapter loading from outside repository...")
+    # Clear cache first
+    cache_dir = os.path.expanduser("~/.cache/huggingface/hub/models--hemantn--ablang2")
+    if os.path.exists(cache_dir):
+        import shutil
+        shutil.rmtree(cache_dir)
+        print("🗑️  Cleared Hugging Face cache")
+    try:
+        # Load model and tokenizer
+        print("📥 Loading model and tokenizer...")
+        model = AutoModel.from_pretrained("hemantn/ablang2", trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained("hemantn/ablang2", trust_remote_code=True)
+        # Find the cached model directory and import adapter
+        adapter_path = cached_file("hemantn/ablang2", "adapter.py")
+        cached_model_dir = os.path.dirname(adapter_path)
+        sys.path.insert(0, cached_model_dir)
+        print(f"📁 Cached model directory: {cached_model_dir}")
+        print(f"📄 Files in cached directory:")
+        for f in os.listdir(cached_model_dir):
+            print(f"   {f}")
+        # Import and create the adapter
+        print("🔧 Importing adapter...")
+        from adapter import AbLang2PairedHuggingFaceAdapter
+        ablang = AbLang2PairedHuggingFaceAdapter(model=model, tokenizer=tokenizer)
+        print("✅ Adapter created successfully!")
+        # Test basic functionality
+        print("🧬 Testing restore functionality...")
+        test_seq = [
+            'EVQ***SGGEVKKPGASVKVSCRASGYTFRNYGLTWVRQAPGQGLEWMGWISAYNGNTNYAQKFQGRVTLTTDTSTSTAYMELRSLRSDDTAVYFCAR**PGHGAAFMDVWGTGTTVTVSS',
+            'DIQLTQSPLSLPVTLGQPASISCRSS*SLEASDTNIYLSWFQQRPGQSPRRLIYKI*NRDSGVPDRFSGSGSGTHFTLRISRVEADDVAVYYCMQGTHWPPAFGQGTKVDIK'
+        ]
+        restored = ablang(test_seq, mode='restore')
+        print("✅ Restore functionality working!")
+        print(f"📊 Restored sequences: {len(restored)}")
+        return True
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+if __name__ == "__main__":
+    success = test_adapter_from_outside()
+    if success:
+        print("🎉 All tests passed! The adapter works from outside the repository.")
+    else:
+        print("💥 Tests failed. The adapter still has issues.")