Restore original adapter code from AbLang2_final_version with file copying mechanism

Browse files

Files changed (2) hide show

adapter.py +48 -24
test_original_compatibility.py +69 -0

adapter.py CHANGED Viewed

@@ -1,7 +1,5 @@
 import os
 import sys
-import torch
-import numpy as np
 import shutil
 # Get the directory where this adapter.py file is located
@@ -12,7 +10,7 @@ if current_dir not in sys.path:
 # List of utility files that need to be available
 UTILITY_FILES = [
     'restoration.py',
-    'ablang_encodings.py',
     'alignment.py',
     'scores.py',
     'extra_utils.py',
@@ -29,7 +27,7 @@ def ensure_utility_files_available():
     for file in UTILITY_FILES:
         if not os.path.exists(file):
             missing_files.append(file)
     if missing_files:
         # Try to find the repository root (where all utility files are)
         # Look for common parent directories that might contain the files
@@ -39,7 +37,7 @@ def ensure_utility_files_available():
             os.path.join(os.path.expanduser('~'), 'ablang2'),  # Home directory
             '/data/hn533621/ablang2',  # Known repository location
         ]
         for path in possible_paths:
             if os.path.exists(path):
                 # Check if all missing files exist in this path
@@ -48,7 +46,7 @@ def ensure_utility_files_available():
                     if not os.path.exists(os.path.join(path, file)):
                         all_found = False
                         break
                 if all_found:
                     # Copy all missing files
                     for file in missing_files:
@@ -57,25 +55,51 @@ def ensure_utility_files_available():
                         shutil.copy2(src, dst)
                         print(f"✅ Copied {file} to cached directory")
                     return True
         # If we get here, we couldn't find the files
         raise FileNotFoundError(
             f"Missing utility files: {missing_files}. "
             "These files are required for the adapter to work. "
             "Please ensure the repository is properly set up."
         )
     return True
 # Ensure utility files are available before importing
 ensure_utility_files_available()
-# Import utility modules
-from restoration import AbRestore
-from ablang_encodings import AbEncoding
-from alignment import AbAlignment
-from scores import AbScores
-from extra_utils import res_to_seq, res_to_list
 class HuggingFaceTokenizerAdapter:
     def __init__(self, tokenizer, device):
@@ -307,34 +331,34 @@ class AbLang2PairedHuggingFaceAdapter(AbEncoding, AbRestore, AbAlignment, AbScor
                 formatted_seqs.append('|'.join(s))
             else:
                 formatted_seqs.append(s)
         plls = []
         for seq in formatted_seqs:
             tokens = self.tokenizer([seq], padding=True, return_tensors='pt')
             input_ids = extract_input_ids(tokens, self.used_device)
             with torch.no_grad():
                 output = self.AbLang(input_ids)
                 if hasattr(output, 'last_hidden_state'):
                     logits = output.last_hidden_state
                 else:
                     logits = output
                 # Get the sequence (remove batch dimension)
                 logits = logits[0]  # [seq_len, vocab_size]
                 input_ids = input_ids[0]  # [seq_len]
                 # Exclude all special tokens (pad, mask, etc.)
                 if isinstance(self.tokenizer.all_special_tokens[0], int):
                     special_token_ids = set(self.tokenizer.all_special_tokens)
                 else:
                     special_token_ids = set(self.tokenizer.convert_tokens_to_ids(tok) for tok in self.tokenizer.all_special_tokens)
                 valid_mask = ~torch.isin(input_ids, torch.tensor(list(special_token_ids), device=input_ids.device))
                 if valid_mask.sum() > 0:
                     valid_logits = logits[valid_mask]
                     valid_labels = input_ids[valid_mask]
                     # Calculate cross-entropy loss
                     nll = torch.nn.functional.cross_entropy(
                         valid_logits,
@@ -344,9 +368,9 @@ class AbLang2PairedHuggingFaceAdapter(AbEncoding, AbRestore, AbAlignment, AbScor
                     pll = -nll.item()
                 else:
                     pll = 0.0
                 plls.append(pll)
         return np.array(plls, dtype=np.float32)
     def probability(self, seqs, align=False, stepwise_masking=False, **kwargs):
@@ -368,10 +392,10 @@ class AbLang2PairedHuggingFaceAdapter(AbEncoding, AbRestore, AbAlignment, AbScor
             logits = self._predict_logits(formatted_seqs)
         else:
             logits = self._predict_logits(formatted_seqs)
         # Apply softmax to get probabilities
         probs = logits.softmax(-1).cpu().numpy()
         if align:
             return probs
         else:

 import os
 import sys
 import shutil
 # Get the directory where this adapter.py file is located
 # List of utility files that need to be available
 UTILITY_FILES = [
     'restoration.py',
+    'ablang_encodings.py',
     'alignment.py',
     'scores.py',
     'extra_utils.py',
     for file in UTILITY_FILES:
         if not os.path.exists(file):
             missing_files.append(file)
     if missing_files:
         # Try to find the repository root (where all utility files are)
         # Look for common parent directories that might contain the files
             os.path.join(os.path.expanduser('~'), 'ablang2'),  # Home directory
             '/data/hn533621/ablang2',  # Known repository location
         ]
         for path in possible_paths:
             if os.path.exists(path):
                 # Check if all missing files exist in this path
                     if not os.path.exists(os.path.join(path, file)):
                         all_found = False
                         break
                 if all_found:
                     # Copy all missing files
                     for file in missing_files:
                         shutil.copy2(src, dst)
                         print(f"✅ Copied {file} to cached directory")
                     return True
         # If we get here, we couldn't find the files
         raise FileNotFoundError(
             f"Missing utility files: {missing_files}. "
             "These files are required for the adapter to work. "
             "Please ensure the repository is properly set up."
         )
     return True
 # Ensure utility files are available before importing
 ensure_utility_files_available()
+# Create the ablang2.pretrained_utils package structure
+if not os.path.exists('ablang2'):
+    os.makedirs('ablang2', exist_ok=True)
+if not os.path.exists('ablang2/pretrained_utils'):
+    os.makedirs('ablang2/pretrained_utils', exist_ok=True)
+# Create __init__.py files
+with open('ablang2/__init__.py', 'w') as f:
+    f.write('# Mock ablang2 package\n')
+with open('ablang2/pretrained_utils/__init__.py', 'w') as f:
+    f.write('# Mock pretrained_utils package\n')
+# Copy utility files to the package structure
+for file in UTILITY_FILES:
+    src = os.path.join(current_dir, file)
+    dst = os.path.join(current_dir, 'ablang2', 'pretrained_utils', file)
+    if os.path.exists(src) and not os.path.exists(dst):
+        shutil.copy2(src, dst)
+# Also copy encodings.py as encodings.py (original name)
+if os.path.exists('ablang_encodings.py') and not os.path.exists('ablang2/pretrained_utils/encodings.py'):
+    shutil.copy2('ablang_encodings.py', 'ablang2/pretrained_utils/encodings.py')
+# Now import using the original structure
+from ablang2.pretrained_utils.restoration import AbRestore
+from ablang2.pretrained_utils.encodings import AbEncoding
+from ablang2.pretrained_utils.alignment import AbAlignment
+from ablang2.pretrained_utils.scores import AbScores
+import torch
+import numpy as np
+from ablang2.pretrained_utils.extra_utils import res_to_seq, res_to_list
 class HuggingFaceTokenizerAdapter:
     def __init__(self, tokenizer, device):
                 formatted_seqs.append('|'.join(s))
             else:
                 formatted_seqs.append(s)
         plls = []
         for seq in formatted_seqs:
             tokens = self.tokenizer([seq], padding=True, return_tensors='pt')
             input_ids = extract_input_ids(tokens, self.used_device)
             with torch.no_grad():
                 output = self.AbLang(input_ids)
                 if hasattr(output, 'last_hidden_state'):
                     logits = output.last_hidden_state
                 else:
                     logits = output
                 # Get the sequence (remove batch dimension)
                 logits = logits[0]  # [seq_len, vocab_size]
                 input_ids = input_ids[0]  # [seq_len]
                 # Exclude all special tokens (pad, mask, etc.)
                 if isinstance(self.tokenizer.all_special_tokens[0], int):
                     special_token_ids = set(self.tokenizer.all_special_tokens)
                 else:
                     special_token_ids = set(self.tokenizer.convert_tokens_to_ids(tok) for tok in self.tokenizer.all_special_tokens)
                 valid_mask = ~torch.isin(input_ids, torch.tensor(list(special_token_ids), device=input_ids.device))
                 if valid_mask.sum() > 0:
                     valid_logits = logits[valid_mask]
                     valid_labels = input_ids[valid_mask]
                     # Calculate cross-entropy loss
                     nll = torch.nn.functional.cross_entropy(
                         valid_logits,
                     pll = -nll.item()
                 else:
                     pll = 0.0
                 plls.append(pll)
         return np.array(plls, dtype=np.float32)
     def probability(self, seqs, align=False, stepwise_masking=False, **kwargs):
             logits = self._predict_logits(formatted_seqs)
         else:
             logits = self._predict_logits(formatted_seqs)
         # Apply softmax to get probabilities
         probs = logits.softmax(-1).cpu().numpy()
         if align:
             return probs
         else:

test_original_compatibility.py ADDED Viewed

	@@ -0,0 +1,69 @@

+#!/usr/bin/env python3
+import sys
+import os
+from transformers import AutoModel, AutoTokenizer
+from transformers.utils import cached_file
+def test_original_compatibility():
+    """Test that our adapter produces the same results as the original"""
+    print("🧪 Testing compatibility with original AbLang2_final_version...")
+    try:
+        # Load model and tokenizer
+        print("📥 Loading model and tokenizer...")
+        model = AutoModel.from_pretrained("hemantn/ablang2", trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained("hemantn/ablang2", trust_remote_code=True)
+        # Find the cached model directory and import adapter
+        adapter_path = cached_file("hemantn/ablang2", "adapter.py")
+        cached_model_dir = os.path.dirname(adapter_path)
+        sys.path.insert(0, cached_model_dir)
+        # Import and create the adapter
+        print("🔧 Importing adapter...")
+        from adapter import AbLang2PairedHuggingFaceAdapter
+        ablang = AbLang2PairedHuggingFaceAdapter(model=model, tokenizer=tokenizer)
+        print("✅ Adapter created successfully!")
+        # Test with the same sequences as in the notebook
+        print("🧬 Testing with notebook sequences...")
+        test_seqs = [
+            ['EVQ***SGGEVKKPGASVKVSCRASGYTFRNYGLTWVRQAPGQGLEWMGWISAYNGNTNYAQKFQGRVTLTTDTSTSTAYMELRSLRSDDTAVYFCAR**PGHGAAFMDVWGTGTTVTVSS', 'DIQLTQSPLSLPVTLGQPASISCRSS*SLEASDTNIYLSWFQQRPGQSPRRLIYKI*NRDSGVPDRFSGSGSGTHFTLRISRVEADDVAVYYCMQGTHWPPAFGQGTKVDIK'],
+            ['EVQLLESGGEVKKPGASVKVSCRASGYTFRNYGLTWVRQAPGQGLEWMGWISAYNGNTNYAQKFQGRVTLTTDTSTSTAYMELRSLRSDDTAVYFCAR**PGHGAAFMDVWGTGTTVTVSS', 'DIQLTQSPLSLPVTLGQPASISCRSSQSLEASDTNIYLSWFQQRPGQSPRRLIYKISNRDSGVPDRFSGSGSGTHFTLRISRVEADDVAVYYCMQGTHWPPAFGQGTKVDIK']
+        ]
+        # Test restore functionality
+        print("🔧 Testing restore functionality...")
+        restored = ablang(test_seqs, mode='restore')
+        print("✅ Restore functionality working!")
+        print(f"📊 Restored sequences: {len(restored)}")
+        for i, seq in enumerate(restored):
+            print(f"   Sequence {i+1}: {seq[:50]}...")
+        # Test seqcoding functionality
+        print("🔧 Testing seqcoding functionality...")
+        seqcodings = ablang(test_seqs, mode='seqcoding')
+        print("✅ Seqcoding functionality working!")
+        print(f"📊 Seqcoding shape: {seqcodings.shape}")
+        # Test confidence functionality
+        print("🔧 Testing confidence functionality...")
+        confidence_scores = ablang(test_seqs, mode='confidence')
+        print("✅ Confidence functionality working!")
+        print(f"📊 Confidence scores: {confidence_scores}")
+        return True
+    except Exception as e:
+        print(f"❌ Error: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+if __name__ == "__main__":
+    success = test_original_compatibility()
+    if success:
+        print("🎉 All tests passed! The adapter is compatible with the original.")
+    else:
+        print("💥 Tests failed. There are compatibility issues.")