anuj2054 committed
Commit ee7017c · verified
1 Parent(s): 3fe45b2

test files for the quickstart guide

test_notebook_compatibility.py ADDED
@@ -0,0 +1,41 @@
+ #!/usr/bin/env python3
+ """
+ Test that existing notebook code still works with updated HF files
+ """
+
+ from Bio.Seq import Seq
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, LogitsProcessor
+ import torch
+
+ print("Testing notebook compatibility...")
+
+ try:
+     # Import the custom components (they should be downloaded already)
+     from tokenizer import CodonTokenizer
+     from synonymous_logit_processor import generate_candidate_codons_with_generate
+
+     # Load model and tokenizer (notebook style)
+     print("Loading model and tokenizer...")
+     model = GPT2LMHeadModel.from_pretrained("naniltx/codonGPT")
+     tokenizer = CodonTokenizer()
+     print("✓ Model and tokenizer loaded successfully")
+
+     # Test the exact notebook usage pattern
+     print("\nTesting notebook usage pattern...")
+
+     # Example usage (from your notebook):
+     initial_codons = ["GCT", "TGT", "GAT"]
+     initial_codons = ['ATG', 'GAA', 'CTT', 'GTC']  # This overwrites the previous line
+     print("The initial prompt codons are:", " ".join(initial_codons))
+
+     # This should work with global model/tokenizer variables
+     generated_codons_generate = generate_candidate_codons_with_generate(initial_codons, temperature=0.7, top_k=5)
+     print("Generated with model.generate():", " ".join(generated_codons_generate))
+
+     print("\n✅ Notebook compatibility test passed!")
+     print("Your existing notebook code will continue to work unchanged.")
+
+ except Exception as e:
+     print(f"\n❌ Compatibility test failed: {e}")
+     import traceback
+     traceback.print_exc()
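Note on the call styles exercised here: the test above invokes generate_candidate_codons_with_generate(initial_codons, ...) with no model or tokenizer arguments, while test_quickstart.py below passes both explicitly. A minimal sketch of a backward-compatible signature that would support both calls follows; the names, defaults, and fallback logic are illustrative assumptions, not the actual contents of synonymous_logit_processor.py.

# Hypothetical sketch only, not the repo's implementation: optional model/tokenizer
# arguments that fall back to module-level globals keep the old notebook call working.
_model = None       # assumed to be populated at import time in the real module
_tokenizer = None

def generate_candidate_codons_with_generate(initial_codons, model=None, tokenizer=None,
                                            temperature=1.0, top_k=50, top_p=1.0):
    model = model if model is not None else _model              # global fallback
    tokenizer = tokenizer if tokenizer is not None else _tokenizer
    # ...synonym-constrained sampling with model.generate() would happen here...
    return list(initial_codons)  # placeholder so the sketch runs end to end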
test_quickstart.py ADDED
@@ -0,0 +1,93 @@
+ #!/usr/bin/env python3
+ """
+ Test the simplified quickstart guide examples
+ """
+
+ import torch
+ from transformers import GPT2LMHeadModel
+
+ print("Testing simplified CodonGPT quickstart guide...")
+
+ try:
+     # Test 1: Download custom components (simulate what users would do)
+     print("\n1. Testing custom component downloads...")
+     from huggingface_hub import hf_hub_download
+
+     # Download custom tokenizer and processor to current directory
+     hf_hub_download(repo_id="naniltx/codonGPT", filename="tokenizer.py", local_dir="./")
+     hf_hub_download(repo_id="naniltx/codonGPT", filename="synonymous_logit_processor.py", local_dir="./")
+     print("✓ Custom components downloaded successfully")
+
+     # Test 2: Import custom components
+     print("\n2. Testing custom component imports...")
+     from tokenizer import CodonTokenizer
+     from synonymous_logit_processor import SynonymMaskingLogitsProcessor
+     print("✓ Custom components imported successfully")
+
+     # Test 3: Load model directly from HF
+     print("\n3. Testing direct model loading from Hugging Face...")
+     model = GPT2LMHeadModel.from_pretrained("naniltx/codonGPT")
+     model.eval()
+     print("✓ Model loaded directly from HF successfully")
+
+     # Test 4: Load custom tokenizer
+     print("\n4. Testing custom tokenizer...")
+     tokenizer = CodonTokenizer()
+     print(f"✓ Tokenizer loaded successfully (vocab size: {tokenizer.vocab_size})")
+
+     # Test 5: Basic sequence generation
+     print("\n5. Testing basic sequence generation...")
+     input_sequence = "ATGAAACCC"
+     input_codons = [input_sequence[i:i+3] for i in range(0, len(input_sequence), 3)]
+     input_tokens = [tokenizer.bos_token_id] + tokenizer.convert_tokens_to_ids(input_codons)
+     input_tensor = torch.tensor([input_tokens])
+
+     with torch.no_grad():
+         outputs = model.generate(
+             input_tensor,
+             max_length=input_tensor.size(1) + 3,
+             temperature=1.0,
+             do_sample=True,
+             pad_token_id=tokenizer.pad_token_id,
+             eos_token_id=tokenizer.eos_token_id
+         )
+
+     generated_tokens = outputs[0][input_tensor.size(1):].tolist()
+     generated_codons = [tokenizer.decode([token_id]) for token_id in generated_tokens
+                         if token_id not in [tokenizer.pad_token_id, tokenizer.eos_token_id]]
+     generated_sequence = ''.join(generated_codons)
+
+     print(f"✓ Input sequence: {input_sequence}")
+     print(f"✓ Generated sequence: {generated_sequence}")
+
+     # Test 6: Synonym-aware generation
+     print("\n6. Testing synonym-aware generation...")
+     from synonymous_logit_processor import generate_candidate_codons_with_generate
+     from Bio.Seq import Seq
+
+     initial_codons = ["ATG", "AAA", "CCC"]
+     optimized_codons = generate_candidate_codons_with_generate(
+         initial_codons,
+         model=model,
+         tokenizer=tokenizer,
+         temperature=1.0,
+         top_k=50,
+         top_p=0.9
+     )
+
+     print(f"✓ Original: {initial_codons}")
+     print(f"✓ Optimized: {optimized_codons}")
+
+     # Verify amino acid preservation
+     original_aa = ''.join([str(Seq(codon).translate()) for codon in initial_codons])
+     optimized_aa = ''.join([str(Seq(codon).translate()) for codon in optimized_codons])
+     print(f"✓ Original AA: {original_aa}")
+     print(f"✓ Optimized AA: {optimized_aa}")
+     print(f"✓ AA preserved: {original_aa == optimized_aa}")
+
+     print("\n🎉 All simplified quickstart tests passed!")
+
+ except Exception as e:
+     print(f"\n❌ Test failed with error: {e}")
+     import traceback
+     traceback.print_exc()
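For reference, a self-contained sketch of the synonym-masking idea that test 6 exercises. This is not the repo's SynonymMaskingLogitsProcessor; it only illustrates how such a constraint can sit on the standard transformers LogitsProcessor interface, and codon_to_id is an assumed {codon: token_id} vocabulary mapping.

import torch
from Bio.Seq import Seq
from transformers import LogitsProcessor

class SynonymMaskSketch(LogitsProcessor):
    """Illustrative only: restrict each generated codon to synonyms of a reference codon."""

    def __init__(self, reference_codons, codon_to_id, prompt_len):
        self.reference_codons = reference_codons  # codons whose amino acids must be preserved
        self.codon_to_id = codon_to_id            # assumed mapping, e.g. {"ATG": 17, ...}
        self.prompt_len = prompt_len              # tokens already present in the prompt

    def __call__(self, input_ids, scores):
        pos = input_ids.shape[1] - self.prompt_len  # index of the codon being generated now
        if 0 <= pos < len(self.reference_codons):
            target_aa = str(Seq(self.reference_codons[pos]).translate())
            allowed = [tid for codon, tid in self.codon_to_id.items()
                       if str(Seq(codon).translate()) == target_aa]
            mask = torch.full_like(scores, float("-inf"))
            mask[:, allowed] = 0.0  # leave only synonymous codons sampleable
            scores = scores + mask
        return scores

With the two helper files downloaded, both tests are plain scripts, so running python test_notebook_compatibility.py followed by python test_quickstart.py should exercise the full quickstart path (assuming PyTorch, transformers, huggingface_hub, Biopython, and network access to the Hub).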