anuj2054 committed
Commit ee7017c · verified
1 Parent(s): 3fe45b2

test files for the quickstart guide

test_notebook_compatibility.py ADDED
@@ -0,0 +1,41 @@
+ #!/usr/bin/env python3
+ """
+ Test that existing notebook code still works with updated HF files
+ """
+
+ from Bio.Seq import Seq
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, LogitsProcessor
+ import torch
+
+ print("Testing notebook compatibility...")
+
+ try:
+     # Import the custom components (they should be downloaded already)
+     from tokenizer import CodonTokenizer
+     from synonymous_logit_processor import generate_candidate_codons_with_generate
+
+     # Load model and tokenizer (notebook style)
+     print("Loading model and tokenizer...")
+     model = GPT2LMHeadModel.from_pretrained("naniltx/codonGPT")
+     tokenizer = CodonTokenizer()
+     print("✓ Model and tokenizer loaded successfully")
+
+     # Test the exact notebook usage pattern
+     print("\nTesting notebook usage pattern...")
+
+     # Example usage (from your notebook):
+     initial_codons = ["GCT", "TGT", "GAT"]
+     initial_codons = ['ATG', 'GAA', 'CTT', 'GTC']  # This overwrites the previous line
+     print("The initial prompt codons are:", " ".join(initial_codons))
+
+     # This should work with global model/tokenizer variables
+     generated_codons_generate = generate_candidate_codons_with_generate(initial_codons, temperature=0.7, top_k=5)
+     print("Generated with model.generate():", " ".join(generated_codons_generate))
+
+     print("\n✅ Notebook compatibility test passed!")
+     print("Your existing notebook code will continue to work unchanged.")
+
+ except Exception as e:
+     print(f"\n❌ Compatibility test failed: {e}")
+     import traceback
+     traceback.print_exc()
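Note on the call styles exercised here: the test above invokes generate_candidate_codons_with_generate(initial_codons, ...) with no model or tokenizer arguments, while test_quickstart.py below passes both explicitly. A minimal sketch of a backward-compatible signature that would support both calls follows; the names, defaults, and fallback logic are illustrative assumptions, not the actual contents of synonymous_logit_processor.py.

# Hypothetical sketch only, not the repo's implementation: optional model/tokenizer
# arguments that fall back to module-level globals keep the old notebook call working.
_model = None       # assumed to be populated at import time in the real module
_tokenizer = None

def generate_candidate_codons_with_generate(initial_codons, model=None, tokenizer=None,
                                            temperature=1.0, top_k=50, top_p=1.0):
    model = model if model is not None else _model              # global fallback
    tokenizer = tokenizer if tokenizer is not None else _tokenizer
    # ...synonym-constrained sampling with model.generate() would happen here...
    return list(initial_codons)  # placeholder so the sketch runs end to end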
test_quickstart.py ADDED
@@ -0,0 +1,93 @@
+ #!/usr/bin/env python3
+ """
+ Test the simplified quickstart guide examples
+ """
+
+ import torch
+ from transformers import GPT2LMHeadModel
+
+ print("Testing simplified CodonGPT quickstart guide...")
+
+ try:
+     # Test 1: Download custom components (simulate what users would do)
+     print("\n1. Testing custom component downloads...")
+     from huggingface_hub import hf_hub_download
+
+     # Download custom tokenizer and processor to current directory
+     hf_hub_download(repo_id="naniltx/codonGPT", filename="tokenizer.py", local_dir="./")
+     hf_hub_download(repo_id="naniltx/codonGPT", filename="synonymous_logit_processor.py", local_dir="./")
+     print("✓ Custom components downloaded successfully")
+
+     # Test 2: Import custom components
+     print("\n2. Testing custom component imports...")
+     from tokenizer import CodonTokenizer
+     from synonymous_logit_processor import SynonymMaskingLogitsProcessor
+     print("✓ Custom components imported successfully")
+
+     # Test 3: Load model directly from HF
+     print("\n3. Testing direct model loading from Hugging Face...")
+     model = GPT2LMHeadModel.from_pretrained("naniltx/codonGPT")
+     model.eval()
+     print("✓ Model loaded directly from HF successfully")
+
+     # Test 4: Load custom tokenizer
+     print("\n4. Testing custom tokenizer...")
+     tokenizer = CodonTokenizer()
+     print(f"✓ Tokenizer loaded successfully (vocab size: {tokenizer.vocab_size})")
+
+     # Test 5: Basic sequence generation
+     print("\n5. Testing basic sequence generation...")
+     input_sequence = "ATGAAACCC"
+     input_codons = [input_sequence[i:i+3] for i in range(0, len(input_sequence), 3)]
+     input_tokens = [tokenizer.bos_token_id] + tokenizer.convert_tokens_to_ids(input_codons)
+     input_tensor = torch.tensor([input_tokens])
+
+     with torch.no_grad():
+         outputs = model.generate(
+             input_tensor,
+             max_length=input_tensor.size(1) + 3,
+             temperature=1.0,
+             do_sample=True,
+             pad_token_id=tokenizer.pad_token_id,
+             eos_token_id=tokenizer.eos_token_id
+         )
+
+     generated_tokens = outputs[0][input_tensor.size(1):].tolist()
+     generated_codons = [tokenizer.decode([token_id]) for token_id in generated_tokens
+                         if token_id not in [tokenizer.pad_token_id, tokenizer.eos_token_id]]
+     generated_sequence = ''.join(generated_codons)
+
+     print(f"✓ Input sequence: {input_sequence}")
+     print(f"✓ Generated sequence: {generated_sequence}")
+
+     # Test 6: Synonym-aware generation
+     print("\n6. Testing synonym-aware generation...")
+     from synonymous_logit_processor import generate_candidate_codons_with_generate
+     from Bio.Seq import Seq
+
+     initial_codons = ["ATG", "AAA", "CCC"]
+     optimized_codons = generate_candidate_codons_with_generate(
+         initial_codons,
+         model=model,
+         tokenizer=tokenizer,
+         temperature=1.0,
+         top_k=50,
+         top_p=0.9
+     )
+
+     print(f"✓ Original: {initial_codons}")
+     print(f"✓ Optimized: {optimized_codons}")
+
+     # Verify amino acid preservation
+     original_aa = ''.join([str(Seq(codon).translate()) for codon in initial_codons])
+     optimized_aa = ''.join([str(Seq(codon).translate()) for codon in optimized_codons])
+     print(f"✓ Original AA: {original_aa}")
+     print(f"✓ Optimized AA: {optimized_aa}")
+     print(f"✓ AA preserved: {original_aa == optimized_aa}")
+
+     print("\n🎉 All simplified quickstart tests passed!")
+
+ except Exception as e:
+     print(f"\n❌ Test failed with error: {e}")
+     import traceback
+     traceback.print_exc()
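For reference, a self-contained sketch of the synonym-masking idea that test 6 exercises. This is not the repo's SynonymMaskingLogitsProcessor; it only illustrates how such a constraint can sit on the standard transformers LogitsProcessor interface, and codon_to_id is an assumed {codon: token_id} vocabulary mapping.

import torch
from Bio.Seq import Seq
from transformers import LogitsProcessor

class SynonymMaskSketch(LogitsProcessor):
    """Illustrative only: restrict each generated codon to synonyms of a reference codon."""

    def __init__(self, reference_codons, codon_to_id, prompt_len):
        self.reference_codons = reference_codons  # codons whose amino acids must be preserved
        self.codon_to_id = codon_to_id            # assumed mapping, e.g. {"ATG": 17, ...}
        self.prompt_len = prompt_len              # tokens already present in the prompt

    def __call__(self, input_ids, scores):
        pos = input_ids.shape[1] - self.prompt_len  # index of the codon being generated now
        if 0 <= pos < len(self.reference_codons):
            target_aa = str(Seq(self.reference_codons[pos]).translate())
            allowed = [tid for codon, tid in self.codon_to_id.items()
                       if str(Seq(codon).translate()) == target_aa]
            mask = torch.full_like(scores, float("-inf"))
            mask[:, allowed] = 0.0  # leave only synonymous codons sampleable
            scores = scores + mask
        return scores

With the two helper files downloaded, both tests are plain scripts, so running python test_notebook_compatibility.py followed by python test_quickstart.py should exercise the full quickstart path (assuming PyTorch, transformers, huggingface_hub, Biopython, and network access to the Hub).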