"""Smoke test: load the Evo2 checkpoint in the current directory and sample from it.

The script loads the tokenizer and configuration from ``root``, overrides the
configuration's ``max_seqlen`` for inference, loads the causal-LM weights onto
``cuda:0`` in bfloat16, and prints one sampled continuation of a short
nucleotide prompt.
"""

import json
import pathlib
import warnings

import torch
from transformers.models.auto.modeling_auto import AutoModelForCausalLM
from transformers.models.auto.tokenization_auto import AutoTokenizer

from configuration_evo2 import Evo2Config

# --- Configuration ---
root = pathlib.Path(".")

# Choose a reasonable max sequence length for your inference task.
# Needs to be >= prompt_length + max_new_tokens (checked below before generating).
INFERENCE_MAX_SEQLEN = 8192  # Or 4096, 16384, etc.

# Number of tokens to sample in the smoke test; counts against INFERENCE_MAX_SEQLEN.
MAX_NEW_TOKENS = 64

print("Loading tokenizer...")
tok = AutoTokenizer.from_pretrained(root, trust_remote_code=True)

print(f"Loading configuration and overriding max_seqlen to {INFERENCE_MAX_SEQLEN}...")
# Load the configuration object from the directory.
config = Evo2Config.from_pretrained(root, trust_remote_code=True)

# Override the max_seqlen in the loaded config object.
config.max_seqlen = INFERENCE_MAX_SEQLEN
# You might also want to ensure max_batch_size is appropriate if you change batching:
# config.max_batch_size = YOUR_BATCH_SIZE  # reportedly defaults to 1 in this config; verify

print("Loading model with modified config... (this takes ~30 s on first run)")
model = AutoModelForCausalLM.from_pretrained(
    root,
    config=config,  # Pass the modified config object
    torch_dtype=torch.bfloat16,  # Specify dtype explicitly
    device_map="cuda:0",  # Specify device map explicitly
    trust_remote_code=True,
    # Add quantization here if needed, e.g., load_in_8bit=True
)

# quick smoke-test
prompt = "ATGGCGAAAACGTGGCTCGTCCGGTAGGGATCTGGAAACAATTGTAGACAGTTCCGAGTTGTCAAGGGCCA"
tokens = tok(prompt, return_tensors="pt").to(model.device)

# Enforce the budget stated next to INFERENCE_MAX_SEQLEN up front, so an
# oversized request fails with a clear message instead of somewhere inside
# generation.
prompt_length = tokens["input_ids"].shape[-1]
if prompt_length + MAX_NEW_TOKENS > INFERENCE_MAX_SEQLEN:
    raise ValueError(
        f"prompt length ({prompt_length}) + max_new_tokens ({MAX_NEW_TOKENS}) "
        f"exceeds INFERENCE_MAX_SEQLEN ({INFERENCE_MAX_SEQLEN})"
    )

# Gradients are not needed for sampling.
with torch.no_grad():
    out = model.generate(
        input_ids=tokens["input_ids"],
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=0.8,
        do_sample=True,
    )

print("\n--- Generated sequence ---\n", tok.decode(out[0], skip_special_tokens=True))