# Quillan-Ronin / inference_tester.py
# (uploaded via upload-large-folder tool, commit 1c70d34 verified)
#!/usr/bin/env python3
"""
Inference Testing Script for Trained Quillan Model
Tests the trained model on provided content to evaluate performance
"""
import torch
import torch.nn as nn
from torch.nn import functional as F
import os
import json
import random
import pickle
import time
from collections import defaultdict
class OptimizedBigramLanguageModel(nn.Module):
    """Decoder-only transformer language model matching the training architecture.

    Despite the legacy name, this is not a bigram model: it stacks
    ``nn.TransformerDecoderLayer`` blocks over learned token + position
    embeddings, with weight tying between the input embedding and the LM head.
    """
    def __init__(self, vocab_size, n_embd=512, n_head=8, n_layer=6, dropout=0.1,
                 block_size=1024):
        """
        Args:
            vocab_size: number of tokens in the vocabulary.
            n_embd: embedding / model width.
            n_head: attention heads per layer (must divide n_embd).
            n_layer: number of decoder layers.
            dropout: dropout probability inside each layer.
            block_size: maximum sequence length the position table supports
                (default 1024, matching the original hard-coded value).
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.block_size = block_size
        self.token_embedding = nn.Embedding(vocab_size, n_embd)
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.layers = nn.ModuleList([
            nn.TransformerDecoderLayer(
                d_model=n_embd,
                nhead=n_head,
                dim_feedforward=n_embd * 4,
                dropout=dropout,
                batch_first=True,
                norm_first=True
            ) for _ in range(n_layer)
        ])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size, bias=False)
        # Weight tying: the input embedding shares parameters with the output head.
        self.token_embedding.weight = self.lm_head.weight
        self.apply(self._init_weights)
    def _init_weights(self, module):
        """GPT-style init: N(0, 0.02) for linear/embedding weights, zero biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    def forward(self, idx, targets=None):
        """Compute next-token logits (and loss, when ``targets`` is given).

        Args:
            idx: (B, T) long tensor of token ids, T <= block_size.
            targets: optional (B, T) long tensor of next-token ids.

        Returns:
            (logits, loss): logits is (B, T, vocab_size); loss is None when
            ``targets`` is None.
        """
        B, T = idx.shape
        tok_emb = self.token_embedding(idx)
        pos_emb = self.position_embedding(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        # Build the causal mask directly on the target device with bool dtype
        # instead of allocating on CPU and copying (original did .bool().to()).
        causal_mask = torch.triu(
            torch.ones(T, T, dtype=torch.bool, device=idx.device), diagonal=1
        )
        for layer in self.layers:
            # Self-attention only: the sequence is passed as its own "memory".
            x = layer(x, x, tgt_mask=causal_mask)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(
                logits.view(-1, self.vocab_size),
                targets.view(-1),
                # Presumably the padding token id used during training — the
                # tokenizer appears to reserve the last 4 vocab slots. TODO confirm.
                ignore_index=self.vocab_size - 4
            )
        return logits, loss
    def generate(self, idx, max_new_tokens, temperature=0.8):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Args:
            idx: (B, T) long tensor of seed token ids.
            max_new_tokens: number of tokens to append.
            temperature: softmax temperature (> 0); lower is greedier.

        Returns:
            (B, T + max_new_tokens) tensor including the seed tokens.
        """
        for _ in range(max_new_tokens):
            # NOTE(review): context is cropped to 512 even though the position
            # table holds block_size entries — presumably the training window.
            idx_cond = idx[:, -512:]
            logits, _ = self(idx_cond)
            # Only the final position's logits matter for the next token.
            logits = logits[:, -1, :] / temperature
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx
class ModelTester:
    """Load a trained model + tokenizer and run qualitative and speed evaluations.

    On any load failure, ``self.model`` and ``self.tokenizer`` stay ``None`` and
    every test method returns early with a message instead of raising.
    """
    def __init__(self, model_path="optimized_quillan_model.pt", tokenizer_path="optimized_tokenizer.pkl"):
        self.model = None
        self.tokenizer = None
        self.device = torch.device('cpu')
        self.load_model_and_tokenizer(model_path, tokenizer_path)
    def load_model_and_tokenizer(self, model_path, tokenizer_path):
        """Load the trained model and tokenizer; on failure leave both as None."""
        try:
            # SECURITY: pickle.load executes arbitrary code during
            # deserialization — only load tokenizer files from trusted sources.
            with open(tokenizer_path, 'rb') as f:
                self.tokenizer = pickle.load(f)
            print(f"✅ Loaded tokenizer with {len(self.tokenizer.vocab)} tokens")
            # Load model checkpoint; hyperparameters must match training.
            checkpoint = torch.load(model_path, map_location=self.device)
            self.model = OptimizedBigramLanguageModel(
                vocab_size=len(self.tokenizer.vocab),
                n_embd=512,
                n_head=8,
                n_layer=6
            )
            self.model.load_state_dict(checkpoint['model_state_dict'])
            self.model.to(self.device)
            self.model.eval()
            print("✅ Loaded trained model")
            # Bug fix: the original formatted checkpoint.get('loss', 'Unknown')
            # with :.4f, which raises ValueError on the string fallback and made
            # a successfully loaded model look like a load failure.
            loss_val = checkpoint.get('loss')
            if isinstance(loss_val, (int, float)):
                print(f"   Best loss achieved: {loss_val:.4f}")
            else:
                print("   Best loss achieved: Unknown")
        except Exception as e:
            print(f"❌ Failed to load model/tokenizer: {e}")
            self.model = None
            self.tokenizer = None
    def test_text_generation(self, prompt="", max_length=100, temperature=0.8):
        """Generate up to ``max_length`` new tokens from ``prompt`` and return the text."""
        if not self.model or not self.tokenizer:
            return "❌ Model not loaded"
        print(f"\n🔍 Testing generation from prompt: '{prompt}'")
        # Encode prompt (fall back to a short padding context when empty).
        if prompt:
            tokens = self.tokenizer.encode(prompt, max_length=256)
        else:
            tokens = [0] * 10  # Start with padding
        context = torch.tensor([tokens], dtype=torch.long).to(self.device)
        start_time = time.time()
        with torch.no_grad():
            generated = self.model.generate(context, max_length, temperature)
        end_time = time.time()
        generated_text = self.tokenizer.decode(generated[0].tolist())
        # Bug fix: the original line was a malformed print(".2f" ...) artifact.
        print(f"   Generation took {end_time - start_time:.2f}s")
        print(f"   Generated {len(generated_text)} characters")
        return generated_text
    def test_content_types(self):
        """Test model on different content types from the dataset."""
        if not self.model or not self.tokenizer:
            print("❌ Model not loaded")
            return
        print("\n🧪 Testing Model on Different Content Types")
        # Test prompts from different content types
        test_prompts = {
            'narrative': "The AI system began to understand",
            'creative': "In the shadows of the digital realm",
            'factual': "The quantum computer processes information",
            'philosophical': "The meaning of consciousness is",
            'technical': "The neural network architecture consists of"
        }
        results = {}
        for category, prompt in test_prompts.items():
            print(f"\n--- Testing {category.upper()} Content ---")
            generated = self.test_text_generation(prompt, max_length=150, temperature=0.7)
            results[category] = generated
            # Basic quality assessment: lexical diversity as a cheap proxy.
            word_count = len(generated.split())
            unique_words = len(set(generated.lower().split()))
            diversity_ratio = unique_words / word_count if word_count > 0 else 0
            print(f"   Quality metrics: {word_count} words, {unique_words} unique, {diversity_ratio:.2f} diversity")
        return results
    def benchmark_generation_speed(self):
        """Benchmark generation speed over a few short prompts."""
        if not self.model or not self.tokenizer:
            print("❌ Model not loaded")
            return
        print("\n⚡ Benchmarking Generation Speed")
        test_prompts = [
            "Hello world",
            "The meaning of life is",
            "Artificial intelligence will",
            "In the future, humans will"
        ]
        total_time = 0
        total_tokens = 0
        for prompt in test_prompts:
            start_time = time.time()
            generated = self.test_text_generation(prompt, max_length=50, temperature=1.0)
            end_time = time.time()
            tokens_generated = len(self.tokenizer.encode(generated)) - len(self.tokenizer.encode(prompt))
            time_taken = end_time - start_time
            total_time += time_taken
            total_tokens += tokens_generated
            # Guard against a zero elapsed time on very fast runs.
            rate = tokens_generated / time_taken if time_taken > 0 else 0.0
            print(f"   {tokens_generated} tokens in {time_taken:.2f}s ({rate:.1f} tokens/sec)")
        avg_speed = total_tokens / total_time if total_time > 0 else 0
        # Bug fix: the original line was a malformed print(".1f" artifact.
        print(f"   Average speed: {avg_speed:.1f} tokens/sec")
    def evaluate_coherence(self):
        """Evaluate text coherence with simple heuristics; return a 0-1 score."""
        if not self.model or not self.tokenizer:
            print("❌ Model not loaded")
            return
        print("\n🎯 Evaluating Text Coherence")
        # Generate multiple samples
        prompts = [
            "The future of AI",
            "Quantum computing",
            "Consciousness explained",
            "Digital immortality"
        ]
        coherence_scores = []
        for prompt in prompts:
            generated = self.test_text_generation(prompt, max_length=200, temperature=0.8)
            # Simple coherence metrics: average sentence length...
            sentences = generated.split('.')
            avg_sentence_length = sum(len(s.split()) for s in sentences) / len(sentences) if sentences else 0
            # ...and unique-word ratio (lower score for repetition).
            words = generated.lower().split()
            unique_ratio = len(set(words)) / len(words) if words else 0
            # Basic coherence score clamped to [0, 1].
            coherence = min(1.0, (avg_sentence_length / 15) * unique_ratio)
            coherence_scores.append(coherence)
            # Bug fix: the original line was a malformed print(".2f" artifact.
            print(f"   Coherence for '{prompt}': {coherence:.2f}")
        overall_coherence = sum(coherence_scores) / len(coherence_scores)
        print(f"   Overall coherence: {overall_coherence:.2f}")
        if overall_coherence > 0.6:
            print("   🌟 Excellent coherence - approaching Grok-level quality!")
        elif overall_coherence > 0.4:
            print("   👍 Good coherence - model is learning well")
        else:
            print("   📈 Improving coherence - more training needed")
        return overall_coherence
def main():
    """Run the full inference test suite against the saved model artifacts."""
    print("🚀 Starting Model Inference Testing...")
    tester = ModelTester()
    if not tester.model:
        print("❌ Could not load model. Please ensure training completed and saved the model.")
        return
    # Run comprehensive tests
    tester.test_content_types()
    tester.benchmark_generation_speed()
    coherence_score = tester.evaluate_coherence()
    # Bug fix: the original had an unterminated string literal and a malformed
    # print(".2f" artifact here; reconstructed as two valid prints.
    print("\n🎉 Inference Testing Complete!")
    print(f"Final coherence score: {coherence_score:.2f}")
    if coherence_score > 0.6:
        print("🏆 Your model is performing at advanced levels!")
    elif coherence_score > 0.4:
        print("📈 Good progress - continue training for better results")
    else:
        print("🔄 Model needs more training - current performance is developing")
# Entry-point guard: run the test suite only when executed as a script,
# so the module can be imported without side effects.
if __name__ == "__main__":
    main()