# Trains the StockBPE tokenizer on stock_corpus.txt and verifies the result.
import io
import sys
import time
from pathlib import Path

from tokenizer import StockBPE

# Reconfigure stdout in place so non-ASCII status marks don't crash on
# consoles with a non-UTF-8 default encoding (notably Windows cp1252).
# reconfigure() keeps the original stream object and its buffering intact,
# unlike wrapping sys.stdout.buffer in a fresh TextIOWrapper.
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
def train_and_verify():
    """Train the StockBPE tokenizer on the stock corpus and verify it.

    Reads ``stock_corpus.txt``, trains a tokenizer with a target vocabulary
    of 5500 tokens, saves it, then checks the project requirements
    (vocabulary size > 5000, compression ratio >= 3.0), round-trips a few
    sample lines through encode/decode, and prints summary statistics.

    Returns:
        None. All results are reported on stdout; returns early if the
        corpus file is missing.

    Raises:
        ValueError: if an encode/decode round trip does not reproduce the
            original text exactly.
    """
    data_path = Path("stock_corpus.txt")
    if not data_path.exists():
        print("❌ Error: stock_corpus.txt not found!")
        print("Please run 'python download_stock_data.py' first")
        return

    # Report corpus size up front so slow training runs are explainable.
    file_size_mb = data_path.stat().st_size / (1024 * 1024)
    print(f"File size: {file_size_mb:.2f} MB")

    print(f"Reading data from {data_path}...")
    text = data_path.read_text(encoding="utf-8")
    print(f"Data size: {len(text):,} characters")
    print(f"Sample data:\n{text[:200]}...\n")

    # Train; 5500 leaves headroom over the required minimum of 5000.
    tokenizer = StockBPE()
    vocab_size = 5500
    print(f"Training tokenizer with vocab size {vocab_size}...")
    print("This should take 2-5 minutes...\n")
    start_time = time.time()
    tokenizer.train(text, vocab_size)
    elapsed = time.time() - start_time
    print(f"\nTraining took {elapsed:.2f} seconds ({elapsed/60:.2f} minutes)")

    print("\nSaving tokenizer...")
    tokenizer.save("stock_bpe")
    print("✓ Saved to: stock_bpe.merges and stock_bpe.vocab")

    # --- Verification ----------------------------------------------------
    print("\n" + "=" * 70)
    print("VERIFICATION RESULTS")
    print("=" * 70)
    ratio = tokenizer.calculate_compression_ratio(text)
    print(f"Compression Ratio: {ratio:.2f}")
    vocab_len = len(tokenizer.vocab)
    print(f"Vocabulary Size: {vocab_len}")

    # Evaluate each requirement once and print a single pass/fail line per
    # requirement (the original duplicated the print in both if/else arms).
    vocab_ok = vocab_len > 5000
    ratio_ok = ratio >= 3
    print("\n" + "=" * 70)
    if vocab_ok and ratio_ok:
        print("✅ SUCCESS: Requirements met!")
    else:
        print("⚠️ WARNING: Requirements NOT fully met.")
    print(f"  {'✓' if vocab_ok else '✗'} Vocabulary size: {vocab_len} (required: > 5000)")
    print(f"  {'✓' if ratio_ok else '✗'} Compression ratio: {ratio:.2f} (required: >= 3.0)")
    print("=" * 70)

    # Round-trip the first few non-empty corpus lines through the tokenizer.
    print("\nTesting encoding/decoding...")
    for sample_text in text.split("\n")[:3]:
        if not sample_text.strip():
            continue
        encoded = tokenizer.encode(sample_text)
        decoded = tokenizer.decode(encoded)
        print(f"\nOriginal: {sample_text}")
        print(f"Encoded: {encoded[:20]}... ({len(encoded)} tokens)")
        print(f"Decoded: {decoded}")
        print(f"Match: {'✓' if sample_text == decoded else '✗'}")
        # Raise explicitly (not assert) so the check survives `python -O`.
        if sample_text != decoded:
            raise ValueError("Encoding/decoding mismatch!")
    print("\n✅ All encoding/decoding tests passed!")

    # --- Statistics ------------------------------------------------------
    # Counting separators is equivalent to len(text.split("\n")) without
    # materializing the list or needing the chr(10) f-string workaround.
    line_count = text.count("\n") + 1
    print("\n" + "=" * 70)
    print("STATISTICS")
    print("=" * 70)
    print(f"Total characters: {len(text):,}")
    print(f"Total lines: {line_count:,}")
    print(f"Vocabulary size: {vocab_len:,}")
    print(f"Compression ratio: {ratio:.2f}x")
    print(f"Original size: {len(text.encode('utf-8')):,} bytes")
    print(f"Compressed size: {len(tokenizer.encode(text)):,} tokens")
    print("=" * 70)
# Entry point: run the training/verification pipeline when executed as a
# script; importing this module triggers nothing.
if __name__ == "__main__":
    train_and_verify()