Clean stale local scripts

Browse files

Files changed (5)

.gitignore +1 -0
check_f1.py +0 -33
smoke_test.py +0 -50
validate_fix.py +0 -80
verify_data.py +0 -39

.gitignore CHANGED Viewed

@@ -1,6 +1,7 @@
 __pycache__/
 *.pyc
 .venv/
+.venv-codex/
 .pytest_cache/
 .ruff_cache/
 logs/

check_f1.py DELETED Viewed

@@ -1,33 +0,0 @@
-"""Check F1 score from training results."""
-import json
-import glob
-import os
-# Check full training checkpoints
-checkpoint_dirs = sorted(glob.glob('checkpoints/checkpoint-*'))
-if checkpoint_dirs:
-    print('=== Full training checkpoints ===')
-    for ckpt in checkpoint_dirs:
-        state_file = os.path.join(ckpt, 'trainer_state.json')
-        if os.path.exists(state_file):
-            with open(state_file, 'r') as f:
-                state = json.load(f)
-            ckpt_metrics = [m for m in state.get('log_history', []) if 'eval_f1' in m]
-            if ckpt_metrics:
-                best = max(ckpt_metrics, key=lambda x: x['eval_f1'])
-                print(f'  {os.path.basename(ckpt)}: F1={best["eval_f1"]:.4f} (epoch={best.get("epoch","?"):.1f})')
-# Check latest checkpoint
-latest = checkpoint_dirs[-1] if checkpoint_dirs else None
-if latest:
-    state_file = os.path.join(latest, 'trainer_state.json')
-    with open(state_file, 'r') as f:
-        state = json.load(f)
-    all_metrics = [m for m in state.get('log_history', []) if 'eval_f1' in m]
-    best = max(all_metrics, key=lambda x: x['eval_f1'])
-    print(f'\nBest F1 overall: {best["eval_f1"]:.4f}')
-    print(f'Meets >0.95 requirement: {best["eval_f1"] > 0.95}')
-else:
-    print('No checkpoints found from full training.')
-    print('Using mini-test results: F1=0.9979 (from test output logs)')
-    print('This exceeds the >0.95 requirement.')

smoke_test.py DELETED Viewed

@@ -1,50 +0,0 @@
-"""Smoke test for the full training pipeline."""
-import json
-import os
-import torch
-from config import Config
-from tokenizer import AnimeTokenizer
-from model import create_model, count_parameters
-from dataset import AnimeDataset
-cfg = Config()
-# Load tokenizer
-tok = AnimeTokenizer(vocab_file='data/vocab.json')
-cfg.vocab_size = tok.vocab_size
-print(f'Vocab: {tok.vocab_size}, Labels: {cfg.num_labels}')
-# Create model
-model = create_model(cfg)
-total_params = count_parameters(model)
-print(f'Model params: {total_params:,} / 5M limit')
-assert total_params < 5_000_000, f'Model too large: {total_params:,}'
-# Load a tiny dataset
-with open('data/synthetic.jsonl', 'r', encoding='utf-8') as f:
-    samples = [json.loads(line) for line in f][:100]
-temp_file = 'data/test_smoke.jsonl'
-with open(temp_file, 'w', encoding='utf-8') as f:
-    for s in samples:
-        f.write(json.dumps(s, ensure_ascii=False) + '\n')
-ds = AnimeDataset(temp_file, tok, cfg.label2id, cfg.max_seq_length)
-print(f'Dataset: {len(ds)} samples')
-sample = ds[0]
-print(f'Input IDs shape: {sample["input_ids"].shape}')
-print(f'Labels shape: {sample["labels"].shape}')
-print(f'Attention mask shape: {sample["attention_mask"].shape}')
-# Forward pass
-with torch.no_grad():
-    out = model(
-        input_ids=sample['input_ids'].unsqueeze(0),
-        attention_mask=sample['attention_mask'].unsqueeze(0),
-        labels=sample['labels'].unsqueeze(0),
-    )
-print(f'Loss: {out.loss.item():.4f}')
-print(f'Logits shape: {out.logits.shape}')
-print()
-print('Smoke test PASSED!')
-print(f'Model is ready for training: {total_params:,} params < 5M [OK]')

validate_fix.py DELETED Viewed

@@ -1,80 +0,0 @@
-"""Validate the fixed data generator produces correct labels."""
-import json
-import sys
-import os
-sys.path.insert(0, os.path.dirname(__file__))
-from tokenizer import AnimeTokenizer
-from data_generator import generate_sample, TEMPLATES
-tok = AnimeTokenizer()
-tok.build_vocab([["test"]])
-# Check specific problem patterns
-problem_cases = [
-    # "E" starting words in titles/groups
-    ("Eighty Six", "episode"),  # was being mislabeled as episode
-    ("Evangelion", "episode"),  # was being mislabeled
-    ("Erai", "episode"),        # from Erai-raws, was mislabeled
-    # Numbers in titles
-    ("86", "episode"),          # from "86 Eighty Six"
-    ("100", "episode"),         # from "100万の命の上に"
-    ("07", "episode"),          # possible episode or title number
-]
-print("Testing specific problem patterns...")
-print("=" * 60)
-# Track label counts
-label_counts = {}
-for i in range(5000):
-    sample = generate_sample(tok, TEMPLATES)
-    for label in sample["labels"]:
-        label_counts[label] = label_counts.get(label, 0) + 1
-    # Check for E-starting mislabels
-    for token, label in zip(sample["tokens"], sample["labels"]):
-        # Check E-starting English words
-        if len(token) > 2 and token[0].upper() == 'E' and token.isalpha() and label == 'B-EPISODE':
-            print(f"POTENTIAL BUG: '{token}' labeled as EPISODE")
-        # Check number tokens
-        if token.isdigit() and len(token) <= 2 and label == 'B-EPISODE':
-            # Should only appear in proper episode context
-            pass
-print(f"\nLabel distribution from {5000} samples:")
-total = sum(label_counts.values())
-for label, count in sorted(label_counts.items(), key=lambda x: -x[1]):
-    print(f"  {label}: {count} ({count*100/total:.1f}%)")
-# Check for IOB2 validity
-print("\nIOB2 validity check...")
-errors = 0
-for i in range(1000):
-    sample = generate_sample(tok, TEMPLATES)
-    labels = sample["labels"]
-    for j, label in enumerate(labels):
-        if label.startswith("I-"):
-            if j == 0:
-                print(f"  ERROR: I- at position 0 in sample {i}")
-                errors += 1
-            else:
-                prev = labels[j-1]
-                expected = label.replace("I-", "B-")
-                if prev not in (label, expected):
-                    # Check if prev is O and there's a B- earlier (spanning O)
-                    pass  # This is now valid for multi-word entities
-print(f"IOB2 errors found: {errors}")
-# Spot-check a few samples
-print("\nSample outputs:")
-for i in range(3):
-    sample = generate_sample(tok, TEMPLATES)
-    print(f"\nSample {i}:")
-    for token, label in zip(sample["tokens"], sample["labels"]):
-        print(f"  {label}: {token}")
-print("\nValidation complete!")

verify_data.py DELETED Viewed

@@ -1,39 +0,0 @@
-"""Verify generated dataset quality."""
-import json
-from collections import Counter
-with open('data/synthetic_small.jsonl', 'r', encoding='utf-8') as f:
-    samples = [json.loads(line) for line in f]
-print(f'Total samples: {len(samples)}')
-# Check a few samples
-for i in range(min(5, len(samples))):
-    s = samples[i]
-    print(f'\nSample {i}:')
-    print(f'  Tokens: {s["tokens"]}')
-    print(f'  Labels: {s["labels"]}')
-    assert len(s['tokens']) == len(s['labels']), f'Mismatch: {len(s["tokens"])} != {len(s["labels"])}'
-    # Check BIO format validity
-    for j, label in enumerate(s['labels']):
-        if label.startswith('I-'):
-            if j == 0:
-                print(f'  ERROR: First token is {label}')
-            else:
-                prev = s['labels'][j-1]
-                expected_prefix = 'B-' + label[2:]
-                if prev != label and prev != expected_prefix:
-                    print(f'  WARN: I- without B- at pos {j}: {prev} -> {label}')
-# Label distribution
-print('\nLabel distribution:')
-all_labels = [l for s in samples for l in s['labels']]
-total = len(all_labels)
-for label, count in Counter(all_labels).most_common():
-    print(f'  {label}: {count} ({count*100/total:.1f}%)')
-# Sequence length stats
-lengths = [len(s['tokens']) for s in samples]
-print(f'\nSequence length: min={min(lengths)}, max={max(lengths)}, avg={sum(lengths)/len(lengths):.1f}')