| |
| """ |
| Verification script to check [SCD] token implementation. |
| Run this after preparing data with prepare_scd.sh |
| """ |
|
|
| import gzip |
| import json |
| import sys |
| from pathlib import Path |
| from collections import Counter |
|
|
|
|
def check_file_exists(path: Path) -> bool:
    """Report whether *path* exists, printing one status line either way."""
    exists = path.exists()
    status = "β Found" if exists else "β Missing"
    print(f"{status}: {path}")
    return exists
|
|
|
|
def verify_scd_in_supervisions(manifest_path: Path) -> dict:
    """Count [SCD] tokens in a gzipped JSONL supervision manifest.

    Prints a human-readable summary and returns a dict of aggregate
    statistics (totals, percentage of utterances with [SCD], average
    per utterance, and the per-utterance count distribution).
    """
    print(f"\nChecking [SCD] tokens in {manifest_path.name}...")

    total_sups = 0
    total_scd = 0
    # Per-utterance [SCD] counts, recorded only for utterances that have any.
    per_utt_counts = []

    with gzip.open(manifest_path, "rt") as f:
        for raw in f:
            if not raw.strip():
                continue
            record = json.loads(raw)
            total_sups += 1
            n_scd = record["text"].count("[SCD]")
            total_scd += n_scd
            if n_scd:
                per_utt_counts.append(n_scd)

    # Guard against empty manifests to avoid ZeroDivisionError.
    if total_sups:
        pct_with_scd = len(per_utt_counts) / total_sups * 100
        avg_scd = total_scd / total_sups
    else:
        pct_with_scd = 0
        avg_scd = 0

    stats = {
        "total_supervisions": total_sups,
        "total_scd": total_scd,
        "supervisions_with_scd": len(per_utt_counts),
        "pct_with_scd": pct_with_scd,
        "avg_scd_per_utt": avg_scd,
        "scd_distribution": Counter(per_utt_counts),
    }

    print(f" Total supervisions: {total_sups}")
    print(f" Total [SCD] tokens: {total_scd}")
    print(f" Supervisions with [SCD]: {len(per_utt_counts)} ({pct_with_scd:.1f}%)")
    print(f" Average [SCD] per utterance: {avg_scd:.2f}")

    if per_utt_counts:
        print(f" [SCD] distribution:")
        for n_scd, freq in sorted(stats["scd_distribution"].items()):
            print(f" {n_scd} [SCD]: {freq} utterances")

    return stats
|
|
|
|
def verify_bpe_model(lang_dir: Path):
    """Verify that [SCD] is a single standalone token in the BPE vocabulary.

    Scans ``<lang_dir>/tokens.txt`` and compares the token *column* exactly,
    rather than doing a substring search, so a token that merely contains
    the text "[SCD]" (e.g. "x[SCD]y") is not falsely accepted.

    Returns:
        True if [SCD] is present as its own token, False if tokens.txt is
        missing or the token is absent (the original could implicitly return
        None on an edge case; this version always returns a bool).
    """
    print(f"\nChecking BPE model in {lang_dir}...")

    tokens_file = lang_dir / "tokens.txt"
    if not tokens_file.exists():
        print(f"β tokens.txt not found in {lang_dir}")
        return False

    # Stream the file line by line instead of reading it all into memory.
    with open(tokens_file, "r") as f:
        for line in f:
            # tokens.txt lines are "<token> <id>"; match the token field exactly.
            fields = line.split()
            if fields and fields[0] == "[SCD]":
                print(f"β [SCD] is a single token: {line.strip()}")
                return True

    print("β [SCD] not found in tokens.txt")
    return False
|
|
|
|
def verify_sentencepiece_encoding(bpe_model_path: Path):
    """Encode a test sentence and confirm [SCD] survives as one BPE piece.

    Returns True/False for the check result, or None when the optional
    sentencepiece dependency is not installed.
    """
    print(f"\nVerifying SentencePiece encoding...")

    try:
        import sentencepiece as spm

        processor = spm.SentencePieceProcessor()
        processor.load(str(bpe_model_path))

        test_text = "hello this is geico [SCD] my name is alexa"
        pieces = processor.encode(test_text, out_type=str)

        print(f" Test text: {test_text}")
        print(f" Encoded tokens: {pieces}")

        if "[SCD]" not in pieces:
            print(f"β [SCD] was split or not found in encoding")
            return False
        print(f"β [SCD] is correctly encoded as a single token")
        return True

    except ImportError:
        # sentencepiece is optional; skip rather than fail the whole run.
        print(" β sentencepiece not installed, skipping encoding test")
        return None
    except Exception as e:
        print(f"β Error testing SentencePiece: {e}")
        return False
|
|
|
|
def show_samples(manifest_path: Path, num_samples: int = 3):
    """Print up to *num_samples* example utterances containing [SCD]."""
    print(f"\nSample utterances with [SCD] tokens from {manifest_path.name}:")

    picked = []
    with gzip.open(manifest_path, "rt") as f:
        for raw in f:
            if not raw.strip():
                continue
            record = json.loads(raw)
            if "[SCD]" not in record["text"]:
                continue
            # Append before the size check (matches original semantics,
            # which collects at least one sample even for num_samples <= 0).
            picked.append(record)
            if len(picked) >= num_samples:
                break

    for idx, record in enumerate(picked, 1):
        print(f"\n--- Sample {idx} ---")
        print(f"ID: {record['id']}")
        print(f"Duration: {record['duration']:.2f}s")
        print(f"[SCD] count: {record['text'].count('[SCD]')}")
        if "custom" in record:
            print(f"Speakers: {record['custom'].get('speakers', [])}")
            print(f"Merged from: {record['custom'].get('num_segments', 1)} segments")
        print(f"Text: {record['text'][:200]}...")
|
|
|
|
def main():
    """Run every [SCD] verification stage and print a final pass/fail summary.

    Stages: (1) required directories exist, (2) per-split supervision
    manifests exist + train-manifest [SCD] statistics, (3) BPE language
    dirs contain the [SCD] token, then sample utterances and a summary.
    """
    banner = "=" * 70
    print(banner)
    print("AMI [SCD] Token Implementation Verification")
    print(banner)

    base_dir = Path("data")
    all_checks_passed = True

    # Stage 1: required output directories.
    print("\n1. Checking directory structure...")
    for sub in ("manifests_scd", "cuts_scd", "lm_scd"):
        if not check_file_exists(base_dir / sub):
            all_checks_passed = False

    # Stage 2: per-split supervision manifests.
    print("\n2. Checking manifest files...")
    manifest_dir = base_dir / "manifests_scd"
    for split in ("train", "dev", "test"):
        if not check_file_exists(manifest_dir / f"ami-ihm_supervisions_{split}_scd.jsonl.gz"):
            all_checks_passed = False

    train_manifest = manifest_dir / "ami-ihm_supervisions_train_scd.jsonl.gz"
    if train_manifest.exists():
        stats = verify_scd_in_supervisions(train_manifest)

        # Heuristic sanity bounds on how often [SCD] should appear.
        if stats["avg_scd_per_utt"] < 0.5:
            print("β Warning: Low [SCD] frequency. Consider adjusting merging parameters.")
        elif stats["avg_scd_per_utt"] > 5.0:
            print("β Warning: High [SCD] frequency. Consider increasing max_duration.")

        if stats["pct_with_scd"] < 20:
            print("β Warning: Very few utterances contain [SCD]. Check merging logic.")

    # Stage 3: BPE language directories (500- and 1000-unit vocabularies).
    print("\n3. Checking BPE model...")
    bpe_found = False
    for vocab_size in (500, 1000):
        lang_dir = base_dir / f"lang_bpe_{vocab_size}_scd"
        if not lang_dir.exists():
            continue
        bpe_found = True
        verify_bpe_model(lang_dir)

        bpe_model_path = lang_dir / "bpe.model"
        if bpe_model_path.exists():
            verify_sentencepiece_encoding(bpe_model_path)

    if not bpe_found:
        print("β No BPE model with [SCD] found. Run prepare_scd.sh stage 6.")
        all_checks_passed = False

    if train_manifest.exists():
        show_samples(train_manifest, num_samples=3)

    # Final summary.
    print("\n" + banner)
    if all_checks_passed:
        print("β All checks passed! Your [SCD] setup is ready.")
        print("\nNext steps:")
        print("1. Review the samples above to ensure [SCD] placement looks correct")
        print("2. Update your training script to use data/manifests_scd/ and data/lang_bpe_*_scd/")
        print("3. Start training!")
    else:
        print("β Some checks failed. Please:")
        print("1. Run: ./prepare_scd.sh --stage 0 --stop-stage 6")
        print("2. Check for any errors in the preparation stages")
        print("3. Run this verification script again")
    print(banner)
|
|
|
|
# Entry point: run all verification stages when executed as a script.
if __name__ == "__main__":
    main()
|
|