ranjit-task-logs-analysis / egs /ami /ASR /verify_scd_setup.py
odg123's picture
Upload icefall experiment results and logs
d596074 verified
#!/usr/bin/env python3
"""
Verification script to check [SCD] token implementation.
Run this after preparing data with prepare_scd.sh
"""
import gzip
import json
import sys
from pathlib import Path
from collections import Counter
def check_file_exists(path: Path) -> bool:
    """Report whether *path* exists and print a one-line status for it.

    Despite the name, any filesystem entry counts: ``Path.exists`` is true
    for directories as well as regular files.

    Args:
        path: Filesystem location to probe.

    Returns:
        True if the path exists, False otherwise.
    """
    if path.exists():
        print(f"✓ Found: {path}")
        return True
    print(f"✗ Missing: {path}")
    return False
def verify_scd_in_supervisions(manifest_path: Path) -> dict:
    """Count ``[SCD]`` markers in a gzipped JSONL supervision manifest.

    Every non-blank line is parsed as one JSON object and its ``"text"``
    field is scanned for the literal substring ``[SCD]``. A summary is
    printed to stdout and also returned.

    Args:
        manifest_path: Path to a gzip-compressed, line-delimited JSON file.

    Returns:
        A dict with the keys:
            total_supervisions: number of non-blank lines parsed.
            total_scd: total ``[SCD]`` occurrences over all lines.
            supervisions_with_scd: lines containing at least one ``[SCD]``.
            pct_with_scd: ``supervisions_with_scd`` as a percentage of
                ``total_supervisions`` (0 when the manifest is empty).
            avg_scd_per_utt: ``total_scd / total_supervisions`` (0 when the
                manifest is empty).
            scd_distribution: ``Counter`` mapping "[SCD] per line" to the
                number of lines with that count (zero counts excluded).

    Raises:
        OSError: if the manifest cannot be opened or is not valid gzip.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    print(f"\nChecking [SCD] tokens in {manifest_path.name}...")

    total_sups = 0
    scd_count = 0
    scd_distribution = Counter()

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with gzip.open(manifest_path, "rt", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            sup = json.loads(line)
            total_sups += 1
            # A supervision whose "text" is absent or null has no transcript
            # and therefore no [SCD]; treat it as empty instead of crashing.
            count = (sup.get("text") or "").count("[SCD]")
            scd_count += count
            if count > 0:
                scd_distribution[count] += 1

    sups_with_scd = sum(scd_distribution.values())
    pct_with_scd = (sups_with_scd / total_sups * 100) if total_sups > 0 else 0
    avg_scd = (scd_count / total_sups) if total_sups > 0 else 0

    stats = {
        "total_supervisions": total_sups,
        "total_scd": scd_count,
        "supervisions_with_scd": sups_with_scd,
        "pct_with_scd": pct_with_scd,
        "avg_scd_per_utt": avg_scd,
        "scd_distribution": scd_distribution,
    }

    print(f" Total supervisions: {total_sups}")
    print(f" Total [SCD] tokens: {scd_count}")
    print(f" Supervisions with [SCD]: {sups_with_scd} ({pct_with_scd:.1f}%)")
    print(f" Average [SCD] per utterance: {avg_scd:.2f}")
    if scd_distribution:
        print(" [SCD] distribution:")
        for count, freq in sorted(scd_distribution.items()):
            print(f" {count} [SCD]: {freq} utterances")
    return stats
def verify_bpe_model(lang_dir: Path) -> bool:
    """Check that ``[SCD]`` is listed as a token of its own in ``tokens.txt``.

    Args:
        lang_dir: Directory expected to contain ``tokens.txt``.

    Returns:
        True if some line of ``tokens.txt`` has ``[SCD]`` as its token,
        False if the file is missing or no such line exists. A status line
        is printed either way.
    """
    print(f"\nChecking BPE model in {lang_dir}...")

    tokens_file = lang_dir / "tokens.txt"
    if not tokens_file.exists():
        print(f"✗ tokens.txt not found in {lang_dir}")
        return False

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(tokens_file, "r", encoding="utf-8") as f:
        # Assumes each line is "<token> <id>" separated by whitespace; the
        # first field is taken as the token. Requiring an exact match (not a
        # substring hit) keeps a longer piece such as "[SCD]S" from being
        # mistaken for the standalone token.
        matches = [
            line.strip() for line in f if line.split()[:1] == ["[SCD]"]
        ]

    if not matches:
        print("✗ [SCD] not found in tokens.txt")
        return False

    for entry in matches:
        print(f"✓ [SCD] is a single token: {entry}")
    return True
def verify_sentencepiece_encoding(bpe_model_path: Path):
    """Encode a probe sentence and check ``[SCD]`` survives as one piece.

    Args:
        bpe_model_path: Path to the SentencePiece model file to load.

    Returns:
        True if ``"[SCD]"`` appears as an element of the encoded piece list,
        False if it does not or if loading/encoding raised, and None if the
        ``sentencepiece`` package is not installed (check skipped).
    """
    print("\nVerifying SentencePiece encoding...")

    # sentencepiece is an optional dependency: its absence means "skipped",
    # not "failed", so keep this guard separate from the model errors below.
    try:
        import sentencepiece as spm
    except ImportError:
        print(" ⚠ sentencepiece not installed, skipping encoding test")
        return None

    test_text = "hello this is geico [SCD] my name is alexa"
    try:
        sp = spm.SentencePieceProcessor()
        sp.load(str(bpe_model_path))
        encoded = sp.encode(test_text, out_type=str)
    except Exception as e:
        # Deliberately broad: this is a diagnostic script and any failure to
        # load or run the model should be reported, not raised.
        print(f"✗ Error testing SentencePiece: {e}")
        return False

    print(f" Test text: {test_text}")
    print(f" Encoded tokens: {encoded}")

    # List membership is an exact comparison, so a piece that merely
    # contains "[SCD]" does not count.
    if "[SCD]" in encoded:
        print("✓ [SCD] is correctly encoded as a single token")
        return True
    print("✗ [SCD] was split or not found in encoding")
    return False
def show_samples(manifest_path: Path, num_samples: int = 3):
    """Print the first few supervisions whose text contains ``[SCD]``.

    The manifest is read in order and scanning stops as soon as
    ``num_samples`` matching entries have been collected.

    Args:
        manifest_path: Path to a gzip-compressed, line-delimited JSON file.
        num_samples: Maximum number of matching supervisions to print; zero
            or a negative value prints none.

    Raises:
        OSError: if the manifest cannot be opened or is not valid gzip.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a selected supervision lacks ``"id"`` or ``"duration"``.
    """
    print(f"\nSample utterances with [SCD] tokens from {manifest_path.name}:")

    max_chars = 200  # length of the text preview printed per sample

    samples = []
    with gzip.open(manifest_path, "rt", encoding="utf-8") as f:
        for line in f:
            # Test the limit before reading further so num_samples <= 0
            # yields no samples at all.
            if len(samples) >= num_samples:
                break
            if not line.strip():
                continue
            sup = json.loads(line)
            # Entries with a missing or null "text" cannot contain [SCD].
            if "[SCD]" in (sup.get("text") or ""):
                samples.append(sup)

    for i, sup in enumerate(samples, 1):
        text = sup["text"]
        print(f"\n--- Sample {i} ---")
        print(f"ID: {sup['id']}")
        print(f"Duration: {sup['duration']:.2f}s")
        print(f"[SCD] count: {text.count('[SCD]')}")
        custom = sup.get("custom")
        if custom is not None:  # a JSON null here has no fields to show
            print(f"Speakers: {custom.get('speakers', [])}")
            print(f"Merged from: {custom.get('num_segments', 1)} segments")
        # Only mark the preview as truncated when something was cut off.
        ellipsis = "..." if len(text) > max_chars else ""
        print(f"Text: {text[:max_chars]}{ellipsis}")
def main():
    """Run every [SCD] setup check under ``./data`` and print a summary.

    The checks, in order:
      1. the expected ``*_scd`` directories exist;
      2. the train/dev/test supervision manifests exist, and the train
         manifest's [SCD] statistics are printed (with advisory warnings
         that do not affect the final verdict);
      3. at least one ``lang_bpe_*_scd`` directory exists, its ``tokens.txt``
         lists [SCD], and its ``bpe.model`` (when present) encodes [SCD] as
         a single piece;
      4. a few sample utterances from the train manifest are shown.

    ``data`` is a relative path, so the script must be run from the
    directory that contains it. Nothing is returned; the verdict is printed.
    """
    print("=" * 70)
    print("AMI [SCD] Token Implementation Verification")
    print("=" * 70)

    base_dir = Path("data")
    all_checks_passed = True

    # Check directory structure
    print("\n1. Checking directory structure...")
    required_dirs = [
        base_dir / "manifests_scd",
        base_dir / "cuts_scd",
        base_dir / "lm_scd",
    ]
    for dir_path in required_dirs:
        if not check_file_exists(dir_path):
            all_checks_passed = False

    # Check manifest files
    print("\n2. Checking manifest files...")
    manifest_files = [
        base_dir / "manifests_scd" / "ami-ihm_supervisions_train_scd.jsonl.gz",
        base_dir / "manifests_scd" / "ami-ihm_supervisions_dev_scd.jsonl.gz",
        base_dir / "manifests_scd" / "ami-ihm_supervisions_test_scd.jsonl.gz",
    ]
    for manifest_path in manifest_files:
        if not check_file_exists(manifest_path):
            all_checks_passed = False

    # Verify [SCD] tokens in train set
    train_manifest = base_dir / "manifests_scd" / "ami-ihm_supervisions_train_scd.jsonl.gz"
    if train_manifest.exists():
        stats = verify_scd_in_supervisions(train_manifest)
        # Check if [SCD] distribution is reasonable. These are advisory
        # warnings only and do not fail the run.
        if stats["avg_scd_per_utt"] < 0.5:
            print("⚠ Warning: Low [SCD] frequency. Consider adjusting merging parameters.")
        elif stats["avg_scd_per_utt"] > 5.0:
            print("⚠ Warning: High [SCD] frequency. Consider increasing max_duration.")
        if stats["pct_with_scd"] < 20:
            print("⚠ Warning: Very few utterances contain [SCD]. Check merging logic.")

    # Check BPE model
    print("\n3. Checking BPE model...")
    lang_dirs = [
        base_dir / "lang_bpe_500_scd",
        base_dir / "lang_bpe_1000_scd",
    ]
    bpe_found = False
    for lang_dir in lang_dirs:
        if not lang_dir.exists():
            continue
        bpe_found = True
        # A vocabulary without [SCD] must fail the run; otherwise the summary
        # would report success for an unusable setup.
        if not verify_bpe_model(lang_dir):
            all_checks_passed = False
        bpe_model_path = lang_dir / "bpe.model"
        if bpe_model_path.exists():
            # None means the check was skipped (sentencepiece not installed);
            # only an explicit False counts as a failure.
            if verify_sentencepiece_encoding(bpe_model_path) is False:
                all_checks_passed = False
    if not bpe_found:
        print("✗ No BPE model with [SCD] found. Run prepare_scd.sh stage 6.")
        all_checks_passed = False

    # Show samples
    if train_manifest.exists():
        show_samples(train_manifest, num_samples=3)

    # Summary
    print("\n" + "=" * 70)
    if all_checks_passed:
        print("✓ All checks passed! Your [SCD] setup is ready.")
        print("\nNext steps:")
        print("1. Review the samples above to ensure [SCD] placement looks correct")
        print("2. Update your training script to use data/manifests_scd/ and data/lang_bpe_*_scd/")
        print("3. Start training!")
    else:
        print("✗ Some checks failed. Please:")
        print("1. Run: ./prepare_scd.sh --stage 0 --stop-stage 6")
        print("2. Check for any errors in the preparation stages")
        print("3. Run this verification script again")
    print("=" * 70)
# Run the verification only when executed as a script, not when imported.
if __name__ == "__main__":
    main()