| |
| """ |
| Verification script to check [SCD] token implementation. |
| Run this after preparing data with prepare_scd.sh |
| """ |
|
|
| import gzip |
| import json |
| import sys |
| from pathlib import Path |
| from collections import Counter |
|
|
|
|
def check_file_exists(path: Path) -> bool:
    """Report whether *path* exists, printing one status line either way."""
    exists = path.exists()
    status = "β Found" if exists else "β Missing"
    print(f"{status}: {path}")
    return exists
|
|
|
|
def verify_scd_in_supervisions(manifest_path: Path) -> dict:
    """Count [SCD] tokens in a gzipped JSONL supervision manifest.

    Prints a human-readable summary and returns a dict of aggregate
    statistics (totals, percentage of utterances with [SCD], average
    per utterance, and the per-utterance count distribution).
    """
    print(f"\nChecking [SCD] tokens in {manifest_path.name}...")

    total_sups = 0
    total_scd = 0
    # Per-utterance [SCD] counts, recorded only for utterances that have any.
    per_utt_counts = []

    with gzip.open(manifest_path, "rt") as f:
        for raw in f:
            if not raw.strip():
                continue
            record = json.loads(raw)
            total_sups += 1
            n_scd = record["text"].count("[SCD]")
            total_scd += n_scd
            if n_scd:
                per_utt_counts.append(n_scd)

    # Guard against empty manifests to avoid ZeroDivisionError.
    if total_sups:
        pct_with_scd = len(per_utt_counts) / total_sups * 100
        avg_scd = total_scd / total_sups
    else:
        pct_with_scd = 0
        avg_scd = 0

    stats = {
        "total_supervisions": total_sups,
        "total_scd": total_scd,
        "supervisions_with_scd": len(per_utt_counts),
        "pct_with_scd": pct_with_scd,
        "avg_scd_per_utt": avg_scd,
        "scd_distribution": Counter(per_utt_counts),
    }

    print(f" Total supervisions: {total_sups}")
    print(f" Total [SCD] tokens: {total_scd}")
    print(f" Supervisions with [SCD]: {len(per_utt_counts)} ({pct_with_scd:.1f}%)")
    print(f" Average [SCD] per utterance: {avg_scd:.2f}")

    if per_utt_counts:
        print(f" [SCD] distribution:")
        for n_scd, freq in sorted(stats["scd_distribution"].items()):
            print(f" {n_scd} [SCD]: {freq} utterances")

    return stats
|
|
|
|
def verify_bpe_model(lang_dir: Path):
    """Verify that [SCD] is a single standalone token in the BPE vocabulary.

    Scans ``<lang_dir>/tokens.txt`` and compares the token *column* exactly,
    rather than doing a substring search, so a token that merely contains
    the text "[SCD]" (e.g. "x[SCD]y") is not falsely accepted.

    Returns:
        True if [SCD] is present as its own token, False if tokens.txt is
        missing or the token is absent (the original could implicitly return
        None on an edge case; this version always returns a bool).
    """
    print(f"\nChecking BPE model in {lang_dir}...")

    tokens_file = lang_dir / "tokens.txt"
    if not tokens_file.exists():
        print(f"β tokens.txt not found in {lang_dir}")
        return False

    # Stream the file line by line instead of reading it all into memory.
    with open(tokens_file, "r") as f:
        for line in f:
            # tokens.txt lines are "<token> <id>"; match the token field exactly.
            fields = line.split()
            if fields and fields[0] == "[SCD]":
                print(f"β [SCD] is a single token: {line.strip()}")
                return True

    print("β [SCD] not found in tokens.txt")
    return False
|
|
|
|
def verify_sentencepiece_encoding(bpe_model_path: Path):
    """Encode a test sentence and confirm [SCD] survives as one BPE piece.

    Returns True/False for the check result, or None when the optional
    sentencepiece dependency is not installed.
    """
    print(f"\nVerifying SentencePiece encoding...")

    try:
        import sentencepiece as spm

        processor = spm.SentencePieceProcessor()
        processor.load(str(bpe_model_path))

        test_text = "hello this is geico [SCD] my name is alexa"
        pieces = processor.encode(test_text, out_type=str)

        print(f" Test text: {test_text}")
        print(f" Encoded tokens: {pieces}")

        if "[SCD]" not in pieces:
            print(f"β [SCD] was split or not found in encoding")
            return False
        print(f"β [SCD] is correctly encoded as a single token")
        return True

    except ImportError:
        # sentencepiece is optional; skip rather than fail the whole run.
        print(" β sentencepiece not installed, skipping encoding test")
        return None
    except Exception as e:
        print(f"β Error testing SentencePiece: {e}")
        return False
|
|
|
|
def show_samples(manifest_path: Path, num_samples: int = 3):
    """Print up to *num_samples* example utterances containing [SCD]."""
    print(f"\nSample utterances with [SCD] tokens from {manifest_path.name}:")

    picked = []
    with gzip.open(manifest_path, "rt") as f:
        for raw in f:
            if not raw.strip():
                continue
            record = json.loads(raw)
            if "[SCD]" not in record["text"]:
                continue
            # Append before the size check (matches original semantics,
            # which collects at least one sample even for num_samples <= 0).
            picked.append(record)
            if len(picked) >= num_samples:
                break

    for idx, record in enumerate(picked, 1):
        print(f"\n--- Sample {idx} ---")
        print(f"ID: {record['id']}")
        print(f"Duration: {record['duration']:.2f}s")
        print(f"[SCD] count: {record['text'].count('[SCD]')}")
        if "custom" in record:
            print(f"Speakers: {record['custom'].get('speakers', [])}")
            print(f"Merged from: {record['custom'].get('num_segments', 1)} segments")
        print(f"Text: {record['text'][:200]}...")
|
|
|
|
def main():
    """Run every [SCD] verification stage and print a final pass/fail summary.

    Stages: (1) required directories exist, (2) per-split supervision
    manifests exist + train-manifest [SCD] statistics, (3) BPE language
    dirs contain the [SCD] token, then sample utterances and a summary.
    """
    banner = "=" * 70
    print(banner)
    print("AMI [SCD] Token Implementation Verification")
    print(banner)

    base_dir = Path("data")
    all_checks_passed = True

    # Stage 1: required output directories.
    print("\n1. Checking directory structure...")
    for sub in ("manifests_scd", "cuts_scd", "lm_scd"):
        if not check_file_exists(base_dir / sub):
            all_checks_passed = False

    # Stage 2: per-split supervision manifests.
    print("\n2. Checking manifest files...")
    manifest_dir = base_dir / "manifests_scd"
    for split in ("train", "dev", "test"):
        if not check_file_exists(manifest_dir / f"ami-ihm_supervisions_{split}_scd.jsonl.gz"):
            all_checks_passed = False

    train_manifest = manifest_dir / "ami-ihm_supervisions_train_scd.jsonl.gz"
    if train_manifest.exists():
        stats = verify_scd_in_supervisions(train_manifest)

        # Heuristic sanity bounds on how often [SCD] should appear.
        if stats["avg_scd_per_utt"] < 0.5:
            print("β Warning: Low [SCD] frequency. Consider adjusting merging parameters.")
        elif stats["avg_scd_per_utt"] > 5.0:
            print("β Warning: High [SCD] frequency. Consider increasing max_duration.")

        if stats["pct_with_scd"] < 20:
            print("β Warning: Very few utterances contain [SCD]. Check merging logic.")

    # Stage 3: BPE language directories (500- and 1000-unit vocabularies).
    print("\n3. Checking BPE model...")
    bpe_found = False
    for vocab_size in (500, 1000):
        lang_dir = base_dir / f"lang_bpe_{vocab_size}_scd"
        if not lang_dir.exists():
            continue
        bpe_found = True
        verify_bpe_model(lang_dir)

        bpe_model_path = lang_dir / "bpe.model"
        if bpe_model_path.exists():
            verify_sentencepiece_encoding(bpe_model_path)

    if not bpe_found:
        print("β No BPE model with [SCD] found. Run prepare_scd.sh stage 6.")
        all_checks_passed = False

    if train_manifest.exists():
        show_samples(train_manifest, num_samples=3)

    # Final summary.
    print("\n" + banner)
    if all_checks_passed:
        print("β All checks passed! Your [SCD] setup is ready.")
        print("\nNext steps:")
        print("1. Review the samples above to ensure [SCD] placement looks correct")
        print("2. Update your training script to use data/manifests_scd/ and data/lang_bpe_*_scd/")
        print("3. Start training!")
    else:
        print("β Some checks failed. Please:")
        print("1. Run: ./prepare_scd.sh --stage 0 --stop-stage 6")
        print("2. Check for any errors in the preparation stages")
        print("3. Run this verification script again")
    print(banner)
|
|
|
|
# Entry point: run all verification stages when executed as a script.
if __name__ == "__main__":
    main()
|
|