# vaani-cavp-engine / modules / elan_export.py
# Shaankar39's picture
# init: Vaani CAVP engine (CPU, accuracy-first — Whisper large-v3, spaCy trf)
# 7d5f092
"""ELAN Annotation Format (.eaf) Export
Generates standard ELAN XML files from voice profile analysis results.
Produces a multi-tier annotation corpus suitable for:
- PhD linguistic research
- Corpus-based phonological studies
- Cross-speaker contrastive analysis
- Longitudinal acquisition tracking
ELAN spec: https://archive.mpi.nl/tla/elan/documentation
Tiers generated:
1. Transcription — Word-level transcript with timestamps
2. Phonemes — Phone-level segmentation (from forced alignment or Wav2Vec)
3. Prosody — Intonation patterns, stress, rhythm annotations
4. L1_Interference — Detected L1 transfer patterns with severity
5. Voice_Quality — Phonation type, breathiness, creak annotations
6. Cognitive_Load — Filled pauses, hesitations, self-corrections
7. Connected_Speech — Assimilation, elision, linking events
8. CIF_Score — Overall Contrastive Interference Index per segment
9. Emotion — Emotional valence/arousal labels
10. Speaker_Metadata — Speaker ID, language, session info
"""
from __future__ import annotations
import logging
import time
import uuid
from pathlib import Path
from typing import Any
from xml.etree.ElementTree import Element, SubElement, tostring
from xml.dom import minidom
logger = logging.getLogger(__name__)
# ELAN namespace and schema
ELAN_SCHEMA = "http://www.mpi.nl/tools/elan/EAFv3.0.xsd"
ELAN_FORMAT = "3.0"
def _ts_id(counter: list[int]) -> str:
    """Advance the shared counter and return the next time-slot ID.

    ``counter`` is a one-element list used as a mutable integer cell: its
    first item is incremented in place, so successive calls on the same
    list yield ``ts1``, ``ts2``, ...
    """
    next_index = counter[0] + 1
    counter[0] = next_index
    return f"ts{next_index}"
def _ann_id(counter: list[int]) -> str:
    """Advance the shared counter and return the next annotation ID.

    ``counter`` is a one-element list whose first item is bumped in place;
    the returned IDs are ``a1``, ``a2``, ... for successive calls on the
    same list.
    """
    serial = counter[0] + 1
    counter[0] = serial
    return f"a{serial}"
def _ms_to_elan(ms: float) -> int:
    """Round a millisecond offset to the integer written into a TIME_SLOT.

    Uses Python's built-in ``round`` (ties go to the even neighbour) and
    returns the result as an ``int``.
    """
    nearest = round(ms)
    return int(nearest)
def generate_eaf(
    profile: dict[str, Any],
    audio_path: Path | str,
    speaker_id: str = "anonymous",
    student_name: str = "Student",
    language: str = "en",
    l1_language: str = "bho",
    session_id: str | None = None,
) -> str:
    """Generate a complete ELAN .eaf XML document from analysis results.

    The document is assembled as HEADER, TIME_ORDER, one TIER per tier
    builder, the LINGUISTIC_TYPE declarations and the CONSTRAINT stereotypes,
    then pretty-printed.

    Args:
        profile: Full pipeline output dict; every tier builder reads its own
            keys from it.
        audio_path: Path to the source audio file (resolved to an absolute
            ``file://`` URI for MEDIA_URL).
        speaker_id: Unique speaker identifier.
        student_name: Display name.
        language: Target language code.
        l1_language: L1 language code.
        session_id: Optional session identifier; when falsy a
            ``session_<unix time>`` value is generated.

    Returns:
        EAF XML as a pretty-printed string.
    """
    audio_path = Path(audio_path)
    session_id = session_id or f"session_{int(time.time())}"
    # One-element lists act as mutable counters shared with the builders.
    ts_counter = [0]
    ann_counter = [0]

    # -- Root element ---------------------------------------------------
    root = Element("ANNOTATION_DOCUMENT")
    root.set("AUTHOR", "Contrastive Voice Profiling Engine")
    # The stamp carries a literal +00:00 offset, so it has to be rendered
    # from UTC; time.strftime() without a tuple would use local time.
    root.set("DATE", time.strftime("%Y-%m-%dT%H:%M:%S+00:00", time.gmtime()))
    root.set("FORMAT", ELAN_FORMAT)
    root.set("VERSION", ELAN_FORMAT)
    root.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance")
    root.set("xsi:noNamespaceSchemaLocation", ELAN_SCHEMA)

    # -- Header ---------------------------------------------------------
    header = SubElement(root, "HEADER")
    header.set("MEDIA_FILE", "")
    header.set("TIME_UNITS", "milliseconds")
    media = SubElement(header, "MEDIA_DESCRIPTOR")
    # as_uri() yields a well-formed, percent-encoded file URI on every
    # platform; gluing "file:///" onto an absolute path produced four
    # slashes on POSIX and raw backslashes on Windows.
    media.set("MEDIA_URL", audio_path.resolve().as_uri())
    media.set("MIME_TYPE", "audio/x-wav")
    media.set("RELATIVE_MEDIA_URL", f"./{audio_path.name}")

    # Properties
    for key, val in [
        ("speaker_id", speaker_id),
        ("student_name", student_name),
        ("l1_language", l1_language),
        ("target_language", language),
        ("session_id", session_id),
        ("generator", "contrastive-voice-profiling-engine"),
    ]:
        prop = SubElement(header, "PROPERTY")
        prop.set("NAME", key)
        prop.text = str(val)

    # -- Collect all time points ----------------------------------------
    time_slots: list[tuple[str, int]] = []

    def _add_ts(ms: float) -> str:
        # Register a new time slot and hand its ID back to the tier builder.
        ts_id = _ts_id(ts_counter)
        time_slots.append((ts_id, _ms_to_elan(ms)))
        return ts_id

    # -- Build tiers ----------------------------------------------------
    tiers_data: list[dict] = []
    # Tier 1: Transcription (word-level)
    _build_transcription_tier(profile, tiers_data, _add_ts, ann_counter)
    # Tier 2: Phonemes
    _build_phoneme_tier(profile, tiers_data, _add_ts, ann_counter)
    # Tier 3: Prosody
    _build_prosody_tier(profile, tiers_data, _add_ts, ann_counter)
    # Tier 4: L1 Interference
    _build_l1_tier(profile, tiers_data, _add_ts, ann_counter, l1_language)
    # Tier 5: Voice Quality
    _build_voice_quality_tier(profile, tiers_data, _add_ts, ann_counter)
    # Tier 6: Cognitive Load
    _build_cognitive_load_tier(profile, tiers_data, _add_ts, ann_counter)
    # Tier 7: Connected Speech
    _build_connected_speech_tier(profile, tiers_data, _add_ts, ann_counter)
    # Tier 8: CIF Score
    _build_cif_tier(profile, tiers_data, _add_ts, ann_counter)
    # Tier 9: Emotion
    _build_emotion_tier(profile, tiers_data, _add_ts, ann_counter)
    # Tier 10: Metadata (single span)
    _build_metadata_tier(
        profile, tiers_data, _add_ts, ann_counter,
        speaker_id, student_name, language, l1_language, session_id,
    )

    # -- Write TIME_ORDER -----------------------------------------------
    time_order = SubElement(root, "TIME_ORDER")
    # Slots are written in ascending time order; the sort is stable, so
    # slots sharing a value keep their creation order.
    time_slots.sort(key=lambda slot: slot[1])
    for ts_id, ts_val in time_slots:
        ts_el = SubElement(time_order, "TIME_SLOT")
        ts_el.set("TIME_SLOT_ID", ts_id)
        ts_el.set("TIME_VALUE", str(ts_val))

    # -- Write TIERs ----------------------------------------------------
    for tier_info in tiers_data:
        tier_el = SubElement(root, "TIER")
        tier_el.set("LINGUISTIC_TYPE_REF", tier_info.get("type_ref", "default-lt"))
        tier_el.set("TIER_ID", tier_info["tier_id"])
        if "participant" in tier_info:
            tier_el.set("PARTICIPANT", tier_info["participant"])
        if "annotator" in tier_info:
            tier_el.set("ANNOTATOR", tier_info["annotator"])
        for ann in tier_info.get("annotations", []):
            ann_el = SubElement(tier_el, "ANNOTATION")
            align_ann = SubElement(ann_el, "ALIGNABLE_ANNOTATION")
            align_ann.set("ANNOTATION_ID", ann["id"])
            align_ann.set("TIME_SLOT_REF1", ann["ts1"])
            align_ann.set("TIME_SLOT_REF2", ann["ts2"])
            value_el = SubElement(align_ann, "ANNOTATION_VALUE")
            value_el.text = ann["value"]

    # -- Linguistic Types -----------------------------------------------
    # One time-alignable type per "type_ref" used by the tier builders.
    ling_types = [
        "default-lt", "phoneme-lt", "prosody-lt", "interference-lt",
        "voice-quality-lt", "cognitive-lt", "connected-speech-lt",
        "cif-lt", "emotion-lt", "metadata-lt",
    ]
    for lt in ling_types:
        lt_el = SubElement(root, "LINGUISTIC_TYPE")
        lt_el.set("GRAPHIC_REFERENCES", "false")
        lt_el.set("LINGUISTIC_TYPE_ID", lt)
        lt_el.set("TIME_ALIGNABLE", "true")

    # -- Constraints ----------------------------------------------------
    for stereo, desc in [
        ("Time_Subdivision", "Time subdivision of parent annotation's time interval"),
        ("Symbolic_Subdivision", "Symbolic subdivision of parent annotation's time interval"),
        ("Symbolic_Association", "1-1 association with a parent annotation"),
        ("Included_In", "Time included in parent annotation's time interval"),
    ]:
        con = SubElement(root, "CONSTRAINT")
        con.set("DESCRIPTION", desc)
        con.set("STEREOTYPE", stereo)

    # -- Format and return ----------------------------------------------
    # Round-trip through minidom purely for pretty-printing; encoding=None
    # makes toprettyxml return str rather than bytes.
    raw_xml = tostring(root, encoding="unicode")
    parsed = minidom.parseString(raw_xml)
    return parsed.toprettyxml(indent=" ", encoding=None)
# ── Tier Builders ────────────────────────────────────────────────────────
def _word_time_scale(words, duration_s):
    """Return one seconds-to-ms factor for all ``words``, or None if unknown.

    With a known clip length the unit is unambiguous: second-based stamps
    stay close to the clip length, millisecond-based ones are about 1000x
    larger. Without a usable clip length nothing can be decided here.
    """
    if not words or not isinstance(duration_s, (int, float)) or duration_s <= 0:
        return None
    peak = max((w.get("end") or w.get("start") or 0) for w in words)
    # 30x the clip length is roughly the geometric midpoint between the two
    # readings, so second-based stamps that overshoot a little still pass.
    return 1000 if peak <= duration_s * 30 else 1


def _build_transcription_tier(profile, tiers, add_ts, ann_ctr):
    """Tier 1: Word-level transcription with timestamps.

    Appends one "Transcription" tier dict to ``tiers``. Word entries with an
    empty "word" are skipped; when no word-level annotation results, the
    segment list is used instead (segment times are taken as seconds).

    Args:
        profile: Pipeline output; reads ``profile["transcription"]``.
        tiers: List that receives the tier dict.
        add_ts: Callable taking a millisecond value and returning a
            time-slot ID.
        ann_ctr: One-element list used as the annotation ID counter.
    """
    trans = profile.get("transcription", {})
    words = [w for w in trans.get("word_timestamps", []) if w.get("word")]
    shared_scale = _word_time_scale(words, trans.get("duration_seconds"))

    annotations = []
    for w in words:
        start = w.get("start") or 0
        end = w.get("end")
        if end is None:
            # A missing end used to collapse to 0 (before the start).
            end = start
        if shared_scale is not None:
            factor = shared_scale
        else:
            # Legacy guess (values below 100 are seconds), but decided once
            # per word from its start so start and end can never end up in
            # different units.
            # NOTE(review): without duration_seconds, second-based stamps
            # >= 100 s are still read as milliseconds.
            factor = 1000 if start < 100 else 1
        start_ms = start * factor
        end_ms = max(end * factor, start_ms)
        ts1 = add_ts(start_ms)
        ts2 = add_ts(end_ms)
        annotations.append({
            "id": _ann_id(ann_ctr),
            "ts1": ts1, "ts2": ts2,
            "value": w["word"],
        })

    # If no word timestamps, use segment-level
    if not annotations:
        for seg in trans.get("segments", []):
            seg_start = seg.get("start") or 0
            seg_end = seg.get("end")
            if seg_end is None:
                seg_end = seg_start
            ts1 = add_ts(seg_start * 1000)
            ts2 = add_ts(seg_end * 1000)
            annotations.append({
                "id": _ann_id(ann_ctr),
                "ts1": ts1, "ts2": ts2,
                "value": (seg.get("text") or "").strip(),
            })

    tiers.append({
        "tier_id": "Transcription",
        "type_ref": "default-lt",
        "participant": "Speaker",
        "annotator": "whisper",
        "annotations": annotations,
    })
def _build_phoneme_tier(profile, tiers, add_ts, ann_ctr):
    """Tier 2: Phone-level segmentation.

    Reads phones from ``profile["forced_alignment"]["phones"]`` and, when
    that list is empty, from ``profile["phoneme_analysis"]["phoneme_details"]``.
    Entries without a phone symbol are skipped. Appends one "Phonemes" tier
    dict to ``tiers``.
    """
    segments = profile.get("forced_alignment", {}).get("phones", [])
    if not segments:
        # Fall back to the phoneme-analysis spans.
        segments = profile.get("phoneme_analysis", {}).get("phoneme_details", [])

    rows = []
    for seg in segments:
        symbol = seg.get("phone") or seg.get("phoneme", "")
        onset = seg.get("start_ms", 0)
        # End defaults to the onset plus the given (or 50 ms) duration.
        default_offset = onset + seg.get("duration_ms", 50)
        offset = seg.get("end_ms", default_offset)
        if symbol:
            first = add_ts(onset)
            last = add_ts(offset)
            score = seg.get("confidence", 0)
            origin = seg.get("source", "wav2vec")
            if score:
                text = f"{symbol} [{origin}:{score:.2f}]"
            else:
                # No (or zero) confidence: emit the bare phone symbol.
                text = symbol
            rows.append({
                "id": _ann_id(ann_ctr),
                "ts1": first, "ts2": last,
                "value": text,
            })

    tiers.append({
        "tier_id": "Phonemes",
        "type_ref": "phoneme-lt",
        "annotator": "forced_alignment",
        "annotations": rows,
    })
def _build_prosody_tier(profile, tiers, add_ts, ann_ctr):
    """Tier 3: Prosodic annotations (intonation, stress, rhythm).

    Appends one "Prosody" tier dict to ``tiers`` containing, when present in
    ``profile["prosodic_profile"]``: a whole-clip intonation note, a
    whole-clip rhythm note, and one annotation per stressed word.

    Args:
        profile: Pipeline output dict.
        tiers: List that receives the tier dict.
        add_ts: Callable taking a millisecond value and returning a
            time-slot ID.
        ann_ctr: One-element list used as the annotation ID counter.
    """
    prosody = profile.get("prosodic_profile", {})
    # Clip length for the whole-clip notes (5 s when the profile has none).
    duration_ms = profile.get("transcription", {}).get("duration_seconds", 5) * 1000
    annotations = []

    def _annotate(start_ms, end_ms, value):
        # Allocate both time slots, then the annotation ID, in that order.
        ts1 = add_ts(start_ms)
        ts2 = add_ts(end_ms)
        annotations.append({
            "id": _ann_id(ann_ctr),
            "ts1": ts1, "ts2": ts2,
            "value": value,
        })

    # Intonation pattern annotations
    intonation = prosody.get("intonation", {})
    if intonation:
        pattern = intonation.get("pattern", "unknown")
        boundary_tones = intonation.get("boundary_tones", [])
        # str() keeps the join from failing on non-string tone labels.
        tones = ", ".join(str(t) for t in boundary_tones) if boundary_tones else "N/A"
        _annotate(0, duration_ms, f"Intonation: {pattern} | Tones: {tones}")

    # Rhythm classification
    rhythm = prosody.get("rhythm", {})
    if rhythm:
        rhythm_class = rhythm.get("rhythm_class", "unknown")
        npvi = rhythm.get("nPVI_V") or 0  # None would break the :.1f format
        _annotate(0, duration_ms, f"Rhythm: {rhythm_class} (nPVI={npvi:.1f})")

    # Stressed words
    for s in prosody.get("stress_patterns", []):
        if not s.get("stressed"):
            continue
        start = s.get("start") or 0
        end = s.get("end")
        # Convert seconds to ms if needed (values below 100 are taken as
        # seconds).
        # NOTE(review): second-based stamps >= 100 s are still read as ms.
        if start < 100:
            start *= 1000
            if end is not None:
                end *= 1000
        if end is None:
            # Default length is 200 ms. It must be added after the unit
            # conversion; adding it before scaled it to 200 seconds.
            end = start + 200
        _annotate(start, end, f"STRESS: {s.get('word', '')}")

    tiers.append({
        "tier_id": "Prosody",
        "type_ref": "prosody-lt",
        "annotator": "prosodic_profiling",
        "annotations": annotations,
    })
def _build_l1_tier(profile, tiers, add_ts, ann_ctr, l1_language):
    """Tier 4: L1 interference patterns.

    Emits one whole-clip summary annotation (L1 name and interference score)
    followed by one whole-clip annotation per entry in
    ``profile["l1_interference"]["detected_patterns"]``, then appends the
    "L1_Interference" tier dict to ``tiers``.
    """
    interference = profile.get("l1_interference", {})
    rows = []
    clip_ms = profile.get("transcription", {}).get("duration_seconds", 5) * 1000

    # Summary line covering the full clip.
    score = interference.get("interference_score", 0)
    shown_name = interference.get("l1_display_name", l1_language)
    first = add_ts(0)
    last = add_ts(clip_ms)
    rows.append({
        "id": _ann_id(ann_ctr),
        "ts1": first, "ts2": last,
        "value": f"L1={shown_name} | Interference={score}/100",
    })

    # One line per detected pattern, each also covering the full clip.
    for item in interference.get("detected_patterns", []):
        label = item.get("pattern") or item.get("name", "unknown")
        level = item.get("severity", "low")
        proof = item.get("evidence", "")
        remedy = item.get("remediation", "")
        first = add_ts(0)
        last = add_ts(clip_ms)
        text = f"[{level.upper()}] {label}"
        if proof:
            text += f" | {proof}"
        if remedy:
            text += f" | FIX: {remedy}"
        rows.append({
            "id": _ann_id(ann_ctr),
            "ts1": first, "ts2": last,
            "value": text,
        })

    tiers.append({
        "tier_id": "L1_Interference",
        "type_ref": "interference-lt",
        "annotator": "l1_targets",
        "annotations": rows,
    })
def _build_voice_quality_tier(profile, tiers, add_ts, ann_ctr):
    """Tier 5: Voice quality annotations.

    Always emits a phonation/register note; adds a VoiceSauce measures note
    when ``profile["voicesauce"]`` is non-empty and a nasality note when
    ``profile["voice_quality"]["nasality"]`` is non-empty. Every note spans
    the whole clip. Appends the "Voice_Quality" tier dict to ``tiers``.
    """
    quality = profile.get("voice_quality", {})
    sauce = profile.get("voicesauce", {})
    rows = []
    clip_ms = profile.get("transcription", {}).get("duration_seconds", 5) * 1000

    # Phonation type: VoiceSauce value first, breathiness class otherwise.
    phonation_kind = sauce.get("phonation_type")
    if not phonation_kind:
        phonation_kind = quality.get("breathiness", {}).get("classification", "modal")
    register_kind = quality.get("register", {}).get("type", "unknown")
    first = add_ts(0)
    last = add_ts(clip_ms)
    rows.append({
        "id": _ann_id(ann_ctr),
        "ts1": first, "ts2": last,
        "value": f"Phonation: {phonation_kind} | Register: {register_kind}",
    })

    # VoiceSauce measures
    if sauce:
        h1h2_stats = sauce.get("H1_H2", {})
        cpp_stats = sauce.get("CPP", {})
        shr_stats = sauce.get("SHR", {})
        first = add_ts(0)
        last = add_ts(clip_ms)
        rows.append({
            "id": _ann_id(ann_ctr),
            "ts1": first, "ts2": last,
            "value": (
                f"H1-H2={h1h2_stats.get('mean', 0):.1f}dB | "
                f"CPP={cpp_stats.get('mean', 0):.1f}dB | "
                f"SHR={shr_stats.get('mean', 0):.3f} | "
                f"Breathiness={sauce.get('breathiness_index', 0):.2f} | "
                f"Creak={sauce.get('creak_index', 0):.2f}"
            ),
        })

    # Nasality
    nasal = quality.get("nasality", {})
    if nasal:
        first = add_ts(0)
        last = add_ts(clip_ms)
        rows.append({
            "id": _ann_id(ann_ctr),
            "ts1": first, "ts2": last,
            "value": f"Nasality index: {nasal.get('nasality_index', 0):.2f}",
        })

    tiers.append({
        "tier_id": "Voice_Quality",
        "type_ref": "voice-quality-lt",
        "annotator": "voice_quality+voicesauce",
        "annotations": rows,
    })
def _build_cognitive_load_tier(profile, tiers, add_ts, ann_ctr):
    """Tier 6: Cognitive load markers (filled pauses, hesitations).

    Reads ``profile["morpheme_boundary"]["cognitive_load"]``. String
    indicators become whole-clip annotations, dict indicators are placed at
    their own ``start_ms``/``end_ms`` (200 ms long when no end is given),
    and a positive overall score adds one whole-clip summary. Indicators of
    any other type are ignored. Appends the "Cognitive_Load" tier dict to
    ``tiers``.

    Args:
        profile: Pipeline output dict.
        tiers: List that receives the tier dict.
        add_ts: Callable taking a millisecond value and returning a
            time-slot ID.
        ann_ctr: One-element list used as the annotation ID counter.
    """
    cog = profile.get("morpheme_boundary", {}).get("cognitive_load", {})
    # Clip length for whole-clip annotations (5 s when the profile has none).
    duration_ms = profile.get("transcription", {}).get("duration_seconds", 5) * 1000
    annotations = []

    # Filled pauses
    for ind in cog.get("indicators", []):
        if isinstance(ind, str):
            # Simple string indicator: create an utterance-level annotation.
            ts1 = add_ts(0)
            ts2 = add_ts(duration_ms)
            value = f"COGNITIVE: {ind}"
        elif isinstance(ind, dict):
            start = ind.get("start_ms", 0)
            ts1 = add_ts(start)
            ts2 = add_ts(ind.get("end_ms", start + 200))
            # \u2014 is an em dash; written as an escape so the separator
            # cannot be mis-decoded into mojibake again.
            value = (
                f"COGNITIVE: {ind.get('type', 'marker')} \u2014 "
                f"{ind.get('description', '')}"
            )
        else:
            continue
        annotations.append({
            "id": _ann_id(ann_ctr),
            "ts1": ts1, "ts2": ts2,
            "value": value,
        })

    # Overall score (a None score is treated as absent instead of crashing
    # the comparison).
    score = cog.get("score") or 0
    if score > 0:
        ts1 = add_ts(0)
        ts2 = add_ts(duration_ms)
        annotations.append({
            "id": _ann_id(ann_ctr),
            "ts1": ts1, "ts2": ts2,
            "value": f"Cognitive Load Score: {score}/100",
        })

    tiers.append({
        "tier_id": "Cognitive_Load",
        "type_ref": "cognitive-lt",
        "annotator": "morpheme_boundary",
        "annotations": annotations,
    })
def _build_connected_speech_tier(profile, tiers, add_ts, ann_ctr):
    """Tier 7: Connected speech processes (assimilation, elision, linking).

    Walks the four event lists in ``profile["connected_speech"]`` and emits
    one annotation per dict event (non-dict events are ignored), plus a
    whole-clip fluency note when the fluency score is positive. Appends the
    "Connected_Speech" tier dict to ``tiers``.
    """
    speech = profile.get("connected_speech", {})
    rows = []

    for category in ("assimilations", "elisions", "linkings", "reductions"):
        # Singular forms: lower-case as label fallback, upper-case as tag.
        fallback_label = category.rstrip("s")
        tag = category.upper().rstrip("S")
        for event in speech.get(category, []):
            if not isinstance(event, dict):
                continue
            onset = event.get("start_ms", event.get("position_ms", 0))
            # Events without an end are given a 150 ms span.
            default_offset = onset + 150
            offset = event.get("end_ms", default_offset)
            name = event.get("label") or event.get("type", fallback_label)
            surrounding = event.get("context", "")
            first = add_ts(onset)
            last = add_ts(offset)
            text = f"[{tag}] {name}"
            if surrounding:
                text += f" | {surrounding}"
            rows.append({
                "id": _ann_id(ann_ctr),
                "ts1": first, "ts2": last,
                "value": text,
            })

    # Fluency score
    fluency_value = speech.get("fluency_score", 0)
    if fluency_value > 0:
        clip_ms = profile.get("transcription", {}).get("duration_seconds", 5) * 1000
        first = add_ts(0)
        last = add_ts(clip_ms)
        rows.append({
            "id": _ann_id(ann_ctr),
            "ts1": first, "ts2": last,
            "value": f"Fluency Score: {fluency_value}/100",
        })

    tiers.append({
        "tier_id": "Connected_Speech",
        "type_ref": "connected-speech-lt",
        "annotator": "connected_speech",
        "annotations": rows,
    })
def _build_cif_tier(profile, tiers, add_ts, ann_ctr):
    """Tier 8: CIF (Contrastive Interference Field) scores.

    Emits a whole-clip annotation with the overall CII and severity from
    ``profile["cif_analysis"]``, then one whole-clip annotation for every
    dimension whose entry is a dict. Appends the "CIF_Score" tier dict to
    ``tiers``.
    """
    analysis = profile.get("cif_analysis", {})
    rows = []
    clip_ms = profile.get("transcription", {}).get("duration_seconds", 5) * 1000

    # Overall index.
    total_cii = analysis.get("overall_cii", 0)
    level = analysis.get("overall_severity", "unknown")
    first = add_ts(0)
    last = add_ts(clip_ms)
    rows.append({
        "id": _ann_id(ann_ctr),
        "ts1": first, "ts2": last,
        "value": f"CII={total_cii:.3f} [{level}]",
    })

    # Per-dimension CII
    for axis, details in analysis.get("dimensions", {}).items():
        if not isinstance(details, dict):
            continue
        axis_cii = details.get("cii", 0)
        first = add_ts(0)
        last = add_ts(clip_ms)
        rows.append({
            "id": _ann_id(ann_ctr),
            "ts1": first, "ts2": last,
            "value": f"CIF-{axis}: {axis_cii:.3f}",
        })

    tiers.append({
        "tier_id": "CIF_Score",
        "type_ref": "cif-lt",
        "annotator": "cif_model",
        "annotations": rows,
    })
def _build_emotion_tier(profile, tiers, add_ts, ann_ctr):
    """Tier 9: Emotional analysis annotations.

    Adds a whole-clip note for the SpeechBrain emotion found under
    ``profile["ai_classification"]["speechbrain"]["emotion"]`` and another
    for ``profile["audeep"]``, each only when the respective entry is
    non-empty. Appends the "Emotion" tier dict to ``tiers``.
    """
    rows = []
    clip_ms = profile.get("transcription", {}).get("duration_seconds", 5) * 1000

    # SpeechBrain emotion
    speechbrain_out = profile.get("ai_classification", {}).get("speechbrain", {})
    sb_emotion = speechbrain_out.get("emotion", {}) if speechbrain_out else {}
    if sb_emotion:
        sb_label = sb_emotion.get("label", "unknown")
        sb_conf = sb_emotion.get("confidence", 0)
        first = add_ts(0)
        last = add_ts(clip_ms)
        rows.append({
            "id": _ann_id(ann_ctr),
            "ts1": first, "ts2": last,
            "value": f"SpeechBrain: {sb_label} ({sb_conf:.2f})",
        })

    # auDeep emotion
    audeep_out = profile.get("audeep", {})
    if audeep_out:
        top_emotion = audeep_out.get("primary_emotion", "neutral")
        top_conf = audeep_out.get("emotion_confidence", 0)
        valence = audeep_out.get("valence", 0)
        arousal = audeep_out.get("arousal", 0)
        dominance = audeep_out.get("dominance", 0)
        first = add_ts(0)
        last = add_ts(clip_ms)
        rows.append({
            "id": _ann_id(ann_ctr),
            "ts1": first, "ts2": last,
            "value": f"auDeep: {top_emotion} ({top_conf:.2f}) | V={valence:.2f} A={arousal:.2f} D={dominance:.2f}",
        })

    tiers.append({
        "tier_id": "Emotion",
        "type_ref": "emotion-lt",
        "annotator": "speechbrain+audeep",
        "annotations": rows,
    })
def _build_metadata_tier(profile, tiers, add_ts, ann_ctr,
                         speaker_id, student_name, language, l1_language, session_id):
    """Tier 10: Session metadata.

    Appends a "Speaker_Metadata" tier dict to ``tiers`` holding a single
    whole-clip annotation that summarises speaker, languages, session and
    processing time.

    Args:
        profile: Pipeline output dict; supplies the clip duration and
            ``processing_time_ms``.
        tiers: List that receives the tier dict.
        add_ts: Callable taking a millisecond value and returning a
            time-slot ID.
        ann_ctr: One-element list used as the annotation ID counter.
        speaker_id: Speaker identifier; also used as the tier participant.
        student_name: Display name.
        language: Target (L2) language code.
        l1_language: L1 language code.
        session_id: Session identifier.
    """
    duration_ms = profile.get("transcription", {}).get("duration_seconds", 5) * 1000
    # A None processing time would break the :.0f format below.
    processing_ms = profile.get("processing_time_ms") or 0
    ts1 = add_ts(0)
    ts2 = add_ts(duration_ms)
    # \u2192 is a rightwards arrow; written as an escape so the separator
    # cannot be mis-decoded into mojibake again.
    meta_str = (
        f"Speaker: {speaker_id} ({student_name}) | "
        f"L1: {l1_language} \u2192 L2: {language} | "
        f"Session: {session_id} | "
        f"Processing: {processing_ms:.0f}ms"
    )
    tiers.append({
        "tier_id": "Speaker_Metadata",
        "type_ref": "metadata-lt",
        "participant": speaker_id,
        "annotations": [{
            "id": _ann_id(ann_ctr),
            "ts1": ts1, "ts2": ts2,
            "value": meta_str,
        }],
    })
# ── Public API ───────────────────────────────────────────────────────────
def export_eaf(
    profile: dict[str, Any],
    audio_path: Path | str,
    output_path: Path | str | None = None,
    speaker_id: str = "anonymous",
    student_name: str = "Student",
    language: str = "en",
    l1_language: str = "bho",
    session_id: str | None = None,
) -> tuple[str, Path]:
    """Export analysis results to ELAN .eaf format.

    Args:
        profile: Full pipeline output dict.
        audio_path: Path to source audio.
        output_path: Where to save the .eaf file. If None, saves next to the
            audio file, using the audio file name with an ``.eaf`` suffix.
        speaker_id: Speaker identifier.
        student_name: Display name.
        language: Target language.
        l1_language: L1 language code.
        session_id: Session identifier.

    Returns:
        Tuple of (eaf_xml_string, output_path).

    Raises:
        OSError: If the output directory cannot be created or the file
            cannot be written.
    """
    audio_path = Path(audio_path)
    if output_path is None:
        output_path = audio_path.with_suffix(".eaf")
    else:
        output_path = Path(output_path)

    eaf_xml = generate_eaf(
        profile=profile,
        audio_path=audio_path,
        speaker_id=speaker_id,
        student_name=student_name,
        language=language,
        l1_language=l1_language,
        session_id=session_id,
    )

    # Create missing parent directories so an export into a fresh output
    # folder does not fail with FileNotFoundError.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(eaf_xml, encoding="utf-8")
    logger.info("ELAN export saved to %s", output_path)
    return eaf_xml, output_path