""" TextGrid Exporter Exports tiered annotations to Praat TextGrid format. TextGrid is the native format for Praat (https://www.fon.hum.uva.nl/praat/), a tool widely used for phonetic analysis and annotation. The TextGrid format supports: - Interval tiers (segments with start/end times) - Point tiers (single-point annotations) - Multiple tiers with independent time alignments Note: TextGrid doesn't natively support hierarchical relationships between tiers, so the export flattens the hierarchy while preserving all annotations. """ import logging import os from typing import Dict, List, Any, Optional, Tuple from .base import BaseExporter, ExportContext, ExportResult logger = logging.getLogger(__name__) class TextGridExporter(BaseExporter): """ Exports tiered annotations to Praat TextGrid format. This exporter creates TextGrid files that can be opened in Praat for phonetic analysis or further annotation. """ format_name = "textgrid" description = "Praat TextGrid format for phonetic annotation" file_extensions = [".TextGrid"] def can_export(self, context: ExportContext) -> Tuple[bool, str]: """ Check if the context contains tiered_annotation schema. Args: context: ExportContext to validate Returns: Tuple of (can_export, reason) """ for schema in context.schemas: if schema.get("annotation_type") == "tiered_annotation": return True, "" return False, "No tiered_annotation schema found in configuration" def export( self, context: ExportContext, output_path: str, options: Optional[dict] = None ) -> ExportResult: """ Export annotations to TextGrid format. Args: context: ExportContext with annotation data output_path: Directory path for output files options: Optional settings: - format: "long" (default) or "short" TextGrid format - fill_gaps: Whether to fill gaps between annotations Returns: ExportResult with status and file paths """ options = options or {} files_written = [] warnings = [] stats = {"instances": 0, "annotations": 0, "tiers": 0} # Create output directory os.makedirs(output_path, exist_ok=True) use_short_format = options.get("format", "long") == "short" # Find tiered_annotation schemas tiered_schemas = [ s for s in context.schemas if s.get("annotation_type") == "tiered_annotation" ] for instance_id, item in context.items.items(): # Get annotations for this instance instance_annotations = [ a for a in context.annotations if a.get("instance_id") == instance_id ] for schema in tiered_schemas: schema_name = schema.get("name", "tiered") # Get tiered annotation data for this schema tiered_data = None for ann in instance_annotations: if schema_name in ann.get("labels", {}): try: import json raw_value = ann["labels"][schema_name] if isinstance(raw_value, str): tiered_data = json.loads(raw_value) elif isinstance(raw_value, dict): tiered_data = raw_value except (json.JSONDecodeError, TypeError): pass break if not tiered_data: continue # Generate TextGrid content content = self._create_textgrid( schema, tiered_data, use_short_format ) # Write to file safe_id = "".join(c if c.isalnum() or c in "-_" else "_" for c in instance_id) filename = f"{safe_id}_{schema_name}.TextGrid" filepath = os.path.join(output_path, filename) with open(filepath, 'w', encoding='utf-8') as f: f.write(content) files_written.append(filepath) stats["instances"] += 1 # Count annotations annotations = tiered_data.get("annotations", {}) for tier_anns in annotations.values(): stats["annotations"] += len(tier_anns) stats["tiers"] = len(schema.get("tiers", [])) return ExportResult( success=True, format_name=self.format_name, files_written=files_written, warnings=warnings, stats=stats ) def _create_textgrid( self, schema: dict, tiered_data: dict, use_short_format: bool = False ) -> str: """ Create TextGrid file content. Args: schema: The tiered_annotation schema configuration tiered_data: The annotation data use_short_format: Whether to use short TextGrid format Returns: TextGrid file content as string """ tiers = schema.get("tiers", []) annotations = tiered_data.get("annotations", {}) # Calculate time bounds min_time = 0.0 max_time = self._get_max_time(annotations) if max_time == 0: max_time = 1.0 # Default duration if no annotations if use_short_format: return self._create_short_textgrid(tiers, annotations, min_time, max_time) else: return self._create_long_textgrid(tiers, annotations, min_time, max_time) def _create_long_textgrid( self, tiers: List[dict], annotations: Dict[str, List[dict]], min_time: float, max_time: float ) -> str: """Create long format TextGrid (more readable).""" lines = [] lines.append('File type = "ooTextFile"') lines.append('Object class = "TextGrid"') lines.append('') lines.append(f'xmin = {min_time}') lines.append(f'xmax = {max_time}') lines.append('tiers? ') lines.append(f'size = {len(tiers)}') lines.append('item []:') for i, tier_def in enumerate(tiers, 1): tier_name = tier_def["name"] tier_anns = annotations.get(tier_name, []) # Sort annotations by start time sorted_anns = sorted(tier_anns, key=lambda a: a.get("start_time", 0)) # Fill gaps to create complete intervals intervals = self._create_intervals(sorted_anns, min_time, max_time) lines.append(f' item [{i}]:') lines.append(' class = "IntervalTier"') lines.append(f' name = "{self._escape_text(tier_name)}"') lines.append(f' xmin = {min_time}') lines.append(f' xmax = {max_time}') lines.append(f' intervals: size = {len(intervals)}') for j, interval in enumerate(intervals, 1): lines.append(f' intervals [{j}]:') lines.append(f' xmin = {interval["start"]}') lines.append(f' xmax = {interval["end"]}') lines.append(f' text = "{self._escape_text(interval["text"])}"') return '\n'.join(lines) def _create_short_textgrid( self, tiers: List[dict], annotations: Dict[str, List[dict]], min_time: float, max_time: float ) -> str: """Create short format TextGrid (more compact).""" lines = [] lines.append('File type = "ooTextFile"') lines.append('Object class = "TextGrid"') lines.append('') lines.append(str(min_time)) lines.append(str(max_time)) lines.append('') lines.append(str(len(tiers))) for tier_def in tiers: tier_name = tier_def["name"] tier_anns = annotations.get(tier_name, []) # Sort and create intervals sorted_anns = sorted(tier_anns, key=lambda a: a.get("start_time", 0)) intervals = self._create_intervals(sorted_anns, min_time, max_time) lines.append('"IntervalTier"') lines.append(f'"{self._escape_text(tier_name)}"') lines.append(str(min_time)) lines.append(str(max_time)) lines.append(str(len(intervals))) for interval in intervals: lines.append(str(interval["start"])) lines.append(str(interval["end"])) lines.append(f'"{self._escape_text(interval["text"])}"') return '\n'.join(lines) def _create_intervals( self, annotations: List[dict], min_time: float, max_time: float ) -> List[dict]: """ Create a complete list of intervals, filling gaps with empty intervals. Args: annotations: Sorted list of annotations min_time: Start time of the TextGrid max_time: End time of the TextGrid Returns: List of interval dicts with start, end, and text """ intervals = [] current_time = min_time for ann in annotations: start_sec = ann.get("start_time", 0) / 1000.0 # Convert ms to seconds end_sec = ann.get("end_time", 0) / 1000.0 text = ann.get("value") or ann.get("label", "") # Add empty interval for gap if start_sec > current_time + 0.0001: # Small tolerance intervals.append({ "start": current_time, "end": start_sec, "text": "" }) # Add annotation interval intervals.append({ "start": start_sec, "end": end_sec, "text": text }) current_time = end_sec # Add final empty interval if needed if current_time < max_time - 0.0001: intervals.append({ "start": current_time, "end": max_time, "text": "" }) # If no intervals at all, create one empty interval if not intervals: intervals.append({ "start": min_time, "end": max_time, "text": "" }) return intervals def _get_max_time(self, annotations: Dict[str, List[dict]]) -> float: """Get the maximum end time from all annotations in seconds.""" max_time = 0.0 for tier_anns in annotations.values(): for ann in tier_anns: end_time = ann.get("end_time", 0) if end_time: max_time = max(max_time, end_time / 1000.0) # Convert ms to seconds return max_time def _escape_text(self, text: str) -> str: """Escape special characters for TextGrid format.""" if not text: return "" # Escape quotes and backslashes text = text.replace('\\', '\\\\') text = text.replace('"', '\\"') # Remove or replace newlines text = text.replace('\n', ' ').replace('\r', '') return text