codebook / potato /export /textgrid_exporter.py
davidjurgens's picture
Deploy: Potato — Codebook Annotation
aceb1b2 verified
Raw
History Blame Contribute Delete
11.6 kB
"""
TextGrid Exporter
Exports tiered annotations to Praat TextGrid format.
TextGrid is the native format for Praat (https://www.fon.hum.uva.nl/praat/),
a tool widely used for phonetic analysis and annotation.
The TextGrid format supports:
- Interval tiers (segments with start/end times)
- Point tiers (single-point annotations)
- Multiple tiers with independent time alignments
Note: TextGrid doesn't natively support hierarchical relationships between
tiers, so the export flattens the hierarchy while preserving all annotations.
"""
import json
import logging
import os
from typing import Dict, List, Any, Optional, Tuple

from .base import BaseExporter, ExportContext, ExportResult
logger = logging.getLogger(__name__)
class TextGridExporter(BaseExporter):
    """
    Exports tiered annotations to Praat TextGrid format.

    This exporter creates TextGrid files that can be opened in Praat
    for phonetic analysis or further annotation. Every tier is written as
    an IntervalTier; annotation ``start_time``/``end_time`` values are read
    as milliseconds and written as seconds.
    """

    format_name = "textgrid"
    description = "Praat TextGrid format for phonetic annotation"
    file_extensions = [".TextGrid"]

    # annotation_type value that marks a schema as exportable by this class.
    _SCHEMA_TYPE = "tiered_annotation"
    # Gaps or overlaps smaller than this many seconds are treated as touching.
    _TIME_TOLERANCE = 0.0001

    def can_export(self, context: ExportContext) -> Tuple[bool, str]:
        """
        Check if the context contains a tiered_annotation schema.

        Args:
            context: ExportContext to validate

        Returns:
            Tuple of (can_export, reason); reason is empty on success
        """
        if any(
            schema.get("annotation_type") == self._SCHEMA_TYPE
            for schema in context.schemas
        ):
            return True, ""
        return False, "No tiered_annotation schema found in configuration"

    def export(
        self,
        context: ExportContext,
        output_path: str,
        options: Optional[dict] = None
    ) -> ExportResult:
        """
        Export annotations to TextGrid format.

        One file named ``<instance>_<schema>.TextGrid`` is written per
        instance/schema pair that has usable tiered annotation data.

        Args:
            context: ExportContext with annotation data
            output_path: Directory path for output files (created if missing)
            options: Optional settings:
                - format: "long" (default) or "short" TextGrid format
                - fill_gaps: accepted but currently ignored; gaps between
                  annotations are always filled with empty intervals

        Returns:
            ExportResult with status, file paths, warnings for annotation
            payloads that could not be used, and counters
        """
        options = options or {}
        files_written: List[str] = []
        warnings: List[str] = []
        stats = {"instances": 0, "annotations": 0, "tiers": 0}

        # Create output directory
        os.makedirs(output_path, exist_ok=True)
        use_short_format = options.get("format", "long") == "short"

        # Find tiered_annotation schemas
        tiered_schemas = [
            s for s in context.schemas
            if s.get("annotation_type") == self._SCHEMA_TYPE
        ]

        # Group annotations by instance once instead of rescanning the whole
        # annotation list for every item.
        annotations_by_instance: Dict[Any, List[dict]] = {}
        for ann in context.annotations:
            annotations_by_instance.setdefault(
                ann.get("instance_id"), []
            ).append(ann)

        for instance_id, _item in context.items.items():
            instance_annotations = annotations_by_instance.get(instance_id, [])

            for schema in tiered_schemas:
                schema_name = schema.get("name", "tiered")
                tiered_data = self._find_tiered_data(
                    instance_annotations, schema_name, instance_id, warnings
                )
                if not tiered_data:
                    continue

                # Generate TextGrid content
                content = self._create_textgrid(
                    schema,
                    tiered_data,
                    use_short_format
                )

                # Both name parts are sanitised so neither can introduce
                # path separators or other awkward filename characters.
                filename = (
                    f"{self._safe_filename_part(instance_id)}_"
                    f"{self._safe_filename_part(schema_name)}.TextGrid"
                )
                filepath = os.path.join(output_path, filename)
                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(content)
                files_written.append(filepath)

                stats["instances"] += 1
                tier_annotations = self._tier_annotations(tiered_data)
                stats["annotations"] += sum(
                    len(tier_anns or ()) for tier_anns in tier_annotations.values()
                )
                stats["tiers"] = len(schema.get("tiers") or [])

        return ExportResult(
            success=True,
            format_name=self.format_name,
            files_written=files_written,
            warnings=warnings,
            stats=stats
        )

    @staticmethod
    def _safe_filename_part(value: Any) -> str:
        """Return ``value`` as text with anything but alphanumerics, '-' and '_' replaced by '_'."""
        return "".join(
            c if c.isalnum() or c in "-_" else "_" for c in str(value)
        )

    @staticmethod
    def _tier_annotations(tiered_data: dict) -> Dict[str, List[dict]]:
        """Return the tier-name -> annotation-list mapping, or {} if absent or malformed."""
        annotations = tiered_data.get("annotations")
        return annotations if isinstance(annotations, dict) else {}

    @staticmethod
    def _find_tiered_data(
        instance_annotations: List[dict],
        schema_name: str,
        instance_id: Any,
        warnings: List[str]
    ) -> Optional[dict]:
        """
        Return the first usable tiered payload stored under ``schema_name``.

        The stored value may be a dict or a JSON string that decodes to a
        dict. Unusable values are reported through ``warnings`` and the
        search moves on to the next annotation record.

        Args:
            instance_annotations: Annotation records for one instance
            schema_name: Label key to look up in each record
            instance_id: Instance identifier, used only in warning text
            warnings: List that receives a message per unusable payload

        Returns:
            The payload dict, or None if no record holds a usable one
        """
        for ann in instance_annotations:
            labels = ann.get("labels")
            if not isinstance(labels, dict) or schema_name not in labels:
                continue

            raw_value = labels[schema_name]
            if isinstance(raw_value, str):
                try:
                    raw_value = json.loads(raw_value)
                except json.JSONDecodeError as exc:
                    warnings.append(
                        f"Instance {instance_id!r}, schema {schema_name!r}: "
                        f"could not parse annotation JSON ({exc})"
                    )
                    continue

            if not isinstance(raw_value, dict):
                warnings.append(
                    f"Instance {instance_id!r}, schema {schema_name!r}: "
                    f"expected a dict of tiered annotations, "
                    f"got {type(raw_value).__name__}"
                )
                continue

            if raw_value:
                return raw_value
        return None

    def _create_textgrid(
        self,
        schema: dict,
        tiered_data: dict,
        use_short_format: bool = False
    ) -> str:
        """
        Create TextGrid file content.

        Args:
            schema: The tiered_annotation schema configuration
            tiered_data: The annotation data
            use_short_format: Whether to use short TextGrid format

        Returns:
            TextGrid file content as string
        """
        tiers = schema.get("tiers") or []
        annotations = self._tier_annotations(tiered_data)

        # The grid always starts at 0 and ends at the latest annotation end.
        min_time = 0.0
        max_time = self._get_max_time(annotations)
        if max_time <= 0:
            max_time = 1.0  # Default duration if no annotations

        if use_short_format:
            return self._create_short_textgrid(tiers, annotations, min_time, max_time)
        return self._create_long_textgrid(tiers, annotations, min_time, max_time)

    def _tier_intervals(
        self,
        tier_def: dict,
        annotations: Dict[str, List[dict]],
        min_time: float,
        max_time: float
    ) -> Tuple[str, List[dict]]:
        """Return (tier name, gap-filled intervals sorted by start time) for one tier definition."""
        tier_name = tier_def["name"]
        tier_anns = annotations.get(tier_name) or []
        # `or 0` keeps the sort working when start_time is present but None.
        sorted_anns = sorted(tier_anns, key=lambda a: a.get("start_time") or 0)
        return tier_name, self._create_intervals(sorted_anns, min_time, max_time)

    def _create_long_textgrid(
        self,
        tiers: List[dict],
        annotations: Dict[str, List[dict]],
        min_time: float,
        max_time: float
    ) -> str:
        """
        Create long format TextGrid (more readable).

        Args:
            tiers: Tier definitions from the schema; each needs a "name"
            annotations: Mapping of tier name to its annotation dicts
            min_time: Start time of the TextGrid in seconds
            max_time: End time of the TextGrid in seconds

        Returns:
            The file content, lines joined with newlines
        """
        lines = [
            'File type = "ooTextFile"',
            'Object class = "TextGrid"',
            '',
            f'xmin = {min_time}',
            f'xmax = {max_time}',
            'tiers? <exists>',
            f'size = {len(tiers)}',
            'item []:',
        ]

        for i, tier_def in enumerate(tiers, 1):
            tier_name, intervals = self._tier_intervals(
                tier_def, annotations, min_time, max_time
            )
            lines.append(f' item [{i}]:')
            lines.append(' class = "IntervalTier"')
            lines.append(f' name = "{self._escape_text(tier_name)}"')
            lines.append(f' xmin = {min_time}')
            lines.append(f' xmax = {max_time}')
            lines.append(f' intervals: size = {len(intervals)}')
            for j, interval in enumerate(intervals, 1):
                lines.append(f' intervals [{j}]:')
                lines.append(f' xmin = {interval["start"]}')
                lines.append(f' xmax = {interval["end"]}')
                lines.append(f' text = "{self._escape_text(interval["text"])}"')

        return '\n'.join(lines)

    def _create_short_textgrid(
        self,
        tiers: List[dict],
        annotations: Dict[str, List[dict]],
        min_time: float,
        max_time: float
    ) -> str:
        """
        Create short format TextGrid (more compact).

        Same data and ordering as the long format, but values only, one
        per line, without the ``key =`` labels.

        Args:
            tiers: Tier definitions from the schema; each needs a "name"
            annotations: Mapping of tier name to its annotation dicts
            min_time: Start time of the TextGrid in seconds
            max_time: End time of the TextGrid in seconds

        Returns:
            The file content, lines joined with newlines
        """
        lines = [
            'File type = "ooTextFile"',
            'Object class = "TextGrid"',
            '',
            str(min_time),
            str(max_time),
            '<exists>',
            str(len(tiers)),
        ]

        for tier_def in tiers:
            tier_name, intervals = self._tier_intervals(
                tier_def, annotations, min_time, max_time
            )
            lines.append('"IntervalTier"')
            lines.append(f'"{self._escape_text(tier_name)}"')
            lines.append(str(min_time))
            lines.append(str(max_time))
            lines.append(str(len(intervals)))
            for interval in intervals:
                lines.append(str(interval["start"]))
                lines.append(str(interval["end"]))
                lines.append(f'"{self._escape_text(interval["text"])}"')

        return '\n'.join(lines)

    def _create_intervals(
        self,
        annotations: List[dict],
        min_time: float,
        max_time: float
    ) -> List[dict]:
        """
        Create a complete list of intervals, filling gaps with empty intervals.

        The result is contiguous and non-overlapping: an annotation that
        starts before the previous one ended (or within the tolerance of
        it) is snapped to the previous end, and an annotation left with no
        positive duration after that is dropped.

        Args:
            annotations: Annotations sorted by start time, with
                ``start_time``/``end_time`` in milliseconds
            min_time: Start time of the TextGrid in seconds
            max_time: End time of the TextGrid in seconds

        Returns:
            List of interval dicts with start, end, and text
        """
        tolerance = self._TIME_TOLERANCE
        intervals: List[dict] = []
        current_time = min_time

        for ann in annotations:
            # Convert ms to seconds; a missing or None time counts as 0.
            start_sec = (ann.get("start_time") or 0) / 1000.0
            end_sec = (ann.get("end_time") or 0) / 1000.0
            text = ann.get("value") or ann.get("label", "")

            # Overlap or negligible gap: start exactly where the previous
            # interval ended so the tier stays contiguous.
            if start_sec <= current_time + tolerance:
                start_sec = current_time

            # Nothing left to write (zero-length, reversed, or fully covered
            # by the previous interval). Checked before emitting a gap so a
            # skipped annotation leaves no stray empty interval behind.
            if end_sec <= start_sec:
                continue

            # Add empty interval for gap
            if start_sec > current_time:
                intervals.append({
                    "start": current_time,
                    "end": start_sec,
                    "text": ""
                })

            # Add annotation interval
            intervals.append({
                "start": start_sec,
                "end": end_sec,
                "text": text
            })
            current_time = end_sec

        if current_time < max_time - tolerance:
            # Add final empty interval up to the end of the grid
            intervals.append({
                "start": current_time,
                "end": max_time,
                "text": ""
            })
        elif intervals and current_time < max_time:
            # Remaining sliver is below the tolerance: stretch the last
            # interval so the tier ends exactly at max_time.
            intervals[-1]["end"] = max_time

        # If no intervals at all, create one empty interval
        if not intervals:
            intervals.append({
                "start": min_time,
                "end": max_time,
                "text": ""
            })

        return intervals

    def _get_max_time(self, annotations: Dict[str, List[dict]]) -> float:
        """
        Get the maximum end time from all annotations in seconds.

        Args:
            annotations: Mapping of tier name to its annotation dicts, with
                ``end_time`` in milliseconds

        Returns:
            The latest end time in seconds, or 0.0 if there is none
        """
        max_time = 0.0
        for tier_anns in annotations.values():
            for ann in tier_anns or ():
                end_time = ann.get("end_time", 0)
                if end_time:
                    max_time = max(max_time, end_time / 1000.0)  # Convert ms to seconds
        return max_time

    def _escape_text(self, text: str) -> str:
        """
        Escape special characters for TextGrid format.

        A double quote inside a Praat text-file string is written as two
        double quotes. Backslashes are left untouched because they are
        ordinary characters in the file format. Newlines become spaces and
        carriage returns are removed so each value stays on one line.

        Args:
            text: Raw tier name or label; falsy values yield ""

        Returns:
            Text that is safe to place between double quotes
        """
        if not text:
            return ""
        # Labels are not guaranteed to be strings (e.g. numeric values).
        text = str(text)
        text = text.replace('"', '""')
        text = text.replace('\n', ' ').replace('\r', '')
        return text