File size: 11,574 Bytes
aceb1b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
"""
TextGrid Exporter

Exports tiered annotations to Praat TextGrid format.
TextGrid is the native format for Praat (https://www.fon.hum.uva.nl/praat/),
a tool widely used for phonetic analysis and annotation.

The TextGrid format supports:
- Interval tiers (segments with start/end times)
- Point tiers (single-point annotations)
- Multiple tiers with independent time alignments

Note: TextGrid doesn't natively support hierarchical relationships between
tiers, so the export flattens the hierarchy while preserving all annotations.
"""

import logging
import os
from typing import Dict, List, Any, Optional, Tuple

from .base import BaseExporter, ExportContext, ExportResult

logger = logging.getLogger(__name__)


class TextGridExporter(BaseExporter):
    """
    Exports tiered annotations to Praat TextGrid format.

    This exporter creates TextGrid files that can be opened in Praat
    for phonetic analysis or further annotation.
    """

    # Identifier of this export format; export() also reports it in the
    # ExportResult it returns.
    format_name = "textgrid"
    # Human-readable summary of the format.
    # NOTE(review): presumably shown by whatever lists available exporters.
    description = "Praat TextGrid format for phonetic annotation"
    # File suffix associated with this format; export() writes "*.TextGrid".
    file_extensions = [".TextGrid"]

    def can_export(self, context: ExportContext) -> Tuple[bool, str]:
        """
        Check if the context contains tiered_annotation schema.

        Args:
            context: ExportContext to validate

        Returns:
            Tuple of (can_export, reason)
        """
        for schema in context.schemas:
            if schema.get("annotation_type") == "tiered_annotation":
                return True, ""

        return False, "No tiered_annotation schema found in configuration"

    def export(
        self,
        context: ExportContext,
        output_path: str,
        options: Optional[dict] = None
    ) -> ExportResult:
        """
        Export annotations to TextGrid format.

        One ``<instance>_<schema>.TextGrid`` file is written per
        (instance, tiered schema) pair that has usable tiered data. Only the
        first annotation of an instance that carries a given schema is used.
        Tiered data that cannot be parsed, or that is not a JSON object, is
        skipped and reported in the result's ``warnings`` instead of being
        silently dropped.

        Args:
            context: ExportContext with annotation data
            output_path: Directory path for output files (created if missing)
            options: Optional settings:
                - format: "long" (default) or "short" TextGrid format
                - fill_gaps: accepted but currently ignored; gaps between
                  annotations are always filled with empty intervals

        Returns:
            ExportResult with status, file paths, warnings and stats

        Raises:
            OSError: If the output directory or a file cannot be written
        """
        # Local import (as in the original), hoisted out of the inner loop.
        import json

        options = options or {}
        files_written = []
        warnings = []
        stats = {"instances": 0, "annotations": 0, "tiers": 0}

        # Create output directory
        os.makedirs(output_path, exist_ok=True)

        use_short_format = options.get("format", "long") == "short"

        # Find tiered_annotation schemas
        tiered_schemas = [
            s for s in context.schemas
            if s.get("annotation_type") == "tiered_annotation"
        ]

        # Index annotations by instance once instead of rescanning the whole
        # annotation list for every item.
        annotations_by_instance: Dict[Any, List[dict]] = {}
        for ann in context.annotations:
            annotations_by_instance.setdefault(
                ann.get("instance_id"), []
            ).append(ann)

        for instance_id, _item in context.items.items():
            instance_annotations = annotations_by_instance.get(instance_id, [])

            for schema in tiered_schemas:
                schema_name = schema.get("name", "tiered")

                # Get tiered annotation data for this schema
                tiered_data = None
                for ann in instance_annotations:
                    # Tolerate a missing or null "labels" entry.
                    labels = ann.get("labels") or {}
                    if schema_name not in labels:
                        continue

                    try:
                        raw_value = labels[schema_name]
                        if isinstance(raw_value, str):
                            # A blank string means "nothing annotated",
                            # not corrupt data, so it produces no warning.
                            parsed = (
                                json.loads(raw_value)
                                if raw_value.strip() else None
                            )
                        else:
                            parsed = raw_value
                    except (json.JSONDecodeError, TypeError) as exc:
                        warnings.append(
                            f"Skipping schema '{schema_name}' for instance "
                            f"'{instance_id}': could not parse tiered data "
                            f"({exc})"
                        )
                    else:
                        if isinstance(parsed, dict):
                            tiered_data = parsed
                        elif parsed is not None:
                            warnings.append(
                                f"Skipping schema '{schema_name}' for "
                                f"instance '{instance_id}': expected a JSON "
                                f"object, got {type(parsed).__name__}"
                            )
                    # Only the first annotation carrying this schema counts.
                    break

                if not tiered_data:
                    continue

                # Generate TextGrid content
                content = self._create_textgrid(
                    schema,
                    tiered_data,
                    use_short_format
                )

                # Write to file; str() keeps non-string ids (e.g. ints) usable
                safe_id = "".join(
                    c if c.isalnum() or c in "-_" else "_"
                    for c in str(instance_id)
                )
                filename = f"{safe_id}_{schema_name}.TextGrid"
                filepath = os.path.join(output_path, filename)

                with open(filepath, 'w', encoding='utf-8') as f:
                    f.write(content)

                files_written.append(filepath)
                stats["instances"] += 1

                # Count annotations across all tiers of this export
                annotations = tiered_data.get("annotations", {})
                stats["annotations"] += sum(
                    len(tier_anns) for tier_anns in annotations.values()
                )
                # Reflects the tier count of the most recently exported schema
                stats["tiers"] = len(schema.get("tiers", []))

        return ExportResult(
            success=True,
            format_name=self.format_name,
            files_written=files_written,
            warnings=warnings,
            stats=stats
        )

    def _create_textgrid(
        self,
        schema: dict,
        tiered_data: dict,
        use_short_format: bool = False
    ) -> str:
        """
        Create TextGrid file content.

        Args:
            schema: The tiered_annotation schema configuration
            tiered_data: The annotation data
            use_short_format: Whether to use short TextGrid format

        Returns:
            TextGrid file content as string
        """
        tiers = schema.get("tiers", [])
        annotations = tiered_data.get("annotations", {})

        # Calculate time bounds
        min_time = 0.0
        max_time = self._get_max_time(annotations)

        if max_time == 0:
            max_time = 1.0  # Default duration if no annotations

        if use_short_format:
            return self._create_short_textgrid(tiers, annotations, min_time, max_time)
        else:
            return self._create_long_textgrid(tiers, annotations, min_time, max_time)

    def _create_long_textgrid(
        self,
        tiers: List[dict],
        annotations: Dict[str, List[dict]],
        min_time: float,
        max_time: float
    ) -> str:
        """Create long format TextGrid (more readable)."""
        lines = []
        lines.append('File type = "ooTextFile"')
        lines.append('Object class = "TextGrid"')
        lines.append('')
        lines.append(f'xmin = {min_time}')
        lines.append(f'xmax = {max_time}')
        lines.append('tiers? <exists>')
        lines.append(f'size = {len(tiers)}')
        lines.append('item []:')

        for i, tier_def in enumerate(tiers, 1):
            tier_name = tier_def["name"]
            tier_anns = annotations.get(tier_name, [])

            # Sort annotations by start time
            sorted_anns = sorted(tier_anns, key=lambda a: a.get("start_time", 0))

            # Fill gaps to create complete intervals
            intervals = self._create_intervals(sorted_anns, min_time, max_time)

            lines.append(f'    item [{i}]:')
            lines.append('        class = "IntervalTier"')
            lines.append(f'        name = "{self._escape_text(tier_name)}"')
            lines.append(f'        xmin = {min_time}')
            lines.append(f'        xmax = {max_time}')
            lines.append(f'        intervals: size = {len(intervals)}')

            for j, interval in enumerate(intervals, 1):
                lines.append(f'        intervals [{j}]:')
                lines.append(f'            xmin = {interval["start"]}')
                lines.append(f'            xmax = {interval["end"]}')
                lines.append(f'            text = "{self._escape_text(interval["text"])}"')

        return '\n'.join(lines)

    def _create_short_textgrid(
        self,
        tiers: List[dict],
        annotations: Dict[str, List[dict]],
        min_time: float,
        max_time: float
    ) -> str:
        """Create short format TextGrid (more compact)."""
        lines = []
        lines.append('File type = "ooTextFile"')
        lines.append('Object class = "TextGrid"')
        lines.append('')
        lines.append(str(min_time))
        lines.append(str(max_time))
        lines.append('<exists>')
        lines.append(str(len(tiers)))

        for tier_def in tiers:
            tier_name = tier_def["name"]
            tier_anns = annotations.get(tier_name, [])

            # Sort and create intervals
            sorted_anns = sorted(tier_anns, key=lambda a: a.get("start_time", 0))
            intervals = self._create_intervals(sorted_anns, min_time, max_time)

            lines.append('"IntervalTier"')
            lines.append(f'"{self._escape_text(tier_name)}"')
            lines.append(str(min_time))
            lines.append(str(max_time))
            lines.append(str(len(intervals)))

            for interval in intervals:
                lines.append(str(interval["start"]))
                lines.append(str(interval["end"]))
                lines.append(f'"{self._escape_text(interval["text"])}"')

        return '\n'.join(lines)

    def _create_intervals(
        self,
        annotations: List[dict],
        min_time: float,
        max_time: float
    ) -> List[dict]:
        """
        Create a complete list of intervals, filling gaps with empty intervals.

        Args:
            annotations: Sorted list of annotations
            min_time: Start time of the TextGrid
            max_time: End time of the TextGrid

        Returns:
            List of interval dicts with start, end, and text
        """
        intervals = []
        current_time = min_time

        for ann in annotations:
            start_sec = ann.get("start_time", 0) / 1000.0  # Convert ms to seconds
            end_sec = ann.get("end_time", 0) / 1000.0
            text = ann.get("value") or ann.get("label", "")

            # Add empty interval for gap
            if start_sec > current_time + 0.0001:  # Small tolerance
                intervals.append({
                    "start": current_time,
                    "end": start_sec,
                    "text": ""
                })

            # Add annotation interval
            intervals.append({
                "start": start_sec,
                "end": end_sec,
                "text": text
            })
            current_time = end_sec

        # Add final empty interval if needed
        if current_time < max_time - 0.0001:
            intervals.append({
                "start": current_time,
                "end": max_time,
                "text": ""
            })

        # If no intervals at all, create one empty interval
        if not intervals:
            intervals.append({
                "start": min_time,
                "end": max_time,
                "text": ""
            })

        return intervals

    def _get_max_time(self, annotations: Dict[str, List[dict]]) -> float:
        """Get the maximum end time from all annotations in seconds."""
        max_time = 0.0

        for tier_anns in annotations.values():
            for ann in tier_anns:
                end_time = ann.get("end_time", 0)
                if end_time:
                    max_time = max(max_time, end_time / 1000.0)  # Convert ms to seconds

        return max_time

    def _escape_text(self, text: str) -> str:
        """Escape special characters for TextGrid format."""
        if not text:
            return ""
        # Escape quotes and backslashes
        text = text.replace('\\', '\\\\')
        text = text.replace('"', '\\"')
        # Remove or replace newlines
        text = text.replace('\n', ' ').replace('\r', '')
        return text