File size: 7,031 Bytes
7d5f092
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""CONNECTED SPEECH MODULE
Coarticulation, assimilation, elision, linking, and reduction patterns.
These are the hallmarks of fluent, natural speech production.
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any

import numpy as np


@dataclass
class AssimilationEvent:
    """One assimilation detected at the boundary between two adjacent words.

    As populated by ``analyze_connected_speech``, an event is created when
    the pair (last letter of the first word, first letter of the next word)
    is a key of ``COMMON_ASSIMILATIONS``; detection is based on spelling.
    """

    # End timestamp of the first word multiplied by 1000 and truncated to int
    # (milliseconds if the timestamps are in seconds -- TODO confirm units).
    position_ms: int
    word_boundary: str     # the two words joined by a space, e.g., "ten boys" (heard as "tem boys")
    type: str              # "place", "voice", "manner", "nasalization"
    direction: str         # "progressive", "regressive", "reciprocal"; the analyzer always emits "regressive"
    expected: str          # unassimilated segment; the analyzer stores the first word's final letter
    produced: str          # assimilated segment taken from COMMON_ASSIMILATIONS
    is_target_like: bool   # the analyzer sets this to True for every table match


@dataclass
class ElisionEvent:
    """One occurrence of a form that commonly undergoes elision.

    As populated by ``analyze_connected_speech``, an event is created for
    text that matches a key of ``COMMON_ELISIONS``.
    """

    # Start timestamp of the matched text multiplied by 1000 and truncated to
    # int (milliseconds if the timestamps are in seconds -- TODO confirm units).
    position_ms: int
    word: str              # lower-cased, stripped text that matched a COMMON_ELISIONS key
    # NOTE(review): the analyzer stores the COMMON_ELISIONS value here, which
    # is the reduced spelling (e.g. "n" for "and"), not the segment removed.
    elided_segment: str
    context: str           # e.g., "last night" -> /las naɪt/; the analyzer writes "reduced form of '<word>'"
    is_natural: bool       # natural in connected speech vs. error; the analyzer always sets True


@dataclass
class LinkingEvent:
    """One linking phenomenon detected between two adjacent words.

    ``analyze_connected_speech`` creates these only when the gap between the
    two words is under 30 (after multiplying the timestamps by 1000) and the
    second word starts with one of the letters ``aeiou``.
    """

    # End timestamp of the first word multiplied by 1000 and truncated to int
    # (milliseconds if the timestamps are in seconds -- TODO confirm units).
    position_ms: int
    word_boundary: str     # the two words joined by a single space
    # The analyzer currently emits only "liaison" (vowel letter + vowel
    # letter) and "linking_r" (final "r" + vowel letter).
    link_type: str         # "liaison", "intrusive_r", "linking_r", "glottal", "resyllabification"
    description: str       # human-readable summary, e.g. "linking /r/: far -> away"


@dataclass
class ReductionEvent:
    """One reduced word.

    ``analyze_connected_speech`` creates these for words found in
    ``FUNCTION_WORDS_REDUCIBLE`` whose duration is under 150 (after
    multiplying the timestamps by 1000); it always reports a schwa reduction
    with ``vowel_reduced=True`` and ``syllable_deleted=False``.
    """

    word: str              # lower-cased, stripped word text
    full_form: str         # the analyzer stores the same text as ``word``
    reduced_form: str      # the analyzer stores the fixed label "[ə] reduced"
    vowel_reduced: bool    # True when the vowel was reduced
    syllable_deleted: bool  # True when a whole syllable was dropped
    reduction_type: str    # "schwa_reduction", "syllable_deletion", "cluster_simplification"


@dataclass
class ConnectedSpeechResult:
    """Aggregate output of ``analyze_connected_speech``.

    Holds the four lists of detected events plus summary scores derived from
    them. The analyzer rounds the scores before storing them (4 decimals,
    except ``fluency_score`` which is rounded to 2).
    """

    assimilations: list[AssimilationEvent]  # word-boundary assimilations
    elisions: list[ElisionEvent]            # matches against COMMON_ELISIONS
    linkings: list[LinkingEvent]            # short-gap links into a vowel-initial word
    reductions: list[ReductionEvent]        # short function words
    # 1 - min(1, std(diff(F1)) / 100); 0.5 when fewer than 4 F1 samples exist.
    coarticulation_index: float    # 0-1, degree of coarticulation
    # Weighted sum of the ratio, the coarticulation index, linkings per
    # boundary and reductions per word, capped at 100.
    fluency_score: float           # 0-100
    # Total detected events divided by the number of word boundaries, capped at 1.
    connected_speech_ratio: float  # proportion showing connected speech features
    # Computed as 1 - 0.5 * connected_speech_ratio, so the analyzer never
    # produces a value below 0.5.
    word_boundary_clarity: float   # 0-1, how clearly word boundaries are maintained


# Common connected speech patterns in English.
#
# Assimilation table, keyed by (final segment of the first word, initial
# segment of the next word). Each value is (segment the final one turns into,
# assimilation type). analyze_connected_speech looks pairs up using the last
# and first *letters* of the lower-cased words.
COMMON_ASSIMILATIONS = {
    # Alveolar nasal takes the place of a following bilabial.
    ("n", "b"): ("m", "place"),
    ("n", "p"): ("m", "place"),
    ("n", "m"): ("m", "place"),
    # Alveolar nasal takes the place of a following velar.
    ("n", "k"): ("ŋ", "place"),
    ("n", "g"): ("ŋ", "place"),
    # Coalescence with a following /j/ (keyed on the letter "j").
    ("d", "j"): ("dʒ", "manner"),
    ("t", "j"): ("tʃ", "manner"),
    ("s", "j"): ("ʃ", "manner"),
    ("z", "j"): ("ʒ", "manner"),
}

# Words and two-word phrases (lower-case) that are commonly shortened in
# casual speech, mapped to the spelling of their reduced form. The value is
# what analyze_connected_speech stores in ElisionEvent.elided_segment.
COMMON_ELISIONS = {
    "and": "n",
    "because": "cos",
    "going to": "gonna",
    "want to": "wanna",
    "got to": "gotta",
    "them": "em",
    "about": "bout",
}

# Lower-case function words that analyze_connected_speech treats as
# candidates for vowel reduction: a word in this set is reported as a schwa
# reduction when its duration is below the analyzer's 150 threshold.
FUNCTION_WORDS_REDUCIBLE = {
    # articles, particles, prepositions, conjunctions
    "a", "an", "the", "to", "of", "for", "and", "but", "or",
    # auxiliaries
    "is", "are", "was", "were", "has", "have", "had",
    # modals
    "can", "could", "will", "would", "shall", "should",
    # "do" and "be" forms
    "do", "does", "did", "am", "be", "been",
    # prepositions
    "at", "in", "on", "by", "from", "with",
    # pronouns and determiners
    "he", "she", "we", "they", "them", "his", "her",
}


# Letters treated as vowels by the spelling-based linking heuristics.
_VOWEL_LETTERS = "aeiou"
# Two words whose gap (timestamp difference * 1000) is below this are "linked".
_LINKING_MAX_GAP_MS = 30
# Function words whose duration (* 1000) is below this count as reduced.
_REDUCTION_MAX_DURATION_MS = 150
# Fewer F1 samples than this is too little to estimate smoothness.
_MIN_F1_SAMPLES = 4
# Std-dev of frame-to-frame F1 change that maps to a smoothness of 0.
_F1_JITTER_CEILING = 100


def _word_text(entry: dict[str, Any]) -> str:
    """Return the entry's word lower-cased and stripped ('' if missing or None)."""
    return str(entry.get("word") or "").lower().strip()


def _timestamp(entry: dict[str, Any], key: str) -> float:
    """Return the timestamp stored under *key*, or 0 when missing or None."""
    return entry.get(key) or 0


def _detect_boundary_events(
    words: list[dict[str, Any]],
) -> tuple[list[AssimilationEvent], list[LinkingEvent]]:
    """Scan adjacent word pairs for assimilation and linking events."""
    assimilations: list[AssimilationEvent] = []
    linkings: list[LinkingEvent] = []

    for current, following in zip(words, words[1:]):
        w1 = _word_text(current)
        w2 = _word_text(following)
        if not w1 or not w2:
            continue

        last_char, first_char = w1[-1], w2[0]
        boundary = f"{w1} {w2}"
        end_time = _timestamp(current, "end")
        pos = int(end_time * 1000)

        # Spelling-based lookup: the table is keyed on the boundary letters.
        match = COMMON_ASSIMILATIONS.get((last_char, first_char))
        if match is not None:
            result_phoneme, assim_type = match
            assimilations.append(AssimilationEvent(
                position_ms=pos,
                word_boundary=boundary,
                type=assim_type,
                direction="regressive",
                expected=last_char,
                produced=result_phoneme,
                is_target_like=True,
            ))

        # Linking needs a near-zero pause and a vowel-initial second word.
        gap_ms = (_timestamp(following, "start") - end_time) * 1000
        if gap_ms >= _LINKING_MAX_GAP_MS or first_char not in _VOWEL_LETTERS:
            continue
        if last_char in _VOWEL_LETTERS:
            linkings.append(LinkingEvent(
                position_ms=pos,
                word_boundary=boundary,
                link_type="liaison",
                description=f"vowel-to-vowel linking: {w1} -> {w2}",
            ))
        elif last_char == "r":
            linkings.append(LinkingEvent(
                position_ms=pos,
                word_boundary=boundary,
                link_type="linking_r",
                description=f"linking /r/: {w1} -> {w2}",
            ))

    return assimilations, linkings


def _detect_elisions(words: list[dict[str, Any]]) -> list[ElisionEvent]:
    """Match single words and two-word phrases against COMMON_ELISIONS."""
    texts = [_word_text(entry) for entry in words]
    elisions: list[ElisionEvent] = []

    for index, (entry, text) in enumerate(zip(words, texts)):
        if not text:
            continue
        candidates = [text]
        # COMMON_ELISIONS also holds two-word keys ("going to", "want to",
        # "got to"); a per-word lookup alone can never reach them.
        if index + 1 < len(texts) and texts[index + 1]:
            candidates.append(f"{text} {texts[index + 1]}")

        for phrase in candidates:
            reduced = COMMON_ELISIONS.get(phrase)
            if reduced is None:
                continue
            elisions.append(ElisionEvent(
                position_ms=int(_timestamp(entry, "start") * 1000),
                word=phrase,
                elided_segment=reduced,
                context=f"reduced form of '{phrase}'",
                is_natural=True,
            ))

    return elisions


def _detect_reductions(words: list[dict[str, Any]]) -> list[ReductionEvent]:
    """Flag reducible function words that were spoken very quickly."""
    reductions: list[ReductionEvent] = []
    for entry in words:
        text = _word_text(entry)
        if text not in FUNCTION_WORDS_REDUCIBLE:
            continue
        duration_ms = (_timestamp(entry, "end") - _timestamp(entry, "start")) * 1000
        if duration_ms < _REDUCTION_MAX_DURATION_MS:
            reductions.append(ReductionEvent(
                word=text,
                full_form=text,
                reduced_form="[ə] reduced",
                vowel_reduced=True,
                syllable_deleted=False,
                reduction_type="schwa_reduction",
            ))
    return reductions


def _coarticulation_index(f1_trajectory: list[float] | None) -> float:
    """Estimate F1 smoothness in [0, 1]; 0.5 when there is too little data."""
    if f1_trajectory is None or len(f1_trajectory) < _MIN_F1_SAMPLES:
        return 0.5
    jitter = float(np.std(np.diff(f1_trajectory)))
    return 1.0 - min(1.0, jitter / _F1_JITTER_CEILING)


def analyze_connected_speech(
    word_timestamps: list[dict[str, Any]],
    phoneme_spans: list[dict[str, Any]],
    transcript: str,
    formant_trajectories: dict[str, list[float]],
) -> ConnectedSpeechResult:
    """Analyze connected speech phenomena.

    Detection is heuristic and based on word spelling and word timing:

    * assimilation: boundary letters found in ``COMMON_ASSIMILATIONS``;
    * linking: gap under 30 ms into a vowel-initial word, after a vowel
      letter ("liaison") or a final "r" ("linking_r");
    * elision: a word, or two adjacent words joined by a space, found in
      ``COMMON_ELISIONS``;
    * reduction: a word in ``FUNCTION_WORDS_REDUCIBLE`` shorter than 150 ms.

    Args:
        word_timestamps: Word entries read through the keys ``"word"``,
            ``"start"`` and ``"end"``. Timestamps are multiplied by 1000 to
            obtain milliseconds (so they are presumably seconds -- confirm
            against the caller). ``None`` or an empty list yields no events.
            Missing or ``None`` values are treated as ``""`` / ``0``.
        phoneme_spans: Accepted for interface compatibility; not used.
        transcript: Accepted for interface compatibility; not used.
        formant_trajectories: Mapping from which only ``"f1_trajectory"`` is
            read. ``None`` is treated as an empty mapping.

    Returns:
        A ``ConnectedSpeechResult`` with the detected events and rounded
        summary scores.
    """
    words = word_timestamps or []
    # Explicit None check: a truthiness test would be wrong for array values
    # and the original crashed with AttributeError on None.
    trajectories = formant_trajectories if formant_trajectories is not None else {}

    assimilations, linkings = _detect_boundary_events(words)
    elisions = _detect_elisions(words)
    reductions = _detect_reductions(words)
    coart_index = _coarticulation_index(trajectories.get("f1_trajectory", []))

    total_features = len(assimilations) + len(elisions) + len(linkings) + len(reductions)
    total_boundaries = max(1, len(words) - 1)
    cs_ratio = min(1.0, total_features / total_boundaries)

    # Word boundary clarity (inverse of connected speech ratio)
    boundary_clarity = 1.0 - cs_ratio * 0.5

    # Fluency score: weighted blend of the four indicators, capped at 100.
    fluency = min(100.0, (
        cs_ratio * 30 +
        coart_index * 30 +
        (len(linkings) / total_boundaries) * 20 +
        (len(reductions) / max(1, len(words))) * 20
    ))

    return ConnectedSpeechResult(
        assimilations=assimilations,
        elisions=elisions,
        linkings=linkings,
        reductions=reductions,
        coarticulation_index=round(coart_index, 4),
        fluency_score=round(fluency, 2),
        connected_speech_ratio=round(cs_ratio, 4),
        word_boundary_clarity=round(boundary_clarity, 4),
    )