Spaces:
Sleeping
Sleeping
colab-user committed on
Commit ·
9b7234d
1
Parent(s): 0184e12
fix align words & text
Browse files- app/services/processor.py +31 -17
- app/services/transcription.py +2 -2
app/services/processor.py
CHANGED
|
@@ -46,24 +46,28 @@ class ProcessingResult:
|
|
| 46 |
csv_content: str = ""
|
| 47 |
|
| 48 |
|
| 49 |
-
def normalize_asr_result(result):
|
| 50 |
-
"""
|
| 51 |
-
Build text ONLY from words of THIS segment
|
| 52 |
-
"""
|
| 53 |
words = []
|
| 54 |
|
| 55 |
-
for w in result.words
|
| 56 |
-
if not w.word.strip():
|
| 57 |
continue
|
| 58 |
words.append({
|
| 59 |
-
"word": w
|
| 60 |
-
"start": float(w
|
| 61 |
-
"end": float(w
|
| 62 |
})
|
| 63 |
|
| 64 |
-
text =
|
|
|
|
| 65 |
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
|
| 69 |
|
|
@@ -167,6 +171,7 @@ class Processor:
|
|
| 167 |
|
| 168 |
# 4. Normalize speakers
|
| 169 |
raw_speakers = sorted({seg.speaker for seg in diarization_segments})
|
|
|
|
| 170 |
|
| 171 |
speaker_map = {
|
| 172 |
spk: f"Speaker {i+1}"
|
|
@@ -247,17 +252,13 @@ class Processor:
|
|
| 247 |
)
|
| 248 |
|
| 249 |
text, raw_words = normalize_asr_result(result)
|
| 250 |
-
if not raw_words:
|
| 251 |
-
speaker = diarization_segments[0].speaker
|
| 252 |
-
label = speaker_map.get(speaker, speaker)
|
| 253 |
-
role = roles.get(label, "KH")
|
| 254 |
-
|
| 255 |
processed_segments.append(
|
| 256 |
TranscriptSegment(
|
| 257 |
start=w_start,
|
| 258 |
end=w_end,
|
| 259 |
speaker=label,
|
| 260 |
-
role=
|
| 261 |
text=text
|
| 262 |
)
|
| 263 |
)
|
|
@@ -289,6 +290,19 @@ class Processor:
|
|
| 289 |
word_objs,
|
| 290 |
diarization_segments
|
| 291 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
# ===== MAP WORD → ROLE =====
|
| 294 |
for seg in aligned_segments:
|
|
|
|
| 46 |
csv_content: str = ""
|
| 47 |
|
| 48 |
|
| 49 |
+
def normalize_asr_result(result: dict):
|
|
|
|
|
|
|
|
|
|
| 50 |
words = []
|
| 51 |
|
| 52 |
+
for w in result.get("words", []):
|
| 53 |
+
if not w.get("word", "").strip():
|
| 54 |
continue
|
| 55 |
words.append({
|
| 56 |
+
"word": w["word"].strip(),
|
| 57 |
+
"start": float(w["start"]),
|
| 58 |
+
"end": float(w["end"]),
|
| 59 |
})
|
| 60 |
|
| 61 |
+
text = result.get("text", "").strip()
|
| 62 |
+
return text, words
|
| 63 |
|
| 64 |
+
|
| 65 |
+
def guess_speaker_by_time(start, end, diarization_segments):
|
| 66 |
+
mid = (start + end) / 2
|
| 67 |
+
for d in diarization_segments:
|
| 68 |
+
if d.start <= mid <= d.end:
|
| 69 |
+
return d.speaker
|
| 70 |
+
return diarization_segments[0].speaker
|
| 71 |
|
| 72 |
|
| 73 |
|
|
|
|
| 171 |
|
| 172 |
# 4. Normalize speakers
|
| 173 |
raw_speakers = sorted({seg.speaker for seg in diarization_segments})
|
| 174 |
+
raw_speakers = guess_speaker_by_time(w_start, w_end, diarization_segments)
|
| 175 |
|
| 176 |
speaker_map = {
|
| 177 |
spk: f"Speaker {i+1}"
|
|
|
|
| 252 |
)
|
| 253 |
|
| 254 |
text, raw_words = normalize_asr_result(result)
|
| 255 |
+
if text and not raw_words:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
processed_segments.append(
|
| 257 |
TranscriptSegment(
|
| 258 |
start=w_start,
|
| 259 |
end=w_end,
|
| 260 |
speaker=label,
|
| 261 |
+
role=roles.get(label, "KH"),
|
| 262 |
text=text
|
| 263 |
)
|
| 264 |
)
|
|
|
|
| 290 |
word_objs,
|
| 291 |
diarization_segments
|
| 292 |
)
|
| 293 |
+
|
| 294 |
+
if raw_words and not aligned_segments:
|
| 295 |
+
processed_segments.append(
|
| 296 |
+
TranscriptSegment(
|
| 297 |
+
start=w_start,
|
| 298 |
+
end=w_end,
|
| 299 |
+
speaker=speakers[0],
|
| 300 |
+
role=roles[speakers[0]],
|
| 301 |
+
text=text
|
| 302 |
+
)
|
| 303 |
+
)
|
| 304 |
+
continue
|
| 305 |
+
|
| 306 |
|
| 307 |
# ===== MAP WORD → ROLE =====
|
| 308 |
for seg in aligned_segments:
|
app/services/transcription.py
CHANGED
|
@@ -217,8 +217,8 @@ class TranscriptionService:
|
|
| 217 |
initial_prompt: Optional[str] = None,
|
| 218 |
prefix_text: Optional[str] = None,
|
| 219 |
condition_on_previous_text: bool = False,
|
| 220 |
-
no_speech_threshold: float = 0.
|
| 221 |
-
log_prob_threshold: float = -
|
| 222 |
compression_ratio_threshold: float = 2.3,
|
| 223 |
) -> Dict:
|
| 224 |
"""
|
|
|
|
| 217 |
initial_prompt: Optional[str] = None,
|
| 218 |
prefix_text: Optional[str] = None,
|
| 219 |
condition_on_previous_text: bool = False,
|
| 220 |
+
no_speech_threshold: float = 0.3,
|
| 221 |
+
log_prob_threshold: float = -2.0,
|
| 222 |
compression_ratio_threshold: float = 2.3,
|
| 223 |
) -> Dict:
|
| 224 |
"""
|