Capstone04 committed on
Commit
24ed202
·
verified ·
1 Parent(s): 83ddb5c

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. asr_diarization/pipeline.py +15 -3
asr_diarization/pipeline.py CHANGED
@@ -406,16 +406,28 @@ class ASR_Diarization:
406
  results["DER"] = round(der_score * 100, 2)
407
 
408
  if ref_json and os.path.exists(hyp_json):
409
- def load_words(path):
 
410
  data = json.load(open(path))
411
  # Filter out NSE events for WER calculation (only use speech)
412
  speech_segments = [seg for seg in data if seg.get("speaker") != "NSE"]
413
  # NEW: Directly use segment text instead of tokens
414
  return " ".join([seg["text"] for seg in speech_segments])
415
 
416
- ref_text, hyp_text = load_words(ref_json), load_words(hyp_json)
 
 
 
 
 
 
 
 
 
 
 
417
  transform = Compose([ToLowerCase(), RemovePunctuation(),
418
- RemoveMultipleSpaces(), Strip()])
419
  results["WER_raw"] = round(wer(ref_text, hyp_text), 4)
420
  results["WER_normalized"] = round(wer(transform(ref_text), transform(hyp_text)), 4)
421
 
 
406
  results["DER"] = round(der_score * 100, 2)
407
 
408
  if ref_json and os.path.exists(hyp_json):
409
def load_words_from_hypothesis(path):
    """Load text from YOUR pipeline output (has 'text' field).

    Reads the hypothesis JSON at *path*, drops non-speech ("NSE")
    segments, and returns the remaining segments' 'text' fields joined
    with single spaces.

    Args:
        path: Filesystem path to the hypothesis JSON (a list of segment
            dicts, each with 'speaker' and 'text' keys).

    Returns:
        A single space-joined transcript string (empty if no speech).
    """
    # FIX: json.load(open(path)) leaked the file handle until GC;
    # a context manager closes it deterministically.
    with open(path) as fh:
        data = json.load(fh)
    # Filter out NSE events for WER calculation (only use speech)
    speech_segments = [seg for seg in data if seg.get("speaker") != "NSE"]
    # Directly use segment text instead of tokens
    return " ".join(seg["text"] for seg in speech_segments)
416
 
417
def load_words_from_reference(path):
    """Load text from REFERENCE file (has 'tokens' field).

    Reads the reference JSON at *path*, drops non-speech ("NSE")
    segments, and returns every token's 'text' — flattened across the
    remaining segments, in order — joined with single spaces.

    Args:
        path: Filesystem path to the reference JSON (a list of segment
            dicts, each with 'speaker' and a 'tokens' list of dicts
            carrying a 'text' key).

    Returns:
        A single space-joined transcript string (empty if no speech).
    """
    # FIX: json.load(open(path)) leaked the file handle until GC;
    # a context manager closes it deterministically.
    with open(path) as fh:
        data = json.load(fh)
    # Filter out NSE events for WER calculation (only use speech)
    speech_segments = [seg for seg in data if seg.get("speaker") != "NSE"]
    # Reference format stores per-token text under 'tokens', not a direct 'text'
    return " ".join(tok["text"] for seg in speech_segments for tok in seg["tokens"])
424
+
425
+ # Use appropriate loader for each file
426
+ ref_text = load_words_from_reference(ref_json)
427
+ hyp_text = load_words_from_hypothesis(hyp_json)
428
+
429
  transform = Compose([ToLowerCase(), RemovePunctuation(),
430
+ RemoveMultipleSpaces(), Strip()])
431
  results["WER_raw"] = round(wer(ref_text, hyp_text), 4)
432
  results["WER_normalized"] = round(wer(transform(ref_text), transform(hyp_text)), 4)
433