hon9kon9ize
/

wav2vec2bert-jyutping

Safetensors

wav2vec2-bert

Model card Files Files and versions

xet

Community

indiejoseph committed on Nov 25, 2025

Commit

ddd91c4

verified ·

1 Parent(s): 08663a8

Update handler.py

Browse files

Files changed (1) hide show

handler.py +11 -9

handler.py CHANGED Viewed

@@ -3,6 +3,7 @@ import re
 from itertools import groupby
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union, Dict, List, Any
 import torch
 import torch.nn as nn
 from transformers.modeling_outputs import ModelOutput
@@ -47,8 +48,11 @@ ONSETS = {
 class SpeechToJyutpingPipeline(Pipeline):
     def _sanitize_parameters(self, **kwargs):
         self.tone_tokenizer = Wav2Vec2CTCTokenizer(
-            "tone_vocab.json",
             unk_token="[UNK]",
             pad_token="[PAD]",
             word_delimiter_token="|",
@@ -95,7 +99,6 @@ class SpeechToJyutpingPipeline(Pipeline):
         sample_rate = 16000
         symbols = [w for w in transcription.split(" ") if len(w) > 0]
-        duration_sec = model_outputs["duration"] / sample_rate
         ids_w_index = [(i, _id.item()) for i, _id in enumerate(predicted_ids[0])]
         # remove entries which are just "padding" (i.e. no characters are recognized)
@@ -151,7 +154,7 @@ class SpeechToJyutpingPipeline(Pipeline):
         transcription = re.sub(
             r"\s+", " ", "".join(transcription).replace("_", " ").strip()
         )
-        tone_probs = torch.stack(tone_probs).cpu().numpy()
         return {"transcription": transcription, "tone_probs": tone_probs}
@@ -388,15 +391,14 @@ class Wav2Vec2BertForCantonese(Wav2Vec2BertPreTrainedModel):
 class EndpointHandler:
-    def __init__(self, path="hon9kon9ize/wav2vec2bert-jyutping"):
-        feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained(path)
-        tokenizer = Wav2Vec2CTCTokenizer(
-            "vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|"
-        )
         self.pipeline = pipeline(
             task="speech-to-jyutping",
-            model=Wav2Vec2BertForCantonese.from_pretrained(path),
             feature_extractor=feature_extractor,
             tokenizer=tokenizer,
         )

 from itertools import groupby
 from dataclasses import dataclass
 from typing import Optional, Tuple, Union, Dict, List, Any
+from huggingface_hub import hf_hub_download
 import torch
 import torch.nn as nn
 from transformers.modeling_outputs import ModelOutput
 class SpeechToJyutpingPipeline(Pipeline):
     def _sanitize_parameters(self, **kwargs):
+        tone_vocab_file = hf_hub_download(
+            repo_id="hon9kon9ize/wav2vec2bert-jyutping", filename="tone_vocab.json"
+        )
         self.tone_tokenizer = Wav2Vec2CTCTokenizer(
+            tone_vocab_file,
             unk_token="[UNK]",
             pad_token="[PAD]",
             word_delimiter_token="|",
         sample_rate = 16000
         symbols = [w for w in transcription.split(" ") if len(w) > 0]
         ids_w_index = [(i, _id.item()) for i, _id in enumerate(predicted_ids[0])]
         # remove entries which are just "padding" (i.e. no characters are recognized)
         transcription = re.sub(
             r"\s+", " ", "".join(transcription).replace("_", " ").strip()
         )
+        tone_probs = torch.stack(tone_probs).cpu().tolist()
         return {"transcription": transcription, "tone_probs": tone_probs}
 class EndpointHandler:
+    def __init__(self, path="."):
+        model_path = "hon9kon9ize/wav2vec2bert-jyutping"
+        feature_extractor = SeamlessM4TFeatureExtractor.from_pretrained(model_path)
+        tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(model_path)
         self.pipeline = pipeline(
             task="speech-to-jyutping",
+            model=Wav2Vec2BertForCantonese.from_pretrained(model_path),
             feature_extractor=feature_extractor,
             tokenizer=tokenizer,
         )