Merge branch 'vad'
Browse files* vad:
[fix]: update view.
update vad parameter
change to vad audio translate
add whisper fine tune for chinese
vad parameters v1 test
fix vad bug
[fix]: update installation.
fix vad buf
- config.py +6 -5
- frontend/assets/{index-eff0154e.css → index-2c7aa850.css} +1 -1
- frontend/assets/{index-0364c095.js → index-640e640f.js} +0 -0
- frontend/index.html +2 -2
- main.py +2 -0
- moyoyo_asr_models/ggml-small-encoder.mlmodelc/analytics/coremldata.bin +3 -0
- moyoyo_asr_models/ggml-small-encoder.mlmodelc/coremldata.bin +3 -0
- moyoyo_asr_models/ggml-small-encoder.mlmodelc/metadata.json +64 -0
- moyoyo_asr_models/ggml-small-encoder.mlmodelc/model.mil +0 -0
- moyoyo_asr_models/ggml-small-encoder.mlmodelc/weights/weight.bin +3 -0
- moyoyo_asr_models/ggml-small.bin +3 -0
- transcribe/helpers/vadprocessor.py +262 -242
- transcribe/helpers/whisper.py +8 -4
- transcribe/pipelines/__init__.py +1 -1
- transcribe/pipelines/pipe_translate.py +3 -0
- transcribe/pipelines/pipe_vad.py +21 -79
- transcribe/pipelines/pipe_whisper.py +12 -2
- transcribe/strategy.py +1 -1
- transcribe/translatepipes.py +22 -11
- transcribe/whisper_llm_serve.py +62 -103
config.py
CHANGED
|
@@ -2,7 +2,7 @@ import pathlib
|
|
| 2 |
import re
|
| 3 |
import logging
|
| 4 |
|
| 5 |
-
DEBUG =
|
| 6 |
TEST = False
|
| 7 |
logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
|
| 8 |
|
|
@@ -10,7 +10,7 @@ logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
|
|
| 10 |
logging.basicConfig(
|
| 11 |
level=logging.DEBUG if DEBUG else logging.INFO,
|
| 12 |
format="%(asctime)s - %(levelname)s - %(message)s",
|
| 13 |
-
filename='translator.log',
|
| 14 |
datefmt="%H:%M:%S"
|
| 15 |
)
|
| 16 |
|
|
@@ -50,9 +50,10 @@ MAX_LENTH_ZH = 4
|
|
| 50 |
WHISPER_PROMPT_EN = ""# "The following is an English sentence."
|
| 51 |
MAX_LENGTH_EN= 8
|
| 52 |
|
| 53 |
-
|
| 54 |
-
WHISPER_MODEL = 'large-v3-turbo-q5_0'
|
| 55 |
-
|
|
|
|
| 56 |
# LLM
|
| 57 |
LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
|
| 58 |
LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
|
|
|
|
| 2 |
import re
|
| 3 |
import logging
|
| 4 |
|
| 5 |
+
DEBUG = True
|
| 6 |
TEST = False
|
| 7 |
logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
|
| 8 |
|
|
|
|
| 10 |
logging.basicConfig(
|
| 11 |
level=logging.DEBUG if DEBUG else logging.INFO,
|
| 12 |
format="%(asctime)s - %(levelname)s - %(message)s",
|
| 13 |
+
filename='translator.log',
|
| 14 |
datefmt="%H:%M:%S"
|
| 15 |
)
|
| 16 |
|
|
|
|
| 50 |
WHISPER_PROMPT_EN = ""# "The following is an English sentence."
|
| 51 |
MAX_LENGTH_EN= 8
|
| 52 |
|
| 53 |
+
WHISPER_MODEL_EN = 'medium-q5_0'
|
| 54 |
+
# WHISPER_MODEL = 'large-v3-turbo-q5_0'
|
| 55 |
+
# WHISPER_MODEL_ZH = 'small'
|
| 56 |
+
WHISPER_MODEL_ZH = 'large-v3-turbo-q5_0'
|
| 57 |
# LLM
|
| 58 |
LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
|
| 59 |
LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
|
frontend/assets/{index-eff0154e.css → index-2c7aa850.css}
RENAMED
|
@@ -1 +1 @@
|
|
| 1 |
-
html,body{width:100%;height:100%}input::-ms-clear,input::-ms-reveal{display:none}*,*:before,*:after{box-sizing:border-box}html{font-family:sans-serif;line-height:1.15;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-overflow-style:scrollbar;-webkit-tap-highlight-color:rgba(0,0,0,0)}body{margin:0}[tabindex="-1"]:focus{outline:none}hr{box-sizing:content-box;height:0;overflow:visible}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5em;font-weight:500}p{margin-top:0;margin-bottom:1em}abbr[title],abbr[data-original-title]{-webkit-text-decoration:underline dotted;text-decoration:underline;text-decoration:underline dotted;border-bottom:0;cursor:help}address{margin-bottom:1em;font-style:normal;line-height:inherit}input[type=text],input[type=password],input[type=number],textarea{-webkit-appearance:none}ol,ul,dl{margin-top:0;margin-bottom:1em}ol ol,ul ul,ol ul,ul ol{margin-bottom:0}dt{font-weight:500}dd{margin-bottom:.5em;margin-left:0}blockquote{margin:0 0 1em}dfn{font-style:italic}b,strong{font-weight:bolder}small{font-size:80%}sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}pre,code,kbd,samp{font-size:1em;font-family:SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace}pre{margin-top:0;margin-bottom:1em;overflow:auto}figure{margin:0 0 1em}img{vertical-align:middle;border-style:none}a,area,button,[role=button],input:not([type=range]),label,select,summary,textarea{touch-action:manipulation}table{border-collapse:collapse}caption{padding-top:.75em;padding-bottom:.3em;text-align:left;caption-side:bottom}input,button,select,optgroup,textarea{margin:0;color:inherit;font-size:inherit;font-family:inherit;line-height:inherit}button,input{overflow:visible}button,select{text-transform:none}button,html 
[type=button],[type=reset],[type=submit]{-webkit-appearance:button}button::-moz-focus-inner,[type=button]::-moz-focus-inner,[type=reset]::-moz-focus-inner,[type=submit]::-moz-focus-inner{padding:0;border-style:none}input[type=radio],input[type=checkbox]{box-sizing:border-box;padding:0}input[type=date],input[type=time],input[type=datetime-local],input[type=month]{-webkit-appearance:listbox}textarea{overflow:auto;resize:vertical}fieldset{min-width:0;margin:0;padding:0;border:0}legend{display:block;width:100%;max-width:100%;margin-bottom:.5em;padding:0;color:inherit;font-size:1.5em;line-height:inherit;white-space:normal}progress{vertical-align:baseline}[type=number]::-webkit-inner-spin-button,[type=number]::-webkit-outer-spin-button{height:auto}[type=search]{outline-offset:-2px;-webkit-appearance:none}[type=search]::-webkit-search-cancel-button,[type=search]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}summary{display:list-item}template{display:none}[hidden]{display:none!important}mark{padding:.2em;background-color:#feffe6}:root{font-family:Inter,system-ui,Avenir,Helvetica,Arial,sans-serif;line-height:1.5;font-weight:400;color-scheme:light dark;color:#ffffffde;background-color:#242424;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%}a{font-weight:500;color:#646cff;text-decoration:inherit}a:hover{color:#535bf2}body{margin:0;display:flex;place-items:center;min-width:320px;height:auto;min-height:auto;color:#333;background:#fff}h1{font-size:3.2em;line-height:1.1}button{border-radius:8px;border:1px solid transparent;padding:.6em 1.2em;font-size:1em;font-weight:500;font-family:inherit;background-color:#1a1a1a;cursor:pointer;transition:border-color .25s}.card{border-bottom:solid 2px 
lightgray;align-items:center;justify-content:center;margin-top:40px;display:flex;max-width:1024px;width:100%}.seg-title{margin:24px 0;font-size:20px;font-weight:500}.seg-co{width:1022px;text-align:left;border-left:solid 6px midnightblue;padding-left:8px;margin-left:2px;margin-top:36px;line-height:24px}#app{margin:0 auto;padding:0;text-align:center;width:100%}.ant-btn{padding:4px 12px}@media (prefers-color-scheme: light){:root{color:#213547;background-color:#fff}a:hover{color:#747bff}button{background-color:#f9f9f9}}.ant-card{background:#f5f6fa}.ant-card .ant-card-actions{background-color:#e8e8f8cc!important}.ant-popover{max-width:800px!important}.ant-form-item{background:transparent;margin-bottom:40px!important}.ant-form-item .ant-form-item-explain-error{color:#ff4d4f;text-align:left!important}.ant-form-item-label label{font-size:18px!important;color:#1a1a1a!important;font-weight:500!important}.ant-tooltip{max-width:1022px!important}.ant-page-header-heading{width:1022px!important}.highlight{background:ghostwhite}.content[data-v-178d5f9f]{background-color:#fff;max-width:1280px;min-height:720px;margin:0 auto;display:flex;flex-direction:column;align-items:center;justify-content:space-between}.not-found-wrapper[data-v-aef52a59]{height:calc(100vh - 104px)}.view-wrapper[data-v-
|
|
|
|
| 1 |
+
html,body{width:100%;height:100%}input::-ms-clear,input::-ms-reveal{display:none}*,*:before,*:after{box-sizing:border-box}html{font-family:sans-serif;line-height:1.15;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-overflow-style:scrollbar;-webkit-tap-highlight-color:rgba(0,0,0,0)}body{margin:0}[tabindex="-1"]:focus{outline:none}hr{box-sizing:content-box;height:0;overflow:visible}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5em;font-weight:500}p{margin-top:0;margin-bottom:1em}abbr[title],abbr[data-original-title]{-webkit-text-decoration:underline dotted;text-decoration:underline;text-decoration:underline dotted;border-bottom:0;cursor:help}address{margin-bottom:1em;font-style:normal;line-height:inherit}input[type=text],input[type=password],input[type=number],textarea{-webkit-appearance:none}ol,ul,dl{margin-top:0;margin-bottom:1em}ol ol,ul ul,ol ul,ul ol{margin-bottom:0}dt{font-weight:500}dd{margin-bottom:.5em;margin-left:0}blockquote{margin:0 0 1em}dfn{font-style:italic}b,strong{font-weight:bolder}small{font-size:80%}sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}pre,code,kbd,samp{font-size:1em;font-family:SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace}pre{margin-top:0;margin-bottom:1em;overflow:auto}figure{margin:0 0 1em}img{vertical-align:middle;border-style:none}a,area,button,[role=button],input:not([type=range]),label,select,summary,textarea{touch-action:manipulation}table{border-collapse:collapse}caption{padding-top:.75em;padding-bottom:.3em;text-align:left;caption-side:bottom}input,button,select,optgroup,textarea{margin:0;color:inherit;font-size:inherit;font-family:inherit;line-height:inherit}button,input{overflow:visible}button,select{text-transform:none}button,html 
[type=button],[type=reset],[type=submit]{-webkit-appearance:button}button::-moz-focus-inner,[type=button]::-moz-focus-inner,[type=reset]::-moz-focus-inner,[type=submit]::-moz-focus-inner{padding:0;border-style:none}input[type=radio],input[type=checkbox]{box-sizing:border-box;padding:0}input[type=date],input[type=time],input[type=datetime-local],input[type=month]{-webkit-appearance:listbox}textarea{overflow:auto;resize:vertical}fieldset{min-width:0;margin:0;padding:0;border:0}legend{display:block;width:100%;max-width:100%;margin-bottom:.5em;padding:0;color:inherit;font-size:1.5em;line-height:inherit;white-space:normal}progress{vertical-align:baseline}[type=number]::-webkit-inner-spin-button,[type=number]::-webkit-outer-spin-button{height:auto}[type=search]{outline-offset:-2px;-webkit-appearance:none}[type=search]::-webkit-search-cancel-button,[type=search]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}summary{display:list-item}template{display:none}[hidden]{display:none!important}mark{padding:.2em;background-color:#feffe6}:root{font-family:Inter,system-ui,Avenir,Helvetica,Arial,sans-serif;line-height:1.5;font-weight:400;color-scheme:light dark;color:#ffffffde;background-color:#242424;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%}a{font-weight:500;color:#646cff;text-decoration:inherit}a:hover{color:#535bf2}body{margin:0;display:flex;place-items:center;min-width:320px;height:auto;min-height:auto;color:#333;background:#fff}h1{font-size:3.2em;line-height:1.1}button{border-radius:8px;border:1px solid transparent;padding:.6em 1.2em;font-size:1em;font-weight:500;font-family:inherit;background-color:#1a1a1a;cursor:pointer;transition:border-color .25s}.card{border-bottom:solid 2px 
lightgray;align-items:center;justify-content:center;margin-top:40px;display:flex;max-width:1024px;width:100%}.seg-title{margin:24px 0;font-size:20px;font-weight:500}.seg-co{width:1022px;text-align:left;border-left:solid 6px midnightblue;padding-left:8px;margin-left:2px;margin-top:36px;line-height:24px}#app{margin:0 auto;padding:0;text-align:center;width:100%}.ant-btn{padding:4px 12px}@media (prefers-color-scheme: light){:root{color:#213547;background-color:#fff}a:hover{color:#747bff}button{background-color:#f9f9f9}}.ant-card{background:#f5f6fa}.ant-card .ant-card-actions{background-color:#e8e8f8cc!important}.ant-popover{max-width:800px!important}.ant-form-item{background:transparent;margin-bottom:40px!important}.ant-form-item .ant-form-item-explain-error{color:#ff4d4f;text-align:left!important}.ant-form-item-label label{font-size:18px!important;color:#1a1a1a!important;font-weight:500!important}.ant-tooltip{max-width:1022px!important}.ant-page-header-heading{width:1022px!important}.highlight{background:ghostwhite}.content[data-v-178d5f9f]{background-color:#fff;max-width:1280px;min-height:720px;margin:0 auto;display:flex;flex-direction:column;align-items:center;justify-content:space-between}.not-found-wrapper[data-v-aef52a59]{height:calc(100vh - 104px)}.config-content[data-v-ba23d083]{width:420px;margin:12px}.config-content .config-block[data-v-ba23d083]{margin:12px;padding-bottom:12px}.view-wrapper[data-v-ba23d083]{width:100%;height:100%;background-color:#fff}.view-wrapper .wrapper-width-fixed[data-v-ba23d083]{width:1280px}.view-wrapper .wrapper-width-auto[data-v-ba23d083]{width:100vw}.view-wrapper .content-wrapper[data-v-ba23d083]{text-align:left;max-width:100vw;min-width:320px;margin-bottom:64px;min-height:calc(100vh - 438px)}.view-wrapper .content-wrapper .chat-box[data-v-ba23d083]{width:100%;height:54vh;border-radius:4px;padding:12px;color:#2e2f33;font-size:18px}.view-wrapper .content-wrapper 
.chat-box-placeholder[data-v-ba23d083]{width:100%;height:58vh;border-radius:4px;padding:12px;font-size:18px;color:#a4a6ac}.view-wrapper .content-wrapper .actions-box[data-v-ba23d083]{display:flex;align-items:center;justify-content:space-between;margin:0 24px;height:48px}.view-wrapper .content-wrapper .actions-box .left-actions[data-v-ba23d083]{display:flex;align-items:center;justify-content:space-between;width:288px}.view-wrapper .content-wrapper .trans-list[data-v-ba23d083]{overflow-y:auto;width:100%;height:58vh;scrollbar-width:none;-ms-overflow-style:none}.view-wrapper .content-wrapper .trans-list[data-v-ba23d083]::-webkit-scrollbar{display:none}.view-wrapper .content-wrapper .trans-list .node[data-v-ba23d083]{margin-bottom:36px;width:100%!important;transition:all .3s ease}.view-wrapper .content-wrapper .trans-list .node .trans-time[data-v-ba23d083]{font-size:14px;color:#c4c6cc}.view-wrapper .content-wrapper .trans-list .node .trans-font-size-16[data-v-ba23d083]{font-size:16px}.view-wrapper .content-wrapper .trans-list .node .trans-font-size-18[data-v-ba23d083]{font-size:18px}.view-wrapper .content-wrapper .trans-list .node .trans-font-size-20[data-v-ba23d083]{font-size:20px}.view-wrapper .content-wrapper .trans-list .node .trans-font-size-22[data-v-ba23d083]{font-size:22px}.view-wrapper .content-wrapper .trans-list .node .trans-font-size-24[data-v-ba23d083]{font-size:24px}.view-wrapper .content-wrapper .trans-list .node .trans-src-lang[data-v-ba23d083]{color:#909299;font-weight:500}.view-wrapper .content-wrapper .trans-list .node .trans-dst-lang[data-v-ba23d083]{color:#2e2f33;font-weight:600}.view-wrapper .content-wrapper .trans-list .current_node[data-v-ba23d083]{background-color:#f0f1f7;padding:4px 8px}@keyframes highlight-ba23d083{0%{background-color:transparent}50%{background-color:#fff1ce80}to{background-color:transparent}}@keyframes 
slideIn-ba23d083{0%{opacity:0;transform:translateY(10px)}to{opacity:1;transform:translateY(0)}}.content-wrapper[data-v-c39ab0d6]{text-align:left;max-width:800px;min-width:320px;margin-bottom:64px;min-height:calc(100vh - 438px)}.content-wrapper .content-box[data-v-c39ab0d6]{padding:24px;height:240px;background-color:#e8e8e8;border-radius:16px;width:50%;margin:48px auto;min-width:300px}.content-wrapper .video-box[data-v-c39ab0d6]{max-width:800px;min-width:320px;width:90vw;height:auto}
|
frontend/assets/{index-0364c095.js → index-640e640f.js}
RENAMED
|
The diff for this file is too large to render.
See raw diff
|
|
|
frontend/index.html
CHANGED
|
@@ -5,8 +5,8 @@
|
|
| 5 |
<link rel="icon" type="image/svg+xml" href="./favicon.ico" />
|
| 6 |
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 7 |
<title>Translator</title>
|
| 8 |
-
<script type="module" crossorigin src="./assets/index-
|
| 9 |
-
<link rel="stylesheet" href="./assets/index-
|
| 10 |
</head>
|
| 11 |
<body>
|
| 12 |
<div id="app"></div>
|
|
|
|
| 5 |
<link rel="icon" type="image/svg+xml" href="./favicon.ico" />
|
| 6 |
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
| 7 |
<title>Translator</title>
|
| 8 |
+
<script type="module" crossorigin src="./assets/index-640e640f.js"></script>
|
| 9 |
+
<link rel="stylesheet" href="./assets/index-2c7aa850.css">
|
| 10 |
</head>
|
| 11 |
<body>
|
| 12 |
<div id="app"></div>
|
main.py
CHANGED
|
@@ -57,6 +57,7 @@ async def root():
|
|
| 57 |
async def translate(websocket: WebSocket):
|
| 58 |
query_parameters_dict = websocket.query_params
|
| 59 |
from_lang, to_lang = query_parameters_dict.get('from'), query_parameters_dict.get('to')
|
|
|
|
| 60 |
client = WhisperTranscriptionService(
|
| 61 |
websocket,
|
| 62 |
pipe,
|
|
@@ -64,6 +65,7 @@ async def translate(websocket: WebSocket):
|
|
| 64 |
client_uid=f"{uuid1()}",
|
| 65 |
)
|
| 66 |
|
|
|
|
| 67 |
if from_lang and to_lang:
|
| 68 |
client.set_language(from_lang, to_lang)
|
| 69 |
logger.info(f"Source lange: {from_lang} -> Dst lange: {to_lang}")
|
|
|
|
| 57 |
async def translate(websocket: WebSocket):
|
| 58 |
query_parameters_dict = websocket.query_params
|
| 59 |
from_lang, to_lang = query_parameters_dict.get('from'), query_parameters_dict.get('to')
|
| 60 |
+
|
| 61 |
client = WhisperTranscriptionService(
|
| 62 |
websocket,
|
| 63 |
pipe,
|
|
|
|
| 65 |
client_uid=f"{uuid1()}",
|
| 66 |
)
|
| 67 |
|
| 68 |
+
|
| 69 |
if from_lang and to_lang:
|
| 70 |
client.set_language(from_lang, to_lang)
|
| 71 |
logger.info(f"Source lange: {from_lang} -> Dst lange: {to_lang}")
|
moyoyo_asr_models/ggml-small-encoder.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:18ad2072ae82872c2ba8a187071e1e7d6c1105253685e7aa95138adcf07874e0
|
| 3 |
+
size 207
|
moyoyo_asr_models/ggml-small-encoder.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:05fe28591b40616fa0c34ad7b853133623f5300923ec812acb11459c411acf3b
|
| 3 |
+
size 149
|
moyoyo_asr_models/ggml-small-encoder.mlmodelc/metadata.json
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"metadataOutputVersion" : "3.0",
|
| 4 |
+
"storagePrecision" : "Float16",
|
| 5 |
+
"outputSchema" : [
|
| 6 |
+
{
|
| 7 |
+
"hasShapeFlexibility" : "0",
|
| 8 |
+
"isOptional" : "0",
|
| 9 |
+
"dataType" : "Float32",
|
| 10 |
+
"formattedType" : "MultiArray (Float32)",
|
| 11 |
+
"shortDescription" : "",
|
| 12 |
+
"shape" : "[]",
|
| 13 |
+
"name" : "output",
|
| 14 |
+
"type" : "MultiArray"
|
| 15 |
+
}
|
| 16 |
+
],
|
| 17 |
+
"modelParameters" : [
|
| 18 |
+
|
| 19 |
+
],
|
| 20 |
+
"specificationVersion" : 6,
|
| 21 |
+
"mlProgramOperationTypeHistogram" : {
|
| 22 |
+
"Linear" : 72,
|
| 23 |
+
"Matmul" : 24,
|
| 24 |
+
"Cast" : 2,
|
| 25 |
+
"Conv" : 2,
|
| 26 |
+
"Softmax" : 12,
|
| 27 |
+
"Add" : 25,
|
| 28 |
+
"LayerNorm" : 25,
|
| 29 |
+
"Mul" : 24,
|
| 30 |
+
"Transpose" : 49,
|
| 31 |
+
"Gelu" : 14,
|
| 32 |
+
"Reshape" : 48
|
| 33 |
+
},
|
| 34 |
+
"computePrecision" : "Mixed (Float16, Float32, Int32)",
|
| 35 |
+
"isUpdatable" : "0",
|
| 36 |
+
"availability" : {
|
| 37 |
+
"macOS" : "12.0",
|
| 38 |
+
"tvOS" : "15.0",
|
| 39 |
+
"watchOS" : "8.0",
|
| 40 |
+
"iOS" : "15.0",
|
| 41 |
+
"macCatalyst" : "15.0"
|
| 42 |
+
},
|
| 43 |
+
"modelType" : {
|
| 44 |
+
"name" : "MLModelType_mlProgram"
|
| 45 |
+
},
|
| 46 |
+
"userDefinedMetadata" : {
|
| 47 |
+
|
| 48 |
+
},
|
| 49 |
+
"inputSchema" : [
|
| 50 |
+
{
|
| 51 |
+
"hasShapeFlexibility" : "0",
|
| 52 |
+
"isOptional" : "0",
|
| 53 |
+
"dataType" : "Float32",
|
| 54 |
+
"formattedType" : "MultiArray (Float32 1 × 80 × 3000)",
|
| 55 |
+
"shortDescription" : "",
|
| 56 |
+
"shape" : "[1, 80, 3000]",
|
| 57 |
+
"name" : "logmel_data",
|
| 58 |
+
"type" : "MultiArray"
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
"generatedClassName" : "coreml_encoder_small",
|
| 62 |
+
"method" : "predict"
|
| 63 |
+
}
|
| 64 |
+
]
|
moyoyo_asr_models/ggml-small-encoder.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
moyoyo_asr_models/ggml-small-encoder.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:87eed4ae76f11a2d4a50786bc7423d4b45c2d0d9ca05577a3bd2557452072eaf
|
| 3 |
+
size 176339456
|
moyoyo_asr_models/ggml-small.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3f6ef171491de375b741059400ba9a0aead023122b7a7db731b4943f9baa0f97
|
| 3 |
+
size 487601984
|
transcribe/helpers/vadprocessor.py
CHANGED
|
@@ -1,276 +1,296 @@
|
|
| 1 |
-
import
|
| 2 |
-
import
|
|
|
|
|
|
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
import onnxruntime
|
| 5 |
-
from datetime import timedelta
|
| 6 |
-
from pydub import AudioSegment
|
| 7 |
-
from silero_vad import load_silero_vad, get_speech_timestamps, VADIterator
|
| 8 |
-
import os
|
| 9 |
-
import logging
|
| 10 |
-
|
| 11 |
-
class FixedVADIterator(VADIterator):
|
| 12 |
-
'''It fixes VADIterator by allowing to process any audio length, not only exactly 512 frames at once.
|
| 13 |
-
If audio to be processed at once is long and multiple voiced segments detected,
|
| 14 |
-
then __call__ returns the start of the first segment, and end (or middle, which means no end) of the last segment.
|
| 15 |
-
'''
|
| 16 |
|
| 17 |
-
|
| 18 |
-
super().reset_states()
|
| 19 |
-
self.buffer = np.array([],dtype=np.float32)
|
| 20 |
-
|
| 21 |
-
def __call__(self, x, return_seconds=False):
|
| 22 |
-
self.buffer = np.append(self.buffer, x)
|
| 23 |
-
ret = None
|
| 24 |
-
while len(self.buffer) >= 512:
|
| 25 |
-
r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
|
| 26 |
-
self.buffer = self.buffer[512:]
|
| 27 |
-
if ret is None:
|
| 28 |
-
ret = r
|
| 29 |
-
elif r is not None:
|
| 30 |
-
if 'end' in r:
|
| 31 |
-
ret['end'] = r['end'] # the latter end
|
| 32 |
-
if 'start' in r and 'end' in ret: # there is an earlier start.
|
| 33 |
-
# Remove end, merging this segment with the previous one.
|
| 34 |
-
del ret['end']
|
| 35 |
-
return ret if ret != {} else None
|
| 36 |
-
|
| 37 |
-
class SileroVADProcessor:
|
| 38 |
-
"""
|
| 39 |
-
A class for processing audio files using Silero VAD to detect voice activity
|
| 40 |
-
and extract voice segments from audio files.
|
| 41 |
-
"""
|
| 42 |
|
| 43 |
-
def __init__(self,
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
max_speech_duration=20,
|
| 48 |
-
min_silence_duration=250,
|
| 49 |
-
sample_rate=16000,
|
| 50 |
-
ort_providers=None):
|
| 51 |
-
"""
|
| 52 |
-
Initialize the SileroVADProcessor.
|
| 53 |
-
|
| 54 |
-
Args:
|
| 55 |
-
activate_threshold (float): Threshold for voice activity detection
|
| 56 |
-
fusion_threshold (float): Threshold for merging close speech segments (seconds)
|
| 57 |
-
min_speech_duration (float): Minimum duration of speech to be considered valid (seconds)
|
| 58 |
-
max_speech_duration (float): Maximum duration of speech (seconds)
|
| 59 |
-
min_silence_duration (int): Minimum silence duration (ms)
|
| 60 |
-
sample_rate (int): Sample rate of the audio (8000 or 16000 Hz)
|
| 61 |
-
ort_providers (list): ONNX Runtime providers for acceleration
|
| 62 |
-
"""
|
| 63 |
-
# VAD parameters
|
| 64 |
-
self.activate_threshold = activate_threshold
|
| 65 |
-
self.fusion_threshold = fusion_threshold
|
| 66 |
-
self.min_speech_duration = min_speech_duration
|
| 67 |
-
self.max_speech_duration = max_speech_duration
|
| 68 |
-
self.min_silence_duration = min_silence_duration
|
| 69 |
-
self.sample_rate = sample_rate
|
| 70 |
-
self.ort_providers = ort_providers if ort_providers else []
|
| 71 |
-
|
| 72 |
-
# Initialize logger
|
| 73 |
-
self.logger = logging.getLogger(__name__)
|
| 74 |
-
|
| 75 |
-
# Load Silero VAD model
|
| 76 |
-
self._init_onnx_session()
|
| 77 |
-
self.silero_vad = load_silero_vad(onnx=True)
|
| 78 |
-
|
| 79 |
-
def _init_onnx_session(self):
|
| 80 |
-
"""Initialize ONNX Runtime session with appropriate settings."""
|
| 81 |
-
session_opts = onnxruntime.SessionOptions()
|
| 82 |
-
session_opts.log_severity_level = 3
|
| 83 |
-
session_opts.inter_op_num_threads = 0
|
| 84 |
-
session_opts.intra_op_num_threads = 0
|
| 85 |
-
session_opts.enable_cpu_mem_arena = True
|
| 86 |
-
session_opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
|
| 87 |
-
session_opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
|
| 88 |
-
|
| 89 |
-
session_opts.add_session_config_entry("session.intra_op.allow_spinning", "1")
|
| 90 |
-
session_opts.add_session_config_entry("session.inter_op.allow_spinning", "1")
|
| 91 |
-
session_opts.add_session_config_entry("session.set_denormal_as_zero", "1")
|
| 92 |
-
|
| 93 |
-
# Set the session_opts to be used by silero_vad
|
| 94 |
-
# onnxruntime.capi._pybind_state.get_default_session_options(session_opts)
|
| 95 |
-
|
| 96 |
-
def load_audio(self, audio_path):
|
| 97 |
-
"""
|
| 98 |
-
Load audio file and prepare it for VAD processing.
|
| 99 |
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
| 102 |
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
"""
|
| 106 |
-
self.logger.info(f"Loading audio from {audio_path}")
|
| 107 |
-
audio_segment = AudioSegment.from_file(audio_path)
|
| 108 |
-
audio_segment = audio_segment.set_channels(1).set_frame_rate(self.sample_rate)
|
| 109 |
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
def model(self):
|
| 119 |
-
return self.silero_vad
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
|
|
|
| 124 |
|
| 125 |
-
|
| 126 |
-
timestamps (list): List of (start, end) tuples
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
if (end - start) >= self.min_speech_duration]
|
| 134 |
-
|
| 135 |
-
# Fuse timestamps in two passes for better merging
|
| 136 |
-
fused_timestamps_1st = []
|
| 137 |
-
for start, end in filtered_timestamps:
|
| 138 |
-
if fused_timestamps_1st and (start - fused_timestamps_1st[-1][1] <= self.fusion_threshold):
|
| 139 |
-
fused_timestamps_1st[-1] = (fused_timestamps_1st[-1][0], end)
|
| 140 |
-
else:
|
| 141 |
-
fused_timestamps_1st.append((start, end))
|
| 142 |
|
| 143 |
-
|
| 144 |
-
for start, end in fused_timestamps_1st:
|
| 145 |
-
if fused_timestamps_2nd and (start - fused_timestamps_2nd[-1][1] <= self.fusion_threshold):
|
| 146 |
-
fused_timestamps_2nd[-1] = (fused_timestamps_2nd[-1][0], end)
|
| 147 |
-
else:
|
| 148 |
-
fused_timestamps_2nd.append((start, end))
|
| 149 |
|
| 150 |
-
|
|
|
|
| 151 |
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
|
| 156 |
-
|
| 157 |
-
|
| 158 |
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
milliseconds = int((td_sec - total_seconds) * 1000)
|
| 166 |
-
hours = total_seconds // 3600
|
| 167 |
-
minutes = (total_seconds % 3600) // 60
|
| 168 |
-
seconds = total_seconds % 60
|
| 169 |
-
return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"
|
| 170 |
-
|
| 171 |
-
def detect_speech(self, audio:np.array):
|
| 172 |
-
"""
|
| 173 |
-
Run VAD on the audio file to detect speech segments.
|
| 174 |
|
| 175 |
-
|
| 176 |
-
|
| 177 |
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
model=self.silero_vad,
|
| 187 |
-
threshold=self.activate_threshold,
|
| 188 |
-
max_speech_duration_s=self.max_speech_duration,
|
| 189 |
-
min_speech_duration_ms=int(self.min_speech_duration * 1000),
|
| 190 |
-
min_silence_duration_ms=self.min_silence_duration,
|
| 191 |
-
return_seconds=True
|
| 192 |
-
)
|
| 193 |
-
|
| 194 |
-
# Convert to simple format and process
|
| 195 |
-
timestamps = [(item['start'], item['end']) for item in raw_timestamps]
|
| 196 |
-
processed_timestamps = self.process_timestamps(timestamps)
|
| 197 |
-
|
| 198 |
-
# Clean up
|
| 199 |
-
del audio
|
| 200 |
-
gc.collect()
|
| 201 |
-
|
| 202 |
-
self.logger.info(f"VAD completed in {time.time() - start_time:.3f} seconds")
|
| 203 |
-
return processed_timestamps
|
| 204 |
|
| 205 |
-
|
| 206 |
-
|
|
|
|
| 207 |
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
output_prefix (str): Prefix for output files
|
| 211 |
-
"""
|
| 212 |
-
# Save timestamps in seconds (VTT format)
|
| 213 |
-
seconds_path = f"{output_prefix}_timestamps_second.txt"
|
| 214 |
-
with open(seconds_path, "w", encoding='UTF-8') as file:
|
| 215 |
-
self.logger.info("Saving timestamps in seconds format")
|
| 216 |
-
for start, end in timestamps:
|
| 217 |
-
s_time = self.format_time(start)
|
| 218 |
-
e_time = self.format_time(end)
|
| 219 |
-
line = f"{s_time} --> {e_time}\n"
|
| 220 |
-
file.write(line)
|
| 221 |
-
|
| 222 |
-
# Save timestamps in sample indices
|
| 223 |
-
indices_path = f"{output_prefix}_timestamps_indices.txt"
|
| 224 |
-
with open(indices_path, "w", encoding='UTF-8') as file:
|
| 225 |
-
self.logger.info("Saving timestamps in indices format")
|
| 226 |
-
for start, end in timestamps:
|
| 227 |
-
line = f"{int(start * self.sample_rate)} --> {int(end * self.sample_rate)}\n"
|
| 228 |
-
file.write(line)
|
| 229 |
-
|
| 230 |
-
self.logger.info(f"Timestamps saved to {seconds_path} and {indices_path}")
|
| 231 |
-
|
| 232 |
-
def extract_speech_segments(self, audio_segment, timestamps):
|
| 233 |
-
"""
|
| 234 |
-
Extract speech segments from the audio and combine them into a single audio file.
|
| 235 |
|
| 236 |
-
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
|
|
|
|
|
|
|
|
|
| 244 |
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
# Convert seconds to milliseconds for pydub
|
| 248 |
-
start_ms = int(start * 1000)
|
| 249 |
-
end_ms = int(end * 1000)
|
| 250 |
|
| 251 |
-
# Ensure the end time does not exceed the length of the audio segment
|
| 252 |
-
if end_ms > len(audio_segment):
|
| 253 |
-
end_ms = len(audio_segment)
|
| 254 |
|
| 255 |
-
|
| 256 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
|
| 258 |
-
|
| 259 |
-
combined_speech = np.append(combined_speech, segment)
|
| 260 |
|
| 261 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
|
| 263 |
-
def
|
| 264 |
"""
|
| 265 |
-
|
|
|
|
| 266 |
|
| 267 |
-
|
| 268 |
-
|
| 269 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
|
| 271 |
-
|
| 272 |
-
|
| 273 |
|
| 274 |
-
combined_speech = self.extract_speech_segments(audio_array, timestamps)
|
| 275 |
|
| 276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from copy import deepcopy
|
| 2 |
+
from queue import Queue, Empty
|
| 3 |
+
from time import time
|
| 4 |
+
from config import VAD_MODEL_PATH
|
| 5 |
+
# from silero_vad import load_silero_vad
|
| 6 |
import numpy as np
|
| 7 |
import onnxruntime
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
class OnnxWrapper():
    """Thin wrapper around a VAD ONNX model (silero-style — see the commented
    ``from silero_vad import load_silero_vad`` import at the top of this file).

    Keeps the recurrent model state (``_state``), the trailing context samples
    (``_context``) and the last-seen sample rate / batch size between calls so
    the model can be driven chunk by chunk over a live stream.
    """

    def __init__(self, path, force_onnx_cpu=False):
        opts = onnxruntime.SessionOptions()
        # Single-threaded inference: each call processes one tiny window
        # (512 samples), so threading overhead would dominate.
        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
            self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'], sess_options=opts)
        else:
            self.session = onnxruntime.InferenceSession(path, sess_options=opts)

        self.reset_states()
        self.sample_rates = [16000]

    def _validate_input(self, x: np.ndarray, sr: int):
        """Normalize input to shape (batch, samples) at a supported rate.

        Mono 1-D input gets a batch axis; integer multiples of 16 kHz are
        decimated down to 16 kHz by striding. Raises ValueError for >2-D
        input, unsupported rates, or chunks shorter than 32 ms.
        """
        if x.ndim == 1:
            x = x[None]
        if x.ndim > 2:
            raise ValueError(f"Too many dimensions for input audio chunk {x.ndim}")

        if sr != 16000 and (sr % 16000 == 0):
            step = sr // 16000
            x = x[:, ::step]
            sr = 16000

        if sr not in self.sample_rates:
            raise ValueError(f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)")
        # sr / len > 31.25  <=>  len < sr / 31.25 (i.e. under 32 ms of audio).
        if sr / x.shape[1] > 31.25:
            raise ValueError("Input audio chunk is too short")

        return x, sr

    def reset_states(self, batch_size=1):
        """Clear recurrent state before a new stream / batch size."""
        self._state = np.zeros((2, batch_size, 128)).astype(np.float32)
        # Lazily re-shaped to (batch, context_size) on the first __call__.
        self._context = np.zeros(0)
        self._last_sr = 0
        self._last_batch_size = 0

    def __call__(self, x, sr: int):
        """Run one fixed-size window through the model; returns the raw
        model output (speech probabilities, one row per batch item).

        The window must be exactly 512 samples at 16 kHz (256 at 8 kHz).
        State is reset automatically whenever the sample rate or batch
        size changes between calls.
        """
        x, sr = self._validate_input(x, sr)
        num_samples = 512 if sr == 16000 else 256

        if x.shape[-1] != num_samples:
            raise ValueError(
                f"Provided number of samples is {x.shape[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)")

        batch_size = x.shape[0]
        context_size = 64 if sr == 16000 else 32

        if not self._last_batch_size:
            self.reset_states(batch_size)
        if (self._last_sr) and (self._last_sr != sr):
            self.reset_states(batch_size)
        if (self._last_batch_size) and (self._last_batch_size != batch_size):
            self.reset_states(batch_size)

        if not len(self._context):
            self._context = np.zeros((batch_size, context_size)).astype(np.float32)

        # Prepend the tail of the previous window so the model sees
        # continuous audio across call boundaries.
        x = np.concatenate([self._context, x], axis=1)
        if sr in [8000, 16000]:
            ort_inputs = {'input': x, 'state': self._state, 'sr': np.array(sr, dtype='int64')}
            ort_outs = self.session.run(None, ort_inputs)
            out, state = ort_outs
            self._state = state
        else:
            raise ValueError()

        # Remember the last context_size samples for the next call.
        self._context = x[..., -context_size:]
        self._last_sr = sr
        self._last_batch_size = batch_size

        # out = torch.from_numpy(out)
        return out

    def audio_forward(self, audio: np.ndarray, sr: int):
        """Run a whole utterance through the model window by window.

        Pads the tail with zeros to a multiple of the window size and
        returns the per-window outputs concatenated along axis 1.
        """
        outs = []
        x, sr = self._validate_input(audio, sr)
        self.reset_states()
        num_samples = 512 if sr == 16000 else 256

        if x.shape[1] % num_samples:
            pad_num = num_samples - (x.shape[1] % num_samples)
            x = np.pad(x, ((0, 0), (0, pad_num)), 'constant', constant_values=(0.0, 0.0))

        for i in range(0, x.shape[1], num_samples):
            wavs_batch = x[:, i:i + num_samples]
            out_chunk = self.__call__(wavs_batch, sr)
            outs.append(out_chunk)

        stacked = np.concatenate(outs, axis=1)
        return stacked
|
|
|
|
|
|
|
|
|
|
| 103 |
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
+
class VADIteratorOnnx:
    """Streaming voice-activity iterator over fixed-size windows.

    Feed consecutive audio windows through ``__call__``; it returns
    ``{'start': sample}`` when speech begins, ``{'end': sample}`` when speech
    ends (after ``min_silence_duration_ms`` of silence, or when the segment
    exceeds ``max_speech_duration_s``), and ``None`` otherwise.
    """

    def __init__(self,
                 threshold: float = 0.5,
                 sampling_rate: int = 16000,
                 min_silence_duration_ms: int = 100,
                 max_speech_duration_s: float = float('inf'),
                 ):
        self.model = OnnxWrapper(VAD_MODEL_PATH, True)
        self.threshold = threshold
        self.sampling_rate = sampling_rate

        if sampling_rate not in [8000, 16000]:
            raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')

        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
        self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
        # self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
        self.reset_states()

    def reset_states(self):
        """Clear the model state and all detection bookkeeping."""
        self.model.reset_states()
        self.triggered = False
        self.temp_end = 0
        self.current_sample = 0
        self.start = 0

    def __call__(self, x: np.ndarray, return_seconds=False):
        """
        x: np.ndarray
            audio chunk (see examples in repo)

        return_seconds: bool (default - False)
            whether return timestamps in seconds (default - samples)
        """

        window_size_samples = 512 if self.sampling_rate == 16000 else 256
        x = x[:window_size_samples]
        if len(x) < window_size_samples:
            # BUG FIX: x is 1-D at this point, so the pad width must be 1-D
            # too. The previous 2-D spec ((0, 0), (0, n)) raised ValueError
            # for every short trailing chunk.
            x = np.pad(x, (0, window_size_samples - len(x)), 'constant', constant_values=0.0)

        self.current_sample += window_size_samples

        speech_prob = self.model(x, self.sampling_rate)[0, 0]
        # print(f"{self.current_sample/self.sampling_rate:.2f}: {speech_prob}")

        # Speech resumed before the silence run was long enough: cancel the
        # tentative end mark.
        if (speech_prob >= self.threshold) and self.temp_end:
            self.temp_end = 0

        if (speech_prob >= self.threshold) and not self.triggered:
            self.triggered = True
            speech_start = max(0, self.current_sample - window_size_samples)
            self.start = speech_start
            return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}

        # Segment grew past the configured maximum: force an end and start a
        # new segment from here (triggered stays True).
        if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
            if self.temp_end:
                self.temp_end = 0
            self.start = self.current_sample
            return {'end': int(self.current_sample) if not return_seconds else round(self.current_sample / self.sampling_rate, 1)}

        # Hysteresis: only treat it as silence once the probability drops
        # clearly below the activation threshold.
        if (speech_prob < self.threshold - 0.15) and self.triggered:
            if not self.temp_end:
                self.temp_end = self.current_sample
            if self.current_sample - self.temp_end < self.min_silence_samples:
                return None
            else:
                speech_end = self.temp_end - window_size_samples
                self.temp_end = 0
                self.triggered = False
                return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}

        return None
|
| 178 |
+
|
| 179 |
|
|
|
|
| 180 |
|
| 181 |
+
class VadV2:
    """Streaming VAD with sample buffering and segment padding.

    Wraps ``VADIteratorOnnx`` and keeps a rolling ``audio_buffer`` of raw
    samples so that, when a complete speech segment is detected, the segment
    audio (plus ``speech_pad_ms`` of leading/trailing pad) can be returned
    directly.

    ``offset`` is the absolute sample index of ``audio_buffer[0]`` within the
    whole stream; ``start``/``end`` are absolute sample indices reported by
    the iterator. Calling the instance with ``x=None`` flushes any
    in-progress segment.
    """

    def __init__(self,
                 threshold: float = 0.5,
                 sampling_rate: int = 16000,
                 min_silence_duration_ms: int = 100,
                 speech_pad_ms: int = 30,
                 max_speech_duration_s: float = float('inf')):
        # self.vad_iterator = VADIterator(threshold, sampling_rate, min_silence_duration_ms)
        self.vad_iterator = VADIteratorOnnx(threshold, sampling_rate, min_silence_duration_ms, max_speech_duration_s)
        self.speech_pad_samples = int(sampling_rate * speech_pad_ms / 1000)
        self.sampling_rate = sampling_rate
        self.audio_buffer = np.array([], dtype=np.float32)
        self.start = 0
        self.end = 0
        self.offset = 0
        # The pad must fit inside the guaranteed silence gap, otherwise one
        # segment's trailing pad could overlap the next segment's audio.
        assert speech_pad_ms <= min_silence_duration_ms, "speech_pad_ms should be less than min_silence_duration_ms"
        self.max_speech_samples = int(sampling_rate * max_speech_duration_s)

        # Consecutive silent chunks counter; the threshold equals the number
        # of 512-sample chunks in 60 seconds, i.e. the buffer is trimmed
        # after ~60 s of uninterrupted silence to bound memory.
        self.silence_chunk_size = 0
        self.silence_chunk_threshold = 60 / (512 / self.sampling_rate)

    def reset(self):
        """Drop all buffered audio and detection state."""
        self.audio_buffer = np.array([], dtype=np.float32)
        self.start = 0
        self.end = 0
        self.offset = 0
        self.vad_iterator.reset_states()

    def __call__(self, x: np.ndarray = None):
        """Feed one audio chunk; return a finished speech segment or None.

        Each call is expected to carry one iterator window (512 samples at
        16 kHz — assumption based on silence_chunk_threshold; TODO confirm
        callers always pass that size). ``None`` flushes the currently open
        segment (if any) and resets the detector.

        A returned segment is ``{'start': sec, 'end': sec, 'audio': ndarray}``.
        """
        if x is None:
            # Flush: emit whatever is buffered for the open segment.
            if self.start:
                start = max(self.offset, self.start - self.speech_pad_samples)
                end = self.offset + len(self.audio_buffer)
                start_ts = round(start / self.sampling_rate, 1)
                end_ts = round(end / self.sampling_rate, 1)
                audio_data = self.audio_buffer[start - self.offset: end - self.offset]
                result = {
                    "start": start_ts,
                    "end": end_ts,
                    "audio": audio_data,
                }
            else:
                result = None
            self.reset()
            return result

        self.audio_buffer = np.append(self.audio_buffer, deepcopy(x))

        result = self.vad_iterator(x)
        if result is not None:
            # self.start = result.get('start', self.start)
            # self.end = result.get('end', self.end)
            self.silence_chunk_size = 0

            if 'start' in result:
                self.start = result['start']
            if 'end' in result:
                self.end = result['end']
        else:
            self.silence_chunk_size += 1

        # While no speech has started, keep only the last speech_pad_samples
        # so a future segment still gets its full leading pad.
        if self.start == 0 and len(self.audio_buffer) > self.speech_pad_samples:
            self.offset += len(self.audio_buffer) - self.speech_pad_samples
            self.audio_buffer = self.audio_buffer[-self.speech_pad_samples:]

        # Long-running silence (~60 s of silent chunks): trim the buffer the
        # same way to bound memory.
        if self.silence_chunk_size >= self.silence_chunk_threshold:
            self.offset += len(self.audio_buffer) - self.speech_pad_samples
            self.audio_buffer = self.audio_buffer[-self.speech_pad_samples:]
            self.silence_chunk_size = 0

        # A complete [start, end] segment is available: slice it out with pad
        # on both sides (numpy slicing clamps an end past the buffer), drop
        # the consumed samples, and report it.
        if self.end > self.start:
            start = max(self.offset, self.start - self.speech_pad_samples)
            end = self.end + self.speech_pad_samples
            start_ts = round(start / self.sampling_rate, 1)
            end_ts = round(end / self.sampling_rate, 1)
            audio_data = self.audio_buffer[start - self.offset: end - self.offset]
            self.audio_buffer = self.audio_buffer[self.end - self.offset:]
            self.offset = self.end
            self.start = self.end
            # self.start = 0
            self.end = 0
            result = {
                "start": start_ts,
                "end": end_ts,
                "audio": audio_data,
            }

            return result
        return None
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
class VadProcessor:
    """Batch front-end for VadV2.

    Slices an audio buffer into 512-sample windows, feeds them through the
    streaming detector and glues the detected speech audio back together.
    """

    def __init__(
            self,
            prob_threshold=0.5,
            silence_s=0.3,
            cache_s=0.25,
            sr=16000
    ):
        self.prob_thres = prob_threshold
        self.cache_s = cache_s
        self.sr = sr
        self.silence_s = silence_s

        # VadV2 takes its silence / pad durations in milliseconds.
        self.vad = VadV2(self.prob_thres, self.sr, self.silence_s * 1000, self.cache_s * 1000, max_speech_duration_s=15)

    def process_audio(self, audio_buffer: np.ndarray):
        """Return the concatenation of all speech samples found in audio_buffer."""
        window = 512
        # Seed with an empty float32 array so an all-silence buffer yields
        # an empty float32 result.
        pieces = [np.array([], np.float32)]
        for pos in range(0, len(audio_buffer), window):
            detection = self.vad(audio_buffer[pos:pos + window])
            if detection:
                pieces.append(detection['audio'])
        return np.concatenate(pieces)
|
transcribe/helpers/whisper.py
CHANGED
|
@@ -9,10 +9,14 @@ logger = getLogger(__name__)
|
|
| 9 |
|
| 10 |
class WhisperCPP:
|
| 11 |
|
| 12 |
-
def __init__(self, warmup=True) -> None:
|
| 13 |
models_dir = config.MODEL_DIR.as_posix()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
self.model = Model(
|
| 15 |
-
model=
|
| 16 |
models_dir=models_dir,
|
| 17 |
print_realtime=False,
|
| 18 |
print_progress=False,
|
|
@@ -47,9 +51,9 @@ class WhisperCPP:
|
|
| 47 |
audio_buffer,
|
| 48 |
initial_prompt=prompt,
|
| 49 |
language=language,
|
| 50 |
-
token_timestamps=True,
|
| 51 |
# split_on_word=True,
|
| 52 |
-
max_len=max_len
|
| 53 |
)
|
| 54 |
return output
|
| 55 |
except Exception as e:
|
|
|
|
| 9 |
|
| 10 |
class WhisperCPP:
|
| 11 |
|
| 12 |
+
def __init__(self, source_lange: str='en', warmup=True) -> None:
|
| 13 |
models_dir = config.MODEL_DIR.as_posix()
|
| 14 |
+
if source_lange == "zh":
|
| 15 |
+
whisper_model = config.WHISPER_MODEL_ZH
|
| 16 |
+
else:
|
| 17 |
+
whisper_model = config.WHISPER_MODEL_EN
|
| 18 |
self.model = Model(
|
| 19 |
+
model=whisper_model,
|
| 20 |
models_dir=models_dir,
|
| 21 |
print_realtime=False,
|
| 22 |
print_progress=False,
|
|
|
|
| 51 |
audio_buffer,
|
| 52 |
initial_prompt=prompt,
|
| 53 |
language=language,
|
| 54 |
+
# token_timestamps=True,
|
| 55 |
# split_on_word=True,
|
| 56 |
+
# max_len=max_len
|
| 57 |
)
|
| 58 |
return output
|
| 59 |
except Exception as e:
|
transcribe/pipelines/__init__.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
|
| 2 |
from .pipe_translate import TranslatePipe, Translate7BPipe
|
| 3 |
-
from .pipe_whisper import WhisperPipe
|
| 4 |
from .pipe_vad import VadPipe
|
| 5 |
from .base import MetaItem
|
|
|
|
| 1 |
|
| 2 |
from .pipe_translate import TranslatePipe, Translate7BPipe
|
| 3 |
+
from .pipe_whisper import WhisperPipe, WhisperChinese
|
| 4 |
from .pipe_vad import VadPipe
|
| 5 |
from .base import MetaItem
|
transcribe/pipelines/pipe_translate.py
CHANGED
|
@@ -35,3 +35,6 @@ class Translate7BPipe(TranslatePipe):
|
|
| 35 |
if cls.translator is None:
|
| 36 |
cls.translator = QwenTranslator(LLM_LARGE_MODEL_PATH, LLM_SYS_PROMPT_EN, LLM_SYS_PROMPT_ZH)
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
if cls.translator is None:
|
| 36 |
cls.translator = QwenTranslator(LLM_LARGE_MODEL_PATH, LLM_SYS_PROMPT_EN, LLM_SYS_PROMPT_ZH)
|
| 37 |
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
|
transcribe/pipelines/pipe_vad.py
CHANGED
|
@@ -1,99 +1,41 @@
|
|
| 1 |
|
| 2 |
from .base import MetaItem, BasePipe
|
| 3 |
-
from ..helpers.vadprocessor import
|
| 4 |
import numpy as np
|
| 5 |
from silero_vad import get_speech_timestamps
|
| 6 |
-
import torch
|
| 7 |
from typing import List
|
| 8 |
-
|
| 9 |
|
| 10 |
-
|
| 11 |
-
chunks = []
|
| 12 |
-
silent_samples = int(0.3 * sample_rate) # 300ms 的静音样本数
|
| 13 |
-
silence = torch.zeros(silent_samples) # 创建300ms的静音
|
| 14 |
-
|
| 15 |
-
for i in range(len(tss)):
|
| 16 |
-
# 先添加当前语音片段
|
| 17 |
-
chunks.append(wav[tss[i]['start']: tss[i]['end']])
|
| 18 |
-
|
| 19 |
-
# 如果不是最后一个片段,且与下一个片段间隔大于100ms,则添加静音
|
| 20 |
-
if i < len(tss) - 1:
|
| 21 |
-
gap = tss[i+1]['start'] - tss[i]['end']
|
| 22 |
-
if gap > 0.1 * sample_rate: # 判断间隔是否大于100ms
|
| 23 |
-
chunks.append(silence) # 添加300ms静音
|
| 24 |
-
|
| 25 |
-
return torch.cat(chunks)
|
| 26 |
|
| 27 |
-
def collect_chunks_improved(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
|
| 28 |
-
chunks = []
|
| 29 |
-
silent_samples = int(0.3 * sample_rate) # 300ms 的静音样本数
|
| 30 |
-
silence = torch.zeros(silent_samples) # 创建300ms的静音
|
| 31 |
-
min_gap_samples = int(0.1 * sample_rate) # 最小间隔阈值 (100ms)
|
| 32 |
-
|
| 33 |
-
# 对时间戳进行简单的平滑处理
|
| 34 |
-
smoothed_tss = []
|
| 35 |
-
for i, ts in enumerate(tss):
|
| 36 |
-
if i > 0 and ts['start'] - tss[i-1]['end'] < 0.02 * sample_rate: # 如果间隔小于20ms,认为是连续的
|
| 37 |
-
smoothed_tss[-1]['end'] = ts['end'] # 合并到前一个片段
|
| 38 |
-
else:
|
| 39 |
-
smoothed_tss.append(ts)
|
| 40 |
-
|
| 41 |
-
for i in range(len(smoothed_tss)):
|
| 42 |
-
# 添加当前语音片段
|
| 43 |
-
chunks.append(wav[smoothed_tss[i]['start']: smoothed_tss[i]['end']])
|
| 44 |
-
|
| 45 |
-
# 如果不是最后一个片段,且与下一个片段间隔大于阈值,则添加静音
|
| 46 |
-
if i < len(smoothed_tss) - 1:
|
| 47 |
-
gap = smoothed_tss[i+1]['start'] - smoothed_tss[i]['end']
|
| 48 |
-
if gap > min_gap_samples:
|
| 49 |
-
# 根据间隔大小动态调整静音长度,但最大不超过300ms
|
| 50 |
-
silence_length = min(gap // 2, silent_samples)
|
| 51 |
-
chunks.append(torch.zeros(silence_length))
|
| 52 |
-
|
| 53 |
-
return torch.cat(chunks)
|
| 54 |
|
| 55 |
class VadPipe(BasePipe):
|
| 56 |
-
|
| 57 |
sample_rate = 16000
|
| 58 |
window_size_samples = 512
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
|
| 61 |
@classmethod
|
| 62 |
def init(cls):
|
| 63 |
-
if cls.
|
| 64 |
-
cls.
|
| 65 |
-
activate_threshold=0.45, # 降低以捕获更多音频
|
| 66 |
-
fusion_threshold=0.45, # 提高以更好地融合语音片段
|
| 67 |
-
min_speech_duration=0.2, # 略微降低以捕获短音节
|
| 68 |
-
max_speech_duration=20, # 保持不变
|
| 69 |
-
min_silence_duration=300, # 增加到300毫秒,允许说话间的自然停顿
|
| 70 |
-
sample_rate=cls.sample_rate # 采样率,音频信号的采样频率
|
| 71 |
-
)
|
| 72 |
-
cls.vac = FixedVADIterator(cls.model.silero_vad, sampling_rate=cls.sample_rate,)
|
| 73 |
-
cls.vac.reset_states()
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
def get_previous_buffer(self):
|
| 77 |
-
if len(self.previous_buffer) == 2:
|
| 78 |
-
return self.previous_buffer[-1]
|
| 79 |
-
return np.array([], dtype=np.float32)
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
# def reduce_noise(self, data):
|
| 83 |
# return nr.reduce_noise(y=data, sr=self.sample_rate)
|
| 84 |
-
|
| 85 |
|
| 86 |
-
|
| 87 |
-
source_audio = in_data.source_audio
|
| 88 |
-
source_audio = np.frombuffer(source_audio, dtype=np.float32)
|
| 89 |
-
# source_audio = self.reduce_noise(source_audio)
|
| 90 |
-
send_audio = b""
|
| 91 |
-
speech_timestamps = get_speech_timestamps(torch.Tensor(source_audio), self.model.silero_vad, sampling_rate=16000)
|
| 92 |
-
|
| 93 |
-
if speech_timestamps:
|
| 94 |
-
send_audio = collect_chunks_improved(speech_timestamps, torch.Tensor(source_audio))
|
| 95 |
-
send_audio = send_audio.numpy()
|
| 96 |
-
in_data.audio = send_audio
|
| 97 |
-
# send_audio = self.reduce_noise(send_audio).tobytes()
|
| 98 |
-
in_data.source_audio = b""
|
| 99 |
-
return in_data
|
|
|
|
| 1 |
|
| 2 |
from .base import MetaItem, BasePipe
|
| 3 |
+
from ..helpers.vadprocessor import VadV2
|
| 4 |
import numpy as np
|
| 5 |
from silero_vad import get_speech_timestamps
|
|
|
|
| 6 |
from typing import List
|
| 7 |
+
import logging
|
| 8 |
|
| 9 |
+
# import noisereduce as nr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
class VadPipe(BasePipe):
    """Pipeline stage that runs the streaming VAD over incoming audio chunks."""
    vac = None
    sample_rate = 16000
    window_size_samples = 512
    chunk_size = 512
    # BUG FIX: these were previously written with trailing commas, which made
    # them 1-tuples ((0.5,), (0.5,), (0.25,)) — VadV2 then received tuples
    # instead of numbers for its threshold/duration arguments.
    prob_threshold = 0.5
    silence_s = 0.5
    cache_s = 0.25

    @classmethod
    def init(cls):
        # Lazily build one shared detector per process.
        if cls.vac is None:
            cls.vac = VadV2(cls.prob_threshold, cls.sample_rate, cls.silence_s * 1000, cls.cache_s * 1000, max_speech_duration_s=15)

    def process(self, in_data: MetaItem) -> MetaItem:
        """Run VAD on the raw source audio; put detected speech in in_data.audio.

        BUG FIX: np.frombuffer defaults to float64, but the stream carries raw
        float32 PCM (downstream code decodes it with dtype=np.float32), so the
        dtype must be given explicitly.
        """
        audio_buffer = np.frombuffer(in_data.source_audio, dtype=np.float32)
        vad_audio = self.vac(audio_buffer)
        if vad_audio:
            in_data.audio = vad_audio['audio']
        else:
            # No finished segment yet: pass empty audio downstream.
            in_data.audio = b""
        return in_data

    # def reduce_noise(self, data):
    #     return nr.reduce_noise(y=data, sr=self.sample_rate)
|
|
|
|
| 40 |
|
| 41 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
transcribe/pipelines/pipe_whisper.py
CHANGED
|
@@ -7,16 +7,18 @@ class WhisperPipe(BasePipe):
|
|
| 7 |
whisper = None
|
| 8 |
|
| 9 |
|
|
|
|
| 10 |
@classmethod
|
| 11 |
def init(cls):
|
| 12 |
if cls.whisper is None:
|
|
|
|
| 13 |
cls.whisper = WhisperCPP()
|
| 14 |
-
|
| 15 |
|
| 16 |
def process(self, in_data: MetaItem) -> MetaItem:
|
| 17 |
audio_data = in_data.audio
|
| 18 |
source_language = in_data.source_language
|
| 19 |
-
segments = self.whisper.transcribe(audio_data, source_language)
|
| 20 |
texts = "".join([s.text for s in segments])
|
| 21 |
in_data.segments = [Segment(t0=s.t0, t1=s.t1, text=self.filter_chinese_printable(s.text)) for s in segments]
|
| 22 |
in_data.transcribe_content = texts
|
|
@@ -30,3 +32,11 @@ class WhisperPipe(BasePipe):
|
|
| 30 |
if unicodedata.category(char) != 'Cc': # 不可打印字符的分类为 'Cc'
|
| 31 |
printable.append(char)
|
| 32 |
return ''.join(printable).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
whisper = None
|
| 8 |
|
| 9 |
|
| 10 |
+
|
| 11 |
@classmethod
|
| 12 |
def init(cls):
|
| 13 |
if cls.whisper is None:
|
| 14 |
+
# cls.zh_whisper = WhisperCPP(source_lange='zh')
|
| 15 |
cls.whisper = WhisperCPP()
|
| 16 |
+
|
| 17 |
|
| 18 |
def process(self, in_data: MetaItem) -> MetaItem:
|
| 19 |
audio_data = in_data.audio
|
| 20 |
source_language = in_data.source_language
|
| 21 |
+
segments = self.whisper.transcribe(audio_data, source_language)
|
| 22 |
texts = "".join([s.text for s in segments])
|
| 23 |
in_data.segments = [Segment(t0=s.t0, t1=s.t1, text=self.filter_chinese_printable(s.text)) for s in segments]
|
| 24 |
in_data.transcribe_content = texts
|
|
|
|
| 32 |
if unicodedata.category(char) != 'Cc': # 不可打印字符的分类为 'Cc'
|
| 33 |
printable.append(char)
|
| 34 |
return ''.join(printable).strip()
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class WhisperChinese(WhisperPipe):
    """WhisperPipe variant that loads the Chinese fine-tuned whisper model."""
    @classmethod
    def init(cls):
        if cls.whisper is None:
            # `source_lange` (sic) matches the actual parameter name of
            # WhisperCPP.__init__ — NOTE(review): rename to source_language
            # in both places in a coordinated change.
            cls.whisper = WhisperCPP(source_lange='zh')
|
transcribe/strategy.py
CHANGED
|
@@ -111,7 +111,7 @@ class TranscriptChunk:
|
|
| 111 |
return 0
|
| 112 |
|
| 113 |
score = self._calculate_similarity(self.join(), chunk.join())
|
| 114 |
-
logger.debug(f"Compare: {self.join()} vs {chunk.join()} : {score}")
|
| 115 |
return score
|
| 116 |
|
| 117 |
def only_punctuation(self)->bool:
|
|
|
|
| 111 |
return 0
|
| 112 |
|
| 113 |
score = self._calculate_similarity(self.join(), chunk.join())
|
| 114 |
+
# logger.debug(f"Compare: {self.join()} vs {chunk.join()} : {score}")
|
| 115 |
return score
|
| 116 |
|
| 117 |
def only_punctuation(self)->bool:
|
transcribe/translatepipes.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from transcribe.pipelines import WhisperPipe, TranslatePipe, MetaItem,
|
| 2 |
import multiprocessing as mp
|
| 3 |
import config
|
| 4 |
|
|
@@ -11,14 +11,18 @@ class TranslatePipes:
|
|
| 11 |
# self.result_queue = mp.Queue()
|
| 12 |
|
| 13 |
# whisper 转录
|
| 14 |
-
self.
|
|
|
|
| 15 |
|
| 16 |
# llm 翻译
|
| 17 |
-
self._translate_pipe = self._launch_process(TranslatePipe())
|
| 18 |
|
| 19 |
self._translate_7b_pipe = self._launch_process(Translate7BPipe())
|
| 20 |
# vad
|
| 21 |
-
self._vad_pipe = self._launch_process(VadPipe())
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
def _launch_process(self, process_obj):
|
| 24 |
process_obj.daemon = True
|
|
@@ -26,9 +30,10 @@ class TranslatePipes:
|
|
| 26 |
return process_obj
|
| 27 |
|
| 28 |
def wait_ready(self):
|
| 29 |
-
self.
|
| 30 |
-
self.
|
| 31 |
-
self.
|
|
|
|
| 32 |
self._translate_7b_pipe.wait()
|
| 33 |
|
| 34 |
def translate(self, text, src_lang, dst_lang) -> MetaItem:
|
|
@@ -45,14 +50,20 @@ class TranslatePipes:
|
|
| 45 |
transcribe_content=text,
|
| 46 |
source_language=src_lang,
|
| 47 |
destination_language=dst_lang)
|
| 48 |
-
self.
|
| 49 |
-
return self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
def transcrible(self, audio_buffer:bytes, src_lang: str) -> MetaItem:
|
|
|
|
| 53 |
item = MetaItem(audio=audio_buffer, source_language=src_lang)
|
| 54 |
-
|
| 55 |
-
return
|
| 56 |
|
| 57 |
def voice_detect(self, audio_buffer:bytes) -> MetaItem:
|
| 58 |
item = MetaItem(source_audio=audio_buffer)
|
|
|
|
| 1 |
+
from transcribe.pipelines import WhisperPipe, TranslatePipe, MetaItem, WhisperChinese, Translate7BPipe
|
| 2 |
import multiprocessing as mp
|
| 3 |
import config
|
| 4 |
|
|
|
|
| 11 |
# self.result_queue = mp.Queue()
|
| 12 |
|
| 13 |
# whisper 转录
|
| 14 |
+
self._whisper_pipe_en = self._launch_process(WhisperPipe())
|
| 15 |
+
self._whisper_pipe_zh = self._launch_process(WhisperChinese())
|
| 16 |
|
| 17 |
# llm 翻译
|
| 18 |
+
# self._translate_pipe = self._launch_process(TranslatePipe())
|
| 19 |
|
| 20 |
self._translate_7b_pipe = self._launch_process(Translate7BPipe())
|
| 21 |
# vad
|
| 22 |
+
# self._vad_pipe = self._launch_process(VadPipe())
|
| 23 |
+
|
| 24 |
+
# def reset(self):
|
| 25 |
+
# self._vad_pipe.reset()
|
| 26 |
|
| 27 |
def _launch_process(self, process_obj):
|
| 28 |
process_obj.daemon = True
|
|
|
|
| 30 |
return process_obj
|
| 31 |
|
| 32 |
def wait_ready(self):
|
| 33 |
+
self._whisper_pipe_zh.wait()
|
| 34 |
+
self._whisper_pipe_en.wait()
|
| 35 |
+
# self._translate_pipe.wait()
|
| 36 |
+
# self._vad_pipe.wait()
|
| 37 |
self._translate_7b_pipe.wait()
|
| 38 |
|
| 39 |
def translate(self, text, src_lang, dst_lang) -> MetaItem:
|
|
|
|
| 50 |
transcribe_content=text,
|
| 51 |
source_language=src_lang,
|
| 52 |
destination_language=dst_lang)
|
| 53 |
+
self._translate_7b_pipe.input_queue.put(item)
|
| 54 |
+
return self._translate_7b_pipe.output_queue.get()
|
| 55 |
+
|
| 56 |
+
def get_whisper_model(self, lang:str='en'):
|
| 57 |
+
if lang == 'zh':
|
| 58 |
+
return self._whisper_pipe_zh
|
| 59 |
+
return self._whisper_pipe_en
|
| 60 |
|
| 61 |
|
| 62 |
def transcrible(self, audio_buffer:bytes, src_lang: str) -> MetaItem:
|
| 63 |
+
whisper_model = self.get_whisper_model(src_lang)
|
| 64 |
item = MetaItem(audio=audio_buffer, source_language=src_lang)
|
| 65 |
+
whisper_model.input_queue.put(item)
|
| 66 |
+
return whisper_model.output_queue.get()
|
| 67 |
|
| 68 |
def voice_detect(self, audio_buffer:bytes) -> MetaItem:
|
| 69 |
item = MetaItem(source_audio=audio_buffer)
|
transcribe/whisper_llm_serve.py
CHANGED
|
@@ -8,44 +8,48 @@ from typing import List, Optional, Iterator, Tuple, Any
|
|
| 8 |
import asyncio
|
| 9 |
import numpy as np
|
| 10 |
import config
|
| 11 |
-
|
| 12 |
from api_model import TransResult, Message, DebugResult
|
| 13 |
-
|
| 14 |
from .utils import log_block, save_to_wave, TestDataWriter
|
| 15 |
from .translatepipes import TranslatePipes
|
| 16 |
from .strategy import (
|
| 17 |
TranscriptStabilityAnalyzer, TranscriptToken)
|
| 18 |
-
import
|
|
|
|
| 19 |
|
| 20 |
logger = getLogger("TranscriptionService")
|
| 21 |
|
| 22 |
|
| 23 |
-
class WhisperTranscriptionService
|
| 24 |
"""
|
| 25 |
Whisper语音转录服务类,处理音频流转录和翻译
|
| 26 |
"""
|
| 27 |
|
|
|
|
|
|
|
|
|
|
| 28 |
def __init__(self, websocket, pipe: TranslatePipes, language=None, dst_lang=None, client_uid=None):
|
| 29 |
-
|
| 30 |
self.source_language = language # 源语言
|
| 31 |
self.target_language = dst_lang # 目标翻译语言
|
| 32 |
-
|
| 33 |
# 转录结果稳定性管理
|
| 34 |
-
|
| 35 |
self._translate_pipe = pipe
|
| 36 |
|
| 37 |
# 音频处理相关
|
| 38 |
self.sample_rate = 16000
|
| 39 |
-
|
| 40 |
self.lock = threading.Lock()
|
| 41 |
self._frame_queue = queue.Queue()
|
| 42 |
-
|
| 43 |
|
| 44 |
# 文本分隔符,根据语言设置
|
| 45 |
self.text_separator = self._get_text_separator(language)
|
| 46 |
self.loop = asyncio.get_event_loop()
|
| 47 |
# 发送就绪状态
|
| 48 |
-
|
| 49 |
self._transcrible_analysis = None
|
| 50 |
# 启动处理线程
|
| 51 |
self._translate_thread_stop = threading.Event()
|
|
@@ -53,7 +57,8 @@ class WhisperTranscriptionService(ServeClientBase):
|
|
| 53 |
|
| 54 |
self.translate_thread = self._start_thread(self._transcription_processing_loop)
|
| 55 |
self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
|
| 56 |
-
|
|
|
|
| 57 |
# for test
|
| 58 |
self._transcrible_time_cost = 0.
|
| 59 |
self._translate_time_cost = 0.
|
|
@@ -82,9 +87,9 @@ class WhisperTranscriptionService(ServeClientBase):
|
|
| 82 |
"""根据语言返回适当的文本分隔符"""
|
| 83 |
return "" if language == "zh" else " "
|
| 84 |
|
| 85 |
-
def send_ready_state(self) -> None:
|
| 86 |
"""发送服务就绪状态消息"""
|
| 87 |
-
self.websocket.send(json.dumps({
|
| 88 |
"uid": self.client_uid,
|
| 89 |
"message": self.SERVER_READY,
|
| 90 |
"backend": "whisper_transcription"
|
|
@@ -94,10 +99,10 @@ class WhisperTranscriptionService(ServeClientBase):
|
|
| 94 |
"""设置源语言和目标语言"""
|
| 95 |
self.source_language = source_lang
|
| 96 |
self.target_language = target_lang
|
| 97 |
-
self.text_separator = self._get_text_separator(source_lang)
|
| 98 |
-
self._transcrible_analysis = TranscriptStabilityAnalyzer(self.source_language, self.text_separator)
|
| 99 |
|
| 100 |
-
def
|
| 101 |
"""添加音频帧到处理队列"""
|
| 102 |
self._frame_queue.put(frame_np)
|
| 103 |
|
|
@@ -105,68 +110,21 @@ class WhisperTranscriptionService(ServeClientBase):
|
|
| 105 |
"""从队列获取音频帧并合并到缓冲区"""
|
| 106 |
while not self._frame_processing_thread_stop.is_set():
|
| 107 |
try:
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
| 116 |
except queue.Empty:
|
| 117 |
pass
|
| 118 |
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
with self.lock:
|
| 122 |
-
if self.frames_np is not None:
|
| 123 |
-
# self._c+= 1
|
| 124 |
-
frame = self.frames_np.copy()
|
| 125 |
-
processed_audio = self._translate_pipe.voice_detect(frame.tobytes())
|
| 126 |
-
self.frames_np = np.frombuffer(processed_audio.audio, dtype=np.float32).copy()
|
| 127 |
-
return self.frames_np.copy()
|
| 128 |
-
# if len(frame) > self.sample_rate:
|
| 129 |
-
# save_to_wave(f"{self._c}-org.wav", frame)
|
| 130 |
-
# save_to_wave(f"{self._c}-vad.wav", self.frames_np)
|
| 131 |
-
|
| 132 |
-
def _update_audio_buffer(self, offset: int) -> None:
|
| 133 |
-
"""从音频缓冲区中移除已处理的部分"""
|
| 134 |
-
with self.lock:
|
| 135 |
-
if self.frames_np is not None and offset > 0:
|
| 136 |
-
# self._c += 1
|
| 137 |
-
# before = self.frames_np.copy()
|
| 138 |
-
self.frames_np = self.frames_np[offset:]
|
| 139 |
-
# after = self.frames_np.copy()
|
| 140 |
-
# save_to_wave(f"./tests/{self._c}_before_cut_{offset}.wav", before)
|
| 141 |
-
# save_to_wave(f"./tests/{self._c}_cut.wav", before[:offset])
|
| 142 |
-
# save_to_wave(f"./tests/{self._c}_after_cut.wav", after)
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
def _get_audio_for_processing(self) -> Optional[np.ndarray]:
|
| 146 |
-
"""准备用于处理的音频块"""
|
| 147 |
-
# 应用VAD处理
|
| 148 |
-
frame_np = self._apply_voice_activity_detection()
|
| 149 |
-
# frame_np = self.frames_np.copy()
|
| 150 |
-
# 没有音频帧
|
| 151 |
-
if frame_np is None:
|
| 152 |
-
return None
|
| 153 |
-
|
| 154 |
-
frames = frame_np.copy()
|
| 155 |
-
|
| 156 |
-
# 音频过短时的处理
|
| 157 |
-
if len(frames) <= 10:
|
| 158 |
-
# 极短音频段,清空并返回None
|
| 159 |
-
# self._update_audio_buffer(len(frames))
|
| 160 |
-
return None
|
| 161 |
-
if len(frames) < self.sample_rate:
|
| 162 |
-
# 不足一秒的音频,补充静音
|
| 163 |
-
silence_audio = np.zeros((self.sample_rate + 1000,), dtype=np.float32)
|
| 164 |
-
silence_audio[-len(frames):] = frames
|
| 165 |
-
return silence_audio.copy()
|
| 166 |
-
|
| 167 |
-
return frames
|
| 168 |
-
|
| 169 |
-
def _transcribe_audio(self, audio_buffer: np.ndarray) -> List[TranscriptToken]:
|
| 170 |
"""转录音频并返回转录片段"""
|
| 171 |
log_block("Audio buffer length", f"{audio_buffer.shape[0]/self.sample_rate:.2f}", "s")
|
| 172 |
start_time = time.perf_counter()
|
|
@@ -175,14 +133,11 @@ class WhisperTranscriptionService(ServeClientBase):
|
|
| 175 |
segments = result.segments
|
| 176 |
time_diff = (time.perf_counter() - start_time)
|
| 177 |
logger.debug(f"📝 Transcrible Segments: {segments} ")
|
| 178 |
-
logger.debug(f"📝 Transcrible: {self.text_separator.join(seg.text for seg in segments)} ")
|
| 179 |
log_block("📝 Transcrible output", f"{self.text_separator.join(seg.text for seg in segments)}", "")
|
| 180 |
log_block("📝 Transcrible time", f"{time_diff:.3f}", "s")
|
| 181 |
self._transcrible_time_cost = round(time_diff, 3)
|
| 182 |
-
return
|
| 183 |
-
TranscriptToken(text=s.text, t0=s.t0, t1=s.t1)
|
| 184 |
-
for s in segments
|
| 185 |
-
]
|
| 186 |
|
| 187 |
def _translate_text(self, text: str) -> str:
|
| 188 |
"""将文本翻译为目标语言"""
|
|
@@ -216,40 +171,44 @@ class WhisperTranscriptionService(ServeClientBase):
|
|
| 216 |
self._translate_time_cost = round(time_diff, 3)
|
| 217 |
return translated_text
|
| 218 |
|
| 219 |
-
|
| 220 |
-
|
| 221 |
def _transcription_processing_loop(self) -> None:
|
| 222 |
"""主转录处理循环"""
|
| 223 |
-
c = 0
|
| 224 |
-
while not self._translate_thread_stop.is_set():
|
| 225 |
-
if self.exit:
|
| 226 |
-
logger.info("Exiting transcription thread")
|
| 227 |
-
break
|
| 228 |
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
logger.info("Waiting for audio data...")
|
| 233 |
-
continue
|
| 234 |
-
|
| 235 |
-
# 获取音频块进行处理
|
| 236 |
-
audio_buffer = self._get_audio_for_processing()
|
| 237 |
-
if audio_buffer is None:
|
| 238 |
time.sleep(0.2)
|
| 239 |
continue
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
# try:
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
|
|
|
| 248 |
self._send_result_to_client(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
|
| 250 |
# except Exception as e:
|
| 251 |
# logger.error(f"Error processing audio: {e}")
|
| 252 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
def _process_transcription_results(self, segments: List[TranscriptToken], audio_buffer: np.ndarray) -> Iterator[TransResult]:
|
| 254 |
"""
|
| 255 |
处理转录结果,生成翻译结果
|
|
|
|
| 8 |
import asyncio
|
| 9 |
import numpy as np
|
| 10 |
import config
|
| 11 |
+
|
| 12 |
from api_model import TransResult, Message, DebugResult
|
| 13 |
+
|
| 14 |
from .utils import log_block, save_to_wave, TestDataWriter
|
| 15 |
from .translatepipes import TranslatePipes
|
| 16 |
from .strategy import (
|
| 17 |
TranscriptStabilityAnalyzer, TranscriptToken)
|
| 18 |
+
from transcribe.helpers.vadprocessor import VadProcessor
|
| 19 |
+
from transcribe.pipelines import MetaItem
|
| 20 |
|
| 21 |
logger = getLogger("TranscriptionService")
|
| 22 |
|
| 23 |
|
| 24 |
+
class WhisperTranscriptionService:
|
| 25 |
"""
|
| 26 |
Whisper语音转录服务类,处理音频流转录和翻译
|
| 27 |
"""
|
| 28 |
|
| 29 |
+
SERVER_READY = "SERVER_READY"
|
| 30 |
+
DISCONNECT = "DISCONNECT"
|
| 31 |
+
|
| 32 |
def __init__(self, websocket, pipe: TranslatePipes, language=None, dst_lang=None, client_uid=None):
|
| 33 |
+
|
| 34 |
self.source_language = language # 源语言
|
| 35 |
self.target_language = dst_lang # 目标翻译语言
|
| 36 |
+
self.client_uid = client_uid
|
| 37 |
# 转录结果稳定性管理
|
| 38 |
+
self.websocket = websocket
|
| 39 |
self._translate_pipe = pipe
|
| 40 |
|
| 41 |
# 音频处理相关
|
| 42 |
self.sample_rate = 16000
|
| 43 |
+
|
| 44 |
self.lock = threading.Lock()
|
| 45 |
self._frame_queue = queue.Queue()
|
| 46 |
+
self._vad_frame_queue = queue.Queue()
|
| 47 |
|
| 48 |
# 文本分隔符,根据语言设置
|
| 49 |
self.text_separator = self._get_text_separator(language)
|
| 50 |
self.loop = asyncio.get_event_loop()
|
| 51 |
# 发送就绪状态
|
| 52 |
+
|
| 53 |
self._transcrible_analysis = None
|
| 54 |
# 启动处理线程
|
| 55 |
self._translate_thread_stop = threading.Event()
|
|
|
|
| 57 |
|
| 58 |
self.translate_thread = self._start_thread(self._transcription_processing_loop)
|
| 59 |
self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
|
| 60 |
+
self._vad = VadProcessor()
|
| 61 |
+
self.row_number = 0
|
| 62 |
# for test
|
| 63 |
self._transcrible_time_cost = 0.
|
| 64 |
self._translate_time_cost = 0.
|
|
|
|
| 87 |
"""根据语言返回适当的文本分隔符"""
|
| 88 |
return "" if language == "zh" else " "
|
| 89 |
|
| 90 |
+
async def send_ready_state(self) -> None:
|
| 91 |
"""发送服务就绪状态消息"""
|
| 92 |
+
await self.websocket.send(json.dumps({
|
| 93 |
"uid": self.client_uid,
|
| 94 |
"message": self.SERVER_READY,
|
| 95 |
"backend": "whisper_transcription"
|
|
|
|
| 99 |
"""设置源语言和目标语言"""
|
| 100 |
self.source_language = source_lang
|
| 101 |
self.target_language = target_lang
|
| 102 |
+
# self.text_separator = self._get_text_separator(source_lang)
|
| 103 |
+
# self._transcrible_analysis = TranscriptStabilityAnalyzer(self.source_language, self.text_separator)
|
| 104 |
|
| 105 |
+
def add_frames(self, frame_np: np.ndarray) -> None:
|
| 106 |
"""添加音频帧到处理队列"""
|
| 107 |
self._frame_queue.put(frame_np)
|
| 108 |
|
|
|
|
| 110 |
"""从队列获取音频帧并合并到缓冲区"""
|
| 111 |
while not self._frame_processing_thread_stop.is_set():
|
| 112 |
try:
|
| 113 |
+
audio = self._frame_queue.get(timeout=0.1)
|
| 114 |
+
# save_to_wave(f"{self._c}_before_vad.wav", audio)
|
| 115 |
+
processed_audio = self._vad.process_audio(audio)
|
| 116 |
+
if processed_audio.shape[0] > 0:
|
| 117 |
+
# vad_processed_audio = processed_audio
|
| 118 |
+
# save_to_wave(f"{self._c}_after_vad.wav", processed_audio)
|
| 119 |
+
# vad_frame_obj = np.frombuffer(processed_audio.audio, dtype=np.float32)
|
| 120 |
+
logger.debug(f"Vad frame: {processed_audio.shape[0]/self.sample_rate:.2f}")
|
| 121 |
+
# apply vad speech check:
|
| 122 |
+
self._vad_frame_queue.put(processed_audio)
|
| 123 |
except queue.Empty:
|
| 124 |
pass
|
| 125 |
|
| 126 |
+
|
| 127 |
+
def _transcribe_audio(self, audio_buffer: np.ndarray)->MetaItem:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
"""转录音频并返回转录片段"""
|
| 129 |
log_block("Audio buffer length", f"{audio_buffer.shape[0]/self.sample_rate:.2f}", "s")
|
| 130 |
start_time = time.perf_counter()
|
|
|
|
| 133 |
segments = result.segments
|
| 134 |
time_diff = (time.perf_counter() - start_time)
|
| 135 |
logger.debug(f"📝 Transcrible Segments: {segments} ")
|
| 136 |
+
# logger.debug(f"📝 Transcrible: {self.text_separator.join(seg.text for seg in segments)} ")
|
| 137 |
log_block("📝 Transcrible output", f"{self.text_separator.join(seg.text for seg in segments)}", "")
|
| 138 |
log_block("📝 Transcrible time", f"{time_diff:.3f}", "s")
|
| 139 |
self._transcrible_time_cost = round(time_diff, 3)
|
| 140 |
+
return result
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
def _translate_text(self, text: str) -> str:
|
| 143 |
"""将文本翻译为目标语言"""
|
|
|
|
| 171 |
self._translate_time_cost = round(time_diff, 3)
|
| 172 |
return translated_text
|
| 173 |
|
|
|
|
|
|
|
| 174 |
def _transcription_processing_loop(self) -> None:
|
| 175 |
"""主转录处理循环"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
+
while not self._translate_thread_stop.is_set():
|
| 178 |
+
audio_buffer = self._vad_frame_queue.get()
|
| 179 |
+
if audio_buffer is None or len(audio_buffer) < int(self.sample_rate):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
time.sleep(0.2)
|
| 181 |
continue
|
| 182 |
+
|
| 183 |
+
logger.debug(f"audio buffer size: {len(audio_buffer) / self.sample_rate:.2f}s")
|
|
|
|
| 184 |
# try:
|
| 185 |
+
meta_item = self._transcribe_audio(audio_buffer)
|
| 186 |
+
segments = meta_item.segments
|
| 187 |
+
logger.debug(f"Segments: {segments}")
|
| 188 |
+
if len(segments):
|
| 189 |
+
result = self._process_transcription_results_2(segments)
|
| 190 |
self._send_result_to_client(result)
|
| 191 |
+
time.sleep(0.1)
|
| 192 |
+
# 处理转录结果并发送到客户端
|
| 193 |
+
# for result in self._process_transcription_results(segments, audio_buffer):
|
| 194 |
+
# self._send_result_to_client(result)
|
| 195 |
|
| 196 |
# except Exception as e:
|
| 197 |
# logger.error(f"Error processing audio: {e}")
|
| 198 |
|
| 199 |
+
def _process_transcription_results_2(self, segments: List[TranscriptToken],):
|
| 200 |
+
seg = segments[0]
|
| 201 |
+
item = TransResult(
|
| 202 |
+
seg_id=self.row_number,
|
| 203 |
+
context=seg.text,
|
| 204 |
+
from_=self.source_language,
|
| 205 |
+
to=self.target_language,
|
| 206 |
+
tran_content=self._translate_text_large(seg.text),
|
| 207 |
+
partial=False
|
| 208 |
+
)
|
| 209 |
+
self.row_number += 1
|
| 210 |
+
return item
|
| 211 |
+
|
| 212 |
def _process_transcription_results(self, segments: List[TranscriptToken], audio_buffer: np.ndarray) -> Iterator[TransResult]:
|
| 213 |
"""
|
| 214 |
处理转录结果,生成翻译结果
|