Xin Zhang committed on
Commit a79d676 · 2 parents: 9b1398f 601009a

Merge branch 'vad'


* vad:
[fix]: update view.
update vad parameter
change to vad audio translate
add whisper fine tune for chinese
vad parameters v1 test
fix vad bug
[fix]: update installation.
fix vad buf

config.py CHANGED
@@ -2,7 +2,7 @@ import pathlib
 import re
 import logging
 
-DEBUG = False
+DEBUG = True
 TEST = False
 logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
 
@@ -10,7 +10,7 @@ logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
 logging.basicConfig(
     level=logging.DEBUG if DEBUG else logging.INFO,
     format="%(asctime)s - %(levelname)s - %(message)s",
-    filename='translator.log',
+    filename='translator.log',
     datefmt="%H:%M:%S"
 )
 
@@ -50,9 +50,10 @@ MAX_LENTH_ZH = 4
 WHISPER_PROMPT_EN = ""# "The following is an English sentence."
 MAX_LENGTH_EN= 8
 
-# WHISPER_MODEL = 'medium-q5_0'
-WHISPER_MODEL = 'large-v3-turbo-q5_0'
-
+WHISPER_MODEL_EN = 'medium-q5_0'
+# WHISPER_MODEL = 'large-v3-turbo-q5_0'
+# WHISPER_MODEL_ZH = 'small'
+WHISPER_MODEL_ZH = 'large-v3-turbo-q5_0'
 # LLM
 LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
 LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
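
Note: the single WHISPER_MODEL setting is gone; the transcription model name is now chosen per source language (see transcribe/helpers/whisper.py below). A minimal sketch of the resulting lookup — the helper name pick_whisper_model is illustrative, not part of the commit:

import config

def pick_whisper_model(source_lang: str = 'en') -> str:
    # Chinese uses the quantized large-v3 turbo model; everything else gets the medium model.
    return config.WHISPER_MODEL_ZH if source_lang == 'zh' else config.WHISPER_MODEL_EN

assert pick_whisper_model('zh') == 'large-v3-turbo-q5_0'
assert pick_whisper_model() == 'medium-q5_0'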
frontend/assets/{index-eff0154e.css → index-2c7aa850.css} RENAMED
@@ -1 +1 @@
- html,body{width:100%;height:100%}input::-ms-clear,input::-ms-reveal{display:none}*,*:before,*:after{box-sizing:border-box}html{font-family:sans-serif;line-height:1.15;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-overflow-style:scrollbar;-webkit-tap-highlight-color:rgba(0,0,0,0)}body{margin:0}[tabindex="-1"]:focus{outline:none}hr{box-sizing:content-box;height:0;overflow:visible}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5em;font-weight:500}p{margin-top:0;margin-bottom:1em}abbr[title],abbr[data-original-title]{-webkit-text-decoration:underline dotted;text-decoration:underline;text-decoration:underline dotted;border-bottom:0;cursor:help}address{margin-bottom:1em;font-style:normal;line-height:inherit}input[type=text],input[type=password],input[type=number],textarea{-webkit-appearance:none}ol,ul,dl{margin-top:0;margin-bottom:1em}ol ol,ul ul,ol ul,ul ol{margin-bottom:0}dt{font-weight:500}dd{margin-bottom:.5em;margin-left:0}blockquote{margin:0 0 1em}dfn{font-style:italic}b,strong{font-weight:bolder}small{font-size:80%}sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}pre,code,kbd,samp{font-size:1em;font-family:SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace}pre{margin-top:0;margin-bottom:1em;overflow:auto}figure{margin:0 0 1em}img{vertical-align:middle;border-style:none}a,area,button,[role=button],input:not([type=range]),label,select,summary,textarea{touch-action:manipulation}table{border-collapse:collapse}caption{padding-top:.75em;padding-bottom:.3em;text-align:left;caption-side:bottom}input,button,select,optgroup,textarea{margin:0;color:inherit;font-size:inherit;font-family:inherit;line-height:inherit}button,input{overflow:visible}button,select{text-transform:none}button,html [type=button],[type=reset],[type=submit]{-webkit-appearance:button}button::-moz-focus-inner,[type=button]::-moz-focus-inner,[type=reset]::-moz-focus-inner,[type=submit]::-moz-focus-inner{padding:0;border-style:none}input[type=radio],input[type=checkbox]{box-sizing:border-box;padding:0}input[type=date],input[type=time],input[type=datetime-local],input[type=month]{-webkit-appearance:listbox}textarea{overflow:auto;resize:vertical}fieldset{min-width:0;margin:0;padding:0;border:0}legend{display:block;width:100%;max-width:100%;margin-bottom:.5em;padding:0;color:inherit;font-size:1.5em;line-height:inherit;white-space:normal}progress{vertical-align:baseline}[type=number]::-webkit-inner-spin-button,[type=number]::-webkit-outer-spin-button{height:auto}[type=search]{outline-offset:-2px;-webkit-appearance:none}[type=search]::-webkit-search-cancel-button,[type=search]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}summary{display:list-item}template{display:none}[hidden]{display:none!important}mark{padding:.2em;background-color:#feffe6}:root{font-family:Inter,system-ui,Avenir,Helvetica,Arial,sans-serif;line-height:1.5;font-weight:400;color-scheme:light dark;color:#ffffffde;background-color:#242424;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%}a{font-weight:500;color:#646cff;text-decoration:inherit}a:hover{color:#535bf2}body{margin:0;display:flex;place-items:center;min-width:320px;height:auto;min-height:auto;color:#333;background:#fff}h1{font-size:3.2em;line-height:1.1}button{border-radius:8px;border:1px solid transparent;padding:.6em 
1.2em;font-size:1em;font-weight:500;font-family:inherit;background-color:#1a1a1a;cursor:pointer;transition:border-color .25s}.card{border-bottom:solid 2px lightgray;align-items:center;justify-content:center;margin-top:40px;display:flex;max-width:1024px;width:100%}.seg-title{margin:24px 0;font-size:20px;font-weight:500}.seg-co{width:1022px;text-align:left;border-left:solid 6px midnightblue;padding-left:8px;margin-left:2px;margin-top:36px;line-height:24px}#app{margin:0 auto;padding:0;text-align:center;width:100%}.ant-btn{padding:4px 12px}@media (prefers-color-scheme: light){:root{color:#213547;background-color:#fff}a:hover{color:#747bff}button{background-color:#f9f9f9}}.ant-card{background:#f5f6fa}.ant-card .ant-card-actions{background-color:#e8e8f8cc!important}.ant-popover{max-width:800px!important}.ant-form-item{background:transparent;margin-bottom:40px!important}.ant-form-item .ant-form-item-explain-error{color:#ff4d4f;text-align:left!important}.ant-form-item-label label{font-size:18px!important;color:#1a1a1a!important;font-weight:500!important}.ant-tooltip{max-width:1022px!important}.ant-page-header-heading{width:1022px!important}.highlight{background:ghostwhite}.content[data-v-178d5f9f]{background-color:#fff;max-width:1280px;min-height:720px;margin:0 auto;display:flex;flex-direction:column;align-items:center;justify-content:space-between}.not-found-wrapper[data-v-aef52a59]{height:calc(100vh - 104px)}.view-wrapper[data-v-863105d2]{width:100%;height:100%;background-color:#fff}.view-wrapper .content-wrapper[data-v-863105d2]{text-align:left;width:1280px;max-width:100vw;min-width:320px;margin-bottom:64px;min-height:calc(100vh - 438px)}.view-wrapper .content-wrapper .chat-box[data-v-863105d2]{width:100%;height:54vh;border-radius:4px;padding:12px;color:#2e2f33;font-size:18px}.view-wrapper .content-wrapper .chat-box-placeholder[data-v-863105d2]{width:100%;height:58vh;border-radius:4px;padding:12px;font-size:18px;color:#a4a6ac}.view-wrapper .content-wrapper .actions-box[data-v-863105d2]{display:flex;align-items:center;justify-content:space-between;margin:0 24px;height:48px}.view-wrapper .content-wrapper .trans-list[data-v-863105d2]{overflow-y:auto;width:100%;height:58vh;scrollbar-width:none;-ms-overflow-style:none}.view-wrapper .content-wrapper .trans-list[data-v-863105d2]::-webkit-scrollbar{display:none}.view-wrapper .content-wrapper .trans-list .node[data-v-863105d2]{margin-bottom:36px;width:100%!important;transition:all .3s ease}.view-wrapper .content-wrapper .trans-list .node .trans-time[data-v-863105d2]{font-size:14px;color:#c4c6cc}.view-wrapper .content-wrapper .trans-list .node .trans-src-lang[data-v-863105d2]{font-size:18px;color:#909299;font-weight:500}.view-wrapper .content-wrapper .trans-list .node .trans-dst-lang[data-v-863105d2]{font-size:18px;color:#2e2f33;font-weight:600}.view-wrapper .content-wrapper .trans-list .current_node[data-v-863105d2]{background-color:#f0f1f7;padding:4px 8px}@keyframes highlight-863105d2{0%{background-color:transparent}50%{background-color:#fff1ce80}to{background-color:transparent}}@keyframes slideIn-863105d2{0%{opacity:0;transform:translateY(10px)}to{opacity:1;transform:translateY(0)}}.content-wrapper[data-v-c39ab0d6]{text-align:left;max-width:800px;min-width:320px;margin-bottom:64px;min-height:calc(100vh - 438px)}.content-wrapper .content-box[data-v-c39ab0d6]{padding:24px;height:240px;background-color:#e8e8e8;border-radius:16px;width:50%;margin:48px auto;min-width:300px}.content-wrapper 
.video-box[data-v-c39ab0d6]{max-width:800px;min-width:320px;width:90vw;height:auto}
 
+ html,body{width:100%;height:100%}input::-ms-clear,input::-ms-reveal{display:none}*,*:before,*:after{box-sizing:border-box}html{font-family:sans-serif;line-height:1.15;-webkit-text-size-adjust:100%;-ms-text-size-adjust:100%;-ms-overflow-style:scrollbar;-webkit-tap-highlight-color:rgba(0,0,0,0)}body{margin:0}[tabindex="-1"]:focus{outline:none}hr{box-sizing:content-box;height:0;overflow:visible}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5em;font-weight:500}p{margin-top:0;margin-bottom:1em}abbr[title],abbr[data-original-title]{-webkit-text-decoration:underline dotted;text-decoration:underline;text-decoration:underline dotted;border-bottom:0;cursor:help}address{margin-bottom:1em;font-style:normal;line-height:inherit}input[type=text],input[type=password],input[type=number],textarea{-webkit-appearance:none}ol,ul,dl{margin-top:0;margin-bottom:1em}ol ol,ul ul,ol ul,ul ol{margin-bottom:0}dt{font-weight:500}dd{margin-bottom:.5em;margin-left:0}blockquote{margin:0 0 1em}dfn{font-style:italic}b,strong{font-weight:bolder}small{font-size:80%}sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}pre,code,kbd,samp{font-size:1em;font-family:SFMono-Regular,Consolas,Liberation Mono,Menlo,Courier,monospace}pre{margin-top:0;margin-bottom:1em;overflow:auto}figure{margin:0 0 1em}img{vertical-align:middle;border-style:none}a,area,button,[role=button],input:not([type=range]),label,select,summary,textarea{touch-action:manipulation}table{border-collapse:collapse}caption{padding-top:.75em;padding-bottom:.3em;text-align:left;caption-side:bottom}input,button,select,optgroup,textarea{margin:0;color:inherit;font-size:inherit;font-family:inherit;line-height:inherit}button,input{overflow:visible}button,select{text-transform:none}button,html [type=button],[type=reset],[type=submit]{-webkit-appearance:button}button::-moz-focus-inner,[type=button]::-moz-focus-inner,[type=reset]::-moz-focus-inner,[type=submit]::-moz-focus-inner{padding:0;border-style:none}input[type=radio],input[type=checkbox]{box-sizing:border-box;padding:0}input[type=date],input[type=time],input[type=datetime-local],input[type=month]{-webkit-appearance:listbox}textarea{overflow:auto;resize:vertical}fieldset{min-width:0;margin:0;padding:0;border:0}legend{display:block;width:100%;max-width:100%;margin-bottom:.5em;padding:0;color:inherit;font-size:1.5em;line-height:inherit;white-space:normal}progress{vertical-align:baseline}[type=number]::-webkit-inner-spin-button,[type=number]::-webkit-outer-spin-button{height:auto}[type=search]{outline-offset:-2px;-webkit-appearance:none}[type=search]::-webkit-search-cancel-button,[type=search]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}summary{display:list-item}template{display:none}[hidden]{display:none!important}mark{padding:.2em;background-color:#feffe6}:root{font-family:Inter,system-ui,Avenir,Helvetica,Arial,sans-serif;line-height:1.5;font-weight:400;color-scheme:light dark;color:#ffffffde;background-color:#242424;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%}a{font-weight:500;color:#646cff;text-decoration:inherit}a:hover{color:#535bf2}body{margin:0;display:flex;place-items:center;min-width:320px;height:auto;min-height:auto;color:#333;background:#fff}h1{font-size:3.2em;line-height:1.1}button{border-radius:8px;border:1px solid transparent;padding:.6em 
1.2em;font-size:1em;font-weight:500;font-family:inherit;background-color:#1a1a1a;cursor:pointer;transition:border-color .25s}.card{border-bottom:solid 2px lightgray;align-items:center;justify-content:center;margin-top:40px;display:flex;max-width:1024px;width:100%}.seg-title{margin:24px 0;font-size:20px;font-weight:500}.seg-co{width:1022px;text-align:left;border-left:solid 6px midnightblue;padding-left:8px;margin-left:2px;margin-top:36px;line-height:24px}#app{margin:0 auto;padding:0;text-align:center;width:100%}.ant-btn{padding:4px 12px}@media (prefers-color-scheme: light){:root{color:#213547;background-color:#fff}a:hover{color:#747bff}button{background-color:#f9f9f9}}.ant-card{background:#f5f6fa}.ant-card .ant-card-actions{background-color:#e8e8f8cc!important}.ant-popover{max-width:800px!important}.ant-form-item{background:transparent;margin-bottom:40px!important}.ant-form-item .ant-form-item-explain-error{color:#ff4d4f;text-align:left!important}.ant-form-item-label label{font-size:18px!important;color:#1a1a1a!important;font-weight:500!important}.ant-tooltip{max-width:1022px!important}.ant-page-header-heading{width:1022px!important}.highlight{background:ghostwhite}.content[data-v-178d5f9f]{background-color:#fff;max-width:1280px;min-height:720px;margin:0 auto;display:flex;flex-direction:column;align-items:center;justify-content:space-between}.not-found-wrapper[data-v-aef52a59]{height:calc(100vh - 104px)}.config-content[data-v-ba23d083]{width:420px;margin:12px}.config-content .config-block[data-v-ba23d083]{margin:12px;padding-bottom:12px}.view-wrapper[data-v-ba23d083]{width:100%;height:100%;background-color:#fff}.view-wrapper .wrapper-width-fixed[data-v-ba23d083]{width:1280px}.view-wrapper .wrapper-width-auto[data-v-ba23d083]{width:100vw}.view-wrapper .content-wrapper[data-v-ba23d083]{text-align:left;max-width:100vw;min-width:320px;margin-bottom:64px;min-height:calc(100vh - 438px)}.view-wrapper .content-wrapper .chat-box[data-v-ba23d083]{width:100%;height:54vh;border-radius:4px;padding:12px;color:#2e2f33;font-size:18px}.view-wrapper .content-wrapper .chat-box-placeholder[data-v-ba23d083]{width:100%;height:58vh;border-radius:4px;padding:12px;font-size:18px;color:#a4a6ac}.view-wrapper .content-wrapper .actions-box[data-v-ba23d083]{display:flex;align-items:center;justify-content:space-between;margin:0 24px;height:48px}.view-wrapper .content-wrapper .actions-box .left-actions[data-v-ba23d083]{display:flex;align-items:center;justify-content:space-between;width:288px}.view-wrapper .content-wrapper .trans-list[data-v-ba23d083]{overflow-y:auto;width:100%;height:58vh;scrollbar-width:none;-ms-overflow-style:none}.view-wrapper .content-wrapper .trans-list[data-v-ba23d083]::-webkit-scrollbar{display:none}.view-wrapper .content-wrapper .trans-list .node[data-v-ba23d083]{margin-bottom:36px;width:100%!important;transition:all .3s ease}.view-wrapper .content-wrapper .trans-list .node .trans-time[data-v-ba23d083]{font-size:14px;color:#c4c6cc}.view-wrapper .content-wrapper .trans-list .node .trans-font-size-16[data-v-ba23d083]{font-size:16px}.view-wrapper .content-wrapper .trans-list .node .trans-font-size-18[data-v-ba23d083]{font-size:18px}.view-wrapper .content-wrapper .trans-list .node .trans-font-size-20[data-v-ba23d083]{font-size:20px}.view-wrapper .content-wrapper .trans-list .node .trans-font-size-22[data-v-ba23d083]{font-size:22px}.view-wrapper .content-wrapper .trans-list .node .trans-font-size-24[data-v-ba23d083]{font-size:24px}.view-wrapper .content-wrapper .trans-list .node 
.trans-src-lang[data-v-ba23d083]{color:#909299;font-weight:500}.view-wrapper .content-wrapper .trans-list .node .trans-dst-lang[data-v-ba23d083]{color:#2e2f33;font-weight:600}.view-wrapper .content-wrapper .trans-list .current_node[data-v-ba23d083]{background-color:#f0f1f7;padding:4px 8px}@keyframes highlight-ba23d083{0%{background-color:transparent}50%{background-color:#fff1ce80}to{background-color:transparent}}@keyframes slideIn-ba23d083{0%{opacity:0;transform:translateY(10px)}to{opacity:1;transform:translateY(0)}}.content-wrapper[data-v-c39ab0d6]{text-align:left;max-width:800px;min-width:320px;margin-bottom:64px;min-height:calc(100vh - 438px)}.content-wrapper .content-box[data-v-c39ab0d6]{padding:24px;height:240px;background-color:#e8e8e8;border-radius:16px;width:50%;margin:48px auto;min-width:300px}.content-wrapper .video-box[data-v-c39ab0d6]{max-width:800px;min-width:320px;width:90vw;height:auto}
frontend/assets/{index-0364c095.js → index-640e640f.js} RENAMED
The diff for this file is too large to render. See raw diff
 
frontend/index.html CHANGED
@@ -5,8 +5,8 @@
     <link rel="icon" type="image/svg+xml" href="./favicon.ico" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>Translator</title>
-    <script type="module" crossorigin src="./assets/index-0364c095.js"></script>
-    <link rel="stylesheet" href="./assets/index-eff0154e.css">
+    <script type="module" crossorigin src="./assets/index-640e640f.js"></script>
+    <link rel="stylesheet" href="./assets/index-2c7aa850.css">
   </head>
   <body>
     <div id="app"></div>
main.py CHANGED
@@ -57,6 +57,7 @@ async def root():
 async def translate(websocket: WebSocket):
     query_parameters_dict = websocket.query_params
     from_lang, to_lang = query_parameters_dict.get('from'), query_parameters_dict.get('to')
+
     client = WhisperTranscriptionService(
         websocket,
         pipe,
@@ -64,6 +65,7 @@ async def translate(websocket: WebSocket):
         client_uid=f"{uuid1()}",
     )
 
+
     if from_lang and to_lang:
         client.set_language(from_lang, to_lang)
         logger.info(f"Source lange: {from_lang} -> Dst lange: {to_lang}")
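
For context, the handler still reads the language pair from the websocket query string. A client-side sketch — the endpoint path and port are assumptions based on the handler name, not confirmed by this diff:

import asyncio
import websockets  # third-party: pip install websockets

async def main():
    uri = "ws://localhost:8000/translate?from=zh&to=en"
    async with websockets.connect(uri) as ws:
        print(await ws.recv())  # expect the server-ready message once the client is registered

asyncio.run(main())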
moyoyo_asr_models/ggml-small-encoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18ad2072ae82872c2ba8a187071e1e7d6c1105253685e7aa95138adcf07874e0
+size 207
moyoyo_asr_models/ggml-small-encoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05fe28591b40616fa0c34ad7b853133623f5300923ec812acb11459c411acf3b
+size 149
moyoyo_asr_models/ggml-small-encoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,64 @@
+[
+  {
+    "metadataOutputVersion" : "3.0",
+    "storagePrecision" : "Float16",
+    "outputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32)",
+        "shortDescription" : "",
+        "shape" : "[]",
+        "name" : "output",
+        "type" : "MultiArray"
+      }
+    ],
+    "modelParameters" : [
+
+    ],
+    "specificationVersion" : 6,
+    "mlProgramOperationTypeHistogram" : {
+      "Linear" : 72,
+      "Matmul" : 24,
+      "Cast" : 2,
+      "Conv" : 2,
+      "Softmax" : 12,
+      "Add" : 25,
+      "LayerNorm" : 25,
+      "Mul" : 24,
+      "Transpose" : 49,
+      "Gelu" : 14,
+      "Reshape" : 48
+    },
+    "computePrecision" : "Mixed (Float16, Float32, Int32)",
+    "isUpdatable" : "0",
+    "availability" : {
+      "macOS" : "12.0",
+      "tvOS" : "15.0",
+      "watchOS" : "8.0",
+      "iOS" : "15.0",
+      "macCatalyst" : "15.0"
+    },
+    "modelType" : {
+      "name" : "MLModelType_mlProgram"
+    },
+    "userDefinedMetadata" : {
+
+    },
+    "inputSchema" : [
+      {
+        "hasShapeFlexibility" : "0",
+        "isOptional" : "0",
+        "dataType" : "Float32",
+        "formattedType" : "MultiArray (Float32 1 × 80 × 3000)",
+        "shortDescription" : "",
+        "shape" : "[1, 80, 3000]",
+        "name" : "logmel_data",
+        "type" : "MultiArray"
+      }
+    ],
+    "generatedClassName" : "coreml_encoder_small",
+    "method" : "predict"
+  }
+]
moyoyo_asr_models/ggml-small-encoder.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
moyoyo_asr_models/ggml-small-encoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:87eed4ae76f11a2d4a50786bc7423d4b45c2d0d9ca05577a3bd2557452072eaf
+size 176339456
moyoyo_asr_models/ggml-small.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f6ef171491de375b741059400ba9a0aead023122b7a7db731b4943f9baa0f97
+size 487601984
transcribe/helpers/vadprocessor.py CHANGED
@@ -1,276 +1,296 @@
-import gc
-import time
+from copy import deepcopy
+from queue import Queue, Empty
+from time import time
+from config import VAD_MODEL_PATH
+# from silero_vad import load_silero_vad
 import numpy as np
 import onnxruntime
-from datetime import timedelta
-from pydub import AudioSegment
-from silero_vad import load_silero_vad, get_speech_timestamps, VADIterator
-import os
-import logging
-
-class FixedVADIterator(VADIterator):
-    '''It fixes VADIterator by allowing to process any audio length, not only exactly 512 frames at once.
-    If audio to be processed at once is long and multiple voiced segments detected,
-    then __call__ returns the start of the first segment, and end (or middle, which means no end) of the last segment.
-    '''
-
-    def reset_states(self):
-        super().reset_states()
-        self.buffer = np.array([],dtype=np.float32)
-
-    def __call__(self, x, return_seconds=False):
-        self.buffer = np.append(self.buffer, x)
-        ret = None
-        while len(self.buffer) >= 512:
-            r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
-            self.buffer = self.buffer[512:]
-            if ret is None:
-                ret = r
-            elif r is not None:
-                if 'end' in r:
-                    ret['end'] = r['end']  # the latter end
-                if 'start' in r and 'end' in ret:  # there is an earlier start.
-                    # Remove end, merging this segment with the previous one.
-                    del ret['end']
-        return ret if ret != {} else None
-
-class SileroVADProcessor:
-    """
-    A class for processing audio files using Silero VAD to detect voice activity
-    and extract voice segments from audio files.
-    """
-
-    def __init__(self,
-                 activate_threshold=0.5,
-                 fusion_threshold=0.3,
-                 min_speech_duration=0.25,
-                 max_speech_duration=20,
-                 min_silence_duration=250,
-                 sample_rate=16000,
-                 ort_providers=None):
-        """
-        Initialize the SileroVADProcessor.
-
-        Args:
-            activate_threshold (float): Threshold for voice activity detection
-            fusion_threshold (float): Threshold for merging close speech segments (seconds)
-            min_speech_duration (float): Minimum duration of speech to be considered valid (seconds)
-            max_speech_duration (float): Maximum duration of speech (seconds)
-            min_silence_duration (int): Minimum silence duration (ms)
-            sample_rate (int): Sample rate of the audio (8000 or 16000 Hz)
-            ort_providers (list): ONNX Runtime providers for acceleration
-        """
-        # VAD parameters
-        self.activate_threshold = activate_threshold
-        self.fusion_threshold = fusion_threshold
-        self.min_speech_duration = min_speech_duration
-        self.max_speech_duration = max_speech_duration
-        self.min_silence_duration = min_silence_duration
-        self.sample_rate = sample_rate
-        self.ort_providers = ort_providers if ort_providers else []
-
-        # Initialize logger
-        self.logger = logging.getLogger(__name__)
-
-        # Load Silero VAD model
-        self._init_onnx_session()
-        self.silero_vad = load_silero_vad(onnx=True)
-
-    def _init_onnx_session(self):
-        """Initialize ONNX Runtime session with appropriate settings."""
-        session_opts = onnxruntime.SessionOptions()
-        session_opts.log_severity_level = 3
-        session_opts.inter_op_num_threads = 0
-        session_opts.intra_op_num_threads = 0
-        session_opts.enable_cpu_mem_arena = True
-        session_opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
-        session_opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
-
-        session_opts.add_session_config_entry("session.intra_op.allow_spinning", "1")
-        session_opts.add_session_config_entry("session.inter_op.allow_spinning", "1")
-        session_opts.add_session_config_entry("session.set_denormal_as_zero", "1")
-
-        # Set the session_opts to be used by silero_vad
-        # onnxruntime.capi._pybind_state.get_default_session_options(session_opts)
-
-    def load_audio(self, audio_path):
-        """
-        Load audio file and prepare it for VAD processing.
-
-        Args:
-            audio_path (str): Path to the audio file
-
-        Returns:
-            numpy.ndarray: Audio data as numpy array
-        """
-        self.logger.info(f"Loading audio from {audio_path}")
-        audio_segment = AudioSegment.from_file(audio_path)
-        audio_segment = audio_segment.set_channels(1).set_frame_rate(self.sample_rate)
-
-        # Convert to numpy array and normalize
-        dtype = np.float16 if self.use_gpu_fp16 else np.float32
-        audio_array = np.array(audio_segment.get_array_of_samples(), dtype=dtype) * 0.000030517578  # 1/32768
-
-        self.audio_segment = audio_segment  # Store for later use
-        return audio_array
-
-    @property
-    def model(self):
-        return self.silero_vad
-
-    def process_timestamps(self, timestamps):
-        """
-        Process VAD timestamps: filter short segments and merge close segments.
-
-        Args:
-            timestamps (list): List of (start, end) tuples
-
-        Returns:
-            list: Processed list of (start, end) tuples
-        """
-        # Filter out short durations
-        filtered_timestamps = [(start, end) for start, end in timestamps
-                               if (end - start) >= self.min_speech_duration]
-
-        # Fuse timestamps in two passes for better merging
-        fused_timestamps_1st = []
-        for start, end in filtered_timestamps:
-            if fused_timestamps_1st and (start - fused_timestamps_1st[-1][1] <= self.fusion_threshold):
-                fused_timestamps_1st[-1] = (fused_timestamps_1st[-1][0], end)
-            else:
-                fused_timestamps_1st.append((start, end))
-
-        fused_timestamps_2nd = []
-        for start, end in fused_timestamps_1st:
-            if fused_timestamps_2nd and (start - fused_timestamps_2nd[-1][1] <= self.fusion_threshold):
-                fused_timestamps_2nd[-1] = (fused_timestamps_2nd[-1][0], end)
-            else:
-                fused_timestamps_2nd.append((start, end))
-
-        return fused_timestamps_2nd
-
-    def format_time(self, seconds):
-        """
-        Convert seconds to VTT time format 'hh:mm:ss.mmm'.
-
-        Args:
-            seconds (float): Time in seconds
-
-        Returns:
-            str: Formatted time string
-        """
-        td = timedelta(seconds=seconds)
-        td_sec = td.total_seconds()
-        total_seconds = int(td_sec)
-        milliseconds = int((td_sec - total_seconds) * 1000)
-        hours = total_seconds // 3600
-        minutes = (total_seconds % 3600) // 60
-        seconds = total_seconds % 60
-        return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"
-
-    def detect_speech(self, audio:np.array):
-        """
-        Run VAD on the audio file to detect speech segments.
-
-        Args:
-            audio_path (str): Path to the audio file
-
-        Returns:
-            list: List of processed timestamps as (start, end) tuples
-        """
-        self.logger.info("Starting VAD process")
-        start_time = time.time()
-        # Get speech timestamps
-        raw_timestamps = get_speech_timestamps(
-            audio,
-            model=self.silero_vad,
-            threshold=self.activate_threshold,
-            max_speech_duration_s=self.max_speech_duration,
-            min_speech_duration_ms=int(self.min_speech_duration * 1000),
-            min_silence_duration_ms=self.min_silence_duration,
-            return_seconds=True
-        )
-
-        # Convert to simple format and process
-        timestamps = [(item['start'], item['end']) for item in raw_timestamps]
-        processed_timestamps = self.process_timestamps(timestamps)
-
-        # Clean up
-        del audio
-        gc.collect()
-
-        self.logger.info(f"VAD completed in {time.time() - start_time:.3f} seconds")
-        return processed_timestamps
-
-        """
-        Save timestamps in both second and sample indices formats.
-
-        Args:
-            timestamps (list): List of (start, end) tuples
-            output_prefix (str): Prefix for output files
-        """
-        # Save timestamps in seconds (VTT format)
-        seconds_path = f"{output_prefix}_timestamps_second.txt"
-        with open(seconds_path, "w", encoding='UTF-8') as file:
-            self.logger.info("Saving timestamps in seconds format")
-            for start, end in timestamps:
-                s_time = self.format_time(start)
-                e_time = self.format_time(end)
-                line = f"{s_time} --> {e_time}\n"
-                file.write(line)
-
-        # Save timestamps in sample indices
-        indices_path = f"{output_prefix}_timestamps_indices.txt"
-        with open(indices_path, "w", encoding='UTF-8') as file:
-            self.logger.info("Saving timestamps in indices format")
-            for start, end in timestamps:
-                line = f"{int(start * self.sample_rate)} --> {int(end * self.sample_rate)}\n"
-                file.write(line)
-
-        self.logger.info(f"Timestamps saved to {seconds_path} and {indices_path}")
-
-    def extract_speech_segments(self, audio_segment, timestamps):
-        """
-        Extract speech segments from the audio and combine them into a single audio file.
-
-        Args:
-            timestamps (list): List of (start, end) tuples indicating speech segments
-
-        Returns:
-            AudioSegment: The combined speech segments
-        """
-        audio_segment = audio_segment.numpy()
-        combined_speech = np.array([], dtype=np.float32)
-
-        # Extract and combine each speech segment
-        for i, (start, end) in enumerate(timestamps):
-            # Convert seconds to milliseconds for pydub
-            start_ms = int(start * 1000)
-            end_ms = int(end * 1000)
-
-            # Ensure the end time does not exceed the length of the audio segment
-            if end_ms > len(audio_segment):
-                end_ms = len(audio_segment)
-
-            # Extract the segment
-            segment = audio_segment[start_ms:end_ms]
-
-            # Add to combined audio
-            combined_speech = np.append(combined_speech, segment)
-
-        return combined_speech
-
-    def process_audio(self, audio_array:np.array):
-        """
-        Complete processing pipeline: detect speech, save timestamps, and optionally extract speech.
-
-        Returns:
-            tuple: (timestamps, output_speech_path if extract_speech else None)
-        """
-
-        # Run VAD to detect speech
-        timestamps = self.detect_speech(audio_array)
-
-        combined_speech = self.extract_speech_segments(audio_array, timestamps)
-
-        return timestamps, combined_speech
+
+
+class OnnxWrapper():
+
+    def __init__(self, path, force_onnx_cpu=False):
+        opts = onnxruntime.SessionOptions()
+        opts.inter_op_num_threads = 1
+        opts.intra_op_num_threads = 1
+
+        if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
+            self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'], sess_options=opts)
+        else:
+            self.session = onnxruntime.InferenceSession(path, sess_options=opts)
+
+        self.reset_states()
+        self.sample_rates = [16000]
+
+    def _validate_input(self, x: np.ndarray, sr: int):
+        if x.ndim == 1:
+            x = x[None]
+        if x.ndim > 2:
+            raise ValueError(f"Too many dimensions for input audio chunk {x.ndim}")
+
+        if sr != 16000 and (sr % 16000 == 0):
+            step = sr // 16000
+            x = x[:, ::step]
+            sr = 16000
+
+        if sr not in self.sample_rates:
+            raise ValueError(f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)")
+        if sr / x.shape[1] > 31.25:
+            raise ValueError("Input audio chunk is too short")
+
+        return x, sr
+
+    def reset_states(self, batch_size=1):
+        self._state = np.zeros((2, batch_size, 128)).astype(np.float32)
+        self._context = np.zeros(0)
+        self._last_sr = 0
+        self._last_batch_size = 0
+
+    def __call__(self, x, sr: int):
+        x, sr = self._validate_input(x, sr)
+        num_samples = 512 if sr == 16000 else 256
+
+        if x.shape[-1] != num_samples:
+            raise ValueError(
+                f"Provided number of samples is {x.shape[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)")
+
+        batch_size = x.shape[0]
+        context_size = 64 if sr == 16000 else 32
+
+        if not self._last_batch_size:
+            self.reset_states(batch_size)
+        if (self._last_sr) and (self._last_sr != sr):
+            self.reset_states(batch_size)
+        if (self._last_batch_size) and (self._last_batch_size != batch_size):
+            self.reset_states(batch_size)
+
+        if not len(self._context):
+            self._context = np.zeros((batch_size, context_size)).astype(np.float32)
+
+        x = np.concatenate([self._context, x], axis=1)
+        if sr in [8000, 16000]:
+            ort_inputs = {'input': x, 'state': self._state, 'sr': np.array(sr, dtype='int64')}
+            ort_outs = self.session.run(None, ort_inputs)
+            out, state = ort_outs
+            self._state = state
+        else:
+            raise ValueError()
+
+        self._context = x[..., -context_size:]
+        self._last_sr = sr
+        self._last_batch_size = batch_size
+
+        # out = torch.from_numpy(out)
+        return out
+
+    def audio_forward(self, audio: np.ndarray, sr: int):
+        outs = []
+        x, sr = self._validate_input(audio, sr)
+        self.reset_states()
+        num_samples = 512 if sr == 16000 else 256
+
+        if x.shape[1] % num_samples:
+            pad_num = num_samples - (x.shape[1] % num_samples)
+            x = np.pad(x, ((0, 0), (0, pad_num)), 'constant', constant_values=(0.0, 0.0))
+
+        for i in range(0, x.shape[1], num_samples):
+            wavs_batch = x[:, i:i + num_samples]
+            out_chunk = self.__call__(wavs_batch, sr)
+            outs.append(out_chunk)
+
+        stacked = np.concatenate(outs, axis=1)
+        return stacked
+
+
+class VADIteratorOnnx:
+    def __init__(self,
+                 threshold: float = 0.5,
+                 sampling_rate: int = 16000,
+                 min_silence_duration_ms: int = 100,
+                 max_speech_duration_s: float = float('inf'),
+                 ):
+        self.model = OnnxWrapper(VAD_MODEL_PATH, True)
+        self.threshold = threshold
+        self.sampling_rate = sampling_rate
+
+        if sampling_rate not in [8000, 16000]:
+            raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')
+
+        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
+        self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
+        # self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
+        self.reset_states()
+
+    def reset_states(self):
+        self.model.reset_states()
+        self.triggered = False
+        self.temp_end = 0
+        self.current_sample = 0
+        self.start = 0
+
+    def __call__(self, x: np.ndarray, return_seconds=False):
+        """
+        x: np.ndarray
+            audio chunk (see examples in repo)
+
+        return_seconds: bool (default - False)
+            whether return timestamps in seconds (default - samples)
+        """
+
+        window_size_samples = 512 if self.sampling_rate == 16000 else 256
+        x = x[:window_size_samples]
+        if len(x) < window_size_samples:
+            x = np.pad(x, ((0, 0), (0, window_size_samples - len(x))), 'constant', constant_values=0.0)
+
+        self.current_sample += window_size_samples
+
+        speech_prob = self.model(x, self.sampling_rate)[0,0]
+        # print(f"{self.current_sample/self.sampling_rate:.2f}: {speech_prob}")
+
+        if (speech_prob >= self.threshold) and self.temp_end:
+            self.temp_end = 0
+
+        if (speech_prob >= self.threshold) and not self.triggered:
+            self.triggered = True
+            speech_start = max(0, self.current_sample - window_size_samples)
+            self.start = speech_start
+            return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
+
+        if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
+            if self.temp_end:
+                self.temp_end = 0
+            self.start = self.current_sample
+            return {'end': int(self.current_sample) if not return_seconds else round(self.current_sample / self.sampling_rate, 1)}
+
+        if (speech_prob < self.threshold - 0.15) and self.triggered:
+            if not self.temp_end:
+                self.temp_end = self.current_sample
+            if self.current_sample - self.temp_end < self.min_silence_samples:
+                return None
+            else:
+                speech_end = self.temp_end - window_size_samples
+                self.temp_end = 0
+                self.triggered = False
+                return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}
+
+        return None
+
+
+class VadV2:
+    def __init__(self,
+                 threshold: float = 0.5,
+                 sampling_rate: int = 16000,
+                 min_silence_duration_ms: int = 100,
+                 speech_pad_ms: int = 30,
+                 max_speech_duration_s: float = float('inf')):
+        # self.vad_iterator = VADIterator(threshold, sampling_rate, min_silence_duration_ms)
+        self.vad_iterator = VADIteratorOnnx(threshold, sampling_rate, min_silence_duration_ms, max_speech_duration_s)
+        self.speech_pad_samples = int(sampling_rate * speech_pad_ms / 1000)
+        self.sampling_rate = sampling_rate
+        self.audio_buffer = np.array([], dtype=np.float32)
+        self.start = 0
+        self.end = 0
+        self.offset = 0
+        assert speech_pad_ms <= min_silence_duration_ms, "speech_pad_ms should be less than min_silence_duration_ms"
+        self.max_speech_samples = int(sampling_rate * max_speech_duration_s)
+
+        self.silence_chunk_size = 0
+        self.silence_chunk_threshold = 60 / (512 / self.sampling_rate)
+
+    def reset(self):
+        self.audio_buffer = np.array([], dtype=np.float32)
+        self.start = 0
+        self.end = 0
+        self.offset = 0
+        self.vad_iterator.reset_states()
+
+    def __call__(self, x: np.ndarray = None):
+        if x is None:
+            if self.start:
+                start = max(self.offset, self.start - self.speech_pad_samples)
+                end = self.offset + len(self.audio_buffer)
+                start_ts = round(start / self.sampling_rate, 1)
+                end_ts = round(end / self.sampling_rate, 1)
+                audio_data = self.audio_buffer[start - self.offset: end - self.offset]
+                result = {
+                    "start": start_ts,
+                    "end": end_ts,
+                    "audio": audio_data,
+                }
+            else:
+                result = None
+            self.reset()
+            return result
+
+        self.audio_buffer = np.append(self.audio_buffer, deepcopy(x))
+
+        result = self.vad_iterator(x)
+        if result is not None:
+            # self.start = result.get('start', self.start)
+            # self.end = result.get('end', self.end)
+            self.silence_chunk_size = 0
+
+            if 'start' in result:
+                self.start = result['start']
+            if 'end' in result:
+                self.end = result['end']
+        else:
+            self.silence_chunk_size += 1
+
+        if self.start == 0 and len(self.audio_buffer) > self.speech_pad_samples:
+            self.offset += len(self.audio_buffer) - self.speech_pad_samples
+            self.audio_buffer = self.audio_buffer[-self.speech_pad_samples:]
+
+        if self.silence_chunk_size >= self.silence_chunk_threshold:
+            self.offset += len(self.audio_buffer) - self.speech_pad_samples
+            self.audio_buffer = self.audio_buffer[-self.speech_pad_samples:]
+            self.silence_chunk_size = 0
+
+        if self.end > self.start:
+            start = max(self.offset, self.start - self.speech_pad_samples)
+            end = self.end + self.speech_pad_samples
+            start_ts = round(start / self.sampling_rate, 1)
+            end_ts = round(end / self.sampling_rate, 1)
+            audio_data = self.audio_buffer[start - self.offset: end - self.offset]
+            self.audio_buffer = self.audio_buffer[self.end - self.offset:]
+            self.offset = self.end
+            self.start = self.end
+            # self.start = 0
+            self.end = 0
+            result = {
+                "start": start_ts,
+                "end": end_ts,
+                "audio": audio_data,
+            }
+
+            return result
+        return None
+
+
+class VadProcessor:
+    def __init__(
+            self,
+            prob_threshold=0.5,
+            silence_s=0.3,
+            cache_s=0.25,
+            sr=16000
+    ):
+        self.prob_thres = prob_threshold
+        self.cache_s = cache_s
+        self.sr = sr
+        self.silence_s = silence_s
+
+        self.vad = VadV2(self.prob_thres, self.sr, self.silence_s * 1000, self.cache_s * 1000, max_speech_duration_s=15)
+
+    def process_audio(self, audio_buffer: np.ndarray):
+        audio = np.array([], np.float32)
+        for i in range(0, len(audio_buffer), 512):
+            chunk = audio_buffer[i:i+512]
+            ret = self.vad(chunk)
+            if ret:
+                audio = np.append(audio, ret['audio'])
+        return audio
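
A minimal usage sketch for the new streaming VAD, assuming 16 kHz float32 mono audio and that config.VAD_MODEL_PATH points at the Silero VAD ONNX model; the tail padding is done here because the iterator consumes fixed 512-sample windows:

import numpy as np
from transcribe.helpers.vadprocessor import VadV2

SR = 16000
vad = VadV2(threshold=0.5, sampling_rate=SR,
            min_silence_duration_ms=500, speech_pad_ms=250,
            max_speech_duration_s=15)

audio = np.zeros(SR * 3, dtype=np.float32)  # stand-in for streamed microphone samples
if len(audio) % 512:
    audio = np.pad(audio, (0, 512 - len(audio) % 512))  # pad the tail to a whole window

for i in range(0, len(audio), 512):
    segment = vad(audio[i:i + 512])
    if segment:  # a closed speech segment: start/end in seconds plus the voiced samples
        print(segment['start'], segment['end'], len(segment['audio']))

tail = vad(None)  # flush: returns any still-open segment and resets internal state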
transcribe/helpers/whisper.py CHANGED
@@ -9,10 +9,14 @@ logger = getLogger(__name__)
 
 class WhisperCPP:
 
-    def __init__(self, warmup=True) -> None:
+    def __init__(self, source_lange: str='en', warmup=True) -> None:
         models_dir = config.MODEL_DIR.as_posix()
+        if source_lange == "zh":
+            whisper_model = config.WHISPER_MODEL_ZH
+        else:
+            whisper_model = config.WHISPER_MODEL_EN
         self.model = Model(
-            model=config.WHISPER_MODEL,
+            model=whisper_model,
             models_dir=models_dir,
             print_realtime=False,
             print_progress=False,
@@ -47,9 +51,9 @@ class WhisperCPP:
                 audio_buffer,
                 initial_prompt=prompt,
                 language=language,
-                token_timestamps=True,
+                # token_timestamps=True,
                 # split_on_word=True,
-                max_len=max_len
+                # max_len=max_len
             )
             return output
         except Exception as e:
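
A sketch of the reworked constructor in use — note the keyword really is spelled source_lange in this commit, the transcribe call mirrors its usage in pipe_whisper.py, and the quantized models are assumed to be present in config.MODEL_DIR:

import numpy as np
from transcribe.helpers.whisper import WhisperCPP

whisper_en = WhisperCPP()                   # loads config.WHISPER_MODEL_EN ('medium-q5_0')
whisper_zh = WhisperCPP(source_lange='zh')  # loads config.WHISPER_MODEL_ZH ('large-v3-turbo-q5_0')

audio = np.zeros(16000, dtype=np.float32)   # 1 s of silence as a stand-in
segments = whisper_zh.transcribe(audio, 'zh')
print("".join(s.text for s in segments))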
transcribe/pipelines/__init__.py CHANGED
@@ -1,5 +1,5 @@
 
 from .pipe_translate import TranslatePipe, Translate7BPipe
-from .pipe_whisper import WhisperPipe
+from .pipe_whisper import WhisperPipe, WhisperChinese
 from .pipe_vad import VadPipe
 from .base import MetaItem
transcribe/pipelines/pipe_translate.py CHANGED
@@ -35,3 +35,6 @@ class Translate7BPipe(TranslatePipe):
         if cls.translator is None:
             cls.translator = QwenTranslator(LLM_LARGE_MODEL_PATH, LLM_SYS_PROMPT_EN, LLM_SYS_PROMPT_ZH)
 
+
+
+
transcribe/pipelines/pipe_vad.py CHANGED
@@ -1,99 +1,41 @@
 
 from .base import MetaItem, BasePipe
-from ..helpers.vadprocessor import SileroVADProcessor, FixedVADIterator
+from ..helpers.vadprocessor import VadV2
 import numpy as np
 from silero_vad import get_speech_timestamps
-import torch
 from typing import List
-# import noisereduce as nr
+import logging
 
-def collect_chunks(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
-    chunks = []
-    silent_samples = int(0.3 * sample_rate)  # number of samples in 300 ms of silence
-    silence = torch.zeros(silent_samples)  # create 300 ms of silence
-
-    for i in range(len(tss)):
-        # append the current speech segment first
-        chunks.append(wav[tss[i]['start']: tss[i]['end']])
-
-        # if this is not the last segment and the gap to the next one exceeds 100 ms, insert silence
-        if i < len(tss) - 1:
-            gap = tss[i+1]['start'] - tss[i]['end']
-            if gap > 0.1 * sample_rate:  # is the gap larger than 100 ms?
-                chunks.append(silence)  # append 300 ms of silence
-
-    return torch.cat(chunks)
-
-def collect_chunks_improved(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
-    chunks = []
-    silent_samples = int(0.3 * sample_rate)  # number of samples in 300 ms of silence
-    silence = torch.zeros(silent_samples)  # create 300 ms of silence
-    min_gap_samples = int(0.1 * sample_rate)  # minimum gap threshold (100 ms)
-
-    # lightly smooth the timestamps
-    smoothed_tss = []
-    for i, ts in enumerate(tss):
-        if i > 0 and ts['start'] - tss[i-1]['end'] < 0.02 * sample_rate:  # gaps under 20 ms count as continuous speech
-            smoothed_tss[-1]['end'] = ts['end']  # merge into the previous segment
-        else:
-            smoothed_tss.append(ts)
-
-    for i in range(len(smoothed_tss)):
-        # append the current speech segment
-        chunks.append(wav[smoothed_tss[i]['start']: smoothed_tss[i]['end']])
-
-        # if this is not the last segment and the gap to the next one exceeds the threshold, insert silence
-        if i < len(smoothed_tss) - 1:
-            gap = smoothed_tss[i+1]['start'] - smoothed_tss[i]['end']
-            if gap > min_gap_samples:
-                # scale the silence with the gap size, capped at 300 ms
-                silence_length = min(gap // 2, silent_samples)
-                chunks.append(torch.zeros(silence_length))
-
-    return torch.cat(chunks)
+# import noisereduce as nr
 
 class VadPipe(BasePipe):
-    model = None
+    vac = None
     sample_rate = 16000
     window_size_samples = 512
+    chunk_size = 512
+    prob_threshold=0.5,
+    silence_s=0.5,
+    cache_s=0.25,
+
 
     @classmethod
     def init(cls):
-        if cls.model is None:
-            cls.model = SileroVADProcessor(
-                activate_threshold=0.45,     # lowered to capture more audio
-                fusion_threshold=0.45,       # raised to merge speech segments more readily
-                min_speech_duration=0.2,     # slightly lowered to catch short syllables
-                max_speech_duration=20,      # unchanged
-                min_silence_duration=300,    # raised to 300 ms to allow natural pauses while speaking
-                sample_rate=cls.sample_rate  # sampling rate of the audio signal
-            )
-            cls.vac = FixedVADIterator(cls.model.silero_vad, sampling_rate=cls.sample_rate,)
-            cls.vac.reset_states()
-
-    def get_previous_buffer(self):
-        if len(self.previous_buffer) == 2:
-            return self.previous_buffer[-1]
-        return np.array([], dtype=np.float32)
+        if cls.vac is None:
+            cls.vac = VadV2(cls.prob_threshold, cls.sample_rate, cls.silence_s * 1000, cls.cache_s * 1000, max_speech_duration_s=15)
+
+    def process(self, in_data: MetaItem) -> MetaItem:
+        audio_buffer = np.frombuffer(in_data.source_audio)
+        vad_audio = self.vac(audio_buffer)
+        if vad_audio:
+            in_data.audio = vad_audio['audio']
+        else:
+            in_data.audio = b""
+        return in_data
 
     # def reduce_noise(self, data):
     #     return nr.reduce_noise(y=data, sr=self.sample_rate)
-
-    def process(self, in_data: MetaItem) -> MetaItem:
-        source_audio = in_data.source_audio
-        source_audio = np.frombuffer(source_audio, dtype=np.float32)
-        # source_audio = self.reduce_noise(source_audio)
-        send_audio = b""
-        speech_timestamps = get_speech_timestamps(torch.Tensor(source_audio), self.model.silero_vad, sampling_rate=16000)
-
-        if speech_timestamps:
-            send_audio = collect_chunks_improved(speech_timestamps, torch.Tensor(source_audio))
-            send_audio = send_audio.numpy()
-        in_data.audio = send_audio
-        # send_audio = self.reduce_noise(send_audio).tobytes()
-        in_data.source_audio = b""
-        return in_data
+
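Two details of the new VadPipe are worth noting: the trailing commas at class scope bind prob_threshold, silence_s and cache_s to one-element tuples rather than floats, and np.frombuffer without an explicit dtype decodes as float64 while the rest of the pipeline produces float32 buffers. A minimal sketch of the intended per-chunk flow, written with scalar parameters and an explicit dtype (mirroring VadProcessor.process_audio in vadprocessor.py):

import numpy as np
from transcribe.helpers.vadprocessor import VadV2

vac = VadV2(0.5, 16000, 500, 250, max_speech_duration_s=15)

def run_vad(source_audio: bytes) -> np.ndarray:
    audio = np.frombuffer(source_audio, dtype=np.float32)  # dtype must match the producer
    voiced = np.array([], dtype=np.float32)
    for i in range(0, len(audio), 512):  # the iterator consumes 512-sample windows
        ret = vac(audio[i:i + 512])
        if ret:
            voiced = np.append(voiced, ret['audio'])
    return voiced
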
transcribe/pipelines/pipe_whisper.py CHANGED
@@ -7,16 +7,18 @@ class WhisperPipe(BasePipe):
     whisper = None
 
+
     @classmethod
     def init(cls):
         if cls.whisper is None:
+            # cls.zh_whisper = WhisperCPP(source_lange='zh')
             cls.whisper = WhisperCPP()
-
+
 
     def process(self, in_data: MetaItem) -> MetaItem:
         audio_data = in_data.audio
         source_language = in_data.source_language
-        segments = self.whisper.transcribe(audio_data, source_language) or []
+        segments = self.whisper.transcribe(audio_data, source_language)
         texts = "".join([s.text for s in segments])
         in_data.segments = [Segment(t0=s.t0, t1=s.t1, text=self.filter_chinese_printable(s.text)) for s in segments]
         in_data.transcribe_content = texts
@@ -30,3 +32,11 @@ class WhisperPipe(BasePipe):
             if unicodedata.category(char) != 'Cc':  # non-printable (control) characters have category 'Cc'
                 printable.append(char)
         return ''.join(printable).strip()
+
+
+
+class WhisperChinese(WhisperPipe):
+    @classmethod
+    def init(cls):
+        if cls.whisper is None:
+            cls.whisper = WhisperCPP(source_lange='zh')
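
WhisperChinese reuses the inherited whisper class attribute, which is safe here only because each pipe runs in its own process (see translatepipes.py below). A sketch of the pitfall if both init calls ever ran in one process — illustrative, not something this commit does:

from transcribe.pipelines import WhisperPipe, WhisperChinese

WhisperPipe.init()     # loads the English model and sets WhisperPipe.whisper
WhisperChinese.init()  # cls.whisper resolves to the inherited, non-None value,
                       # so the Chinese model would never load in the same process

Also note that process now calls transcribe without the old `or []` fallback; if transcribe can still return None on error, the subsequent join would raise a TypeError.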
transcribe/strategy.py CHANGED
@@ -111,7 +111,7 @@ class TranscriptChunk:
             return 0
 
         score = self._calculate_similarity(self.join(), chunk.join())
-        logger.debug(f"Compare: {self.join()} vs {chunk.join()} : {score}")
+        # logger.debug(f"Compare: {self.join()} vs {chunk.join()} : {score}")
        return score
 
     def only_punctuation(self)->bool:
transcribe/translatepipes.py CHANGED
@@ -1,4 +1,4 @@
-from transcribe.pipelines import WhisperPipe, TranslatePipe, MetaItem, VadPipe, Translate7BPipe
+from transcribe.pipelines import WhisperPipe, TranslatePipe, MetaItem, WhisperChinese, Translate7BPipe
 import multiprocessing as mp
 import config
 
@@ -11,14 +11,18 @@ class TranslatePipes:
         # self.result_queue = mp.Queue()
 
         # whisper transcription
-        self._whisper_pipe = self._launch_process(WhisperPipe())
+        self._whisper_pipe_en = self._launch_process(WhisperPipe())
+        self._whisper_pipe_zh = self._launch_process(WhisperChinese())
 
         # llm translation
-        self._translate_pipe = self._launch_process(TranslatePipe())
+        # self._translate_pipe = self._launch_process(TranslatePipe())
 
         self._translate_7b_pipe = self._launch_process(Translate7BPipe())
         # vad
-        self._vad_pipe = self._launch_process(VadPipe())
+        # self._vad_pipe = self._launch_process(VadPipe())
+
+    # def reset(self):
+    #     self._vad_pipe.reset()
 
     def _launch_process(self, process_obj):
         process_obj.daemon = True
@@ -26,9 +30,10 @@ class TranslatePipes:
         return process_obj
 
     def wait_ready(self):
-        self._whisper_pipe.wait()
-        self._translate_pipe.wait()
-        self._vad_pipe.wait()
+        self._whisper_pipe_zh.wait()
+        self._whisper_pipe_en.wait()
+        # self._translate_pipe.wait()
+        # self._vad_pipe.wait()
         self._translate_7b_pipe.wait()
 
     def translate(self, text, src_lang, dst_lang) -> MetaItem:
@@ -45,14 +50,20 @@ class TranslatePipes:
             transcribe_content=text,
             source_language=src_lang,
             destination_language=dst_lang)
-        self._translate_pipe.input_queue.put(item)
-        return self._translate_pipe.output_queue.get()
+        self._translate_7b_pipe.input_queue.put(item)
+        return self._translate_7b_pipe.output_queue.get()
+
+    def get_whisper_model(self, lang:str='en'):
+        if lang == 'zh':
+            return self._whisper_pipe_zh
+        return self._whisper_pipe_en
 
 
     def transcrible(self, audio_buffer:bytes, src_lang: str) -> MetaItem:
+        whisper_model = self.get_whisper_model(src_lang)
         item = MetaItem(audio=audio_buffer, source_language=src_lang)
-        self._whisper_pipe.input_queue.put(item)
-        return self._whisper_pipe.output_queue.get()
+        whisper_model.input_queue.put(item)
+        return whisper_model.output_queue.get()
 
     def voice_detect(self, audio_buffer:bytes) -> MetaItem:
         item = MetaItem(source_audio=audio_buffer)
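
End to end, the new routing looks like this — a sketch assuming TranslatePipes() constructs with defaults and wait_ready() has returned:

import numpy as np
from transcribe.translatepipes import TranslatePipes

pipes = TranslatePipes()
pipes.wait_ready()

audio = np.zeros(16000, dtype=np.float32).tobytes()  # 1 s of silence as a stand-in
en = pipes.transcrible(audio, 'en')  # served by the WhisperPipe process (medium model)
zh = pipes.transcrible(audio, 'zh')  # served by the WhisperChinese process (large-v3 turbo)
out = pipes.translate(zh.transcribe_content, 'zh', 'en')  # translate() now goes through the 7B pipe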
transcribe/whisper_llm_serve.py CHANGED
@@ -8,44 +8,48 @@ from typing import List, Optional, Iterator, Tuple, Any
8
  import asyncio
9
  import numpy as np
10
  import config
11
- # import wordninja
12
  from api_model import TransResult, Message, DebugResult
13
- from .server import ServeClientBase
14
  from .utils import log_block, save_to_wave, TestDataWriter
15
  from .translatepipes import TranslatePipes
16
  from .strategy import (
17
  TranscriptStabilityAnalyzer, TranscriptToken)
18
- import csv
 
19
 
20
  logger = getLogger("TranscriptionService")
21
 
22
 
23
- class WhisperTranscriptionService(ServeClientBase):
24
  """
25
  Whisper语音转录服务类,处理音频流转录和翻译
26
  """
27
 
 
 
 
28
  def __init__(self, websocket, pipe: TranslatePipes, language=None, dst_lang=None, client_uid=None):
29
- super().__init__(client_uid, websocket)
30
  self.source_language = language # 源语言
31
  self.target_language = dst_lang # 目标翻译语言
32
-
33
  # 转录结果稳定性管理
34
-
35
  self._translate_pipe = pipe
36
 
37
  # 音频处理相关
38
  self.sample_rate = 16000
39
- self.frames_np = None
40
  self.lock = threading.Lock()
41
  self._frame_queue = queue.Queue()
42
-
43
 
44
  # 文本分隔符,根据语言设置
45
  self.text_separator = self._get_text_separator(language)
46
  self.loop = asyncio.get_event_loop()
47
  # 发送就绪状态
48
- self.send_ready_state()
49
  self._transcrible_analysis = None
50
  # 启动处理线程
51
  self._translate_thread_stop = threading.Event()
@@ -53,7 +57,8 @@ class WhisperTranscriptionService(ServeClientBase):
53
 
54
  self.translate_thread = self._start_thread(self._transcription_processing_loop)
55
  self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
56
-
 
57
  # for test
58
  self._transcrible_time_cost = 0.
59
  self._translate_time_cost = 0.
@@ -82,9 +87,9 @@ class WhisperTranscriptionService(ServeClientBase):
82
  """根据语言返回适当的文本分隔符"""
83
  return "" if language == "zh" else " "
84
 
85
- def send_ready_state(self) -> None:
86
  """发送服务就绪状态消息"""
87
- self.websocket.send(json.dumps({
88
  "uid": self.client_uid,
89
  "message": self.SERVER_READY,
90
  "backend": "whisper_transcription"
@@ -94,10 +99,10 @@ class WhisperTranscriptionService(ServeClientBase):
94
  """设置源语言和目标语言"""
95
  self.source_language = source_lang
96
  self.target_language = target_lang
97
- self.text_separator = self._get_text_separator(source_lang)
98
- self._transcrible_analysis = TranscriptStabilityAnalyzer(self.source_language, self.text_separator)
99
 
100
- def add_audio_frames(self, frame_np: np.ndarray) -> None:
101
  """添加音频帧到处理队列"""
102
  self._frame_queue.put(frame_np)
103
 
@@ -105,68 +110,21 @@ class WhisperTranscriptionService(ServeClientBase):
105
  """从队列获取音频帧并合并到缓冲区"""
106
  while not self._frame_processing_thread_stop.is_set():
107
  try:
108
- frame_np = self._frame_queue.get(timeout=0.1)
109
- if frame_np is None:
110
- logger.error("Received None frame, stopping thread")
111
- with self.lock:
112
- if self.frames_np is None:
113
- self.frames_np = frame_np.copy()
114
- else:
115
- self.frames_np = np.append(self.frames_np, frame_np)
116
  except queue.Empty:
117
  pass
118
 
119
- def _apply_voice_activity_detection(self) -> None:
120
- """应用语音活动检测来优化音频缓冲区"""
121
- with self.lock:
122
- if self.frames_np is not None:
123
- # self._c+= 1
124
- frame = self.frames_np.copy()
125
- processed_audio = self._translate_pipe.voice_detect(frame.tobytes())
126
- self.frames_np = np.frombuffer(processed_audio.audio, dtype=np.float32).copy()
127
- return self.frames_np.copy()
128
- # if len(frame) > self.sample_rate:
129
- # save_to_wave(f"{self._c}-org.wav", frame)
130
- # save_to_wave(f"{self._c}-vad.wav", self.frames_np)
131
-
132
- def _update_audio_buffer(self, offset: int) -> None:
133
- """从音频缓冲区中移除已处理的部分"""
134
- with self.lock:
135
- if self.frames_np is not None and offset > 0:
136
- # self._c += 1
137
- # before = self.frames_np.copy()
138
- self.frames_np = self.frames_np[offset:]
139
- # after = self.frames_np.copy()
140
- # save_to_wave(f"./tests/{self._c}_before_cut_{offset}.wav", before)
141
- # save_to_wave(f"./tests/{self._c}_cut.wav", before[:offset])
142
- # save_to_wave(f"./tests/{self._c}_after_cut.wav", after)
143
-
144
-
145
- def _get_audio_for_processing(self) -> Optional[np.ndarray]:
146
- """准备用于处理的音频块"""
147
- # apply VAD processing
148
- frame_np = self._apply_voice_activity_detection()
149
- # frame_np = self.frames_np.copy()
150
- # no audio frames
151
- if frame_np is None:
152
- return None
153
-
154
- frames = frame_np.copy()
155
-
156
- # handle overly short audio
157
- if len(frames) <= 10:
158
- # extremely short segment: clear it and return None
159
- # self._update_audio_buffer(len(frames))
160
- return None
161
- if len(frames) < self.sample_rate:
162
- # audio shorter than one second: pad with silence
163
- silence_audio = np.zeros((self.sample_rate + 1000,), dtype=np.float32)
164
- silence_audio[-len(frames):] = frames
165
- return silence_audio.copy()
166
-
167
- return frames
168
-
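
The removed _get_audio_for_processing above padded sub-second audio with leading silence so Whisper always received at least one second of input. A standalone sketch of that left-padding trick (the constants mirror the code above; the function name is illustrative, not from the project):

import numpy as np

SAMPLE_RATE = 16000  # 16 kHz mono, as in the service

def pad_to_min_length(frames: np.ndarray) -> np.ndarray:
    """Left-pad sub-second audio with silence so the model gets >= 1 s."""
    if len(frames) >= SAMPLE_RATE:
        return frames
    padded = np.zeros((SAMPLE_RATE + 1000,), dtype=np.float32)
    padded[-len(frames):] = frames  # keep the speech at the end of the window
    return padded

# e.g. 0.5 s of audio becomes a 1.0625 s buffer with leading silence
print(pad_to_min_length(np.ones(8000, dtype=np.float32)).shape)  # (17000,)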
169
- def _transcribe_audio(self, audio_buffer: np.ndarray) -> List[TranscriptToken]:
170
  """转录音频并返回转录片段"""
171
  log_block("Audio buffer length", f"{audio_buffer.shape[0]/self.sample_rate:.2f}", "s")
172
  start_time = time.perf_counter()
@@ -175,14 +133,11 @@ class WhisperTranscriptionService(ServeClientBase):
175
  segments = result.segments
176
  time_diff = (time.perf_counter() - start_time)
177
  logger.debug(f"📝 Transcrible Segments: {segments} ")
178
- logger.debug(f"📝 Transcrible: {self.text_separator.join(seg.text for seg in segments)} ")
179
  log_block("📝 Transcrible output", f"{self.text_separator.join(seg.text for seg in segments)}", "")
180
  log_block("📝 Transcrible time", f"{time_diff:.3f}", "s")
181
  self._transcrible_time_cost = round(time_diff, 3)
182
- return [
183
- TranscriptToken(text=s.text, t0=s.t0, t1=s.t1)
184
- for s in segments
185
- ]
186
 
187
  def _translate_text(self, text: str) -> str:
188
  """将文本翻译为目标语言"""
@@ -216,40 +171,44 @@ class WhisperTranscriptionService(ServeClientBase):
216
  self._translate_time_cost = round(time_diff, 3)
217
  return translated_text
218
 
219
-
220
-
221
  def _transcription_processing_loop(self) -> None:
222
  """主转录处理循环"""
223
- c = 0
224
- while not self._translate_thread_stop.is_set():
225
- if self.exit:
226
- logger.info("Exiting transcription thread")
227
- break
228
 
229
- # wait for audio data
230
- if self.frames_np is None:
231
- time.sleep(0.2)
232
- logger.info("Waiting for audio data...")
233
- continue
234
-
235
- # fetch an audio chunk to process
236
- audio_buffer = self._get_audio_for_processing()
237
- if audio_buffer is None:
238
  time.sleep(0.2)
239
  continue
240
- logger.debug(f"🥤 Buffer Length: {len(audio_buffer)/self.sample_rate:.2f} ")
241
-
242
-
243
  # try:
244
- segments = self._transcribe_audio(audio_buffer)
245
-
246
- # process transcription results and send them to the client
247
- for result in self._process_transcription_results(segments, audio_buffer):
 
248
  self._send_result_to_client(result)
249
 
250
  # except Exception as e:
251
  # logger.error(f"Error processing audio: {e}")
252

253
  def _process_transcription_results(self, segments: List[TranscriptToken], audio_buffer: np.ndarray) -> Iterator[TransResult]:
254
  """
255
  Process transcription results and generate translation results
 
8
  import asyncio
9
  import numpy as np
10
  import config
11
+
12
  from api_model import TransResult, Message, DebugResult
13
+
14
  from .utils import log_block, save_to_wave, TestDataWriter
15
  from .translatepipes import TranslatePipes
16
  from .strategy import (
17
  TranscriptStabilityAnalyzer, TranscriptToken)
18
+ from transcribe.helpers.vadprocessor import VadProcessor
19
+ from transcribe.pipelines import MetaItem
20
 
21
  logger = getLogger("TranscriptionService")
22
 
23
 
24
+ class WhisperTranscriptionService:
25
  """
26
  Whisper speech transcription service class: handles audio-stream transcription and translation
27
  """
28
 
29
+ SERVER_READY = "SERVER_READY"
30
+ DISCONNECT = "DISCONNECT"
31
+
32
  def __init__(self, websocket, pipe: TranslatePipes, language=None, dst_lang=None, client_uid=None):
33
+
34
  self.source_language = language # source language
35
  self.target_language = dst_lang # target translation language
36
+ self.client_uid = client_uid
37
  # transcription result stability management
38
+ self.websocket = websocket
39
  self._translate_pipe = pipe
40
 
41
  # audio processing
42
  self.sample_rate = 16000
43
+
44
  self.lock = threading.Lock()
45
  self._frame_queue = queue.Queue()
46
+ self._vad_frame_queue = queue.Queue()
47
 
48
  # text separator, set according to language
49
  self.text_separator = self._get_text_separator(language)
50
  self.loop = asyncio.get_event_loop()
51
  # send ready state
52
+
53
  self._transcrible_analysis = None
54
  # start processing threads
55
  self._translate_thread_stop = threading.Event()
 
57
 
58
  self.translate_thread = self._start_thread(self._transcription_processing_loop)
59
  self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
60
+ self._vad = VadProcessor()
61
+ self.row_number = 0
62
  # for test
63
  self._transcrible_time_cost = 0.
64
  self._translate_time_cost = 0.
 
87
  """根据语言返回适当的文本分隔符"""
88
  return "" if language == "zh" else " "
89
 
90
+ async def send_ready_state(self) -> None:
91
  """发送服务就绪状态消息"""
92
+ await self.websocket.send(json.dumps({
93
  "uid": self.client_uid,
94
  "message": self.SERVER_READY,
95
  "backend": "whisper_transcription"
 
99
  """设置源语言和目标语言"""
100
  self.source_language = source_lang
101
  self.target_language = target_lang
102
+ # self.text_separator = self._get_text_separator(source_lang)
103
+ # self._transcrible_analysis = TranscriptStabilityAnalyzer(self.source_language, self.text_separator)
104
 
105
+ def add_frames(self, frame_np: np.ndarray) -> None:
106
  """添加音频帧到处理队列"""
107
  self._frame_queue.put(frame_np)
108

109
  def _frame_processing_loop(self) -> None:
110
  """从队列获取音频帧并合并到缓冲区"""
111
  while not self._frame_processing_thread_stop.is_set():
112
  try:
113
+ audio = self._frame_queue.get(timeout=0.1)
114
+ # save_to_wave(f"{self._c}_before_vad.wav", audio)
115
+ processed_audio = self._vad.process_audio(audio)
116
+ if processed_audio.shape[0] > 0:
117
+ # vad_processed_audio = processed_audio
118
+ # save_to_wave(f"{self._c}_after_vad.wav", processed_audio)
119
+ # vad_frame_obj = np.frombuffer(processed_audio.audio, dtype=np.float32)
120
+ logger.debug(f"Vad frame: {processed_audio.shape[0]/self.sample_rate:.2f}")
121
+ # apply vad speech check:
122
+ self._vad_frame_queue.put(processed_audio)
123
  except queue.Empty:
124
  pass
125
 
126
+
127
127
+ def _transcribe_audio(self, audio_buffer: np.ndarray)->MetaItem:
128
  """转录音频并返回转录片段"""
129
  log_block("Audio buffer length", f"{audio_buffer.shape[0]/self.sample_rate:.2f}", "s")
130
  start_time = time.perf_counter()
 
133
  segments = result.segments
134
  time_diff = (time.perf_counter() - start_time)
135
  logger.debug(f"📝 Transcrible Segments: {segments} ")
136
+ # logger.debug(f"📝 Transcrible: {self.text_separator.join(seg.text for seg in segments)} ")
137
  log_block("📝 Transcrible output", f"{self.text_separator.join(seg.text for seg in segments)}", "")
138
  log_block("📝 Transcrible time", f"{time_diff:.3f}", "s")
139
  self._transcrible_time_cost = round(time_diff, 3)
140
+ return result
 
 
 
141
 
142
  def _translate_text(self, text: str) -> str:
143
  """将文本翻译为目标语言"""
 
171
  self._translate_time_cost = round(time_diff, 3)
172
  return translated_text
173
 
 
 
174
  def _transcription_processing_loop(self) -> None:
175
  """主转录处理循环"""
 
 
 
 
 
176
 
177
+ while not self._translate_thread_stop.is_set():
178
+ audio_buffer = self._vad_frame_queue.get()
179
+ if audio_buffer is None or len(audio_buffer) < int(self.sample_rate):
 
 
 
 
 
 
180
  time.sleep(0.2)
181
  continue
182
+
183
+ logger.debug(f"audio buffer size: {len(audio_buffer) / self.sample_rate:.2f}s")
 
184
  # try:
185
+ meta_item = self._transcribe_audio(audio_buffer)
186
+ segments = meta_item.segments
187
+ logger.debug(f"Segments: {segments}")
188
+ if len(segments):
189
+ result = self._process_transcription_results_2(segments)
190
  self._send_result_to_client(result)
191
+ time.sleep(0.1)
192
+ # process transcription results and send them to the client
193
+ # for result in self._process_transcription_results(segments, audio_buffer):
194
+ # self._send_result_to_client(result)
195
 
196
  # except Exception as e:
197
  # logger.error(f"Error processing audio: {e}")
198
 
199
+ def _process_transcription_results_2(self, segments: List[TranscriptToken]):
200
+ seg = segments[0]
201
+ item = TransResult(
202
+ seg_id=self.row_number,
203
+ context=seg.text,
204
+ from_=self.source_language,
205
+ to=self.target_language,
206
+ tran_content=self._translate_text_large(seg.text),
207
+ partial=False
208
+ )
209
+ self.row_number += 1
210
+ return item
211
+
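
Note that _process_transcription_results_2 above translates only segments[0], so any extra segments Whisper returns for one VAD chunk are dropped. If that ever matters, a variant that folds all segments into one row could look like this (a sketch only, reusing the fields and helpers from the diff):

def _process_transcription_results_2(self, segments):
    # Sketch: join every segment's text so multi-segment chunks are not
    # truncated; otherwise identical to the version in the diff.
    text = self.text_separator.join(seg.text for seg in segments)
    item = TransResult(
        seg_id=self.row_number,
        context=text,
        from_=self.source_language,
        to=self.target_language,
        tran_content=self._translate_text_large(text),
        partial=False,
    )
    self.row_number += 1
    return item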
212
  def _process_transcription_results(self, segments: List[TranscriptToken], audio_buffer: np.ndarray) -> Iterator[TransResult]:
213
  """
214
  Process transcription results and generate translation results