CrazyMonkey0 committed on
Commit
c84acef
·
1 Parent(s): 20a7446

feat(asr): replace Whisper HF with faster-whisper for CPU-friendly transcription

Browse files
Files changed (3) hide show
  1. app/routes/asr.py +24 -34
  2. app/routes/tts.py +4 -1
  3. requirements.txt +19 -0
app/routes/asr.py CHANGED
@@ -1,40 +1,30 @@
1
- from transformers import WhisperForConditionalGeneration, WhisperProcessor
2
- from fastapi import APIRouter, Request, UploadFile, File
3
- import librosa
4
  import io
5
- import soundfile as sf
6
- import os
7
 
8
  router = APIRouter()
9
 
10
- def load_model_asr():
11
- processor = WhisperProcessor.from_pretrained("openai/whisper-small.en")
12
- model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small.en")
13
- return processor, model
14
 
15
- @router.post("/asr")
16
- async def asr(request: Request, audio: UploadFile = File(...)):
17
- # Get the loaded ASR model and processor
18
- processor, model = request.app.state.processor_asr, request.app.state.model_asr
19
- # Audio file path
20
- audio_bytes = await audio.read()
21
- buffer = io.BytesIO(audio_bytes)
22
-
23
- # Loading audio file
24
- audio_data, sampling_rate = sf.read(buffer, dtype="float32")
25
- if sampling_rate != 16000:
26
- audio_data = librosa.resample(audio_data, orig_sr=sampling_rate, target_sr=16000)
27
- sampling_rate = 16000
28
-
29
- # Preparing input data
30
- inputs = processor(audio_data, return_tensors="pt", sampling_rate=sampling_rate)
31
- input_features = inputs["input_features"]
32
-
33
- # Generating token IDs
34
- output = model.generate(input_features)
35
 
36
- # Decoding tokens into text
37
- transcription = processor.batch_decode(output, skip_special_tokens=True)
38
-
39
-
40
- return {"transcription": transcription[0]}
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import io

from fastapi import APIRouter, UploadFile, File, HTTPException
from faster_whisper import WhisperModel

router = APIRouter()

# Load the model once at import time; "tiny" + int8 keeps the CPU/RAM
# footprint small for CPU-only inference.
model = WhisperModel("tiny", device="cpu", compute_type="int8")

# Global lock: serialize requests so concurrent transcriptions don't
# exhaust RAM.
asr_lock = asyncio.Lock()


def _transcribe(buffer: io.BytesIO) -> str:
    """Run the blocking faster-whisper transcription and join segment texts.

    ``model.transcribe()`` returns a *lazy* generator — the actual decoding
    work happens while iterating it — so the whole consumption must stay
    inside this helper, which is executed in a worker thread.
    Segment texts carry a leading space, so each is stripped before joining
    to avoid doubled whitespace in the result.
    """
    segments, _info = model.transcribe(buffer, beam_size=1, vad_filter=True)
    return " ".join(segment.text.strip() for segment in segments)


@router.post("/asr")
async def asr(audio: UploadFile = File(...)):
    """Transcribe an uploaded audio file.

    Returns ``{"transcription": <text>}`` on success.
    Raises ``HTTPException(500)`` when reading or transcription fails.
    """
    async with asr_lock:
        try:
            # Read the upload into an in-memory buffer; faster-whisper
            # accepts a path or a file-like object.
            audio_bytes = await audio.read()
            buffer = io.BytesIO(audio_bytes)

            # Bug fix: run the CPU-bound transcription in a worker thread so
            # the event loop is not blocked for the duration of decoding.
            transcription = await asyncio.to_thread(_transcribe, buffer)
            return {"transcription": transcription}
        except Exception as e:
            raise HTTPException(status_code=500, detail=str(e)) from e
app/routes/tts.py CHANGED
@@ -6,7 +6,10 @@ import numpy as np
6
  import scipy.io.wavfile as wavfile
7
  import torch
8
  import io
 
 
9
 
 
10
 
11
  # load TTS model
12
  def load_model_tts():
@@ -36,7 +39,7 @@ def send_audio(request: Request, text: str,) -> bytes:
36
  buffer.seek(0)
37
 
38
  file_name = f"{uuid.uuid4()}.wav"
39
- upload_url = "http://kowalskidev.pl/ai_tutor/upload/"
40
 
41
  try:
42
  files = {"file": (file_name, buffer, "audio/wav")}
 
6
  import scipy.io.wavfile as wavfile
7
  import torch
8
  import io
9
+ import os
10
+ from dotenv import load_dotenv
11
 
12
+ load_dotenv()
13
 
14
  # load TTS model
15
  def load_model_tts():
 
39
  buffer.seek(0)
40
 
41
  file_name = f"{uuid.uuid4()}.wav"
42
+ upload_url = os.getenv("UPLOAD_URL")
43
 
44
  try:
45
  files = {"file": (file_name, buffer, "audio/wav")}
requirements.txt CHANGED
@@ -10,6 +10,7 @@ asttokens==3.0.0
10
  astunparse==1.6.3
11
  attrs==25.3.0
12
  audioread==3.0.1
 
13
  babel==2.17.0
14
  blis==1.2.0
15
  catalogue==2.0.10
@@ -19,8 +20,10 @@ charset-normalizer==3.4.1
19
  click==8.1.8
20
  cloudpathlib==0.21.0
21
  colorama==0.4.6
 
22
  confection==0.1.5
23
  csvw==3.5.1
 
24
  curated-tokenizers==0.0.9
25
  curated-transformers==0.1.1
26
  cymem==2.0.11
@@ -36,6 +39,7 @@ email_validator==2.2.0
36
  executing==2.2.0
37
  fastapi==0.115.11
38
  fastapi-cli==0.0.7
 
39
  filelock==3.18.0
40
  flatbuffers==25.2.10
41
  frozenlist==1.5.0
@@ -52,6 +56,7 @@ httpcore==1.0.7
52
  httptools==0.6.4
53
  httpx==0.28.1
54
  huggingface-hub==0.36.0
 
55
  idna==3.10
56
  inflect==7.5.0
57
  ipython==9.0.2
@@ -92,6 +97,20 @@ nltk==3.9.1
92
  num2words==0.5.14
93
  numba==0.61.0
94
  numpy==1.26.4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  opt_einsum==3.4.0
96
  optree==0.14.1
97
  orjson==3.10.15
 
10
  astunparse==1.6.3
11
  attrs==25.3.0
12
  audioread==3.0.1
13
+ av==16.0.1
14
  babel==2.17.0
15
  blis==1.2.0
16
  catalogue==2.0.10
 
20
  click==8.1.8
21
  cloudpathlib==0.21.0
22
  colorama==0.4.6
23
+ coloredlogs==15.0.1
24
  confection==0.1.5
25
  csvw==3.5.1
26
+ ctranslate2==4.6.2
27
  curated-tokenizers==0.0.9
28
  curated-transformers==0.1.1
29
  cymem==2.0.11
 
39
  executing==2.2.0
40
  fastapi==0.115.11
41
  fastapi-cli==0.0.7
42
+ faster-whisper==1.2.1
43
  filelock==3.18.0
44
  flatbuffers==25.2.10
45
  frozenlist==1.5.0
 
56
  httptools==0.6.4
57
  httpx==0.28.1
58
  huggingface-hub==0.36.0
59
+ humanfriendly==10.0
60
  idna==3.10
61
  inflect==7.5.0
62
  ipython==9.0.2
 
97
  num2words==0.5.14
98
  numba==0.61.0
99
  numpy==1.26.4
100
+ nvidia-cublas-cu12==12.4.5.8
101
+ nvidia-cuda-cupti-cu12==12.4.127
102
+ nvidia-cuda-nvrtc-cu12==12.4.127
103
+ nvidia-cuda-runtime-cu12==12.4.127
104
+ nvidia-cudnn-cu12==9.1.0.70
105
+ nvidia-cufft-cu12==11.2.1.3
106
+ nvidia-curand-cu12==10.3.5.147
107
+ nvidia-cusolver-cu12==11.6.1.9
108
+ nvidia-cusparse-cu12==12.3.1.170
109
+ nvidia-cusparselt-cu12==0.6.2
110
+ nvidia-nccl-cu12==2.21.5
111
+ nvidia-nvjitlink-cu12==12.4.127
112
+ nvidia-nvtx-cu12==12.4.127
113
+ onnxruntime==1.23.2
114
  opt_einsum==3.4.0
115
  optree==0.14.1
116
  orjson==3.10.15