Yassine committed on
Commit
1602deb
·
1 Parent(s): 7b036e8

Add audio transcription feature using Whisper model and update Dockerfile

Browse files
Files changed (3) hide show
  1. Dockerfile +1 -0
  2. main.py +33 -2
  3. requirements.txt +2 -0
Dockerfile CHANGED
@@ -1,5 +1,6 @@
1
  FROM python:3.9
2
 
 
3
  RUN useradd -m -u 1000 user
4
  USER user
5
  ENV PATH="/home/user/.local/bin:$PATH"
 
1
  FROM python:3.9
2
 
3
+ RUN apt-get update && apt-get install -y ffmpeg && rm -rf /var/lib/apt/lists/*
4
  RUN useradd -m -u 1000 user
5
  USER user
6
  ENV PATH="/home/user/.local/bin:$PATH"
main.py CHANGED
@@ -1,11 +1,12 @@
1
- from fastapi import FastAPI, Body
2
  import torch
3
  import spacy
4
  import os
5
  from pathlib import Path
6
  from fastapi.middleware.cors import CORSMiddleware
7
- from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification
8
  from pydantic import BaseModel
 
9
 
10
  # Define input model
11
 
@@ -99,6 +100,26 @@ nlp = spacy.load('fr_core_news_lg')
99
 
100
  # Set device (CPU or GPU)
101
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  ner_model = ner_model.to(device)
103
  type_model = type_model.to(device)
104
 
@@ -200,3 +221,13 @@ async def analyze_text(input_data: TextInput):
200
  "confidence": confidence,
201
  "entities": filtered_entities
202
  }
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, Body, UploadFile, File
2
  import torch
3
  import spacy
4
  import os
5
  from pathlib import Path
6
  from fastapi.middleware.cors import CORSMiddleware
7
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification, AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
8
  from pydantic import BaseModel
9
+ import tempfile
10
 
11
  # Define input model
12
 
 
100
 
101
  # Set device (CPU or GPU)
102
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
103
# Pick the compute dtype once: half precision on CUDA for speed and memory,
# full float32 on CPU where fp16 kernels are slow or unsupported.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load Whisper model and processor
model_id = "openai/whisper-large-v3-turbo"
whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
whisper_model.to(device)

whisper_processor = AutoProcessor.from_pretrained(model_id)

# ASR pipeline wired to the explicitly loaded model/tokenizer/feature-extractor
# so it shares the same device and dtype chosen above.
whisper_pipe = pipeline(
    "automatic-speech-recognition",
    model=whisper_model,
    tokenizer=whisper_processor.tokenizer,
    feature_extractor=whisper_processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)
123
  ner_model = ner_model.to(device)
124
  type_model = type_model.to(device)
125
 
 
221
  "confidence": confidence,
222
  "entities": filtered_entities
223
  }
224
+
225
+
226
@app.post("/transcribe/")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file with the Whisper pipeline.

    The upload is spooled to a temporary file on disk (the ASR pipeline /
    ffmpeg needs a real filesystem path), transcribed, and the temporary
    file is always removed afterwards.

    Returns:
        dict: {"transcription": <recognized text>}
    """
    # delete=False keeps the path valid after the `with` block closes the
    # handle (required on Windows, where an open NamedTemporaryFile cannot
    # be reopened by another process such as ffmpeg).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name

    try:
        result = whisper_pipe(tmp_path)
    finally:
        # Fix: the original never deleted the temp file, leaking one file
        # per request. `os` is already imported at the top of main.py.
        os.unlink(tmp_path)

    return {"transcription": result["text"]}
requirements.txt CHANGED
@@ -6,3 +6,5 @@ pydantic==2.9.2
6
  safetensors==0.4.5
7
  spacy==3.7.2
8
  fr-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.7.0/fr_core_news_lg-3.7.0-py3-none-any.whl
 
 
 
6
  safetensors==0.4.5
7
  spacy==3.7.2
8
  fr-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_lg-3.7.0/fr_core_news_lg-3.7.0-py3-none-any.whl
9
+ torchaudio
10
+ datasets[audio]