Spaces:
Running
Running
Update asr-tts_service.py
Browse files — asr-tts_service.py (+41 −22)
asr-tts_service.py
CHANGED
|
@@ -7,9 +7,12 @@ import numpy as np
|
|
| 7 |
import soundfile as sf
|
| 8 |
import torch
|
| 9 |
import google.generativeai as genai
|
|
|
|
|
|
|
| 10 |
from flask import Flask, request, jsonify
|
| 11 |
from transformers import pipeline, AutoTokenizer
|
| 12 |
from parler_tts import ParlerTTSForConditionalGeneration
|
|
|
|
| 13 |
from dotenv import load_dotenv
|
| 14 |
|
| 15 |
# Charger les variables d'environnement
|
|
@@ -126,24 +129,15 @@ def smooth_concat(segments, sr, fade_ms=20):
|
|
| 126 |
# --- Logique TTS et Traduction ---
|
| 127 |
|
| 128 |
def generate_tts_optimized(text: str) -> str:
|
| 129 |
-
# Fixer la graine (Seed) pour que le timbre de voix soit constant
|
| 130 |
-
# La seed 98 donne souvent un grain très proche du fichier d'Orange
|
| 131 |
torch.manual_seed(98)
|
| 132 |
-
|
| 133 |
-
# 1. Préparation du texte
|
| 134 |
text = convert_digits_in_text(text)
|
| 135 |
-
# Découpage en morceaux de 100 caractères max pour éviter que le modèle s'essouffle
|
| 136 |
chunks = split_by_sentences(text, max_chars=100)
|
| 137 |
audio_segments = []
|
| 138 |
|
| 139 |
for chunk in chunks:
|
| 140 |
if not chunk.strip(): continue
|
| 141 |
-
|
| 142 |
-
# Ajout d'une ponctuation forcée pour une intonation propre
|
| 143 |
full_chunk = chunk if chunk.endswith(('.', '!', '?')) else chunk + "."
|
| 144 |
-
|
| 145 |
prompt_ids = tts_tokenizer(full_chunk, return_tensors="pt").input_ids.to(device)
|
| 146 |
-
|
| 147 |
with torch.no_grad():
|
| 148 |
audio = tts_model.generate(
|
| 149 |
input_ids=description_id,
|
|
@@ -153,23 +147,19 @@ def generate_tts_optimized(text: str) -> str:
|
|
| 153 |
temperature=1.0,
|
| 154 |
min_new_tokens=20
|
| 155 |
)
|
| 156 |
-
|
| 157 |
audio_np = audio.cpu().numpy().squeeze().astype(np.float32)
|
| 158 |
if audio_np.size > 0:
|
| 159 |
audio_segments.append(audio_np)
|
| 160 |
|
| 161 |
if not audio_segments: return ""
|
| 162 |
-
|
| 163 |
-
# 2. Assemblage avec fondu enchaîné pour éviter les "clics"
|
| 164 |
final_audio = smooth_concat(audio_segments, tts_model.config.sampling_rate)
|
| 165 |
final_audio = normalize_audio(final_audio)
|
| 166 |
|
| 167 |
-
# 3. Encodage en Base64 pour votre interface
|
| 168 |
buffer = io.BytesIO()
|
| 169 |
sf.write(buffer, final_audio, tts_model.config.sampling_rate, format="WAV")
|
| 170 |
buffer.seek(0)
|
| 171 |
return "data:audio/wav;base64," + base64.b64encode(buffer.read()).decode()
|
| 172 |
-
|
| 173 |
def french_to_wolof_with_gemini(text: str) -> str:
|
| 174 |
prompt = f"""
|
| 175 |
Tu es un traducteur expert en wolof travaillant pour la Sen'eau.
|
|
@@ -178,13 +168,12 @@ def french_to_wolof_with_gemini(text: str) -> str:
|
|
| 178 |
Utilise un ton poli, professionnel et garde les termes techniques usuels (compteur, branchement, Assistante virtuelle ).
|
| 179 |
|
| 180 |
Texte : {text}"""
|
| 181 |
-
|
| 182 |
try:
|
| 183 |
response = model_gemini.generate_content(prompt)
|
| 184 |
return response.text.strip()
|
| 185 |
except Exception as e:
|
| 186 |
return "Naka ngeen def ! Man la ADAMA, seen Assistante virtuelle bu Sen'eau. Ma ngi fi ngir dimbali leen ci seeni laaj yépp yu jëm ci wàllu ndoxum naan ci Sénégal."
|
| 187 |
-
|
| 188 |
def wolof_to_french_gemini(text: str) -> str:
|
| 189 |
prompt = f"""
|
| 190 |
Tu es un traducteur expert en français travaillant pour la Sen'eau.
|
|
@@ -214,25 +203,55 @@ def transcribe():
|
|
| 214 |
data = np.asarray(data, dtype=np.float32)
|
| 215 |
if data.ndim > 1: data = data.mean(axis=1)
|
| 216 |
|
| 217 |
-
# ASR Local
|
| 218 |
wolof_text = asr(normalize_audio(data))["text"]
|
| 219 |
-
|
| 220 |
-
# Si l'ASR est quasi vide, on ne sollicite pas Gemini
|
| 221 |
if len(wolof_text.strip()) < 2:
|
| 222 |
return "Bonjour"
|
| 223 |
|
| 224 |
-
# Traduction Gemini
|
| 225 |
french_text = wolof_to_french_gemini(wolof_text)
|
| 226 |
return french_text
|
| 227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
@app.route("/tts", methods=["POST"])
|
| 229 |
def tts():
|
| 230 |
payload = request.get_json()
|
| 231 |
if not payload or "text" not in payload: return jsonify({"error": "Texte manquant"}), 400
|
| 232 |
|
| 233 |
-
# Traduction Gemini
|
| 234 |
wolof_text = french_to_wolof_with_gemini(payload["text"])
|
| 235 |
-
# Audio Local
|
| 236 |
audio_base64 = generate_tts_optimized(wolof_text)
|
| 237 |
|
| 238 |
return jsonify({"wolof_text": wolof_text, "audio": audio_base64})
|
|
|
|
| 7 |
import soundfile as sf
|
| 8 |
import torch
|
| 9 |
import google.generativeai as genai
|
| 10 |
+
import requests # Ajouté pour WhatsApp
|
| 11 |
+
import tempfile # Ajouté pour WhatsApp
|
| 12 |
from flask import Flask, request, jsonify
|
| 13 |
from transformers import pipeline, AutoTokenizer
|
| 14 |
from parler_tts import ParlerTTSForConditionalGeneration
|
| 15 |
+
from pydub import AudioSegment # Ajouté pour convertir le format OGG de WhatsApp
|
| 16 |
from dotenv import load_dotenv
|
| 17 |
|
| 18 |
# Charger les variables d'environnement
|
|
|
|
| 129 |
# --- Logique TTS et Traduction ---
|
| 130 |
|
| 131 |
def generate_tts_optimized(text: str) -> str:
|
|
|
|
|
|
|
| 132 |
torch.manual_seed(98)
|
|
|
|
|
|
|
| 133 |
text = convert_digits_in_text(text)
|
|
|
|
| 134 |
chunks = split_by_sentences(text, max_chars=100)
|
| 135 |
audio_segments = []
|
| 136 |
|
| 137 |
for chunk in chunks:
|
| 138 |
if not chunk.strip(): continue
|
|
|
|
|
|
|
| 139 |
full_chunk = chunk if chunk.endswith(('.', '!', '?')) else chunk + "."
|
|
|
|
| 140 |
prompt_ids = tts_tokenizer(full_chunk, return_tensors="pt").input_ids.to(device)
|
|
|
|
| 141 |
with torch.no_grad():
|
| 142 |
audio = tts_model.generate(
|
| 143 |
input_ids=description_id,
|
|
|
|
| 147 |
temperature=1.0,
|
| 148 |
min_new_tokens=20
|
| 149 |
)
|
|
|
|
| 150 |
audio_np = audio.cpu().numpy().squeeze().astype(np.float32)
|
| 151 |
if audio_np.size > 0:
|
| 152 |
audio_segments.append(audio_np)
|
| 153 |
|
| 154 |
if not audio_segments: return ""
|
|
|
|
|
|
|
| 155 |
final_audio = smooth_concat(audio_segments, tts_model.config.sampling_rate)
|
| 156 |
final_audio = normalize_audio(final_audio)
|
| 157 |
|
|
|
|
| 158 |
buffer = io.BytesIO()
|
| 159 |
sf.write(buffer, final_audio, tts_model.config.sampling_rate, format="WAV")
|
| 160 |
buffer.seek(0)
|
| 161 |
return "data:audio/wav;base64," + base64.b64encode(buffer.read()).decode()
|
| 162 |
+
|
| 163 |
def french_to_wolof_with_gemini(text: str) -> str:
|
| 164 |
prompt = f"""
|
| 165 |
Tu es un traducteur expert en wolof travaillant pour la Sen'eau.
|
|
|
|
| 168 |
Utilise un ton poli, professionnel et garde les termes techniques usuels (compteur, branchement, Assistante virtuelle ).
|
| 169 |
|
| 170 |
Texte : {text}"""
|
|
|
|
| 171 |
try:
|
| 172 |
response = model_gemini.generate_content(prompt)
|
| 173 |
return response.text.strip()
|
| 174 |
except Exception as e:
|
| 175 |
return "Naka ngeen def ! Man la ADAMA, seen Assistante virtuelle bu Sen'eau. Ma ngi fi ngir dimbali leen ci seeni laaj yépp yu jëm ci wàllu ndoxum naan ci Sénégal."
|
| 176 |
+
|
| 177 |
def wolof_to_french_gemini(text: str) -> str:
|
| 178 |
prompt = f"""
|
| 179 |
Tu es un traducteur expert en français travaillant pour la Sen'eau.
|
|
|
|
| 203 |
data = np.asarray(data, dtype=np.float32)
|
| 204 |
if data.ndim > 1: data = data.mean(axis=1)
|
| 205 |
|
|
|
|
| 206 |
wolof_text = asr(normalize_audio(data))["text"]
|
|
|
|
|
|
|
| 207 |
if len(wolof_text.strip()) < 2:
|
| 208 |
return "Bonjour"
|
| 209 |
|
|
|
|
| 210 |
french_text = wolof_to_french_gemini(wolof_text)
|
| 211 |
return french_text
|
| 212 |
|
| 213 |
+
# --- NEW ROUTE: WhatsApp-specific (audio fetched from a URL) ---
@app.route("/transcribe_from_url", methods=["POST"])
def transcribe_from_url():
    """Transcribe a WhatsApp voice note referenced by URL.

    Expects a JSON payload ``{"url": "<media url>"}``. Downloads the OGG
    audio, converts it to 16 kHz mono WAV (the format Wav2Vec2 expects),
    runs the local ASR, and returns the French translation of the
    recognized Wolof text. Falls back to "Bonjour" on any failure so the
    bot always has something to say.
    """
    payload = request.get_json()
    audio_url = payload.get('url') if payload else None
    if not audio_url:
        return "Bonjour", 400

    tmp_path = None
    try:
        # 1. Download the WhatsApp audio (.ogg format).
        # Timeout + status check: a dead or slow media URL must not hang
        # the worker, and an HTML error page must not be fed to pydub.
        resp = requests.get(audio_url, timeout=30)
        resp.raise_for_status()
        with tempfile.NamedTemporaryFile(suffix=".ogg", delete=False) as tmp:
            tmp.write(resp.content)
            tmp_path = tmp.name

        # 2. Convert OGG -> WAV (16 kHz mono for Wav2Vec2).
        audio = AudioSegment.from_file(tmp_path)
        wav_io = io.BytesIO()
        audio.set_frame_rate(16000).set_channels(1).export(wav_io, format="wav")
        wav_io.seek(0)

        # 3. Load into numpy (downmix to mono if needed).
        data, _sr = sf.read(wav_io)
        data = np.asarray(data, dtype=np.float32)
        if data.ndim > 1:
            data = data.mean(axis=1)

        # 4. ASR + translation (reuses the existing /transcribe logic).
        wolof_text = asr(normalize_audio(data))["text"]
        if len(wolof_text.strip()) < 2:
            return "Bonjour"
        return wolof_to_french_gemini(wolof_text)

    except Exception as e:
        logger.error(f"Erreur WhatsApp ASR: {e}")
        return "Bonjour"
    finally:
        # Cleanup runs on BOTH success and failure paths; the original only
        # removed the temp file on success, leaking it on any exception.
        if tmp_path and os.path.exists(tmp_path):
            os.remove(tmp_path)
|
| 248 |
+
|
| 249 |
@app.route("/tts", methods=["POST"])
def tts():
    """Translate a French text to Wolof and synthesize it as base64 WAV audio."""
    payload = request.get_json()
    if payload is None or "text" not in payload:
        return jsonify({"error": "Texte manquant"}), 400

    # Gemini translation first, then local Parler-TTS synthesis.
    translated = french_to_wolof_with_gemini(payload["text"])
    encoded_audio = generate_tts_optimized(translated)

    return jsonify({"wolof_text": translated, "audio": encoded_audio})
|