Spaces:
Sleeping
Sleeping
all
Browse files- Dockerfile +1 -0
- app.py +90 -19
- requirements.txt +0 -1
Dockerfile
CHANGED
|
@@ -36,6 +36,7 @@ RUN mkdir -p /models/huggingface && chmod -R 777 /models/huggingface
|
|
| 36 |
RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-hau')" \
|
| 37 |
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-eng')" \
|
| 38 |
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-yor')" \
|
|
|
|
| 39 |
&& find /models/huggingface -name '*.lock' -delete
|
| 40 |
|
| 41 |
RUN python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-hau')" \
|
|
|
|
| 36 |
RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-hau')" \
|
| 37 |
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-eng')" \
|
| 38 |
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='facebook/mms-tts-yor')" \
|
| 39 |
+
&& python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='NCAIR1/N-ATLaS')" \
|
| 40 |
&& find /models/huggingface -name '*.lock' -delete
|
| 41 |
|
| 42 |
RUN python -c "from transformers import pipeline; pipeline('text-to-speech', model='facebook/mms-tts-hau')" \
|
app.py
CHANGED
|
@@ -9,8 +9,7 @@ import soundfile as sf
|
|
| 9 |
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
| 10 |
from fastapi.responses import FileResponse
|
| 11 |
from fastapi.middleware.cors import CORSMiddleware
|
| 12 |
-
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
|
| 13 |
-
from langdetect import detect
|
| 14 |
import imageio_ffmpeg
|
| 15 |
import logging
|
| 16 |
from contextlib import asynccontextmanager
|
|
@@ -40,6 +39,7 @@ app.add_middleware(
|
|
| 40 |
|
| 41 |
ASK_URL = "https://remostart-milestone-one-farmlingua-ai.hf.space/ask"
|
| 42 |
tts_ha, tts_en, tts_yo, tts_ig = None, None, None, None
|
|
|
|
| 43 |
|
| 44 |
asr_models = {
|
| 45 |
"ha": {"repo": "NCAIR1/Hausa-ASR", "model": None, "proc": None},
|
|
@@ -49,7 +49,7 @@ asr_models = {
|
|
| 49 |
}
|
| 50 |
|
| 51 |
def load_models():
|
| 52 |
-
global tts_ha, tts_en, tts_yo, tts_ig
|
| 53 |
device = 0 if torch.cuda.is_available() else -1
|
| 54 |
hf_token = os.getenv("HF_TOKEN")
|
| 55 |
if hf_token:
|
|
@@ -82,6 +82,7 @@ def load_models():
|
|
| 82 |
tts_ig = None
|
| 83 |
logger.info("Igbo TTS model disabled - will return text responses for Igbo language")
|
| 84 |
|
|
|
|
| 85 |
|
| 86 |
logger.info("Deferred ASR model loads: will lazy-load per language on first use")
|
| 87 |
|
|
@@ -195,23 +196,93 @@ IGBO_WORDS = [
|
|
| 195 |
"ugbo","akụkọ","mmiri","ala","ọrụ","ncheta","ọhụrụ","ugwu","nri","ahụhụ"
|
| 196 |
]
|
| 197 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
def detect_language(text: str) -> str:
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
|
| 216 |
def text_to_speech_file(text: str) -> str:
|
| 217 |
lang = detect_language(text)
|
|
|
|
| 9 |
from fastapi import FastAPI, File, UploadFile, HTTPException, Form
|
| 10 |
from fastapi.responses import FileResponse
|
| 11 |
from fastapi.middleware.cors import CORSMiddleware
|
| 12 |
+
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM
|
|
|
|
| 13 |
import imageio_ffmpeg
|
| 14 |
import logging
|
| 15 |
from contextlib import asynccontextmanager
|
|
|
|
| 39 |
|
| 40 |
ASK_URL = "https://remostart-milestone-one-farmlingua-ai.hf.space/ask"
|
| 41 |
tts_ha, tts_en, tts_yo, tts_ig = None, None, None, None
|
| 42 |
+
natlas_tokenizer, natlas_model = None, None
|
| 43 |
|
| 44 |
asr_models = {
|
| 45 |
"ha": {"repo": "NCAIR1/Hausa-ASR", "model": None, "proc": None},
|
|
|
|
| 49 |
}
|
| 50 |
|
| 51 |
def load_models():
|
| 52 |
+
global tts_ha, tts_en, tts_yo, tts_ig, natlas_tokenizer, natlas_model
|
| 53 |
device = 0 if torch.cuda.is_available() else -1
|
| 54 |
hf_token = os.getenv("HF_TOKEN")
|
| 55 |
if hf_token:
|
|
|
|
| 82 |
tts_ig = None
|
| 83 |
logger.info("Igbo TTS model disabled - will return text responses for Igbo language")
|
| 84 |
|
| 85 |
+
logger.info("N-ATLaS language identification model will be lazy-loaded on first use")
|
| 86 |
|
| 87 |
logger.info("Deferred ASR model loads: will lazy-load per language on first use")
|
| 88 |
|
|
|
|
| 196 |
"ugbo","akụkọ","mmiri","ala","ọrụ","ncheta","ọhụrụ","ugwu","nri","ahụhụ"
|
| 197 |
]
|
| 198 |
|
def _load_natlas():
    """Lazily load the N-ATLaS tokenizer and model into module globals.

    Returns:
        bool: True when ``natlas_tokenizer`` and ``natlas_model`` are both
        populated (whether already loaded or loaded by this call); False when
        loading fails, in which case both globals are reset to None so a
        later call can retry a clean load.
    """
    global natlas_tokenizer, natlas_model
    # Fast path: both globals already populated by a previous call.
    if natlas_tokenizer is not None and natlas_model is not None:
        return True

    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
        # Tokens pasted into environment secrets often carry stray whitespace.
        hf_token = hf_token.strip()

    try:
        logger.info("Lazy-loading N-ATLaS language identification model...")
        natlas_tokenizer = AutoTokenizer.from_pretrained("NCAIR1/N-ATLaS", token=hf_token)
        natlas_model = AutoModelForCausalLM.from_pretrained(
            "NCAIR1/N-ATLaS",
            # fp16 on GPU to halve memory; fp32 on CPU where fp16 is slow/unsupported.
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto" if torch.cuda.is_available() else None,
            token=hf_token
        )
        logger.info("Loaded N-ATLaS language identification model")
        return True
    except Exception:
        # logger.exception records the full traceback; the exception object
        # itself was unused, so it is no longer bound.
        logger.exception("Failed to load N-ATLaS model")
        # Reset both globals so a half-initialized pair is never observed.
        natlas_tokenizer, natlas_model = None, None
        return False
def _keyword_detect(text: str) -> str:
    """Heuristic fallback: classify by presence of language-specific keywords.

    Checks the Hausa, Yoruba, then Igbo word lists in that order and defaults
    to English when no keyword matches.
    """
    text_lower = text.lower()
    if any(word in text_lower for word in HAUSA_WORDS):
        return "ha"
    if any(word in text_lower for word in YORUBA_WORDS):
        return "yo"
    if any(word in text_lower for word in IGBO_WORDS):
        return "ig"
    return "en"


def detect_language(text: str) -> str:
    """Identify the language of *text* as one of "en", "ha", "yo" or "ig".

    Primary path: prompt the N-ATLaS causal LM (lazy-loaded via
    ``_load_natlas``) to name the language code. Fallback path — used when
    the model cannot be loaded or generation fails — is keyword matching via
    ``_keyword_detect`` (previously duplicated inline in both branches).
    """
    if not _load_natlas():
        logger.warning("N-ATLaS model not available, falling back to keyword detection")
        return _keyword_detect(text)

    try:
        messages = [
            {'role': 'system', 'content': 'You are a language identification assistant. Identify the language of the given text and respond with only the language code: "en" for English, "ha" for Hausa, "yo" for Yoruba, or "ig" for Igbo.'},
            {'role': 'user', 'content': f'What language is this text written in? "{text}"'}
        ]

        formatted_text = natlas_tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False
        )

        input_tokens = natlas_tokenizer(formatted_text, return_tensors='pt', add_special_tokens=False)
        if torch.cuda.is_available():
            input_tokens = input_tokens.to('cuda')

        with torch.no_grad():
            outputs = natlas_model.generate(
                **input_tokens,
                max_new_tokens=10,
                use_cache=True,
                repetition_penalty=1.1,
                # NOTE(review): temperature is ignored when do_sample=False;
                # kept for parity with the original call — confirm intent.
                temperature=0.1,
                do_sample=False
            )

        response = natlas_tokenizer.batch_decode(outputs)[0]
        # The decoded output echoes the prompt; everything after the user
        # message is the model's answer.
        response_text = response.split(messages[1]['content'])[-1].strip().lower()

        # NOTE(review): substring checks are fragile — e.g. an answer
        # mentioning "Nigeria" contains "ig". Kept as-is to preserve
        # behavior; consider word-boundary matching on the bare codes.
        if 'ha' in response_text:
            return "ha"
        elif 'yo' in response_text:
            return "yo"
        elif 'ig' in response_text:
            return "ig"
        else:
            return "en"

    except Exception as e:
        logger.exception(f"Language detection failed: {e}")
        return _keyword_detect(text)
| 287 |
def text_to_speech_file(text: str) -> str:
|
| 288 |
lang = detect_language(text)
|
requirements.txt
CHANGED
|
@@ -15,7 +15,6 @@ aiofiles
|
|
| 15 |
accelerate
|
| 16 |
sentencepiece
|
| 17 |
protobuf
|
| 18 |
-
langdetect
|
| 19 |
nest-asyncio
|
| 20 |
|
| 21 |
|
|
|
|
| 15 |
accelerate
|
| 16 |
sentencepiece
|
| 17 |
protobuf
|
|
|
|
| 18 |
nest-asyncio
|
| 19 |
|
| 20 |
|