Spaces:
Sleeping
Sleeping
identifier
Browse files
app.py
CHANGED
|
@@ -205,36 +205,52 @@ def _load_natlas():
|
|
| 205 |
if hf_token:
|
| 206 |
hf_token = hf_token.strip()
|
| 207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
try:
|
| 209 |
logger.info("Lazy-loading N-ATLaS language identification model...")
|
|
|
|
|
|
|
| 210 |
natlas_tokenizer = AutoTokenizer.from_pretrained("NCAIR1/N-ATLaS", token=hf_token)
|
| 211 |
natlas_model = AutoModelForCausalLM.from_pretrained(
|
| 212 |
"NCAIR1/N-ATLaS",
|
| 213 |
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
|
| 214 |
device_map="auto" if torch.cuda.is_available() else None,
|
| 215 |
-
token=hf_token
|
|
|
|
|
|
|
|
|
|
| 216 |
)
|
| 217 |
-
logger.info("
|
| 218 |
return True
|
| 219 |
except Exception as e:
|
| 220 |
-
logger.exception("Failed to load N-ATLaS model")
|
| 221 |
natlas_tokenizer, natlas_model = None, None
|
| 222 |
return False
|
| 223 |
|
| 224 |
def detect_language(text: str) -> str:
|
|
|
|
|
|
|
| 225 |
if not _load_natlas():
|
| 226 |
logger.warning("N-ATLaS model not available, falling back to keyword detection")
|
| 227 |
text_lower = text.lower()
|
| 228 |
if any(word in text_lower for word in HAUSA_WORDS):
|
|
|
|
| 229 |
return "ha"
|
| 230 |
elif any(word in text_lower for word in YORUBA_WORDS):
|
|
|
|
| 231 |
return "yo"
|
| 232 |
elif any(word in text_lower for word in IGBO_WORDS):
|
|
|
|
| 233 |
return "ig"
|
| 234 |
else:
|
|
|
|
| 235 |
return "en"
|
| 236 |
|
| 237 |
try:
|
|
|
|
| 238 |
messages = [
|
| 239 |
{'role': 'system', 'content': 'You are a language identification assistant. Identify the language of the given text and respond with only the language code: "en" for English, "ha" for Hausa, "yo" for Yoruba, or "ig" for Igbo.'},
|
| 240 |
{'role': 'user', 'content': f'What language is this text written in? "{text}"'}
|
|
@@ -263,17 +279,24 @@ def detect_language(text: str) -> str:
|
|
| 263 |
response = natlas_tokenizer.batch_decode(outputs)[0]
|
| 264 |
response_text = response.split(messages[1]['content'])[-1].strip().lower()
|
| 265 |
|
|
|
|
|
|
|
| 266 |
if 'ha' in response_text:
|
|
|
|
| 267 |
return "ha"
|
| 268 |
elif 'yo' in response_text:
|
|
|
|
| 269 |
return "yo"
|
| 270 |
elif 'ig' in response_text:
|
|
|
|
| 271 |
return "ig"
|
| 272 |
else:
|
|
|
|
| 273 |
return "en"
|
| 274 |
|
| 275 |
except Exception as e:
|
| 276 |
logger.exception(f"Language detection failed: {e}")
|
|
|
|
| 277 |
text_lower = text.lower()
|
| 278 |
if any(word in text_lower for word in HAUSA_WORDS):
|
| 279 |
return "ha"
|
|
@@ -339,7 +362,24 @@ async def root():
|
|
| 339 |
|
| 340 |
@app.get("/health")
|
| 341 |
async def health():
|
| 342 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
|
| 344 |
@app.post("/chat")
|
| 345 |
async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):
|
|
|
|
| 205 |
if hf_token:
|
| 206 |
hf_token = hf_token.strip()
|
| 207 |
|
| 208 |
+
if not hf_token:
|
| 209 |
+
logger.error("HF_TOKEN not available for N-ATLaS model access")
|
| 210 |
+
return False
|
| 211 |
+
|
| 212 |
try:
|
| 213 |
logger.info("Lazy-loading N-ATLaS language identification model...")
|
| 214 |
+
logger.info("This may take a few minutes as the model loads its shards...")
|
| 215 |
+
|
| 216 |
natlas_tokenizer = AutoTokenizer.from_pretrained("NCAIR1/N-ATLaS", token=hf_token)
|
| 217 |
natlas_model = AutoModelForCausalLM.from_pretrained(
|
| 218 |
"NCAIR1/N-ATLaS",
|
| 219 |
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
|
| 220 |
device_map="auto" if torch.cuda.is_available() else None,
|
| 221 |
+
token=hf_token,
|
| 222 |
+
trust_remote_code=True,
|
| 223 |
+
low_cpu_mem_usage=True,
|
| 224 |
+
use_cache=True
|
| 225 |
)
|
| 226 |
+
logger.info("Successfully loaded N-ATLaS language identification model")
|
| 227 |
return True
|
| 228 |
except Exception as e:
|
| 229 |
+
logger.exception(f"Failed to load N-ATLaS model: {e}")
|
| 230 |
natlas_tokenizer, natlas_model = None, None
|
| 231 |
return False
|
| 232 |
|
| 233 |
def detect_language(text: str) -> str:
|
| 234 |
+
logger.info(f"Detecting language for text: '{text[:50]}...'")
|
| 235 |
+
|
| 236 |
if not _load_natlas():
|
| 237 |
logger.warning("N-ATLaS model not available, falling back to keyword detection")
|
| 238 |
text_lower = text.lower()
|
| 239 |
if any(word in text_lower for word in HAUSA_WORDS):
|
| 240 |
+
logger.info("Keyword detection: Hausa")
|
| 241 |
return "ha"
|
| 242 |
elif any(word in text_lower for word in YORUBA_WORDS):
|
| 243 |
+
logger.info("Keyword detection: Yoruba")
|
| 244 |
return "yo"
|
| 245 |
elif any(word in text_lower for word in IGBO_WORDS):
|
| 246 |
+
logger.info("Keyword detection: Igbo")
|
| 247 |
return "ig"
|
| 248 |
else:
|
| 249 |
+
logger.info("Keyword detection: English (default)")
|
| 250 |
return "en"
|
| 251 |
|
| 252 |
try:
|
| 253 |
+
logger.info("Using N-ATLaS for language detection")
|
| 254 |
messages = [
|
| 255 |
{'role': 'system', 'content': 'You are a language identification assistant. Identify the language of the given text and respond with only the language code: "en" for English, "ha" for Hausa, "yo" for Yoruba, or "ig" for Igbo.'},
|
| 256 |
{'role': 'user', 'content': f'What language is this text written in? "{text}"'}
|
|
|
|
| 279 |
response = natlas_tokenizer.batch_decode(outputs)[0]
|
| 280 |
response_text = response.split(messages[1]['content'])[-1].strip().lower()
|
| 281 |
|
| 282 |
+
logger.info(f"N-ATLaS response: '{response_text}'")
|
| 283 |
+
|
| 284 |
if 'ha' in response_text:
|
| 285 |
+
logger.info("N-ATLaS detection: Hausa")
|
| 286 |
return "ha"
|
| 287 |
elif 'yo' in response_text:
|
| 288 |
+
logger.info("N-ATLaS detection: Yoruba")
|
| 289 |
return "yo"
|
| 290 |
elif 'ig' in response_text:
|
| 291 |
+
logger.info("N-ATLaS detection: Igbo")
|
| 292 |
return "ig"
|
| 293 |
else:
|
| 294 |
+
logger.info("N-ATLaS detection: English (default)")
|
| 295 |
return "en"
|
| 296 |
|
| 297 |
except Exception as e:
|
| 298 |
logger.exception(f"Language detection failed: {e}")
|
| 299 |
+
logger.warning("Falling back to keyword detection due to N-ATLaS error")
|
| 300 |
text_lower = text.lower()
|
| 301 |
if any(word in text_lower for word in HAUSA_WORDS):
|
| 302 |
return "ha"
|
|
|
|
| 362 |
|
| 363 |
@app.get("/health")
async def health():
    """Return a liveness payload describing which model objects are set.

    The response is a dict with a fixed banner ``message``, a
    ``natlas_status`` string ("loaded" or "not_loaded"), and a
    ``tts_models`` mapping of language name to availability flag.
    """
    # N-ATLaS is reported as loaded only when tokenizer AND model are set.
    natlas_ready = not (natlas_tokenizer is None or natlas_model is None)

    tts_availability = {
        "hausa": tts_ha is not None,
        "english": tts_en is not None,
        "yoruba": tts_yo is not None,
        # Hard-coded: this endpoint always reports Igbo TTS as unavailable.
        "igbo": False,
    }

    return {
        "message": "Farmlingua AI Speech Interface is running!",
        "natlas_status": "loaded" if natlas_ready else "not_loaded",
        "tts_models": tts_availability,
    }
|
| 376 |
+
|
| 377 |
+
@app.get("/status")
async def status():
    """Report whether the N-ATLaS tokenizer and model are both set.

    Returns:
        A dict with two keys:

        * ``natlas_loaded`` - True only when both ``natlas_tokenizer`` and
          ``natlas_model`` are not None.
        * ``loading_message`` - human-readable text that matches that flag.
    """
    # Compute the readiness condition once and derive both fields from it.
    # Checking only the tokenizer for the message would let the endpoint say
    # "ready" while natlas_loaded is False (tokenizer set, model still None).
    natlas_loaded = natlas_tokenizer is not None and natlas_model is not None

    if natlas_loaded:
        loading_message = "N-ATLaS model is ready"
    else:
        loading_message = "N-ATLaS model is loading shards, please wait..."

    return {
        "natlas_loaded": natlas_loaded,
        "loading_message": loading_message,
    }
|
| 383 |
|
| 384 |
@app.post("/chat")
|
| 385 |
async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):
|