nexusbert committed on
Commit
cd0d2d4
·
1 Parent(s): b8370c9

identifier

Browse files
Files changed (1) hide show
  1. app.py +44 -4
app.py CHANGED
@@ -205,36 +205,52 @@ def _load_natlas():
205
  if hf_token:
206
  hf_token = hf_token.strip()
207
 
 
 
 
 
208
  try:
209
  logger.info("Lazy-loading N-ATLaS language identification model...")
 
 
210
  natlas_tokenizer = AutoTokenizer.from_pretrained("NCAIR1/N-ATLaS", token=hf_token)
211
  natlas_model = AutoModelForCausalLM.from_pretrained(
212
  "NCAIR1/N-ATLaS",
213
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
214
  device_map="auto" if torch.cuda.is_available() else None,
215
- token=hf_token
 
 
 
216
  )
217
- logger.info("Loaded N-ATLaS language identification model")
218
  return True
219
  except Exception as e:
220
- logger.exception("Failed to load N-ATLaS model")
221
  natlas_tokenizer, natlas_model = None, None
222
  return False
223
 
224
  def detect_language(text: str) -> str:
 
 
225
  if not _load_natlas():
226
  logger.warning("N-ATLaS model not available, falling back to keyword detection")
227
  text_lower = text.lower()
228
  if any(word in text_lower for word in HAUSA_WORDS):
 
229
  return "ha"
230
  elif any(word in text_lower for word in YORUBA_WORDS):
 
231
  return "yo"
232
  elif any(word in text_lower for word in IGBO_WORDS):
 
233
  return "ig"
234
  else:
 
235
  return "en"
236
 
237
  try:
 
238
  messages = [
239
  {'role': 'system', 'content': 'You are a language identification assistant. Identify the language of the given text and respond with only the language code: "en" for English, "ha" for Hausa, "yo" for Yoruba, or "ig" for Igbo.'},
240
  {'role': 'user', 'content': f'What language is this text written in? "{text}"'}
@@ -263,17 +279,24 @@ def detect_language(text: str) -> str:
263
  response = natlas_tokenizer.batch_decode(outputs)[0]
264
  response_text = response.split(messages[1]['content'])[-1].strip().lower()
265
 
 
 
266
  if 'ha' in response_text:
 
267
  return "ha"
268
  elif 'yo' in response_text:
 
269
  return "yo"
270
  elif 'ig' in response_text:
 
271
  return "ig"
272
  else:
 
273
  return "en"
274
 
275
  except Exception as e:
276
  logger.exception(f"Language detection failed: {e}")
 
277
  text_lower = text.lower()
278
  if any(word in text_lower for word in HAUSA_WORDS):
279
  return "ha"
@@ -339,7 +362,24 @@ async def root():
339
 
340
  @app.get("/health")
341
  async def health():
342
- return {"message": "Farmlingua AI Speech Interface is running!"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
 
344
  @app.post("/chat")
345
  async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):
 
205
  if hf_token:
206
  hf_token = hf_token.strip()
207
 
208
+ if not hf_token:
209
+ logger.error("HF_TOKEN not available for N-ATLaS model access")
210
+ return False
211
+
212
  try:
213
  logger.info("Lazy-loading N-ATLaS language identification model...")
214
+ logger.info("This may take a few minutes as the model loads its shards...")
215
+
216
  natlas_tokenizer = AutoTokenizer.from_pretrained("NCAIR1/N-ATLaS", token=hf_token)
217
  natlas_model = AutoModelForCausalLM.from_pretrained(
218
  "NCAIR1/N-ATLaS",
219
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
220
  device_map="auto" if torch.cuda.is_available() else None,
221
+ token=hf_token,
222
+ trust_remote_code=True,
223
+ low_cpu_mem_usage=True,
224
+ use_cache=True
225
  )
226
+ logger.info("Successfully loaded N-ATLaS language identification model")
227
  return True
228
  except Exception as e:
229
+ logger.exception(f"Failed to load N-ATLaS model: {e}")
230
  natlas_tokenizer, natlas_model = None, None
231
  return False
232
 
233
  def detect_language(text: str) -> str:
234
+ logger.info(f"Detecting language for text: '{text[:50]}...'")
235
+
236
  if not _load_natlas():
237
  logger.warning("N-ATLaS model not available, falling back to keyword detection")
238
  text_lower = text.lower()
239
  if any(word in text_lower for word in HAUSA_WORDS):
240
+ logger.info("Keyword detection: Hausa")
241
  return "ha"
242
  elif any(word in text_lower for word in YORUBA_WORDS):
243
+ logger.info("Keyword detection: Yoruba")
244
  return "yo"
245
  elif any(word in text_lower for word in IGBO_WORDS):
246
+ logger.info("Keyword detection: Igbo")
247
  return "ig"
248
  else:
249
+ logger.info("Keyword detection: English (default)")
250
  return "en"
251
 
252
  try:
253
+ logger.info("Using N-ATLaS for language detection")
254
  messages = [
255
  {'role': 'system', 'content': 'You are a language identification assistant. Identify the language of the given text and respond with only the language code: "en" for English, "ha" for Hausa, "yo" for Yoruba, or "ig" for Igbo.'},
256
  {'role': 'user', 'content': f'What language is this text written in? "{text}"'}
 
279
  response = natlas_tokenizer.batch_decode(outputs)[0]
280
  response_text = response.split(messages[1]['content'])[-1].strip().lower()
281
 
282
+ logger.info(f"N-ATLaS response: '{response_text}'")
283
+
284
  if 'ha' in response_text:
285
+ logger.info("N-ATLaS detection: Hausa")
286
  return "ha"
287
  elif 'yo' in response_text:
288
+ logger.info("N-ATLaS detection: Yoruba")
289
  return "yo"
290
  elif 'ig' in response_text:
291
+ logger.info("N-ATLaS detection: Igbo")
292
  return "ig"
293
  else:
294
+ logger.info("N-ATLaS detection: English (default)")
295
  return "en"
296
 
297
  except Exception as e:
298
  logger.exception(f"Language detection failed: {e}")
299
+ logger.warning("Falling back to keyword detection due to N-ATLaS error")
300
  text_lower = text.lower()
301
  if any(word in text_lower for word in HAUSA_WORDS):
302
  return "ha"
 
362
 
363
  @app.get("/health")
364
  async def health():
365
+ natlas_status = "loaded" if natlas_tokenizer is not None and natlas_model is not None else "not_loaded"
366
+ return {
367
+ "message": "Farmlingua AI Speech Interface is running!",
368
+ "natlas_status": natlas_status,
369
+ "tts_models": {
370
+ "hausa": tts_ha is not None,
371
+ "english": tts_en is not None,
372
+ "yoruba": tts_yo is not None,
373
+ "igbo": False
374
+ }
375
+ }
376
+
377
+ @app.get("/status")
378
+ async def status():
379
+ return {
380
+ "natlas_loaded": natlas_tokenizer is not None and natlas_model is not None,
381
+ "loading_message": "N-ATLaS model is loading shards, please wait..." if natlas_tokenizer is None else "N-ATLaS model is ready"
382
+ }
383
 
384
  @app.post("/chat")
385
  async def chat(text: str = Form(...), speak: bool = False, raw: bool = False):