Spaces:

nexusbert
/

milestone3

Sleeping

App Files Community

nexusbert committed on Oct 12, 2025

Commit

fc4a5de

verified ·

1 Parent(s): cd0d2d4

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -103

app.py CHANGED Viewed

@@ -21,10 +21,24 @@ nest_asyncio.apply()
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    load_models()
     yield
 app = FastAPI(title="Farmlingua AI Speech Interface", version="1.0.0", lifespan=lifespan)
@@ -36,55 +50,50 @@ app.add_middleware(
     allow_headers=["*"],
 )
-ASK_URL = "https://remostart-milestone-one-farmlingua-ai.hf.space/ask"
-tts_ha, tts_en, tts_yo, tts_ig = None, None, None, None
-natlas_tokenizer, natlas_model = None, None
-asr_models = {
-    "ha": {"repo": "NCAIR1/Hausa-ASR", "model": None, "proc": None},
-    "yo": {"repo": "NCAIR1/Yoruba-ASR", "model": None, "proc": None},
-    "ig": {"repo": "NCAIR1/Igbo-ASR", "model": None, "proc": None},
-    "en": {"repo": "NCAIR1/NigerianAccentedEnglish", "model": None, "proc": None},
-}
 def load_models():
-    global tts_ha, tts_en, tts_yo, tts_ig, natlas_tokenizer, natlas_model
     device = 0 if torch.cuda.is_available() else -1
-    hf_token = os.getenv("HF_TOKEN")
-    if hf_token:
-        hf_token = hf_token.strip()
-    if not hf_token:
-        logger.warning("HF_TOKEN not set! This may cause authentication failures for gated repositories.")
-        logger.warning("Please set HF_TOKEN environment variable to access restricted models.")
-    else:
-        logger.info("HF_TOKEN is set and ready for authenticated model access.")
-    logger.info("Loading TTS models...")
-    try:
-        tts_ha = pipeline("text-to-speech", model="facebook/mms-tts-hau", device=device)
-        logger.info("Loaded TTS (Hausa)")
-    except Exception as e:
-        logger.exception("Failed to load TTS (Hausa)")
-        tts_ha = None
-    try:
-        tts_en = pipeline("text-to-speech", model="facebook/mms-tts-eng", device=device)
-        logger.info("Loaded TTS (English)")
-    except Exception:
-        logger.exception("Failed to load TTS (English)")
-        tts_en = None
-    try:
-        tts_yo = pipeline("text-to-speech", model="facebook/mms-tts-yor", device=device)
-        logger.info("Loaded TTS (Yoruba)")
-    except Exception:
-        logger.exception("Failed to load TTS (Yoruba)")
-        tts_yo = None
-    tts_ig = None
-    logger.info("Igbo TTS model disabled - will return text responses for Igbo language")
-    logger.info("N-ATLaS language identification model will be lazy-loaded on first use")
-    logger.info("Deferred ASR model loads: will lazy-load per language on first use")
 def _get_asr(lang_code: str):
     entry = asr_models.get(lang_code)
@@ -93,9 +102,7 @@ def _get_asr(lang_code: str):
     if entry["model"] is not None and entry["proc"] is not None:
         return entry["model"], entry["proc"]
     repo_id = entry["repo"]
-    hf_token = os.getenv("HF_TOKEN")
-    if hf_token:
-        hf_token = hf_token.strip()
     try:
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         logger.info(f"Lazy-loading ASR for {lang_code} from {repo_id}...")
@@ -162,17 +169,14 @@ def speech_to_text(audio_data: bytes) -> str:
         text = _run_whisper(model, proc, audio_array)
         if text:
             candidates.append((code, text))
     for lang_code, text in candidates:
         det = detect_language(text)
         if lang_code == det:
             return text
     if candidates:
         return max((t for _, t in candidates), key=lambda s: len(s or ""))
     return ""
 def get_ai_response(text: str) -> str:
     try:
         response = requests.post(ASK_URL, json={"query": text}, timeout=30)
@@ -200,19 +204,12 @@ def _load_natlas():
     global natlas_tokenizer, natlas_model
     if natlas_tokenizer is not None and natlas_model is not None:
         return True
-    hf_token = os.getenv("HF_TOKEN")
-    if hf_token:
-        hf_token = hf_token.strip()
     if not hf_token:
         logger.error("HF_TOKEN not available for N-ATLaS model access")
         return False
     try:
-        logger.info("Lazy-loading N-ATLaS language identification model...")
-        logger.info("This may take a few minutes as the model loads its shards...")
         natlas_tokenizer = AutoTokenizer.from_pretrained("NCAIR1/N-ATLaS", token=hf_token)
         natlas_model = AutoModelForCausalLM.from_pretrained(
             "NCAIR1/N-ATLaS",
@@ -232,40 +229,26 @@ def _load_natlas():
 def detect_language(text: str) -> str:
     logger.info(f"Detecting language for text: '{text[:50]}...'")
     if not _load_natlas():
         logger.warning("N-ATLaS model not available, falling back to keyword detection")
         text_lower = text.lower()
         if any(word in text_lower for word in HAUSA_WORDS):
-            logger.info("Keyword detection: Hausa")
             return "ha"
         elif any(word in text_lower for word in YORUBA_WORDS):
-            logger.info("Keyword detection: Yoruba")
             return "yo"
         elif any(word in text_lower for word in IGBO_WORDS):
-            logger.info("Keyword detection: Igbo")
             return "ig"
         else:
-            logger.info("Keyword detection: English (default)")
             return "en"
     try:
-        logger.info("Using N-ATLaS for language detection")
         messages = [
             {'role': 'system', 'content': 'You are a language identification assistant. Identify the language of the given text and respond with only the language code: "en" for English, "ha" for Hausa, "yo" for Yoruba, or "ig" for Igbo.'},
             {'role': 'user', 'content': f'What language is this text written in? "{text}"'}
         ]
-        formatted_text = natlas_tokenizer.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=False
-        )
         input_tokens = natlas_tokenizer(formatted_text, return_tensors='pt', add_special_tokens=False)
         if torch.cuda.is_available():
             input_tokens = input_tokens.to('cuda')
         with torch.no_grad():
             outputs = natlas_model.generate(
                 **input_tokens,
@@ -275,28 +258,18 @@ def detect_language(text: str) -> str:
                 temperature=0.1,
                 do_sample=False
             )
         response = natlas_tokenizer.batch_decode(outputs)[0]
         response_text = response.split(messages[1]['content'])[-1].strip().lower()
-        logger.info(f"N-ATLaS response: '{response_text}'")
         if 'ha' in response_text:
-            logger.info("N-ATLaS detection: Hausa")
             return "ha"
         elif 'yo' in response_text:
-            logger.info("N-ATLaS detection: Yoruba")
             return "yo"
         elif 'ig' in response_text:
-            logger.info("N-ATLaS detection: Igbo")
             return "ig"
         else:
-            logger.info("N-ATLaS detection: English (default)")
             return "en"
     except Exception as e:
         logger.exception(f"Language detection failed: {e}")
-        logger.warning("Falling back to keyword detection due to N-ATLaS error")
         text_lower = text.lower()
         if any(word in text_lower for word in HAUSA_WORDS):
             return "ha"
@@ -308,9 +281,8 @@ def detect_language(text: str) -> str:
             return "en"
 def text_to_speech_file(text: str) -> str:
     lang = detect_language(text)
-    print(f"Detected language: {lang}")
     if lang == "ig":
         logger.info("Igbo language detected - returning text response instead of audio")
         fd, path = tempfile.mkstemp(suffix=".txt")
@@ -324,38 +296,25 @@ def text_to_speech_file(text: str) -> str:
         tts_model = tts_yo
     else:
         tts_model = tts_en
     if tts_model is None:
-        logger.error(f"TTS model for {lang} is not available")
         raise HTTPException(status_code=500, detail=f"TTS model for {lang} is not available")
     speech_output = tts_model(text)
     audio_raw = speech_output["audio"]
-    sampling_rate = int(speech_output["sampling_rate"])
     if isinstance(audio_raw, torch.Tensor):
         audio_np = audio_raw.detach().cpu().numpy()
     else:
         audio_np = np.asarray(audio_raw)
     if audio_np.ndim > 1:
         audio_np = audio_np.reshape(-1)
     audio_np = audio_np.astype(np.float32, copy=False)
     audio_clipped = np.clip(audio_np, -1.0, 1.0)
     audio_int16 = (audio_clipped * 32767.0).astype(np.int16)
     fd, path = tempfile.mkstemp(suffix=".wav")
     os.close(fd)
     sf.write(path, audio_int16, sampling_rate, format='WAV', subtype='PCM_16')
     return path
 @app.get("/")
 async def root():
     return {"status": "ok", "message": "System ready"}
@@ -412,5 +371,4 @@ async def speak_to_ai(audio_file: UploadFile = File(...), speak: bool = True):
     return {"transcription": transcription, "ai_response": ai_response}
 if __name__ == "__main__":
-    import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")))

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+tts_ha, tts_en, tts_yo, tts_ig = None, None, None, None
+natlas_tokenizer, natlas_model = None, None
+ASK_URL = "https://remostart-milestone-one-farmlingua-ai.hf.space/ask"
+asr_models = {
+    "ha": {"repo": "NCAIR1/Hausa-ASR", "model": None, "proc": None},
+    "yo": {"repo": "NCAIR1/Yoruba-ASR", "model": None, "proc": None},
+    "ig": {"repo": "NCAIR1/Igbo-ASR", "model": None, "proc": None},
+    "en": {"repo": "NCAIR1/NigerianAccentedEnglish", "model": None, "proc": None},
+}
 @asynccontextmanager
 async def lifespan(app: FastAPI):
+    logger.info("Starting Farmlingua AI Speech Interface...")
+    preload_natlas()
     yield
+    logger.info("Shutting down Farmlingua service...")
 app = FastAPI(title="Farmlingua AI Speech Interface", version="1.0.0", lifespan=lifespan)
     allow_headers=["*"],
 )
+def preload_natlas():
+    global natlas_tokenizer, natlas_model
+    if natlas_tokenizer is not None and natlas_model is not None:
+        logger.info("N-ATLaS already loaded.")
+        return
+    success = _load_natlas()
+    if success:
+        logger.info("N-ATLaS successfully preloaded at startup.")
+    else:
+        logger.warning("N-ATLaS preload failed. It will retry on first use.")
 def load_models():
+    global tts_ha, tts_en, tts_yo, tts_ig
     device = 0 if torch.cuda.is_available() else -1
+    hf_token = os.getenv("HF_TOKEN", "").strip()
+    logger.info("Lazy-loading TTS models on first use...")
+    if tts_ha is None:
+        try:
+            tts_ha = pipeline("text-to-speech", model="facebook/mms-tts-hau", device=device, token=hf_token)
+            logger.info("Loaded TTS (Hausa)")
+        except Exception:
+            logger.exception("Failed to load TTS (Hausa)")
+            tts_ha = None
+    if tts_en is None:
+        try:
+            tts_en = pipeline("text-to-speech", model="facebook/mms-tts-eng", device=device, token=hf_token)
+            logger.info("Loaded TTS (English)")
+        except Exception:
+            logger.exception("Failed to load TTS (English)")
+            tts_en = None
+    if tts_yo is None:
+        try:
+            tts_yo = pipeline("text-to-speech", model="facebook/mms-tts-yor", device=device, token=hf_token)
+            logger.info("Loaded TTS (Yoruba)")
+        except Exception:
+            logger.exception("Failed to load TTS (Yoruba)")
+            tts_yo = None
+    tts_ig = None
+    logger.info("Igbo TTS model disabled - returning text responses for Igbo language")
 def _get_asr(lang_code: str):
     entry = asr_models.get(lang_code)
     if entry["model"] is not None and entry["proc"] is not None:
         return entry["model"], entry["proc"]
     repo_id = entry["repo"]
+    hf_token = os.getenv("HF_TOKEN", "").strip()
     try:
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         logger.info(f"Lazy-loading ASR for {lang_code} from {repo_id}...")
         text = _run_whisper(model, proc, audio_array)
         if text:
             candidates.append((code, text))
     for lang_code, text in candidates:
         det = detect_language(text)
         if lang_code == det:
             return text
     if candidates:
         return max((t for _, t in candidates), key=lambda s: len(s or ""))
     return ""
 def get_ai_response(text: str) -> str:
     try:
         response = requests.post(ASK_URL, json={"query": text}, timeout=30)
     global natlas_tokenizer, natlas_model
     if natlas_tokenizer is not None and natlas_model is not None:
         return True
+    hf_token = os.getenv("HF_TOKEN", "").strip()
     if not hf_token:
         logger.error("HF_TOKEN not available for N-ATLaS model access")
         return False
     try:
+        logger.info("Loading N-ATLaS language identification model...")
         natlas_tokenizer = AutoTokenizer.from_pretrained("NCAIR1/N-ATLaS", token=hf_token)
         natlas_model = AutoModelForCausalLM.from_pretrained(
             "NCAIR1/N-ATLaS",
 def detect_language(text: str) -> str:
     logger.info(f"Detecting language for text: '{text[:50]}...'")
     if not _load_natlas():
         logger.warning("N-ATLaS model not available, falling back to keyword detection")
         text_lower = text.lower()
         if any(word in text_lower for word in HAUSA_WORDS):
             return "ha"
         elif any(word in text_lower for word in YORUBA_WORDS):
             return "yo"
         elif any(word in text_lower for word in IGBO_WORDS):
             return "ig"
         else:
             return "en"
     try:
         messages = [
             {'role': 'system', 'content': 'You are a language identification assistant. Identify the language of the given text and respond with only the language code: "en" for English, "ha" for Hausa, "yo" for Yoruba, or "ig" for Igbo.'},
             {'role': 'user', 'content': f'What language is this text written in? "{text}"'}
         ]
+        formatted_text = natlas_tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
         input_tokens = natlas_tokenizer(formatted_text, return_tensors='pt', add_special_tokens=False)
         if torch.cuda.is_available():
             input_tokens = input_tokens.to('cuda')
         with torch.no_grad():
             outputs = natlas_model.generate(
                 **input_tokens,
                 temperature=0.1,
                 do_sample=False
             )
         response = natlas_tokenizer.batch_decode(outputs)[0]
         response_text = response.split(messages[1]['content'])[-1].strip().lower()
         if 'ha' in response_text:
             return "ha"
         elif 'yo' in response_text:
             return "yo"
         elif 'ig' in response_text:
             return "ig"
         else:
             return "en"
     except Exception as e:
         logger.exception(f"Language detection failed: {e}")
         text_lower = text.lower()
         if any(word in text_lower for word in HAUSA_WORDS):
             return "ha"
             return "en"
 def text_to_speech_file(text: str) -> str:
+    load_models()
     lang = detect_language(text)
     if lang == "ig":
         logger.info("Igbo language detected - returning text response instead of audio")
         fd, path = tempfile.mkstemp(suffix=".txt")
         tts_model = tts_yo
     else:
         tts_model = tts_en
     if tts_model is None:
         raise HTTPException(status_code=500, detail=f"TTS model for {lang} is not available")
     speech_output = tts_model(text)
     audio_raw = speech_output["audio"]
+    sampling_rate = int(speech_output["sampling_rate"])
     if isinstance(audio_raw, torch.Tensor):
         audio_np = audio_raw.detach().cpu().numpy()
     else:
         audio_np = np.asarray(audio_raw)
     if audio_np.ndim > 1:
         audio_np = audio_np.reshape(-1)
     audio_np = audio_np.astype(np.float32, copy=False)
     audio_clipped = np.clip(audio_np, -1.0, 1.0)
     audio_int16 = (audio_clipped * 32767.0).astype(np.int16)
     fd, path = tempfile.mkstemp(suffix=".wav")
     os.close(fd)
     sf.write(path, audio_int16, sampling_rate, format='WAV', subtype='PCM_16')
     return path
 @app.get("/")
 async def root():
     return {"status": "ok", "message": "System ready"}
     return {"transcription": transcription, "ai_response": ai_response}
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=int(os.getenv("PORT", "7860")))