first_upload
- app.py +47 -0
- awari-project (1).ipynb +391 -0
- dockerfile +11 -0
- model.py +156 -0
- requirements.txt +9 -0
app.py
ADDED
@@ -0,0 +1,47 @@
from fastapi import FastAPI
from pydantic import BaseModel
from pydub import AudioSegment
import librosa
import uvicorn
import torch
import soundfile as sf

# import the shared inference functions (the module is uploaded as model.py)
from model import textonly, speechonly

app = FastAPI(title="Hamid Speech API", version="1.0.0")

@app.get("/")
def root():
    return {"message": "Welcome to Hamid AI Speech API"}

class TextRequest(BaseModel):
    text: str

class SpeechRequest(BaseModel):
    input_audio_path: str
    wav_output_path: str


@app.post("/textonly")
def run_text(req: TextRequest):
    result = textonly(req.text)
    return {"response": result}


@app.post("/speechonly")
def run_speech(req: SpeechRequest):
    # Convert the input audio to 16 kHz mono WAV, the format the ASR model expects
    audio = AudioSegment.from_file(req.input_audio_path)
    audio = audio.set_frame_rate(16000).set_channels(1)
    audio.export(req.wav_output_path, format="wav")

    # Load the converted WAV as a float waveform
    speech, sr = librosa.load(req.wav_output_path, sr=16000)

    # Note: the generated response audio overwrites the converted input at the same path
    llm_response, wav_path = speechonly(speech, output_wav_path=req.wav_output_path)

    return {
        "response": llm_response,
        "wav_saved": wav_path
    }
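Once the container is up, both endpoints can be smoke-tested from plain Python. This is a minimal client sketch using only the standard library; it assumes the server listens on localhost:7860 (the address the Dockerfile's CMD binds) and that the two audio paths (hypothetical names) are visible to the server process:

import json
import urllib.request

BASE = "http://localhost:7860"  # assumed host/port, matching the Dockerfile CMD

def post_json(path: str, payload: dict) -> dict:
    # POST a JSON body and decode the JSON response
    req = urllib.request.Request(
        BASE + path,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)

# text round trip
print(post_json("/textonly", {"text": "Bawo ni?"}))

# speech round trip; both paths must exist on the server's filesystem
print(post_json("/speechonly", {
    "input_audio_path": "input.m4a",    # hypothetical input recording
    "wav_output_path": "response.wav",  # where the converted/generated WAV is written
}))

The payloads mirror the TextRequest and SpeechRequest models defined above.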
awari-project (1).ipynb
ADDED
@@ -0,0 +1,391 @@
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "execution": {
          "iopub.execute_input": "2025-12-11T15:43:45.235265Z",
          "iopub.status.busy": "2025-12-11T15:43:45.235029Z",
          "iopub.status.idle": "2025-12-11T15:43:45.340285Z",
          "shell.execute_reply": "2025-12-11T15:43:45.339518Z",
          "shell.execute_reply.started": "2025-12-11T15:43:45.235247Z"
        },
        "trusted": true
      },
      "outputs": [],
      "source": [
        "from kaggle_secrets import UserSecretsClient\n",
        "user_secrets = UserSecretsClient()\n",
        "secret_value_0 = user_secrets.get_secret(\"HF_TOKEN\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "execution": {
          "iopub.execute_input": "2025-12-11T15:43:45.341357Z",
          "iopub.status.busy": "2025-12-11T15:43:45.341102Z",
          "iopub.status.idle": "2025-12-11T15:45:04.811675Z",
          "shell.execute_reply": "2025-12-11T15:45:04.810916Z",
          "shell.execute_reply.started": "2025-12-11T15:43:45.341333Z"
        },
        "trusted": true
      },
      "outputs": [],
      "source": [
        "%pip install -U bitsandbytes"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from transformers import (\n",
        "    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,\n",
        "    AutoProcessor, SeamlessM4Tv2ForSpeechToText,\n",
        "    VitsModel  # TTS\n",
        ")\n",
        "import torch\n",
        "import soundfile as sf\n",
        "import os\n",
        "from kaggle_secrets import UserSecretsClient\n",
        "\n",
        "# get the HF token from the Kaggle secret store\n",
        "user_secrets = UserSecretsClient()\n",
        "HF_TOKEN = user_secrets.get_secret(\"HF_TOKEN\")\n",
        "print(\"hf_token retrieved\")\n",
        "\n",
        "# quantize the LLM to 8-bit with bitsandbytes\n",
        "bnb_config = BitsAndBytesConfig(load_in_8bit=True)\n",
        "\n",
        "# set the device to run on\n",
        "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
        "\n",
        "# load the N-ATLaS model and tokenizer\n",
        "tokenizer = AutoTokenizer.from_pretrained(\n",
        "    \"NCAIR1/N-ATLaS\",\n",
        "    trust_remote_code=True,\n",
        "    token=HF_TOKEN\n",
        ")\n",
        "\n",
        "model = AutoModelForCausalLM.from_pretrained(\n",
        "    \"NCAIR1/N-ATLaS\",\n",
        "    quantization_config=bnb_config,\n",
        "    device_map=\"auto\",\n",
        "    trust_remote_code=True,\n",
        "    token=HF_TOKEN\n",
        ")\n",
        "\n",
        "# ASR model to convert speech to text\n",
        "ASR_MODEL = \"facebook/seamless-m4t-v2-large\"\n",
        "processor = AutoProcessor.from_pretrained(ASR_MODEL, token=HF_TOKEN)\n",
        "asr_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(ASR_MODEL, token=HF_TOKEN).to(device)\n",
        "asr_model.eval()\n",
        "\n",
        "# TTS models to convert text back to speech;\n",
        "# currently only Yoruba is loaded (Igbo and Hausa are commented out)\n",
        "tts_models = {}\n",
        "for lang, tts_name in {\n",
        "    \"yoruba\": \"facebook/mms-tts-yor\",\n",
        "    # \"igbo\": \"facebook/mms-tts-ibo\",\n",
        "    # \"hausa\": \"facebook/mms-tts-hau\",\n",
        "}.items():\n",
        "    print(f\"Loading TTS model for {lang}\")\n",
        "    tts_proc = AutoProcessor.from_pretrained(tts_name, token=HF_TOKEN)\n",
        "    tts_mod = VitsModel.from_pretrained(tts_name, token=HF_TOKEN).to(device)\n",
        "    tts_mod.eval()\n",
        "    tts_models[lang] = {\"processor\": tts_proc, \"model\": tts_mod}\n",
        "\n",
        "print(\"All TTS models loaded successfully!\")\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "execution": {
          "iopub.execute_input": "2025-12-11T15:45:04.813528Z",
          "iopub.status.busy": "2025-12-11T15:45:04.813289Z",
          "iopub.status.idle": "2025-12-11T15:49:56.343820Z",
          "shell.execute_reply": "2025-12-11T15:49:56.343087Z",
          "shell.execute_reply.started": "2025-12-11T15:45:04.813503Z"
        },
        "trusted": true
      },
      "outputs": [],
      "source": [
        "import torch\n",
        "import soundfile as sf\n",
        "\n",
        "# text-only path: format a chat prompt and generate a reply with the LLM\n",
        "def textonly(user_msg: str):\n",
        "    def format_prompt(messages):\n",
        "        return tokenizer.apply_chat_template(\n",
        "            messages,\n",
        "            add_generation_prompt=True,\n",
        "            tokenize=False\n",
        "        )\n",
        "\n",
        "    chat = [\n",
        "        {\"role\": \"system\", \"content\": \"You are a helpful model trained by Awarri AI Technologies.\"},\n",
        "        {\"role\": \"user\", \"content\": user_msg}\n",
        "    ]\n",
        "\n",
        "    final_text = format_prompt(chat)\n",
        "    inputs = tokenizer(final_text, return_tensors=\"pt\").to(model.device)\n",
        "\n",
        "    with torch.no_grad():\n",
        "        output_ids = model.generate(\n",
        "            **inputs,\n",
        "            max_new_tokens=200,\n",
        "            temperature=0.1,\n",
        "            repetition_penalty=1.12\n",
        "        )\n",
        "\n",
        "    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)\n",
        "    return response\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# speech path: transcribe, answer with the LLM, detect the language, then synthesise\n",
        "def speechonly(speech, output_wav_path=\"response.wav\"):\n",
        "    # speech to text\n",
        "    inputs = processor(audios=speech, sampling_rate=16000, return_tensors=\"pt\").to(device)\n",
        "    with torch.no_grad():\n",
        "        predicted_ids = asr_model.generate(inputs[\"input_features\"], max_new_tokens=300)\n",
        "    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]\n",
        "\n",
        "    print(\"\\nTRANSCRIPTION:\", transcription)\n",
        "\n",
        "    # use the N-ATLaS LLM to generate the response\n",
        "    def format_prompt(messages):\n",
        "        return tokenizer.apply_chat_template(\n",
        "            messages,\n",
        "            add_generation_prompt=True,\n",
        "            tokenize=False\n",
        "        )\n",
        "\n",
        "    chat = [\n",
        "        {\"role\": \"system\", \"content\": \"Respond ONLY in the detected Nigerian language (Yoruba, Igbo, Hausa, Pidgin, English).\"},\n",
        "        {\"role\": \"user\", \"content\": transcription}\n",
        "    ]\n",
        "\n",
        "    final_text = format_prompt(chat)\n",
        "    inputs_llm = tokenizer(final_text, return_tensors=\"pt\").to(model.device)\n",
        "\n",
        "    with torch.no_grad():\n",
        "        output_ids = model.generate(\n",
        "            **inputs_llm,\n",
        "            max_new_tokens=200,\n",
        "            temperature=0.1,\n",
        "            repetition_penalty=1.12\n",
        "        )\n",
        "\n",
        "    llm_response = tokenizer.decode(output_ids[0], skip_special_tokens=True)\n",
        "    print(\"\\nllm response:\", llm_response)\n",
        "\n",
        "    # N-ATLaS is a multilingual model built for Nigerian languages, so it is\n",
        "    # also used here to detect the language of its own reply\n",
        "    lang_prompt = [\n",
        "        {\"role\": \"system\", \"content\": \"You are a Nigerian language expert.\"},\n",
        "        {\"role\": \"user\", \"content\": f\"In which Nigerian language is this text: '{llm_response}'? Reply with only one of these: Yoruba, Igbo, Hausa, Pidgin, English.\"}\n",
        "    ]\n",
        "    lang_text = format_prompt(lang_prompt)\n",
        "    lang_inputs = tokenizer(lang_text, return_tensors=\"pt\").to(model.device)\n",
        "\n",
        "    with torch.no_grad():\n",
        "        lang_output_ids = model.generate(**lang_inputs, max_new_tokens=10)\n",
        "\n",
        "    llm_language = tokenizer.decode(lang_output_ids[0], skip_special_tokens=True).strip().lower()\n",
        "    print(\"\\nLLM DETECTED LANGUAGE:\", llm_language)\n",
        "\n",
        "    # pick the TTS model based on the LLM reply;\n",
        "    # fall back to Yoruba, the only TTS model loaded above\n",
        "    if llm_language not in tts_models:\n",
        "        llm_language = \"yoruba\"\n",
        "\n",
        "    tts_processor = tts_models[llm_language][\"processor\"]\n",
        "    tts_model = tts_models[llm_language][\"model\"]\n",
        "\n",
        "    # generate speech\n",
        "    tts_inputs = tts_processor(text=llm_response, return_tensors=\"pt\").to(device)\n",
        "    with torch.no_grad():\n",
        "        output = tts_model(**tts_inputs)\n",
        "    audio_array = output.waveform.squeeze().cpu().numpy()\n",
        "\n",
        "    # save the WAV\n",
        "    sf.write(output_wav_path, audio_array, 16000)\n",
        "    return llm_response, output_wav_path"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from pydub import AudioSegment\n",
        "import librosa\n",
        "\n",
        "# ask the user for the input type\n",
        "userinput = input(\"Enter 'text' or 'audio': \").lower()\n",
        "\n",
        "if userinput == \"text\":\n",
        "    # text path: prompt for a message and pass it to textonly\n",
        "    answer1 = textonly(input(\"Enter your message: \"))\n",
        "    print(\"\\ntext response:\\n\", answer1)\n",
        "\n",
        "else:\n",
        "    # load and preprocess the audio\n",
        "    audio_path = \"/kaggle/input/recordings/Recording (3).m4a\"\n",
        "    audio = AudioSegment.from_file(audio_path)\n",
        "    audio = audio.set_frame_rate(16000).set_channels(1)\n",
        "    audio.export(\"/kaggle/working/audio.wav\", format=\"wav\")\n",
        "\n",
        "    speech, sr = librosa.load(\"/kaggle/working/audio.wav\", sr=16000)\n",
        "    print(\"Converted audio loaded.\")\n",
        "\n",
        "    # call the speech function\n",
        "    answer2_text, answer2_wav = speechonly(speech)\n",
        "    print(\"\\nAUDIO RESPONSE saved as:\", answer2_wav)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from fastapi import FastAPI\n",
        "from pydantic import BaseModel\n",
        "from pydub import AudioSegment\n",
        "import librosa\n",
        "import uvicorn\n",
        "import os\n",
        "\n",
        "app = FastAPI(title='Simple FastAPI App', version='1.0.0')\n",
        "\n",
        "@app.get(\"/\")\n",
        "def root():\n",
        "    return {\"Message\": \"Welcome to Healthatlas Application\"}\n",
        "\n",
        "\n",
        "class TextRequest(BaseModel):\n",
        "    text: str\n",
        "\n",
        "\n",
        "class SpeechRequest(BaseModel):\n",
        "    input_audio_path: str\n",
        "    wav_output_path: str\n",
        "\n",
        "\n",
        "@app.post(\"/textonly\")\n",
        "def do_text(request: TextRequest):\n",
        "    answer1 = textonly(request.text)\n",
        "    print(\"\\nText response:\\n\", answer1)\n",
        "    return {\"response\": answer1}\n",
        "\n",
        "\n",
        "@app.post(\"/speechonly\")\n",
        "def run_speech(request: SpeechRequest):\n",
        "    audio = AudioSegment.from_file(request.input_audio_path)\n",
        "    audio = audio.set_frame_rate(16000).set_channels(1)\n",
        "    audio.export(request.wav_output_path, format=\"wav\")\n",
        "\n",
        "    speech, sr = librosa.load(request.wav_output_path, sr=16000)\n",
        "    print(\"Converted audio loaded.\")\n",
        "\n",
        "    answer2 = speechonly(speech)\n",
        "\n",
        "    return {\"response\": answer2, \"saved_wav\": request.wav_output_path}\n",
        "\n",
        "if __name__ == '__main__':\n",
        "    # default to 0.0.0.0:7860 (the Dockerfile's CMD) when the env vars are unset\n",
        "    host = os.getenv(\"host\", \"0.0.0.0\")\n",
        "    port = int(os.getenv(\"port\", \"7860\"))\n",
        "    uvicorn.run(app, host=host, port=port)"
      ]
    }
  ],
  "metadata": {
    "kaggle": {
      "accelerator": "nvidiaTeslaT4",
      "dataSources": [
        {
          "datasetId": 8987240,
          "sourceId": 14109383,
          "sourceType": "datasetVersion"
        }
      ],
      "dockerImageVersionId": 31193,
      "isGpuEnabled": true,
      "isInternetEnabled": true,
      "language": "python",
      "sourceType": "notebook"
    },
    "kernelspec": {
      "display_name": "zoomcamp-pwCLAhn6",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.12.4"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}
dockerfile
ADDED
@@ -0,0 +1,11 @@
FROM python:3.10

WORKDIR /app

# pydub decodes non-WAV inputs (e.g. .m4a) through ffmpeg, which the base image does not ship
RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg && rm -rf /var/lib/apt/lists/*

COPY requirements.txt .
RUN pip install --upgrade pip
RUN pip install -r requirements.txt

COPY . .

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
model.py
ADDED
@@ -0,0 +1,156 @@
# model.py -- shared model loading and inference functions (imported by app.py)
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,
    AutoProcessor, SeamlessM4Tv2ForSpeechToText,
    VitsModel
)
import torch
import soundfile as sf
import os

# --------------------------
# Device & config
# --------------------------
bnb_config = BitsAndBytesConfig(load_in_8bit=True)
device = "cuda" if torch.cuda.is_available() else "cpu"

# --------------------------
# Load LLM
# --------------------------
HF_TOKEN = os.getenv("HF_TOKEN")  # use an environment variable for Spaces

tokenizer = AutoTokenizer.from_pretrained(
    "NCAIR1/N-ATLaS",
    trust_remote_code=True,
    token=HF_TOKEN
)

model = AutoModelForCausalLM.from_pretrained(
    "NCAIR1/N-ATLaS",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    token=HF_TOKEN
)

# --------------------------
# Load ASR
# --------------------------
ASR_MODEL = "facebook/seamless-m4t-v2-large"
processor = AutoProcessor.from_pretrained(ASR_MODEL, token=HF_TOKEN)
asr_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(ASR_MODEL, token=HF_TOKEN).to(device)
asr_model.eval()

# --------------------------
# Load Nigerian TTS models
# --------------------------
tts_models = {}
for lang, tts_name in {
    "yoruba": "facebook/mms-tts-yor",
    # "igbo": "facebook/mms-tts-ibo",
    # "hausa": "facebook/mms-tts-hau",
}.items():
    print(f"Loading TTS model for {lang}...")
    tts_proc = AutoProcessor.from_pretrained(tts_name, token=HF_TOKEN)
    tts_mod = VitsModel.from_pretrained(tts_name, token=HF_TOKEN).to(device)
    tts_mod.eval()
    tts_models[lang] = {"processor": tts_proc, "model": tts_mod}

print("✅ All models loaded successfully!")


# --------------------------
# TEXT FUNCTION
# --------------------------
def textonly(user_msg: str) -> str:
    def format_prompt(messages):
        return tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False
        )

    chat = [
        {"role": "system", "content": "You are a helpful model trained by Awarri AI Technologies."},
        {"role": "user", "content": user_msg}
    ]

    final_text = format_prompt(chat)
    inputs = tokenizer(final_text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.1,
            repetition_penalty=1.12
        )

    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return response


# --------------------------
# SPEECH FUNCTION
# --------------------------
def speechonly(speech, output_wav_path="response.wav"):
    # --- ASR ---
    inputs = processor(audios=speech, sampling_rate=16000, return_tensors="pt").to(device)
    with torch.no_grad():
        predicted_ids = asr_model.generate(inputs["input_features"], max_new_tokens=300)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    # --- LLM Response ---
    def format_prompt(messages):
        return tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False
        )

    chat = [
        {"role": "system", "content": "Respond ONLY in the detected Nigerian language (Yoruba, Igbo, Hausa, Pidgin, English)."},
        {"role": "user", "content": transcription}
    ]

    final_text = format_prompt(chat)
    inputs_llm = tokenizer(final_text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs_llm,
            max_new_tokens=200,
            temperature=0.1,
            repetition_penalty=1.12
        )

    llm_response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # --- Detect language ---
    lang_prompt = [
        {"role": "system", "content": "You are a Nigerian language expert."},
        {"role": "user", "content": f"In which Nigerian language is this text: '{llm_response}'? Reply with only one of these: Yoruba, Igbo, Hausa, Pidgin, English."}
    ]
    lang_text = format_prompt(lang_prompt)
    lang_inputs = tokenizer(lang_text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        lang_output_ids = model.generate(**lang_inputs, max_new_tokens=10)

    llm_language = tokenizer.decode(lang_output_ids[0], skip_special_tokens=True).strip().lower()
    if llm_language not in tts_models:
        llm_language = "yoruba"  # fall back to the only TTS model loaded above

    # --- TTS ---
    tts_processor = tts_models[llm_language]["processor"]
    tts_model = tts_models[llm_language]["model"]

    tts_inputs = tts_processor(text=llm_response, return_tensors="pt").to(device)
    with torch.no_grad():
        output = tts_model(**tts_inputs)

    # Extract waveform and save
    audio_array = output.waveform.squeeze().cpu().numpy()
    sf.write(output_wav_path, audio_array, 16000)

    return llm_response, output_wav_path
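Since model.py loads everything at import time, it can also be exercised without the API. A minimal local smoke-test sketch, assuming HF_TOKEN is exported in the environment and the model downloads succeed; sample.wav is a hypothetical 16 kHz test clip:

import librosa
from model import textonly, speechonly

# text round trip through the LLM
print(textonly("How far?"))

# speech round trip: ASR -> LLM -> language detection -> TTS
speech, sr = librosa.load("sample.wav", sr=16000)  # hypothetical test clip
reply, wav_path = speechonly(speech, output_wav_path="reply.wav")
print(reply, "->", wav_path)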
requirements.txt
ADDED
@@ -0,0 +1,9 @@
fastapi
uvicorn
pydub
librosa
soundfile
transformers
torch
accelerate
bitsandbytes