raksa-the-wildcats committed
Commit · 47f7fc0
Parent(s): 9e99484

Update files and remove old samples for Hugging Face Space
Browse files
- .DS_Store +0 -0
- data/samples/output1.wav +0 -3
- data/samples/output2.wav +0 -3
- data/samples/output3.wav +0 -3
- data/samples/output4.wav +0 -3
- data/samples/output5.wav +0 -3
- gemma_inference.py +280 -0
- hf_space_app.py +228 -0
- requirements.txt +4 -0
- requirements_hf.txt +12 -0
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
data/samples/output1.wav
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:534f36e41170c0264972164a83770c421981e62feb3a4a3cae9118c58f13ad1a
-size 62168
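Each deleted sample above is a Git LFS pointer rather than raw audio: three text lines recording the pointer spec version, the SHA-256 of the actual blob, and its size in bytes. A minimal sketch of reading those fields in Python (pointer text taken from the diff above; the helper name is illustrative):

def parse_lfs_pointer(text: str) -> dict:
    """Split a Git LFS pointer file into its key/value fields."""
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:534f36e41170c0264972164a83770c421981e62feb3a4a3cae9118c58f13ad1a\n"
    "size 62168\n"
)
print(parse_lfs_pointer(pointer)["size"])  # -> '62168' (bytes of output1.wav)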
data/samples/output2.wav
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b50c4df6f508a4367e5a49e90f974f8786c6d9ffb2599a8abcd25e693399735a
-size 105176
data/samples/output3.wav
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a25ce163d418cce5d8360a92ee39d9c8cfd75e4425f6bc8c3f9406186c882693
-size 70360
data/samples/output4.wav
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1a70d3d25137d5ae38e436effe9da895960750b6dacfde2345655ebd2c5a1b33
-size 67628
data/samples/output5.wav
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:33f58e7cc49a4e4fd4809d20cde2fb22855054cf61558be8ffef347fc35ce8f2
-size 114732
gemma_inference.py
ADDED
@@ -0,0 +1,280 @@
+import torch
+import soundfile as sf
+import whisper
+from transformers import AutoProcessor, Gemma3nForConditionalGeneration
+from snac import SNAC
+import os
+import tempfile
+from typing import Generator, Optional
+import numpy as np
+from utils.snac_utils import generate_audio_data, get_snac
+from utils.vad import get_speech_timestamps, collect_chunks
+
+class GemmaOmniInference:
+    """
+    Gemma 3n based inference engine for omni-mini.
+    Replaces the custom GPT with Gemma 3n for better conversational capabilities.
+    """
+
+    def __init__(self, device='cuda:0', model_id="google/gemma-3n-e4b-it"):
+        self.device = device
+        self.model_id = model_id
+
+        # Initialize models
+        print("Loading Gemma 3n model...")
+        self.model = Gemma3nForConditionalGeneration.from_pretrained(
+            model_id,
+            device_map="auto",
+            torch_dtype=torch.bfloat16
+        ).eval()
+
+        self.processor = AutoProcessor.from_pretrained(model_id)
+
+        # Keep the audio processing models
+        print("Loading audio processing models...")
+        self.snacmodel = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device)
+        self.whispermodel = whisper.load_model("small").to(device)
+
+        print("Models loaded successfully!")
+
+    def warm_up(self):
+        """Warm up the models."""
+        print("Warming up models...")
+        # Create a dummy audio file for warmup
+        dummy_audio = np.random.randn(16000).astype(np.float32)  # 1 second of dummy audio
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+            sf.write(tmp.name, dummy_audio, 16000)
+        try:
+            for _ in self.run_audio_to_audio_stream(tmp.name):
+                break  # Just run one iteration for warmup
+        except Exception:
+            pass
+        os.unlink(tmp.name)
+        print("Warmup completed!")
+
+    def audio_to_text(self, audio_path: str) -> str:
+        """Convert audio to text using Gemma 3n."""
+        # Load and normalize the audio (kept for parity with the Whisper path)
+        audio = whisper.load_audio(audio_path)
+        audio = whisper.pad_or_trim(audio)
+
+        # Prepare messages for Gemma 3n
+        messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a helpful AI assistant. Transcribe the following audio accurately."}]
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "audio", "audio": audio_path},
+                    {"type": "text", "text": "Please transcribe this audio."}
+                ]
+            }
+        ]
+
+        # Process with Gemma 3n
+        inputs = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(self.model.device)
+
+        input_len = inputs["input_ids"].shape[-1]
+
+        with torch.inference_mode():
+            generation = self.model.generate(
+                **inputs,
+                max_new_tokens=200,
+                do_sample=False  # greedy decoding; sampling params would be ignored here
+            )
+            generation = generation[0][input_len:]
+
+        decoded = self.processor.decode(generation, skip_special_tokens=True)
+        return decoded.strip()
+
+    def text_to_text(self, text: str, conversation_history: list = None) -> str:
+        """Generate a text response using Gemma 3n."""
+        # Build conversation messages
+        messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a helpful AI assistant. Respond naturally and conversationally."}]
+            }
+        ]
+
+        # Add conversation history if provided
+        if conversation_history:
+            messages.extend(conversation_history)
+
+        # Add current user message
+        messages.append({
+            "role": "user",
+            "content": [{"type": "text", "text": text}]
+        })
+
+        # Process with Gemma 3n
+        inputs = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(self.model.device)
+
+        input_len = inputs["input_ids"].shape[-1]
+
+        with torch.inference_mode():
+            generation = self.model.generate(
+                **inputs,
+                max_new_tokens=500,
+                do_sample=True,
+                temperature=0.9,
+                top_p=0.95
+            )
+            generation = generation[0][input_len:]
+
+        decoded = self.processor.decode(generation, skip_special_tokens=True)
+        return decoded.strip()
+
+    def text_to_audio(self, text: str, output_path: Optional[str] = None) -> str:
+        """
+        Convert text to audio using SNAC.
+        This is a simplified version - in practice you'd need a text-to-speech model.
+        """
+        # TODO: Implement proper text-to-speech (would need an additional TTS model)
+        if output_path is None:
+            # mkstemp instead of the deprecated, insecure tempfile.mktemp
+            fd, output_path = tempfile.mkstemp(suffix=".wav")
+            os.close(fd)
+
+        # Placeholder: generate silent audio
+        # In practice, you'd use a TTS model here
+        silence = np.zeros(16000 * 2)  # 2 seconds of silence
+        sf.write(output_path, silence, 16000)
+
+        return output_path
+
+    def run_audio_to_audio_stream(self, audio_path: str, stream_stride: int = 4) -> Generator[bytes, None, None]:
+        """Audio-to-audio streaming inference using Gemma 3n."""
+        # stream_stride is currently unused; kept for API compatibility
+        # Step 1: Audio to text using Gemma 3n
+        try:
+            # Use Gemma 3n for audio understanding
+            messages = [
+                {
+                    "role": "system",
+                    "content": [{"type": "text", "text": "You are a helpful AI assistant. Listen to the audio and respond naturally."}]
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "audio", "audio": audio_path},
+                        {"type": "text", "text": "Please respond to what I said."}
+                    ]
+                }
+            ]
+
+            inputs = self.processor.apply_chat_template(
+                messages,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt",
+            ).to(self.model.device)
+
+            input_len = inputs["input_ids"].shape[-1]
+
+            with torch.inference_mode():
+                generation = self.model.generate(
+                    **inputs,
+                    max_new_tokens=300,
+                    do_sample=True,
+                    temperature=0.9,
+                    top_p=0.95
+                )
+                generation = generation[0][input_len:]
+
+            response_text = self.processor.decode(generation, skip_special_tokens=True).strip()
+            print(f"Gemma 3n response: {response_text}")
+
+            # Step 2: Convert response text to audio (placeholder)
+            # TODO: Implement proper text-to-speech pipeline
+            # For now, yield dummy audio chunks for streaming
+            chunk_size = 4096
+            total_chunks = 10
+
+            for _ in range(total_chunks):
+                # In practice, this would be real audio data from TTS
+                dummy_chunk = np.random.randn(chunk_size).astype(np.float32) * 0.1
+                audio_bytes = (dummy_chunk * 32767).astype(np.int16).tobytes()
+                yield audio_bytes
+
+        except Exception as e:
+            print(f"Error in audio-to-audio streaming: {e}")
+            return
+
+    def process_conversation_turn(self, audio_path: str) -> tuple[str, str]:
+        """
+        Process a single conversation turn: audio input -> text response.
+        Returns (transcribed_text, response_text).
+        """
+        # Use Gemma 3n for both transcription and response
+        messages = [
+            {
+                "role": "system",
+                "content": [{"type": "text", "text": "You are a helpful AI assistant. Listen to the audio, understand what the user said, and respond naturally. First transcribe what you heard, then provide a response."}]
+            },
+            {
+                "role": "user",
+                "content": [
+                    {"type": "audio", "audio": audio_path},
+                    {"type": "text", "text": "Please transcribe what I said and then respond appropriately."}
+                ]
+            }
+        ]
+
+        inputs = self.processor.apply_chat_template(
+            messages,
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        ).to(self.model.device)
+
+        input_len = inputs["input_ids"].shape[-1]
+
+        with torch.inference_mode():
+            generation = self.model.generate(
+                **inputs,
+                max_new_tokens=400,
+                do_sample=True,
+                temperature=0.8,
+                top_p=0.95
+            )
+            generation = generation[0][input_len:]
+
+        full_response = self.processor.decode(generation, skip_special_tokens=True).strip()
+
+        # Try to split transcription and response
+        # This is a simple heuristic - in practice you'd need better parsing
+        if ":" in full_response:
+            parts = full_response.split(":", 1)
+            transcription = parts[0].strip()
+            response = parts[1].strip()
+        else:
+            # Fallback: use the full response as both
+            transcription = full_response
+            response = full_response
+
+        return transcription, response
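A minimal smoke test for the class above — a sketch, assuming gemma_inference.py and its utils/ helpers are importable, a CUDA GPU is available, and sample.wav is any local recording (both names are illustrative):

from gemma_inference import GemmaOmniInference

engine = GemmaOmniInference(device="cuda:0")
engine.warm_up()

# Text round trip
print(engine.text_to_text("Introduce yourself in one sentence."))

# Audio round trip: transcription plus reply from a single Gemma 3n call
transcription, response = engine.process_conversation_turn("sample.wav")
print(f"heard: {transcription!r} -> replied: {response!r}")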
hf_space_app.py
ADDED
@@ -0,0 +1,228 @@
+import gradio as gr
+import os
+import tempfile
+import soundfile as sf
+import numpy as np
+from gemma_inference import GemmaOmniInference
+import torch
+
+# Global inference engine
+inference_engine = None
+
+def initialize_model():
+    """Initialize the Gemma 3n inference engine."""
+    global inference_engine
+    try:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Using device: {device}")
+
+        inference_engine = GemmaOmniInference(device=device)
+        inference_engine.warm_up()
+        return "✅ Model loaded successfully!"
+    except Exception as e:
+        return f"❌ Error loading model: {str(e)}"
+
+def process_audio(audio_input, conversation_history):
+    """Process audio input and generate a response."""
+    global inference_engine
+
+    if inference_engine is None:
+        return "❌ Model not initialized. Please wait for the model to load.", conversation_history, None
+
+    if audio_input is None:
+        return "❌ No audio input provided.", conversation_history, None
+
+    try:
+        # Handle different audio input formats
+        if isinstance(audio_input, tuple):
+            # Gradio numpy format: write (sample_rate, data) to a temp file
+            sample_rate, audio_data = audio_input
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                audio_path = tmp_file.name
+            sf.write(audio_path, audio_data, sample_rate)
+            cleanup = True
+        else:
+            # Assume it's already a file path; don't delete the caller's file
+            audio_path = audio_input
+            cleanup = False
+
+        # Process with Gemma 3n
+        transcription, response = inference_engine.process_conversation_turn(audio_path)
+
+        # Update conversation history
+        updated_history = conversation_history + [
+            {"role": "user", "content": transcription},
+            {"role": "assistant", "content": response}
+        ]
+
+        # Format conversation for display
+        conversation_display = ""
+        for turn in updated_history:
+            role = "🧑 User" if turn["role"] == "user" else "🤖 Assistant"
+            conversation_display += f"{role}: {turn['content']}\n\n"
+
+        # Clean up the temporary file (only if we created it)
+        if cleanup and os.path.exists(audio_path):
+            os.unlink(audio_path)
+
+        return conversation_display, updated_history, response
+
+    except Exception as e:
+        return f"❌ Error processing audio: {str(e)}", conversation_history, None
+
+def process_text_input(text_input, conversation_history):
+    """Process text input and generate a response."""
+    global inference_engine
+
+    if inference_engine is None:
+        return "❌ Model not initialized. Please wait for the model to load.", conversation_history
+
+    if not text_input.strip():
+        return "❌ No text input provided.", conversation_history
+
+    try:
+        # Generate response using Gemma 3n
+        response = inference_engine.text_to_text(text_input, conversation_history)
+
+        # Update conversation history
+        updated_history = conversation_history + [
+            {"role": "user", "content": text_input},
+            {"role": "assistant", "content": response}
+        ]
+
+        # Format conversation for display
+        conversation_display = ""
+        for turn in updated_history:
+            role = "🧑 User" if turn["role"] == "user" else "🤖 Assistant"
+            conversation_display += f"{role}: {turn['content']}\n\n"
+
+        return conversation_display, updated_history
+
+    except Exception as e:
+        return f"❌ Error processing text: {str(e)}", conversation_history
+
+def clear_conversation():
+    """Clear the conversation history."""
+    return "", []
+
+def create_interface():
+    """Create the Gradio interface."""
+    with gr.Blocks(title="Omni-Mini with Gemma 3n", theme=gr.themes.Soft()) as demo:
+        gr.Markdown("""
+        # 🎙️ Omni-Mini with Gemma 3n
+
+        A multimodal AI assistant powered by Google's Gemma 3n model.
+        You can interact using voice or text!
+
+        **Features:**
+        - 🎤 Voice input with automatic transcription
+        - 💬 Text-based conversation
+        - 🧠 Powered by the Gemma 3n E4B model
+        - 🌍 Supports 140+ languages
+        """)
+
+        # Model status
+        model_status = gr.Textbox(
+            label="Model Status",
+            value="🔄 Loading model...",
+            interactive=False
+        )
+
+        # Conversation history (hidden state)
+        conversation_history = gr.State([])
+
+        # Main conversation display
+        conversation_display = gr.Textbox(
+            label="Conversation",
+            value="",
+            lines=15,
+            max_lines=20,
+            interactive=False,
+            placeholder="Your conversation will appear here..."
+        )
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("### 🎤 Voice Input")
+                audio_input = gr.Audio(
+                    label="Record your voice",
+                    type="numpy",
+                    format="wav"
+                )
+
+                audio_submit = gr.Button("🎤 Send Voice Message", variant="primary")
+
+            with gr.Column(scale=1):
+                gr.Markdown("### 💬 Text Input")
+                text_input = gr.Textbox(
+                    label="Type your message",
+                    placeholder="Enter your message here...",
+                    lines=3
+                )
+
+                text_submit = gr.Button("💬 Send Text Message", variant="primary")
+
+        with gr.Row():
+            clear_btn = gr.Button("🗑️ Clear Conversation", variant="secondary")
+
+        # Last response display
+        last_response = gr.Textbox(
+            label="Last Response",
+            value="",
+            lines=3,
+            interactive=False,
+            placeholder="The assistant's last response will appear here..."
+        )
+
+        # Event handlers
+        audio_submit.click(
+            process_audio,
+            inputs=[audio_input, conversation_history],
+            outputs=[conversation_display, conversation_history, last_response]
+        )
+
+        text_submit.click(
+            process_text_input,
+            inputs=[text_input, conversation_history],
+            outputs=[conversation_display, conversation_history]
+        )
+
+        text_input.submit(
+            process_text_input,
+            inputs=[text_input, conversation_history],
+            outputs=[conversation_display, conversation_history]
+        )
+
+        clear_btn.click(
+            clear_conversation,
+            outputs=[conversation_display, conversation_history]
+        )
+
+        # Initialize model on load
+        demo.load(
+            initialize_model,
+            outputs=[model_status]
+        )
+
+        gr.Markdown("""
+        ---
+
+        **Note:** This is a demo implementation. The audio-to-audio pipeline is simplified.
+        In a full implementation, you would need additional text-to-speech capabilities.
+
+        **Powered by:**
+        - 🧠 Google Gemma 3n E4B
+        - 🎤 OpenAI Whisper
+        - 🎵 SNAC Audio Codec
+        """)
+
+    return demo
+
+if __name__ == "__main__":
+    # Create and launch the interface
+    demo = create_interface()
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True
+    )
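Gradio Spaces launch app.py by default (configurable via app_file in the Space's README metadata), so this file is presumably renamed or imported by the actual entry point; a hypothetical one-line shim:

# app.py - hypothetical entry-point shim for the Space
from hf_space_app import create_interface

demo = create_interface()

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)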
requirements.txt
CHANGED
@@ -16,3 +16,7 @@ fastapi==0.112.4
 librosa==0.10.2.post1
 flask==3.0.3
 fire
+# Gemma 3n dependencies
+transformers>=4.53.0
+accelerate
+huggingface_hub
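The transformers>=4.53.0 floor matches the release in which the Gemma 3n classes first appeared; a quick sanity check one might run in the Space (a sketch, not part of the commit):

from packaging import version  # installed as a transformers dependency
import transformers

assert version.parse(transformers.__version__) >= version.parse("4.53.0"), \
    "Gemma 3n requires transformers>=4.53.0"
# The import used by gemma_inference.py should now resolve:
from transformers import Gemma3nForConditionalGeneration  # noqa: F401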
requirements_hf.txt
ADDED
@@ -0,0 +1,12 @@
+torch>=2.0.0
+transformers>=4.53.0
+accelerate
+huggingface_hub
+gradio
+soundfile
+numpy
+snac==1.2.0
+openai-whisper
+librosa
+scipy
+torchaudio