raksa-the-wildcats committed on
Commit
47f7fc0
·
1 Parent(s): 9e99484

Update files and remove old samples for Hugging Face Space

.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
data/samples/output1.wav DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:534f36e41170c0264972164a83770c421981e62feb3a4a3cae9118c58f13ad1a
- size 62168
data/samples/output2.wav DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b50c4df6f508a4367e5a49e90f974f8786c6d9ffb2599a8abcd25e693399735a
- size 105176
data/samples/output3.wav DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a25ce163d418cce5d8360a92ee39d9c8cfd75e4425f6bc8c3f9406186c882693
- size 70360
data/samples/output4.wav DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1a70d3d25137d5ae38e436effe9da895960750b6dacfde2345655ebd2c5a1b33
- size 67628
data/samples/output5.wav DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:33f58e7cc49a4e4fd4809d20cde2fb22855054cf61558be8ffef347fc35ce8f2
- size 114732
gemma_inference.py ADDED
@@ -0,0 +1,280 @@
+ import torch
+ import soundfile as sf
+ import whisper
+ from transformers import AutoProcessor, Gemma3nForConditionalGeneration
+ from snac import SNAC
+ import os
+ import tempfile
+ from typing import Generator, Optional
+ import numpy as np
+ from utils.snac_utils import generate_audio_data, get_snac
+ from utils.vad import get_speech_timestamps, collect_chunks
+
+
+ class GemmaOmniInference:
+     """
+     Gemma 3n based inference engine for omni-mini.
+     Replaces the custom GPT with Gemma 3n for better conversational capabilities.
+     """
+
+     def __init__(self, device="cuda:0", model_id="google/gemma-3n-e4b-it"):
+         self.device = device
+         self.model_id = model_id
+
+         # Initialize models
+         print("Loading Gemma 3n model...")
+         self.model = Gemma3nForConditionalGeneration.from_pretrained(
+             model_id,
+             device_map="auto",
+             torch_dtype=torch.bfloat16,
+         ).eval()
+
+         self.processor = AutoProcessor.from_pretrained(model_id)
+
+         # Keep the audio processing models
+         print("Loading audio processing models...")
+         self.snacmodel = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device)
+         self.whispermodel = whisper.load_model("small").to(device)
+
+         print("Models loaded successfully!")
+
+     def warm_up(self):
+         """Warm up the models with one pass over a dummy audio file."""
+         print("Warming up models...")
+         dummy_audio = np.random.randn(16000).astype(np.float32)  # 1 second of dummy audio
+         with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+             sf.write(tmp.name, dummy_audio, 16000)
+         try:
+             for _ in self.run_audio_to_audio_stream(tmp.name):
+                 break  # Just run one iteration for warmup
+         except Exception:
+             pass
+         os.unlink(tmp.name)
+         print("Warmup completed!")
+
+     def audio_to_text(self, audio_path: str) -> str:
+         """Transcribe audio using Gemma 3n."""
+         messages = [
+             {
+                 "role": "system",
+                 "content": [{"type": "text", "text": "You are a helpful AI assistant. Transcribe the following audio accurately."}],
+             },
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "audio", "audio": audio_path},
+                     {"type": "text", "text": "Please transcribe this audio."},
+                 ],
+             },
+         ]
+
+         # Process with Gemma 3n
+         inputs = self.processor.apply_chat_template(
+             messages,
+             add_generation_prompt=True,
+             tokenize=True,
+             return_dict=True,
+             return_tensors="pt",
+         ).to(self.model.device)
+
+         input_len = inputs["input_ids"].shape[-1]
+
+         with torch.inference_mode():
+             generation = self.model.generate(
+                 **inputs,
+                 max_new_tokens=200,
+                 do_sample=False,  # greedy decoding; sampling parameters would be ignored
+             )
+             generation = generation[0][input_len:]
+
+         decoded = self.processor.decode(generation, skip_special_tokens=True)
+         return decoded.strip()
+
+     def text_to_text(self, text: str, conversation_history: Optional[list] = None) -> str:
+         """Generate a text response using Gemma 3n."""
+         # Build conversation messages
+         messages = [
+             {
+                 "role": "system",
+                 "content": [{"type": "text", "text": "You are a helpful AI assistant. Respond naturally and conversationally."}],
+             }
+         ]
+
+         # Add conversation history if provided
+         if conversation_history:
+             messages.extend(conversation_history)
+
+         # Add current user message
+         messages.append({
+             "role": "user",
+             "content": [{"type": "text", "text": text}],
+         })
+
+         # Process with Gemma 3n
+         inputs = self.processor.apply_chat_template(
+             messages,
+             add_generation_prompt=True,
+             tokenize=True,
+             return_dict=True,
+             return_tensors="pt",
+         ).to(self.model.device)
+
+         input_len = inputs["input_ids"].shape[-1]
+
+         with torch.inference_mode():
+             generation = self.model.generate(
+                 **inputs,
+                 max_new_tokens=500,
+                 do_sample=True,
+                 temperature=0.9,
+                 top_p=0.95,
+             )
+             generation = generation[0][input_len:]
+
+         decoded = self.processor.decode(generation, skip_special_tokens=True)
+         return decoded.strip()
+
+     def text_to_audio(self, text: str, output_path: Optional[str] = None) -> str:
+         """
+         Convert text to audio.
+         Placeholder: a proper implementation needs a text-to-speech model
+         (e.g. one that emits SNAC codes for self.snacmodel to decode).
+         """
+         # TODO: Implement proper text-to-speech
+         if output_path is None:
+             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
+                 output_path = tmp.name
+
+         # Placeholder: write 2 seconds of silence instead of synthesized speech
+         silence = np.zeros(16000 * 2)
+         sf.write(output_path, silence, 16000)
+
+         return output_path
+
+     def run_audio_to_audio_stream(self, audio_path: str, stream_stride: int = 4) -> Generator[bytes, None, None]:
+         """Audio-to-audio streaming inference using Gemma 3n."""
+         try:
+             # Step 1: Audio understanding with Gemma 3n
+             messages = [
+                 {
+                     "role": "system",
+                     "content": [{"type": "text", "text": "You are a helpful AI assistant. Listen to the audio and respond naturally."}],
+                 },
+                 {
+                     "role": "user",
+                     "content": [
+                         {"type": "audio", "audio": audio_path},
+                         {"type": "text", "text": "Please respond to what I said."},
+                     ],
+                 },
+             ]
+
+             inputs = self.processor.apply_chat_template(
+                 messages,
+                 add_generation_prompt=True,
+                 tokenize=True,
+                 return_dict=True,
+                 return_tensors="pt",
+             ).to(self.model.device)
+
+             input_len = inputs["input_ids"].shape[-1]
+
+             with torch.inference_mode():
+                 generation = self.model.generate(
+                     **inputs,
+                     max_new_tokens=300,
+                     do_sample=True,
+                     temperature=0.9,
+                     top_p=0.95,
+                 )
+                 generation = generation[0][input_len:]
+
+             response_text = self.processor.decode(generation, skip_special_tokens=True).strip()
+             print(f"Gemma 3n response: {response_text}")
+
+             # Step 2: Convert response text to audio (placeholder)
+             # TODO: Implement proper text-to-speech pipeline
+             # For now, yield dummy audio chunks for streaming
+             chunk_size = 4096
+             total_chunks = 10
+
+             for _ in range(total_chunks):
+                 # In practice, this would be real audio data from TTS
+                 dummy_chunk = np.random.randn(chunk_size).astype(np.float32) * 0.1
+                 audio_bytes = (dummy_chunk * 32767).astype(np.int16).tobytes()
+                 yield audio_bytes
+
+         except Exception as e:
+             print(f"Error in audio-to-audio streaming: {e}")
+             return
+
+     def process_conversation_turn(self, audio_path: str) -> tuple[str, str]:
+         """
+         Process a single conversation turn: audio input -> text response.
+         Returns (transcribed_text, response_text).
+         """
+         # Use Gemma 3n for both transcription and response
+         messages = [
+             {
+                 "role": "system",
+                 "content": [{"type": "text", "text": "You are a helpful AI assistant. Listen to the audio, understand what the user said, and respond naturally. First transcribe what you heard, then provide a response."}],
+             },
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "audio", "audio": audio_path},
+                     {"type": "text", "text": "Please transcribe what I said and then respond appropriately."},
+                 ],
+             },
+         ]
+
+         inputs = self.processor.apply_chat_template(
+             messages,
+             add_generation_prompt=True,
+             tokenize=True,
+             return_dict=True,
+             return_tensors="pt",
+         ).to(self.model.device)
+
+         input_len = inputs["input_ids"].shape[-1]
+
+         with torch.inference_mode():
+             generation = self.model.generate(
+                 **inputs,
+                 max_new_tokens=400,
+                 do_sample=True,
+                 temperature=0.8,
+                 top_p=0.95,
+             )
+             generation = generation[0][input_len:]
+
+         full_response = self.processor.decode(generation, skip_special_tokens=True).strip()
+
+         # Try to split transcription and response.
+         # This is a simple heuristic; in practice you'd need better parsing.
+         if ":" in full_response:
+             parts = full_response.split(":", 1)
+             transcription = parts[0].strip()
+             response = parts[1].strip()
+         else:
+             # Fallback: use the full response as both
+             transcription = full_response
+             response = full_response
+
+         return transcription, response
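
The __init__ above loads the SNAC codec, but text_to_audio writes silence and never uses it. As a minimal sketch of the API that placeholder could eventually target, assuming the published snac package interface (encode/decode over waveforms shaped (batch, 1, samples) at the codec's 24 kHz rate), an encode/decode round-trip looks like the following; the missing piece, which nothing in this commit provides, is a TTS stage that produces SNAC codes from text:

# Sketch only (not part of the commit): SNAC encode/decode round-trip,
# illustrating the codec API text_to_audio could target once a TTS model
# emits SNAC codes. Output filename is illustrative.
import torch
import soundfile as sf
from snac import SNAC

snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()

with torch.inference_mode():
    wav = torch.randn(1, 1, 24000)        # (batch, channels, samples): 1 s at 24 kHz
    codes = snac_model.encode(wav)        # list of hierarchical code tensors
    audio_hat = snac_model.decode(codes)  # reconstructed (1, 1, samples) waveform

sf.write("roundtrip.wav", audio_hat.squeeze().cpu().numpy(), 24000)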
hf_space_app.py ADDED
@@ -0,0 +1,228 @@
+ import gradio as gr
+ import os
+ import tempfile
+ import soundfile as sf
+ import numpy as np
+ from gemma_inference import GemmaOmniInference
+ import torch
+
+ # Global inference engine
+ inference_engine = None
+
+
+ def initialize_model():
+     """Initialize the Gemma 3n inference engine."""
+     global inference_engine
+     try:
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         print(f"Using device: {device}")
+
+         inference_engine = GemmaOmniInference(device=device)
+         inference_engine.warm_up()
+         return "✅ Model loaded successfully!"
+     except Exception as e:
+         return f"❌ Error loading model: {str(e)}"
+
+
+ def process_audio(audio_input, conversation_history):
+     """Process audio input and generate a response."""
+     global inference_engine
+
+     if inference_engine is None:
+         return "❌ Model not initialized. Please wait for the model to load.", conversation_history, None
+
+     if audio_input is None:
+         return "❌ No audio input provided.", conversation_history, None
+
+     try:
+         # Normalize the input to a WAV file on disk
+         if isinstance(audio_input, tuple):
+             # gr.Audio(type="numpy") yields (sample_rate, np.ndarray)
+             sample_rate, audio_data = audio_input
+             with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+                 audio_path = tmp_file.name
+             sf.write(audio_path, audio_data, sample_rate)
+             is_temp = True
+         else:
+             # Already a file path on disk; must not be deleted afterwards
+             audio_path = audio_input
+             is_temp = False
+
+         # Process with Gemma 3n
+         transcription, response = inference_engine.process_conversation_turn(audio_path)
+
+         # Update conversation history
+         updated_history = conversation_history + [
+             {"role": "user", "content": transcription},
+             {"role": "assistant", "content": response},
+         ]
+
+         # Format conversation for display
+         conversation_display = ""
+         for turn in updated_history:
+             role = "🧑 User" if turn["role"] == "user" else "🤖 Assistant"
+             conversation_display += f"{role}: {turn['content']}\n\n"
+
+         # Clean up the temporary file
+         if is_temp and os.path.exists(audio_path):
+             os.unlink(audio_path)
+
+         return conversation_display, updated_history, response
+
+     except Exception as e:
+         return f"❌ Error processing audio: {str(e)}", conversation_history, None
+
+
+ def process_text_input(text_input, conversation_history):
+     """Process text input and generate a response."""
+     global inference_engine
+
+     if inference_engine is None:
+         return "❌ Model not initialized. Please wait for the model to load.", conversation_history
+
+     if not text_input.strip():
+         return "❌ No text input provided.", conversation_history
+
+     try:
+         # Generate response using Gemma 3n
+         response = inference_engine.text_to_text(text_input, conversation_history)
+
+         # Update conversation history
+         updated_history = conversation_history + [
+             {"role": "user", "content": text_input},
+             {"role": "assistant", "content": response},
+         ]
+
+         # Format conversation for display
+         conversation_display = ""
+         for turn in updated_history:
+             role = "🧑 User" if turn["role"] == "user" else "🤖 Assistant"
+             conversation_display += f"{role}: {turn['content']}\n\n"
+
+         return conversation_display, updated_history
+
+     except Exception as e:
+         return f"❌ Error processing text: {str(e)}", conversation_history
+
+
+ def clear_conversation():
+     """Clear the conversation history."""
+     return "", []
+
+
+ def create_interface():
+     """Create the Gradio interface."""
+
+     with gr.Blocks(title="Omni-Mini with Gemma 3n", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("""
+         # 🎙️ Omni-Mini with Gemma 3n
+
+         A multimodal AI assistant powered by Google's Gemma 3n model.
+         You can interact using voice or text!
+
+         **Features:**
+         - 🎤 Voice input with automatic transcription
+         - 💬 Text-based conversation
+         - 🧠 Powered by the Gemma 3n E4B model
+         - 🌍 Supports 140+ languages
+         """)
+
+         # Model status
+         model_status = gr.Textbox(
+             label="Model Status",
+             value="🔄 Loading model...",
+             interactive=False,
+         )
+
+         # Conversation history (hidden state)
+         conversation_history = gr.State([])
+
+         # Main conversation display
+         conversation_display = gr.Textbox(
+             label="Conversation",
+             value="",
+             lines=15,
+             max_lines=20,
+             interactive=False,
+             placeholder="Your conversation will appear here...",
+         )
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 gr.Markdown("### 🎤 Voice Input")
+                 audio_input = gr.Audio(
+                     label="Record your voice",
+                     type="numpy",
+                     format="wav",
+                 )
+
+                 audio_submit = gr.Button("🎤 Send Voice Message", variant="primary")
+
+             with gr.Column(scale=1):
+                 gr.Markdown("### 💬 Text Input")
+                 text_input = gr.Textbox(
+                     label="Type your message",
+                     placeholder="Enter your message here...",
+                     lines=3,
+                 )
+
+                 text_submit = gr.Button("💬 Send Text Message", variant="primary")
+
+         with gr.Row():
+             clear_btn = gr.Button("🗑️ Clear Conversation", variant="secondary")
+
+         # Last response display
+         last_response = gr.Textbox(
+             label="Last Response",
+             value="",
+             lines=3,
+             interactive=False,
+             placeholder="The assistant's last response will appear here...",
+         )
+
+         # Event handlers
+         audio_submit.click(
+             process_audio,
+             inputs=[audio_input, conversation_history],
+             outputs=[conversation_display, conversation_history, last_response],
+         )
+
+         text_submit.click(
+             process_text_input,
+             inputs=[text_input, conversation_history],
+             outputs=[conversation_display, conversation_history],
+         )
+
+         text_input.submit(
+             process_text_input,
+             inputs=[text_input, conversation_history],
+             outputs=[conversation_display, conversation_history],
+         )
+
+         clear_btn.click(
+             clear_conversation,
+             outputs=[conversation_display, conversation_history],
+         )
+
+         # Initialize the model on load
+         demo.load(
+             initialize_model,
+             outputs=[model_status],
+         )
+
+         gr.Markdown("""
+         ---
+
+         **Note:** This is a demo implementation. The audio-to-audio pipeline is simplified.
+         A full implementation would need additional text-to-speech capabilities.
+
+         **Powered by:**
+         - 🧠 Google Gemma 3n E4B
+         - 🎤 OpenAI Whisper
+         - 🔊 SNAC Audio Codec
+         """)
+
+     return demo
+
+
+ if __name__ == "__main__":
+     # Create and launch the interface
+     demo = create_interface()
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         show_error=True,
+     )
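
To exercise the text path outside the Gradio UI, a hypothetical smoke test, assuming gemma_inference.py is on the import path and the hardware can hold the model, could look like this (the prompt string is illustrative):

# Hypothetical smoke test (not part of the commit) for the text-to-text path.
import torch
from gemma_inference import GemmaOmniInference

device = "cuda:0" if torch.cuda.is_available() else "cpu"
engine = GemmaOmniInference(device=device)

history = []  # same role/content dicts the Space accumulates in gr.State
reply = engine.text_to_text("Hello! What can you do?", history)
print(reply)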
requirements.txt CHANGED
@@ -16,3 +16,7 @@ fastapi==0.112.4
  librosa==0.10.2.post1
  flask==3.0.3
  fire
+ # Gemma 3n dependencies
+ transformers>=4.53.0
+ accelerate
+ huggingface_hub
requirements_hf.txt ADDED
@@ -0,0 +1,12 @@
+ torch>=2.0.0
+ transformers>=4.53.0
+ accelerate
+ huggingface_hub
+ gradio
+ soundfile
+ numpy
+ snac==1.2.0
+ openai-whisper
+ librosa
+ scipy
+ torchaudio