mclemcrew committed on
Commit
d11be63
·
1 Parent(s): ed7ce23
Files changed (1) hide show
  1. app.py +27 -14
app.py CHANGED
@@ -128,19 +128,29 @@ def generate_response(audio_path, message, chat_history=None):
128
  conversation.append({"role": "assistant", "content": bot_msg})
129
 
130
  if audio_path:
131
- # Match the format that was working in the original code
132
- # Create a placeholder audio URL to ensure the <|AUDIO|> token is generated
133
- conversation.append({
134
- "role": "user",
135
- "content": [
136
- {"type": "audio", "audio_url": "placeholder_audio_url"},
137
- {"type": "text", "text": message}
138
- ]
139
- })
140
-
141
  audio_data = process_audio_file(audio_path)
 
142
  if audio_data is not None:
 
 
 
 
 
 
143
  audios.append(audio_data)
 
 
 
 
 
 
 
 
 
 
 
 
144
  else:
145
  conversation.append({"role": "user", "content": message})
146
 
@@ -158,12 +168,17 @@ def generate_response(audio_path, message, chat_history=None):
158
  truncation=True
159
  )
160
 
 
 
 
 
 
 
 
161
  # Move inputs to the same device as model
162
  device = next(model.parameters()).device
163
  inputs = {k: v.to(device) for k, v in inputs.items()}
164
 
165
- logger.info(f"Processor output: {inputs}")
166
-
167
  log_gpu_memory("Before generation")
168
 
169
  logger.info("Generating response")
@@ -178,8 +193,6 @@ def generate_response(audio_path, message, chat_history=None):
178
  pad_token_id=processor.tokenizer.pad_token_id
179
  )
180
 
181
- logger.info(f"Model output: {output}")
182
-
183
  generated_text = processor.batch_decode(
184
  output[:, inputs["input_ids"].shape[1]:],
185
  skip_special_tokens=True
 
128
  conversation.append({"role": "assistant", "content": bot_msg})
129
 
130
  if audio_path:
131
+ # Process the audio first to ensure it's valid
 
 
 
 
 
 
 
 
 
132
  audio_data = process_audio_file(audio_path)
133
+
134
  if audio_data is not None:
135
+ # Log audio properties to verify it's loaded correctly
136
+ logger.info(f"Audio data loaded: length={len(audio_data)}, dtype={audio_data.dtype}, "
137
+ f"min={np.min(audio_data)}, max={np.max(audio_data)}, "
138
+ f"contains_nan={np.isnan(audio_data).any()}")
139
+
140
+ # Store audio data for processing
141
  audios.append(audio_data)
142
+
143
+ # Create conversation entry with audio
144
+ conversation.append({
145
+ "role": "user",
146
+ "content": [
147
+ {"type": "audio", "audio_url": "placeholder_audio_url"},
148
+ {"type": "text", "text": message}
149
+ ]
150
+ })
151
+ else:
152
+ logger.error("Failed to process audio file, continuing without audio")
153
+ conversation.append({"role": "user", "content": message})
154
  else:
155
  conversation.append({"role": "user", "content": message})
156
 
 
168
  truncation=True
169
  )
170
 
171
+ # Verify audio was included in inputs
172
+ logger.info(f"Inputs keys: {inputs.keys()}")
173
+ if 'audio_features' in inputs:
174
+ logger.info(f"Audio features shape: {inputs['audio_features'].shape}")
175
+ else:
176
+ logger.warning("No audio_features in processor output!")
177
+
178
  # Move inputs to the same device as model
179
  device = next(model.parameters()).device
180
  inputs = {k: v.to(device) for k, v in inputs.items()}
181
 
 
 
182
  log_gpu_memory("Before generation")
183
 
184
  logger.info("Generating response")
 
193
  pad_token_id=processor.tokenizer.pad_token_id
194
  )
195
 
 
 
196
  generated_text = processor.batch_decode(
197
  output[:, inputs["input_ids"].shape[1]:],
198
  skip_special_tokens=True