mclemcrew committed on
Commit
33edd95
·
1 Parent(s): 4eca0ed

updates for audio url issue

Browse files
Files changed (1) hide show
  1. app.py +44 -50
app.py CHANGED
@@ -164,25 +164,46 @@ def generate_response(audio_data, message, chat_history=[]):
164
  {"role": "system", "content": system_prompt}
165
  ]
166
 
 
 
 
167
  # Add chat history (limited to last 3 turns)
168
  history_limit = min(len(chat_history), 3)
169
  for user_msg, bot_msg in chat_history[-history_limit:]:
170
- conversation.append({"role": "user", "content": user_msg})
171
- if bot_msg: # Skip None responses
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  conversation.append({"role": "assistant", "content": bot_msg})
173
 
174
- # Add current message with audio
175
  if audio_data is not None:
176
- # First message with audio
 
 
 
 
 
 
 
 
177
  conversation.append({
178
  "role": "user",
179
- "content": [
180
- {"type": "audio"}, # Audio will be added in preprocessing
181
- {"type": "text", "text": message}
182
- ]
183
  })
184
  else:
185
- # Text-only follow-up message
186
  conversation.append({
187
  "role": "user",
188
  "content": message
@@ -197,53 +218,17 @@ def generate_response(audio_data, message, chat_history=[]):
197
  )
198
 
199
  # Process inputs
200
- logger.info("Processing inputs")
201
  inputs = processor(
202
  text=text,
203
- audios=[audio_data] if audio_data is not None else None,
204
  return_tensors="pt",
205
  padding=True,
206
  truncation=True
207
  )
208
 
209
- # Move inputs to the same device as model
210
- device = next(model.parameters()).device
211
- inputs = {k: v.to(device) for k, v in inputs.items()}
212
-
213
- log_gpu_memory("Before generation")
214
-
215
- # Generate response
216
- logger.info("Generating response")
217
- with torch.no_grad():
218
- output = model.generate(
219
- **inputs,
220
- max_new_tokens=150,
221
- do_sample=True,
222
- temperature=0.7,
223
- top_p=0.9,
224
- use_cache=True,
225
- pad_token_id=processor.tokenizer.pad_token_id
226
- )
227
-
228
- # Decode only the new tokens
229
- generated_text = processor.batch_decode(
230
- output[:, inputs.input_ids.shape[1]:],
231
- skip_special_tokens=True
232
- )[0]
233
-
234
- # Clean up
235
- del inputs, output
236
- gc.collect()
237
- if torch.cuda.is_available():
238
- torch.cuda.empty_cache()
239
-
240
- log_gpu_memory("After generation")
241
-
242
- return generated_text
243
- except Exception as e:
244
- logger.error(f"Error generating response: {e}")
245
- return f"I encountered an error while processing your request: {str(e)}"
246
-
247
  # Create Gradio Interface
248
  def create_interface():
249
  """Create the Gradio interface"""
@@ -316,8 +301,17 @@ def create_interface():
316
  if not message or not message.strip():
317
  return chat_history, ""
318
 
 
 
 
 
 
 
 
 
 
319
  # Add user message to history
320
- chat_history.append((message, None))
321
  yield chat_history, ""
322
 
323
  try:
 
164
  {"role": "system", "content": system_prompt}
165
  ]
166
 
167
+ # Collect all audio samples in order
168
+ audios = []
169
+
170
  # Add chat history (limited to last 3 turns)
171
  history_limit = min(len(chat_history), 3)
172
  for user_msg, bot_msg in chat_history[-history_limit:]:
173
+ # Check if the user message is a string or already contains audio
174
+ if isinstance(user_msg, list) and any(item.get("type") == "audio" for item in user_msg):
175
+ # It's already in the right format with audio
176
+ conversation.append({"role": "user", "content": user_msg})
177
+
178
+ # Extract audio from this message
179
+ for item in user_msg:
180
+ if item.get("type") == "audio" and "audio_data" in item:
181
+ audios.append(item["audio_data"])
182
+ else:
183
+ # Regular text message
184
+ conversation.append({"role": "user", "content": user_msg})
185
+
186
+ # Add assistant response if available
187
+ if bot_msg:
188
  conversation.append({"role": "assistant", "content": bot_msg})
189
 
190
+ # Add current message with audio if available
191
  if audio_data is not None:
192
+ # Current message with audio
193
+ user_content = [
194
+ {"type": "audio", "audio_url": "audio_sample.wav"}, # Placeholder URL
195
+ {"type": "text", "text": message}
196
+ ]
197
+
198
+ # Store the actual audio data for processing
199
+ audios.append(audio_data)
200
+
201
  conversation.append({
202
  "role": "user",
203
+ "content": user_content
 
 
 
204
  })
205
  else:
206
+ # Text-only message
207
  conversation.append({
208
  "role": "user",
209
  "content": message
 
218
  )
219
 
220
  # Process inputs
221
+ logger.info(f"Processing inputs with {len(audios)} audio samples")
222
  inputs = processor(
223
  text=text,
224
+ audios=audios if audios else None,
225
  return_tensors="pt",
226
  padding=True,
227
  truncation=True
228
  )
229
 
230
+ # The rest of your function remains the same
231
+ # ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  # Create Gradio Interface
233
  def create_interface():
234
  """Create the Gradio interface"""
 
301
  if not message or not message.strip():
302
  return chat_history, ""
303
 
304
+ # If we have audio, format the user message as a list with audio and text
305
+ if audio_data is not None:
306
+ user_message = [
307
+ {"type": "audio", "audio_url": "audio_sample.wav", "audio_data": audio_data},
308
+ {"type": "text", "text": message}
309
+ ]
310
+ else:
311
+ user_message = message
312
+
313
  # Add user message to history
314
+ chat_history.append((user_message, None))
315
  yield chat_history, ""
316
 
317
  try: