Spaces:

kepsmiling121
/

ljsjdwe

Sleeping

App Files Community

kepsmiling121 committed on Jan 16

Commit

d4a5fb6

verified ·

1 Parent(s): df2fec5

Create models/musicgen_model.py

Browse files

Files changed (1) hide show

models/musicgen_model.py +159 -0

models/musicgen_model.py ADDED Viewed

	@@ -0,0 +1,159 @@

+"""
+MusicGen model wrapper with advanced features
+"""
+import torch
+import numpy as np
+from typing import Optional, Dict, List
+from transformers import AutoProcessor, MusicgenForConditionalGeneration
+import scipy
+import logging
+logger = logging.getLogger(__name__)
+class MusicGenModel:
+    def __init__(self, model_id: str = "facebook/musicgen-small"):
+        self.model_id = model_id
+        self.processor = None
+        self.model = None
+        self.device = None
+        self._load_model()
+    def _load_model(self):
+        """Load model and processor"""
+        try:
+            logger.info(f"Loading MusicGen model: {self.model_id}")
+            # Set device
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+            # Load processor and model
+            self.processor = AutoProcessor.from_pretrained(self.model_id)
+            self.model = MusicgenForConditionalGeneration.from_pretrained(
+                self.model_id,
+                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
+            )
+            self.model.to(self.device)
+            self.model.eval()
+            logger.info(f"Model loaded successfully on {self.device}")
+        except Exception as e:
+            logger.error(f"Failed to load model: {str(e)}")
+            raise
+    def generate_from_text(
+        self,
+        prompt: str,
+        duration: int = 10,
+        guidance_scale: float = 3.0,
+        temperature: float = 1.0,
+        top_k: int = 50,
+        do_sample: bool = True
+    ) -> np.ndarray:
+        """Generate music from text prompt"""
+        try:
+            max_new_tokens = int(duration * 50)  # Rough conversion
+            inputs = self.processor(
+                text=[prompt],
+                padding=True,
+                return_tensors="pt",
+            ).to(self.device)
+            with torch.no_grad():
+                audio_values = self.model.generate(
+                    **inputs,
+                    do_sample=do_sample,
+                    guidance_scale=guidance_scale,
+                    temperature=temperature,
+                    top_k=top_k,
+                    max_new_tokens=max_new_tokens
+                )
+            return audio_values[0, 0].cpu().numpy()
+        except Exception as e:
+            logger.error(f"Text generation failed: {str(e)}")
+            raise
+    def generate_from_audio(
+        self,
+        audio_array: np.ndarray,
+        duration: int = 10,
+        guidance_scale: float = 3.0
+    ) -> np.ndarray:
+        """Generate music conditioned on input audio"""
+        try:
+            max_new_tokens = int(duration * 50)
+            inputs = self.processor(
+                audio=audio_array,
+                sampling_rate=16000,
+                padding=True,
+                return_tensors="pt",
+            ).to(self.device)
+            with torch.no_grad():
+                audio_values = self.model.generate(
+                    **inputs,
+                    do_sample=True,
+                    guidance_scale=guidance_scale,
+                    max_new_tokens=max_new_tokens
+                )
+            return audio_values[0, 0].cpu().numpy()
+        except Exception as e:
+            logger.error(f"Audio conditioning failed: {str(e)}")
+            raise
+    def generate_from_text_and_audio(
+        self,
+        prompt: str,
+        audio_array: np.ndarray,
+        duration: int = 10,
+        guidance_scale: float = 3.0
+    ) -> np.ndarray:
+        """Generate music from both text and audio"""
+        try:
+            max_new_tokens = int(duration * 50)
+            inputs = self.processor(
+                text=[prompt],
+                audio=audio_array,
+                sampling_rate=16000,
+                padding=True,
+                return_tensors="pt",
+            ).to(self.device)
+            with torch.no_grad():
+                audio_values = self.model.generate(
+                    **inputs,
+                    do_sample=True,
+                    guidance_scale=guidance_scale,
+                    max_new_tokens=max_new_tokens
+                )
+            return audio_values[0, 0].cpu().numpy()
+        except Exception as e:
+            logger.error(f"Combined generation failed: {str(e)}")
+            raise
+    def batch_generate(
+        self,
+        prompts: List[str],
+        duration: int = 10,
+        guidance_scale: float = 3.0
+    ) -> List[np.ndarray]:
+        """Generate multiple music samples"""
+        results = []
+        for prompt in prompts:
+            audio = self.generate_from_text(
+                prompt=prompt,
+                duration=duration,
+                guidance_scale=guidance_scale
+            )
+            results.append(audio)
+        return results