hswift committed
Commit 1d43544 · verified · Parent: 6585c4c

Update app.py

Files changed (1)
app.py +25 -30
app.py CHANGED

@@ -6,6 +6,7 @@ from fastapi.middleware.cors import CORSMiddleware
 import io
 import torch
 from diffusers import AudioLDM2Pipeline
+from transformers import GPT2LMHeadModel  # <-- IMPORT THE CORRECT MODEL TYPE
 from scipy.io.wavfile import write as write_wav
 import numpy as np
 
@@ -14,12 +15,9 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 # --- CRITICAL FIX for Hugging Face Spaces Permissions ---
-# Set the cache directory for all Hugging Face libraries BEFORE they are used.
-# This forces the model download and any temporary files to a writable location.
 cache_dir = "/tmp/huggingface_cache"
 os.environ["HF_HOME"] = cache_dir
 os.environ["HUGGINGFACE_HUB_CACHE"] = cache_dir
-# Create the directory if it doesn't exist, just in case.
 os.makedirs(cache_dir, exist_ok=True)
 logger.info(f"Hugging Face cache directory globally set to: {cache_dir}")
 
@@ -27,7 +25,6 @@ logger.info(f"Hugging Face cache directory globally set to: {cache_dir}")
 app = FastAPI()
 
 # --- Model Loading ---
-# This section loads the AI model when the application starts.
 MODEL_REPO = "cvssp/audioldm2"
 pipeline = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -35,39 +32,45 @@ torch_dtype = torch.float16 if device == "cuda" else torch.float32
 
 try:
     logger.info(f"Attempting to load model '{MODEL_REPO}' on device: {device} with dtype: {torch_dtype}")
-    # The cache_dir argument is now redundant because of the environment variable,
-    # but we'll leave it for extra safety.
+
+    # --- FIX for Model Component Mismatch ---
+    # 1. Manually load the correct GPT2 model variant (GPT2LMHeadModel).
+    #    The sub-model 'gpt2' is used by audioldm2 for prompt understanding.
+    logger.info("Pre-loading the correct language model component (GPT2LMHeadModel)...")
+    language_model = GPT2LMHeadModel.from_pretrained(
+        "openai-community/gpt2", cache_dir=cache_dir
+    ).to(device)
+    logger.info("Language model component loaded successfully.")
+
+    # 2. Load the main pipeline, injecting our pre-loaded component.
+    #    This forces the pipeline to use the correct model and avoid the AttributeError.
     pipeline = AudioLDM2Pipeline.from_pretrained(
-        MODEL_REPO,
+        MODEL_REPO,
         torch_dtype=torch_dtype,
-        cache_dir=cache_dir
+        cache_dir=cache_dir,
+        language_model=language_model,  # <-- INJECT THE CORRECT COMPONENT
     )
     pipeline = pipeline.to(device)
     logger.info("Model loaded successfully and moved to device.")
 except Exception as e:
     logger.error(f"Fatal error during model loading: {e}", exc_info=True)
     # If the model fails to load, the 'pipeline' variable will remain None.
-    # The endpoint will then report an error.
 
 # --- CORS Middleware ---
-# Allows the frontend website to communicate with this API
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # Allows all origins
+    allow_origins=["*"],
     allow_credentials=True,
-    allow_methods=["*"],  # Allows all methods
-    allow_headers=["*"],  # Allows all headers
+    allow_methods=["*"],
+    allow_headers=["*"],
 )
 
 # --- API Endpoints ---
 @app.post("/generate-audio")
 async def generate_audio_endpoint(payload: dict):
-    """
-    Receives a text prompt and returns a generated WAV audio file.
-    """
     if pipeline is None:
         logger.error("Request received, but model is not loaded.")
-        raise HTTPException(status_code=503, detail="Model is not available or failed to load. Please check the server logs.")
+        raise HTTPException(status_code=503, detail="Model is not available or failed to load. Please check server logs.")
 
     prompt = payload.get("prompt")
     if not prompt:
@@ -76,25 +79,18 @@ async def generate_audio_endpoint(payload: dict):
     try:
         logger.info(f"Generating audio for prompt: '{prompt}'")
 
-        # Generate audio. The model works well with negative prompts to guide it.
         audio = pipeline(
             prompt,
-            negative_prompt="Low quality, noisy, muffled, mono",  # Helps improve quality
-            num_inference_steps=200,  # Higher steps can improve quality
-            audio_length_in_s=2.5  # Generate 2.5-second clips
+            negative_prompt="Low quality, noisy, muffled, mono",
+            num_inference_steps=200,
+            audio_length_in_s=2.5
        ).audios[0]
 
-        # The model output is a numpy array with float values from -1.0 to 1.0.
-        # We need to convert it to a 16-bit PCM WAV file.
-        sample_rate = 16000  # The model's default sample rate
-
-        # Scale to 16-bit integer range
+        sample_rate = 16000
         audio_int16 = np.int16(audio * 32767)
-
-        # Use io.BytesIO to build the WAV file in memory
        wav_file_in_memory = io.BytesIO()
         write_wav(wav_file_in_memory, sample_rate, audio_int16)
-        wav_file_in_memory.seek(0)  # Rewind to the beginning of the stream
+        wav_file_in_memory.seek(0)
 
         safe_filename = "".join(c for c in prompt if c.isalnum() or c in (' ', '_')).rstrip()[:50] + ".wav"
 
@@ -111,5 +107,4 @@ async def generate_audio_endpoint(payload: dict):
 
 @app.get("/")
 def read_root():
-    """A simple root endpoint to confirm the API is running."""
     return {"message": "Decent Sampler Audio Generation API is running."}
 
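A second caveat, relevant only if the Space has a GPU: the pipeline is loaded with torch_dtype=torch.float16 on CUDA, but the injected GPT2LMHeadModel is loaded in transformers' default float32, and pipeline.to(device) changes device, not dtype. If a dtype mismatch surfaces at inference time, a sketch of a fix (using transformers' standard torch_dtype keyword and the commit's own variables) is to load the component in the pipeline's dtype:

    # Sketch: load the injected component in the same dtype as the pipeline.
    language_model = GPT2LMHeadModel.from_pretrained(
        "openai-community/gpt2",
        cache_dir=cache_dir,
        torch_dtype=torch_dtype,  # float16 on CUDA, float32 on CPU
    ).to(device)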
 
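The endpoint's post-processing is compact enough to miss: the pipeline returns a float numpy array in roughly [-1.0, 1.0] at the 16 kHz rate hard-coded as sample_rate, which is scaled to 16-bit PCM and written to an in-memory WAV. A worked sketch of the same conversion; the np.clip guard is an addition, not in the commit, protecting against samples that stray just past ±1.0:

    import io
    import numpy as np
    from scipy.io.wavfile import write as write_wav

    audio = np.array([0.0, 0.5, -0.5, 1.0], dtype=np.float32)  # stand-in for pipeline output
    pcm = np.int16(np.clip(audio, -1.0, 1.0) * 32767)          # 0.5 -> 16383, 1.0 -> 32767

    buf = io.BytesIO()
    write_wav(buf, 16000, pcm)  # 16 kHz, matching sample_rate in app.py
    buf.seek(0)                 # rewind so the buffer can be streamed as a response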