alpercagann committed
Commit b4447cb · 1 parent: b76b715

Create minimal app without torch dependency

Files changed (1)
  app.py  +28 −447
app.py CHANGED
@@ -1,457 +1,38 @@
-# app.py
 import os
 import sys
-import subprocess
-import gradio as gr
-import torch
-import traceback
-from datetime import datetime
-
-# Ensure required packages are installed
-try:
-    import requests
-    import tqdm
-    import re
-except ImportError:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "requests", "tqdm"])
-    import requests
-    import tqdm
-    import re
-
-# Asset management functions
-def get_gdrive_file_id(url):
-    """Extract file ID from Google Drive URL"""
-    match = re.search(r"d/([a-zA-Z0-9_-]+)", url) or re.search(r"id=([a-zA-Z0-9_-]+)", url)
-    if match:
-        return match.group(1)
-    return None
-
-def download_gdrive_file(file_id, destination):
-    """Download a file from Google Drive with support for large files"""
-    if os.path.exists(destination):
-        print(f"File already exists: {destination}")
-        return True
-
-    # Make the directory if it doesn't exist
-    os.makedirs(os.path.dirname(destination), exist_ok=True)
-
-    # First, try the direct download URL
-    url = f"https://drive.google.com/uc?export=download&id={file_id}"
-
-    # Set up a session to handle cookies
-    session = requests.Session()
-
-    # First request to get the confirmation token for large files
-    response = session.get(url, stream=True)
-
-    # Check if there's a download confirmation page
-    if "confirm" in response.url:
-        # Extract confirmation token
-        token = response.url.split("confirm=")[1].split("&")[0]
-        url = f"{url}&confirm={token}"
-        response = session.get(url, stream=True)
-
-    # Get file size for progress bar
-    total_size = int(response.headers.get('content-length', 0))
-
-    # Download the file with progress bar
-    print(f"Downloading to {destination} ({total_size/(1024*1024):.1f} MB)...")
-    with open(destination, 'wb') as f:
-        with tqdm.tqdm(total=total_size, unit='B', unit_scale=True) as pbar:
-            for chunk in response.iter_content(chunk_size=1024*1024):
-                if chunk:
-                    f.write(chunk)
-                    pbar.update(len(chunk))
-
-    print(f"Downloaded {destination} successfully!")
-    return True
-
-def check_and_download_assets():
-    """Check if required assets exist and download them if needed"""
-    # Define required files and their Google Drive URLs
-    gdrive_urls = {
-        "assets/fire_crackling.wav": "https://drive.google.com/file/d/1vOAZcbkpo_hre2g26n--lUXdwbTQp22k/view?usp=drive_link",
-        "assets/plastic_bag.wav": "https://drive.google.com/file/d/15igeDor7a47a-oluSCfO6GeUvFVl2ttb/view?usp=sharing",
-        "ckpts/landscape.pt": "https://drive.google.com/file/d/1-oTNIjCZq3_mGI1XRfzDyCnmjXCvd0Vh/view?usp=drive_link",
-        "ckpts/greatest_hits.pt": "https://drive.google.com/file/d/1wGDCB4iRFi4kf7bsFXV3qkc9_jvyNrCa/view?usp=drive_link",
-        "ckpts/audio_projector_landscape.pth": "https://drive.google.com/file/d/1BdjzRJOC8bvyPgrAkJJcCaN3EEJg3STm/view?usp=sharing",
-        "ckpts/audio_projector_gh.pth": "https://drive.google.com/file/d/19Uk68PXVOjE3TJl86H-IlMaM1URhU33a/view?usp=sharing",
-        "ckpts/CLAP_weights_2022.pth": "https://drive.google.com/file/d/1VK22jxHkFwpxknxQBLd6kIgO5WxQdLFP/view?usp=sharing"
-    }
-
-    # Create necessary directories
-    os.makedirs("assets", exist_ok=True)
-    os.makedirs("ckpts", exist_ok=True)
-
-    # Only download missing files
-    missing_files = {dest: url for dest, url in gdrive_urls.items() if not os.path.exists(dest)}
-
-    if missing_files:
-        print(f"Missing {len(missing_files)} required files. Downloading...")
-
-        for destination, url in missing_files.items():
-            file_id = get_gdrive_file_id(url)
-            if file_id:
-                try:
-                    download_gdrive_file(file_id, destination)
-                except Exception as e:
-                    print(f"Error downloading {destination}: {e}")
-                    return False
-            else:
-                print(f"Could not extract file ID from {url}")
-                return False
-
-    print("All required assets are available!")
-    return True
-
-
-# SonicDiffusion Controller Class
-class SonicDiffusionController:
-    def __init__(self, device=None):
-        if device is None:
-            self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        else:
-            self.device = device
-
-        print(f"Using device: {self.device}")
-        self.sr = 44100
-        self.model_loaded = False
-
-    def load_model(self,
-                   gate_dict_path="ckpts/landscape.pt",
-                   clap_path="CLAP/msclap",
-                   clap_weights="ckpts/CLAP_weights_2022.pth",
-                   adapter_ckpt_path="ckpts/audio_projector_landscape.pth"):
-        """Load the model conditionally based on environment and availability"""
-        try:
-            # First, check if the required files exist
-            for path in [gate_dict_path, adapter_ckpt_path, clap_weights]:
-                if not os.path.exists(path):
-                    return f"Error: Required file {path} not found"
-
-            print("Loading models - this may take a moment...")
-
-            # Import here to avoid import errors if files are missing
-            from unet2d_custom import UNet2DConditionModel
-            from pipeline_stable_diffusion_custom import StableDiffusionPipeline
-            from ldm.modules.encoders.audio_projector_res import Adapter
-
-            # Try to load the model with appropriate settings for the hardware
-            try:
-                model_id = "CompVis/stable-diffusion-v1-4"
-                self.unet = UNet2DConditionModel.from_pretrained(
-                    model_id,
-                    subfolder="unet",
-                    use_adapter_list=[False, True, True],
-                    low_cpu_mem_usage=True,
-                    device_map="auto" if self.device == "cuda" else None
-                )
-
-                self.pipeline = StableDiffusionPipeline.from_pretrained(
-                    model_id,
-                    use_safetensors=True,
-                    torch_dtype=torch.float16 if self.device == "cuda" else torch.float32
-                )
-
-                # Move models to the appropriate device
-                self.unet = self.unet.to(self.device)
-                self.pipeline = self.pipeline.to(self.device)
-
-            except Exception as e:
-                print(f"Warning: Encountered issue with full model loading: {e}")
-                print("Trying with simplified loading...")
-
-                # Simplified loading for compatibility
-                model_id = "CompVis/stable-diffusion-v1-4"
-                self.unet = UNet2DConditionModel.from_pretrained(
-                    model_id,
-                    subfolder="unet",
-                    use_adapter_list=[False, True, True],
-                    low_cpu_mem_usage=True
-                ).to(self.device)
-
-                self.pipeline = StableDiffusionPipeline.from_pretrained(
-                    model_id,
-                    use_safetensors=True
-                ).to(self.device)
-
-            # Load gate dictionary
-            gate_dict = torch.load(gate_dict_path, map_location=self.device)
-            for name, param in self.unet.named_parameters():
-                if "adapter" in name:
-                    param.data = gate_dict[name].to(self.device)
-
-            # Set pipeline's UNet
-            self.pipeline.unet = self.unet
-
-            # Import and load audio encoder
-            import sys
-            sys.path.append(clap_path)
-            try:
-                from CLAPWrapper import CLAPWrapper
-
-                self.audio_encoder = CLAPWrapper(clap_weights, use_cuda=(self.device=="cuda"))
-                self.audio_projector = Adapter(audio_token_count=77, transformer_layer_count=4).to(self.device)
-                self.audio_projector.load_state_dict(torch.load(adapter_ckpt_path, map_location=self.device))
-                self.audio_projector.eval()
-
-                self.model_loaded = True
-                print("Model loaded successfully!")
-                return "Model loaded successfully"
-
-            except ImportError as e:
-                return f"Error importing CLAP: {str(e)}. Make sure the CLAP module is available."
-
-        except Exception as e:
-            error_msg = f"Failed to load model: {str(e)}"
-            print(error_msg)
-            traceback.print_exc()
-            return error_msg
-
-    def generate(self, audio_model=None, audio=None, prompt=None, cfg_scale=5, num_inference_steps=50):
-        """Generate an image from audio input"""
-        if not self.model_loaded:
-            from PIL import Image, ImageDraw
-            img = Image.new('RGB', (512, 512), color=(255, 255, 255))
-            d = ImageDraw.Draw(img)
-            d.text((10, 250), "Error: Model not loaded. Click 'Load Model' first.", fill=(0, 0, 0))
-            return img
-
-        try:
-            if audio is None:
-                raise ValueError("No audio file provided")
-
-            if prompt is None or prompt.strip() == "":
-                prompt = "a high quality image"
-
-            with torch.no_grad():
-                # Process audio input
-                audio_emb, _ = self.audio_encoder.get_audio_embeddings([audio], resample=self.sr)
-                audio_proj = self.audio_projector(audio_emb.unsqueeze(1))
-
-                # Create unconditional embedding
-                audio_emb = torch.zeros(1, 1024).to(self.device)
-                audio_uc = self.audio_projector(audio_emb.unsqueeze(1))
-
-                # Combine for context
-                audio_context = torch.cat([audio_uc, audio_proj]).to(self.device)
-
-                # Generate image
-                print(f"Generating image with prompt: '{prompt}', CFG: {cfg_scale}, Steps: {num_inference_steps}")
-                image = self.pipeline(
-                    prompt=prompt,
-                    audio_context=audio_context,
-                    guidance_scale=cfg_scale,
-                    num_inference_steps=num_inference_steps
-                )
-
-                # Save a copy of the generated image
-                os.makedirs("outputs", exist_ok=True)
-                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-                output_path = f"outputs/generated_{timestamp}.png"
-                image.images[0].save(output_path)
-                print(f"Image saved to {output_path}")
-
-                return image.images[0]
-
-        except Exception as e:
-            error_msg = f"Error in generation: {str(e)}"
-            print(error_msg)
-            traceback.print_exc()
-
-            # Return a blank error image
-            from PIL import Image, ImageDraw
-            img = Image.new('RGB', (512, 512), color=(255, 255, 255))
-            d = ImageDraw.Draw(img)
-            d.text((10, 250), f"Error: {str(e)}", fill=(0, 0, 0))
-            return img
-
-    def update_audio_model(self, audio_model_update):
-        """Update audio model based on selection"""
-        try:
-            if not self.model_loaded:
-                return "Error: Model not loaded. Click 'Load Model' first."
-
-            if audio_model_update == "Landscape Model":
-                audio_projector_path = "ckpts/audio_projector_landscape.pth"
-                gate_dict_path = "ckpts/landscape.pt"
-            else:
-                audio_projector_path = "ckpts/audio_projector_gh.pth"
-                gate_dict_path = "ckpts/greatest_hits.pt"
-
-            # Check if files exist
-            if not os.path.exists(audio_projector_path) or not os.path.exists(gate_dict_path):
-                return f"Error: Required model files not found. Need {audio_projector_path} and {gate_dict_path}"
-
-            # Load gate dictionary and update parameters
-            gate_dict = torch.load(gate_dict_path, map_location=self.device)
-            for name, param in self.pipeline.unet.named_parameters():
-                if "adapter" in name:
-                    param.data = gate_dict[name].to(self.device)
-
-            # Load audio projector state
-            self.audio_projector.load_state_dict(torch.load(audio_projector_path, map_location=self.device))
-
-            return f"Model updated to {audio_model_update}"
-        except Exception as e:
-            error_msg = f"Error updating audio model: {str(e)}"
-            print(error_msg)
-            return error_msg
 
-# CSS for styling the UI
-css = """
-.gradio-container {
-    font-family: 'IBM Plex Sans', sans-serif;
-}
-.toolbutton {
-    margin-bottom: 0em;
-    max-width: 2em;
-    min-width: 2em !important;
-    height: 2em;
-}
-.output-image {
-    border-radius: 0.5rem;
-    border: 1px solid #cccccc;
-}
-.info-text {
-    font-size: 14px;
-    color: #666;
-    margin-top: 5px;
-}
-"""
 
-# Initialize controller
-controller = SonicDiffusionController()
 
-def ui():
-    with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
-        gr.Markdown(
-            """
-            # 🎵 SonicDiffusion: Audio-Driven Image Generation
-
-            Upload an audio file and enter a prompt to generate audio-conditioned images.
-
-            *This model transforms audio characteristics into visual elements.*
-            """
-        )
-
-        with gr.Row():
-            with gr.Column(scale=1):
-                # Left column - inputs
-                gr.Markdown("### Model Controls")
-
-                # Load model button - explicitly load the model when ready
-                load_model_button = gr.Button(value="1️⃣ Load Model (click first)", variant='primary')
-
-                with gr.Accordion("Model Selection", open=True):
-                    audio_model_dropdown = gr.Dropdown(
-                        label="Select SonicDiffusion model",
-                        value="Landscape Model",
-                        choices=["Landscape Model", "Greatest Hits Model"],
-                        interactive=True,
-                    )
-                    model_info = gr.Markdown("""
-                    **Landscape Model**: Optimized for nature and environment sounds
-
-                    **Greatest Hits**: Better with music and rhythmic sounds
-                    """)
-
-                # Audio input
-                audio_input = gr.Audio(label="2️⃣ Upload or Record Audio", sources=["upload", "microphone"], type="filepath")
-
-                # Prompt input
-                prompt_textbox = gr.Textbox(label="3️⃣ Enter Prompt", lines=2, placeholder="Describe the image you want to generate...")
-
-                with gr.Accordion("Advanced Settings", open=False):
-                    # Generation parameters
-                    with gr.Row():
-                        cfg_scale_slider = gr.Slider(label="Guidance Scale", value=7.5, minimum=1.0, maximum=20.0, info="Higher values = more prompt adherence")
-                        num_steps_slider = gr.Slider(label="Inference Steps", value=50, minimum=20, maximum=100, step=5, info="Higher values = more detail, slower generation")
-
-                # Generate button
-                generate_button = gr.Button(value="4️⃣ Generate Image", variant='primary', size="lg")
-
-                # Status indicator
-                status_text = gr.Textbox(label="Status", value="Click 'Load Model' to begin")
-
-                gr.Markdown("### Example Audio Files")
-                with gr.Row():
-                    examples = [
-                        ['./assets/fire_crackling.wav'],
-                        ['./assets/plastic_bag.wav'],
-                    ]
-                    gr.Examples(examples=examples, inputs=[audio_input])
-
-            with gr.Column(scale=1):
-                # Right column - output
-                gr.Markdown("### Generated Image")
-                output = gr.Image(label="Output Image", height=512, width=512)
-                download_btn = gr.Button("💾 Download Image")
-                output_info = gr.Markdown("""
-                *Generated images are also automatically saved to the 'outputs' folder.*
-
-                #### How SonicDiffusion Works
-
-                SonicDiffusion extracts features from audio files and uses them to condition a Stable Diffusion model.
-                The audio influences how the image is generated, with different sounds creating different visual effects.
-
-                Try experimenting with different audio files and prompts!
-                """)
-
-        # Event handlers
-        load_model_button.click(
-            fn=controller.load_model,
-            inputs=[],
-            outputs=[status_text]
-        )
-
-        audio_model_dropdown.change(
-            fn=controller.update_audio_model,
-            inputs=[audio_model_dropdown],
-            outputs=[status_text]
-        )
-
-        generate_button.click(
-            fn=controller.generate,
-            inputs=[
-                audio_model_dropdown,
-                audio_input,
-                prompt_textbox,
-                cfg_scale_slider,
-                num_steps_slider,
-            ],
-            outputs=[output]
-        )
-
-        download_btn.click(
-            fn=lambda x: x,
-            inputs=[output],
-            outputs=[output],
-            _js="(img) => { if(img) { const a = document.createElement('a'); a.href = img; a.download = 'sonicDiffusion_' + Date.now() + '.png'; a.click(); } return img; }"
-        )
-
-    return demo
 
 if __name__ == "__main__":
-    # Create necessary directories
-    os.makedirs("assets", exist_ok=True)
-    os.makedirs("ckpts", exist_ok=True)
-    os.makedirs("outputs", exist_ok=True)
-
-    # Check environment
-    print(f"Python version: {sys.version}")
-    print(f"PyTorch version: {torch.__version__}")
-    print(f"CUDA available: {torch.cuda.is_available()}")
-    if torch.cuda.is_available():
-        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
-
-    # Check and download assets if needed
-    print("Checking required assets...")
-    assets_ready = check_and_download_assets()
-    if not assets_ready:
-        print("Warning: Could not download all required assets. The app may not function correctly.")
 
     # Launch the demo
-    demo = ui()
-    demo.launch(share=True)
+# Minimal app.py that doesn't require torch
 import os
 import sys
 
+# Print environment information for debugging
+print("==== Environment Information ====")
+print(f"Python version: {sys.version}")
+print(f"Working directory: {os.getcwd()}")
+print(f"Directory contents: {os.listdir('.')}")
 
+# Simple Gradio interface
+import gradio as gr
 
+def hello(name):
+    if not name:
+        name = "World"
+    return f"Hello, {name}!"
 
+# Create a simple Gradio interface
+demo = gr.Interface(
+    fn=hello,
+    inputs="text",
+    outputs="text",
+    title="SonicDiffusion - Setup Test",
+    description="This is a test app to verify the environment is working."
+)
 
 if __name__ == "__main__":
+    # Try to print installed packages
+    try:
+        import subprocess
+        print("==== Installed Packages ====")
+        subprocess.run([sys.executable, "-m", "pip", "list"])
+    except Exception as e:
+        print(f"Error listing packages: {e}")
 
     # Launch the demo
+    demo.launch()
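
One possible follow-up, sketched rather than implemented: once the Space builds with this minimal file, the torch-dependent app can be restored behind a guarded import, so a missing torch install degrades back to the setup test instead of crashing at startup. The sketch below assumes only that gradio is installed; HAS_TORCH and build_demo are hypothetical names, not part of this commit.

# hypothetical sketch, not part of this commit
import gradio as gr

try:
    import torch  # the heavy dependency this commit removes
    HAS_TORCH = True
except ImportError:
    HAS_TORCH = False

def hello(name):
    return f"Hello, {name or 'World'}!"

def build_demo():
    if HAS_TORCH:
        # The full SonicDiffusion UI from the previous revision could be
        # rebuilt here (controller, model selection, generate button, etc.).
        pass
    # For now, fall back to the minimal setup-test interface either way.
    return gr.Interface(fn=hello, inputs="text", outputs="text",
                        title="SonicDiffusion - Setup Test")

if __name__ == "__main__":
    build_demo().launch()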