Update app.py

app.py CHANGED
@@ -1,93 +1,74 @@

Before (old app.py; lines removed in this commit are prefixed with -; several removed lines are truncated in this view):

import torch
-import torchaudio
import gradio as gr
-from transformers import ClapProcessor, ClapModel
-import tempfile
import requests
-import

-
-

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

-#
-def
-
-
-    if sr != 48000:
-        waveform = torchaudio.transforms.Resample(sr, 48000)(waveform)
-    max_len = 240000  # 5 sec at 48kHz
-    if waveform.shape[1] > max_len:
-        waveform = waveform[:, :max_len]
-    else:
-        waveform = torch.nn.functional.pad(waveform, (0, max_len - waveform.shape[1]))
-    return waveform
-
-# Generate embeddings safely
-def generate_embeddings(waveform):
-    inputs = processor(audios=waveform, sampling_rate=48000, return_tensors="pt").to(device)
    with torch.no_grad():
-
-    return

-#
def classify_upload(audio_path):
    try:
-
-
-        shape = generate_embeddings(waveform)
        return f"✅ Upload Successful – Embedding Shape: {shape}"
    except Exception as e:
        return f"❌ Upload Error: {str(e)}"

-#
def classify_url(audio_url):
    try:
-        response = requests.get(audio_url, timeout=
        response.raise_for_status()
-
-        file_extension = audio_url.split('.')[-1].lower()
-        if file_extension not in ['wav', 'mp3']:
-            return f"❌ Unsupported file format: .{file_extension}"

-
            tmp.write(response.content)
            tmp_path = tmp.name

-
-
-
-        waveform = preprocess_waveform(waveform, sr)
-        shape = generate_embeddings(waveform)
        return f"✅ URL Classified – Embedding Shape: {shape}"
    except requests.exceptions.Timeout:
-        return "❌
    except Exception as e:
        return f"❌ URL Error: {str(e)}"

upload_ui = gr.Interface(
-
-
-
-    title="Audtheia CLAP Audio Agent",
-    description="Generate CLAP embeddings from uploaded audio (.wav/.mp3).",
)

url_ui = gr.Interface(
-
-
-
-    title="Audtheia CLAP Audio Agent (URL Input)",
-    description="Provide direct audio URL (.wav/.mp3) to classify audio with CLAP.",
)

app = gr.TabbedInterface(
    [upload_ui, url_ui],
    ["Upload Audio", "HTTP Audio URL"],
-    title="🛰️ Audtheia Multimodal CLAP Agent"
)

-#
-app.queue().launch()
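
Several of the removed lines are truncated in the view above. Judging from the surviving body and the call preprocess_waveform(waveform, sr) inside the old classify_url, the removed preprocessing helper presumably read roughly as follows; the def line and the comment are inferred, not verbatim:

import torch
import torchaudio

def preprocess_waveform(waveform, sr):
    # Resample to 48 kHz, then truncate or zero-pad to exactly 5 seconds (240000 samples)
    if sr != 48000:
        waveform = torchaudio.transforms.Resample(sr, 48000)(waveform)
    max_len = 240000  # 5 sec at 48kHz
    if waveform.shape[1] > max_len:
        waveform = waveform[:, :max_len]
    else:
        waveform = torch.nn.functional.pad(waveform, (0, max_len - waveform.shape[1]))
    return waveform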

After (new app.py; lines added in this commit are prefixed with +):

import torch
import gradio as gr
import requests
+import tempfile
+import librosa
+from transformers import ClapModel, ClapProcessor

+# Load official Hugging Face CLAP model and processor
+processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")
+model = ClapModel.from_pretrained("laion/clap-htsat-unfused")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

+# Function to preprocess and classify audio
+def classify_audio(audio, sr=48000):
+    inputs = processor(audios=audio, sampling_rate=sr, return_tensors="pt", padding=True)
+    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
+        embeddings = model.get_audio_features(**inputs)
+    return embeddings.cpu().numpy().shape

+# 🎼 Classify uploaded audio
def classify_upload(audio_path):
    try:
+        audio, sr = librosa.load(audio_path, sr=48000, mono=True)
+        shape = classify_audio(audio, sr)
        return f"✅ Upload Successful – Embedding Shape: {shape}"
    except Exception as e:
        return f"❌ Upload Error: {str(e)}"

+# 🌐 Classify audio via URL
def classify_url(audio_url):
    try:
+        response = requests.get(audio_url, timeout=30)
        response.raise_for_status()

+        file_ext = audio_url.split('.')[-1].lower()
+        if file_ext not in ['wav', 'mp3', 'ogg']:
+            return f"❌ Unsupported format: .{file_ext}"
+
+        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_ext}") as tmp:
            tmp.write(response.content)
            tmp_path = tmp.name

+        audio, sr = librosa.load(tmp_path, sr=48000, mono=True)
+        shape = classify_audio(audio, sr)
        return f"✅ URL Classified – Embedding Shape: {shape}"
    except requests.exceptions.Timeout:
+        return "❌ Error: Request timed out"
    except Exception as e:
        return f"❌ URL Error: {str(e)}"

+# Gradio interfaces
upload_ui = gr.Interface(
+    classify_upload, gr.Audio(type="filepath"), "text",
+    title="Audtheia CLAP Audio Agent (Upload)",
+    description="Upload audio (.wav/.mp3) to generate CLAP embeddings using official LAION-CLAP."
)

url_ui = gr.Interface(
+    classify_url, "text", "text",
+    title="Audtheia CLAP Audio Agent (URL)",
+    description="Classify audio from direct URLs (.wav/.mp3/.ogg) using LAION-CLAP."
)

app = gr.TabbedInterface(
    [upload_ui, url_ui],
    ["Upload Audio", "HTTP Audio URL"],
+    title="🛰️ Audtheia Multimodal CLAP Agent"
)

+# Corrected Gradio queue configuration
+app.queue(max_size=10).launch()
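
For reference, the embedding path the new app relies on can be exercised outside Gradio. A minimal sketch, assuming a local "sample.wav" (a hypothetical file name, not part of this commit) and the same laion/clap-htsat-unfused checkpoint:

import torch
import librosa
from transformers import ClapModel, ClapProcessor

processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")
model = ClapModel.from_pretrained("laion/clap-htsat-unfused").eval()

# CLAP expects 48 kHz mono input; librosa resamples while loading
audio, sr = librosa.load("sample.wav", sr=48000, mono=True)  # hypothetical input file

inputs = processor(audios=audio, sampling_rate=sr, return_tensors="pt", padding=True)
with torch.no_grad():
    embeddings = model.get_audio_features(**inputs)

print(embeddings.shape)  # (batch, embedding_dim), e.g. torch.Size([1, 512]) for this checkpoint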