import torch
import gradio as gr
import requests
import os
import tempfile
import librosa
from transformers import ClapModel, ClapProcessor
# Load official Hugging Face CLAP model and processor
processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")
model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()
# Preprocess audio and return the shape of its CLAP embedding
def classify_audio(audio, sr=48000):
    inputs = processor(audios=audio, sampling_rate=sr, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        embeddings = model.get_audio_features(**inputs)
    # For a single clip this is typically (1, 512) with laion/clap-htsat-unfused
    return embeddings.cpu().numpy().shape
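# Minimal zero-shot sketch (not part of the original app): CLAP embeds audio and
# text in a shared space, so the audio embedding above can be scored against
# text embeddings of candidate labels. The helper name and label list below are
# illustrative assumptions.
def zero_shot_scores(audio, sr=48000, labels=("speech", "music", "dog barking")):
    audio_inputs = processor(audios=audio, sampling_rate=sr, return_tensors="pt", padding=True)
    text_inputs = processor(text=list(labels), return_tensors="pt", padding=True)
    audio_inputs = {k: v.to(device) for k, v in audio_inputs.items()}
    text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
    with torch.no_grad():
        audio_emb = model.get_audio_features(**audio_inputs)  # shape (1, dim)
        text_emb = model.get_text_features(**text_inputs)     # shape (len(labels), dim)
    # Cosine similarity broadcasts (1, dim) against (len(labels), dim)
    sims = torch.nn.functional.cosine_similarity(audio_emb, text_emb)
    return dict(zip(labels, sims.cpu().tolist()))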
# 🎼 Classify uploaded audio
def classify_upload(audio_path):
    try:
        audio, sr = librosa.load(audio_path, sr=48000, mono=True)
        shape = classify_audio(audio, sr)
        return f"✅ Upload Successful → Embedding Shape: {shape}"
    except Exception as e:
        return f"❌ Upload Error: {str(e)}"
# 🌐 Classify audio via URL
def classify_url(audio_url):
    try:
        response = requests.get(audio_url, timeout=30)
        response.raise_for_status()
        file_ext = audio_url.split('.')[-1].lower()
        if file_ext not in ['wav', 'mp3', 'ogg']:
            return f"❌ Unsupported format: .{file_ext}"
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_ext}") as tmp:
            tmp.write(response.content)
            tmp_path = tmp.name
        try:
            audio, sr = librosa.load(tmp_path, sr=48000, mono=True)
        finally:
            os.remove(tmp_path)  # clean up the temporary download
        shape = classify_audio(audio, sr)
        return f"✅ URL Classified → Embedding Shape: {shape}"
    except requests.exceptions.Timeout:
        return "❌ Error: Request timed out"
    except Exception as e:
        return f"❌ URL Error: {str(e)}"
# Gradio interfaces
upload_ui = gr.Interface(
    classify_upload, gr.Audio(type="filepath"), "text",
    title="Audtheia CLAP Audio Agent (Upload)",
    description="Upload audio (.wav/.mp3) to generate CLAP embeddings using the official LAION-CLAP model."
)
url_ui = gr.Interface(
    classify_url, "text", "text",
    title="Audtheia CLAP Audio Agent (URL)",
    description="Classify audio from direct URLs (.wav/.mp3/.ogg) using LAION-CLAP."
)
app = gr.TabbedInterface(
    [upload_ui, url_ui],
    ["Upload Audio", "HTTP Audio URL"],
    title="🛰️ Audtheia Multimodal CLAP Agent"
)
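# Hypothetical local smoke test ("sample.wav" is a placeholder path):
#   audio, sr = librosa.load("sample.wav", sr=48000, mono=True)
#   print(classify_audio(audio, sr))  # expect an embedding shape such as (1, 512)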
# Queue requests (max 10 pending) and launch the app
app.queue(max_size=10).launch()