Kaworu17 committed on
Commit
78e1c98
·
verified ·
1 Parent(s): 4c682ef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -54
app.py CHANGED
@@ -1,93 +1,74 @@
1
  import torch
2
- import torchaudio
3
  import gradio as gr
4
- from transformers import ClapProcessor, ClapModel
5
- import tempfile
6
  import requests
7
- import os
 
 
8
 
9
- processor = ClapProcessor.from_pretrained("./")
10
- model = ClapModel.from_pretrained("./")
 
11
 
12
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
13
  model.to(device).eval()
14
 
15
- # Efficient waveform processing
16
- def preprocess_waveform(waveform, sr):
17
- if waveform.shape[0] > 1:
18
- waveform = waveform.mean(dim=0, keepdim=True)
19
- if sr != 48000:
20
- waveform = torchaudio.transforms.Resample(sr, 48000)(waveform)
21
- max_len = 240000 # 5 sec at 48kHz
22
- if waveform.shape[1] > max_len:
23
- waveform = waveform[:, :max_len]
24
- else:
25
- waveform = torch.nn.functional.pad(waveform, (0, max_len - waveform.shape[1]))
26
- return waveform
27
-
28
- # Generate embeddings safely
29
- def generate_embeddings(waveform):
30
- inputs = processor(audios=waveform, sampling_rate=48000, return_tensors="pt").to(device)
31
  with torch.no_grad():
32
- output = model(**inputs)
33
- return output.pooler_output.cpu().numpy().shape
34
 
35
- # Robust local file classification
36
  def classify_upload(audio_path):
37
  try:
38
- waveform, sr = torchaudio.load(audio_path)
39
- waveform = preprocess_waveform(waveform, sr)
40
- shape = generate_embeddings(waveform)
41
  return f"✅ Upload Successful — Embedding Shape: {shape}"
42
  except Exception as e:
43
  return f"❌ Upload Error: {str(e)}"
44
 
45
- # Robust URL classification with error handling and file format checks
46
  def classify_url(audio_url):
47
  try:
48
- response = requests.get(audio_url, timeout=25)
49
  response.raise_for_status()
50
-
51
- file_extension = audio_url.split('.')[-1].lower()
52
- if file_extension not in ['wav', 'mp3']:
53
- return f"❌ Unsupported file format: .{file_extension}"
54
 
55
- with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_extension}") as tmp:
 
 
 
 
56
  tmp.write(response.content)
57
  tmp_path = tmp.name
58
 
59
- waveform, sr = torchaudio.load(tmp_path)
60
- os.remove(tmp_path)
61
-
62
- waveform = preprocess_waveform(waveform, sr)
63
- shape = generate_embeddings(waveform)
64
  return f"✅ URL Classified — Embedding Shape: {shape}"
65
  except requests.exceptions.Timeout:
66
- return "❌ URL Error: Request Timeout"
67
  except Exception as e:
68
  return f"❌ URL Error: {str(e)}"
69
 
 
70
  upload_ui = gr.Interface(
71
- fn=classify_upload,
72
- inputs=gr.Audio(type="filepath", label="Upload Audio (.wav or .mp3)"),
73
- outputs="text",
74
- title="Audtheia CLAP Audio Agent",
75
- description="Generate CLAP embeddings from uploaded audio (.wav/.mp3).",
76
  )
77
 
78
  url_ui = gr.Interface(
79
- fn=classify_url,
80
- inputs="text",
81
- outputs="text",
82
- title="Audtheia CLAP Audio Agent (URL Input)",
83
- description="Provide direct audio URL (.wav/.mp3) to classify audio with CLAP.",
84
  )
85
 
86
  app = gr.TabbedInterface(
87
  [upload_ui, url_ui],
88
  ["Upload Audio", "HTTP Audio URL"],
89
- title="🛰️ Audtheia Multimodal CLAP Agent",
90
  )
91
 
92
- # Stable launch configuration
93
- app.queue().launch()
 
1
  import torch
 
2
  import gradio as gr
 
 
3
  import requests
4
+ import tempfile
5
+ import librosa
6
+ from transformers import ClapModel, ClapProcessor
7
 
8
# Load the CLAP processor and model from the same Hugging Face checkpoint.
CLAP_CHECKPOINT = "laion/clap-htsat-unfused"

processor = ClapProcessor.from_pretrained(CLAP_CHECKPOINT)
model = ClapModel.from_pretrained(CLAP_CHECKPOINT)

# Run on CUDA when torch reports it as available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
14
 
15
+ # Function to preprocess and classify audio
16
def classify_audio(audio, sr=48000):
    """Return the shape of the CLAP audio embedding computed for ``audio``.

    Args:
        audio: Audio samples, handed to the CLAP processor as ``audios``.
        sr: Sampling rate forwarded to the processor (defaults to 48000).

    Returns:
        The ``.shape`` of the embedding after it is moved to the CPU and
        converted to a NumPy array.
    """
    features = processor(audios=audio, sampling_rate=sr, return_tensors="pt", padding=True)

    # Move every processor output onto the device the model lives on.
    on_device = {}
    for name, tensor in features.items():
        on_device[name] = tensor.to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        audio_embedding = model.get_audio_features(**on_device)

    embedding_array = audio_embedding.cpu().numpy()
    return embedding_array.shape
22
 
23
+ # 🔼 Classify uploaded audio
24
def classify_upload(audio_path):
    """Report the CLAP embedding shape for an uploaded audio file.

    The file is decoded with ``librosa`` as mono audio resampled to 48 kHz
    and passed to ``classify_audio``.

    Args:
        audio_path: Filesystem path of the uploaded audio. An empty value
            (``None`` or ``""``) is rejected up front; presumably this is
            what the UI passes when nothing was uploaded (confirm against
            the Gradio version in use).

    Returns:
        A status string: the embedding shape on success, otherwise a
        message starting with "❌ Upload Error:".
    """
    # Fail with a clear message instead of surfacing whatever exception
    # the decoder raises for a missing path.
    if not audio_path:
        return "❌ Upload Error: No audio file provided"

    try:
        audio, sr = librosa.load(audio_path, sr=48000, mono=True)
        shape = classify_audio(audio, sr)
        return f"✅ Upload Successful — Embedding Shape: {shape}"
    except Exception as e:
        # UI boundary: report the failure to the user rather than crash.
        return f"❌ Upload Error: {str(e)}"
31
 
32
+ # 🌐 Classify audio via URL
33
def classify_url(audio_url):
    """Download audio from a direct URL and report its CLAP embedding shape.

    The file extension is read from the URL *path* only (query string and
    fragment are ignored) and is validated before anything is downloaded.
    The payload is written to a temporary file so ``librosa`` can decode
    it, and that file is always removed afterwards.

    Args:
        audio_url: Direct HTTP(S) URL of a .wav, .mp3 or .ogg file.

    Returns:
        A status string: the embedding shape on success, otherwise a
        message starting with "❌" that describes the failure.
    """
    import os
    from urllib.parse import urlparse

    tmp_path = None
    try:
        url = (audio_url or "").strip()

        # Take the extension from the path component so that
        # "clip.wav?token=a.b" is accepted and "notes.txt?x=.wav" is not.
        url_path = urlparse(url).path
        file_ext = os.path.splitext(url_path)[1].lstrip(".").lower()
        if file_ext not in ("wav", "mp3", "ogg"):
            return f"❌ Unsupported format: .{file_ext}"

        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # delete=False so the file can be reopened by path after closing;
        # the ``finally`` clause below is responsible for removing it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_ext}") as tmp:
            tmp_path = tmp.name
            tmp.write(response.content)

        audio, sr = librosa.load(tmp_path, sr=48000, mono=True)
        shape = classify_audio(audio, sr)
        return f"✅ URL Classified — Embedding Shape: {shape}"
    except requests.exceptions.Timeout:
        return "❌ Error: Request timed out"
    except Exception as e:
        # UI boundary: report the failure to the user rather than crash.
        return f"❌ URL Error: {str(e)}"
    finally:
        # Best-effort cleanup so repeated requests do not fill the disk.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
53
 
54
+ # Gradio interfaces
55
# Tab for embedding an uploaded audio file; the handler receives a file path.
upload_ui = gr.Interface(
    fn=classify_upload,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
    title="Audtheia CLAP Audio Agent (Upload)",
    description="Upload audio (.wav/.mp3) to generate CLAP embeddings using official LAION-CLAP.",
)
60
 
61
# Tab for embedding audio fetched from a URL typed into a text box.
url_ui = gr.Interface(
    fn=classify_url,
    inputs="text",
    outputs="text",
    title="Audtheia CLAP Audio Agent (URL)",
    description="Classify audio from direct URLs (.wav/.mp3/.ogg) using LAION-CLAP.",
)
66
 
67
# Combine the two interfaces into one tabbed app. The title emoji was
# mis-encoded text ("πŸ›°οΈ"); it is restored to the intended satellite emoji.
app = gr.TabbedInterface(
    interface_list=[upload_ui, url_ui],
    tab_names=["Upload Audio", "HTTP Audio URL"],
    title="🛰️ Audtheia Multimodal CLAP Agent",
)
72
 
73
# Enable the request queue (max_size=10), then start the app.
app.queue(max_size=10)
app.launch()