Add metadata and improve model card

#1
by nielsr HF Staff - opened
Files changed (1) hide show
  1. README.md +20 -14
README.md CHANGED
@@ -1,11 +1,19 @@
1
- # AuroLA
2
- This repo contains the checkpoint of the following paper:
 
 
 
3
 
4
- **Scaling Audio-Text Retrieval with Multimodal Large Language Model**
 
 
 
 
 
5
 
6
  ## Quick Start
7
 
8
- Try the model to extract audio and text features.
9
 
10
  ```python
11
  import torch
@@ -44,8 +52,8 @@ def get_embed_feature(hidden_states, input_ids, embed_index):
44
  embed_features = hidden_states[torch.arange(len(embed_indices)), embed_indices - 1]
45
  return embed_features
46
 
47
- # 1) Load model + processor (same style as Qwen2.5-Omni)
48
- model_path = "Jazzcharles/AuroLA-3B" # or your HF repo id
49
 
50
  device = "cuda" if torch.cuda.is_available() else "cpu"
51
  dtype = torch.bfloat16 if device == "cuda" else torch.float32
@@ -62,16 +70,13 @@ emb_token_ids = add_embed_token(tokenizer, model)
62
 
63
 
64
  # 2) Prepare retrieval inputs
65
- # audio paths and text queries can be any same-batch lists
66
  audio_files = [
67
- "/mnt/data/AudioCaps/audio/--0w1YA1Hm4_30.wav",
68
- "/mnt/data/AudioCaps/audio/-AheI8Epim4_30.wav",
69
- "/mnt/data/AudioCaps/audio/-BUWGM7qeUM_10.wav",
70
  ]
71
  text_queries = [
72
  "A vehicle driving as a man and woman are talking and laughing",
73
  "Muffled sounds followed by metal being hit",
74
- "Wind is blowing and heavy rain is falling and splashing",
75
  ]
76
 
77
  # Build audio-side messages
@@ -97,7 +102,8 @@ text_messages = [
97
  [
98
  {
99
  "role": "user",
100
- "content": [{"type": "text", "text": f"{t}\nSummarize above sentence in one word:"}],
 
101
  },
102
  {
103
  "role": "assistant",
@@ -119,10 +125,10 @@ with torch.inference_mode():
119
  text_out = model(**text_inputs, output_hidden_states=True, return_dict=True, use_audio_in_video=False)
120
  text_feat = get_embed_feature(text_out.hidden_states[-1], text_inputs['input_ids'], emb_token_ids)
121
 
122
- # 5) Similarity + top-k retrieval
123
  audio_feat = F.normalize(audio_feat, dim=-1)
124
  text_feat = F.normalize(text_feat, dim=-1)
125
- score = text_feat @ audio_feat.T # [N_text, N_audio]
126
  print(score.shape, score)
127
  ```
128
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: transformers
4
+ pipeline_tag: feature-extraction
5
+ ---
6
 
7
+ # AuroLA: Scaling Audio-Text Retrieval with Multimodal Large Language Models
8
+
9
+ AuroLA is a novel contrastive language-audio pre-training framework that re-purposes Multimodal Large Language Models (MLLMs) as a unified backbone for retrieval. It treats retrieval as a summarization task, using specific token hidden states as embeddings to align audio and text.
10
+
11
+ - **Paper:** [Scaling Audio-Text Retrieval with Multimodal Large Language Models](https://huggingface.co/papers/2602.18010)
12
+ - **Code:** [GitHub - Jazzcharles/AuroLA](https://github.com/Jazzcharles/AuroLA)
13
 
14
  ## Quick Start
15
 
16
+ Try the model to extract audio and text features. This requires the `qwen_omni_utils.py` script from the [official repository](https://github.com/Jazzcharles/AuroLA).
17
 
18
  ```python
19
  import torch
 
52
  embed_features = hidden_states[torch.arange(len(embed_indices)), embed_indices - 1]
53
  return embed_features
54
 
55
+ # 1) Load model + processor
56
+ model_path = "Jazzcharles/AuroLA-3B"
57
 
58
  device = "cuda" if torch.cuda.is_available() else "cpu"
59
  dtype = torch.bfloat16 if device == "cuda" else torch.float32
 
70
 
71
 
72
  # 2) Prepare retrieval inputs
 
73
  audio_files = [
74
+ "/path/to/audio1.wav",
75
+ "/path/to/audio2.wav",
 
76
  ]
77
  text_queries = [
78
  "A vehicle driving as a man and woman are talking and laughing",
79
  "Muffled sounds followed by metal being hit",
 
80
  ]
81
 
82
  # Build audio-side messages
 
102
  [
103
  {
104
  "role": "user",
105
+ "content": [{"type": "text", "text": f"{t}\nSummarize above sentence in one word:"}],
107
  },
108
  {
109
  "role": "assistant",
 
125
  text_out = model(**text_inputs, output_hidden_states=True, return_dict=True, use_audio_in_video=False)
126
  text_feat = get_embed_feature(text_out.hidden_states[-1], text_inputs['input_ids'], emb_token_ids)
127
 
128
+ # 5) Similarity
129
  audio_feat = F.normalize(audio_feat, dim=-1)
130
  text_feat = F.normalize(text_feat, dim=-1)
131
+ score = text_feat @ audio_feat.T  # [N_text, N_audio]
132
  print(score.shape, score)
133
  ```
134