AndreasXi committed on
Commit
4e1c15b
·
verified ·
1 Parent(s): 9e01b09

Create README.md

Browse files
Files changed (1) hide show
  1. README.md +33 -0
README.md ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ```python
2
+ import torch
3
+ from transformers import AutoModel
4
+
5
+ audio_path = ['resources/1.wav', 'resources/2.wav'] # (B,)
6
+ caption = ["A woman speaks, dishes clanking, food frying, and music plays", 'A power tool is heard with male speech.'] # (B,)
7
+ phrases = ['Speech', 'Dog', 'Cat', 'Frying', 'Dishes', 'Music', 'Vacuum', 'Type', 'Power tool'] # (N,)
8
+
9
+
10
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
11
+
12
+ model = AutoModel.from_pretrained("AndreasXi/FineLAP", trust_remote_code=True).to(device)
13
+ model.eval()
14
+
15
+ with torch.no_grad():
16
+     global_text_embeds = model.get_global_text_embeds(caption) # (B, d)
17
+     print(global_text_embeds.shape)
18
+
19
+     global_audio_embeds = model.get_global_audio_embeds(audio_path) # (B, d)
20
+     print(global_audio_embeds.shape)
21
+
22
+     dense_audio_embeds = model.get_dense_audio_embeds(audio_path) # (B, T, d)
23
+     print(dense_audio_embeds.shape)
24
+
25
+     clip_scores = model.get_clip_level_score(audio_path, caption) # (B, B)
26
+     print(clip_scores.shape)
27
+
28
+     frame_scores = model.get_frame_level_score(audio_path, phrases) # (B, N, T)
29
+     print(frame_scores.shape)
30
+
31
+ ## (Optional) Plot frame-level similarity; only supports a single audio file
32
+ model.plot_frame_level_score(audio_path[1], phrases, output_path="output/output_plot.png")
33
+ ```