VeuReu committed on
Commit
7153ef5
·
verified ·
1 Parent(s): b41d5dd

Create asr_client.py

Browse files
Files changed (1) hide show
  1. asr_client.py +132 -0
asr_client.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ os.environ["CUDA_VISIBLE_DEVICES"] = "1"
3
+
4
+ from gradio_client import Client, handle_file
5
+ from typing import Any, Dict, List
6
+ from PIL import Image
7
+ import json
8
+
9
# Shared client for the remote "VeuReu/asr" Space. It is created once, at
# import time, and reused by every helper function in this module.
asr_client = Client("VeuReu/asr")
11
+
12
+
13
def extract_audio_from_video(video_path: str) -> str:
    """
    Request audio extraction for a video from the remote VeuReu/asr Space.

    The video is wrapped with ``handle_file`` and submitted to the
    ``/extract_audio_ffmpeg`` endpoint through the module-level ``asr_client``.

    Parameters
    ----------
    video_path : str
        Location of the video to send; passed unchanged to ``handle_file``.

    Returns
    -------
    str
        The endpoint's response, forwarded untouched. Presumably a path to
        the extracted audio file -- confirm against the Space's API.
    """
    # The endpoint receives the file nested under a "video" key.
    video_payload = {"video": handle_file(video_path)}
    return asr_client.predict(
        video_file=video_payload,
        api_name="/extract_audio_ffmpeg",
    )
34
+
35
+
36
def diarize_audio(audio_path: str) -> str:
    """
    Request speaker diarization for an audio file from the VeuReu/asr Space.

    The audio is wrapped with ``handle_file`` and submitted to the
    ``/diaritzar_audio`` endpoint through the module-level ``asr_client``.

    Parameters
    ----------
    audio_path : str
        Location of the audio to send; passed unchanged to ``handle_file``.

    Returns
    -------
    str
        The endpoint's response, forwarded untouched. Presumably a
        description of speaker segments and timings -- confirm the exact
        shape against the Space's API.
    """
    audio_upload = handle_file(audio_path)
    return asr_client.predict(
        wav_archivo=audio_upload,
        api_name="/diaritzar_audio",
    )
58
+
59
+
60
def transcribe_long_audio(audio_path: str) -> str:
    """
    Request a transcription from the long-audio endpoint of VeuReu/asr.

    The audio is wrapped with ``handle_file`` and submitted to the
    ``/transcribe_long_audio`` endpoint through the module-level
    ``asr_client``.

    Parameters
    ----------
    audio_path : str
        Location of the audio to send; passed unchanged to ``handle_file``.

    Returns
    -------
    str
        The endpoint's response, forwarded untouched. Presumably the
        transcribed text -- confirm against the Space's API.
    """
    audio_upload = handle_file(audio_path)
    return asr_client.predict(
        wav_path=audio_upload,
        api_name="/transcribe_long_audio",
    )
82
+
83
+
84
def transcribe_short_audio(audio_path: str) -> str:
    """
    Request a transcription from the short-audio endpoint of VeuReu/asr.

    The audio is wrapped with ``handle_file`` and submitted to the
    ``/transcribe_wav`` endpoint through the module-level ``asr_client``.

    Parameters
    ----------
    audio_path : str
        Location of the audio to send; passed unchanged to ``handle_file``.

    Returns
    -------
    str
        The endpoint's response, forwarded untouched. Presumably the
        transcribed text -- confirm against the Space's API.
    """
    audio_upload = handle_file(audio_path)
    return asr_client.predict(
        wav_path=audio_upload,
        api_name="/transcribe_wav",
    )
105
+
106
+
107
def identificar_veu(clip_path: str, voice_col: List[Dict[str, Any]]) -> Any:
    """
    Request speaker identification for a clip from the VeuReu/asr Space.

    ``voice_col`` is serialized to a JSON string and sent, together with the
    clip wrapped by ``handle_file``, to the ``/identificar_veu`` endpoint
    through the module-level ``asr_client``.

    Parameters
    ----------
    clip_path : str
        Location of the audio clip to send; passed unchanged to
        ``handle_file``.
    voice_col : List[Dict[str, Any]]
        Collection of known-voice records. It must be JSON-serializable,
        because it is encoded with ``json.dumps`` before being sent.

    Returns
    -------
    Any
        The endpoint's response, forwarded untouched.

    Raises
    ------
    TypeError
        Raised by ``json.dumps`` when ``voice_col`` contains values that are
        not JSON-serializable.
    """
    # Serialize first so that a bad voice_col fails before any file handling.
    voice_col_str = json.dumps(voice_col)
    return asr_client.predict(
        wav_archivo=handle_file(clip_path),
        voice_col=voice_col_str,
        api_name="/identificar_veu",
    )