File size: 3,191 Bytes
d56b9d9
 
0fea237
 
d56b9d9
0fea237
 
d56b9d9
dc382c8
d56b9d9
0fea237
dc382c8
 
d56b9d9
 
02c9b64
d56b9d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc382c8
 
 
 
 
 
 
 
02c9b64
0fea237
 
 
 
 
 
 
02c9b64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0fea237
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import gradio as gr
from io import BytesIO
import librosa
import numpy as np
from os import getenv
from PIL.Image import Image, open as open_image
import soundfile as sf
import requests
from tempfile import NamedTemporaryFile
import torch
from transformers import AutoProcessor


# Try to import the spaces GPU decorator (for Hugging Face Spaces); otherwise use a no-op decorator.
try:
    from spaces import GPU as spaces_gpu
except ImportError:
    # For local development the spaces package is not available, so provide a stand-in.
    def spaces_gpu(func=None, **_options):
        """No-op replacement for ``spaces.GPU`` used when ``spaces`` is not installed.

        Works both as a bare decorator (``@spaces_gpu``) and as a parameterized
        one (``@spaces_gpu(duration=60)``); any options are accepted and ignored.

        Args:
            func: The function being decorated when used without parentheses.
            **_options: Decorator options, ignored by this stand-in.

        Returns:
            ``func`` unchanged for the bare form, otherwise a decorator that
            returns its argument unchanged.
        """
        if callable(func):
            return func
        # Parameterized form: hand back an identity decorator.
        return lambda wrapped: wrapped

def get_pytorch_device() -> str:
    """Return the name of the preferred PyTorch device available on this machine.

    Backends are probed in order of preference: ``"cuda"``, ``"xpu"``,
    ``"mps"``, and finally ``"cpu"``.

    Returns:
        One of ``"cuda"``, ``"xpu"``, ``"mps"`` or ``"cpu"``.
    """
    if torch.cuda.is_available():  # Nvidia CUDA and AMD ROCm
        return "cuda"

    # torch.xpu does not exist on older PyTorch builds; treat a missing module as "not available"
    # instead of failing with AttributeError.
    xpu_backend = getattr(torch, "xpu", None)
    if xpu_backend is not None and xpu_backend.is_available():  # Intel XPU
        return "xpu"

    # torch.backends.mps is the long-standing availability probe for Apple Silicon.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():  # Apple Silicon
        return "mps"

    return "cpu"  # No accelerator found

def request_image(url: str) -> Image:
    """Download the image at ``url`` and open it as a PIL image.

    The request timeout, in whole seconds, is read from the ``REQUEST_TIMEOUT``
    environment variable. When the variable is unset or empty, a default of
    10 seconds is used.

    Args:
        url: Address of the image to fetch.

    Returns:
        The image opened from the response body.

    Raises:
        gr.Error: If the server answers with an HTTP error status, the request
            times out, or any other ``requests`` failure occurs.
        ValueError: If ``REQUEST_TIMEOUT`` is set to a non-integer value.
    """
    # int(None) raises TypeError, so fall back to a default when the variable is missing.
    timeout_seconds = int(getenv("REQUEST_TIMEOUT") or 10)
    try:
        response = requests.get(url, timeout=timeout_seconds)
        response.raise_for_status()
        return open_image(BytesIO(response.content))
    except requests.HTTPError as e:
        raise gr.Error(f"Failed to fetch image from URL because of HTTP error: {e.response.status_code} {e.response.text}") from e
    except requests.Timeout as e:
        raise gr.Error("Failed to fetch image from URL because the request timed out.") from e
    except requests.RequestException as e:
        raise gr.Error(f"Failed to fetch image from URL: {str(e)}") from e

def save_image_to_temp_file(image: Image) -> str:
    """Write ``image`` to a newly created temporary file and return its path.

    The file is saved in the image's own format, or as PNG when the image
    carries no format, and the file suffix is the lower-cased format name.
    The file is not deleted automatically.

    Args:
        image: The image to persist.

    Returns:
        Path of the temporary file holding the saved image.
    """
    # Images without a recorded format are written as PNG.
    image_format = image.format or 'PNG'
    # Reserve a uniquely named file, then release the handle so the image can be saved by path.
    with NamedTemporaryFile(delete=False, suffix=f".{image_format.lower()}") as placeholder:
        destination = placeholder.name
    image.save(destination, format=image_format)
    return destination

def get_model_sample_rate(model_id: str) -> int:
    """Look up the sampling rate expected by the model's feature extractor.

    Args:
        model_id: Identifier passed to ``AutoProcessor.from_pretrained``.

    Returns:
        The feature extractor's ``sampling_rate``, or 16000 if the processor
        cannot be loaded or does not expose one.
    """
    fallback_rate = 16000  # Most ASR models use 16kHz
    try:
        feature_extractor = AutoProcessor.from_pretrained(model_id).feature_extractor
        rate = feature_extractor.sampling_rate
    except Exception:
        # Best effort: any failure while loading or inspecting the processor falls back.
        rate = fallback_rate
    return rate

def resample_audio(target_sample_rate: int, audio: tuple[int, bytes | np.ndarray]) -> np.ndarray:
    sample_rate, audio_data = audio
    
    # Convert audio data to a numpy array if it’s bytes
    if isinstance(audio_data, bytes):
        audio_array = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
    elif isinstance(audio_data, np.ndarray):
        audio_array = audio_data.astype(np.float32)
    else:
        raise ValueError(f"Unsupported audio_data type: {type(audio_data)}")
    
    # Resample if sample rates don’t match.
    if sample_rate != target_sample_rate:
        audio_array = librosa.resample(audio_array, orig_sr=sample_rate, target_sr=target_sample_rate)
    
    return audio_array

def save_audio_to_temp_file(target_sample_rate: int, audio: tuple[int, bytes | np.ndarray]) -> str:
    """Resample ``audio`` and write it to a newly created temporary WAV file.

    The file is not deleted automatically.

    Args:
        target_sample_rate: Sample rate, in Hz, to resample to and to write the file with.
        audio: ``(sample_rate, audio_data)`` pair, passed through to ``resample_audio``.

    Returns:
        Path of the temporary ``.wav`` file.
    """
    # Resample first so that a conversion failure does not leave an empty file behind.
    samples = resample_audio(target_sample_rate, audio)
    # Reserve a uniquely named file, then release the handle so soundfile can write by path.
    with NamedTemporaryFile(delete=False, suffix='.wav') as placeholder:
        wav_path = placeholder.name
    sf.write(wav_path, samples, target_sample_rate, format='WAV')
    return wav_path