Spaces:
Sleeping
Sleeping
| from gtts import gTTS | |
| from io import BytesIO | |
| import base64 | |
| from PIL import Image | |
| import cv2 | |
| import numpy as np | |
| import subprocess | |
| from speech_recognition import AudioFile, Recognizer | |
def tts(text: str, language="ja", encode=False) -> str:
    """Convert text to speech (MP3) with gTTS.

    Args:
        text (str): text to synthesize (e.g. the bot's generated answer)
        language (str): gTTS language code for the text (default "ja")
        encode (bool): if True, return the MP3 audio base64-encoded
            instead of writing it to disk

    Returns:
        str: base64-encoded MP3 data when ``encode`` is True, otherwise
        the path "temp.mp3" of the file the audio was saved to.
    """
    tts_object = gTTS(text=text, lang=language, slow=False)
    if encode:
        # Render the MP3 into memory and base64-encode it -- no temp file.
        buffer = BytesIO()
        tts_object.write_to_fp(buffer)
        buffer.seek(0)
        return base64.b64encode(buffer.getvalue()).decode()
    # Otherwise persist to a throwaway file; caller is responsible for cleanup.
    tts_object.save("temp.mp3")
    return "temp.mp3"
def stt(audio: object, language='ja') -> str:
    """Transcribe recorded user speech to text.

    Args:
        audio: record of user speech (path or file-like object)
        language (str): language of the speech

    Returns:
        text (str): recognized speech of user
    """
    recognizer = Recognizer()
    # Load the whole recording into memory...
    with AudioFile(audio) as source:
        captured = recognizer.record(source)
    # ...then hand it to Google's speech-to-text endpoint.
    return recognizer.recognize_google(captured, language=language)
def read_image_file(file) -> Image.Image:
    """Decode raw image bytes into a PIL image."""
    return Image.open(BytesIO(file))
def pil_to_base64(img, format="jpeg", encode=False):
    """Serialize a PIL image to a base64 string or to a temp file.

    Args:
        img: PIL image to serialize
        format (str): image format passed to ``img.save`` (default "jpeg")
        encode (bool): if True return base64 text, else write a temp file

    Returns:
        str: base64-encoded image data, or the temp file path.
    """
    if not encode:
        # Persist to a throwaway file named after the format.
        out_path = f"temp.{format}"
        img.save(out_path)
        return out_path
    # In-memory round trip: PIL image -> encoded bytes -> base64 text.
    buf = BytesIO()
    img.save(buf, format)
    buf.seek(0)
    return base64.b64encode(buf.getvalue()).decode("ascii")
def base64_to_pil(img_str):
    """Decode a base64 image string (optionally a data URL) into a PIL image."""
    # Strip a leading "data:image/...;base64," prefix if present.
    if "base64," in img_str:
        img_str = img_str.split(",")[1]
    raw = base64.b64decode(img_str)
    return Image.open(BytesIO(raw))
| def get_hist(image): | |
| hist = cv2.calcHist([np.array(image)], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256]) | |
| hist = cv2.normalize(hist, hist).flatten() | |
| return hist | |
def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
    """Decode an audio payload to mono float32 PCM via ffmpeg.

    Args:
        bpayload (bytes): raw audio file contents (any format ffmpeg reads)
        sampling_rate (int): target sample rate for the decoded audio

    Returns:
        np.ndarray: 1-D float32 array of samples (empty if ffmpeg produced
        no output).

    Raises:
        ValueError: if the ffmpeg executable is not installed.
    """
    ffmpeg_command = [
        "ffmpeg",
        "-i", "pipe:0",             # read input from stdin
        "-ac", "1",                 # downmix to mono
        "-ar", f"{sampling_rate}",  # resample to the requested rate
        "-f", "f32le",              # raw little-endian float32 samples
        "-hide_banner",
        "-loglevel", "quiet",
        "pipe:1",                   # write decoded audio to stdout
    ]
    try:
        # Context manager guarantees the pipes are closed and the child
        # process is reaped even if communicate() raises.
        with subprocess.Popen(
            ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE
        ) as ffmpeg_process:
            out_bytes = ffmpeg_process.communicate(bpayload)[0]
    except FileNotFoundError as exc:
        raise ValueError("ffmpeg was not found but is required to load audio files from filename") from exc
    return np.frombuffer(out_bytes, np.float32)