Spaces:

Sayiqa
/

Speech_

Sleeping

App Files Files Community

Speech_ / app.py

Sayiqa

Update app.py

665107f verified about 1 year ago

raw

history blame contribute delete

4.56 kB


	import asyncio
	import importlib
	import os
	import subprocess
	import sys
	import tempfile
	from functools import lru_cache

	import librosa
	import numpy as np
	import streamlit as st
	import torch
	from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
	from huggingface_hub import login
	from transformers import pipeline

	# Install missing packages (if required, handled manually for Streamlit environment)
	def install_missing_packages(required_packages=None):
	    """Try to import each required package and pip-install the ones that fail.

	    Args:
	        required_packages: Optional mapping of importable package name to a
	            pip version specifier (for example ``">=0.14.0"``) or ``None``
	            for "any version". When omitted, the app's default set
	            (librosa, diffusers, transformers, torch) is used.

	    Returns:
	        None. Progress and failures are reported with ``print``. The function
	        stops at the first package whose installation fails.
	    """
	    if required_packages is None:
	        required_packages = {
	            "librosa": None,
	            "diffusers": ">=0.14.0",
	            "transformers": None,
	            "torch": "==2.0.0+cu118",
	        }

	    for package, version in required_packages.items():
	        try:
	            importlib.import_module(package)
	        except ImportError:
	            package_name = f"{package}{version}" if version else package
	            print(f"Installing {package_name}...")
	            try:
	                # Run pip through the current interpreter so the package is
	                # installed into the environment this script is running in.
	                subprocess.check_call(
	                    [sys.executable, "-m", "pip", "install", package_name]
	                )
	            except subprocess.CalledProcessError as e:
	                print(f"Error installing {package_name}: {e}")
	                return

	install_missing_packages()

	# Authenticate with Hugging Face Hub.
	# The token is read from the HF_TOKEN environment variable; start-up is
	# aborted with a ValueError when it is missing or empty.
	hf_token = os.getenv("HF_TOKEN")
	if not hf_token:
	    raise ValueError("HF_TOKEN environment variable not set.")
	login(hf_token)

	# Load models
	speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

	# Pick the device before loading so the weights can use a matching dtype:
	# float16 is kept for CUDA, while CPU falls back to float32 because
	# half-precision inference is generally not usable on CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"
	text_to_image = StableDiffusionPipeline.from_pretrained(
	    "runwayml/stable-diffusion-v1-5",
	    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
	)
	text_to_image.to(device)
	text_to_image.enable_attention_slicing()
	# NOTE(review): this removes the pipeline's safety checker - confirm that is
	# intended for a publicly reachable app.
	text_to_image.safety_checker = None
	text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)

	# Load an audio file and convert it to a float32 NumPy array
	def preprocess_audio(audio_path):
	    """Load ``audio_path`` resampled to 16 kHz and return float32 samples.

	    Failures are not raised: any exception is converted into a string that
	    starts with ``"Error in preprocessing audio: "`` and returned instead, so
	    callers must check whether the result is a ``str``.
	    """
	    try:
	        waveform, _sample_rate = librosa.load(audio_path, sr=16000)
	        samples = np.array(waveform, dtype=np.float32)
	    except Exception as exc:
	        message = f"Error in preprocessing audio: {exc}"
	        return message
	    return samples

	# Speech-to-text function
	async def transcribe_audio(audio_path):
	    """Transcribe the audio file at ``audio_path`` and return the text.

	    The call is deliberately not wrapped in ``functools.lru_cache``: caching
	    a coroutine function stores the coroutine object itself, and awaiting
	    that same object a second time raises ``RuntimeError``.

	    Args:
	        audio_path: Path of the audio file to transcribe.

	    Returns:
	        The transcription string on success. On failure, a message string
	        beginning with ``"Error in preprocessing audio: "`` or
	        ``"Error in transcription: "``; no exception is propagated.
	    """
	    try:
	        audio_array = preprocess_audio(audio_path)
	        if isinstance(audio_array, str):  # Error message from preprocessing
	            return audio_array
	        result = speech_to_text(audio_array)
	        return result["text"]
	    except Exception as e:
	        return f"Error in transcription: {e}"

	# Text-to-image function
	async def generate_image_from_text(text):
	    """Generate a 256x256 image from ``text`` with the text-to-image pipeline.

	    The call is deliberately not wrapped in ``functools.lru_cache``: caching
	    a coroutine function stores the coroutine object itself, and awaiting
	    that same object a second time raises ``RuntimeError``.

	    Args:
	        text: Prompt passed to the pipeline.

	    Returns:
	        The first generated image on success. On failure, a message string
	        beginning with ``"Error in image generation: "``; no exception is
	        propagated.
	    """
	    try:
	        return text_to_image(text, height=256, width=256).images[0]
	    except Exception as e:
	        return f"Error in image generation: {e}"

	# Combined processing function
	async def process_audio_and_generate_image(audio_path):
	    """Transcribe ``audio_path`` and generate an image from the transcription.

	    The two steps run strictly one after the other because the image prompt
	    is the transcription; running them concurrently would let the image step
	    observe a transcription that has not been produced yet.

	    Args:
	        audio_path: Path of the audio file to process.

	    Returns:
	        A ``(image, text)`` tuple. On success ``image`` is the generated
	        image and ``text`` the transcription. When either step fails, or the
	        transcription is empty, ``image`` is ``None`` and ``text`` is the
	        error message (or the empty transcription).
	    """
	    transcription = await transcribe_audio(audio_path)

	    # The helpers report failures as strings that start with "Error in ";
	    # matching the prefix avoids rejecting speech that merely contains the
	    # word "Error".
	    if not transcription or transcription.startswith("Error in "):
	        return None, transcription

	    image = await generate_image_from_text(transcription)
	    if isinstance(image, str):  # Error message from image generation
	        return None, image

	    return image, transcription

	# Streamlit interface
	st.title("Voice-to-Image Generator")
	st.write("Upload an audio file to transcribe speech to text, and then generate an image based on the transcription.")

	audio_file = st.file_uploader("Upload audio file (WAV/MP3)", type=["wav", "mp3"])

	if audio_file:
	    # Store the upload under a generated temporary name. Only the extension
	    # of the client-supplied file name is reused, so the name cannot steer
	    # where the file is written.
	    suffix = os.path.splitext(audio_file.name)[1]
	    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
	        tmp.write(audio_file.read())
	        audio_path = tmp.name

	    try:
	        with st.spinner("Processing..."):
	            # asyncio.run creates a fresh event loop and closes it afterwards.
	            image, transcription = asyncio.run(
	                process_audio_and_generate_image(audio_path)
	            )
	    finally:
	        # Remove the temporary copy whether or not processing succeeded.
	        os.remove(audio_path)

	    if transcription and not transcription.startswith("Error in "):
	        st.subheader("Transcription")
	        st.write(transcription)

	        if image is not None:
	            st.subheader("Generated Image")
	            st.image(image, caption="Generated from transcription")
	        else:
	            st.error("Error in generating image.")
	    else:
	        # An empty transcription would otherwise render a blank error box.
	        st.error(transcription or "No speech could be transcribed from the audio.")