Spaces:

Sayiqa
/

text

Sleeping

App Files Files Community

text / app.py

Sayiqa

Create app.py

68add06 verified about 1 year ago

raw

history blame contribute delete

4.17 kB

	import subprocess
	import os
	import threading
	import numpy as np
	import librosa
	import gradio as gr
	from functools import lru_cache
	from transformers import pipeline
	from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
	import torch
	from huggingface_hub import login

	# Install required dependencies
	def install_missing_packages(required_packages=None):
	    """Install any required package that cannot currently be imported.

	    Args:
	        required_packages: Optional mapping of importable package name to a
	            pip version specifier (e.g. ``">=0.14.0"``) or ``None`` for "any
	            version". When omitted, the application's default set is used.

	    Raises:
	        subprocess.CalledProcessError: If a ``pip install`` invocation fails.

	    Note:
	        The version specifier is only applied when the package is missing;
	        an already-importable package is never upgraded. The check can only
	        help for packages that have not yet been imported by the caller.
	    """
	    # Local imports keep this block self-contained.
	    import importlib
	    import sys

	    if required_packages is None:
	        required_packages = {
	            "librosa": None,
	            "diffusers": ">=0.14.0",
	            "gradio": ">=3.35.2",
	            "huggingface_hub": None,
	            "accelerate": ">=0.20.1",
	            "transformers": ">=4.31.0",
	            "torch": ">=1.11.0",
	        }

	    for package, version in required_packages.items():
	        try:
	            importlib.import_module(package)
	        except ImportError:
	            requirement = f"{package}{version}" if version else package
	            # Invoke pip through the running interpreter so the package lands
	            # in this environment rather than whichever `pip` is first on PATH.
	            subprocess.check_call(
	                [sys.executable, "-m", "pip", "install", requirement]
	            )

	# Ensure every dependency listed above is installed before continuing.
	install_missing_packages()

	# Hugging Face token authentication: start-up is aborted when no token is
	# configured, otherwise the token is used to log in to the Hub.
	hf_token = os.environ.get("HF_TOKEN")
	if not hf_token:
	    raise ValueError("HF_TOKEN environment variable not set.")
	login(hf_token)

	# Load the speech-to-text model
	speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

	# Pick the device first so the weights can be loaded in a matching precision.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Load Stable Diffusion model.
	# Half precision is only requested on GPU; on CPU the pipeline is loaded in
	# float32 because float16 inference is not generally usable there.
	text_to_image = StableDiffusionPipeline.from_pretrained(
	    "runwayml/stable-diffusion-v1-5",
	    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
	)
	text_to_image.to(device)
	text_to_image.enable_attention_slicing()  # Optimizes memory usage
	text_to_image.safety_checker = None  # Disables safety checker
	# Swap the default scheduler for DPM-Solver++ while reusing its configuration.
	text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(
	    text_to_image.scheduler.config
	)

	# Preprocess audio file into NumPy array
	def preprocess_audio(audio_path):
	    """Load an audio file and return its samples as a float32 NumPy array.

	    The audio is resampled to 16 kHz while loading.

	    Args:
	        audio_path: Path of the audio file to load.

	    Returns:
	        A ``numpy.float32`` array of samples on success, or a string starting
	        with ``"Error in preprocessing audio: "`` if anything goes wrong
	        (callers distinguish the two cases by type).
	    """
	    target_rate = 16000  # Resample to 16kHz
	    try:
	        loaded = librosa.load(audio_path, sr=target_rate)
	        samples = loaded[0]
	        return np.array(samples, dtype=np.float32)
	    except Exception as exc:
	        message = str(exc)
	        return f"Error in preprocessing audio: {message}"

	# Transcribe audio to text
	class _AudioPreprocessingError(Exception):
	    """Carries the error message produced by the audio preprocessing step."""


	@lru_cache(maxsize=10)
	def _transcribe_audio_cached(audio_path):
	    """Transcribe ``audio_path``; raises on failure so errors are never cached."""
	    audio_array = preprocess_audio(audio_path)
	    if isinstance(audio_array, str):  # Error message from preprocessing
	        raise _AudioPreprocessingError(audio_array)
	    result = speech_to_text(audio_array)
	    return result["text"]


	def transcribe_audio(audio_path):
	    """Transcribe an audio file to text, caching successful results by path.

	    Args:
	        audio_path: Path of the audio file to transcribe.

	    Returns:
	        The transcribed text on success. On failure, a string starting with
	        ``"Error in preprocessing audio: "`` (loading failed) or
	        ``"Error in transcription: "`` (anything else failed).

	    Only successful transcriptions are memoised: ``lru_cache`` does not store
	    a call that raises, so a transient failure is retried on the next call
	    instead of being replayed from the cache.
	    """
	    try:
	        return _transcribe_audio_cached(audio_path)
	    except _AudioPreprocessingError as e:
	        # Pass the preprocessing message through unchanged.
	        return str(e)
	    except Exception as e:
	        return f"Error in transcription: {str(e)}"


	# Keep the lru_cache helper attributes available on the public name.
	transcribe_audio.cache_info = _transcribe_audio_cached.cache_info
	transcribe_audio.cache_clear = _transcribe_audio_cached.cache_clear

	# Generate image from text
	@lru_cache(maxsize=10)
	def _generate_image_cached(text):
	    """Generate an image for ``text``; raises on failure so errors are never cached."""
	    return text_to_image(text, height=512, width=512).images[0]


	def generate_image_from_text(text):
	    """Generate a 512x512 image from a text prompt, caching successes by prompt.

	    Args:
	        text: Prompt passed to the Stable Diffusion pipeline.

	    Returns:
	        The first image produced by the pipeline on success, or a string
	        starting with ``"Error in image generation: "`` on failure.

	    Only successful generations are memoised: ``lru_cache`` does not store a
	    call that raises, so a transient failure is retried on the next call
	    instead of being replayed from the cache.
	    """
	    try:
	        return _generate_image_cached(text)
	    except Exception as e:
	        return f"Error in image generation: {str(e)}"


	# Keep the lru_cache helper attributes available on the public name.
	generate_image_from_text.cache_info = _generate_image_cached.cache_info
	generate_image_from_text.cache_clear = _generate_image_cached.cache_clear

	# Process audio input (speech-to-image)
	def speech_to_image(audio_path):
	    """Transcribe an audio file and generate an image from the transcription.

	    Args:
	        audio_path: Path of the uploaded audio file.

	    Returns:
	        The generated image (always a single value, matching the single
	        image output this function is wired to).

	    Raises:
	        gr.Error: If transcription or image generation failed; Gradio shows
	            the message to the user instead of trying to render it as an
	            image.
	    """
	    transcription = transcribe_audio(audio_path)
	    # Match only the exact prefixes the helpers emit on failure, so speech
	    # that merely contains the word "Error" is still treated as a prompt.
	    error_prefixes = ("Error in transcription:", "Error in preprocessing audio:")
	    if transcription.startswith(error_prefixes):
	        raise gr.Error(f"Transcription failed: {transcription}")

	    image = generate_image_from_text(transcription)
	    # The generator reports failures as a string instead of an image.
	    if isinstance(image, str):
	        raise gr.Error(f"Image generation failed: {image}")

	    return image

	# Process text input (text-to-image)
	def text_to_image_interface(input_text):
	    """Generate an image from user-entered text for the text-to-image tab.

	    Args:
	        input_text: Prompt typed by the user.

	    Returns:
	        The generated image.

	    Raises:
	        gr.Error: If generation failed. The generator reports failures as a
	            string, which must not be handed to an image output, so it is
	            surfaced to the user as an error instead.
	    """
	    try:
	        image = generate_image_from_text(input_text)
	    except Exception as e:
	        raise gr.Error(f"Error: {str(e)}") from e
	    if isinstance(image, str):
	        raise gr.Error(image)
	    return image

	# Gradio interface for the speech tab: an uploaded audio file goes in,
	# the image produced by speech_to_image comes out.
	speech_to_image_interface = gr.Interface(
	    title="Speech-to-Image Generator",
	    description="Upload an audio file to generate an image based on the transcribed speech.",
	    fn=speech_to_image,
	    inputs=gr.Audio(label="Upload audio file (WAV/MP3)", type="filepath"),
	    outputs=gr.Image(label="Generated Image"),
	)

	# Gradio interface for the text tab. This rebinds the name
	# text_to_image_interface from the handler function to the Interface object;
	# the fn= argument is evaluated before the assignment, so the handler
	# function is what gets wired in.
	text_to_image_interface = gr.Interface(
	    title="Text-to-Image Generator",
	    description="Enter text to generate an image based on the description.",
	    fn=text_to_image_interface,
	    inputs=gr.Textbox(placeholder="Describe an image...", label="Enter Text"),
	    outputs=gr.Image(label="Generated Image"),
	)

	# Combine interfaces into a single Gradio app, one tab per interface
	app = gr.TabbedInterface(
	    [speech_to_image_interface, text_to_image_interface],
	    ["Speech-to-Image", "Text-to-Image"],
	)

	# Launch Gradio interface (debug mode on, public share link requested)
	app.launch(share=True, debug=True)