Spaces:

Sayiqa7
/

Voice_to_Image

Runtime error

App Files Files Community

Voice_to_Image / app.py

Sayiqa7

Update app.py

4983323 verified about 1 year ago

raw

history blame contribute delete

11.7 kB

	# import os
	# import subprocess
	# import threading
	# import numpy as np
	# from functools import lru_cache
	# from transformers import pipeline
	# from huggingface_hub import login
	# from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
	# import gradio as gr
	# import torch
	# import transformers
	# # Install missing dependencies
	# try:
	# import librosa
	# import transformers
	# import diffusers
	# import torch
	# import gradio
	# import huggingface_hub
	# except ImportError:
	# subprocess.check_call(["pip", "install", "librosa","transformers>=4.25.0", "diffusers>=0.14.0", "torch>=1.11.0", "gradio>=3.35.2", "huggingface_hub"])
	# import librosa

	# # Get the Hugging Face token from the environment variable
	# hf_token = os.getenv("HF_TOKEN")
	# if hf_token:
	# login(hf_token)
	# else:
	# raise ValueError("HF_TOKEN environment variable not set.")

	# # Load speech-to-text model
	# speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

	# # Load Stable Diffusion model with optimizations
	# text_to_image = StableDiffusionPipeline.from_pretrained(
	# "runwayml/stable-diffusion-v1-5"
	# )
	# device = "cuda" if torch.cuda.is_available() else "cpu"
	# text_to_image.to(device)
	# text_to_image.enable_attention_slicing() # Reduce memory usage
	# text_to_image.safety_checker = None # Disable safety checks to improve speed
	# text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config) # Faster scheduler

	# # Preprocess audio file into NumPy array
	# def preprocess_audio(audio_path):
	# try:
	# # Load audio using librosa
	# audio, sr = librosa.load(audio_path, sr=16000) # Resample to 16kHz
	# return np.array(audio, dtype=np.float32)
	# except Exception as e:
	# return f"Error in preprocessing audio: {str(e)}"

	# # Speech-to-text function
	# @lru_cache(maxsize=10)
	# def transcribe_audio(audio_path):
	# try:
	# audio_array = preprocess_audio(audio_path)
	# if isinstance(audio_array, str): # Error message from preprocessing
	# return audio_array
	# result = speech_to_text(audio_array)
	# return result["text"]
	# except Exception as e:
	# return f"Error in transcription: {str(e)}"

	# # Text-to-image function
	# @lru_cache(maxsize=10)
	# def generate_image_from_text(text):
	# try:
	# image = text_to_image(text, height=256, width=256).images[0] # Generate smaller images for speed
	# return image
	# except Exception as e:
	# return f"Error in image generation: {str(e)}"

	# # Combined processing function
	# def process_audio_and_generate_image(audio_path):
	# transcription_result = {"result": None}
	# image_result = {"result": None}

	# def transcription_thread():
	# transcription_result["result"] = transcribe_audio(audio_path)

	# def image_generation_thread():
	# transcription = transcription_result["result"]
	# if transcription and "Error" not in transcription:
	# image_result["result"] = generate_image_from_text(transcription)

	# t1 = threading.Thread(target=transcription_thread)
	# t2 = threading.Thread(target=image_generation_thread)

	# t1.start()
	# t1.join()
	# t2.start()
	# t2.join()

	# transcription = transcription_result["result"]
	# image = image_result["result"]

	# if "Error" in transcription:
	# return None, transcription
	# if isinstance(image, str) and "Error" in image:
	# return None, image

	# return image, transcription

	# # Gradio interface
	# iface = gr.Interface(
	# fn=process_audio_and_generate_image,
	# inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
	# outputs=[
	# gr.Image(label="Generated Image"),
	# gr.Textbox(label="Transcription")
	# ],
	# title="Speech-to-Text and Image Generation",
	# description="Upload an audio file to transcribe speech to text, and then generate an image based on the transcription.",
	# )

	# # Launch the interface
	# iface.launch(debug=True, share=True)



	# import os
	# import subprocess
	# import threading
	# import numpy as np
	# from functools import lru_cache
	# import torch
	# import gradio as gr

	# # Install required dependencies with specific versions
	# required_packages = {
	# "librosa": None,
	# "transformers": ">=4.25.0",
	# "diffusers": ">=0.14.0",
	# "torch": "torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118",
	# "gradio": ">=3.35.2",
	# "huggingface_hub": None,
	# }

	# def install_missing_packages():
	# for package, version in required_packages.items():
	# try:
	# __import__(package)
	# except ImportError:
	# if package == "torch":
	# subprocess.check_call(["pip", "install", version])
	# else:
	# package_name = f"{package}{version}" if version else package
	# subprocess.check_call(["pip", "install", package_name])

	# install_missing_packages()

	# # Import libraries after ensuring installation
	# import librosa
	# from transformers import pipeline
	# from huggingface_hub import login
	# from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler

	# # Get the Hugging Face token from the environment variable
	# hf_token = os.getenv("HF_TOKEN")
	# if hf_token:
	# login(hf_token)
	# else:
	# raise ValueError("HF_TOKEN environment variable not set.")

	# # Load speech-to-text model
	# speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

	# # Load Stable Diffusion model with optimizations
	# text_to_image = StableDiffusionPipeline.from_pretrained(
	# "runwayml/stable-diffusion-v1-5"
	# )
	# device = "cuda" if torch.cuda.is_available() else "cpu"
	# text_to_image.to(device)
	# text_to_image.enable_attention_slicing() # Reduce memory usage
	# text_to_image.safety_checker = None # Disable safety checks to improve speed
	# text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config) # Faster scheduler

	# # Preprocess audio file into NumPy array
	# def preprocess_audio(audio_path):
	# try:
	# # Load audio using librosa
	# audio, sr = librosa.load(audio_path, sr=16000) # Resample to 16kHz
	# return np.array(audio, dtype=np.float32)
	# except Exception as e:
	# return f"Error in preprocessing audio: {str(e)}"

	# # Speech-to-text function
	# @lru_cache(maxsize=10)
	# def transcribe_audio(audio_path):
	# try:
	# audio_array = preprocess_audio(audio_path)
	# if isinstance(audio_array, str): # Error message from preprocessing
	# return audio_array
	# result = speech_to_text(audio_array)
	# return result["text"]
	# except Exception as e:
	# return f"Error in transcription: {str(e)}"

	# # Text-to-image function
	# @lru_cache(maxsize=10)
	# def generate_image_from_text(text):
	# try:
	# image = text_to_image(text, height=256, width=256).images[0] # Generate smaller images for speed
	# return image
	# except Exception as e:
	# return f"Error in image generation: {str(e)}"

	# # Combined processing function
	# def process_audio_and_generate_image(audio_path):
	# transcription_result = {"result": None}
	# image_result = {"result": None}

	# def transcription_thread():
	# transcription_result["result"] = transcribe_audio(audio_path)

	# def image_generation_thread():
	# transcription = transcription_result["result"]
	# if transcription and "Error" not in transcription:
	# image_result["result"] = generate_image_from_text(transcription)

	# t1 = threading.Thread(target=transcription_thread)
	# t2 = threading.Thread(target=image_generation_thread)

	# t1.start()
	# t1.join()
	# t2.start()
	# t2.join()

	# transcription = transcription_result["result"]
	# image = image_result["result"]

	# if "Error" in transcription:
	# return None, transcription
	# if isinstance(image, str) and "Error" in image:
	# return None, image

	# return image, transcription

	# # Gradio interface
	# iface = gr.Interface(
	# fn=process_audio_and_generate_image,
	# inputs=gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)"),
	# outputs=[
	# gr.Image(label="Generated Image"),
	# gr.Textbox(label="Transcription")
	# ],
	# title="Speech-to-Text and Image Generation",
	# description="Upload an audio file to transcribe speech to text, and then generate an image based on the transcription.",
	# )

	# # Launch the interface
	# iface.launch(debug=True, share=True)


	import subprocess
	import sys

	# Install the core libraries at start-up, before the imports further down
	# need them. pip is run as "<this interpreter> -m pip" so the packages land in
	# the environment that is executing this script; a bare "pip" is resolved via
	# PATH and may belong to a different Python installation.
	for _requirement in ("torch>=1.11.0", "transformers", "diffusers", "librosa"):
	    subprocess.check_call([sys.executable, "-m", "pip", "install", _requirement])

	import os
	import threading
	import numpy as np
	from functools import lru_cache
	from transformers import pipeline
	from huggingface_hub import login
	from transformers import pipeline
	from diffusers import StableDiffusionPipeline
	import gradio as gr
	import torch
	import transformers
	import numpy

	# Ensure required dependencies are installed
	def install_missing_packages(required_packages=None):
	    """Install every required package that cannot be imported.

	    Each package name is used both as the import name and as the pip
	    requirement name. A version specifier is only applied when the package
	    is missing; a package that imports successfully is left as it is, even
	    if it is older than the specifier.

	    Args:
	        required_packages: Optional mapping of package name to a pip version
	            specifier such as ``">=0.14.0"``, or ``None`` for any version.
	            When omitted, the packages this app depends on are used.

	    Raises:
	        subprocess.CalledProcessError: If pip exits with a non-zero status.
	    """
	    # Function-scope imports keep this helper usable on its own.
	    import importlib
	    import subprocess
	    import sys

	    if required_packages is None:
	        required_packages = {
	            "librosa": None,
	            "diffusers": ">=0.14.0",
	            "gradio": ">=3.35.2",
	            "huggingface_hub": None,
	        }

	    for package, version in required_packages.items():
	        try:
	            importlib.import_module(package)
	        except ImportError:
	            requirement = f"{package}{version}" if version else package
	            # Run pip through the current interpreter so the package is
	            # installed where this process can import it; a bare "pip" on
	            # PATH may point at another Python installation.
	            subprocess.check_call(
	                [sys.executable, "-m", "pip", "install", requirement]
	            )

	# NOTE(review): gradio, diffusers and huggingface_hub are already imported
	# above and librosa is installed unconditionally, so this call looks like a
	# no-op safety net by the time it runs -- confirm before relying on it.
	install_missing_packages()

	# Authenticate with the Hugging Face Hub using the token from the environment.
	hf_token = os.getenv("HF_TOKEN")
	if hf_token:
	    login(hf_token)
	else:
	    raise ValueError("HF_TOKEN environment variable not set.")

	# Load the speech-recognition pipeline.
	speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-base")

	# Load Stable Diffusion on the GPU when one is available, otherwise on the CPU.
	# Half precision is only requested on CUDA: many CPU kernels are missing or
	# very slow for float16, so the CPU fallback keeps full float32 weights.
	# NOTE(review): confirm this repository id still resolves on the Hub.
	device = "cuda" if torch.cuda.is_available() else "cpu"
	text_to_image = StableDiffusionPipeline.from_pretrained(
	    "runwayml/stable-diffusion-v1-5",
	    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
	).to(device)

	# Speech-to-text function
	def transcribe_audio(audio_file):
	    """Return the text recognised in ``audio_file``.

	    The argument is handed unchanged to the module-level ``speech_to_text``
	    pipeline and the ``"text"`` entry of its result is returned. Any
	    exception is caught and reported as a string beginning with
	    ``"Error in transcription: "`` instead of being raised.
	    """
	    try:
	        recognised = speech_to_text(audio_file)["text"]
	    except Exception as exc:
	        return f"Error in transcription: {exc!s}"
	    return recognised

	# Text-to-image function
	def generate_image_from_text(text):
	    """Generate one image for the prompt ``text``.

	    Calls the module-level ``text_to_image`` pipeline and returns the first
	    entry of its ``images`` result. Any exception is caught and reported as
	    a string beginning with ``"Error in image generation: "`` instead of
	    being raised.
	    """
	    try:
	        generated = text_to_image(text)
	        first_image = generated.images[0]  # only the first image is used
	    except Exception as exc:
	        return f"Error in image generation: {exc!s}"
	    return first_image

	# Combined processing function
	def process_audio_and_generate_image(audio_file):
	    """Transcribe an audio file and generate an image from the transcription.

	    Args:
	        audio_file: Value forwarded unchanged to ``transcribe_audio``.

	    Returns:
	        ``(image, transcription)`` when both steps succeed, otherwise
	        ``(None, message)`` where ``message`` is the error string produced
	        by the step that failed.
	    """
	    transcription = transcribe_audio(audio_file)
	    # The helpers signal failure with a string that starts with a fixed
	    # prefix. Match that prefix rather than looking for "Error" anywhere in
	    # the text, which would reject genuine speech that merely contains the
	    # word. (Speech that literally begins with the prefix is still
	    # indistinguishable from a failure.)
	    if transcription.startswith("Error in transcription:"):
	        return None, transcription

	    image = generate_image_from_text(transcription)
	    if isinstance(image, str) and image.startswith("Error in image generation:"):
	        return None, image

	    return image, transcription

	# Gradio interface: one audio input, an image and a text box as outputs.
	audio_input = gr.Audio(type="filepath", label="Upload audio file (WAV/MP3)")
	result_outputs = [
	    gr.Image(label="Generated Image"),
	    gr.Textbox(label="Transcription"),
	]
	iface = gr.Interface(
	    fn=process_audio_and_generate_image,
	    inputs=audio_input,
	    outputs=result_outputs,
	    title="Speech-to-Text and Image Generation",
	    description="Upload an audio file to transcribe speech to text, and then generate an image based on the transcription.",
	)

	# Start the app and request a shareable link.
	iface.launch(share=True)