| |
|
import asyncio
import os
import subprocess
import sys
from functools import lru_cache

import librosa
import numpy as np
import streamlit as st
import torch
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler
from huggingface_hub import login
from transformers import pipeline
| |
|
| | |
def install_missing_packages(required_packages=None):
    """Best-effort install of any required package that cannot be imported.

    Parameters
    ----------
    required_packages : dict[str, str | None], optional
        Mapping of importable module name to an optional pip version
        specifier (e.g. ``">=0.14.0"``).  Defaults to the packages this
        app needs at runtime.

    Returns
    -------
    None
        Failures are logged and skipped; one bad install no longer aborts
        the remaining installs (the original returned early on error).
    """
    if required_packages is None:
        required_packages = {
            "librosa": None,
            "diffusers": ">=0.14.0",
            "transformers": None,
            "torch": "==2.0.0+cu118",
        }
    for package, version in required_packages.items():
        try:
            __import__(package)
        except ImportError:
            # pip spec is name + specifier concatenated, e.g. "diffusers>=0.14.0".
            package_name = f"{package}{version}" if version else package
            print(f"Installing {package_name}...")
            try:
                # Invoke pip via the *current* interpreter so the package
                # lands in the environment actually running this script.
                subprocess.check_call(
                    [sys.executable, "-m", "pip", "install", package_name]
                )
            except subprocess.CalledProcessError as e:
                print(f"Error installing {package_name}: {e}")
|
# Make sure runtime dependencies exist before the models below are loaded.
install_missing_packages()

# Authenticate with the Hugging Face Hub (gated/private model access).
# Fail fast at startup rather than mid-request if the token is missing.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(hf_token)
else:
    raise ValueError("HF_TOKEN environment variable not set.")
| |
|
| | |
# Speech-to-text model: Whisper "tiny" — small download, fast even on CPU.
speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# Text-to-image model in half precision to roughly halve memory use.
# NOTE(review): float16 on a CPU-only host can be very slow or unsupported —
# confirm a CUDA device is the expected deployment target.
text_to_image = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
)
device = "cuda" if torch.cuda.is_available() else "cpu"
text_to_image.to(device)
# Attention slicing trades a little speed for a much lower peak VRAM footprint.
text_to_image.enable_attention_slicing()
# NOTE(review): disabling the safety checker turns off NSFW filtering of outputs.
text_to_image.safety_checker = None
# Swap in the DPM-Solver++ multistep scheduler (good quality in fewer steps).
text_to_image.scheduler = DPMSolverMultistepScheduler.from_config(text_to_image.scheduler.config)
| |
|
| | |
def preprocess_audio(audio_path):
    """Load the file at *audio_path* resampled to 16 kHz mono.

    Returns a float32 numpy array on success, or an ``"Error ..."`` string
    if loading fails (callers check for the string form).
    """
    try:
        samples, _rate = librosa.load(audio_path, sr=16000)
    except Exception as exc:
        return f"Error in preprocessing audio: {str(exc)}"
    else:
        return np.asarray(samples, dtype=np.float32)
| |
|
| | |
# Result cache keyed by audio path.  lru_cache cannot decorate an async def:
# it caches the *coroutine object*, so the second call for the same argument
# returns an already-awaited coroutine and awaiting it raises RuntimeError.
_transcription_cache = {}


async def transcribe_audio(audio_path):
    """Transcribe the audio file at *audio_path* to text.

    Returns the transcription string on success, or an ``"Error ..."``
    string on failure.  Only successful results are cached (bounded to 10
    entries, matching the original lru_cache maxsize).
    """
    if audio_path in _transcription_cache:
        return _transcription_cache[audio_path]
    try:
        audio_array = preprocess_audio(audio_path)
        # preprocess_audio signals failure by returning an error string.
        if isinstance(audio_array, str):
            return audio_array
        result = speech_to_text(audio_array)
        transcription = result["text"]
        if len(_transcription_cache) >= 10:
            # Evict the oldest entry (dicts preserve insertion order).
            _transcription_cache.pop(next(iter(_transcription_cache)))
        _transcription_cache[audio_path] = transcription
        return transcription
    except Exception as e:
        return f"Error in transcription: {str(e)}"
| |
|
| | |
# Result cache keyed by prompt text.  As with transcription, lru_cache on an
# async def caches the coroutine itself and breaks on the second call, so we
# cache the generated image explicitly (bounded to 10, the original maxsize).
_image_cache = {}


async def generate_image_from_text(text):
    """Generate a 256x256 image from *text* with the Stable Diffusion pipeline.

    Returns the generated image on success, or an ``"Error ..."`` string on
    failure.  Only successful generations are cached.
    """
    if text in _image_cache:
        return _image_cache[text]
    try:
        image = text_to_image(text, height=256, width=256).images[0]
        if len(_image_cache) >= 10:
            # Evict the oldest entry (dicts preserve insertion order).
            _image_cache.pop(next(iter(_image_cache)))
        _image_cache[text] = image
        return image
    except Exception as e:
        return f"Error in image generation: {str(e)}"
| |
|
| | |
async def process_audio_and_generate_image(audio_path):
    """Transcribe *audio_path*, then generate an image from the transcription.

    Returns ``(image, transcription)`` on success, or ``(None, error_message)``
    if either stage fails.

    The image generation depends on the transcription as its input, so the
    two stages must run sequentially.  The original implementation launched
    both tasks concurrently with asyncio.gather, so the image task read the
    shared transcription slot while it was still None and never produced an
    image.
    """
    transcription = await transcribe_audio(audio_path)
    if "Error" in transcription:
        return None, transcription

    image = await generate_image_from_text(transcription)
    if isinstance(image, str) and "Error" in image:
        return None, image

    return image, transcription
| |
|
| | |
st.title("Voice-to-Image Generator")
st.write("Upload an audio file to transcribe speech to text, and then generate an image based on the transcription.")

audio_file = st.file_uploader("Upload audio file (WAV/MP3)", type=["wav", "mp3"])

if audio_file:
    # Persist the upload to disk because librosa.load needs a file path.
    audio_path = f"temp_{audio_file.name}"
    with open(audio_path, "wb") as f:
        f.write(audio_file.read())

    try:
        with st.spinner("Processing..."):
            # asyncio.run creates, runs, and closes the event loop for us;
            # the original created a loop manually and never closed it,
            # leaking a loop on every upload.
            image, transcription = asyncio.run(
                process_audio_and_generate_image(audio_path)
            )
    finally:
        # Always remove the temporary copy, even if processing failed.
        try:
            os.remove(audio_path)
        except OSError:
            pass

    if transcription and "Error" not in transcription:
        st.subheader("Transcription")
        st.write(transcription)

        if image:
            st.subheader("Generated Image")
            st.image(image, caption="Generated from transcription")
        else:
            st.error("Error in generating image.")
    else:
        st.error(transcription)
| |
|