import os
import base64
import io
import wave

import numpy as np
import requests
import replicate
import gradio as gr
import openai
from PIL import Image
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

# Locate the .env file and load API keys from it
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
REPLICATE_API_TOKEN = os.getenv('REPLICATE_API_TOKEN')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

client = OpenAI()

# Pipeline notes:
# 1 - send the image to a vision-language model and describe:
#     - localised speech
#     - non-localised speech, e.g. people in the background
#     - inanimate objects, e.g. a bell, iconic sounds
#     - ambient sound, e.g. wind, water ripple, trees, traffic
#     - the spatial dimension of the image
#     - music
# 2 - generate sounds with AudioLDM
#     (localised speech can go to a separate speech-specific model)
# 3 - create a soundtrack (not all sounds at once)
#     - could use different system prompts depending on what type of sound
#     - could use AudioLDM for sound effects and a different model for music
#     - AudioLDM: start music prompts with "background music that sounds like"
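
# A minimal sketch of the AudioLDM prompt-prefixing idea noted above; the
# helper name is an assumption and it is not yet wired into the pipeline.
def prefix_music_prompt(prompt):
    """Prefix a music description so AudioLDM treats it as background music."""
    return "background music that sounds like " + prompt
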
CHECKBOX_INPUTS = ["Localised speech", "Non-localised speech", "Inanimate objects", "Ambient sound", "Music"]

def call_openai(image_data, prompt):
    """Send a base64-encoded image and a prompt to GPT-4o and return the reply."""
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_data,
                            },
                        },
                    ],
                }
            ],
            max_tokens=100,
        )
        return response.choices[0].message.content
    except openai.BadRequestError as e:
        print(f"OpenAI BadRequestError: {e}")
        raise gr.Error("Please retry with a different moodboard file (below 20 MB in size and one of the following formats: ['png', 'jpeg', 'gif', 'webp'])")
    except Exception as e:
        print(f"Unexpected error: {e}")
        raise gr.Error("Unknown Error")

def img_to_base64(img):
    """Encode a PIL image as a JPEG data URL for the OpenAI vision API."""
    buffered = io.BytesIO()
    img.convert("RGB").save(buffered, format="JPEG")  # JPEG cannot store alpha, so force RGB
    img_base_64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
    return "data:image/jpeg;base64," + img_base_64

def vision_language_model(img):
    # Placeholder: not yet implemented
    return

def generate_prompt_from_description(checkbox_label, img):
    """Ask GPT-4o for a sound description that matches the selected sound category."""
    if checkbox_label == CHECKBOX_INPUTS[0]:
        # Localised speech: the sentence is spoken aloud later by
        # https://replicate.com/afiaka87/tortoise-tts
        prompt = "reply with a single sentence that the person in the image might say"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[1]:
        prompt = "in 5 words or less, describe the background noise (like people talking) of this image"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[2]:
        prompt = "in 5 words or less, describe an inanimate noise, such as a bell or an appliance, that might be heard in this image"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[3]:
        prompt = "in 5 words or less, describe an ambient sound, such as wind, water ripple, tree or traffic, that might be heard in this image"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[4]:
        prompt = "in 6 words or less, write a prompt to generate music that might be in this image"
        return call_openai(img, prompt)
    # Idea: use https://replicate.com/meta/llama-2-70b-chat as a prompt rewriter
    # with the system prompt: "You are a talented prompt writer. You turn paragraphs
    # into short 5-word prompts to generate a song. These go directly into systems,
    # so there should be no other text."
    return
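
# A hedged sketch of the llama-2-70b-chat prompt-rewriter idea noted above;
# the function name is an assumption and the call is not wired into the
# pipeline. Replicate streams language-model output as chunks of text.
def rewrite_prompt_with_llama(paragraph):
    output = replicate.run(
        "meta/llama-2-70b-chat",
        input={
            "prompt": paragraph,
            "system_prompt": (
                "You are a talented prompt writer. You turn paragraphs into "
                "short 5-word prompts to generate a song. These go directly "
                "into systems, so there should be no other text."
            ),
        },
    )
    return "".join(output).strip()
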
def generate_music(prompt):
    # Placeholder: music generation currently happens inline in main() via Riffusion
    return

def combine_music_clips(audio):
    # Placeholder: clips are currently returned individually rather than mixed
    return
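
# A minimal sketch of what combine_music_clips could do: overlay the generated
# clips into one soundtrack with pydub (an assumed extra dependency, not
# imported or used elsewhere in this file).
def combine_clips_sketch(audio_buffers):
    from pydub import AudioSegment
    base = AudioSegment.from_file(audio_buffers[0])
    for buf in audio_buffers[1:]:
        base = base.overlay(AudioSegment.from_file(buf))
    combined = io.BytesIO()
    base.export(combined, format="wav")
    combined.seek(0)
    return combined
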
def download_audio(url):
    """Download generated audio from a Replicate output URL into memory."""
    response = requests.get(url)
    response.raise_for_status()
    return io.BytesIO(response.content)

def generate_silent_audio():
    """Return one second of silence as a valid in-memory WAV file."""
    silent_audio = np.zeros((22050,), dtype=np.int16)
    silent_bytes = io.BytesIO()
    # Wrap the raw samples in a WAV header so the bytes are playable downstream
    with wave.open(silent_bytes, "wb") as wav:
        wav.setnchannels(1)      # mono
        wav.setsampwidth(2)      # 16-bit samples
        wav.setframerate(22050)  # 1 second at 22.05 kHz
        wav.writeframes(silent_audio.tobytes())
    silent_bytes.seek(0)
    return silent_bytes

def main(image, checkboxes):
    image = Image.fromarray(image.astype('uint8'))
    base_64_image = img_to_base64(image)
    generated_content = []
    for selection in checkboxes:
        prompt = generate_prompt_from_description(selection, base_64_image)
        if not prompt:
            continue
        if selection == CHECKBOX_INPUTS[0]:
            # Localised speech: synthesise the sentence with Tortoise TTS
            output = replicate.run(
                "afiaka87/tortoise-tts:e9658de4b325863c4fcdc12d94bb7c9b54cbfe351b7ca1b36860008172b91c71",
                input={"seed": 0, "text": prompt, "preset": "fast", "voice_a": "halle"}
            )
        elif selection == CHECKBOX_INPUTS[4]:
            # Music: generate with Riffusion; the output dict holds the audio URL
            output = replicate.run(
                "riffusion/riffusion:8cf61ea6c56afd61d8f5b9ffd14d7c216c0a93844ce2d82ac1c9ecc9c7f24e05",
                input={"alpha": 0.5, "prompt_a": prompt, "denoising": 0.75, "seed_image_id": "vibes", "num_inference_steps": 50}
            )
            output = output['audio']
        else:
            # Everything else: sound effects and ambience from AudioLDM
            output = replicate.run(
                "haoheliu/audio-ldm:b61392adecdd660326fc9cfc5398182437dbe5e97b5decfb36e1a36de68b5b95",
                input={"text": prompt, "duration": "5.0", "n_candidates": 3, "guidance_scale": 2.5}
            )
        audio_file = download_audio(output)
        generated_content.append({"prompt": prompt, "audio": audio_file})
    # Pad to exactly 5 prompt/audio pairs so the fixed Gradio outputs are always filled
    while len(generated_content) < 5:
        generated_content.append({"prompt": "", "audio": generate_silent_audio()})
    result_prompts = [item["prompt"] for item in generated_content]
    result_audios = [item["audio"].getvalue() for item in generated_content]
    return (result_prompts[0], result_audios[0], result_prompts[1], result_audios[1],
            result_prompts[2], result_audios[2], result_prompts[3], result_audios[3],
            result_prompts[4], result_audios[4])

demo = gr.Interface(
    fn=main,
    inputs=["image", gr.CheckboxGroup(CHECKBOX_INPUTS, label="Sounds to Generate", info="Based on Taxonomy of Sounds")],
    outputs=["text", "audio", "text", "audio", "text", "audio", "text", "audio", "text", "audio"],
)
demo.launch(share=False)