import os
import base64
import io
import wave

import numpy as np
import requests
import replicate
import gradio as gr
import openai
from PIL import Image
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv

# Locate the .env file and load API keys from it
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
REPLICATE_API_TOKEN = os.getenv('REPLICATE_API_TOKEN')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

client = OpenAI()

# Pipeline notes:
# 1 - send the image to a vision-language model and describe:
#     - localised speech
#     - non-localised speech, e.g. people in the background
#     - inanimate objects, e.g. a bell, iconic sounds
#     - ambient sound, e.g. wind, water ripple, trees, traffic
#     - the spatial dimension of the image
#     - music
# 2 - generate sounds with AudioLDM
#     (localised speech can go to a separate speech-specific model)
# 3 - create a soundtrack (not all sounds at once)
#     - could use different system prompts depending on what type of sound
#     - could use AudioLDM for sound effects and a different model for music
#     - AudioLDM: start music prompts with "background music that sounds like"
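
# A minimal sketch of the AudioLDM prompt-prefixing idea noted above; the
# helper name is an assumption and it is not yet wired into the pipeline.
def prefix_music_prompt(prompt):
    """Prefix a music description so AudioLDM treats it as background music."""
    return "background music that sounds like " + prompt
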
CHECKBOX_INPUTS = ["Localised speech", "Non-localised speech", "Inanimate objects", "Ambient sound", "Music"]

def call_openai(image_data, prompt):
    """Send a base64-encoded image and a prompt to GPT-4o and return the reply."""
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": image_data,
                            },
                        },
                    ],
                }
            ],
            max_tokens=100,
        )
        return response.choices[0].message.content
    except openai.BadRequestError as e:
        print(f"OpenAI BadRequestError: {e}")
        raise gr.Error("Please retry with a different moodboard file (below 20 MB in size and one of the following formats: ['png', 'jpeg', 'gif', 'webp'])")
    except Exception as e:
        print(f"Unexpected error: {e}")
        raise gr.Error("Unknown Error")

def img_to_base64(img):
    """Encode a PIL image as a JPEG data URL for the OpenAI vision API."""
    buffered = io.BytesIO()
    img.convert("RGB").save(buffered, format="JPEG")  # JPEG cannot store alpha, so force RGB
    img_base_64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
    return "data:image/jpeg;base64," + img_base_64

def vision_language_model(img):
    # Placeholder: not yet implemented
    return

def generate_prompt_from_description(checkbox_label, img):
    """Ask GPT-4o for a sound description that matches the selected sound category."""
    if checkbox_label == CHECKBOX_INPUTS[0]:
        # Localised speech: the sentence is spoken aloud later by
        # https://replicate.com/afiaka87/tortoise-tts
        prompt = "reply with a single sentence that the person in the image might say"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[1]:
        prompt = "in 5 words or less, describe the background noise (like people talking) of this image"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[2]:
        prompt = "in 5 words or less, describe an inanimate noise, such as a bell or an appliance, that might be heard in this image"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[3]:
        prompt = "in 5 words or less, describe an ambient sound, such as wind, water ripple, tree or traffic, that might be heard in this image"
        return call_openai(img, prompt)
    elif checkbox_label == CHECKBOX_INPUTS[4]:
        prompt = "in 6 words or less, write a prompt to generate music that might be in this image"
        return call_openai(img, prompt)
    # Idea: use https://replicate.com/meta/llama-2-70b-chat as a prompt rewriter
    # with the system prompt: "You are a talented prompt writer. You turn paragraphs
    # into short 5-word prompts to generate a song. These go directly into systems,
    # so there should be no other text."
    return
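
# A hedged sketch of the llama-2-70b-chat prompt-rewriter idea noted above;
# the function name is an assumption and the call is not wired into the
# pipeline. Replicate streams language-model output as chunks of text.
def rewrite_prompt_with_llama(paragraph):
    output = replicate.run(
        "meta/llama-2-70b-chat",
        input={
            "prompt": paragraph,
            "system_prompt": (
                "You are a talented prompt writer. You turn paragraphs into "
                "short 5-word prompts to generate a song. These go directly "
                "into systems, so there should be no other text."
            ),
        },
    )
    return "".join(output).strip()
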
def generate_music(prompt):
    # Placeholder: music generation currently happens inline in main() via Riffusion
    return

def combine_music_clips(audio):
    # Placeholder: clips are currently returned individually rather than mixed
    return
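
# A minimal sketch of what combine_music_clips could do: overlay the generated
# clips into one soundtrack with pydub (an assumed extra dependency, not
# imported or used elsewhere in this file).
def combine_clips_sketch(audio_buffers):
    from pydub import AudioSegment
    base = AudioSegment.from_file(audio_buffers[0])
    for buf in audio_buffers[1:]:
        base = base.overlay(AudioSegment.from_file(buf))
    combined = io.BytesIO()
    base.export(combined, format="wav")
    combined.seek(0)
    return combined
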
def download_audio(url):
    """Download generated audio from a Replicate output URL into memory."""
    response = requests.get(url)
    response.raise_for_status()
    return io.BytesIO(response.content)

def generate_silent_audio():
    """Return one second of silence as a valid in-memory WAV file."""
    silent_audio = np.zeros((22050,), dtype=np.int16)
    silent_bytes = io.BytesIO()
    # Wrap the raw samples in a WAV header so the bytes are playable downstream
    with wave.open(silent_bytes, "wb") as wav:
        wav.setnchannels(1)      # mono
        wav.setsampwidth(2)      # 16-bit samples
        wav.setframerate(22050)  # 1 second at 22.05 kHz
        wav.writeframes(silent_audio.tobytes())
    silent_bytes.seek(0)
    return silent_bytes

def main(image, checkboxes):
    image = Image.fromarray(image.astype('uint8'))
    base_64_image = img_to_base64(image)
    generated_content = []
    for selection in checkboxes:
        prompt = generate_prompt_from_description(selection, base_64_image)
        if not prompt:
            continue
        if selection == CHECKBOX_INPUTS[0]:
            # Localised speech: synthesise the sentence with Tortoise TTS
            output = replicate.run(
                "afiaka87/tortoise-tts:e9658de4b325863c4fcdc12d94bb7c9b54cbfe351b7ca1b36860008172b91c71",
                input={"seed": 0, "text": prompt, "preset": "fast", "voice_a": "halle"}
            )
        elif selection == CHECKBOX_INPUTS[4]:
            # Music: generate with Riffusion; the output dict holds the audio URL
            output = replicate.run(
                "riffusion/riffusion:8cf61ea6c56afd61d8f5b9ffd14d7c216c0a93844ce2d82ac1c9ecc9c7f24e05",
                input={"alpha": 0.5, "prompt_a": prompt, "denoising": 0.75, "seed_image_id": "vibes", "num_inference_steps": 50}
            )
            output = output['audio']
        else:
            # Everything else: sound effects and ambience from AudioLDM
            output = replicate.run(
                "haoheliu/audio-ldm:b61392adecdd660326fc9cfc5398182437dbe5e97b5decfb36e1a36de68b5b95",
                input={"text": prompt, "duration": "5.0", "n_candidates": 3, "guidance_scale": 2.5}
            )
        audio_file = download_audio(output)
        generated_content.append({"prompt": prompt, "audio": audio_file})
    # Pad to exactly 5 prompt/audio pairs so the fixed Gradio outputs are always filled
    while len(generated_content) < 5:
        generated_content.append({"prompt": "", "audio": generate_silent_audio()})
    result_prompts = [item["prompt"] for item in generated_content]
    result_audios = [item["audio"].getvalue() for item in generated_content]
    return (result_prompts[0], result_audios[0], result_prompts[1], result_audios[1],
            result_prompts[2], result_audios[2], result_prompts[3], result_audios[3],
            result_prompts[4], result_audios[4])

demo = gr.Interface(
    fn=main,
    inputs=["image", gr.CheckboxGroup(CHECKBOX_INPUTS, label="Sounds to Generate", info="Based on Taxonomy of Sounds")],
    outputs=["text", "audio", "text", "audio", "text", "audio", "text", "audio", "text", "audio"],
)
demo.launch(share=False)