# SynkDive / App.py
import os
import re
import shutil
import subprocess

import requests
import scipy.io.wavfile
import torch
import gradio as gr
from colorama import Fore
from gradio_client import Client
from langchain.llms import OpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from pydub import AudioSegment
from pytube import YouTube
from transformers import AutoProcessor, MusicgenForConditionalGeneration
def clean_string(string):
    # Use a regular expression to keep only letters, digits, and dots
    padrao = r'[^a-zA-Z0-9.]'
    return re.sub(padrao, '', string)
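# For example, clean_string("My Video! (2023).mp4") returns "MyVideo2023.mp4",
# which keeps YouTube titles safe to reuse as local filenames.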
def rename_file(video_path):
    # Rename the uploaded file to input.mp4 (the ffmpeg commands below expect this name)
    uploaded_filename = os.path.basename(video_path)
    new_filename = "input.mp4"
    os.rename(uploaded_filename, new_filename)
def making_dir():
    # Folder that will hold every frame extracted from the video
    if not os.path.exists("fotopastas"):
        os.makedirs("fotopastas")
    image_files = [file for file in os.listdir() if file.startswith("frames_")]
    for image in image_files:
        shutil.move(image, os.path.join("fotopastas", image))
    # Path to the folder with the extracted frames
    pasta = '/content/fotopastas'  # Replace with your own folder path
    # Image file extensions to process
    extensoes_de_imagem = ['.jpg', '.png', '.jpeg']
    # Sort the files by their numeric frame index
    arquivos_ordenados = sorted(
        [arquivo for arquivo in os.listdir(pasta) if any(arquivo.lower().endswith(ext) for ext in extensoes_de_imagem)],
        key=lambda arquivo: int(arquivo.split("_")[1].split(".")[0])
    )
    return [arquivos_ordenados, pasta]
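# The sort key parses the frame index as an int, so the frames come back in
# chronological order, e.g. ["frames_001.jpg", "frames_002.jpg", ..., "frames_010.jpg"].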
def frame_list(video_path, seconds):
rename_file(video_path)
# ffmpeg -i input.mp4 -vf "fps=1/$seconds" -q:v 2 frames_%03d.jpg
command = [
'ffmpeg',
'-i', 'input.mp4',
'-vf', f'fps=1/{seconds}',
'-q:v', '2',
'frames_%03d.jpg'
]
# Run the command using subprocess
subprocess.run(command)
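    # fps=1/{seconds} emits one frame every {seconds} seconds, so a 60 s video
    # sampled with seconds=5 yields roughly 12 JPEGs (frames_001.jpg ... frames_012.jpg)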
    # Folder with all of the video's frames
elements = making_dir()
    # Initialize the CLIP Interrogator client (Client is imported at the top of the file)
    client = Client("https://fffiloni-clip-interrogator-2.hf.space/")
finalList = []
    # Iterate over the photos in the folder
for arquivo in elements[0]:
caminho_arquivo = os.path.join(elements[1], arquivo)
result = client.predict(
caminho_arquivo,
"best",
8,
api_name="/clipi2"
)
newList = []
for item in result:
if isinstance(item, str) and "{" in item:
break
newList.append(item)
newString = newList[0] if newList else ""
finalList.append(newString)
resultList = []
for description in finalList:
first = description.split(',')
resultList.append(first[0])
print(resultList)
return resultList
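# frame_list returns one short caption per sampled frame; only the text before
# the first comma of each CLIP Interrogator caption is kept, e.g. (hypothetical
# output) ["a man walking on a beach", "a sunset over the ocean"].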
def langchain_handle_text(text):
    print(Fore.CYAN + "entering the LangChain text handler")
    os.environ["OPENAI_API_KEY"] = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    llm = OpenAI(temperature=0.3, model_name="gpt-3.5-turbo")
    conversation = ConversationChain(
        llm=llm,
        verbose=True,
        memory=ConversationBufferMemory()
    )
    conversation.predict(input=f"Given a text and you being an internationally renowned melodist, create a melody description with instruments and necessary transitions according to the context of the text. The text: {text}")
    output = conversation.predict(input="Summarize the melody without removing the necessary instruments and transitions. The output should be: the melody begins...")
    print(output)
    return output
def eleven_labs(prompt):
    CHUNK_SIZE = 1024
    url = "https://api.elevenlabs.io/v1/text-to-speech/21m00Tcm4TlvDq8ikWAM"
    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": "xxxxxxxxxxxxxxxxxxxxxxxxx"
    }
    data = {
        "text": prompt,
        "model_id": "eleven_multilingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }
    # Stream the response to disk; the body is binary MP3 audio, so only the
    # status code is printed
    response = requests.post(url, json=data, headers=headers, stream=True)
    print(response.status_code)
    with open('narracao.mp3', 'wb') as f:
        for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
            if chunk:
                f.write(chunk)
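# A minimal sanity check, assuming a valid xi-api-key is set above:
#   eleven_labs("Hello from SynkDive")  # writes the narration to ./narracao.mp3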
def check_duration():
    # Load both audio files
    audio1 = AudioSegment.from_file("audio.mp3", format="mp3")
    audio2 = AudioSegment.from_file("narracao.mp3", format="mp3")
    # len() on an AudioSegment returns its duration in milliseconds
    duração_em_milissegundos = len(audio1)
    duração_em_milissegundos2 = len(audio2)
    # Convert the durations to seconds
    duração_em_segundos = duração_em_milissegundos / 1000
    duração_em_segundos2 = duração_em_milissegundos2 / 1000
    print(f"The music track is {duração_em_segundos} seconds long.")
    print(f"The narration is {duração_em_segundos2} seconds long.")
    # Return the longer of the two durations
    maior = max(duração_em_segundos, duração_em_segundos2)
    return maior
def merge_audio_text():
#ffmpeg -y -i audio_1.wav -vn -ar 44100 -ac 2 -b:a 192k audio.mp3
subprocess.run(['ffmpeg', '-y', '-i', 'audio_1.wav', '-vn', '-ar', '44100', '-ac', '2', '-b:a', '192k', 'audio.mp3'])
duration = check_duration()
#ffmpeg -stream_loop -1 -i audio.mp3 -t "$duration" -c:a libmp3lame audio_loop.mp3
subprocess.run(['ffmpeg', '-stream_loop', '-1', '-i', 'audio.mp3', '-t', str(duration), '-c:a', 'libmp3lame', 'audio_loop.mp3'])
#ffmpeg -i narracao.mp3 -i audio_loop.mp3 -filter_complex amix=inputs=2:duration=first:dropout_transition=2 output.mp3
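    # In amix, duration=first ends the mix when the first input (narracao.mp3)
    # ends, and dropout_transition smooths the volume renormalization when one
    # input stops before the other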
subprocess.run(['ffmpeg', '-i', 'narracao.mp3', '-i', 'audio_loop.mp3', '-filter_complex', 'amix=inputs=2:duration=first:dropout_transition=2', 'output.mp3'])
audio_final = '/content/output.mp3'
return audio_final
def langchain_handle(description):
    print(Fore.CYAN + "entering the LangChain scene handler")
    os.environ["OPENAI_API_KEY"] = "xxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    llm = OpenAI(temperature=0.3, model_name="gpt-3.5-turbo")
    conversation = ConversationChain(
        llm=llm,
        verbose=True,
        memory=ConversationBufferMemory()
    )
    conversation.predict(input=f"given a list of phrases and you being a world-renowned melodist, create a melody based on the context generated by the phrases on the list, reporting the necessary instruments and their transitions. The list: {description}")
    conversation.predict(input="Put the intro and all the scenes together in one phrase. Start the output with: the melody begins ")
    y = conversation.predict(input='Summarize it and start with: the melody begins')
    print(y)
    return y
def music_gen(description):
    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model.to(device)
    processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
    inputs = processor(
        text=[f"{description}"],
        padding=True,
        return_tensors="pt",
    )
    print('before reading the sampling rate')
    sampling_rate = model.config.audio_encoder.sampling_rate
    print('after reading the sampling rate')
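    # MusicGen generates audio tokens at 50 Hz, so max_new_tokens=1503 yields
    # roughly 30 seconds of audio; lower it for shorter (and faster) clips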
    audio_values = model.generate(**inputs.to(device), do_sample=True, guidance_scale=3, max_new_tokens=1503)
    # Audio(audio_values[0].cpu().numpy(), rate=sampling_rate)
    print('saving the audio')
    nome = 'audio_1.wav'
    scipy.io.wavfile.write(nome, rate=sampling_rate, data=audio_values[0, 0].cpu().numpy())
    return "/content/audio_1.wav"
def merge_audio_video():
# ffmpeg -y -i audio_1.wav -vn -ar 44100 -ac 2 -b:a 192k audio.mp3
# ffmpeg -y -i input.mp4 -i audio.mp3 -c:v copy -c:a copy output.mp4
subprocess.run(['ffmpeg', '-y', '-i', 'audio_1.wav', '-vn', '-ar', '44100', '-ac', '2', '-b:a', '192k', 'audio.mp3'])
    # Combine input.mp4 with audio.mp3 into output.mp4
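    # Note: -c:a copy muxes the MP3 stream into the MP4 container without
    # re-encoding; if a player rejects MP3-in-MP4, swapping in '-c:a', 'aac'
    # would transcode to a more widely supported codec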
subprocess.run(['ffmpeg', '-y', '-i', 'input.mp4', '-i', 'audio.mp3', '-c:v', 'copy', '-c:a', 'copy', 'output.mp4'])
def handle_text(text):
description = langchain_handle_text(text)
audio = music_gen(description)
eleven_labs(text)
audio_final = merge_audio_text()
return audio_final
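# handle_text ties the text pipeline together: the LLM drafts a melody
# description, MusicGen renders it as music, ElevenLabs narrates the original
# text, and ffmpeg mixes narration and music into output.mp3.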
def download_youtube_video(youtube_link, seconds):
    # Create a YouTube object for the provided link
    yt = YouTube(youtube_link)
    # Get the 720p video-only stream (customize the filter as needed)
    video_stream = yt.streams.filter(resolution='720p', only_video=True).first()
    # Sanitize the title so it is safe to use as a filename
    title = clean_string(yt.title)
    # Download the video
    video_stream.download(output_path='/content', filename=f'{title}.mp4')
    video_path = f"/content/{title}.mp4"
    print(video_path)
    print(yt.length)
    description = frame_list(video_path, seconds)
    final_description = langchain_handle(description)
    audio_path = music_gen(final_description)
    merge_audio_video()
    new_video_path = '/content/output.mp4'
    return new_video_path
iface_1 = gr.Interface(
    download_youtube_video,
    [gr.Textbox(label="Enter YouTube Video Link"),
     gr.Dropdown(["5", "3", "1"], label="Seconds", info="Extract an image every chosen number of seconds")],
    "video",
)
iface_2 = gr.Interface(
handle_text,
gr.Textbox(label="Enter a Text"),
"audio"
)
# iface_1.launch(share = True,debug=True,enable_queue=True)
demo = gr.TabbedInterface([iface_1, iface_2], ["video-to-SoundClip", "text-to-SoundClip"])
demo.launch(share=True, debug=True, enable_queue=True)