"""Generate a soundtrack for a YouTube video, or a narrated sound clip for a
text, by chaining CLIP Interrogator captions, GPT-3.5 melody prompts,
MusicGen audio generation, ElevenLabs narration, and ffmpeg muxing."""

import os
import re
import shutil
import subprocess

import gradio as gr
import requests
import scipy.io.wavfile
import torch
from colorama import Fore
from gradio_client import Client
from langchain.llms import OpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from pydub import AudioSegment
from pytube import YouTube
from transformers import AutoProcessor, MusicgenForConditionalGeneration


def clean_string(string):
    # Use a regular expression to keep only letters, digits, and dots.
    padrao = r'[^a-zA-Z0-9.]'
    return re.sub(padrao, '', string)


def rename_file(video_path):
    # Rename the downloaded file to input.mp4 (assumes the current working
    # directory is where the file lives, e.g. /content on Colab).
    uploaded_filename = os.path.basename(video_path)
    new_filename = "input.mp4"
    os.rename(uploaded_filename, new_filename)


def making_dir():
    # Folder holding every frame extracted from the video.
    if not os.path.exists("fotopastas"):
        os.makedirs("fotopastas")
    image_files = [file for file in os.listdir() if file.startswith("frames_")]
    for image in image_files:
        shutil.move(image, os.path.join("fotopastas", image))

    # Path to the folder with the photos (replace with your own folder path).
    pasta = '/content/fotopastas'
    # Image file extensions to process.
    extensoes_de_imagem = ['.jpg', '.png', '.jpeg']
    # Sort the frames_XXX.jpg files by their numeric index.
    arquivos_ordenados = sorted(
        [arquivo for arquivo in os.listdir(pasta)
         if any(arquivo.lower().endswith(ext) for ext in extensoes_de_imagem)],
        key=lambda arquivo: int(arquivo.split("_")[1].split(".")[0])
    )
    return [arquivos_ordenados, pasta]


def frame_list(video_path, seconds):
    rename_file(video_path)

    # ffmpeg -i input.mp4 -vf "fps=1/$seconds" -q:v 2 frames_%03d.jpg
    command = [
        'ffmpeg',
        '-i', 'input.mp4',
        '-vf', f'fps=1/{seconds}',
        '-q:v', '2',
        'frames_%03d.jpg'
    ]
    # Run the command using subprocess.
    subprocess.run(command)

    # Folder with every frame of the video.
    elements = making_dir()

    # Initialize the CLIP Interrogator client.
    client = Client("https://fffiloni-clip-interrogator-2.hf.space/")

    finalList = []
    # Caption every photo in the folder.
    for arquivo in elements[0]:
        caminho_arquivo = os.path.join(elements[1], arquivo)
        result = client.predict(
            caminho_arquivo,
            "best",
            8,
            api_name="/clipi2"
        )
        # Keep only the caption strings that precede the metadata blob.
        newList = []
        for item in result:
            if isinstance(item, str) and "{" in item:
                break
            newList.append(item)
        newString = newList[0] if newList else ""
        finalList.append(newString)

    # Keep only the first clause of each caption.
    resultList = []
    for description in finalList:
        first = description.split(',')
        resultList.append(first[0])
    print(resultList)
    return resultList


def langchain_handle_text(text):
    print(Fore.CYAN + "inside langchain_handle_text")
    os.environ["OPENAI_API_KEY"] = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    llm = OpenAI(temperature=0.3, model_name="gpt-3.5-turbo")
    conversation = ConversationChain(
        llm=llm,
        verbose=True,
        memory=ConversationBufferMemory()
    )
    conversation.predict(
        input=(
            "Given a text and you being an internationally renowned melodist, "
            "create a melody description with instruments and necessary "
            f"transitions according to the context of the text. The text: {text}"
        )
    )
    output = conversation.predict(
        input=(
            "Summarize the melody without removing the necessary instruments "
            "and transitions. The output should be: the melody begins..."
        )
    )
    print(output)
    return output


def eleven_labs(prompt):
    CHUNK_SIZE = 1024
    url = "https://api.elevenlabs.io/v1/text-to-speech/21m00Tcm4TlvDq8ikWAM"
    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": "xxxxxxxxxxxxxxxxxxxxxxxxx"
    }
    data = {
        "text": prompt,
        "model_id": "eleven_multilingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5
        }
    }
    response = requests.post(url, json=data, headers=headers)
    # Log the body only when the request failed (a success body is raw audio).
    if not response.ok:
        print(response.text)
    # Save the synthesized narration.
    with open('narracao.mp3', 'wb') as f:
        for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
            if chunk:
                f.write(chunk)


def check_duration():
    # Load both audio files.
    audio1 = AudioSegment.from_file("audio.mp3", format="mp3")
    audio2 = AudioSegment.from_file("narracao.mp3", format="mp3")
    # len() returns the duration in milliseconds; convert to seconds.
    duração_em_segundos = len(audio1) / 1000
    duração_em_segundos2 = len(audio2) / 1000
    print(f"The music duration is {duração_em_segundos} seconds.")
    print(f"The narration duration is {duração_em_segundos2} seconds.")
    # Return the longer of the two durations.
    return max(duração_em_segundos, duração_em_segundos2)


def merge_audio_text():
    # ffmpeg -y -i audio_1.wav -vn -ar 44100 -ac 2 -b:a 192k audio.mp3
    subprocess.run(['ffmpeg', '-y', '-i', 'audio_1.wav', '-vn', '-ar', '44100',
                    '-ac', '2', '-b:a', '192k', 'audio.mp3'])
    duration = check_duration()
    # ffmpeg -stream_loop -1 -i audio.mp3 -t "$duration" -c:a libmp3lame audio_loop.mp3
    subprocess.run(['ffmpeg', '-stream_loop', '-1', '-i', 'audio.mp3', '-t',
                    str(duration), '-c:a', 'libmp3lame', 'audio_loop.mp3'])
    # ffmpeg -i narracao.mp3 -i audio_loop.mp3
    #   -filter_complex amix=inputs=2:duration=first:dropout_transition=2 output.mp3
    subprocess.run(['ffmpeg', '-i', 'narracao.mp3', '-i', 'audio_loop.mp3',
                    '-filter_complex',
                    'amix=inputs=2:duration=first:dropout_transition=2',
                    'output.mp3'])
    audio_final = '/content/output.mp3'
    return audio_final


def langchain_handle(description):
    print(Fore.CYAN + "inside langchain_handle")
    os.environ["OPENAI_API_KEY"] = "xxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    llm = OpenAI(temperature=0.3, model_name="gpt-3.5-turbo")
    conversation = ConversationChain(
        llm=llm,
        verbose=True,
        memory=ConversationBufferMemory()
    )
    conversation.predict(
        input=(
            "Given a list of phrases and you being a world-renowned melodist, "
            "create a melody based on the context generated by the phrases on "
            "the list, reporting the necessary instruments and their "
            f"transitions. The list: {description}"
        )
    )
    conversation.predict(
        input=(
            "Put the intro and all the scenes together in one phrase. "
            "Give me the output starting with: the melody begins."
        )
    )
    y = conversation.predict(input="Summarize it and start with: the melody begins")
    print(y)
    return y


def music_gen(description):
    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model.to(device)
    processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
    inputs = processor(
        text=[f"{description}"],
        padding=True,
        return_tensors="pt",
    )
    print('before sampling')
    sampling_rate = model.config.audio_encoder.sampling_rate
    print('after sampling')
    audio_values = model.generate(
        **inputs.to(device),
        do_sample=True,
        guidance_scale=3,
        max_new_tokens=1503
    )
    # Audio(audio_values[0].cpu().numpy(), rate=sampling_rate)
    print('saving the audio')
    nome = 'audio_1.wav'
    scipy.io.wavfile.write(nome, rate=sampling_rate,
                           data=audio_values[0, 0].cpu().numpy())
    return "/content/audio_1.wav"


def merge_audio_video():
    # ffmpeg -y -i audio_1.wav -vn -ar 44100 -ac 2 -b:a 192k audio.mp3
    # ffmpeg -y -i input.mp4 -i audio.mp3 -c:v copy -c:a copy output.mp4
    subprocess.run(['ffmpeg', '-y', '-i', 'audio_1.wav', '-vn', '-ar', '44100',
                    '-ac', '2', '-b:a', '192k', 'audio.mp3'])
    # Combine input.mp4 with audio.mp3 into output.mp4.
    subprocess.run(['ffmpeg', '-y', '-i', 'input.mp4', '-i', 'audio.mp3',
                    '-c:v', 'copy', '-c:a', 'copy', 'output.mp4'])


def handle_text(text):
    description = langchain_handle_text(text)
    music_gen(description)  # writes audio_1.wav
    eleven_labs(text)       # writes narracao.mp3
    audio_final = merge_audio_text()
    return audio_final


def download_youtube_video(youtube_link, seconds):
    # Create a YouTube object for the provided link.
    yt = YouTube(youtube_link)
    # Get the 720p video-only stream (customize the resolution if needed).
    video_stream = yt.streams.filter(resolution='720p', only_video=True).first()
    yt.title = clean_string(yt.title)
    # Download the video.
    video_stream.download(output_path='/content', filename=f'{yt.title}.mp4')
    video_path = f"/content/{yt.title}.mp4"
    print(video_path)
    print(yt.length)
    description = frame_list(video_path, seconds)
    final_description = langchain_handle(description)
    music_gen(final_description)  # writes audio_1.wav
    merge_audio_video()
    new_video_path = '/content/output.mp4'
    return new_video_path


iface_1 = gr.Interface(
    download_youtube_video,
    [gr.Textbox(label="Enter YouTube Video Link"),
     gr.Dropdown(
         ["5", "3", "1"],
         label="Seconds",
         info="Extract an image every chosen number of seconds")],
    "video",
)

iface_2 = gr.Interface(
    handle_text,
    gr.Textbox(label="Enter a Text"),
    "audio"
)

# iface_1.launch(share=True, debug=True, enable_queue=True)
demo = gr.TabbedInterface([iface_1, iface_2],
                          ["video-to-SoundClip", "video-to-NarrativeText"])
demo.launch(share=True, debug=True, enable_queue=True)
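# ---------------------------------------------------------------------------
# Setup sketch (an assumption, not part of the original script): the /content
# paths and the ffmpeg calls suggest a Google Colab runtime. Something like
# the following would install what the imports above require; exact package
# versions are not pinned in the source:
#
#   pip install gradio gradio_client langchain openai transformers torch \
#       scipy pydub colorama pytube requests
#   apt-get install -y ffmpeg
#
# The "xxxx..." strings are placeholder credentials: supply real OpenAI and
# ElevenLabs API keys before the LangChain and text-to-speech steps can run.
# ---------------------------------------------------------------------------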