File size: 9,691 Bytes
41096cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71e96f9
41096cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71e96f9
41096cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71e96f9
41096cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
import os
import shutil
from gradio_client import Client
from langchain.llms import OpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationBufferMemory
from transformers import MusicgenForConditionalGeneration
import torch
from transformers import AutoProcessor
import scipy
import gradio as gr
import colorama
from pydub import AudioSegment
from colorama import Fore
import subprocess

import re

def clean_string(string):
    """Drop every character that is not an ASCII letter, digit, or dot.

    Used to sanitize YouTube titles before turning them into filenames.
    """
    # Equivalent to re.sub(r'[^a-zA-Z0-9.]', '', string): keep ASCII
    # alphanumerics and the literal dot, discard everything else.
    return "".join(
        ch for ch in string
        if (ch.isascii() and ch.isalnum()) or ch == "."
    )

def rename_file(video_path):
    """Rename the uploaded video, found by basename in the CWD, to ``input.mp4``.

    Bug fix: the original used ``video_path.split("/")[2]``, which only works
    for paths of the exact shape ``/content/<name>`` and raises IndexError or
    picks the wrong component for any other depth. ``os.path.basename``
    handles every path.

    Parameters
    ----------
    video_path : str
        Path of the uploaded video; only its final component is used, and the
        file with that name is expected to exist in the current directory.
    """
    uploaded_filename = os.path.basename(video_path)
    os.rename(uploaded_filename, "input.mp4")



def making_dir():
    """Collect the extracted ``frames_*`` images into ``fotopastas`` and
    return them sorted by frame number.

    Bug fix: the original created ``fotopastas`` relative to the current
    directory but then scanned the hard-coded ``/content/fotopastas`` — the
    two only coincide when the CWD happens to be ``/content`` (Colab). One
    absolute path derived from the CWD is now used for both steps.

    Returns
    -------
    list
        ``[sorted_filenames, folder_path]`` — image names ordered by the
        numeric index embedded in ``frames_<idx>.<ext>``, and the absolute
        path of the folder holding them.
    """
    # Folder that holds every frame ffmpeg extracted from the video.
    pasta = os.path.abspath("fotopastas")
    os.makedirs(pasta, exist_ok=True)

    # Move the frames ffmpeg dropped in the CWD into the folder.
    for image in [f for f in os.listdir() if f.startswith("frames_")]:
        shutil.move(image, os.path.join(pasta, image))

    # Image extensions worth processing.
    extensoes_de_imagem = ('.jpg', '.png', '.jpeg')

    # Sort numerically on the index between "frames_" and the extension so
    # that frames_10 sorts after frames_2 (a plain string sort would not).
    arquivos_ordenados = sorted(
        [a for a in os.listdir(pasta) if a.lower().endswith(extensoes_de_imagem)],
        key=lambda a: int(a.split("_")[1].split(".")[0]),
    )

    return [arquivos_ordenados, pasta]

def frame_list(video_path, seconds):
    """Extract one frame every *seconds* from the video and caption each one.

    Pipeline: rename the upload to ``input.mp4``, sample frames with ffmpeg,
    gather them via :func:`making_dir`, then send each frame to the hosted
    CLIP Interrogator space for a text description.

    Parameters
    ----------
    video_path : str
        Path of the uploaded video file.
    seconds : str or int
        Sampling interval — one frame is kept every this many seconds.

    Returns
    -------
    list[str]
        One short caption (first comma-separated clause) per sampled frame.
    """
    rename_file(video_path)

    # ffmpeg -i input.mp4 -vf "fps=1/$seconds" -q:v 2 frames_%03d.jpg
    command = [
        'ffmpeg',
        '-i', 'input.mp4',
        '-vf', f'fps=1/{seconds}',
        '-q:v', '2',
        'frames_%03d.jpg',
    ]
    subprocess.run(command)

    # Frames collected and sorted; elements[0] = filenames, elements[1] = folder.
    elements = making_dir()

    # Hosted CLIP Interrogator space: image in, text description out.
    # (Fix: removed a redundant in-function `from gradio_client import Client`
    # that shadowed the module-level import.)
    client = Client("https://fffiloni-clip-interrogator-2.hf.space/")

    finalList = []
    for arquivo in elements[0]:
        caminho_arquivo = os.path.join(elements[1], arquivo)
        result = client.predict(
            caminho_arquivo,
            "best",
            8,
            api_name="/clipi2"
        )
        # Keep only the leading string entries of the payload; everything from
        # the first "{"-bearing item onward appears to be metadata we skip.
        newList = []
        for item in result:
            if isinstance(item, str) and "{" in item:
                break
            newList.append(item)

        finalList.append(newList[0] if newList else "")

    # Keep only the first comma-separated clause of each caption.
    resultList = [description.split(',')[0] for description in finalList]
    print(resultList)
    return resultList


def langchain_handle_text(text):
    """Ask GPT-3.5 (via LangChain) for a melody description matching *text*.

    Two conversation turns: the first drafts a melody with instruments and
    transitions for the text, the second condenses it into one summary that
    starts with "the melody begins".

    Security fix: the API key is no longer hard-coded in source; it must be
    provided through the OPENAI_API_KEY environment variable.
    """
    print(Fore.CYAN + "to no lang")
    if "OPENAI_API_KEY" not in os.environ:
        raise RuntimeError("Set the OPENAI_API_KEY environment variable")
    llm = OpenAI(temperature=0.3, model_name="gpt-3.5-turbo")
    conversation = ConversationChain(
        llm=llm,
        verbose=True,
        memory=ConversationBufferMemory(),
    )

    conversation.predict(input=(
        "Given a text and you being an internationally renowned melodist, "
        "create a melody description with instruments and necessary "
        f"transitions according to the context of the text. The text:{text}"
    ))
    # Typo fix in the prompt: "otuput" -> "output".
    output = conversation.predict(input=(
        "Summarize the melody without removing the necessary instruments and "
        "transitions. the output should be : the melody begins..."
    ))
    print(output)

    return output


def eleven_labs(prompt):
    """Synthesize *prompt* as speech via the ElevenLabs TTS API and save it
    to ``narracao.mp3`` in the current directory.

    Security fix: the API key is read from the XI_API_KEY environment
    variable instead of being hard-coded in source.
    Robustness fix: raise on an HTTP error instead of silently writing the
    error payload to disk as if it were audio; the response is also streamed
    so the MP3 is never held fully in memory.
    """
    import requests

    CHUNK_SIZE = 1024
    # Voice id 21m00Tcm4TlvDq8ikWAM baked into the endpoint path.
    url = "https://api.elevenlabs.io/v1/text-to-speech/21m00Tcm4TlvDq8ikWAM"

    headers = {
        "Accept": "audio/mpeg",
        "Content-Type": "application/json",
        "xi-api-key": os.environ.get("XI_API_KEY", ""),
    }

    data = {
        "text": prompt,
        "model_id": "eleven_multilingual_v1",
        "voice_settings": {
            "stability": 0.5,
            "similarity_boost": 0.5,
        },
    }

    response = requests.post(url, json=data, headers=headers, stream=True)
    response.raise_for_status()
    with open('narracao.mp3', 'wb') as f:
        for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
            if chunk:
                f.write(chunk)


def check_duration():
    """Return the longer duration, in seconds, of ``audio.mp3`` (melody)
    versus ``narracao.mp3`` (narration)."""
    # Load both tracks from the current directory.
    melody = AudioSegment.from_file("audio.mp3", format="mp3")
    narration = AudioSegment.from_file("narracao.mp3", format="mp3")

    # pydub's len() is in milliseconds; convert to seconds.
    melody_secs = len(melody) / 1000
    narration_secs = len(narration) / 1000

    print(f"A duração do áudio é de {melody_secs} segundos.")
    print(f"A duração do áudio é de {narration_secs} segundos.")

    # The mix must last as long as the longer of the two.
    return max(melody_secs, narration_secs)


def merge_audio_text():
    """Loop the generated melody to cover the narration and mix the two
    into ``output.mp3``; returns the final file path."""
    # ffmpeg -y -i audio_1.wav -vn -ar 44100 -ac 2 -b:a 192k audio.mp3
    wav_to_mp3 = ['ffmpeg', '-y', '-i', 'audio_1.wav', '-vn', '-ar', '44100',
                  '-ac', '2', '-b:a', '192k', 'audio.mp3']
    subprocess.run(wav_to_mp3)

    duration = check_duration()

    # ffmpeg -stream_loop -1 -i audio.mp3 -t "$duration" -c:a libmp3lame audio_loop.mp3
    loop_melody = ['ffmpeg', '-stream_loop', '-1', '-i', 'audio.mp3',
                   '-t', str(duration), '-c:a', 'libmp3lame', 'audio_loop.mp3']
    subprocess.run(loop_melody)

    # ffmpeg -i narracao.mp3 -i audio_loop.mp3 -filter_complex amix=inputs=2:duration=first:dropout_transition=2 output.mp3
    mix = ['ffmpeg', '-i', 'narracao.mp3', '-i', 'audio_loop.mp3',
           '-filter_complex', 'amix=inputs=2:duration=first:dropout_transition=2',
           'output.mp3']
    subprocess.run(mix)

    return '/content/output.mp3'


def langchain_handle(description):
    """Turn the list of frame captions into a single melody description via
    GPT-3.5 (LangChain ConversationChain with buffer memory).

    Three conversation turns: draft a melody from the caption list, merge
    intro and scenes into one phrase, then summarize starting with
    "the melody begins".

    Security fix: the API key is no longer hard-coded in source; it must be
    provided through the OPENAI_API_KEY environment variable.
    """
    print(Fore.CYAN + "to no lang")
    if "OPENAI_API_KEY" not in os.environ:
        raise RuntimeError("Set the OPENAI_API_KEY environment variable")
    llm = OpenAI(temperature=0.3, model_name="gpt-3.5-turbo")
    conversation = ConversationChain(
        llm=llm,
        verbose=True,
        memory=ConversationBufferMemory(),
    )

    conversation.predict(input=(
        "given a list of phrases and you being a world-renowned melodist, "
        "create a melody based on the context generated by the phrases on the "
        "list, reporting the necessary instruments and their transitions. "
        f"The list:{description}"
    ))
    # Typo fixes in the prompts: "output star with" -> "output starting with",
    # "Summarize the and starts" -> "Summarize the melody and start".
    conversation.predict(input=(
        "put the intro and all the scenes together in one phrase. "
        "Give me the output starting with: the melody begins "
    ))
    y = conversation.predict(input='Summarize the melody and start with: the melody begins')
    print(y)
    return y

def music_gen(description):
    """Generate audio from *description* with MusicGen and save it as
    ``audio_1.wav``; returns the saved file's path.

    Bug fix: ``import scipy`` at the top of the file does not load the
    ``scipy.io.wavfile`` submodule, so the save call could raise
    AttributeError — the submodule is now imported explicitly.
    """
    import scipy.io.wavfile

    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model.to(device)
    processor = AutoProcessor.from_pretrained("facebook/musicgen-small")

    inputs = processor(
        text=[str(description)],
        padding=True,
        return_tensors="pt",
    )
    print('antes do sampling')
    sampling_rate = model.config.audio_encoder.sampling_rate
    print('depois do sampling')

    # max_new_tokens=1503 controls the generated clip length.
    audio_values = model.generate(
        **inputs.to(device), do_sample=True, guidance_scale=3, max_new_tokens=1503
    )

    print('vou salvar o audio')

    nome = 'audio_1.wav'
    scipy.io.wavfile.write(nome, rate=sampling_rate, data=audio_values[0, 0].cpu().numpy())

    return "/content/audio_1.wav"


def merge_audio_video():
    """Convert the generated WAV to MP3 and mux it with ``input.mp4`` into
    ``output.mp4`` (streams copied, no re-encode)."""
    # ffmpeg -y -i audio_1.wav -vn -ar 44100 -ac 2 -b:a 192k audio.mp3
    wav_to_mp3 = ['ffmpeg', '-y', '-i', 'audio_1.wav', '-vn', '-ar', '44100',
                  '-ac', '2', '-b:a', '192k', 'audio.mp3']

    # ffmpeg -y -i input.mp4 -i audio.mp3 -c:v copy -c:a copy output.mp4
    mux = ['ffmpeg', '-y', '-i', 'input.mp4', '-i', 'audio.mp3',
           '-c:v', 'copy', '-c:a', 'copy', 'output.mp4']

    for cmd in (wav_to_mp3, mux):
        subprocess.run(cmd)


def handle_text(text):
    """Full text-to-audio pipeline: derive a melody description, generate the
    music, narrate the text, and return the mixed audio's path."""
    melody_description = langchain_handle_text(text)
    music_gen(melody_description)
    eleven_labs(text)
    return merge_audio_text()

import gradio as gr
from pytube import YouTube
def download_youtube_video(youtube_link, seconds):
    """Download a YouTube video, caption sampled frames, generate a matching
    soundtrack, mux it back in, and return the resulting video's path.

    Parameters
    ----------
    youtube_link : str
        URL of the YouTube video.
    seconds : str
        Sampling interval chosen in the UI — one frame per this many seconds.

    Robustness fix: the original assumed a 720p video-only stream always
    exists and crashed with AttributeError otherwise; we now fall back to the
    highest available video-only resolution and fail with a clear message if
    none exists.
    """
    yt = YouTube(youtube_link)

    # Prefer 720p video-only, else the best available video-only stream.
    video_stream = yt.streams.filter(resolution='720p', only_video=True).first()
    if video_stream is None:
        video_stream = (yt.streams.filter(only_video=True)
                        .order_by('resolution').desc().first())
    if video_stream is None:
        raise RuntimeError(f"No downloadable video stream found for {youtube_link}")

    # Sanitize the title so it is safe to use as a filename.
    yt.title = clean_string(yt.title)
    video_stream.download(output_path='/content', filename=f'{yt.title}.mp4')

    video_path = f"/content/{yt.title}.mp4"
    print(video_path)
    print(yt.length)

    description = frame_list(video_path, seconds)
    final_description = langchain_handle(description)
    music_gen(final_description)
    merge_audio_video()
    return '/content/output.mp4'





# Tab 1: YouTube link -> video re-scored with a generated soundtrack.
iface_1 = gr.Interface(
    download_youtube_video,
    [gr.Textbox(label="Enter YouTube Video Link"),
     gr.Dropdown( ["5", "3", "1"], label="Seconds", info="Extract an image every chosen number of seconds")],
    "video",

)
# Tab 2: free text -> narrated audio mixed with a generated melody.
iface_2 = gr.Interface(
    handle_text,
    gr.Textbox(label="Enter a Text"),
    "audio"
)


# NOTE(review): the tab labels below look swapped/misnamed relative to what
# each interface actually does (iface_2 is text->audio, not video->text) —
# confirm the intended wording before shipping.
# iface_1.launch(share = True,debug=True,enable_queue=True)
demo = gr.TabbedInterface([iface_1, iface_2], ["video-to-SoundClip", "video-to-NarrativeText"])
demo.launch(share=True,debug=True,enable_queue=True)