RafaG committed on
Commit
13b7641
·
verified ·
1 Parent(s): c105f5a

Update scripts/download_video.py

Browse files
Files changed (1) hide show
  1. scripts/download_video.py +274 -273
scripts/download_video.py CHANGED
@@ -1,274 +1,275 @@
1
- import os
2
- import re
3
- import yt_dlp
4
- import sys
5
-
6
- def sanitize_filename(name):
7
- """Remove caracteres inválidos para nomes de arquivos/pastas."""
8
- cleaned = re.sub(r'[\\/*?:"<>|]', "", name)
9
- cleaned = cleaned.strip()
10
- return cleaned
11
-
12
- def progress_hook(d):
13
- if d['status'] == 'downloading':
14
- try:
15
- p = d.get('_percent_str', '').replace('%','')
16
- print(f"[download] {p}% - {d.get('_eta_str', 'N/A')} remaining", flush=True)
17
- except:
18
- pass
19
- elif d['status'] == 'finished':
20
- print(f"[download] Download concluído: {d['filename']}", flush=True)
21
-
22
- def download(url, base_root="VIRALS", download_subs=True, quality="best"):
23
- # 1. Extrair informações do vídeo para pegar o título
24
- print("Extraindo informações do vídeo...")
25
- title = None
26
-
27
- # ... (Keep existing title extraction logic) ...
28
- # Instead of repeating it effectively, I will rely on the diff to keep it or re-write it if I have to replace the whole block.
29
- # Since replace_file_content works on line ranges, I should be careful.
30
- # Let's assume I'm replacing the whole function body or significant parts.
31
-
32
- # Tentativa 1: Com cookies
33
- try:
34
- with yt_dlp.YoutubeDL({'quiet': True, 'no_warnings': True, 'cookiesfrombrowser': ('chrome',)}) as ydl:
35
- info = ydl.extract_info(url, download=False)
36
- title = info.get('title')
37
- except Exception as e:
38
- print(f"Aviso: Falha ao extrair info com cookies: {e}")
39
-
40
- # Tentativa 2: Sem cookies
41
- if not title:
42
- try:
43
- with yt_dlp.YoutubeDL({'quiet': True, 'no_warnings': True}) as ydl:
44
- info = ydl.extract_info(url, download=False)
45
- title = info.get('title')
46
- except Exception as e:
47
- print(f"Erro ao obter informações do vídeo (sem cookies): {e}")
48
-
49
- # Fallback final
50
- if title:
51
- safe_title = sanitize_filename(title)
52
- print(f"Título detectado: {title}")
53
- else:
54
- print("AVISO: Título não pôde ser obtido. Usando 'Unknown_Video'.")
55
- safe_title = "Unknown_Video"
56
-
57
- # 2. Criar estrutura de pastas
58
- project_folder = os.path.join(base_root, safe_title)
59
- os.makedirs(project_folder, exist_ok=True)
60
-
61
- # Caminho final do vídeo
62
- output_filename = 'input'
63
- output_path_base = os.path.join(project_folder, output_filename)
64
- final_video_path = f"{output_path_base}.mp4"
65
-
66
- # Verificação inteligente
67
- if os.path.exists(final_video_path):
68
- if os.path.getsize(final_video_path) > 1024:
69
- print(f"Vídeo já existe em: {final_video_path}")
70
- print("Pulando download e reutilizando arquivo local.")
71
- return final_video_path, project_folder
72
- else:
73
- print("Arquivo existente encontrado mas parece corrompido/vazio. Baixando novamente...")
74
- try:
75
- os.remove(final_video_path)
76
- except:
77
- pass
78
-
79
- # Limpeza de temp
80
- temp_path = f"{output_path_base}.temp.mp4"
81
- if os.path.exists(temp_path):
82
- try:
83
- os.remove(temp_path)
84
- except:
85
- pass
86
-
87
- # Mapeamento de Qualidade
88
- quality_map = {
89
- "best": 'bestvideo+bestaudio/best',
90
- "1080p": 'bestvideo[height<=1080]+bestaudio/best[height<=1080]',
91
- "720p": 'bestvideo[height<=720]+bestaudio/best[height<=720]',
92
- "480p": 'bestvideo[height<=480]+bestaudio/best[height<=480]'
93
- }
94
- selected_format = quality_map.get(quality, 'bestvideo+bestaudio/best')
95
- print(f"Configurando qualidade de download: {quality} -> {selected_format}")
96
-
97
- ydl_opts = {
98
- 'format': selected_format,
99
- 'overwrites': True,
100
- 'outtmpl': output_path_base,
101
- 'postprocessor_args': [
102
- '-movflags', 'faststart'
103
- ],
104
- 'merge_output_format': 'mp4',
105
- 'progress_hooks': [progress_hook],
106
- # Opções de Legenda
107
- 'writesubtitles': download_subs,
108
- 'writeautomaticsub': download_subs,
109
- 'subtitleslangs': ['pt.*', 'en.*', 'sp.*'], # Prioritize generic PT, EN, SP
110
- 'http_headers': {
111
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
112
- },
113
- 'skip_download': False,
114
- 'quiet': False,
115
- 'no_warnings': False,
116
- }
117
-
118
-
119
-
120
- if download_subs:
121
- ydl_opts['postprocessors'] = [{
122
- 'key': 'FFmpegSubtitlesConvertor',
123
- 'format': 'srt',
124
- }]
125
-
126
- print(f"Baixando vídeo para: {project_folder}...")
127
-
128
- # Tentativa 1: Com configuração original
129
- try:
130
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
131
- ydl.download([url])
132
- except yt_dlp.utils.DownloadError as e:
133
- error_str = str(e)
134
- if download_subs and ("Unable to download video subtitles" in error_str or "429" in error_str):
135
- print(f"\nAviso: Erro ao baixar legendas ({e}).")
136
- print("Tentando novamente APENAS o vídeo (sem legendas)...")
137
-
138
- ydl_opts['writesubtitles'] = False
139
- ydl_opts['writeautomaticsub'] = False
140
- ydl_opts['postprocessors'] = [p for p in ydl_opts.get('postprocessors', []) if 'Subtitle' not in p.get('key', '')]
141
-
142
- try:
143
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
144
- ydl.download([url])
145
- except Exception as e2:
146
- print(f"Erro fatal na segunda tentativa: {e2}")
147
- raise
148
- elif "is not a valid URL" in error_str:
149
- print("Erro: o link inserido não é válido.")
150
- raise
151
- else:
152
- print(f"Erro no download: {e}")
153
- raise
154
- except Exception as e:
155
- print(f"Erro inesperado: {e}")
156
- raise
157
-
158
- # RENOMEAR LEGENDA PARA PADRÃO (input.vtt ou input.srt)
159
- # Se for VTT, converte para SRT para garantir compatibilidade.
160
- try:
161
- import glob
162
- # Pega a primeira que encontrar
163
- potential_subs = glob.glob(os.path.join(project_folder, "input.*.vtt")) + glob.glob(os.path.join(project_folder, "input.*.srt"))
164
-
165
- if potential_subs:
166
- best_sub = potential_subs[0]
167
- ext = os.path.splitext(best_sub)[1]
168
- new_name = os.path.join(project_folder, "input.srt") # Vamos padronizar tudo para .srt
169
-
170
- if ext.lower() == '.vtt':
171
- print(f"Formatando legenda VTT complexa ({os.path.basename(best_sub)}) para SRT limpo...")
172
- try:
173
- with open(best_sub, 'r', encoding='utf-8') as f:
174
- lines = f.readlines()
175
-
176
- srt_content = []
177
- counter = 1
178
-
179
- seen_texts = set()
180
- last_text = ""
181
-
182
- for line in lines:
183
- clean_line = line.strip()
184
- # Ignora Headers e Metadados do VTT/Youtube
185
- if clean_line.startswith("WEBVTT") or \
186
- clean_line.startswith("X-TIMESTAMP") or \
187
- clean_line.startswith("NOTE") or \
188
- clean_line.startswith("Kind:") or \
189
- clean_line.startswith("Language:"):
190
- continue
191
-
192
- if "-->" in clean_line:
193
- # Parse Timestamp
194
- parts = clean_line.split("-->")
195
- start = parts[0].strip()
196
- # Remove tags de posicionamento "align:start position:0%"
197
- end = parts[1].strip().split(' ')[0]
198
-
199
- def fix_time(t):
200
- t = t.replace('.', ',')
201
- if t.count(':') == 1:
202
- t = "00:" + t
203
- return t
204
-
205
- current_start = fix_time(start)
206
- current_end = fix_time(end)
207
-
208
- elif clean_line:
209
- # Texto: remover tags complexas <00:00:00.560><c> etc
210
- # O YouTube usa formato karaoke. Ex: "Quanto<...> custa<...>"
211
- # Precisamos do texto limpo.
212
- text = re.sub(r'<[^>]+>', '', clean_line).strip()
213
-
214
- if not text: continue
215
-
216
- # Lógica para remover duplicatas do estilo "Roll-up" ou "Karaoke"
217
- # O YouTube repete a linha anterior às vezes.
218
- # Ex:
219
- # 1: "Quanto custa"
220
- # 2: "Quanto custa\nQuantos quilos"
221
-
222
- # Vamos pegar apenas a ULTIMA linha se tiver quebras
223
- lines_in_text = text.split('\n')
224
- final_line = lines_in_text[-1].strip()
225
-
226
- if not final_line: continue
227
-
228
- # Filtro de duplicidade consecutivo
229
- if final_line == last_text:
230
- continue
231
-
232
- # Evita blocos ultra curtos (glitch de 10ms) que repetem texto
233
- # Mas aqui estamos processando texto.
234
-
235
- srt_content.append(f"{counter}\n")
236
- srt_content.append(f"{current_start} --> {current_end}\n")
237
- srt_content.append(f"{final_line}\n\n")
238
-
239
- last_text = final_line
240
- counter += 1
241
-
242
- with open(new_name, 'w', encoding='utf-8') as f_out:
243
- f_out.writelines(srt_content)
244
-
245
- print(f"Legenda convertida e limpa: {new_name}")
246
- try: os.remove(best_sub)
247
- except: pass
248
-
249
- except Exception as e_conv:
250
- print(f"Falha ao converter VTT: {e_conv}. Mantendo original.")
251
- # Fallback: rename apenas
252
- new_name_fallback = os.path.join(project_folder, "input.vtt")
253
- if os.path.exists(new_name_fallback) and new_name_fallback != best_sub:
254
- try: os.remove(new_name_fallback)
255
- except: pass
256
- os.rename(best_sub, new_name_fallback)
257
-
258
- else:
259
- # Já é SRT, só renomeia
260
- if os.path.exists(new_name) and new_name != best_sub:
261
- try: os.remove(new_name)
262
- except: pass
263
- os.rename(best_sub, new_name)
264
- print(f"Legenda SRT renomeada para: {new_name}")
265
-
266
- # Limpa sobras
267
- for extra in potential_subs[1:]:
268
- try: os.remove(extra)
269
- except: pass
270
-
271
- except Exception as e_ren:
272
- print(f"Erro ao processar legendas: {e_ren}")
273
-
 
274
  return final_video_path, project_folder
 
1
+ import os
2
+ import re
3
+ import yt_dlp
4
+ import sys
5
+
6
+ def sanitize_filename(name):
7
+ """Remove caracteres inválidos para nomes de arquivos/pastas."""
8
+ cleaned = re.sub(r'[\\/*?:"<>|]', "", name)
9
+ cleaned = cleaned.strip()
10
+ return cleaned
11
+
12
+ def progress_hook(d):
13
+ if d['status'] == 'downloading':
14
+ try:
15
+ p = d.get('_percent_str', '').replace('%','')
16
+ print(f"[download] {p}% - {d.get('_eta_str', 'N/A')} remaining", flush=True)
17
+ except:
18
+ pass
19
+ elif d['status'] == 'finished':
20
+ print(f"[download] Download concluído: {d['filename']}", flush=True)
21
+
22
+ def download(url, base_root="VIRALS", download_subs=True, quality="best"):
23
+ # 1. Extrair informações do vídeo para pegar o título
24
+ print("Extraindo informações do vídeo...")
25
+ title = None
26
+
27
+ # ... (Keep existing title extraction logic) ...
28
+ # Instead of repeating it effectively, I will rely on the diff to keep it or re-write it if I have to replace the whole block.
29
+ # Since replace_file_content works on line ranges, I should be careful.
30
+ # Let's assume I'm replacing the whole function body or significant parts.
31
+
32
+ # Tentativa 1: Com cookies
33
+ try:
34
+ with yt_dlp.YoutubeDL({'quiet': True, 'no_warnings': True, 'cookiesfrombrowser': ('chrome',)}) as ydl:
35
+ info = ydl.extract_info(url, download=False)
36
+ title = info.get('title')
37
+ except Exception as e:
38
+ print(f"Aviso: Falha ao extrair info com cookies: {e}")
39
+
40
+ # Tentativa 2: Sem cookies
41
+ if not title:
42
+ try:
43
+ with yt_dlp.YoutubeDL({'quiet': True, 'no_warnings': True}) as ydl:
44
+ info = ydl.extract_info(url, download=False)
45
+ title = info.get('title')
46
+ except Exception as e:
47
+ print(f"Erro ao obter informações do vídeo (sem cookies): {e}")
48
+
49
+ # Fallback final
50
+ if title:
51
+ safe_title = sanitize_filename(title)
52
+ print(f"Título detectado: {title}")
53
+ else:
54
+ print("AVISO: Título não pôde ser obtido. Usando 'Unknown_Video'.")
55
+ safe_title = "Unknown_Video"
56
+
57
+ # 2. Criar estrutura de pastas
58
+ project_folder = os.path.join(base_root, safe_title)
59
+ os.makedirs(project_folder, exist_ok=True)
60
+
61
+ # Caminho final do vídeo
62
+ output_filename = 'input'
63
+ output_path_base = os.path.join(project_folder, output_filename)
64
+ final_video_path = f"{output_path_base}.mp4"
65
+
66
+ # Verificação inteligente
67
+ if os.path.exists(final_video_path):
68
+ if os.path.getsize(final_video_path) > 1024:
69
+ print(f"Vídeo já existe em: {final_video_path}")
70
+ print("Pulando download e reutilizando arquivo local.")
71
+ return final_video_path, project_folder
72
+ else:
73
+ print("Arquivo existente encontrado mas parece corrompido/vazio. Baixando novamente...")
74
+ try:
75
+ os.remove(final_video_path)
76
+ except:
77
+ pass
78
+
79
+ # Limpeza de temp
80
+ temp_path = f"{output_path_base}.temp.mp4"
81
+ if os.path.exists(temp_path):
82
+ try:
83
+ os.remove(temp_path)
84
+ except:
85
+ pass
86
+
87
+ # Mapeamento de Qualidade
88
+ quality_map = {
89
+ "best": 'bestvideo+bestaudio/best',
90
+ "1080p": 'bestvideo[height<=1080]+bestaudio/best[height<=1080]',
91
+ "720p": 'bestvideo[height<=720]+bestaudio/best[height<=720]',
92
+ "480p": 'bestvideo[height<=480]+bestaudio/best[height<=480]'
93
+ }
94
+ selected_format = quality_map.get(quality, 'bestvideo+bestaudio/best')
95
+ print(f"Configurando qualidade de download: {quality} -> {selected_format}")
96
+
97
+ ydl_opts = {
98
+ 'format': selected_format,
99
+ 'overwrites': True,
100
+ 'outtmpl': output_path_base,
101
+ 'postprocessor_args': [
102
+ '-movflags', 'faststart'
103
+ ],
104
+ 'merge_output_format': 'mp4',
105
+ 'progress_hooks': [progress_hook],
106
+ # Opções de Legenda
107
+ 'writesubtitles': download_subs,
108
+ 'writeautomaticsub': download_subs,
109
+ 'subtitleslangs': ['pt.*', 'en.*', 'sp.*'], # Prioritize generic PT, EN, SP
110
+ 'http_headers': {
111
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
112
+ },
113
+ 'skip_download': False,
114
+ 'quiet': False,
115
+ 'no_warnings': False,
116
+ 'force_ipv4': True,
117
+ }
118
+
119
+
120
+
121
+ if download_subs:
122
+ ydl_opts['postprocessors'] = [{
123
+ 'key': 'FFmpegSubtitlesConvertor',
124
+ 'format': 'srt',
125
+ }]
126
+
127
+ print(f"Baixando vídeo para: {project_folder}...")
128
+
129
+ # Tentativa 1: Com configuração original
130
+ try:
131
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
132
+ ydl.download([url])
133
+ except yt_dlp.utils.DownloadError as e:
134
+ error_str = str(e)
135
+ if download_subs and ("Unable to download video subtitles" in error_str or "429" in error_str):
136
+ print(f"\nAviso: Erro ao baixar legendas ({e}).")
137
+ print("Tentando novamente APENAS o vídeo (sem legendas)...")
138
+
139
+ ydl_opts['writesubtitles'] = False
140
+ ydl_opts['writeautomaticsub'] = False
141
+ ydl_opts['postprocessors'] = [p for p in ydl_opts.get('postprocessors', []) if 'Subtitle' not in p.get('key', '')]
142
+
143
+ try:
144
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
145
+ ydl.download([url])
146
+ except Exception as e2:
147
+ print(f"Erro fatal na segunda tentativa: {e2}")
148
+ raise
149
+ elif "is not a valid URL" in error_str:
150
+ print("Erro: o link inserido não é válido.")
151
+ raise
152
+ else:
153
+ print(f"Erro no download: {e}")
154
+ raise
155
+ except Exception as e:
156
+ print(f"Erro inesperado: {e}")
157
+ raise
158
+
159
+ # RENOMEAR LEGENDA PARA PADRÃO (input.vtt ou input.srt)
160
+ # Se for VTT, converte para SRT para garantir compatibilidade.
161
+ try:
162
+ import glob
163
+ # Pega a primeira que encontrar
164
+ potential_subs = glob.glob(os.path.join(project_folder, "input.*.vtt")) + glob.glob(os.path.join(project_folder, "input.*.srt"))
165
+
166
+ if potential_subs:
167
+ best_sub = potential_subs[0]
168
+ ext = os.path.splitext(best_sub)[1]
169
+ new_name = os.path.join(project_folder, "input.srt") # Vamos padronizar tudo para .srt
170
+
171
+ if ext.lower() == '.vtt':
172
+ print(f"Formatando legenda VTT complexa ({os.path.basename(best_sub)}) para SRT limpo...")
173
+ try:
174
+ with open(best_sub, 'r', encoding='utf-8') as f:
175
+ lines = f.readlines()
176
+
177
+ srt_content = []
178
+ counter = 1
179
+
180
+ seen_texts = set()
181
+ last_text = ""
182
+
183
+ for line in lines:
184
+ clean_line = line.strip()
185
+ # Ignora Headers e Metadados do VTT/Youtube
186
+ if clean_line.startswith("WEBVTT") or \
187
+ clean_line.startswith("X-TIMESTAMP") or \
188
+ clean_line.startswith("NOTE") or \
189
+ clean_line.startswith("Kind:") or \
190
+ clean_line.startswith("Language:"):
191
+ continue
192
+
193
+ if "-->" in clean_line:
194
+ # Parse Timestamp
195
+ parts = clean_line.split("-->")
196
+ start = parts[0].strip()
197
+ # Remove tags de posicionamento "align:start position:0%"
198
+ end = parts[1].strip().split(' ')[0]
199
+
200
+ def fix_time(t):
201
+ t = t.replace('.', ',')
202
+ if t.count(':') == 1:
203
+ t = "00:" + t
204
+ return t
205
+
206
+ current_start = fix_time(start)
207
+ current_end = fix_time(end)
208
+
209
+ elif clean_line:
210
+ # Texto: remover tags complexas <00:00:00.560><c> etc
211
+ # O YouTube usa formato karaoke. Ex: "Quanto<...> custa<...>"
212
+ # Precisamos do texto limpo.
213
+ text = re.sub(r'<[^>]+>', '', clean_line).strip()
214
+
215
+ if not text: continue
216
+
217
+ # Lógica para remover duplicatas do estilo "Roll-up" ou "Karaoke"
218
+ # O YouTube repete a linha anterior às vezes.
219
+ # Ex:
220
+ # 1: "Quanto custa"
221
+ # 2: "Quanto custa\nQuantos quilos"
222
+
223
+ # Vamos pegar apenas a ULTIMA linha se tiver quebras
224
+ lines_in_text = text.split('\n')
225
+ final_line = lines_in_text[-1].strip()
226
+
227
+ if not final_line: continue
228
+
229
+ # Filtro de duplicidade consecutivo
230
+ if final_line == last_text:
231
+ continue
232
+
233
+ # Evita blocos ultra curtos (glitch de 10ms) que repetem texto
234
+ # Mas aqui estamos processando texto.
235
+
236
+ srt_content.append(f"{counter}\n")
237
+ srt_content.append(f"{current_start} --> {current_end}\n")
238
+ srt_content.append(f"{final_line}\n\n")
239
+
240
+ last_text = final_line
241
+ counter += 1
242
+
243
+ with open(new_name, 'w', encoding='utf-8') as f_out:
244
+ f_out.writelines(srt_content)
245
+
246
+ print(f"Legenda convertida e limpa: {new_name}")
247
+ try: os.remove(best_sub)
248
+ except: pass
249
+
250
+ except Exception as e_conv:
251
+ print(f"Falha ao converter VTT: {e_conv}. Mantendo original.")
252
+ # Fallback: rename apenas
253
+ new_name_fallback = os.path.join(project_folder, "input.vtt")
254
+ if os.path.exists(new_name_fallback) and new_name_fallback != best_sub:
255
+ try: os.remove(new_name_fallback)
256
+ except: pass
257
+ os.rename(best_sub, new_name_fallback)
258
+
259
+ else:
260
+ # é SRT, renomeia
261
+ if os.path.exists(new_name) and new_name != best_sub:
262
+ try: os.remove(new_name)
263
+ except: pass
264
+ os.rename(best_sub, new_name)
265
+ print(f"Legenda SRT renomeada para: {new_name}")
266
+
267
+ # Limpa sobras
268
+ for extra in potential_subs[1:]:
269
+ try: os.remove(extra)
270
+ except: pass
271
+
272
+ except Exception as e_ren:
273
+ print(f"Erro ao processar legendas: {e_ren}")
274
+
275
  return final_video_path, project_folder