joseluisthepower committed on
Commit
652ee18
·
verified ·
1 Parent(s): aa9793c

prueba nueva app. antes funcionaba.

Browse files
Files changed (1) hide show
  1. app.py +446 -159
app.py CHANGED
@@ -10,9 +10,21 @@ import langdetect
10
  import uuid
11
  import time
12
  import random
 
 
13
 
14
  # --- CONFIGURACIÓN INICIAL ---
15
- print("Starting the program...")
 
 
 
 
 
 
 
 
 
 
16
 
17
  # Carga del modelo en CPU
18
  model_path = "Qwen/Qwen2.5-7B-Instruct"
@@ -20,64 +32,154 @@ print(f"Loading model {model_path}...")
20
  tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
21
  model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True)
22
  model = model.eval()
23
- print("Model successfully loaded.")
24
 
25
- # --- CONFIGURACIΓ“N AVANZADA DE YT-DLP PARA VIMEO ---
26
- def get_enhanced_ydl_opts():
27
- """
28
- ConfiguraciΓ³n optimizada para evitar bloqueos de Vimeo
29
- """
30
- return {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  'format': 'bestaudio/best',
32
  'postprocessors': [{
33
  'key': 'FFmpegExtractAudio',
34
  'preferredcodec': 'wav',
35
  }],
 
36
  'keepvideo': False,
37
 
38
- # === CONFIGURACIONES ANTI-BLOQUEO ===
39
- # User Agent realista (Chrome mΓ‘s reciente)
 
 
 
 
40
  'http_headers': {
41
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
42
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
43
- 'Accept-Language': 'en-US,en;q=0.9,es;q=0.8',
44
  'Accept-Encoding': 'gzip, deflate, br',
45
  'DNT': '1',
46
  'Connection': 'keep-alive',
 
 
 
 
47
  'Upgrade-Insecure-Requests': '1',
48
  },
49
 
50
- # Rate limiting y sleep para parecer humano
51
- 'sleep_interval': random.uniform(2, 5), # Espera aleatoria entre 2-5 segundos
52
- 'max_sleep_interval': 8,
53
- 'sleep_interval_requests': random.uniform(0.5, 2), # Entre requests
54
-
55
- # Configuraciones de red
56
- 'socket_timeout': 60,
57
- 'retries': 5,
58
- 'fragment_retries': 10,
59
- 'retry_sleep_functions': {'http': lambda n: 2 ** n + random.uniform(0, 1)},
60
-
61
- # Bypass de restricciones geogrΓ‘ficas
62
- 'geo_bypass': True,
63
- 'geo_bypass_country': 'US',
64
 
65
  # Configuraciones especΓ­ficas para Vimeo
66
  'extractor_args': {
67
  'vimeo': {
68
- 'client': 'web', # Usar cliente web en lugar de android/ios
69
- 'original_format_policy': 'auto', # PolΓ­tica automΓ‘tica para formatos originales
70
  }
71
  },
72
 
73
- # Opciones adicionales para estabilidad
 
 
 
 
74
  'no_warnings': False,
75
  'ignoreerrors': False,
76
  'abort_on_unavailable_fragments': False,
77
- 'keep_fragments': False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
 
79
- # ImpersonaciΓ³n de navegador (si estΓ‘ disponible)
80
- 'impersonate': 'chrome', # Impersonar Chrome
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  }
82
 
83
  # --- FUNCIONES AUXILIARES ---
@@ -89,92 +191,189 @@ def cleanup_files(*files):
89
  if file and os.path.exists(file):
90
  try:
91
  os.remove(file)
92
- print(f"Removed file: {file}")
93
  except OSError as e:
94
- print(f"Error removing file {file}: {e}")
95
 
96
- def human_like_delay():
97
  """Simula comportamiento humano con delays aleatorios"""
98
- delay = random.uniform(1, 3)
99
- print(f"Waiting {delay:.1f} seconds...")
100
  time.sleep(delay)
101
 
102
- # --- LΓ“GICA PRINCIPAL DE PROCESAMIENTO MEJORADA ---
103
- def download_video_audio_enhanced(url):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  """
105
- FunciΓ³n mejorada para descargar audio de videos con anti-bloqueo
106
  """
107
- print(f"Downloading audio from: {url}")
108
  temp_filename = generate_unique_filename("")
109
  output_path = f"{temp_filename}.wav"
110
 
111
- # Delay inicial para parecer humano
112
- human_like_delay()
113
 
114
- # ConfiguraciΓ³n optimizada
115
- ydl_opts = get_enhanced_ydl_opts()
116
- ydl_opts['outtmpl'] = temp_filename
117
 
118
- max_retries = 3
119
- for attempt in range(max_retries):
120
- try:
121
- print(f"Attempt {attempt + 1}/{max_retries}")
 
 
 
 
122
 
123
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
124
- # Extraer informaciΓ³n primero sin descargar
125
- print("Extracting video information...")
126
- info = ydl.extract_info(url, download=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
- # Verificar si el video estΓ‘ disponible
129
- if not info:
130
- raise Exception("Could not extract video information")
 
131
 
132
- print(f"Video found: {info.get('title', 'Unknown')}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
- # Delay adicional antes de la descarga
135
- human_like_delay()
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
- # Proceder con la descarga
138
- print("Starting download...")
139
- ydl.download([url])
140
-
141
- if os.path.exists(output_path):
142
- print(f"Download completed successfully: {output_path}")
143
- return output_path
144
- else:
145
- raise FileNotFoundError(f"Expected file {output_path} was not found")
 
 
 
146
 
147
- except Exception as e:
148
- print(f"Attempt {attempt + 1} failed: {str(e)}")
149
-
150
- if attempt < max_retries - 1:
151
- # Delay exponencial con jitter entre reintentos
152
- delay = (2 ** attempt) + random.uniform(1, 3)
153
- print(f"Retrying in {delay:.1f} seconds...")
154
- time.sleep(delay)
155
- else:
156
- # Todos los intentos fallaron
157
- raise Exception(f"Failed to download after {max_retries} attempts: {str(e)}")
 
 
 
158
 
159
  def transcribe_audio_enhanced(file_path):
160
  """FunciΓ³n mejorada de transcripciΓ³n con mejor manejo de errores"""
161
- print(f"Starting transcription of file: {file_path}")
162
  temp_audio = None
163
  original_file_to_clean = file_path
164
 
165
  try:
 
 
 
 
 
 
 
 
 
 
166
  # Convertir a WAV si es necesario
167
  if not file_path.endswith('.wav'):
168
- print("Non-WAV file detected. Converting...")
169
  video = mp.VideoFileClip(file_path)
 
 
 
170
  temp_audio = generate_unique_filename(".wav")
171
  video.audio.write_audiofile(temp_audio, verbose=False, logger=None)
172
- video.close() # Cerrar explΓ­citamente
173
  file_path = temp_audio
174
 
175
  output_file = generate_unique_filename(".json")
176
 
177
- # Comando mejorado de Whisper
178
  command = [
179
  "insanely-fast-whisper",
180
  "--file-name", file_path,
@@ -183,66 +382,78 @@ def transcribe_audio_enhanced(file_path):
183
  "--task", "transcribe",
184
  "--timestamp", "chunk",
185
  "--transcript-path", output_file,
186
- "--batch-size", "4", # Reducir batch size para evitar OOM
 
187
  ]
188
 
189
- print(f"Executing transcription command...")
190
  result = subprocess.run(
191
  command,
192
  check=True,
193
  capture_output=True,
194
  text=True,
195
- timeout=600 # Timeout de 10 minutos
196
  )
197
 
198
- # Leer resultado
 
 
199
  if not os.path.exists(output_file):
200
  raise FileNotFoundError("Transcription output file not found")
201
 
 
202
  with open(output_file, "r", encoding='utf-8') as f:
203
  transcription_data = json.load(f)
204
 
205
- result_text = transcription_data.get("text", "")
 
 
206
  if not result_text:
207
- # Fallback: concatenar chunks
208
  chunks = transcription_data.get("chunks", [])
209
- result_text = " ".join([chunk.get("text", "") for chunk in chunks])
 
 
 
 
 
210
 
211
- print("Transcription completed successfully.")
212
  cleanup_files(output_file)
213
- return result_text.strip()
214
 
215
  except subprocess.TimeoutExpired:
216
- print("Transcription timed out")
217
- raise Exception("Transcription process timed out")
218
  except Exception as e:
219
- print(f"Transcription error: {e}")
220
  raise
221
  finally:
222
- # Limpieza mejorada
223
- if temp_audio:
224
  cleanup_files(temp_audio)
225
- if original_file_to_clean != file_path:
226
  cleanup_files(original_file_to_clean)
227
 
228
  def generate_summary_stream(transcription):
229
  """FunciΓ³n mejorada de generaciΓ³n de resumen"""
230
  if not transcription or len(transcription.strip()) < 20:
231
- return "Transcription is too short to summarize."
232
 
233
- print("Generating summary...")
234
 
235
  try:
236
  detected_language = langdetect.detect(transcription)
237
- print(f"Detected language: {detected_language}")
238
  except:
239
- detected_language = "en" # Fallback a inglΓ©s
 
240
 
241
  # Truncar transcripciΓ³n si es muy larga
242
- max_chars = 15000
243
  truncated_text = transcription[:max_chars]
244
  if len(transcription) > max_chars:
245
  truncated_text += "..."
 
246
 
247
  prompt = f"""Please create a comprehensive summary of the following video transcription in {detected_language}.
248
  The summary should be 150-300 words and capture the main points, key ideas, and important details:
@@ -251,20 +462,23 @@ def generate_summary_stream(transcription):
251
 
252
  try:
253
  response, _ = model.chat(tokenizer, prompt, history=[])
254
- print("Summary generated successfully.")
255
  return response
256
  except Exception as e:
257
- print(f"Summary generation error: {e}")
258
- return f"Error generating summary: {str(e)}"
259
 
260
  # --- FUNCIONES DE INTERFAZ MEJORADAS ---
261
  def process_video_url_enhanced(url):
262
- """FunciΓ³n unificada para procesar URLs de video (YouTube, Vimeo, etc.)"""
263
  if not url or not url.strip():
264
- return "Please enter a valid video URL.", ""
265
 
266
  url = url.strip()
267
- print(f"Processing video URL: {url}")
 
 
 
268
 
269
  # Detectar plataforma
270
  platform = "Unknown"
@@ -273,72 +487,114 @@ def process_video_url_enhanced(url):
273
  elif "vimeo.com" in url:
274
  platform = "Vimeo"
275
 
276
- print(f"Detected platform: {platform}")
 
277
 
278
  audio_file = None
279
  try:
280
- # Usar funciΓ³n mejorada de descarga
281
- audio_file = download_video_audio_enhanced(url)
 
 
 
282
  transcription = transcribe_audio_enhanced(audio_file)
283
 
284
  if not transcription:
285
- return "No transcription could be generated from this video.", ""
286
 
287
- return transcription, f"βœ… Successfully processed {platform} video"
 
 
288
 
289
  except Exception as e:
290
  error_msg = str(e)
291
- print(f"Error processing {platform} video: {error_msg}")
292
 
293
- # Mensajes de error mΓ‘s informativos
294
  if "HTTP Error 401" in error_msg:
295
- return "❌ Access denied. The video might be private or require authentication.", ""
296
- elif "HTTP Error 403" in error_msg:
297
- return "❌ Video blocked. Try again in a few minutes or check if the video is publicly accessible.", ""
 
 
298
  elif "HTTP Error 429" in error_msg:
299
- return "❌ Rate limited. Please wait a few minutes before trying again.", ""
 
300
  elif "TLS fingerprint" in error_msg:
301
- return "❌ Connection blocked by security measures. Try again later.", ""
 
 
 
 
 
 
 
 
 
 
 
302
  else:
303
- return f"❌ Error processing video: {error_msg}", ""
304
  finally:
305
- if audio_file:
 
306
  cleanup_files(audio_file)
307
 
308
  def process_uploaded_video_enhanced(video_path):
309
  """FunciΓ³n mejorada para procesar videos subidos"""
310
  if video_path is None:
311
- return "Please upload a video file first.", ""
312
 
313
- print(f"Processing uploaded video: {video_path}")
 
 
 
314
 
315
  try:
 
 
 
 
 
 
 
 
 
 
 
316
  transcription = transcribe_audio_enhanced(video_path)
317
 
318
  if not transcription:
319
- return "No transcription could be generated from this video.", ""
320
 
321
- return transcription, "βœ… Successfully processed uploaded video"
 
 
322
 
323
  except Exception as e:
324
  error_msg = str(e)
325
- print(f"Error processing uploaded video: {error_msg}")
326
- return f"❌ Error processing video: {error_msg}", ""
 
 
 
 
 
 
327
 
328
  # --- CONSTRUCCIΓ“N DE LA INTERFAZ MEJORADA ---
329
- print("Setting up enhanced Gradio interface...")
330
 
331
- with gr.Blocks(theme=gr.themes.Soft(), title="πŸŽ₯ Enhanced Video Transcription") as demo:
332
- gr.Markdown("# πŸŽ₯ Enhanced Video Transcription & AI Summary")
333
- gr.Markdown("""
334
- Upload a video or provide a video URL (YouTube, Vimeo, etc.) to get a transcription and AI-generated summary.
335
 
336
- **✨ Enhanced features:**
337
- - πŸ›‘οΈ Anti-blocking measures for Vimeo and other platforms
338
- - πŸ”„ Automatic retry with exponential backoff
339
- - 🌍 Geographic restriction bypass
340
- - πŸ€– Human-like behavior simulation
341
- - πŸ“Š Better error handling and reporting
342
  """)
343
 
344
  with gr.Tabs():
@@ -346,7 +602,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="πŸŽ₯ Enhanced Video Transcription"
346
  with gr.Row():
347
  url_input = gr.Textbox(
348
  label="Video URL",
349
- placeholder="https://www.youtube.com/watch?v=... or https://vimeo.com/...",
350
  scale=4
351
  )
352
  url_button = gr.Button("πŸš€ Process URL", variant="primary", scale=1)
@@ -360,39 +616,67 @@ with gr.Blocks(theme=gr.themes.Soft(), title="πŸŽ₯ Enhanced Video Transcription"
360
  with gr.Column():
361
  transcription_output = gr.Textbox(
362
  label="πŸ“ Transcription",
363
- lines=12,
364
  interactive=True,
365
  placeholder="Transcription will appear here..."
366
  )
367
  with gr.Column():
368
  summary_output = gr.Textbox(
369
  label="πŸ“Š AI Summary",
370
- lines=12,
371
  placeholder="AI-generated summary will appear here..."
372
  )
373
 
374
  with gr.Row():
375
  status_output = gr.Textbox(
376
- label="πŸ“Š Status",
377
  interactive=False,
378
- placeholder="Ready to process videos..."
 
379
  )
380
  summary_button = gr.Button("πŸ“ Generate Summary", variant="secondary")
381
 
382
- # InformaciΓ³n adicional
383
- with gr.Accordion("ℹ️ Usage Tips", open=False):
384
- gr.Markdown("""
385
- **For best results:**
386
- - βœ… Use public videos (private videos may not work)
387
- - βœ… If you get blocked, wait 5-10 minutes before trying again
388
- - βœ… Vimeo links work best in format: `https://vimeo.com/VIDEO_ID`
389
- - βœ… For YouTube, both long and short URLs are supported
390
- - βœ… The system includes automatic retries with delays to avoid blocks
391
-
392
- **Supported formats:** MP4, AVI, MOV, MKV, WEBM, and most video formats
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
  """)
394
 
395
- # Conexiones de eventos
396
  url_button.click(
397
  fn=process_video_url_enhanced,
398
  inputs=[url_input],
@@ -411,7 +695,10 @@ with gr.Blocks(theme=gr.themes.Soft(), title="πŸŽ₯ Enhanced Video Transcription"
411
  outputs=[summary_output]
412
  )
413
 
414
- print("Launching enhanced Gradio interface...")
 
 
 
415
  demo.launch(
416
  server_name="0.0.0.0",
417
  server_port=7860,
 
10
  import uuid
11
  import time
12
  import random
13
+ import re
14
+ from urllib.parse import urlparse
15
 
16
  # --- CONFIGURACIÓN INICIAL ---
17
+ print("Starting the enhanced program...")
18
+ print("Checking dependencies...")
19
+
20
+ # Verificar si curl-cffi está disponible
21
+ try:
22
+ import curl_cffi
23
+ CURL_CFFI_AVAILABLE = True
24
+ print("βœ… curl-cffi is available - Advanced impersonation enabled")
25
+ except ImportError:
26
+ CURL_CFFI_AVAILABLE = False
27
+ print("⚠️ curl-cffi not available - Using fallback methods")
28
 
29
  # Carga del modelo en CPU
30
  model_path = "Qwen/Qwen2.5-7B-Instruct"
 
32
  tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
33
  model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float16, trust_remote_code=True)
34
  model = model.eval()
35
+ print("βœ… Model successfully loaded.")
36
 
37
+ # --- CONFIGURACIONES ESPECIALIZADAS POR MÉTODO ---
38
# Compiled once at import time. The look-behind keeps look-alike hosts such as
# "notvimeo.com" from matching, and the last pattern stops at '?'/'#' so digits
# inside a query string or fragment are never mistaken for the video ID.
_VIMEO_PLAYER_URL_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'(?<![\w-])vimeo\.com/(\d+)',
        r'(?<![\w-])player\.vimeo\.com/video/(\d+)',
        r'(?<![\w-])vimeo\.com/[^?#]*/(\d+)',
    )
)


def get_vimeo_player_url(vimeo_url):
    """Convert a Vimeo URL to its ``player.vimeo.com`` form.

    Args:
        vimeo_url: URL string in one of the recognised Vimeo shapes
            (``vimeo.com/<id>``, ``player.vimeo.com/video/<id>``,
            ``vimeo.com/<path>/<id>``).

    Returns:
        ``https://player.vimeo.com/video/<id>`` when a numeric video ID is
        found; otherwise the input is returned unchanged (including empty or
        ``None`` input, which previously raised ``TypeError``).
    """
    if not vimeo_url:
        return vimeo_url

    for pattern in _VIMEO_PLAYER_URL_PATTERNS:
        match = pattern.search(vimeo_url)
        if match:
            return f"https://player.vimeo.com/video/{match.group(1)}"

    return vimeo_url
54
+
55
def get_primary_ydl_opts(output_path):
    """Build the primary yt-dlp option set, with browser impersonation when possible.

    Args:
        output_path: Value used for yt-dlp's ``outtmpl`` option.

    Returns:
        dict: Options for ``yt_dlp.YoutubeDL``. The ``impersonate`` key is only
        present when ``CURL_CFFI_AVAILABLE`` is true and yt-dlp exposes
        ``ImpersonateTarget``.
    """
    opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }],
        'outtmpl': output_path,
        'keepvideo': False,

        # Basic network settings
        'socket_timeout': 60,
        'retries': 3,
        'fragment_retries': 5,

        # Browser-like request headers
        'http_headers': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-User': '?1',
            'Upgrade-Insecure-Requests': '1',
        },

        # Rate limiting (randomised once per option set)
        'sleep_interval': random.uniform(3, 6),
        'max_sleep_interval': 10,
        'sleep_interval_requests': random.uniform(1, 2),

        # Vimeo-specific extractor arguments. The Python API expects each value
        # to be a LIST of strings; a bare string is iterated per character.
        'extractor_args': {
            'vimeo': {
                'client': ['web'],
                # Avoid the extra requests for original formats.
                'original_format_policy': ['never'],
            }
        },

        # Geo bypass
        'geo_bypass': True,
        'geo_bypass_country': 'US',

        # Additional settings
        'no_warnings': False,
        'ignoreerrors': False,
        'abort_on_unavailable_fragments': False,
    }

    # The embedding API takes an ImpersonateTarget object here, not the
    # command-line string, so build one when the impersonation API is present.
    target = None
    api_missing = False
    if CURL_CFFI_AVAILABLE:
        try:
            from yt_dlp.networking.impersonate import ImpersonateTarget
            target = ImpersonateTarget.from_str('chrome')
        except ImportError:
            api_missing = True

    if target is not None:
        opts['impersonate'] = target
        print("πŸ” Using Chrome impersonation")
    elif api_missing:
        print("⚠️ Using basic user agent (yt-dlp impersonation API not available)")
    else:
        print("⚠️ Using basic user agent (curl-cffi not available)")

    return opts
117
+
118
def get_fallback_ydl_opts(output_path, method="player"):
    """Build a more conservative yt-dlp option set for when the primary one fails.

    Args:
        output_path: Value used for yt-dlp's ``outtmpl`` option.
        method: ``"android"`` selects the Vimeo ``android`` client; any other
            value (default ``"player"``) selects the ``web`` client.

    Returns:
        dict: Options for ``yt_dlp.YoutubeDL``.
    """
    vimeo_client = 'android' if method == "android" else 'web'

    opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }],
        'outtmpl': output_path,
        'keepvideo': False,

        # More conservative network settings
        'socket_timeout': 120,
        'retries': 2,
        'fragment_retries': 3,

        # Longer pauses between requests (randomised once per option set)
        'sleep_interval': random.uniform(5, 10),
        'max_sleep_interval': 15,
        'sleep_interval_requests': random.uniform(2, 4),

        # Simplified headers
        'http_headers': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'DNT': '1',
        },

        # Method-specific extractor arguments. The Python API expects each
        # value to be a LIST of strings; a bare string is iterated per character.
        'extractor_args': {
            'vimeo': {
                'client': [vimeo_client],
                'original_format_policy': ['never'],
            }
        },

        # Additional settings for stability
        'no_warnings': True,
        'ignoreerrors': True,
        'abort_on_unavailable_fragments': True,
    }

    return opts
163
+
164
def get_generic_ydl_opts(output_path):
    """Build the minimal, last-resort yt-dlp option set.

    Args:
        output_path: Value used for yt-dlp's ``outtmpl`` option.

    Returns:
        dict: Options for ``yt_dlp.YoutubeDL`` with a single retry, a long
        socket timeout and a fixed sleep interval.
    """
    wav_extractor = {'key': 'FFmpegExtractAudio', 'preferredcodec': 'wav'}
    plain_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    generic_opts = dict(
        format='bestaudio/best',
        postprocessors=[wav_extractor],
        outtmpl=output_path,
        keepvideo=False,
        socket_timeout=180,
        retries=1,
        http_headers=plain_headers,
        sleep_interval=10,
        no_warnings=True,
        ignoreerrors=True,
        geo_bypass=True,
    )
    return generic_opts
184
 
185
  # --- FUNCIONES AUXILIARES ---
 
191
  if file and os.path.exists(file):
192
  try:
193
  os.remove(file)
194
+ print(f"πŸ—‘οΈ Removed file: {file}")
195
  except OSError as e:
196
+ print(f"❌ Error removing file {file}: {e}")
197
 
198
def human_like_delay(min_sec=2, max_sec=5):
    """Sleep for a random duration to mimic human pacing.

    Args:
        min_sec: Lower bound of the delay, in seconds.
        max_sec: Upper bound of the delay, in seconds.

    The drawn value is clamped to zero because ``time.sleep`` raises
    ``ValueError`` for negative durations.
    """
    delay = max(0.0, random.uniform(min_sec, max_sec))
    print(f"⏳ Waiting {delay:.1f} seconds...")
    time.sleep(delay)
203
 
204
def is_vimeo_url(url):
    """Return True when *url* points at vimeo.com or one of its subdomains.

    The check is made on the parsed hostname rather than on the raw string, so
    a URL that merely mentions ``vimeo.com`` in its path or query, or a
    look-alike host such as ``notvimeo.com``, is not treated as Vimeo.

    Args:
        url: URL string, with or without a scheme. Empty or non-string input
            yields ``False``.

    Returns:
        bool: Whether the URL's host is ``vimeo.com`` or ``*.vimeo.com``.
    """
    from urllib.parse import urlparse

    if not url or not isinstance(url, str):
        return False

    candidate = url.strip().lower()
    # urlparse only fills in the host when a "//" authority marker is present.
    if "://" not in candidate:
        candidate = "//" + candidate.lstrip("/")

    try:
        host = urlparse(candidate).hostname or ""
    except ValueError:
        return False

    return host == "vimeo.com" or host.endswith(".vimeo.com")
207
+
208
# Compiled once at import time. The look-behind keeps look-alike hosts such as
# "notvimeo.com" from matching, and the last pattern stops at '?'/'#' so digits
# inside a query string or fragment are never mistaken for the video ID.
_VIMEO_ID_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        r'(?<![\w-])vimeo\.com/(\d+)',
        r'(?<![\w-])player\.vimeo\.com/video/(\d+)',
        r'(?<![\w-])vimeo\.com/[^?#]*/(\d+)',
    )
)


def extract_vimeo_id(url):
    """Extract the numeric video ID from a Vimeo URL.

    Args:
        url: URL string in one of the recognised Vimeo shapes
            (``vimeo.com/<id>``, ``player.vimeo.com/video/<id>``,
            ``vimeo.com/<path>/<id>``).

    Returns:
        The ID as a string of digits, or ``None`` when no pattern matches
        (including empty or ``None`` input, which previously raised
        ``TypeError``).
    """
    if not url:
        return None

    for pattern in _VIMEO_ID_PATTERNS:
        match = pattern.search(url)
        if match:
            return match.group(1)
    return None
221
+
222
+ # --- LÓGICA PRINCIPAL MEJORADA CON MÚLTIPLES FALLBACKS ---
223
def download_video_audio_multi_fallback(url):
    """Download a video's audio as WAV, trying several URLs and option sets.

    Every candidate URL (the original plus, for Vimeo, player-URL variants) is
    tried with every option builder in turn until one produces a WAV file
    larger than 1000 bytes.

    Args:
        url: Video URL to download.

    Returns:
        str: Path of the downloaded ``.wav`` file.

    Raises:
        Exception: When every attempt fails. The message ends with the text of
            the last underlying error (also chained as ``__cause__``) so that
            callers matching on substrings such as ``"HTTP Error 403"`` can
            still see it.
    """
    print(f"πŸŽ― Processing URL: {url}")
    temp_filename = generate_unique_filename("")
    output_path = f"{temp_filename}.wav"

    # Initial delay
    human_like_delay(1, 3)

    # Detect Vimeo and prepare alternative URLs
    urls_to_try = [url]
    if is_vimeo_url(url):
        print("🎬 Vimeo video detected - Preparing fallback URLs")
        video_id = extract_vimeo_id(url)
        if video_id:
            player_url = f"https://player.vimeo.com/video/{video_id}"
            if player_url != url:
                urls_to_try.append(player_url)

            # Player URL with embed-style query parameters
            urls_to_try.append(f"https://player.vimeo.com/video/{video_id}?color=ffffff&title=0&byline=0&portrait=0")

    # Option builders, in order of preference
    methods = [
        ("primary", "πŸ” Primary method (with impersonation)", get_primary_ydl_opts),
        ("player", "🌐 Player URL method", get_fallback_ydl_opts),
        ("android", "πŸ“± Android client method", lambda p: get_fallback_ydl_opts(p, "android")),
        ("generic", "πŸ› οΈ Generic fallback method", get_generic_ydl_opts),
    ]

    # Derived from the real work list: a non-Vimeo URL yields 4 attempts, a
    # Vimeo URL up to 12. A fixed "12" made the progress counter misleading.
    max_total_attempts = len(urls_to_try) * len(methods)
    total_attempts = 0
    last_error = None

    for current_url in urls_to_try:
        print(f"\nπŸ”„ Trying URL: {current_url}")

        for method_name, method_desc, get_opts_func in methods:
            total_attempts += 1
            attempts_remain = total_attempts < max_total_attempts
            print(f"\nπŸ“Š Attempt {total_attempts}/{max_total_attempts}")
            print(f"πŸ›‘οΈ Using: {method_desc}")

            try:
                ydl_opts = get_opts_func(temp_filename)

                # Capped exponential backoff before every attempt but the first
                if total_attempts > 1:
                    delay_time = min(2 ** (total_attempts // 3), 15)
                    human_like_delay(delay_time, delay_time + 2)

                with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                    # Extract metadata first; with 'ignoreerrors' enabled a
                    # failed extraction comes back as a falsy value.
                    print("πŸ“‹ Extracting video information...")
                    info = ydl.extract_info(current_url, download=False)
                    if not info:
                        raise Exception("Could not extract video information")

                    title = info.get('title', 'Unknown')
                    duration = info.get('duration', 'Unknown')
                    print(f"βœ… Video found: {title} (Duration: {duration})")

                    # Short pause before downloading
                    human_like_delay(1, 2)

                    print("⬇️ Starting download...")
                    ydl.download([current_url])

                # Verify that the file was actually produced
                if not os.path.exists(output_path):
                    raise FileNotFoundError(f"Expected file {output_path} was not found")

                file_size = os.path.getsize(output_path)
                print(f"βœ… Download successful! File size: {file_size} bytes")

                if file_size <= 1000:  # require more than ~1KB
                    raise Exception("Downloaded file is too small (possible error)")
                return output_path

            except Exception as e:
                last_error = e
                error_msg = str(e).lower()
                print(f"❌ Method '{method_name}' failed: {str(e)}")

                # Error-specific diagnostics
                if "http error 401" in error_msg:
                    print("πŸ” Authentication issue detected")
                elif "http error 403" in error_msg:
                    print("🚫 Access forbidden - likely blocked")
                elif "http error 429" in error_msg:
                    print("⏰ Rate limited - increasing delay")
                    # Extra back-off only helps if another attempt follows.
                    if attempts_remain:
                        human_like_delay(10, 15)
                elif "tls fingerprint" in error_msg:
                    print("πŸ›‘οΈ TLS fingerprinting detected")
                elif "oauth token" in error_msg:
                    print("πŸ”‘ OAuth token issue")

                # Remove any partial file before the next attempt
                if os.path.exists(output_path):
                    cleanup_files(output_path)

                if attempts_remain:
                    print(f"πŸ”„ Trying next method...")

    # Every URL/method combination failed. Keep the original wording and append
    # the last underlying error so it is not lost to the caller.
    raise Exception(
        f"All {total_attempts} download attempts failed. Vimeo may be blocking this IP or the video is not accessible."
        f" Last error: {last_error}"
    ) from last_error
344
 
345
  def transcribe_audio_enhanced(file_path):
346
  """FunciΓ³n mejorada de transcripciΓ³n con mejor manejo de errores"""
347
+ print(f"🎀 Starting transcription of file: {file_path}")
348
  temp_audio = None
349
  original_file_to_clean = file_path
350
 
351
  try:
352
+ # Verificar que el archivo existe y tiene contenido
353
+ if not os.path.exists(file_path):
354
+ raise FileNotFoundError(f"Audio file not found: {file_path}")
355
+
356
+ file_size = os.path.getsize(file_path)
357
+ print(f"πŸ“Š Audio file size: {file_size} bytes")
358
+
359
+ if file_size < 1000:
360
+ raise Exception("Audio file is too small - may be corrupted")
361
+
362
  # Convertir a WAV si es necesario
363
  if not file_path.endswith('.wav'):
364
+ print("πŸ”„ Converting to WAV format...")
365
  video = mp.VideoFileClip(file_path)
366
+ if not video.audio:
367
+ raise Exception("No audio track found in video file")
368
+
369
  temp_audio = generate_unique_filename(".wav")
370
  video.audio.write_audiofile(temp_audio, verbose=False, logger=None)
371
+ video.close()
372
  file_path = temp_audio
373
 
374
  output_file = generate_unique_filename(".json")
375
 
376
+ # Comando de Whisper con configuraciones robustas
377
  command = [
378
  "insanely-fast-whisper",
379
  "--file-name", file_path,
 
382
  "--task", "transcribe",
383
  "--timestamp", "chunk",
384
  "--transcript-path", output_file,
385
+ "--batch-size", "2", # Batch size mΓ‘s pequeΓ±o para estabilidad
386
+ "--hf-token", "dummy", # Token dummy para evitar warnings
387
  ]
388
 
389
+ print(f"πŸ€– Executing transcription...")
390
  result = subprocess.run(
391
  command,
392
  check=True,
393
  capture_output=True,
394
  text=True,
395
+ timeout=900 # 15 minutos de timeout
396
  )
397
 
398
+ print(f"βœ… Transcription command completed")
399
+
400
+ # Verificar que el archivo de salida existe
401
  if not os.path.exists(output_file):
402
  raise FileNotFoundError("Transcription output file not found")
403
 
404
+ # Leer y procesar resultado
405
  with open(output_file, "r", encoding='utf-8') as f:
406
  transcription_data = json.load(f)
407
 
408
+ result_text = transcription_data.get("text", "").strip()
409
+
410
+ # Fallback: concatenar chunks si no hay texto principal
411
  if not result_text:
 
412
  chunks = transcription_data.get("chunks", [])
413
+ if chunks:
414
+ result_text = " ".join([chunk.get("text", "").strip() for chunk in chunks])
415
+
416
+ # Validar resultado
417
+ if not result_text or len(result_text) < 10:
418
+ raise Exception("Transcription produced no meaningful text")
419
 
420
+ print(f"βœ… Transcription completed. Length: {len(result_text)} characters")
421
  cleanup_files(output_file)
422
+ return result_text
423
 
424
  except subprocess.TimeoutExpired:
425
+ print("⏰ Transcription timed out")
426
+ raise Exception("Transcription process timed out (15 minutes)")
427
  except Exception as e:
428
+ print(f"❌ Transcription error: {e}")
429
  raise
430
  finally:
431
+ # Limpieza
432
+ if temp_audio and os.path.exists(temp_audio):
433
  cleanup_files(temp_audio)
434
+ if original_file_to_clean != file_path and os.path.exists(original_file_to_clean):
435
  cleanup_files(original_file_to_clean)
436
 
437
  def generate_summary_stream(transcription):
438
  """FunciΓ³n mejorada de generaciΓ³n de resumen"""
439
  if not transcription or len(transcription.strip()) < 20:
440
+ return "⚠️ Transcription is too short to summarize (less than 20 characters)."
441
 
442
+ print("πŸ€– Generating AI summary...")
443
 
444
  try:
445
  detected_language = langdetect.detect(transcription)
446
+ print(f"🌍 Detected language: {detected_language}")
447
  except:
448
+ detected_language = "en"
449
+ print("🌍 Language detection failed, defaulting to English")
450
 
451
  # Truncar transcripciΓ³n si es muy larga
452
+ max_chars = 12000 # Reducido para evitar problemas de memoria
453
  truncated_text = transcription[:max_chars]
454
  if len(transcription) > max_chars:
455
  truncated_text += "..."
456
+ print(f"πŸ“ Transcription truncated to {max_chars} characters")
457
 
458
  prompt = f"""Please create a comprehensive summary of the following video transcription in {detected_language}.
459
  The summary should be 150-300 words and capture the main points, key ideas, and important details:
 
462
 
463
  try:
464
  response, _ = model.chat(tokenizer, prompt, history=[])
465
+ print("βœ… Summary generated successfully")
466
  return response
467
  except Exception as e:
468
+ print(f"❌ Summary generation error: {e}")
469
+ return f"⚠️ Error generating summary: {str(e)}\n\nOriginal transcription:\n{transcription[:1000]}..."
470
 
471
  # --- FUNCIONES DE INTERFAZ MEJORADAS ---
472
  def process_video_url_enhanced(url):
473
+ """FunciΓ³n mejorada para procesar URLs con diagnΓ³stico detallado"""
474
  if not url or not url.strip():
475
+ return "❌ Please enter a valid video URL.", "⚠️ No URL provided"
476
 
477
  url = url.strip()
478
+ print(f"\n{'='*50}")
479
+ print(f"🎯 PROCESSING VIDEO URL")
480
+ print(f"{'='*50}")
481
+ print(f"URL: {url}")
482
 
483
  # Detectar plataforma
484
  platform = "Unknown"
 
487
  elif "vimeo.com" in url:
488
  platform = "Vimeo"
489
 
490
+ print(f"Platform: {platform}")
491
+ print(f"curl-cffi available: {CURL_CFFI_AVAILABLE}")
492
 
493
  audio_file = None
494
  try:
495
+ # Usar funciΓ³n robusta con mΓΊltiples fallbacks
496
+ print(f"\nπŸš€ Starting download process...")
497
+ audio_file = download_video_audio_multi_fallback(url)
498
+
499
+ print(f"\n🎀 Starting transcription process...")
500
  transcription = transcribe_audio_enhanced(audio_file)
501
 
502
  if not transcription:
503
+ return "❌ No transcription could be generated from this video.", "⚠️ Transcription failed"
504
 
505
+ print(f"\nβœ… Process completed successfully!")
506
+ success_msg = f"βœ… Successfully processed {platform} video ({len(transcription)} chars transcribed)"
507
+ return transcription, success_msg
508
 
509
  except Exception as e:
510
  error_msg = str(e)
511
+ print(f"\n❌ ERROR: {error_msg}")
512
 
513
+ # AnΓ‘lisis de errores mejorado
514
  if "HTTP Error 401" in error_msg:
515
+ return ("❌ ACCESS DENIED: The video might be private, require authentication, or have restricted access. "
516
+ "Try with a public video or check if the URL is correct."), "πŸ” Authentication Required"
517
+ elif "HTTP Error 403" in error_msg or "blocked" in error_msg.lower():
518
+ return ("❌ BLOCKED: Your IP or this server has been temporarily blocked by Vimeo. "
519
+ "This is common with datacenter IPs. Please try again in 10-15 minutes."), "🚫 Temporarily Blocked"
520
  elif "HTTP Error 429" in error_msg:
521
+ return ("❌ RATE LIMITED: Too many requests sent to Vimeo. "
522
+ "Please wait 5-10 minutes before trying again."), "⏰ Rate Limited"
523
  elif "TLS fingerprint" in error_msg:
524
+ return ("❌ TLS BLOCKED: Vimeo detected automated access. "
525
+ f"curl-cffi status: {'βœ… Available' if CURL_CFFI_AVAILABLE else '❌ Missing'}. "
526
+ "Try again later or contact support."), "πŸ›‘οΈ Security Block"
527
+ elif "oauth token" in error_msg or "Bad Request" in error_msg:
528
+ return ("❌ API ERROR: Vimeo's API is experiencing issues or the video format is not supported. "
529
+ "Try with a different Vimeo video."), "πŸ”‘ API Issue"
530
+ elif "not accessible" in error_msg.lower():
531
+ return ("❌ VIDEO NOT ACCESSIBLE: All download methods failed. The video might be: "
532
+ "1) Private/Password protected, 2) Geo-restricted, 3) Deleted, or 4) Not a valid video URL."), "🚫 Not Accessible"
533
+ elif "timeout" in error_msg.lower():
534
+ return ("❌ TIMEOUT: The process took too long. This might be due to: "
535
+ "1) Very long video, 2) Network issues, or 3) Server overload. Try with a shorter video."), "⏰ Timeout"
536
  else:
537
+ return f"❌ UNEXPECTED ERROR: {error_msg}", "❌ Unknown Error"
538
  finally:
539
+ # Limpieza final
540
+ if audio_file and os.path.exists(audio_file):
541
  cleanup_files(audio_file)
542
 
543
def process_uploaded_video_enhanced(video_path):
    """Transcribe an uploaded video file and report a short status.

    Args:
        video_path: Filesystem path of the uploaded video, or ``None`` when
            the user has not uploaded anything.

    Returns:
        A ``(text, status)`` tuple of strings. On success ``text`` is the
        transcription; on failure it is a user-facing error message.
        ``status`` is a short label for the status box.
    """
    if video_path is None:
        return "❌ Please upload a video file first.", "⚠️ No file uploaded"

    # Files smaller than this many bytes are treated as empty or corrupted.
    min_file_size = 1000

    print(f"\n{'='*50}")
    print("📤 PROCESSING UPLOADED VIDEO")
    print(f"{'='*50}")
    print(f"File path: {video_path}")

    try:
        # Validate the upload before handing it to the transcriber.
        if not os.path.exists(video_path):
            return "❌ Uploaded file not found.", "❌ File not found"

        file_size = os.path.getsize(video_path)
        print(f"File size: {file_size} bytes")

        if file_size < min_file_size:
            return "❌ Uploaded file is too small or corrupted.", "❌ Invalid file"

        print("🎤 Starting transcription...")
        transcription = transcribe_audio_enhanced(video_path)

        if not transcription:
            return (
                "❌ No transcription could be generated from this video.",
                "⚠️ Transcription failed",
            )

        print("✅ Process completed successfully!")
        success_msg = (
            f"✅ Successfully processed uploaded video "
            f"({len(transcription)} chars transcribed)"
        )
        return transcription, success_msg

    except Exception as e:
        # UI boundary: turn any failure into a readable message instead of
        # letting the exception propagate to the Gradio callback.
        error_msg = str(e)
        print(f"❌ ERROR: {error_msg}")

        if "No audio track" in error_msg:
            return (
                "❌ NO AUDIO: The uploaded video doesn't contain an audio track.",
                "🔇 No Audio",
            )
        if "timeout" in error_msg.lower():
            return (
                "❌ TIMEOUT: Video processing took too long. Try with a shorter video.",
                "⏰ Timeout",
            )
        return f"❌ ERROR: {error_msg}", "❌ Processing Error"
584
 
585
  # --- CONSTRUCCIÓN DE LA INTERFAZ MEJORADA ---
586
+ print("🎨 Setting up enhanced Gradio interface...")
587
 
588
+ with gr.Blocks(theme=gr.themes.Soft(), title="πŸŽ₯ Anti-Block Video Transcription") as demo:
589
+ gr.Markdown("# πŸŽ₯ Anti-Block Video Transcription & AI Summary")
590
+ gr.Markdown(f"""
591
+ Advanced video transcription with **anti-blocking technology** for Vimeo and other platforms.
592
 
593
+ **πŸ›‘οΈ Current Status:**
594
+ - curl-cffi (Advanced Impersonation): {'βœ… Available' if CURL_CFFI_AVAILABLE else '❌ Not Available'}
595
+ - Multiple Fallback Methods: βœ… Enabled
596
+ - Rate Limiting Protection: βœ… Enabled
597
+ - TLS Fingerprint Evasion: {'βœ… Enabled' if CURL_CFFI_AVAILABLE else '⚠️ Basic Protection'}
 
598
  """)
599
 
600
  with gr.Tabs():
 
602
  with gr.Row():
603
  url_input = gr.Textbox(
604
  label="Video URL",
605
+ placeholder="https://vimeo.com/123456789 or https://www.youtube.com/watch?v=...",
606
  scale=4
607
  )
608
  url_button = gr.Button("πŸš€ Process URL", variant="primary", scale=1)
 
616
  with gr.Column():
617
  transcription_output = gr.Textbox(
618
  label="πŸ“ Transcription",
619
+ lines=15,
620
  interactive=True,
621
  placeholder="Transcription will appear here..."
622
  )
623
  with gr.Column():
624
  summary_output = gr.Textbox(
625
  label="πŸ“Š AI Summary",
626
+ lines=15,
627
  placeholder="AI-generated summary will appear here..."
628
  )
629
 
630
  with gr.Row():
631
  status_output = gr.Textbox(
632
+ label="πŸ“Š Status & Diagnostics",
633
  interactive=False,
634
+ placeholder="Ready to process videos...",
635
+ lines=2
636
  )
637
  summary_button = gr.Button("πŸ“ Generate Summary", variant="secondary")
638
 
639
+ # Sección de ayuda expandida
640
+ with gr.Accordion("ℹ️ Troubleshooting & Tips", open=False):
641
+ gr.Markdown(f"""
642
+ ## πŸ› οΈ System Status
643
+ - **curl-cffi Library**: {'βœ… Installed' if CURL_CFFI_AVAILABLE else '❌ Missing (using fallback methods)'}
644
+ - **Fallback Methods**: βœ… 4 different methods available
645
+ - **Error Recovery**: βœ… Automatic retry with exponential backoff
646
+
647
+ ## 🎯 Best Practices for Vimeo
648
+ 1. **Public Videos Work Best**: Private/password-protected videos may fail
649
+ 2. **Wait Between Requests**: If blocked, wait 10-15 minutes before retrying
650
+ 3. **Use Standard URLs**: Format like `https://vimeo.com/123456789`
651
+ 4. **Check Video Accessibility**: Ensure video plays in your browser first
652
+
653
+ ## 🚨 Common Error Solutions
654
+
655
+ **"Access Denied" or "HTTP 401"**
656
+ - Video is private or requires login
657
+ - Try with a public video
658
+
659
+ **"Blocked" or "HTTP 403"**
660
+ - Temporary IP block (common with datacenter IPs)
661
+ - Wait 10-15 minutes and try again
662
+
663
+ **"Rate Limited" or "HTTP 429"**
664
+ - Too many requests sent
665
+ - Wait 5-10 minutes before retrying
666
+
667
+ **"TLS Fingerprint Blocked"**
668
+ - Advanced anti-bot protection detected
669
+ - System will try multiple fallback methods automatically
670
+
671
+ **"All download attempts failed"**
672
+ - Video may be geo-restricted or deleted
673
+ - Try a different video to test if service is working
674
+
675
+ ## πŸ“ž Support
676
+ If problems persist, check if the video plays normally in your browser and try with a different public video.
677
  """)
678
 
679
+ # Eventos de la interfaz
680
  url_button.click(
681
  fn=process_video_url_enhanced,
682
  inputs=[url_input],
 
695
  outputs=[summary_output]
696
  )
697
 
698
+ print("🌟 Enhanced Gradio interface ready!")
699
+ print(f"πŸ”§ curl-cffi status: {'Available' if CURL_CFFI_AVAILABLE else 'Not available'}")
700
+ print("πŸš€ Launching application...")
701
+
702
  demo.launch(
703
  server_name="0.0.0.0",
704
  server_port=7860,