sanch1tx committed on
Commit
840863b
·
verified ·
1 Parent(s): 1fd4ce4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -33
app.py CHANGED
@@ -278,14 +278,20 @@ def scrape_moviesdrive_details(page_url):
278
  if not main_content: return None
279
 
280
  title = soup.find('h1', class_='page-title').get_text(strip=True)
 
 
281
  poster_tag = main_content.select_one('.entry-content img.aligncenter, .entry-content p > img')
282
  poster = poster_tag['src'] if poster_tag else "N/A"
283
 
284
  all_h3s = main_content.find_all('h3')
285
- storyline_h3, screenshots_h3 = None, None
 
 
286
  for h3 in all_h3s:
287
- if 'Storyline' in h3.get_text(): storyline_h3 = h3
288
- elif 'Screen-Shots' in h3.get_text(): screenshots_h3 = h3
 
 
289
 
290
  storyline = storyline_h3.find_next_sibling('div').get_text(strip=True) if storyline_h3 and storyline_h3.find_next_sibling('div') else "N/A"
291
 
@@ -296,9 +302,11 @@ def scrape_moviesdrive_details(page_url):
296
  screenshots = [img['src'] for img in screenshot_container.find_all('img')]
297
 
298
  download_options = []
 
299
  for link_container in main_content.select('.entry-content h5, .entry-content p'):
300
  link_tag = link_container.find('a')
301
  if link_tag and link_tag.get('href'):
 
302
  text_lower = link_container.get_text(strip=True).lower()
303
  if '480p' not in text_lower and 'telegram' not in text_lower:
304
  download_options.append({
@@ -306,7 +314,11 @@ def scrape_moviesdrive_details(page_url):
306
  'url': link_tag['href']
307
  })
308
 
309
- return {'title': title, 'poster': poster, 'storyline': storyline, 'screenshots': screenshots, 'download_options': download_options}
 
 
 
 
310
  except Exception as e:
311
  print(f"[MoviesDrive] An error occurred during detail parsing: {e}")
312
  return None
@@ -323,66 +335,90 @@ def scrape_hblinks_page(page_url):
323
  response = requests.get(page_url, headers=headers, timeout=10)
324
  response.raise_for_status()
325
  soup = BeautifulSoup(response.content, 'html.parser')
 
326
  content = soup.select_one('.entry-content')
327
- if not content: return []
 
328
 
329
  current_group = None
 
330
  for tag in content.find_all(['h3', 'h5']):
331
  tag_text = tag.get_text(strip=True)
332
- links_in_tag = [{'provider': a.get_text(strip=True) or "Download", 'url': a['href']} for a in tag.find_all('a', href=True) if a.get('href') and 't.me' not in a['href']]
333
 
 
 
 
 
 
 
 
 
334
  if links_in_tag:
 
 
335
  if current_group is None or len(links_in_tag) > 1 or not re.search(r'drive|cloud|instant', tag_text, re.I):
336
  group_title = tag_text
 
337
  if len(links_in_tag) == 1:
338
  group_title = re.sub(r'\[?'+re.escape(links_in_tag[0]['provider'])+r'\]?', '', group_title).strip()
339
- current_group = {'quality_title': group_title or "Links", 'links': []}
 
340
  groups.append(current_group)
 
341
  current_group['links'].extend(links_in_tag)
 
 
342
  elif not tag.find('a'):
343
  current_group = {'quality_title': tag_text, 'links': []}
344
  groups.append(current_group)
345
 
346
  except requests.exceptions.RequestException as e:
347
  print(f"[HBLinks] Scraping failed for {page_url}: {e}")
 
 
348
  return [g for g in groups if g['links']]
349
 
 
350
  def scrape_mdrive_page(page_url):
351
- """REWRITTEN: Scrapes final cloud links from an mdrive.today page, correctly grouping by episode title."""
352
  headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
353
  groups = []
 
354
  try:
355
  response = requests.get(page_url, headers=headers, timeout=10)
356
  response.raise_for_status()
357
  soup = BeautifulSoup(response.content, 'html.parser')
358
  content = soup.select_one('.entry-content')
359
- if not content: return []
360
-
361
- current_title = "Download Links"
362
- current_links = []
363
-
364
- for element in content.find_all(['h5', 'hr']):
365
- if element.name == 'h5':
366
- if not element.find('a'): # This is a title tag
367
- if current_links: # Save the previous group
368
- groups.append({'quality_title': current_title, 'links': current_links})
369
- current_links = []
370
- current_title = element.get_text(strip=True)
371
- else: # This is a link tag
372
- for a_tag in element.find_all('a', href=True):
373
- href = a_tag.get('href')
374
- provider = a_tag.get_text(strip=True)
375
- if href and provider and 't.me' not in href and 'moviesdrive' not in href:
376
- current_links.append({'provider': provider, 'url': href})
377
-
378
- if current_links: # Append the last collected group
379
- groups.append({'quality_title': current_title, 'links': current_links})
 
380
 
381
  except requests.exceptions.RequestException as e:
382
  print(f"[MDrive Page] Scraping failed for {page_url}: {e}")
383
 
 
384
  return [g for g in groups if g.get('links')]
385
 
 
386
  def rot13(s):
387
  result = []
388
  for char in s:
@@ -529,14 +565,19 @@ def bypass():
529
  if 'hblinks.dad' in current_url:
530
  print(f"Secondary hblinks bypass required for: '{current_url}'")
531
  groups = scrape_hblinks_page(current_url)
532
- return jsonify({"download_groups": groups}) if groups else (jsonify({"error": "Failed to scrape final links from hblinks."}), 500)
 
 
 
533
 
534
  elif 'mdrive.today' in current_url:
535
  print(f"Secondary mdrive bypass required for: '{current_url}'")
536
  groups = scrape_mdrive_page(current_url)
537
- return jsonify({"download_groups": groups}) if groups else (jsonify({"error": "Failed to scrape final links from mdrive."}), 500)
 
 
 
538
 
539
  # If no secondary bypass was needed, it's the final URL
540
  print(f"Direct link after potential first bypass: '{current_url}'")
541
- return jsonify({"final_url": current_url})
542
-
 
278
  if not main_content: return None
279
 
280
  title = soup.find('h1', class_='page-title').get_text(strip=True)
281
+
282
+ # Improved poster selector
283
  poster_tag = main_content.select_one('.entry-content img.aligncenter, .entry-content p > img')
284
  poster = poster_tag['src'] if poster_tag else "N/A"
285
 
286
  all_h3s = main_content.find_all('h3')
287
+ storyline_h3 = None
288
+ screenshots_h3 = None
289
+
290
  for h3 in all_h3s:
291
+ if 'Storyline' in h3.get_text():
292
+ storyline_h3 = h3
293
+ elif 'Screen-Shots' in h3.get_text():
294
+ screenshots_h3 = h3
295
 
296
  storyline = storyline_h3.find_next_sibling('div').get_text(strip=True) if storyline_h3 and storyline_h3.find_next_sibling('div') else "N/A"
297
 
 
302
  screenshots = [img['src'] for img in screenshot_container.find_all('img')]
303
 
304
  download_options = []
305
+ # Find all link tags, which are typically in h5 or p tags for this provider
306
  for link_container in main_content.select('.entry-content h5, .entry-content p'):
307
  link_tag = link_container.find('a')
308
  if link_tag and link_tag.get('href'):
309
+ # Exclude 480p links and non-download links
310
  text_lower = link_container.get_text(strip=True).lower()
311
  if '480p' not in text_lower and 'telegram' not in text_lower:
312
  download_options.append({
 
314
  'url': link_tag['href']
315
  })
316
 
317
+ return {
318
+ 'title': title, 'poster': poster, 'storyline': storyline,
319
+ 'screenshots': screenshots, 'download_options': download_options
320
+ }
321
+
322
  except Exception as e:
323
  print(f"[MoviesDrive] An error occurred during detail parsing: {e}")
324
  return None
 
335
  response = requests.get(page_url, headers=headers, timeout=10)
336
  response.raise_for_status()
337
  soup = BeautifulSoup(response.content, 'html.parser')
338
+
339
  content = soup.select_one('.entry-content')
340
+ if not content:
341
+ return []
342
 
343
  current_group = None
344
+ # Iterate over all relevant tags (h3, h5) that might contain titles or links
345
  for tag in content.find_all(['h3', 'h5']):
346
  tag_text = tag.get_text(strip=True)
 
347
 
348
+ # Check for links first
349
+ links_in_tag = []
350
+ for a_tag in tag.find_all('a', href=True):
351
+ href = a_tag.get('href')
352
+ if href and 't.me' not in href and 'hblinks' not in href:
353
+ provider = a_tag.get_text(strip=True) or "Download"
354
+ links_in_tag.append({'provider': provider, 'url': href})
355
+
356
  if links_in_tag:
357
+ # If we find links, we need to decide which group they belong to.
358
+ # If there's no current group, or the tag text is a title, create a new one.
359
  if current_group is None or len(links_in_tag) > 1 or not re.search(r'drive|cloud|instant', tag_text, re.I):
360
  group_title = tag_text
361
+ # Clean up title if it contains the provider name
362
  if len(links_in_tag) == 1:
363
  group_title = re.sub(r'\[?'+re.escape(links_in_tag[0]['provider'])+r'\]?', '', group_title).strip()
364
+
365
+ current_group = {'quality_title': group_title, 'links': []}
366
  groups.append(current_group)
367
+
368
  current_group['links'].extend(links_in_tag)
369
+
370
+ # If the tag has no links but looks like a title, it's a header for the next links.
371
  elif not tag.find('a'):
372
  current_group = {'quality_title': tag_text, 'links': []}
373
  groups.append(current_group)
374
 
375
  except requests.exceptions.RequestException as e:
376
  print(f"[HBLinks] Scraping failed for {page_url}: {e}")
377
+
378
+ # Clean up any groups that were created but never got links.
379
  return [g for g in groups if g['links']]
380
 
381
+
382
  def scrape_mdrive_page(page_url):
383
+ """Scrapes the final cloud links from an mdrive.today page and groups them by episode."""
384
  headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
385
  groups = []
386
+ current_group = None
387
  try:
388
  response = requests.get(page_url, headers=headers, timeout=10)
389
  response.raise_for_status()
390
  soup = BeautifulSoup(response.content, 'html.parser')
391
  content = soup.select_one('.entry-content')
392
+ if not content:
393
+ return []
394
+
395
+ for tag in content.find_all('h5'):
396
+ tag_text = tag.get_text(strip=True)
397
+
398
+ # Check if it's an episode/quality header. This marks the start of a new group.
399
+ if re.search(r'Ep\d+|Season\s\d+', tag_text, re.IGNORECASE) or re.search(r'\d{3,4}p', tag_text):
400
+ current_group = {
401
+ 'quality_title': tag_text,
402
+ 'links': []
403
+ }
404
+ groups.append(current_group)
405
+
406
+ # Check for links within this tag and add them to the current group
407
+ links_in_tag = tag.find_all('a', href=True)
408
+ if links_in_tag and current_group:
409
+ for a_tag in links_in_tag:
410
+ href = a_tag.get('href')
411
+ provider = a_tag.get_text(strip=True)
412
+ if href and provider and 't.me' not in href and 'moviesdrive' not in href:
413
+ current_group['links'].append({'provider': provider, 'url': href})
414
 
415
  except requests.exceptions.RequestException as e:
416
  print(f"[MDrive Page] Scraping failed for {page_url}: {e}")
417
 
418
+ # Clean up empty groups that might have been created
419
  return [g for g in groups if g.get('links')]
420
 
421
+
422
  def rot13(s):
423
  result = []
424
  for char in s:
 
565
  if 'hblinks.dad' in current_url:
566
  print(f"Secondary hblinks bypass required for: '{current_url}'")
567
  groups = scrape_hblinks_page(current_url)
568
+ if groups:
569
+ return jsonify({"download_groups": groups})
570
+ else:
571
+ return jsonify({"error": "Failed to scrape final links from hblinks."}), 500
572
 
573
  elif 'mdrive.today' in current_url:
574
  print(f"Secondary mdrive bypass required for: '{current_url}'")
575
  groups = scrape_mdrive_page(current_url)
576
+ if groups:
577
+ return jsonify({"download_groups": groups})
578
+ else:
579
+ return jsonify({"error": "Failed to scrape final links from mdrive."}), 500
580
 
581
  # If no secondary bypass was needed, it's the final URL
582
  print(f"Direct link after potential first bypass: '{current_url}'")
583
+ return jsonify({"final_url": current_url})