Update app.py
app.py
CHANGED
@@ -278,14 +278,20 @@ def scrape_moviesdrive_details(page_url):
         if not main_content: return None
 
         title = soup.find('h1', class_='page-title').get_text(strip=True)
+
+        # Improved poster selector
         poster_tag = main_content.select_one('.entry-content img.aligncenter, .entry-content p > img')
         poster = poster_tag['src'] if poster_tag else "N/A"
 
         all_h3s = main_content.find_all('h3')
+        storyline_h3 = None
+        screenshots_h3 = None
+
         for h3 in all_h3s:
+            if 'Storyline' in h3.get_text():
+                storyline_h3 = h3
+            elif 'Screen-Shots' in h3.get_text():
+                screenshots_h3 = h3
 
         storyline = storyline_h3.find_next_sibling('div').get_text(strip=True) if storyline_h3 and storyline_h3.find_next_sibling('div') else "N/A"
 
@@ -296,9 +302,11 @@ def scrape_moviesdrive_details(page_url):
         screenshots = [img['src'] for img in screenshot_container.find_all('img')]
 
         download_options = []
+        # Find all link tags, which are typically in h5 or p tags for this provider
         for link_container in main_content.select('.entry-content h5, .entry-content p'):
             link_tag = link_container.find('a')
             if link_tag and link_tag.get('href'):
+                # Exclude 480p links and non-download links
                 text_lower = link_container.get_text(strip=True).lower()
                 if '480p' not in text_lower and 'telegram' not in text_lower:
                     download_options.append({
@@ -306,7 +314,11 @@ def scrape_moviesdrive_details(page_url):
                         'url': link_tag['href']
                     })
 
+        return {
+            'title': title, 'poster': poster, 'storyline': storyline,
+            'screenshots': screenshots, 'download_options': download_options
+        }
+
     except Exception as e:
         print(f"[MoviesDrive] An error occurred during detail parsing: {e}")
     return None
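Note: the parsing flow above can be exercised on its own. The following is a minimal sketch of the new heading scan and its "N/A" fallback, assuming BeautifulSoup is installed; the sample HTML is invented, not taken from a real MoviesDrive page.

# Minimal sketch of the new h3 scan; the sample HTML below is invented.
from bs4 import BeautifulSoup

sample_html = """
<div class="entry-content">
  <h3>Storyline</h3><div>A thief plans one last heist.</div>
  <h3>Screen-Shots</h3><div><img src="https://example.com/shot1.jpg"></div>
</div>
"""
soup = BeautifulSoup(sample_html, "html.parser")

storyline_h3 = None
screenshots_h3 = None
for h3 in soup.find_all("h3"):
    if "Storyline" in h3.get_text():
        storyline_h3 = h3
    elif "Screen-Shots" in h3.get_text():
        screenshots_h3 = h3

# Same fallback as the diff: "N/A" when the heading or its sibling <div> is missing.
storyline = (storyline_h3.find_next_sibling("div").get_text(strip=True)
             if storyline_h3 and storyline_h3.find_next_sibling("div") else "N/A")
print(storyline)  # -> A thief plans one last heist.

This "N/A" fallback pattern is what keeps scrape_moviesdrive_details from raising on pages that omit a Storyline or Screen-Shots section.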
@@ -323,66 +335,90 @@ def scrape_hblinks_page(page_url):
         response = requests.get(page_url, headers=headers, timeout=10)
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
+
         content = soup.select_one('.entry-content')
+        if not content:
+            return []
 
         current_group = None
+        # Iterate over all relevant tags (h3, h5) that might contain titles or links
         for tag in content.find_all(['h3', 'h5']):
             tag_text = tag.get_text(strip=True)
-            links_in_tag = [{'provider': a.get_text(strip=True) or "Download", 'url': a['href']} for a in tag.find_all('a', href=True) if a.get('href') and 't.me' not in a['href']]
 
+            # Check for links first
+            links_in_tag = []
+            for a_tag in tag.find_all('a', href=True):
+                href = a_tag.get('href')
+                if href and 't.me' not in href and 'hblinks' not in href:
+                    provider = a_tag.get_text(strip=True) or "Download"
+                    links_in_tag.append({'provider': provider, 'url': href})
+
             if links_in_tag:
+                # If we find links, we need to decide which group they belong to.
+                # If there's no current group, or the tag text is a title, create a new one.
                 if current_group is None or len(links_in_tag) > 1 or not re.search(r'drive|cloud|instant', tag_text, re.I):
                     group_title = tag_text
+                    # Clean up title if it contains the provider name
                     if len(links_in_tag) == 1:
                         group_title = re.sub(r'\[?'+re.escape(links_in_tag[0]['provider'])+r'\]?', '', group_title).strip()
+
+                    current_group = {'quality_title': group_title, 'links': []}
                     groups.append(current_group)
+
                 current_group['links'].extend(links_in_tag)
+
+            # If the tag has no links but looks like a title, it's a header for the next links.
             elif not tag.find('a'):
                 current_group = {'quality_title': tag_text, 'links': []}
                 groups.append(current_group)
 
     except requests.exceptions.RequestException as e:
         print(f"[HBLinks] Scraping failed for {page_url}: {e}")
+
+    # Clean up any groups that were created but never got links.
     return [g for g in groups if g['links']]
 
+
 def scrape_mdrive_page(page_url):
+    """Scrapes the final cloud links from an mdrive.today page and groups them by episode."""
     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
     groups = []
+    current_group = None
     try:
         response = requests.get(page_url, headers=headers, timeout=10)
         response.raise_for_status()
         soup = BeautifulSoup(response.content, 'html.parser')
         content = soup.select_one('.entry-content')
+        if not content:
+            return []
+
+        for tag in content.find_all('h5'):
+            tag_text = tag.get_text(strip=True)
+
+            # Check if it's an episode/quality header. This marks the start of a new group.
+            if re.search(r'Ep\d+|Season\s\d+', tag_text, re.IGNORECASE) or re.search(r'\d{3,4}p', tag_text):
+                current_group = {
+                    'quality_title': tag_text,
+                    'links': []
+                }
+                groups.append(current_group)
+
+            # Check for links within this tag and add them to the current group
+            links_in_tag = tag.find_all('a', href=True)
+            if links_in_tag and current_group:
+                for a_tag in links_in_tag:
+                    href = a_tag.get('href')
+                    provider = a_tag.get_text(strip=True)
+                    if href and provider and 't.me' not in href and 'moviesdrive' not in href:
+                        current_group['links'].append({'provider': provider, 'url': href})
 
     except requests.exceptions.RequestException as e:
         print(f"[MDrive Page] Scraping failed for {page_url}: {e}")
 
+    # Clean up empty groups that might have been created
     return [g for g in groups if g.get('links')]
 
+
 def rot13(s):
     result = []
     for char in s:
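For reference, the grouping heuristic added to scrape_hblinks_page can be illustrated in isolation. This is a simplified sketch (it drops the drive/cloud/instant title check and the provider-name cleanup), and the HTML below is invented rather than copied from an hblinks.dad page:

# Simplified sketch of the h3/h5 grouping pass; the markup is invented.
from bs4 import BeautifulSoup

html = """
<div class="entry-content">
  <h5>1080p x264 [2.1GB]</h5>
  <h5><a href="https://hubdrive.example/abc">HubDrive</a>
      <a href="https://gdflix.example/xyz">GDFlix</a></h5>
  <h5>720p x264 [1.1GB]</h5>
  <h5><a href="https://hubdrive.example/def">HubDrive</a></h5>
</div>
"""
content = BeautifulSoup(html, "html.parser").select_one(".entry-content")

groups, pending = [], None
for tag in content.find_all(["h3", "h5"]):
    links = [{"provider": a.get_text(strip=True) or "Download", "url": a["href"]}
             for a in tag.find_all("a", href=True)]
    if links:
        # Attach the links to the heading seen just before, or fall back to this tag's own text.
        group = pending or {"quality_title": tag.get_text(strip=True), "links": []}
        group["links"].extend(links)
        groups.append(group)
        pending = None
    elif not tag.find("a"):
        # A heading with no links becomes the title for the links that follow it.
        pending = {"quality_title": tag.get_text(strip=True), "links": []}

for g in groups:
    print(g["quality_title"], "->", [l["provider"] for l in g["links"]])
# 1080p x264 [2.1GB] -> ['HubDrive', 'GDFlix']
# 720p x264 [1.1GB] -> ['HubDrive']

scrape_mdrive_page follows the same pattern, except that a new group is opened only when an h5 header matches an episode, season, or resolution pattern (Ep\d+, Season \d+, or \d{3,4}p).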
@@ -529,14 +565,19 @@ def bypass():
     if 'hblinks.dad' in current_url:
         print(f"Secondary hblinks bypass required for: '{current_url}'")
         groups = scrape_hblinks_page(current_url)
+        if groups:
+            return jsonify({"download_groups": groups})
+        else:
+            return jsonify({"error": "Failed to scrape final links from hblinks."}), 500
 
     elif 'mdrive.today' in current_url:
         print(f"Secondary mdrive bypass required for: '{current_url}'")
         groups = scrape_mdrive_page(current_url)
+        if groups:
+            return jsonify({"download_groups": groups})
+        else:
+            return jsonify({"error": "Failed to scrape final links from mdrive."}), 500
 
     # If no secondary bypass was needed, it's the final URL
     print(f"Direct link after potential first bypass: '{current_url}'")
+    return jsonify({"final_url": current_url})
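Callers of this endpoint now have to handle two success shapes plus an error payload. A rough client-side sketch follows; the "/bypass" path and the "url" parameter are assumptions (the route decorator is not part of this diff), while the "download_groups", "final_url", and "error" keys come from the handler above.

# Sketch of client-side handling for the two response shapes returned above.
# The "/bypass" path and the "url" query parameter are assumptions; only the
# "download_groups", "final_url" and "error" keys come from the handler itself.
import requests

def resolve(api_base, link):
    resp = requests.get(f"{api_base}/bypass", params={"url": link}, timeout=30)
    data = resp.json()
    if "download_groups" in data:
        # Grouped links scraped from hblinks.dad or mdrive.today
        for group in data["download_groups"]:
            print(group["quality_title"])
            for item in group["links"]:
                print("  ", item["provider"], item["url"])
    elif "final_url" in data:
        # No secondary bypass was needed
        print("Final URL:", data["final_url"])
    else:
        # Error payload (returned with HTTP 500)
        print("Bypass failed:", data.get("error"))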