Spaces:

fromozu
/

ebook-executor

Paused

App Files Files Community

ebook-executor / hf_backend /test_final_link.py

fromozu

Upload hf_backend/test_final_link.py with huggingface_hub

1ff5a24 verified about 1 month ago

raw

history blame contribute delete

4.67 kB

	#!/usr/bin/env python3
	"""
	Test using Playwright to capture the final EPUB URL from Anna's Archive.
	Handle the navigation properly by monitoring URL changes.
	"""

	from playwright.sync_api import sync_playwright
	import time
	import re

	# Hex digest interpolated into SLOW_URL below; presumably the MD5 of the
	# book used for this test -- TODO confirm.
	MD5 = "d94c20d1364af9b484949659398c4062"
	# Slow-download page requested by main(). The meaning of the trailing
	# "/0/3" path segments is not visible in this file -- verify before changing.
	SLOW_URL = f"https://annas-archive.gl/slow_download/{MD5}/0/3"

	def get_page_content_when_ready(url, timeout=120):
	    """Load ``url`` in headless Chromium and return its HTML once DDoS-Guard clears.

	    After navigation the page title is polled once per second. The
	    interstitial is considered passed as soon as the title is no longer
	    exactly "DDoS-Guard".

	    Args:
	        url: Address to navigate to.
	        timeout: Maximum number of one-second polls to wait for the
	            "DDoS-Guard" title to disappear.

	    Returns:
	        The page HTML as a string, or None when the title is still
	        "DDoS-Guard" after ``timeout`` polls.
	    """
	    with sync_playwright() as p:
	        browser = p.chromium.launch(headless=True)
	        # try/finally guarantees the browser is closed on every exit path,
	        # including when navigation or title polling raises.
	        try:
	            context = browser.new_context(
	                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
	            )
	            page = context.new_page()

	            # Navigation timeout is in milliseconds.
	            page.goto(url, timeout=120000, wait_until="domcontentloaded")

	            # Wait for DDoS-Guard to pass.
	            for i in range(timeout):
	                time.sleep(1)
	                title = page.title()
	                if title != "DDoS-Guard":
	                    print(f"DDoS-Guard 在第 {i+1} 秒通过")
	                    print(f"页面标题: {title}")
	                    break
	            else:
	                # for/else: reached only when the loop never hit `break`,
	                # i.e. the interstitial title never went away.
	                print(f"{timeout}秒内 DDoS-Guard 未通过")
	                return None

	            # Small delay to let the real page content load.
	            time.sleep(2)

	            return page.content()
	        finally:
	            browser.close()

	def extract_epub_url(content):
	    """Return the first download-looking URL found in ``content``, or None.

	    The search runs in this order and stops at the first hit:

	    1. Quoted ``href`` / ``src`` / CSS ``url(...)`` / bare string values that
	       contain ".epub" (case-insensitive). Only the first five matches of
	       each pattern are examined and ``data:`` URIs are skipped.
	    2. The ``url=`` target of a ``<meta ... content="...">`` refresh.
	    3. ``window.location = "..."`` and then ``location.href = "..."``
	       assignments.
	    4. Any http(s) URL containing one of a fixed list of CDN keywords.

	    Args:
	        content: HTML text to scan; may be None or empty.

	    Returns:
	        The matched URL string, or None when nothing matches.
	    """
	    if not content:
	        return None

	    # Look for direct EPUB links. The "*" quantifiers are essential: they let
	    # the capture span the whole quoted value instead of exactly one
	    # character on either side of ".epub".
	    patterns = (
	        r'href=["\']([^"\']*\.epub[^"\']*)["\']',
	        r'src=["\']([^"\']*\.epub[^"\']*)["\']',
	        r'url\(["\']?([^"\')]*\.epub[^"\')]*)["\']?\)',
	        r'["\']([^"\']*\.epub[^"\']*)["\']',
	    )
	    for pattern in patterns:
	        for match in re.findall(pattern, content, re.IGNORECASE)[:5]:
	            if match and not match.startswith('data:'):
	                print(f"找到 EPUB URL (pattern): {match[:100]}")
	                return match

	    # Look for a meta refresh which might redirect to the EPUB.
	    meta_refresh = re.findall(
	        r'<meta[^>]*content=["\']?[^"\']*url=([^"\'>\s]+)',
	        content,
	        re.IGNORECASE,
	    )
	    if meta_refresh:
	        print(f"找到 meta refresh: {meta_refresh}")
	        return meta_refresh[0]

	    # Look for JavaScript redirects; whitespace around "=" is optional.
	    js_redirects = re.findall(r'window\.location\s*=\s*["\']([^"\']+)["\']', content)
	    if js_redirects:
	        print(f"找到 JS redirect: {js_redirects}")
	        return js_redirects[0]

	    js_redirects = re.findall(r'location\.href\s*=\s*["\']([^"\']+)["\']', content)
	    if js_redirects:
	        print(f"找到 location.href: {js_redirects}")
	        return js_redirects[0]

	    # Look for any URL whose text contains a common CDN keyword.
	    cdn_patterns = ['amazonaws', 'cloudfront', 'fastly', 'azure', 'googleapis', 'gstatic']
	    lowered = content.lower()  # computed once for the cheap pre-check below
	    for cdn in cdn_patterns:
	        if cdn in lowered:
	            urls = re.findall(r'https?://[^\s"\'<>\)]+' + cdn + r'[^\s"\'<>\)]*', content, re.IGNORECASE)
	            if urls:
	                print(f"找到 CDN URL ({cdn}): {urls[0][:100]}")
	                return urls[0]

	    return None

	def main():
	    """Fetch the slow-download page and report any EPUB or redirect URL in it.

	    Everything is printed to stdout: a banner, the page length, up to 20
	    ``href`` values filtered by download-related keywords, and the result
	    of ``extract_epub_url``. When no URL is found, the first 2000
	    characters of the tag-stripped ``<body>`` text are printed instead.

	    Returns:
	        None.
	    """
	    print("=" * 60)
	    print("获取 Anna's Archive 慢下载页面内容")
	    print("=" * 60)
	    print(f"URL: {SLOW_URL}\n")

	    content = get_page_content_when_ready(SLOW_URL, timeout=120)
	    if not content:
	        print("\n获取页面内容失败")
	        return

	    print(f"\n页面内容长度: {len(content)} 字符")

	    # Find all links.
	    links = re.findall(r'href=["\']([^"\']+)["\']', content)
	    print(f"\n找到 {len(links)} 个 href 链接")

	    # Show the interesting ones among the first 20 links.
	    keywords = ('epub', 'download', 'file', 'cdn', 'amazonaws', 'cloudfront')
	    for link in links[:20]:
	        lowered = link.lower()
	        if any(keyword in lowered for keyword in keywords):
	            print(f" 相关链接: {link[:100]}")

	    # Extract the EPUB URL.
	    print("\n提取 EPUB URL...")
	    epub_url = extract_epub_url(content)
	    if epub_url:
	        print("\n* 成功找到 EPUB URL *")
	        print(f"URL: {epub_url}")
	        return

	    print("\n页面中未找到 EPUB URL")
	    print("\n页面内容片段:")
	    # Print the visible body text. DOTALL lets "." cross newlines, and the
	    # lazy "(.*?)" stops at the first closing </body> tag.
	    body_match = re.search(r'<body[^>]*>(.*?)</body>', content, re.DOTALL | re.IGNORECASE)
	    if body_match:
	        body_text = re.sub(r'<[^>]+>', ' ', body_match.group(1))
	        body_text = re.sub(r'\s+', ' ', body_text).strip()
	        print(body_text[:2000])

	if __name__ == "__main__":
	    # Run the probe only when executed as a script, not when imported.
	    main()