Spaces:
Paused
Paused
Clean: remove Playwright code, keep scoring fix and fallback threshold
Browse files - hf_backend/fetcher.py (+1 -53)
hf_backend/fetcher.py
CHANGED
|
@@ -286,59 +286,7 @@ def _download_from_src_a(
|
|
| 286 |
except Exception:
|
| 287 |
continue
|
| 288 |
|
| 289 |
-
|
| 290 |
-
browser = p.chromium.launch(headless=True)
|
| 291 |
-
context = browser.new_context(
|
| 292 |
-
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
| 293 |
-
accept_downloads=True,
|
| 294 |
-
)
|
| 295 |
-
page = context.new_page()
|
| 296 |
-
|
| 297 |
-
try:
|
| 298 |
-
page.goto(url, timeout=60_000, wait_until="domcontentloaded")
|
| 299 |
-
|
| 300 |
-
# Wait for the countdown timer to finish and a download link to appear
|
| 301 |
-
for elapsed in range(wait_seconds):
|
| 302 |
-
time.sleep(1)
|
| 303 |
-
|
| 304 |
-
# Check if page navigated away (redirect to download)
|
| 305 |
-
current = page.url
|
| 306 |
-
if current != url and "slow_download" not in current and "fast_download" not in current:
|
| 307 |
-
# Direct redirect — fetch via requests using cookies from the browser
|
| 308 |
-
break
|
| 309 |
-
|
| 310 |
-
# Check for download links that appeared after countdown
|
| 311 |
-
try:
|
| 312 |
-
links = page.query_selector_all("a[href]")
|
| 313 |
-
for link in links:
|
| 314 |
-
href = link.get_attribute("href") or ""
|
| 315 |
-
text = (link.text_content() or "").strip().lower()
|
| 316 |
-
if ("get.php" in href or href.endswith(".epub") or
|
| 317 |
-
("download" in text and href and href != "#")):
|
| 318 |
-
# Try to capture download
|
| 319 |
-
try:
|
| 320 |
-
with page.expect_download(timeout=5_000) as dl_info:
|
| 321 |
-
link.click()
|
| 322 |
-
dl = dl_info.value
|
| 323 |
-
dl_path = dl.path()
|
| 324 |
-
if dl_path:
|
| 325 |
-
with open(dl_path, "rb") as f:
|
| 326 |
-
return f.read()
|
| 327 |
-
except Exception:
|
| 328 |
-
# click didn't trigger download, try fetching URL directly
|
| 329 |
-
abs_href = href if href.startswith("http") else f"https://annas-archive.gl{href}"
|
| 330 |
-
resp = context.request.get(abs_href, timeout=120_000)
|
| 331 |
-
if resp.status == 200 and len(resp.body()) > 1000:
|
| 332 |
-
return resp.body()
|
| 333 |
-
except Exception:
|
| 334 |
-
pass
|
| 335 |
-
|
| 336 |
-
except Exception:
|
| 337 |
-
pass
|
| 338 |
-
finally:
|
| 339 |
-
browser.close()
|
| 340 |
-
|
| 341 |
-
return None
|
| 342 |
|
| 343 |
|
| 344 |
def _download_from_src_b(
|
|
|
|
| 286 |
except Exception:
|
| 287 |
continue
|
| 288 |
|
| 289 |
+
raise FetchError("所有下载方式均失败(Libgen 和 Anna's Archive)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
|
| 291 |
|
| 292 |
def _download_from_src_b(
|