Spaces:
Paused
Paused
Revert: remove Playwright fallback
Browse files- hf_backend/fetcher.py +978 -1009
hf_backend/fetcher.py
CHANGED
|
@@ -1,1009 +1,978 @@
|
|
| 1 |
-
from __future__ import annotations
|
| 2 |
-
|
| 3 |
-
import math
|
| 4 |
-
import re
|
| 5 |
-
import time
|
| 6 |
-
import zipfile
|
| 7 |
-
from difflib import SequenceMatcher
|
| 8 |
-
from io import BytesIO
|
| 9 |
-
from pathlib import PurePosixPath
|
| 10 |
-
from typing import Any
|
| 11 |
-
from urllib.parse import parse_qsl, quote, unquote, urlencode, urljoin, urlparse, urlunparse
|
| 12 |
-
|
| 13 |
-
import requests
|
| 14 |
-
|
| 15 |
-
try:
|
| 16 |
-
import cloudscraper
|
| 17 |
-
except ImportError:
|
| 18 |
-
cloudscraper = None
|
| 19 |
-
|
| 20 |
-
try:
|
| 21 |
-
from bs4 import BeautifulSoup
|
| 22 |
-
except ImportError:
|
| 23 |
-
BeautifulSoup = None
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
"
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
"
|
| 58 |
-
"
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
scored
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
"
|
| 118 |
-
"
|
| 119 |
-
"
|
| 120 |
-
"
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
config
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
config
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
"""
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
response.
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
"
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
)
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
)
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
| 607 |
-
|
| 608 |
-
|
| 609 |
-
|
| 610 |
-
|
| 611 |
-
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
|
| 623 |
-
|
| 624 |
-
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
|
| 630 |
-
|
| 631 |
-
|
| 632 |
-
|
| 633 |
-
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
|
| 637 |
-
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
|
| 649 |
-
|
| 650 |
-
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
"
|
| 656 |
-
"
|
| 657 |
-
"filesize":
|
| 658 |
-
"year":
|
| 659 |
-
}
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
| 716 |
-
|
| 717 |
-
|
| 718 |
-
|
| 719 |
-
|
| 720 |
-
|
| 721 |
-
|
| 722 |
-
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
|
| 763 |
-
|
| 764 |
-
|
| 765 |
-
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
| 795 |
-
|
| 796 |
-
|
| 797 |
-
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
if
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
|
| 861 |
-
|
| 862 |
-
|
| 863 |
-
|
| 864 |
-
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
| 875 |
-
|
| 876 |
-
|
| 877 |
-
|
| 878 |
-
|
| 879 |
-
|
| 880 |
-
|
| 881 |
-
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
|
| 885 |
-
|
| 886 |
-
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
|
| 890 |
-
|
| 891 |
-
|
| 892 |
-
|
| 893 |
-
|
| 894 |
-
|
| 895 |
-
|
| 896 |
-
|
| 897 |
-
|
| 898 |
-
|
| 899 |
-
|
| 900 |
-
|
| 901 |
-
|
| 902 |
-
|
| 903 |
-
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
-
|
| 909 |
-
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
-
|
| 915 |
-
|
| 916 |
-
|
| 917 |
-
return
|
| 918 |
-
|
| 919 |
-
|
| 920 |
-
def
|
| 921 |
-
|
| 922 |
-
|
| 923 |
-
|
| 924 |
-
|
| 925 |
-
|
| 926 |
-
|
| 927 |
-
|
| 928 |
-
|
| 929 |
-
|
| 930 |
-
|
| 931 |
-
|
| 932 |
-
|
| 933 |
-
|
| 934 |
-
|
| 935 |
-
|
| 936 |
-
|
| 937 |
-
|
| 938 |
-
|
| 939 |
-
|
| 940 |
-
|
| 941 |
-
|
| 942 |
-
|
| 943 |
-
|
| 944 |
-
|
| 945 |
-
|
| 946 |
-
|
| 947 |
-
|
| 948 |
-
|
| 949 |
-
|
| 950 |
-
|
| 951 |
-
|
| 952 |
-
|
| 953 |
-
|
| 954 |
-
|
| 955 |
-
|
| 956 |
-
|
| 957 |
-
|
| 958 |
-
|
| 959 |
-
|
| 960 |
-
|
| 961 |
-
|
| 962 |
-
|
| 963 |
-
|
| 964 |
-
|
| 965 |
-
return ""
|
| 966 |
-
|
| 967 |
-
|
| 968 |
-
def
|
| 969 |
-
|
| 970 |
-
|
| 971 |
-
|
| 972 |
-
|
| 973 |
-
|
| 974 |
-
|
| 975 |
-
|
| 976 |
-
|
| 977 |
-
|
| 978 |
-
|
| 979 |
-
def _author_from_book_path(path: str) -> str:
|
| 980 |
-
parts = [part for part in path.strip("/").split("/") if part]
|
| 981 |
-
if len(parts) < 3:
|
| 982 |
-
return ""
|
| 983 |
-
return " ".join(part.capitalize() for part in parts[1].split("-"))
|
| 984 |
-
|
| 985 |
-
|
| 986 |
-
def _title_from_book_path(path: str) -> str:
|
| 987 |
-
parts = [part for part in path.strip("/").split("/") if part]
|
| 988 |
-
if len(parts) < 3:
|
| 989 |
-
return ""
|
| 990 |
-
return " ".join(part.capitalize() for part in parts[2].split("-"))
|
| 991 |
-
|
| 992 |
-
|
| 993 |
-
def _first_text(value: Any) -> str:
|
| 994 |
-
if isinstance(value, (list, tuple)):
|
| 995 |
-
return str(value[0]) if value else ""
|
| 996 |
-
return str(value or "")
|
| 997 |
-
|
| 998 |
-
|
| 999 |
-
def _provider_timeout(config: AppConfig) -> int:
|
| 1000 |
-
return max(5, min(int(config.fetch_timeout_seconds), 10))
|
| 1001 |
-
|
| 1002 |
-
|
| 1003 |
-
def _normalize_download_url(url: str) -> str:
|
| 1004 |
-
parsed = urlparse(url)
|
| 1005 |
-
if parsed.netloc.endswith("standardebooks.org") and "/downloads/" in parsed.path:
|
| 1006 |
-
params = dict(parse_qsl(parsed.query, keep_blank_values=True))
|
| 1007 |
-
params.setdefault("source", "download")
|
| 1008 |
-
return urlunparse(parsed._replace(query=urlencode(params)))
|
| 1009 |
-
return url
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import math
|
| 4 |
+
import re
|
| 5 |
+
import time
|
| 6 |
+
import zipfile
|
| 7 |
+
from difflib import SequenceMatcher
|
| 8 |
+
from io import BytesIO
|
| 9 |
+
from pathlib import PurePosixPath
|
| 10 |
+
from typing import Any
|
| 11 |
+
from urllib.parse import parse_qsl, quote, unquote, urlencode, urljoin, urlparse, urlunparse
|
| 12 |
+
|
| 13 |
+
import requests
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
import cloudscraper
|
| 17 |
+
except ImportError:
|
| 18 |
+
cloudscraper = None
|
| 19 |
+
|
| 20 |
+
try:
|
| 21 |
+
from bs4 import BeautifulSoup
|
| 22 |
+
except ImportError:
|
| 23 |
+
BeautifulSoup = None
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
from hf_backend.config import AppConfig
|
| 27 |
+
from hf_backend.filename_utils import normalize_source_filename
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class FetchError(RuntimeError):
    """Error raised by the fetcher; its message is passed on to the caller."""
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# Default FetchError message used when no downloadable English EPUB was found.
USER_FACING_NOT_FOUND = "未找到可用的英文 EPUB,请提供直链"

# Lower-case language identifiers that denote English.
ENGLISH_CODES = {"en", "eng", "en-us", "en-gb", "english"}
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def fetch_book_input(config: AppConfig, query: str) -> dict[str, Any]:
    """Resolve a user query (direct URL or book title) into EPUB bytes.

    A query that looks like a URL is downloaded directly. Any other query is
    sent to the ``search_src_a`` and ``search_src_b`` providers; the combined
    candidates are scored against the query and downloaded best-first until
    one succeeds.

    Returns a dict with ``filename``, ``content``, ``origin``, ``provider``,
    ``query`` and ``download_url``; title searches also include ``title`` and
    ``author``.

    Raises:
        FetchError: the query is empty, no provider returned a candidate, or
            every attempted candidate download failed. Errors from the
            direct-link download are not wrapped and propagate as raised.
    """
    text = str(query or "").strip()
    if not text:
        raise FetchError("请输入书名或 EPUB 下载链接")

    # Direct links skip the search step entirely.
    if _looks_like_url(text):
        name, payload = download_epub_from_url(config, text)
        return {
            "filename": normalize_source_filename(name, default_extension=".epub"),
            "content": payload,
            "origin": "link_fetch",
            "provider": "direct_link",
            "query": text,
            "download_url": text,
        }

    found: list[dict[str, Any]] = []
    search_failure = None

    # Only use Anna's Archive and Z-Library
    for search in (search_src_a, search_src_b):
        try:
            found.extend(search(config, text))
        except (FetchError, requests.RequestException, ValueError) as exc:
            # Remember the most recent failure; the other provider may still succeed.
            search_failure = exc

    if not found:
        if not search_failure:
            raise FetchError(USER_FACING_NOT_FOUND)
        detail = str(search_failure)
        if "src_a" in detail:
            raise FetchError("未找到可用的英文 EPUB,请尝试提供直链或使用其他书名")
        raise FetchError(f"搜索失败:{detail[:100]}")

    # Rank candidates by score, then try downloading from best to worst
    ranked = sorted(
        ((item, _score_candidate(text, item)) for item in found),
        key=lambda pair: pair[1],
        reverse=True,
    )
    best_score = ranked[0][1] if ranked else 0.0
    # A fallback candidate must reach at least this fraction of the best score.
    min_fallback_ratio = 0.6

    failure = None
    for item, item_score in ranked:
        # Don't fall back to books that are too different from the top match
        if best_score > 0.5 and item_score < best_score * min_fallback_ratio:
            break

        source = item.get("provider", "")
        try:
            name, payload = download_epub_from_url(
                config,
                item["download_url"],
                filename_hint=item.get("filename", ""),
                provider=source,
            )
            return {
                "filename": normalize_source_filename(name, default_extension=".epub"),
                "content": payload,
                "origin": "title_fetch",
                "provider": source,
                "query": text,
                "title": item.get("title", ""),
                "author": item.get("author", ""),
                "download_url": item["download_url"],
            }
        except (FetchError, requests.RequestException, ValueError) as exc:
            failure = exc

    raise FetchError(str(failure) if failure else USER_FACING_NOT_FOUND)
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def download_epub_from_url(
    config: AppConfig,
    url: str,
    *,
    filename_hint: str = "",
    provider: str = "",
) -> tuple[str, bytes]:
    """Download an EPUB from *url* and return ``(filename, content)``.

    Requests are routed to the src_a / src_b downloaders when *provider* names
    them, or when *url* starts with the configured base URL of that source and
    contains its download path marker. Everything else is fetched with a plain
    ``requests.get`` after URL normalization.

    The returned filename always ends in ".epub".

    Raises:
        FetchError: the response body is empty (validation may raise as well).
        requests.RequestException: network failure or non-2xx status.
    """

    def _served_by(base_url: str, marker: str) -> bool:
        # True when url sits under base_url (case-insensitive) and contains marker.
        if not base_url:
            return False
        lowered = url.lower()
        return lowered.startswith(base_url.lower()) and marker in lowered

    # Route to specialized downloaders based on provider hint or URL pattern
    if provider == "src_a" or _served_by(config.src_a_base_url, "/slow_download/"):
        return _download_from_src_a(config, url, filename_hint)

    if provider == "src_b" or _served_by(config.src_b_base_url, "/dl/"):
        return _download_from_src_b(config, url, filename_hint)

    response = requests.get(
        _normalize_download_url(url),
        headers={"user-agent": config.fetch_user_agent},
        timeout=config.fetch_timeout_seconds,
        allow_redirects=True,
    )
    response.raise_for_status()

    payload = response.content
    if not payload:
        raise FetchError("下载结果为空")

    # Prefer the post-redirect URL when deriving a filename.
    name = _derive_filename(response, response.url or url, filename_hint)
    _validate_epub_bytes(
        payload,
        filename=name,
        content_type=response.headers.get("content-type", ""),
    )

    if name.lower().endswith(".epub"):
        return name, payload
    return f"{name}.epub", payload
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def _download_from_src_a(
    config: AppConfig,
    url: str,
    filename_hint: str,
) -> tuple[str, bytes]:
    """
    Download an EPUB from Anna's Archive given a ``slow_download`` URL.

    The md5 is extracted from ``url`` (format: .../slow_download/{md5}/0/3)
    and the following sources are tried in order:

    1. Libgen (unprotected); skipped when BeautifulSoup is not installed.
    2. Anna's Archive ``fast_download`` servers 0-5 via cloudscraper.
    3. Anna's Archive ``slow_download`` servers 0-3 via cloudscraper, with a
       180 second timeout because the page may impose a wait.

    Steps 2 and 3 are skipped when cloudscraper is not installed.

    Returns:
        ``(filename, content)`` where the filename always ends in ".epub".

    Raises:
        FetchError: no md5 could be extracted from ``url``, or every source
            failed to deliver a valid EPUB.
    """
    md5_match = re.search(r"/slow_download/([a-f0-9]+)", url)
    if not md5_match:
        raise FetchError("无法从 URL 提取 md5")
    md5 = md5_match.group(1)

    browser_user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    )
    # Most recent failure, attached as the cause of the final FetchError.
    last_error: Exception | None = None

    def _finalize(response: Any, fallback_url: str) -> tuple[str, bytes]:
        # Validate the payload and make sure the filename carries ".epub".
        filename = _derive_filename(response, response.url or fallback_url, filename_hint)
        _validate_epub_bytes(
            response.content,
            filename=filename,
            content_type=response.headers.get("content-type", ""),
        )
        if not filename.lower().endswith(".epub"):
            filename = f"{filename}.epub"
        return filename, response.content

    # --- Attempt 1: Libgen (fast, unprotected) ---
    if BeautifulSoup is not None:
        libgen_headers = {"user-agent": browser_user_agent}
        try:
            ads_response = requests.get(
                f"https://libgen.li/ads.php?md5={md5}",
                headers=libgen_headers,
                timeout=15,
            )
            ads_response.raise_for_status()
            ads_soup = BeautifulSoup(ads_response.text, "lxml")
            get_link = ads_soup.find("a", string=re.compile("GET"))
            get_href = get_link.get("href", "") if get_link else ""
            if get_href:
                download_url = f"https://libgen.li/{get_href.lstrip('/')}"
                response = requests.get(
                    download_url,
                    headers=libgen_headers,
                    timeout=config.fetch_timeout_seconds,
                    allow_redirects=True,
                )
                response.raise_for_status()
                if response.content:
                    return _finalize(response, download_url)
        except Exception as exc:
            # Deliberate best-effort: any Libgen failure falls through to Anna's Archive.
            last_error = exc

    # --- Attempts 2 and 3: Anna's Archive fast_download, then slow_download ---
    if cloudscraper is not None:
        base_url = config.src_a_base_url.rstrip("/")
        mirror_plans = (
            ("fast_download", 6, config.fetch_timeout_seconds),
            ("slow_download", 4, 180),
        )
        for endpoint, server_count, timeout in mirror_plans:
            for server_id in range(server_count):
                mirror_url = f"{base_url}/{endpoint}/{md5}/0/{server_id}"
                try:
                    scraper = cloudscraper.create_scraper(
                        browser={"browser": "chrome", "platform": "windows", "mobile": False},
                        delay=10,
                    )
                    response = scraper.get(
                        mirror_url,
                        headers={"user-agent": browser_user_agent, "accept": "*/*"},
                        timeout=timeout,
                        allow_redirects=True,
                    )
                    # Small bodies are not accepted as a book; move on to the next server.
                    if response.status_code == 200 and len(response.content) > 1000:
                        return _finalize(response, mirror_url)
                except Exception as exc:
                    # Includes FetchError from validation: try the next server.
                    last_error = exc

    # Every source failed. Raise the error type callers already handle instead
    # of falling into a browser-automation path this module does not import.
    raise FetchError(USER_FACING_NOT_FOUND) from last_error
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
def _download_from_src_b(
    config: AppConfig,
    url: str,
    filename_hint: str,
) -> tuple[str, bytes]:
    """
    Download an EPUB from a src_b download URL.

    cloudscraper is tried first when it is installed. A FetchError from that
    attempt (HTTP 403, empty body, or one raised by validation) is re-raised
    as-is; any other exception is swallowed and the download is retried
    with plain ``requests.get``.

    Returns:
        ``(filename, content)`` where the filename always ends in ".epub".

    Raises:
        FetchError: HTTP 403 or an empty response body.
        requests.RequestException: the plain-requests fallback failed.
    """

    def _unpack(response: Any) -> tuple[str, bytes]:
        # Shared handling for both transports: status checks, validation, naming.
        if response.status_code == 403:
            raise FetchError("下载被阻止,请手动下载")
        response.raise_for_status()
        payload = response.content
        if not payload:
            raise FetchError("下载结果为空")
        name = _derive_filename(response, response.url or url, filename_hint)
        _validate_epub_bytes(
            payload,
            filename=name,
            content_type=response.headers.get("content-type", ""),
        )
        if name.lower().endswith(".epub"):
            return name, payload
        return f"{name}.epub", payload

    if cloudscraper is not None:
        try:
            scraper = cloudscraper.create_scraper(
                browser={"browser": "chrome", "platform": "windows", "mobile": False},
                delay=10,
            )
            browser_headers = {
                "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
                "accept": "*/*",
            }
            return _unpack(
                scraper.get(
                    url,
                    headers=browser_headers,
                    timeout=120,
                    allow_redirects=True,
                )
            )
        except FetchError:
            raise
        except Exception:
            pass  # Fall through to requests

    # Fallback to plain requests
    return _unpack(
        requests.get(
            url,
            headers={"user-agent": config.fetch_user_agent},
            timeout=config.fetch_timeout_seconds,
            allow_redirects=True,
        )
    )
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
def pick_best_candidate(query: str, candidates: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Return the candidate that scores highest against *query*, or None.

    Candidates scoring below 0.45 are ignored. On ties the candidate that
    appears first in *candidates* wins.
    """
    qualifying = [
        pair
        for pair in ((_score_candidate(query, item), item) for item in candidates)
        if pair[0] >= 0.45
    ]
    if not qualifying:
        return None
    # max() returns the first maximal element, matching a stable descending sort.
    return max(qualifying, key=lambda pair: pair[0])[1]
|
| 416 |
+
|
| 417 |
+
|
| 418 |
+
def search_standard_ebooks(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search Standard Ebooks for *query* and return download candidates.

    The search page is scanned for ``/ebooks/<a>/<b>`` links; the detail pages
    of at most the first six are fetched, and each page that exposes a
    download link contributes one candidate dict (``provider``, ``title``,
    ``author``, ``language``, ``download_url``, ``filename``).

    Raises:
        requests.RequestException: a search or detail request failed.
    """
    search_url = config.standard_ebooks_search_url
    request_headers = {"user-agent": config.fetch_user_agent}
    timeout = _provider_timeout(config)

    listing = requests.get(
        search_url,
        params={"query": query},
        headers=request_headers,
        timeout=timeout,
    )
    listing.raise_for_status()

    results: list[dict[str, Any]] = []
    book_paths = _unique_matches(r'href="(/ebooks/[^"/]+/[^"/]+)"', listing.text)
    for book_path in book_paths[:6]:
        detail = requests.get(
            urljoin(search_url, book_path),
            headers=request_headers,
            timeout=timeout,
        )
        detail.raise_for_status()

        epub_path = _pick_standard_ebooks_download(detail.text)
        if epub_path:
            results.append(
                {
                    "provider": "standard_ebooks",
                    # Fall back to the URL slug when the page has no usable title.
                    "title": _extract_html_title(detail.text) or _title_from_book_path(book_path),
                    "author": _author_from_book_path(book_path),
                    "language": "en",
                    "download_url": _normalize_download_url(urljoin(search_url, epub_path)),
                    "filename": PurePosixPath(epub_path).name,
                }
            )
    return results
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
def search_project_gutenberg(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search Project Gutenberg for ``query`` and collect EPUB candidates.

    Book ids are taken from ``/ebooks/<digits>`` links on the search page.
    The first five distinct ids are opened on ``www.gutenberg.org`` and each
    detail page exposing an EPUB link yields one candidate dict.  An HTTP
    error status on any request propagates through ``raise_for_status``.
    """
    listing = requests.get(
        config.project_gutenberg_search_url,
        params={"query": query},
        headers={"user-agent": config.fetch_user_agent},
        timeout=_provider_timeout(config),
    )
    listing.raise_for_status()

    found: list[dict[str, Any]] = []
    for book_id in _unique_matches(r'href="/ebooks/(\d+)"', listing.text)[:5]:
        page_url = f"https://www.gutenberg.org/ebooks/{book_id}"
        page = requests.get(
            page_url,
            headers={"user-agent": config.fetch_user_agent},
            timeout=_provider_timeout(config),
        )
        page.raise_for_status()
        page_html = page.text

        epub_href = _pick_gutenberg_epub(page_html)
        if not epub_href:
            continue

        # The href may carry a query string, so only its path names the file.
        epub_name = PurePosixPath(urlparse(epub_href).path).name
        found.append(
            {
                "provider": "project_gutenberg",
                "title": _extract_html_title(page_html) or f"Project Gutenberg {book_id}",
                "author": "",
                "language": "en",
                "download_url": urljoin(page_url, epub_href),
                "filename": epub_name or f"pg{book_id}.epub",
            }
        )
    return found
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
def search_internet_archive(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search the Internet Archive for English EPUB texts titled like ``query``.

    The advanced-search endpoint is asked for up to eight text items ordered
    by download count.  Items that are not English, do not list an ``epub``
    format, or have no identifier are skipped.  For the rest the per-item
    metadata document is fetched to find a concrete ``.epub`` file name.
    An HTTP error status on any request propagates through
    ``raise_for_status``.
    """
    search_params = {
        "q": f"title:({query}) AND mediatype:(texts) AND (language:(english) OR language:(eng) OR language:(en))",
        "fl[]": ["identifier", "title", "creator", "language", "downloads", "format"],
        "sort[]": "downloads desc",
        "rows": 8,
        "page": 1,
        "output": "json",
    }
    listing = requests.get(
        config.internet_archive_advancedsearch_url,
        params=search_params,
        headers={"user-agent": config.fetch_user_agent},
        timeout=_provider_timeout(config),
    )
    listing.raise_for_status()
    documents = listing.json().get("response", {}).get("docs", [])

    found: list[dict[str, Any]] = []
    for doc in documents:
        if not _is_english(doc.get("language")):
            continue

        # "format" may arrive as a single string or as a list of strings.
        declared = doc.get("format") or []
        if isinstance(declared, str):
            declared = [declared]
        if not any(str(fmt).strip().lower() == "epub" for fmt in declared):
            continue

        identifier = str(doc.get("identifier") or "").strip()
        if not identifier:
            continue

        details = requests.get(
            config.internet_archive_metadata_url_template.format(identifier=identifier),
            headers={"user-agent": config.fetch_user_agent},
            timeout=_provider_timeout(config),
        )
        details.raise_for_status()
        epub_name = _pick_archive_epub_filename(details.json())
        if not epub_name:
            continue

        found.append(
            {
                "provider": "internet_archive",
                "title": str(doc.get("title") or ""),
                "author": _first_text(doc.get("creator")),
                "language": _first_text(doc.get("language")),
                "downloads": int(doc.get("downloads") or 0),
                "download_url": f"https://archive.org/download/{identifier}/{quote(epub_name)}",
                "filename": epub_name,
            }
        )
    return found
|
| 544 |
+
|
| 545 |
+
|
| 546 |
+
def search_src_a(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search src_a for EPUB files matching ``query``.

    The result page is fetched with cloudscraper (the original author's note
    says this is needed to get past DDoS-Guard) and parsed with
    BeautifulSoup.  Every ``/md5/`` link whose text is empty or ``Save`` and
    whose enclosing ``div`` mentions an ``.epub`` file becomes one candidate;
    each md5 is reported at most once.

    Returns:
        Candidate dicts.  The list is empty when src_a is not configured or
        when cloudscraper / BeautifulSoup are not installed.

    Raises:
        FetchError: the page body was shorter than 500 characters, or any
            other error occurred while fetching or parsing.  In the latter
            case the original exception is attached as ``__cause__``.
    """
    if not config.src_a_search_url or not config.src_a_base_url:
        return []

    if cloudscraper is None or BeautifulSoup is None:
        return []

    # A trailing slash on the configured base would otherwise produce
    # ".../​/slow_download/..." in the download URL built further down.
    base_url = str(config.src_a_base_url).rstrip("/")
    search_url = config.src_a_search_url
    candidates: list[dict[str, Any]] = []

    try:
        scraper = cloudscraper.create_scraper(
            browser={"browser": "chrome", "platform": "windows", "mobile": False},
            delay=10,
        )
        headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "accept-language": "en-US,en;q=0.9",
        }
        response = scraper.get(
            f"{search_url}?q={quote(query)}",
            headers=headers,
            timeout=60,
        )
        response.raise_for_status()

        html = response.text
        if not html or len(html) < 500:
            raise FetchError("搜索页面加载失败")

        soup = BeautifulSoup(html, "lxml")

        # Links containing /md5/ are the book entry links.
        md5_links = soup.find_all("a", href=lambda h: h and "/md5/" in h)

        seen_md5: set[str] = set()
        for link in md5_links:
            link_text = (link.get_text() or "").strip()
            # Only links with no text, or the text "Save", are considered.
            if link_text and link_text != "Save":
                continue

            href = link.get("href", "")
            md5_match = re.search(r"/md5/([a-f0-9]+)", href)
            if not md5_match:
                continue
            md5 = md5_match.group(1)
            if md5 in seen_md5:
                continue

            # The nearest enclosing <div> (or the direct parent) supplies the
            # text from which file name, metadata, title and author are read.
            container = link.find_parent("div") or link.parent
            if not container:
                continue

            container_text = container.get_text(separator="\n") or ""

            epub_match = re.search(r"([\w./-]+\.epub)", container_text, re.IGNORECASE)
            if not epub_match:
                continue

            epub_path = epub_match.group(1)
            # Marked as seen only once an EPUB was found, so a later link for
            # the same md5 still gets a chance if this container had none.
            seen_md5.add(md5)

            # Metadata line shape: English [en] · EPUB · 1.2MB · 2020
            meta_match = re.search(
                r"English\s*\[([^\]]+)\]\s*[·•]\s*([A-Z]+)\s*[·•]\s*([\d.]+(?:MB|GB))\s*[·•]\s*(\d{4})",
                container_text,
            )
            filesize = meta_match.group(3) if meta_match else ""
            year = meta_match.group(4) if meta_match else ""

            # Title is the first line that is not a file name, a bare number,
            # or a line starting with "English", "Save" or a digit; author is
            # the next such line if it is shorter than 100 characters.
            title = ""
            author = ""
            for raw_line in container_text.split("\n"):
                line = raw_line.strip()
                if not line:
                    continue
                if ".epub" in line.lower():
                    continue
                if re.match(r"^[\d.,]+$", line):
                    continue
                if re.match(r"^(English|Save|\d+)", line):
                    continue
                if not title:
                    title = line[:200]
                elif not author and len(line) < 100:
                    author = line
                    break

            candidates.append({
                "provider": "src_a",
                "title": title,
                "author": author,
                "language": "en",
                "download_url": f"{base_url}/slow_download/{md5}/0/3",
                "filename": epub_path.rsplit("/", 1)[-1],
                "filesize": filesize,
                "year": year,
            })

    except FetchError:
        raise
    except Exception as exc:
        # Chain the cause so the underlying failure stays visible in logs.
        raise FetchError(f"搜索失败: {str(exc)[:100]}") from exc

    return candidates
|
| 667 |
+
|
| 668 |
+
|
| 669 |
+
def search_src_b(config: AppConfig, query: str) -> list[dict[str, Any]]:
    """Search src_b for EPUB files matching ``query``, with domain fallback.

    The configured base URL is tried first, followed by two hard-coded
    alternative Z-Library domains (skipping any that equals the primary).
    The first domain whose search completes supplies the result.

    Returns:
        Candidate dicts; an empty list when src_b is not configured or when
        cloudscraper / BeautifulSoup are not installed.

    Raises:
        The error from the last domain tried, when every domain failed with
        ``FetchError``, ``requests.RequestException`` or ``ValueError``.
    """
    if not config.src_b_base_url:
        return []

    if cloudscraper is None or BeautifulSoup is None:
        return []

    primary = config.src_b_base_url.rstrip("/")
    domains = [primary]
    for fallback in ("https://z-lib.is", "https://z-library.se"):
        if fallback.rstrip("/") != primary:
            domains.append(fallback)

    failure = None
    for domain in domains:
        try:
            return _search_src_b_at_domain(domain, query)
        except (FetchError, requests.RequestException, ValueError) as exc:
            # Remember the failure and move on to the next domain.
            failure = exc

    if failure:
        raise failure
    return []
|
| 700 |
+
|
| 701 |
+
|
| 702 |
+
def _search_src_b_at_domain(base: str, query: str) -> list[dict[str, Any]]:
    """Search a single Z-Library domain for EPUB files.

    The ``/s/<query>`` page on ``base`` is fetched and every distinct
    ``/book/`` link with more than three characters of text is collected.
    The first fifteen of those book pages are then opened; a page whose text
    mentions "epub" yields one candidate.  A failure on an individual book
    page is ignored and the loop moves on.

    Raises:
        FetchError: the search page body was shorter than 200 characters, or
            it contained no ``/book/`` links.
    """
    session = cloudscraper.create_scraper(
        browser={"browser": "chrome", "platform": "windows", "mobile": False},
        delay=10,
    )
    request_headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "accept-language": "en-US,en;q=0.9",
    }

    listing = session.get(f"{base}/s/{quote(query)}", headers=request_headers, timeout=60)
    listing.raise_for_status()

    listing_html = listing.text
    if not listing_html or len(listing_html) < 200:
        raise FetchError("搜索页面加载失败")

    anchors = BeautifulSoup(listing_html, "lxml").find_all(
        "a", href=lambda h: h and "/book/" in h
    )
    if not anchors:
        raise FetchError("未找到任何书籍")

    # Keep the first occurrence of every href; a href whose first anchor has
    # too little text is dropped for good because it is already marked seen.
    visited: set[str] = set()
    entries: list[tuple[str, str]] = []
    for anchor in anchors:
        target = anchor.get("href", "")
        if not target or target in visited:
            continue
        visited.add(target)
        label = (anchor.get_text() or "").strip()
        if len(label) > 3:
            entries.append((target, label))

    results: list[dict[str, Any]] = []
    for target, label in entries[:15]:
        page_url = target if target.startswith("http") else f"{base}{target}"

        try:
            page = session.get(page_url, headers=request_headers, timeout=30)
            page.raise_for_status()
            page_html = page.text
            page_text = BeautifulSoup(page_html, "lxml").get_text(separator="\n") or ""

            if "epub" not in page_text.lower():
                continue

            author_hit = re.search(r"Author[s]?[:\s]*([^\n]+)", page_text, re.IGNORECASE)
            author = author_hit.group(1).strip()[:100] if author_hit else ""

            # Use the /dl/ link when the page has one, else the page itself.
            dl_hit = re.search(r"/dl/([a-zA-Z0-9]+)", page_html)
            dl_id = dl_hit.group(1) if dl_hit else ""
            download_url = f"{base}/dl/{dl_id}" if dl_id else page_url

            title = label.split("\n")[0].strip()
            if not title:
                title = label[:100]

            results.append({
                "provider": "src_b",
                "title": title,
                "author": author,
                "language": "en",
                "download_url": download_url,
                "filename": "",
                "filesize": "",
            })

        except Exception:
            # Best effort: one unreadable book page must not sink the search.
            continue

    return results
|
| 791 |
+
|
| 792 |
+
|
| 793 |
+
def _validate_epub_bytes(content: bytes, *, filename: str, content_type: str) -> None:
    """Raise ``FetchError`` unless ``content`` is an EPUB container.

    A payload is accepted when it starts with the ZIP magic ``PK``, opens as
    a ZIP archive, and its ``mimetype`` entry reads ``application/epub+zip``.

    Args:
        content: Raw downloaded bytes.
        filename: Accepted for call-site compatibility; it never influenced
            the outcome and is not inspected.
        content_type: Accepted for call-site compatibility; it never
            influenced the outcome and is not inspected.

    Raises:
        FetchError: the payload fails any of the checks above.
    """
    if len(content) < 4 or not content.startswith(b"PK"):
        raise FetchError("下载内容不是 EPUB")

    try:
        with zipfile.ZipFile(BytesIO(content)) as archive:
            mimetype = archive.read("mimetype").decode("utf-8", errors="replace").strip()
    except (KeyError, zipfile.BadZipFile) as exc:
        # KeyError: no "mimetype" entry; BadZipFile: not a readable archive.
        raise FetchError("下载内容不是 EPUB") from exc

    if mimetype != "application/epub+zip":
        raise FetchError("下载内容不是 EPUB")
|
| 811 |
+
|
| 812 |
+
|
| 813 |
+
def _derive_filename(response: requests.Response, url: str, filename_hint: str) -> str:
    """Choose a file name for a downloaded book.

    Sources are consulted in order: the caller's ``filename_hint``, the
    response's ``content-disposition`` header, then the last path segment of
    ``url``.  The chosen name is percent-decoded, reduced to its final path
    component and passed through ``normalize_source_filename`` with ``.epub``
    as the default extension.  ``downloaded_book.epub`` is returned when none
    of the sources provides a name.
    """

    def finalize(raw_name: str) -> str:
        return normalize_source_filename(
            PurePosixPath(unquote(raw_name)).name,
            default_extension=".epub",
        )

    explicit = str(filename_hint or "").strip()
    if explicit:
        return finalize(explicit)

    header_value = response.headers.get("content-disposition", "")
    found = re.search(r'filename\*?=(?:UTF-8\'\')?"?([^";]+)"?', header_value, flags=re.IGNORECASE)
    if found:
        return finalize(found.group(1).strip())

    url_name = PurePosixPath(unquote(urlparse(url).path)).name
    if not url_name:
        return "downloaded_book.epub"
    return normalize_source_filename(url_name, default_extension=".epub")
|
| 831 |
+
|
| 832 |
+
|
| 833 |
+
def _score_candidate(query: str, candidate: dict[str, Any]) -> float:
    """Rate how well ``candidate`` matches ``query``; higher is better.

    ``-1.0`` is returned for candidates without a download URL, with a
    non-English language, or when the normalised query or title has no words.
    Otherwise the rating is 70 % word-set Jaccard overlap plus 30 % character
    sequence similarity of the normalised strings, adjusted by:

    * +1.0 when query and title are identical after normalisation,
    * +0.3 when every query word appears in the title,
    * -0.3 when no word is shared,

    plus small bonuses for the provider, an ``.epub`` file name or URL, and
    the download count (capped at 0.05).
    """
    if not candidate.get("download_url") or not _is_english(candidate.get("language")):
        return -1.0

    wanted = _normalize_text(query)
    offered = _normalize_text(candidate.get("title", ""))
    wanted_words = set(wanted.split())
    offered_words = set(offered.split())
    if not (wanted_words and offered_words):
        return -1.0

    shared = wanted_words & offered_words
    union = wanted_words | offered_words
    overlap = len(shared) / len(union) if union else 0
    similarity = SequenceMatcher(None, wanted, offered).ratio()

    # Word overlap dominates; sequence similarity rewards matching word order.
    title_score = overlap * 0.7 + similarity * 0.3
    if wanted == offered:
        title_score += 1.0
    elif wanted_words.issubset(offered_words):
        title_score += 0.3
    elif not shared:
        title_score -= 0.3

    provider_bonus = {"src_a": 0.1, "src_b": 0.08}.get(candidate.get("provider"), 0.0)

    name = str(candidate.get("filename") or "").lower()
    link = str(candidate.get("download_url") or "").lower()
    epub_bonus = 0.05 if (name.endswith(".epub") or link.endswith(".epub")) else 0.0

    download_count = max(int(candidate.get("downloads") or 0), 0)
    if download_count:
        downloads_bonus = min(math.log10(download_count + 1) / 20, 0.05)
    else:
        downloads_bonus = 0.0

    return title_score + provider_bonus + epub_bonus + downloads_bonus
|
| 882 |
+
|
| 883 |
+
|
| 884 |
+
def _looks_like_url(value: str) -> bool:
    """Return ``True`` when ``value`` parses as an http(s) URL with a host."""
    pieces = urlparse(value)
    if pieces.scheme not in {"http", "https"}:
        return False
    return bool(pieces.netloc)
|
| 887 |
+
|
| 888 |
+
|
| 889 |
+
def _normalize_text(value: str) -> str:
    """Lower-case ``value`` and reduce it to space-separated ``[a-z0-9]`` words.

    Every run of other characters (punctuation, whitespace, non-ASCII
    letters) becomes a single space; leading and trailing spaces are
    removed.  Falsy input yields an empty string.
    """
    text = str(value or "").lower()
    # Each run of non-alphanumerics collapses to exactly one space, so only
    # the two ends can still carry a stray space.
    return re.sub(r"[^a-z0-9]+", " ", text).strip()
|
| 892 |
+
|
| 893 |
+
|
| 894 |
+
def _is_english(value: Any) -> bool:
    """Return ``True`` when ``value`` denotes English.

    A list, tuple or set counts as English when any member does (checked
    recursively).  Any other value is converted to text, normalised with
    ``_normalize_text`` and looked up in ``ENGLISH_CODES``.
    """
    if not isinstance(value, (list, tuple, set)):
        return _normalize_text(str(value or "")) in ENGLISH_CODES
    for member in value:
        if _is_english(member):
            return True
    return False
|
| 899 |
+
|
| 900 |
+
|
| 901 |
+
def _unique_matches(pattern: str, text: str) -> list[str]:
    """Return the distinct, non-empty matches of ``pattern`` in ``text``.

    Matches are stripped of surrounding whitespace and reported in order of
    first appearance.  ``pattern`` must contain at most one capturing group,
    because each ``re.findall`` result is treated as a string.
    """
    stripped = (match.strip() for match in re.findall(pattern, text))
    # dict keys keep insertion order, which gives an order-preserving dedup
    # without rescanning a result list for every match.
    return list(dict.fromkeys(value for value in stripped if value))
|
| 908 |
+
|
| 909 |
+
|
| 910 |
+
def _pick_standard_ebooks_download(html: str) -> str:
    """Pick an EPUB download path from a Standard Ebooks detail page.

    The first ``/ebooks/.../downloads/....epub`` link that is neither a
    ``.kepub.epub`` nor an ``_advanced.epub`` build is preferred.  If every
    link is one of those builds the first link is returned; with no links at
    all the result is an empty string.
    """
    found = _unique_matches(r'href="(/ebooks/[^"]+/downloads/[^"]+?\.epub)"', html)
    if not found:
        return ""
    plain_builds = (
        href
        for href in found
        if ".kepub.epub" not in href.lower() and "_advanced.epub" not in href.lower()
    )
    return next(plain_builds, found[0])
|
| 918 |
+
|
| 919 |
+
|
| 920 |
+
def _pick_gutenberg_epub(html: str) -> str:
    """Return the first EPUB link found on a Project Gutenberg detail page.

    Links ending in ``.epub`` or ``.epub.images`` are collected; the first
    whose lower-cased form ends with ``.epub`` or contains ``.epub.`` is
    returned.  An empty string means the page offers no such link.
    """
    hrefs = _unique_matches(r'href="([^"]+\.epub(?:\.images)?)"', html)
    return next(
        (
            href
            for href in hrefs
            if href.lower().endswith(".epub") or ".epub." in href.lower()
        ),
        "",
    )
|
| 927 |
+
|
| 928 |
+
|
| 929 |
+
def _pick_archive_epub_filename(metadata: dict[str, Any]) -> str:
    """Return the name of the first ``.epub`` file listed in ``metadata``.

    ``metadata["files"]`` is read as a list of dicts with a ``name`` key; a
    missing or falsy list is treated as empty.  The extension test ignores
    case.  An empty string is returned when no file qualifies.
    """
    names = (str(entry.get("name") or "") for entry in metadata.get("files", []) or [])
    return next((name for name in names if name.lower().endswith(".epub")), "")
|
| 935 |
+
|
| 936 |
+
|
| 937 |
+
def _extract_html_title(html: str) -> str:
    """Extract a clean book title from a page's ``<title>`` element.

    HTML character references are decoded (``&amp;`` becomes ``&``),
    whitespace runs are collapsed, a trailing ``- Standard Ebooks...`` or
    ``| Project Gutenberg...`` site suffix is removed, and a trailing
    ``, by <author>`` clause is dropped.  An empty string is returned when
    the page has no ``<title>`` element.
    """
    # Imported under its own name because the parameter shadows the module.
    from html import unescape

    title_match = re.search(r"<title>\s*([^<]+?)\s*</title>", html, flags=re.IGNORECASE)
    if not title_match:
        return ""

    # Decode first so an entity such as &nbsp; is collapsed with the rest of
    # the whitespace instead of surviving as a literal "&nbsp;".
    title = re.sub(r"\s+", " ", unescape(title_match.group(1))).strip()
    title = re.sub(r"\s*[-|]\s*Standard Ebooks.*$", "", title, flags=re.IGNORECASE)
    title = re.sub(r"\s*[-|]\s*Project Gutenberg.*$", "", title, flags=re.IGNORECASE)
    title = re.sub(r",\s*by\s+.+$", "", title, flags=re.IGNORECASE)
    return title
|
| 946 |
+
|
| 947 |
+
|
| 948 |
+
def _author_from_book_path(path: str) -> str:
    """Turn the second segment of a book path into a display name.

    ``/ebooks/jane-austen/emma`` gives ``Jane Austen``: the segment is split
    on hyphens and each piece is capitalised.  Paths with fewer than three
    non-empty segments give an empty string.
    """
    segments = list(filter(None, path.strip("/").split("/")))
    if len(segments) >= 3:
        return " ".join(map(str.capitalize, segments[1].split("-")))
    return ""
|
| 953 |
+
|
| 954 |
+
|
| 955 |
+
def _title_from_book_path(path: str) -> str:
    """Turn the third segment of a book path into a display title.

    ``/ebooks/jane-austen/pride-and-prejudice`` gives ``Pride And
    Prejudice``: the segment is split on hyphens and each piece is
    capitalised.  Paths with fewer than three non-empty segments give an
    empty string.
    """
    segments = list(filter(None, path.strip("/").split("/")))
    if len(segments) >= 3:
        return " ".join(map(str.capitalize, segments[2].split("-")))
    return ""
|
| 960 |
+
|
| 961 |
+
|
| 962 |
+
def _first_text(value: Any) -> str:
    """Return ``value`` as text, using only the first item of a list or tuple.

    An empty list or tuple, ``None`` and any other falsy value give an empty
    string.
    """
    if not isinstance(value, (list, tuple)):
        return str(value or "")
    if not value:
        return ""
    return str(value[0])
|
| 966 |
+
|
| 967 |
+
|
| 968 |
+
def _provider_timeout(config: AppConfig) -> int:
    """Return ``config.fetch_timeout_seconds`` as an int clamped to 5..10."""
    requested = int(config.fetch_timeout_seconds)
    if requested < 5:
        return 5
    if requested > 10:
        return 10
    return requested
|
| 970 |
+
|
| 971 |
+
|
| 972 |
+
def _normalize_download_url(url: str) -> str:
    """Ensure Standard Ebooks download links carry a ``source`` parameter.

    For a URL whose host ends with ``standardebooks.org`` and whose path
    contains ``/downloads/``, ``source=download`` is added unless a
    ``source`` parameter is already present, and the query string is
    re-encoded.  Every other URL is returned unchanged.
    """
    pieces = urlparse(url)
    is_download_link = (
        pieces.netloc.endswith("standardebooks.org") and "/downloads/" in pieces.path
    )
    if not is_download_link:
        return url

    query = dict(parse_qsl(pieces.query, keep_blank_values=True))
    if "source" not in query:
        query["source"] = "download"
    return urlunparse(pieces._replace(query=urlencode(query)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|