MB-IDK committed on
Commit
59755fa
·
verified ·
1 Parent(s): 1909027

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +252 -18
app.py CHANGED
@@ -9,7 +9,7 @@ import re
9
  import logging
10
  from datetime import datetime, timedelta
11
  from dataclasses import dataclass, asdict
12
- from typing import Optional, Any
13
  import html
14
 
15
  from flask import Flask, jsonify, request, Response
@@ -27,6 +27,8 @@ class Config:
27
  HOST = "0.0.0.0"
28
  MIRRORS_URL = "https://shadowlibraries.github.io/DirectDownloads/AnnasArchive/"
29
  DEFAULT_BASE_URL = "https://annas-archive.gs"
 
 
30
  BROWSER_IMPERSONATE = "chrome110"
31
  CACHE_TTL_MINUTES = 10
32
  REQUEST_TIMEOUT = 20
@@ -84,7 +86,8 @@ class Book:
84
  size_mb: float
85
  url: str
86
  cover_url: Optional[str] = None
87
-
 
88
  def to_dict(self):
89
  return asdict(self)
90
 
@@ -120,8 +123,6 @@ def parse_size(size_str: str) -> float:
120
 
121
  # ============================================================================
122
  # MIRROR MANAGER
123
- # FIX: lru_cache sur une méthode d'instance cause des problèmes avec cache_clear()
124
- # On déplace le cache au niveau module avec une variable de classe.
125
  # ============================================================================
126
 
127
  class MirrorManager:
@@ -193,7 +194,7 @@ class MirrorManager:
193
  mirror_manager = MirrorManager()
194
 
195
  # ============================================================================
196
- # SCRAPER
197
  # ============================================================================
198
 
199
  def scrape_search(query: str, page: int = 1, **filters) -> dict:
@@ -238,6 +239,181 @@ def scrape_search(query: str, page: int = 1, **filters) -> dict:
238
  return {"books": [], "total": 0, "has_more": False, "error": str(e)}
239
 
240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  def scrape_recent_downloads() -> dict:
242
  """
243
  Endpoint /dyn/recent_downloads/ — retourne les 50 derniers téléchargements globaux.
@@ -263,7 +439,6 @@ def scrape_recent_downloads() -> dict:
263
  )
264
  resp.raise_for_status()
265
  items = resp.json()
266
- # Enrichissement : on extrait le md5 depuis path
267
  enriched = []
268
  for item in items:
269
  md5_match = re.search(r'/md5/([a-f0-9]{32})', item.get("path", ""))
@@ -286,14 +461,15 @@ def scrape_recent_downloads() -> dict:
286
  return {"items": [], "total": 0, "error": str(e)}
287
 
288
 
 
 
 
 
289
  def parse_books(html_text: str, base_url: str) -> list[Book]:
290
  soup = BeautifulSoup(html_text, 'html.parser')
291
  books = []
292
  seen_md5s = set()
293
 
294
- # FIX BUG 1: Le HTML réel utilise "flex pt-3 pb-3" (double espace).
295
- # 'flex pt-3 pb-3' in x échoue car c'est une recherche de sous-chaîne exacte.
296
- # On vérifie chaque classe individuellement pour être robuste aux variations de whitespace.
297
  blocks = soup.find_all(
298
  'div',
299
  class_=lambda x: x and 'flex' in x and 'pt-3' in x and 'pb-3' in x
@@ -343,12 +519,9 @@ def parse_books(html_text: str, base_url: str) -> list[Book]:
343
  for link in block.find_all('a', href=re.compile(r'search\?q=')):
344
  if 'company' in str(link):
345
  pub_text = clean_text(link.get_text())
346
- # FIX BUG 2: Le format réel est "Publisher, Aug 20, 2016" ou "Publisher, 2016"
347
- # L'ancienne regex r'(.+),\s*(\d{4})$' ne matchait que "Publisher, 2016"
348
  year_match_pub = re.search(r'(\d{4})$', pub_text)
349
  if year_match_pub:
350
  year = int(year_match_pub.group(1))
351
- # Retire ", Aug 20, 2016" ou ", 2016" de la fin
352
  publisher = re.sub(r',\s*(?:\w+\s+\d+,\s*)?\d{4}$', '', pub_text).strip()
353
  else:
354
  publisher = pub_text
@@ -357,7 +530,7 @@ def parse_books(html_text: str, base_url: str) -> list[Book]:
357
  info_div = block.find('div', class_=re.compile(r'text-gray-800'))
358
  info_text = info_div.get_text() if info_div else ""
359
  format_match = re.search(r'·\s*([A-Z0-9]+)\s*·', info_text)
360
- lang_match = re.search(r'\[([a-z]{2,4})\]', info_text) # élargi pour "eng", "yue", etc.
361
  size_match = re.search(r'([\d.]+\s*[KMGT]?B)', info_text)
362
  year_match = re.search(r'·\s*(\d{4})\s*·', info_text)
363
 
@@ -394,13 +567,15 @@ app = Flask(__name__)
394
  def index():
395
  return jsonify({
396
  "name": "Anna's Archives API",
397
- "version": "1.1.0",
398
  "description": "HF Space Edition - Free Tier Optimized",
399
  "browser": Config.BROWSER_IMPERSONATE,
400
  "endpoints": {
401
  "GET /": "Documentation",
402
  "GET /search": "Search books",
403
  "GET /recent": "Recent global downloads (live feed)",
 
 
404
  "GET /health": "Health check",
405
  "GET /mirrors": "List mirrors",
406
  "POST /cache/clear": "Clear cache"
@@ -410,10 +585,16 @@ def index():
410
  "filters": "/search?q=machine+learning&ext=pdf&lang=en",
411
  "pagination": "/search?q=python&page=2",
412
  "csv": "/search?q=python&format=csv",
413
- "recent": "/recent"
 
 
 
 
 
414
  }
415
  })
416
 
 
417
  @app.route('/search')
418
  def search():
419
  query = request.args.get('q', '').strip()
@@ -452,6 +633,54 @@ def search():
452
  })
453
 
454
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
  @app.route('/recent')
456
  def recent_downloads():
457
  result = scrape_recent_downloads()
@@ -463,16 +692,18 @@ def health():
463
  try:
464
  mirror = mirror_manager.get_active_mirror()
465
  status = "healthy"
466
- except:
467
  mirror = "unavailable"
468
  status = "degraded"
469
  return jsonify({
470
  "status": status,
471
  "mirror": mirror,
 
472
  "cache_size": cache.size(),
473
  "browser": Config.BROWSER_IMPERSONATE
474
  })
475
 
 
476
  @app.route('/mirrors')
477
  def mirrors():
478
  return jsonify({
@@ -480,18 +711,21 @@ def mirrors():
480
  "current": mirror_manager.get_active_mirror()
481
  })
482
 
 
483
  @app.route('/cache/clear', methods=['POST'])
484
  def clear_cache():
485
  cache.clear()
486
- mirror_manager.reset() # FIX: reset() remplace l'ancien cache_clear() cassé
487
  return jsonify({"message": "Cache cleared", "size": 0})
488
 
 
489
  if __name__ == "__main__":
490
  logger.info("=" * 70)
491
- logger.info("🚀 Anna's Archives API - HF Space Edition v1.1.0")
492
  logger.info("=" * 70)
493
  logger.info(f"Port: {Config.PORT}")
494
  logger.info(f"Browser: {Config.BROWSER_IMPERSONATE}")
 
495
  logger.info("=" * 70)
496
  mirror_manager.get_active_mirror()
497
  app.run(host=Config.HOST, port=Config.PORT)
 
9
  import logging
10
  from datetime import datetime, timedelta
11
  from dataclasses import dataclass, asdict
12
+ from typing import Optional, Any, Literal
13
  import html
14
 
15
  from flask import Flask, jsonify, request, Response
 
27
  HOST = "0.0.0.0"
28
  MIRRORS_URL = "https://shadowlibraries.github.io/DirectDownloads/AnnasArchive/"
29
  DEFAULT_BASE_URL = "https://annas-archive.gs"
30
+ # Welib est un miroir stable avec l'endpoint /popular — on le cible directement.
31
+ WELIB_BASE_URL = "https://fr.welib.org"
32
  BROWSER_IMPERSONATE = "chrome110"
33
  CACHE_TTL_MINUTES = 10
34
  REQUEST_TIMEOUT = 20
 
86
  size_mb: float
87
  url: str
88
  cover_url: Optional[str] = None
89
+ description: Optional[str] = None
90
+
91
  def to_dict(self):
92
  return asdict(self)
93
 
 
123
 
124
  # ============================================================================
125
  # MIRROR MANAGER
 
 
126
  # ============================================================================
127
 
128
  class MirrorManager:
 
194
  mirror_manager = MirrorManager()
195
 
196
  # ============================================================================
197
+ # SCRAPER — SEARCH (Anna's Archive)
198
  # ============================================================================
199
 
200
  def scrape_search(query: str, page: int = 1, **filters) -> dict:
 
239
  return {"books": [], "total": 0, "has_more": False, "error": str(e)}
240
 
241
 
242
+ # ============================================================================
243
+ # SCRAPER — POPULAR (Welib /popular endpoint)
244
+ # ============================================================================
245
+
246
+ # Intervals accepted by the welib server
247
+ PopularInterval = Literal["24h", "week", "month", "random"]
248
+
249
def scrape_popular(interval: PopularInterval, offset: int = 0, limit: int = 10) -> dict:
    """
    Fetch one page of popular books from the welib ``/popular`` endpoint.

    Sends ``GET {Config.WELIB_BASE_URL}/popular`` with ``interval``, ``offset``
    and ``limit`` as query parameters, then hands the response body to
    ``parse_welib_books``. The endpoint is expected to answer with an HTML
    fragment containing book cards rather than a full page.

    Args:
        interval: One of "24h", "week", "month" or "random".
        offset: Forwarded unchanged as the ``offset`` query parameter.
        limit: Forwarded unchanged as the ``limit`` query parameter.

    Returns:
        On success, a dict with the keys ``interval``, ``offset``, ``limit``,
        ``books`` (list of ``Book.to_dict()`` results), ``total`` (number of
        parsed books) and ``timestamp`` (ISO-8601, naive local time).
        When the request or the parsing raises, a dict with ``interval``,
        ``offset``, ``limit``, an empty ``books`` list, ``total`` 0 and
        ``error`` (the exception text).

    Caching:
        Non-random intervals are looked up in, and stored to, the shared cache
        under ``popular_{interval}_{offset}_{limit}``. The "random" interval
        bypasses the cache in both directions, so every call returns a fresh
        selection and no never-read entries pile up in the cache.
    """
    is_random = interval == "random"
    cache_key = f"popular_{interval}_{offset}_{limit}"

    if not is_random:
        cached = cache.get(cache_key)
        if cached:
            logger.info(f"Cache HIT: popular/{interval}")
            return cached

    logger.info(f"Fetching popular books: interval={interval}, offset={offset}, limit={limit}")
    url = f"{Config.WELIB_BASE_URL}/popular"
    params = {"interval": interval, "offset": offset, "limit": limit}

    try:
        resp = requests.get(
            url,
            params=params,
            impersonate=Config.BROWSER_IMPERSONATE,
            timeout=Config.REQUEST_TIMEOUT,
            headers={
                "Accept": "*/*",
                "Referer": f"{Config.WELIB_BASE_URL}/",
                "Accept-Language": "fr,fr-FR;q=0.9,en-US;q=0.8,en;q=0.7",
            },
        )
        resp.raise_for_status()
        books = parse_welib_books(resp.text)
        result = {
            "interval": interval,
            "offset": offset,
            "limit": limit,
            "books": [b.to_dict() for b in books],
            "total": len(books),
            "timestamp": datetime.now().isoformat(),
        }
        # A random selection is never served from the cache, so storing it
        # would only waste cache slots.
        if not is_random:
            cache.set(cache_key, result)
        logger.info(f"Got {len(books)} popular books ({interval})")
        return result
    except Exception as e:
        # Best-effort scraper: report the failure in the payload instead of
        # propagating it to the HTTP layer.
        logger.error(f"Popular scraping error: {e}")
        return {
            "interval": interval,
            "offset": offset,
            "limit": limit,
            "books": [],
            "total": 0,
            "error": str(e),
        }
295
+
296
+
297
# Patterns used for every card; compiled once instead of inside the loop.
_WELIB_MD5_HREF_RE = re.compile(r'/md5/')
_WELIB_MD5_RE = re.compile(r'/md5/([a-f0-9]{32})')
_WELIB_SEARCH_HREF_RE = re.compile(r'search\?q=')
_WELIB_LEADING_SEP_RE = re.compile(r'^[·\s]+')
_WELIB_YEAR_RE = re.compile(r'^\d{4}$')
_WELIB_SIZE_RE = re.compile(r'[\d.]+\s*[KMGT]?B', re.I)
_WELIB_FORMAT_RE = re.compile(r'^[A-Z0-9]{2,6}$')


def _welib_md5_and_url(card) -> tuple:
    """Return ``(md5, url)`` from the card's ``/md5/`` link, or ``(None, "")``."""
    anchor = card.find("a", href=_WELIB_MD5_HREF_RE)
    if anchor is None:
        return None, ""
    href = anchor.get("href", "")
    md5_match = _WELIB_MD5_RE.search(href)
    if not md5_match:
        return None, ""
    # Site-relative links are made absolute against the welib base URL.
    url = f"{Config.WELIB_BASE_URL}{href}" if href.startswith('/') else href
    return md5_match.group(1), url


def _welib_metadata(card) -> tuple:
    """Return ``(format, language, year, size_mb)`` read from the card's ``div.mb-1`` spans."""
    fmt = "UNKNOWN"
    language = "xx"
    year = None
    size_mb = 0.0

    meta_div = card.find("div", class_="mb-1")
    if meta_div is None:
        return fmt, language, year, size_mb

    # Typical span texts: ["PDF", "· français", "· 2017", "· 13.6 MB"].
    for raw in meta_div.find_all("span"):
        text = _WELIB_LEADING_SEP_RE.sub('', clean_text(raw.get_text())).strip()
        if not text:
            continue
        # Order matters: a bare 4-digit token would also satisfy the format
        # pattern, so the year test comes first.
        if _WELIB_YEAR_RE.match(text):
            year = int(text)
        elif _WELIB_SIZE_RE.search(text):
            size_mb = parse_size(text)
        elif _WELIB_FORMAT_RE.match(text):
            fmt = text
        else:
            # Anything else is taken as the language label
            # ("français", "english", "deutsch", ...).
            language = text
    return fmt, language, year, size_mb


def parse_welib_books(html_text: str) -> list[Book]:
    """
    Parse the HTML fragment returned by the welib ``/popular`` endpoint.

    Card structure this parser looks for:
        div.book-card
            img[data-author][data-title][src]  -> cover, author, title
            a[href=/md5/...]                   -> md5, url
            h2 (class contains font-semibold)  -> title (fallback)
            a[href contains search?q=]         -> author (fallback)
            p (class contains text-gray-600)   -> description
            div.mb-1 > span                    -> format, language, year, size

    Args:
        html_text: Raw HTML containing zero or more ``div.book-card`` elements.

    Returns:
        One ``Book`` per card that has a title. Cards whose md5 was already
        seen are dropped. Cards without a recognizable md5 link are kept with
        ``md5=None`` and an empty ``url``. ``publisher`` is always "Unknown"
        because this parser does not read one from the card.
        A card that raises while being parsed is logged and skipped.
    """
    soup = BeautifulSoup(html_text, "html.parser")
    books = []
    seen_md5s: set[str] = set()

    for card in soup.find_all("div", class_="book-card"):
        try:
            # MD5 and URL
            md5, url = _welib_md5_and_url(card)
            if md5:
                if md5 in seen_md5s:
                    continue
                seen_md5s.add(md5)

            # Title: image attribute first, heading as fallback
            title = ""
            img = card.find("img", attrs={"data-title": True})
            if img is not None:
                title = clean_text(img["data-title"])
            if not title:
                h2 = card.find("h2", class_=lambda x: x and "font-semibold" in x)
                if h2 is not None:
                    title = clean_text(h2.get_text())
            if not title:
                continue  # invalid card

            # Author: image attribute, then search link, then "Unknown".
            # A blank attribute or blank link text falls through to the next
            # source instead of producing an empty author.
            author = ""
            if img is not None and img.get("data-author"):
                author = clean_text(img["data-author"])
            if not author:
                author_link = card.find("a", href=_WELIB_SEARCH_HREF_RE)
                if author_link is not None:
                    author = clean_text(author_link.get_text())
            if not author:
                author = "Unknown"

            # Cover URL (empty src is reported as None)
            cover_url = None
            if img is not None:
                cover_url = img.get("src", "") or None

            # Description
            description = None
            desc_p = card.find("p", class_=lambda x: x and "text-gray-600" in x)
            if desc_p is not None:
                # Drop embedded buttons (the "read more" control) so their
                # label does not end up in the description text.
                for btn in desc_p.find_all("button"):
                    btn.decompose()
                description = clean_text(desc_p.get_text()) or None

            # Metadata (format, language, year, size)
            fmt, language, year, size_mb = _welib_metadata(card)

            books.append(Book(
                md5=md5,
                title=title,
                author=author,
                publisher="Unknown",  # not read from this HTML fragment
                year=year,
                format=fmt,
                language=language,
                size_mb=size_mb,
                url=url,
                cover_url=cover_url,
                description=description,
            ))
        except Exception as e:
            # Best-effort parsing: one malformed card must not abort the page.
            logger.warning(f"Error parsing welib book card: {e}")
            continue

    return books
411
+
412
+
413
+ # ============================================================================
414
+ # SCRAPER — RECENT DOWNLOADS (Anna's Archive /dyn/recent_downloads/)
415
+ # ============================================================================
416
+
417
  def scrape_recent_downloads() -> dict:
418
  """
419
  Endpoint /dyn/recent_downloads/ — retourne les 50 derniers téléchargements globaux.
 
439
  )
440
  resp.raise_for_status()
441
  items = resp.json()
 
442
  enriched = []
443
  for item in items:
444
  md5_match = re.search(r'/md5/([a-f0-9]{32})', item.get("path", ""))
 
461
  return {"items": [], "total": 0, "error": str(e)}
462
 
463
 
464
+ # ============================================================================
465
+ # SCRAPER — SEARCH PARSER (Anna's Archive)
466
+ # ============================================================================
467
+
468
  def parse_books(html_text: str, base_url: str) -> list[Book]:
469
  soup = BeautifulSoup(html_text, 'html.parser')
470
  books = []
471
  seen_md5s = set()
472
 
 
 
 
473
  blocks = soup.find_all(
474
  'div',
475
  class_=lambda x: x and 'flex' in x and 'pt-3' in x and 'pb-3' in x
 
519
  for link in block.find_all('a', href=re.compile(r'search\?q=')):
520
  if 'company' in str(link):
521
  pub_text = clean_text(link.get_text())
 
 
522
  year_match_pub = re.search(r'(\d{4})$', pub_text)
523
  if year_match_pub:
524
  year = int(year_match_pub.group(1))
 
525
  publisher = re.sub(r',\s*(?:\w+\s+\d+,\s*)?\d{4}$', '', pub_text).strip()
526
  else:
527
  publisher = pub_text
 
530
  info_div = block.find('div', class_=re.compile(r'text-gray-800'))
531
  info_text = info_div.get_text() if info_div else ""
532
  format_match = re.search(r'·\s*([A-Z0-9]+)\s*·', info_text)
533
+ lang_match = re.search(r'\[([a-z]{2,4})\]', info_text)
534
  size_match = re.search(r'([\d.]+\s*[KMGT]?B)', info_text)
535
  year_match = re.search(r'·\s*(\d{4})\s*·', info_text)
536
 
 
567
  def index():
568
  return jsonify({
569
  "name": "Anna's Archives API",
570
+ "version": "1.2.0",
571
  "description": "HF Space Edition - Free Tier Optimized",
572
  "browser": Config.BROWSER_IMPERSONATE,
573
  "endpoints": {
574
  "GET /": "Documentation",
575
  "GET /search": "Search books",
576
  "GET /recent": "Recent global downloads (live feed)",
577
+ "GET /popular": "Popular books by interval (24h / week / month)",
578
+ "GET /surprise": "Random book selection (surprenez-moi)",
579
  "GET /health": "Health check",
580
  "GET /mirrors": "List mirrors",
581
  "POST /cache/clear": "Clear cache"
 
585
  "filters": "/search?q=machine+learning&ext=pdf&lang=en",
586
  "pagination": "/search?q=python&page=2",
587
  "csv": "/search?q=python&format=csv",
588
+ "recent": "/recent",
589
+ "popular_day": "/popular?interval=24h",
590
+ "popular_week": "/popular?interval=week",
591
+ "popular_month": "/popular?interval=month",
592
+ "popular_paged": "/popular?interval=week&offset=10&limit=10",
593
+ "surprise": "/surprise"
594
  }
595
  })
596
 
597
+
598
  @app.route('/search')
599
  def search():
600
  query = request.args.get('q', '').strip()
 
633
  })
634
 
635
 
636
@app.route('/popular')
def popular():
    """
    Popular books for a given time window.

    Query parameters:
        interval : "24h" | "week" | "month" (default: "week", case-insensitive)
        offset   : int (default: 0; negative values are raised to 0)
        limit    : int (default: 10; clamped to the range 1..50)

    Responds with HTTP 400 and an ``error`` payload when ``interval`` is not
    one of the accepted values or when ``offset``/``limit`` is not an integer.
    Otherwise returns the dict produced by ``scrape_popular`` as JSON.
    """
    args = request.args

    interval = args.get('interval', 'week').lower()
    valid_intervals = {"24h", "week", "month"}
    if interval not in valid_intervals:
        return jsonify({
            "error": f"Invalid interval '{interval}'. Must be one of: {', '.join(sorted(valid_intervals))}"
        }), 400

    # Convert first, clamp afterwards; only the conversion can fail.
    raw_offset = args.get('offset', 0)
    raw_limit = args.get('limit', 10)
    try:
        offset = int(raw_offset)
        limit = int(raw_limit)
    except ValueError:
        return jsonify({"error": "Invalid offset or limit"}), 400

    offset = max(0, offset)
    limit = min(50, max(1, limit))

    return jsonify(scrape_popular(interval, offset, limit))
661
+
662
+
663
@app.route('/surprise')
def surprise():
    """
    Random book selection ("surprise me").

    Query parameters:
        limit : int (default: 10; clamped to the range 1..50)

    Calls ``scrape_popular("random", offset=0, limit=limit)`` and returns its
    payload as JSON with an extra ``description`` field. Responds with HTTP
    400 and an ``error`` payload when ``limit`` is not an integer.
    """
    try:
        limit = min(50, max(1, int(request.args.get('limit', 10))))
    except ValueError:
        return jsonify({"error": "Invalid limit"}), 400

    result = scrape_popular("random", offset=0, limit=limit)
    # Build a new dict rather than editing `result` in place: the scraper may
    # share that object with the cache, and decorating it here would leak the
    # extra field into whatever else holds a reference to it.
    payload = {
        **result,
        "interval": "random",
        "description": "Random book selection — surprenez-moi!",
    }
    return jsonify(payload)
682
+
683
+
684
  @app.route('/recent')
685
  def recent_downloads():
686
  result = scrape_recent_downloads()
 
692
  try:
693
  mirror = mirror_manager.get_active_mirror()
694
  status = "healthy"
695
+ except Exception:
696
  mirror = "unavailable"
697
  status = "degraded"
698
  return jsonify({
699
  "status": status,
700
  "mirror": mirror,
701
+ "welib": Config.WELIB_BASE_URL,
702
  "cache_size": cache.size(),
703
  "browser": Config.BROWSER_IMPERSONATE
704
  })
705
 
706
+
707
  @app.route('/mirrors')
708
  def mirrors():
709
  return jsonify({
 
711
  "current": mirror_manager.get_active_mirror()
712
  })
713
 
714
+
715
  @app.route('/cache/clear', methods=['POST'])
716
  def clear_cache():
717
  cache.clear()
718
+ mirror_manager.reset()
719
  return jsonify({"message": "Cache cleared", "size": 0})
720
 
721
+
722
  if __name__ == "__main__":
723
  logger.info("=" * 70)
724
+ logger.info("🚀 Anna's Archives API - HF Space Edition v1.2.0")
725
  logger.info("=" * 70)
726
  logger.info(f"Port: {Config.PORT}")
727
  logger.info(f"Browser: {Config.BROWSER_IMPERSONATE}")
728
+ logger.info(f"Popular source: {Config.WELIB_BASE_URL}")
729
  logger.info("=" * 70)
730
  mirror_manager.get_active_mirror()
731
  app.run(host=Config.HOST, port=Config.PORT)