anna-archive

Sleeping

App Files Files Community

MB-IDK committed on Apr 17

Commit

59755fa

verified ·

1 Parent(s): 1909027

Update app.py

Browse files

Files changed (1) hide show

app.py +252 -18

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ import re
 import logging
 from datetime import datetime, timedelta
 from dataclasses import dataclass, asdict
-from typing import Optional, Any
 import html
 from flask import Flask, jsonify, request, Response
@@ -27,6 +27,8 @@ class Config:
     HOST = "0.0.0.0"
     MIRRORS_URL = "https://shadowlibraries.github.io/DirectDownloads/AnnasArchive/"
     DEFAULT_BASE_URL = "https://annas-archive.gs"
     BROWSER_IMPERSONATE = "chrome110"
     CACHE_TTL_MINUTES = 10
     REQUEST_TIMEOUT = 20
@@ -84,7 +86,8 @@ class Book:
     size_mb: float
     url: str
     cover_url: Optional[str] = None
     def to_dict(self):
         return asdict(self)
@@ -120,8 +123,6 @@ def parse_size(size_str: str) -> float:
 # ============================================================================
 # MIRROR MANAGER
-# FIX: lru_cache sur une méthode d'instance cause des problèmes avec cache_clear()
-# On déplace le cache au niveau module avec une variable de classe.
 # ============================================================================
 class MirrorManager:
@@ -193,7 +194,7 @@ class MirrorManager:
 mirror_manager = MirrorManager()
 # ============================================================================
-# SCRAPER
 # ============================================================================
 def scrape_search(query: str, page: int = 1, **filters) -> dict:
@@ -238,6 +239,181 @@ def scrape_search(query: str, page: int = 1, **filters) -> dict:
         return {"books": [], "total": 0, "has_more": False, "error": str(e)}
 def scrape_recent_downloads() -> dict:
     """
     Endpoint /dyn/recent_downloads/ — retourne les 50 derniers téléchargements globaux.
@@ -263,7 +439,6 @@ def scrape_recent_downloads() -> dict:
         )
         resp.raise_for_status()
         items = resp.json()
-        # Enrichissement : on extrait le md5 depuis path
         enriched = []
         for item in items:
             md5_match = re.search(r'/md5/([a-f0-9]{32})', item.get("path", ""))
@@ -286,14 +461,15 @@ def scrape_recent_downloads() -> dict:
         return {"items": [], "total": 0, "error": str(e)}
 def parse_books(html_text: str, base_url: str) -> list[Book]:
     soup = BeautifulSoup(html_text, 'html.parser')
     books = []
     seen_md5s = set()
-    # FIX BUG 1: Le HTML réel utilise "flex  pt-3 pb-3" (double espace).
-    # 'flex pt-3 pb-3' in x échoue car c'est une recherche de sous-chaîne exacte.
-    # On vérifie chaque classe individuellement pour être robuste aux variations de whitespace.
     blocks = soup.find_all(
         'div',
         class_=lambda x: x and 'flex' in x and 'pt-3' in x and 'pb-3' in x
@@ -343,12 +519,9 @@ def parse_books(html_text: str, base_url: str) -> list[Book]:
             for link in block.find_all('a', href=re.compile(r'search\?q=')):
                 if 'company' in str(link):
                     pub_text = clean_text(link.get_text())
-                    # FIX BUG 2: Le format réel est "Publisher, Aug 20, 2016" ou "Publisher, 2016"
-                    # L'ancienne regex r'(.+),\s*(\d{4})$' ne matchait que "Publisher, 2016"
                     year_match_pub = re.search(r'(\d{4})$', pub_text)
                     if year_match_pub:
                         year = int(year_match_pub.group(1))
-                        # Retire ", Aug 20, 2016" ou ", 2016" de la fin
                         publisher = re.sub(r',\s*(?:\w+\s+\d+,\s*)?\d{4}$', '', pub_text).strip()
                     else:
                         publisher = pub_text
@@ -357,7 +530,7 @@ def parse_books(html_text: str, base_url: str) -> list[Book]:
             info_div = block.find('div', class_=re.compile(r'text-gray-800'))
             info_text = info_div.get_text() if info_div else ""
             format_match = re.search(r'·\s*([A-Z0-9]+)\s*·', info_text)
-            lang_match = re.search(r'\[([a-z]{2,4})\]', info_text)  # élargi pour "eng", "yue", etc.
             size_match = re.search(r'([\d.]+\s*[KMGT]?B)', info_text)
             year_match = re.search(r'·\s*(\d{4})\s*·', info_text)
@@ -394,13 +567,15 @@ app = Flask(__name__)
 def index():
     return jsonify({
         "name": "Anna's Archives API",
-        "version": "1.1.0",
         "description": "HF Space Edition - Free Tier Optimized",
         "browser": Config.BROWSER_IMPERSONATE,
         "endpoints": {
             "GET /": "Documentation",
             "GET /search": "Search books",
             "GET /recent": "Recent global downloads (live feed)",
             "GET /health": "Health check",
             "GET /mirrors": "List mirrors",
             "POST /cache/clear": "Clear cache"
@@ -410,10 +585,16 @@ def index():
             "filters": "/search?q=machine+learning&ext=pdf&lang=en",
             "pagination": "/search?q=python&page=2",
             "csv": "/search?q=python&format=csv",
-            "recent": "/recent"
         }
     })
 @app.route('/search')
 def search():
     query = request.args.get('q', '').strip()
@@ -452,6 +633,54 @@ def search():
     })
 @app.route('/recent')
 def recent_downloads():
     result = scrape_recent_downloads()
@@ -463,16 +692,18 @@ def health():
     try:
         mirror = mirror_manager.get_active_mirror()
         status = "healthy"
-    except:
         mirror = "unavailable"
         status = "degraded"
     return jsonify({
         "status": status,
         "mirror": mirror,
         "cache_size": cache.size(),
         "browser": Config.BROWSER_IMPERSONATE
     })
 @app.route('/mirrors')
 def mirrors():
     return jsonify({
@@ -480,18 +711,21 @@ def mirrors():
         "current": mirror_manager.get_active_mirror()
     })
 @app.route('/cache/clear', methods=['POST'])
 def clear_cache():
     cache.clear()
-    mirror_manager.reset()  # FIX: reset() remplace l'ancien cache_clear() cassé
     return jsonify({"message": "Cache cleared", "size": 0})
 if __name__ == "__main__":
     logger.info("=" * 70)
-    logger.info("🚀 Anna's Archives API - HF Space Edition v1.1.0")
     logger.info("=" * 70)
     logger.info(f"Port: {Config.PORT}")
     logger.info(f"Browser: {Config.BROWSER_IMPERSONATE}")
     logger.info("=" * 70)
     mirror_manager.get_active_mirror()
     app.run(host=Config.HOST, port=Config.PORT)

 import logging
 from datetime import datetime, timedelta
 from dataclasses import dataclass, asdict
+from typing import Optional, Any, Literal
 import html
 from flask import Flask, jsonify, request, Response
     HOST = "0.0.0.0"
     MIRRORS_URL = "https://shadowlibraries.github.io/DirectDownloads/AnnasArchive/"
     DEFAULT_BASE_URL = "https://annas-archive.gs"
+    # Welib est un miroir stable avec l'endpoint /popular — on le cible directement.
+    WELIB_BASE_URL = "https://fr.welib.org"
     BROWSER_IMPERSONATE = "chrome110"
     CACHE_TTL_MINUTES = 10
     REQUEST_TIMEOUT = 20
     size_mb: float
     url: str
     cover_url: Optional[str] = None
+    description: Optional[str] = None
     def to_dict(self):
         return asdict(self)
 # ============================================================================
 # MIRROR MANAGER
 # ============================================================================
 class MirrorManager:
 mirror_manager = MirrorManager()
 # ============================================================================
+# SCRAPER — SEARCH (Anna's Archive)
 # ============================================================================
 def scrape_search(query: str, page: int = 1, **filters) -> dict:
         return {"books": [], "total": 0, "has_more": False, "error": str(e)}
+# ============================================================================
+# SCRAPER — POPULAR (Welib /popular endpoint)
+# ============================================================================
+# Intervalles valides côté serveur welib
+PopularInterval = Literal["24h", "week", "month", "random"]
+def scrape_popular(interval: PopularInterval, offset: int = 0, limit: int = 10) -> dict:
+    """
+    Fetch GET /popular?interval=...&offset=...&limit=... on Config.WELIB_BASE_URL and parse it.
+    The response body is passed to parse_welib_books(), which expects an HTML
+    fragment made of book cards.
+    Args:
+        interval: "24h" | "week" | "month" | "random".
+        offset: forwarded unchanged as the `offset` query parameter.
+        limit: forwarded unchanged as the `limit` query parameter.
+    Returns:
+        On success, a dict with "interval", "offset", "limit", "books" (list of
+        Book dicts), "total" (number of parsed books) and "timestamp".
+        On any failure, a dict with the same "interval"/"offset"/"limit" keys,
+        an empty "books" list, "total" 0 and an "error" message. Nothing is raised.
+    """
+    # "random" must return a fresh selection on every call, so it bypasses the
+    # cache in both directions: it is neither read from nor written to it.
+    use_cache = interval != "random"
+    cache_key = f"popular_{interval}_{offset}_{limit}"
+    if use_cache:
+        cached = cache.get(cache_key)
+        if cached:
+            logger.info(f"Cache HIT: popular/{interval}")
+            return cached
+    logger.info(f"Fetching popular books: interval={interval}, offset={offset}, limit={limit}")
+    url = f"{Config.WELIB_BASE_URL}/popular"
+    params = {"interval": interval, "offset": offset, "limit": limit}
+    try:
+        resp = requests.get(
+            url,
+            params=params,
+            impersonate=Config.BROWSER_IMPERSONATE,
+            timeout=Config.REQUEST_TIMEOUT,
+            headers={
+                "Accept": "*/*",
+                "Referer": f"{Config.WELIB_BASE_URL}/",
+                "Accept-Language": "fr,fr-FR;q=0.9,en-US;q=0.8,en;q=0.7",
+            }
+        )
+        resp.raise_for_status()
+        books = parse_welib_books(resp.text)
+        result = {
+            "interval": interval,
+            "offset": offset,
+            "limit": limit,
+            "books": [b.to_dict() for b in books],
+            "total": len(books),
+            "timestamp": datetime.now().isoformat()
+        }
+        if use_cache:
+            cache.set(cache_key, result)
+        logger.info(f"Got {len(books)} popular books ({interval})")
+        return result
+    except Exception as e:
+        # Best-effort endpoint: report the failure in the payload instead of raising.
+        logger.error(f"Popular scraping error: {e}")
+        return {
+            "interval": interval,
+            "offset": offset,
+            "limit": limit,
+            "books": [],
+            "total": 0,
+            "error": str(e),
+        }
+def _welib_card_link(card) -> tuple[Optional[str], str]:
+    """Return (md5, url) taken from the card's first /md5/ link, or (None, "") when there is none."""
+    anchor = card.find("a", href=re.compile(r'/md5/'))
+    if not anchor:
+        return None, ""
+    href = anchor.get("href", "")
+    md5_match = re.search(r'/md5/([a-f0-9]{32})', href)
+    md5 = md5_match.group(1) if md5_match else None
+    # Site-relative links are made absolute against the welib base URL.
+    url = f"{Config.WELIB_BASE_URL}{href}" if href.startswith('/') else href
+    return md5, url
+def _welib_card_meta(card) -> tuple[str, str, Optional[int], float]:
+    """Return (format, language, year, size_mb) read from the <span>s inside the card's div.mb-1."""
+    fmt, language, year, size_mb = "UNKNOWN", "xx", None, 0.0
+    meta_div = card.find("div", class_="mb-1")
+    if not meta_div:
+        return fmt, language, year, size_mb
+    # Typical span texts: ["PDF", "· français", "· 2017", "· 13.6 MB"].
+    for raw in [clean_text(s.get_text()) for s in meta_div.find_all("span")]:
+        # Drop the leading "· " separator before classifying the span.
+        span = re.sub(r'^[·\s]+', '', raw).strip()
+        if not span:
+            continue
+        if re.match(r'^\d{4}$', span):
+            year = int(span)
+        elif re.search(r'[\d.]+\s*[KMGT]?B', span, re.I):
+            size_mb = parse_size(span)
+        elif re.match(r'^[A-Z0-9]{2,6}$', span):
+            fmt = span
+        else:
+            # Anything else is taken as the language ("français", "english", "deutsch", ...).
+            language = span
+    return fmt, language, year, size_mb
+def _parse_welib_card(card, seen_md5s: set[str]) -> Optional[Book]:
+    """Build a Book from one .book-card element; None for a duplicate md5 or a card with no title."""
+    md5, url = _welib_card_link(card)
+    if md5:
+        if md5 in seen_md5s:
+            return None
+        seen_md5s.add(md5)
+    # Title: the cover image's data-title attribute first, then the h2 heading.
+    img = card.find("img", attrs={"data-title": True})
+    title = clean_text(img["data-title"]) if img else ""
+    if not title:
+        h2 = card.find("h2", class_=lambda x: x and "font-semibold" in x)
+        if h2:
+            title = clean_text(h2.get_text())
+    if not title:
+        return None  # invalid card
+    # Author: the image's data-author attribute first, then the first search link.
+    author = "Unknown"
+    if img and img.get("data-author"):
+        author = clean_text(img["data-author"])
+    else:
+        author_link = card.find("a", href=re.compile(r'search\?q='))
+        if author_link:
+            author = clean_text(author_link.get_text())
+    cover_url = (img.get("src", "") or None) if img else None
+    description = None
+    desc_p = card.find("p", class_=lambda x: x and "text-gray-600" in x)
+    if desc_p:
+        # Remove any <button> (the "read more" control) so its label is not part of the text.
+        for btn in desc_p.find_all("button"):
+            btn.decompose()
+        description = clean_text(desc_p.get_text()) or None
+    fmt, language, year, size_mb = _welib_card_meta(card)
+    return Book(
+        md5=md5,
+        title=title,
+        author=author,
+        publisher="Unknown",   # not exposed in this HTML fragment
+        year=year,
+        format=fmt,
+        language=language,
+        size_mb=size_mb,
+        url=url,
+        cover_url=cover_url,
+        description=description,
+    )
+def parse_welib_books(html_text: str) -> list[Book]:
+    """
+    Parse the HTML fragment returned by the welib /popular endpoint into Book objects.
+    Card structure relied upon:
+      .book-card
+        img[data-author][data-title][src]   -> cover, author, title
+        a[href=/md5/...]                     -> md5, url
+        h2.font-semibold                     -> title (fallback)
+        a[href=/search?q=...]               -> author (fallback)
+        p.text-gray-600                     -> description
+        div.mb-1 > span                     -> format, language, year, size
+    Cards with an md5 already seen, or with no title, are skipped. A card whose
+    parsing raises is logged as a warning and skipped, so one malformed card
+    does not discard the rest.
+    """
+    soup = BeautifulSoup(html_text, "html.parser")
+    books = []
+    seen_md5s: set[str] = set()
+    for card in soup.find_all("div", class_="book-card"):
+        try:
+            book = _parse_welib_card(card, seen_md5s)
+        except Exception as e:
+            logger.warning(f"Error parsing welib book card: {e}")
+            continue
+        if book is not None:
+            books.append(book)
+    return books
+# ============================================================================
+# SCRAPER — RECENT DOWNLOADS (Anna's Archive /dyn/recent_downloads/)
+# ============================================================================
 def scrape_recent_downloads() -> dict:
     """
     Endpoint /dyn/recent_downloads/ — retourne les 50 derniers téléchargements globaux.
         )
         resp.raise_for_status()
         items = resp.json()
         enriched = []
         for item in items:
             md5_match = re.search(r'/md5/([a-f0-9]{32})', item.get("path", ""))
         return {"items": [], "total": 0, "error": str(e)}
+# ============================================================================
+# SCRAPER — SEARCH PARSER (Anna's Archive)
+# ============================================================================
 def parse_books(html_text: str, base_url: str) -> list[Book]:
     soup = BeautifulSoup(html_text, 'html.parser')
     books = []
     seen_md5s = set()
     blocks = soup.find_all(
         'div',
         class_=lambda x: x and 'flex' in x and 'pt-3' in x and 'pb-3' in x
             for link in block.find_all('a', href=re.compile(r'search\?q=')):
                 if 'company' in str(link):
                     pub_text = clean_text(link.get_text())
                     year_match_pub = re.search(r'(\d{4})$', pub_text)
                     if year_match_pub:
                         year = int(year_match_pub.group(1))
                         publisher = re.sub(r',\s*(?:\w+\s+\d+,\s*)?\d{4}$', '', pub_text).strip()
                     else:
                         publisher = pub_text
             info_div = block.find('div', class_=re.compile(r'text-gray-800'))
             info_text = info_div.get_text() if info_div else ""
             format_match = re.search(r'·\s*([A-Z0-9]+)\s*·', info_text)
+            lang_match = re.search(r'\[([a-z]{2,4})\]', info_text)
             size_match = re.search(r'([\d.]+\s*[KMGT]?B)', info_text)
             year_match = re.search(r'·\s*(\d{4})\s*·', info_text)
 def index():
     return jsonify({
         "name": "Anna's Archives API",
+        "version": "1.2.0",
         "description": "HF Space Edition - Free Tier Optimized",
         "browser": Config.BROWSER_IMPERSONATE,
         "endpoints": {
             "GET /": "Documentation",
             "GET /search": "Search books",
             "GET /recent": "Recent global downloads (live feed)",
+            "GET /popular": "Popular books by interval (24h / week / month)",
+            "GET /surprise": "Random book selection (surprenez-moi)",
             "GET /health": "Health check",
             "GET /mirrors": "List mirrors",
             "POST /cache/clear": "Clear cache"
             "filters": "/search?q=machine+learning&ext=pdf&lang=en",
             "pagination": "/search?q=python&page=2",
             "csv": "/search?q=python&format=csv",
+            "recent": "/recent",
+            "popular_day": "/popular?interval=24h",
+            "popular_week": "/popular?interval=week",
+            "popular_month": "/popular?interval=month",
+            "popular_paged": "/popular?interval=week&offset=10&limit=10",
+            "surprise": "/surprise"
         }
     })
 @app.route('/search')
 def search():
     query = request.args.get('q', '').strip()
     })
+@app.route('/popular')
+def popular():
+    """
+    Popular books for a time window, obtained through scrape_popular().
+    Query parameters:
+      interval : "24h" | "week" | "month"  (default: "week", lower-cased before validation)
+      offset   : int, negative values become 0  (default: 0)
+      limit    : int, clamped to the 1..50 range  (default: 10)
+    Responds with HTTP 400 and an "error" message when the interval is not one
+    of the accepted values or when offset/limit cannot be parsed as an integer.
+    """
+    args = request.args
+    interval = args.get('interval', 'week').lower()
+    allowed = {"24h", "week", "month"}
+    if interval not in allowed:
+        message = f"Invalid interval '{interval}'. Must be one of: {', '.join(sorted(allowed))}"
+        return jsonify({"error": message}), 400
+    try:
+        raw_offset = int(args.get('offset', 0))
+        raw_limit = int(args.get('limit', 10))
+    except ValueError:
+        return jsonify({"error": "Invalid offset or limit"}), 400
+    # Clamp after parsing: offset >= 0, 1 <= limit <= 50.
+    offset = max(0, raw_offset)
+    limit = min(50, max(1, raw_limit))
+    return jsonify(scrape_popular(interval, offset, limit))
+@app.route('/surprise')
+def surprise():
+    """
+    Random book selection ("surprise me"), obtained through scrape_popular("random", ...).
+    Query parameters:
+      limit : int, clamped to the 1..50 range  (default: 10)
+    Responds with HTTP 400 and an "error" message when limit is not an integer.
+    scrape_popular() never serves "random" from its cache, so every call
+    triggers a new upstream fetch.
+    """
+    raw_limit = request.args.get('limit', 10)
+    try:
+        limit = int(raw_limit)
+    except ValueError:
+        return jsonify({"error": "Invalid limit"}), 400
+    limit = min(50, max(1, limit))
+    payload = scrape_popular("random", offset=0, limit=limit)
+    # Label the payload for the client and attach a human-readable description.
+    payload.update(
+        interval="random",
+        description="Random book selection — surprenez-moi!",
+    )
+    return jsonify(payload)
 @app.route('/recent')
 def recent_downloads():
     result = scrape_recent_downloads()
     try:
         mirror = mirror_manager.get_active_mirror()
         status = "healthy"
+    except Exception:
         mirror = "unavailable"
         status = "degraded"
     return jsonify({
         "status": status,
         "mirror": mirror,
+        "welib": Config.WELIB_BASE_URL,
         "cache_size": cache.size(),
         "browser": Config.BROWSER_IMPERSONATE
     })
 @app.route('/mirrors')
 def mirrors():
     return jsonify({
         "current": mirror_manager.get_active_mirror()
     })
 @app.route('/cache/clear', methods=['POST'])
 def clear_cache():
+    """Clear the response cache, reset the mirror manager, and confirm with a JSON message."""
     cache.clear()
+    mirror_manager.reset()
     return jsonify({"message": "Cache cleared", "size": 0})
 if __name__ == "__main__":
+    # Script entry point: log a startup banner, then start the Flask server.
     logger.info("=" * 70)
+    logger.info("🚀 Anna's Archives API - HF Space Edition v1.2.0")
     logger.info("=" * 70)
     logger.info(f"Port: {Config.PORT}")
     logger.info(f"Browser: {Config.BROWSER_IMPERSONATE}")
+    logger.info(f"Popular source: {Config.WELIB_BASE_URL}")
     logger.info("=" * 70)
+    # Called before app.run(); presumably selects the active mirror up front — TODO confirm.
     mirror_manager.get_active_mirror()
     app.run(host=Config.HOST, port=Config.PORT)