bluesky-explorer

Sleeping

App Files Community

jccolon committed on Feb 6

Commit

5239d7e

verified ·

1 Parent(s): fa8a2ca

Update app/fetcher.py

Browse files

Files changed (1) hide show

app/fetcher.py +137 -139

app/fetcher.py CHANGED Viewed

@@ -1,139 +1,137 @@
-import math
-from datetime import datetime, timedelta, timezone
-from typing import Iterable, Optional, List
-import pandas as pd
-from atproto import models
-from app.client_manager import get_client
-def _iso_to_dt_utc_naive(iso: str) -> Optional[datetime]:
-    """Convierte ISO (posible 'Z') a datetime naive en UTC."""
-    if not iso:
-        return None
-    try:
-        iso = iso.replace("Z", "+00:00")
-        aware = datetime.fromisoformat(iso)
-        return aware.astimezone(timezone.utc).replace(tzinfo=None)
-    except Exception:
-        return None
-def _search_one_term(
-    term: str,
-    days_back: int,
-    max_posts: Optional[int],
-) -> pd.DataFrame:
-    """
-    Busca posts de un único término usando la API oficial (app.bsky.feed.search_posts).
-    Devuelve un DataFrame con columnas: texto, autor, fecha (datetime naive UTC), uri.
-    Respeta el corte por días y el límite max_posts.
-    """
-    client = get_client()
-    if client is None:
-        raise RuntimeError("No hay sesión de Bluesky.")
-    cutoff_aware = datetime.now(timezone.utc) - timedelta(days=days_back)
-    rows: List[dict] = []
-    cursor = None
-    while True:
-        remaining = None if max_posts is None else max(max_posts - len(rows), 0)
-        if remaining == 0:
-            break
-        limit = 100 if remaining is None else max(1, min(100, remaining))
-        params = models.AppBskyFeedSearchPosts.Params(q=term, limit=limit, cursor=cursor)
-        resp = client.app.bsky.feed.search_posts(params=params)
-        posts = resp.posts or []
-        if not posts:
-            break
-        # Si encontramos algún post más antiguo que el cutoff, paramos este término
-        stop_for_age = False
-        for p in posts:
-            created_raw = getattr(p.record, "created_at", "") or ""
-            # comparar con cutoff usando AWARE
-            try:
-                aware = datetime.fromisoformat(created_raw.replace("Z", "+00:00"))
-                if aware < cutoff_aware:
-                    stop_for_age = True
-                    break
-            except Exception:
-                pass
-            created_dt = _iso_to_dt_utc_naive(created_raw)
-            rows.append(
-                {
-                    "texto": getattr(p.record, "text", "") or "",
-                    "autor": getattr(p.author, "handle", "") or "",
-                    "fecha": created_dt,
-                    "uri": getattr(p, "uri", "") or "",
-                }
-            )
-            if max_posts is not None and len(rows) >= max_posts:
-                stop_for_age = True
-                break
-        if stop_for_age:
-            break
-        cursor = resp.cursor
-        if not cursor:
-            break
-    return pd.DataFrame(rows, columns=["texto", "autor", "fecha", "uri"])
-def fetch_posts(topic: str, days_back: int, user_handle: str, max_posts: Optional[int] = None) -> pd.DataFrame:
-    """
-    Búsqueda tipo AND (la API ya hace matching por 'q').
-    """
-    return _search_one_term(topic, days_back, max_posts)
-def fetch_posts_or(terms: Iterable[str], days_back: int, user_handle: str, max_posts: Optional[int] = None) -> pd.DataFrame:
-    """
-    Búsqueda OR: reparte el cupo entre términos, combina y elimina duplicados.
-    """
-    terms = [t.strip() for t in terms if t and t.strip()]
-    if not terms:
-        return pd.DataFrame(columns=["texto", "autor", "fecha", "uri"])
-    remaining = max_posts
-    frames: List[pd.DataFrame] = []
-    aportes = {}
-    for i, term in enumerate(terms):
-        limit_i = None
-        if remaining is not None:
-            # reparte lo que queda entre los que faltan (redondeo hacia arriba)
-            limit_i = math.ceil(remaining / (len(terms) - i))
-        try:
-            df_i = _search_one_term(term, days_back, limit_i)
-        except Exception as e:
-            # devolvemos vacío y que la app lo muestre como aviso
-            df_i = pd.DataFrame(columns=["texto", "autor", "fecha", "uri"])
-        aportes[term] = len(df_i)
-        frames.append(df_i)
-        if remaining is not None:
-            remaining = max(0, remaining - len(df_i))
-    df = pd.concat(frames, ignore_index=True)
-    df = df.drop_duplicates(subset=["uri", "texto", "autor"])
-    if max_posts is not None:
-        df = df.head(max_posts)
-    # guardamos “aportes” como atributo para que la UI lo muestre
-    df.attrs["aportes"] = aportes
-    return df

+import math
+from datetime import datetime, timedelta, timezone
+from typing import Iterable, Optional, List
+import pandas as pd
+from atproto import models
+from app.client_manager import get_client
+def _iso_to_dt_utc_naive(iso: str) -> Optional[datetime]:
+    """Convierte ISO (posible 'Z') a datetime naive en UTC."""
+    if not iso:
+        return None
+    try:
+        iso = iso.replace("Z", "+00:00")
+        aware = datetime.fromisoformat(iso)
+        return aware.astimezone(timezone.utc).replace(tzinfo=None)
+    except Exception:
+        return None
+def _search_one_term(
+    term: str,
+    days_back: int,
+    max_posts: Optional[int],
+) -> pd.DataFrame:
+    if not term or not term.strip():
+        return pd.DataFrame(columns=["texto", "autor", "fecha", "uri"])
+    client = get_client()
+    if client is None:
+        raise RuntimeError("No hay sesión de Bluesky.")
+    cutoff_aware = datetime.now(timezone.utc) - timedelta(days=days_back)
+    rows: List[dict] = []
+    cursor = None
+    while True:
+        remaining = None if max_posts is None else max(max_posts - len(rows), 0)
+        if remaining == 0:
+            break
+        limit = 100 if remaining is None else max(1, min(100, remaining))
+        params = models.AppBskyFeedSearchPosts.Params(q=term, limit=limit, cursor=cursor)
+        resp = client.app.bsky.feed.search_posts(params=params)
+        posts = resp.posts or []
+        if not posts:
+            break
+        # Si encontramos algún post más antiguo que el cutoff, paramos este término
+        stop_for_age = False
+        for p in posts:
+            created_raw = getattr(p.record, "created_at", "") or ""
+            # comparar con cutoff usando AWARE
+            try:
+                aware = datetime.fromisoformat(created_raw.replace("Z", "+00:00"))
+                if aware < cutoff_aware:
+                    stop_for_age = True
+                    break
+            except Exception:
+                pass
+            created_dt = _iso_to_dt_utc_naive(created_raw)
+            rows.append(
+                {
+                    "texto": getattr(p.record, "text", "") or "",
+                    "autor": getattr(p.author, "handle", "") or "",
+                    "fecha": created_dt,
+                    "uri": getattr(p, "uri", "") or "",
+                }
+            )
+            if max_posts is not None and len(rows) >= max_posts:
+                stop_for_age = True
+                break
+        if stop_for_age:
+            break
+        cursor = resp.cursor
+        if not cursor:
+            break
+    return pd.DataFrame(rows, columns=["texto", "autor", "fecha", "uri"])
+def fetch_posts(topic: str, days_back: int, user_handle: str, max_posts: Optional[int] = None) -> pd.DataFrame:
+    """
+    Búsqueda tipo AND (la API ya hace matching por 'q').
+    """
+    return _search_one_term(topic, days_back, max_posts)
+def fetch_posts_or(terms: Iterable[str], days_back: int, user_handle: str, max_posts: Optional[int] = None) -> pd.DataFrame:
+    """
+    Búsqueda OR: reparte el cupo entre términos, combina y elimina duplicados.
+    """
+    terms = [t.strip() for t in terms if t and t.strip()]
+    if not terms:
+        return pd.DataFrame(columns=["texto", "autor", "fecha", "uri"])
+    remaining = max_posts
+    frames: List[pd.DataFrame] = []
+    aportes = {}
+    for i, term in enumerate(terms):
+        limit_i = None
+        if remaining is not None:
+            # reparte lo que queda entre los que faltan (redondeo hacia arriba)
+            limit_i = math.ceil(remaining / (len(terms) - i))
+        try:
+            df_i = _search_one_term(term, days_back, limit_i)
+        except Exception as e:
+            # devolvemos vacío y que la app lo muestre como aviso
+            df_i = pd.DataFrame(columns=["texto", "autor", "fecha", "uri"])
+        aportes[term] = len(df_i)
+        frames.append(df_i)
+        if remaining is not None:
+            remaining = max(0, remaining - len(df_i))
+    df = pd.concat(frames, ignore_index=True)
+    df = df.drop_duplicates(subset=["uri", "texto", "autor"])
+    if max_posts is not None:
+        df = df.head(max_posts)
+    # guardamos “aportes” como atributo para que la UI lo muestre
+    df.attrs["aportes"] = aportes
+    return df