Spaces:
Sleeping
Sleeping
Update app/fetcher.py
Browse files- app/fetcher.py +137 -139
app/fetcher.py
CHANGED
|
@@ -1,139 +1,137 @@
|
|
| 1 |
-
import math
|
| 2 |
-
from datetime import datetime, timedelta, timezone
|
| 3 |
-
from typing import Iterable, Optional, List
|
| 4 |
-
|
| 5 |
-
import pandas as pd
|
| 6 |
-
from atproto import models
|
| 7 |
-
|
| 8 |
-
from app.client_manager import get_client
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
def _iso_to_dt_utc_naive(iso: str) -> Optional[datetime]:
    """Convert an ISO 8601 timestamp (possibly ending in 'Z') to a naive UTC datetime.

    Args:
        iso: Timestamp text such as ``2024-01-01T12:00:00Z`` or with an
            explicit ``+HH:MM`` offset.

    Returns:
        The same instant expressed in UTC with ``tzinfo`` stripped, or ``None``
        when ``iso`` is empty or cannot be parsed.
    """
    if not iso:
        return None
    try:
        # 'Z' is rewritten because older ``fromisoformat`` versions reject it.
        parsed = datetime.fromisoformat(iso.replace("Z", "+00:00"))
        if parsed.tzinfo is None:
            # No offset in the input: read it as UTC. Calling astimezone() on a
            # naive value would interpret it in the host's local timezone and
            # make the result machine-dependent.
            return parsed
        return parsed.astimezone(timezone.utc).replace(tzinfo=None)
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Non-string input, malformed text, or an instant outside datetime's range.
        return None
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
def _search_one_term(
|
| 24 |
-
term: str,
|
| 25 |
-
days_back: int,
|
| 26 |
-
max_posts: Optional[int],
|
| 27 |
-
) -> pd.DataFrame:
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
if remaining
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
"
|
| 74 |
-
"
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
)
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
"""
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
"""
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
df.attrs["aportes"] = aportes
|
| 139 |
-
return df
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
from datetime import datetime, timedelta, timezone
|
| 3 |
+
from typing import Iterable, Optional, List
|
| 4 |
+
|
| 5 |
+
import pandas as pd
|
| 6 |
+
from atproto import models
|
| 7 |
+
|
| 8 |
+
from app.client_manager import get_client
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _iso_to_dt_utc_naive(iso: str) -> Optional[datetime]:
    """Convert an ISO 8601 timestamp (possibly ending in 'Z') to a naive UTC datetime.

    Args:
        iso: Timestamp text such as ``2024-01-01T12:00:00Z`` or with an
            explicit ``+HH:MM`` offset.

    Returns:
        The same instant expressed in UTC with ``tzinfo`` removed, or ``None``
        when ``iso`` is empty or cannot be parsed.
    """
    if not iso:
        return None
    try:
        # Older ``fromisoformat`` versions do not accept the 'Z' suffix.
        moment = datetime.fromisoformat(iso.replace("Z", "+00:00"))
        if moment.tzinfo is None:
            # Offset-less input is taken as UTC already; astimezone() would
            # otherwise assume the host's local zone and shift the value.
            return moment
        return moment.astimezone(timezone.utc).replace(tzinfo=None)
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Non-string input, malformed text, or an instant outside datetime's range.
        return None
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _search_one_term(
    term: str,
    days_back: int,
    max_posts: Optional[int],
) -> pd.DataFrame:
    """Search Bluesky for posts matching one term, bounded by age and count.

    Pages through ``app.bsky.feed.search_posts`` (at most 100 posts per
    request) and collects text, author handle, creation time and URI of each
    post. Paging for this term ends at the first post older than the cutoff,
    once ``max_posts`` rows have been collected, or when the API returns no
    posts or no cursor.

    NOTE(review): stopping at the first old post presumes results arrive
    newest first -- confirm against the search endpoint's sort order.

    Args:
        term: Query string sent as ``q``; a blank term yields an empty frame.
        days_back: Size of the look-back window in days, counted from now (UTC).
        max_posts: Upper bound on rows returned, or ``None`` for no bound.

    Returns:
        DataFrame with columns ``texto``, ``autor``, ``fecha`` (the value
        produced by ``_iso_to_dt_utc_naive``) and ``uri``.

    Raises:
        RuntimeError: If ``get_client()`` returns ``None``.
    """
    columns = ["texto", "autor", "fecha", "uri"]
    if not term or not term.strip():
        return pd.DataFrame(columns=columns)

    client = get_client()
    if client is None:
        raise RuntimeError("No hay sesión de Bluesky.")

    cutoff_aware = datetime.now(timezone.utc) - timedelta(days=days_back)

    rows: List[dict] = []
    cursor = None
    finished = False

    while not finished:
        if max_posts is None:
            limit = 100
        else:
            remaining = max_posts - len(rows)
            if remaining <= 0:
                break
            # Never request more than is still needed, capped at 100 per page.
            limit = min(100, remaining)

        params = models.AppBskyFeedSearchPosts.Params(q=term, limit=limit, cursor=cursor)
        resp = client.app.bsky.feed.search_posts(params=params)

        posts = resp.posts or []
        if not posts:
            break

        for p in posts:
            created_raw = getattr(p.record, "created_at", "") or ""

            try:
                created_aware = datetime.fromisoformat(created_raw.replace("Z", "+00:00"))
                if created_aware.tzinfo is None:
                    # An offset-less timestamp cannot be compared with the aware
                    # cutoff (TypeError); read it as UTC so the age filter
                    # still applies to it.
                    created_aware = created_aware.replace(tzinfo=timezone.utc)
                too_old = created_aware < cutoff_aware
            except (AttributeError, TypeError, ValueError, OverflowError):
                # Unparseable date: best effort, keep the post with whatever
                # ``fecha`` the helper can produce.
                too_old = False

            if too_old:
                finished = True
                break

            rows.append(
                {
                    "texto": getattr(p.record, "text", "") or "",
                    "autor": getattr(p.author, "handle", "") or "",
                    "fecha": _iso_to_dt_utc_naive(created_raw),
                    "uri": getattr(p, "uri", "") or "",
                }
            )

            if max_posts is not None and len(rows) >= max_posts:
                finished = True
                break

        if not finished:
            cursor = resp.cursor
            if not cursor:
                # No cursor means there are no further pages.
                finished = True

    return pd.DataFrame(rows, columns=columns)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def fetch_posts(topic: str, days_back: int, user_handle: str, max_posts: Optional[int] = None) -> pd.DataFrame:
    """Run an AND-style search by sending ``topic`` as a single query.

    The whole string is forwarded as one ``q`` value; the API's own matching
    on ``q`` is what provides the AND behaviour. ``user_handle`` is accepted
    but not used by this function.
    """
    result = _search_one_term(term=topic, days_back=days_back, max_posts=max_posts)
    return result
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def fetch_posts_or(terms: Iterable[str], days_back: int, user_handle: str, max_posts: Optional[int] = None) -> pd.DataFrame:
    """Run an OR-style search: one query per term, merged and de-duplicated.

    The ``max_posts`` quota is shared out across terms: each term may take up
    to the ceiling of (quota still unused / terms still pending), so quota a
    term does not fill is passed on to later terms. A term whose search raises
    contributes nothing instead of aborting the whole call.

    Args:
        terms: Search terms. Blank entries are dropped and repeated terms are
            searched only once (first occurrence wins).
        days_back: Look-back window in days, forwarded to each search.
        user_handle: Accepted but not used by this function.
        max_posts: Upper bound on rows returned, or ``None`` for no bound.

    Returns:
        DataFrame with columns ``texto``, ``autor``, ``fecha``, ``uri`` and a
        fresh 0..n-1 index. ``df.attrs["aportes"]`` maps each term to the
        number of rows it contributed before de-duplication, and
        ``df.attrs["errores"]`` maps each failed term to a description of the
        exception, so the caller can show it as a warning.
    """
    columns = ["texto", "autor", "fecha", "uri"]

    # dict.fromkeys keeps first-seen order while removing repeated terms, which
    # would otherwise use up quota twice and overwrite each other in "aportes".
    cleaned = list(dict.fromkeys(t.strip() for t in terms if t and t.strip()))
    if not cleaned:
        empty = pd.DataFrame(columns=columns)
        empty.attrs["aportes"] = {}
        empty.attrs["errores"] = {}
        return empty

    remaining = max_posts
    frames: List[pd.DataFrame] = []
    aportes = {}
    errores = {}

    for i, term in enumerate(cleaned):
        limit_i = None
        if remaining is not None:
            # Share what is left among the terms still pending (rounded up).
            limit_i = math.ceil(remaining / (len(cleaned) - i))

        try:
            df_i = _search_one_term(term, days_back, limit_i)
        except Exception as exc:
            # Deliberate best effort per term: remember why it failed so the
            # caller can surface it, and go on with the remaining terms.
            errores[term] = f"{type(exc).__name__}: {exc}"
            df_i = pd.DataFrame(columns=columns)

        aportes[term] = len(df_i)
        if not df_i.empty:
            # Empty frames are left out of the concat: they add no rows and
            # pandas deprecates letting them take part in dtype inference.
            frames.append(df_i)

        if remaining is not None:
            remaining = max(0, remaining - len(df_i))

    if frames:
        df = pd.concat(frames, ignore_index=True)
        df = df.drop_duplicates(subset=["uri", "texto", "autor"]).reset_index(drop=True)
    else:
        df = pd.DataFrame(columns=columns)

    if max_posts is not None:
        df = df.head(max_posts)

    # Stored as attributes so the UI can display them.
    df.attrs["aportes"] = aportes
    df.attrs["errores"] = errores
    return df
|
|
|
|
|
|