jccolon committed on
Commit
5239d7e
·
verified ·
1 Parent(s): fa8a2ca

Update app/fetcher.py

Browse files
Files changed (1) hide show
  1. app/fetcher.py +137 -139
app/fetcher.py CHANGED
@@ -1,139 +1,137 @@
1
- import math
2
- from datetime import datetime, timedelta, timezone
3
- from typing import Iterable, Optional, List
4
-
5
- import pandas as pd
6
- from atproto import models
7
-
8
- from app.client_manager import get_client
9
-
10
-
11
- def _iso_to_dt_utc_naive(iso: str) -> Optional[datetime]:
12
- """Convierte ISO (posible 'Z') a datetime naive en UTC."""
13
- if not iso:
14
- return None
15
- try:
16
- iso = iso.replace("Z", "+00:00")
17
- aware = datetime.fromisoformat(iso)
18
- return aware.astimezone(timezone.utc).replace(tzinfo=None)
19
- except Exception:
20
- return None
21
-
22
-
23
- def _search_one_term(
24
- term: str,
25
- days_back: int,
26
- max_posts: Optional[int],
27
- ) -> pd.DataFrame:
28
- """
29
- Busca posts de un único término usando la API oficial (app.bsky.feed.search_posts).
30
- Devuelve un DataFrame con columnas: texto, autor, fecha (datetime naive UTC), uri.
31
- Respeta el corte por días y el límite max_posts.
32
- """
33
- client = get_client()
34
- if client is None:
35
- raise RuntimeError("No hay sesión de Bluesky.")
36
-
37
- cutoff_aware = datetime.now(timezone.utc) - timedelta(days=days_back)
38
-
39
- rows: List[dict] = []
40
- cursor = None
41
-
42
- while True:
43
- remaining = None if max_posts is None else max(max_posts - len(rows), 0)
44
- if remaining == 0:
45
- break
46
- limit = 100 if remaining is None else max(1, min(100, remaining))
47
-
48
- params = models.AppBskyFeedSearchPosts.Params(q=term, limit=limit, cursor=cursor)
49
- resp = client.app.bsky.feed.search_posts(params=params)
50
-
51
- posts = resp.posts or []
52
- if not posts:
53
- break
54
-
55
- # Si encontramos algún post más antiguo que el cutoff, paramos este término
56
- stop_for_age = False
57
-
58
- for p in posts:
59
- created_raw = getattr(p.record, "created_at", "") or ""
60
- # comparar con cutoff usando AWARE
61
- try:
62
- aware = datetime.fromisoformat(created_raw.replace("Z", "+00:00"))
63
- if aware < cutoff_aware:
64
- stop_for_age = True
65
- break
66
- except Exception:
67
- pass
68
-
69
- created_dt = _iso_to_dt_utc_naive(created_raw)
70
-
71
- rows.append(
72
- {
73
- "texto": getattr(p.record, "text", "") or "",
74
- "autor": getattr(p.author, "handle", "") or "",
75
- "fecha": created_dt,
76
- "uri": getattr(p, "uri", "") or "",
77
- }
78
- )
79
-
80
- if max_posts is not None and len(rows) >= max_posts:
81
- stop_for_age = True
82
- break
83
-
84
- if stop_for_age:
85
- break
86
-
87
- cursor = resp.cursor
88
- if not cursor:
89
- break
90
-
91
- return pd.DataFrame(rows, columns=["texto", "autor", "fecha", "uri"])
92
-
93
-
94
def fetch_posts(topic: str, days_back: int, user_handle: str, max_posts: Optional[int] = None) -> pd.DataFrame:
    """Search posts matching ``topic`` as one single query.

    The whole ``topic`` string is handed to the search unchanged, so
    combining its words (AND-style matching) is left to the API.
    ``user_handle`` is accepted but not used by this function.
    """
    result = _search_one_term(term=topic, days_back=days_back, max_posts=max_posts)
    return result
99
-
100
-
101
def fetch_posts_or(terms: Iterable[str], days_back: int, user_handle: str, max_posts: Optional[int] = None) -> pd.DataFrame:
    """Search several terms independently (OR semantics) and merge the results.

    The ``max_posts`` quota is shared out across the terms: each term may use
    an equal share (rounded up) of whatever quota is still unused when its
    turn comes. Results are concatenated, de-duplicated on
    ``uri``/``texto``/``autor`` and truncated to ``max_posts``.

    Args:
        terms: Search terms; blank entries and repeated terms are ignored.
        days_back: Age limit in days, forwarded to each per-term search.
        user_handle: Accepted but not used by this function.
        max_posts: Overall row limit, or ``None`` for no limit.

    Returns:
        DataFrame with columns ``texto``, ``autor``, ``fecha``, ``uri``.
        ``df.attrs["aportes"]`` maps each term to the number of rows it
        contributed before de-duplication, and ``df.attrs["errores"]`` maps
        each term whose search raised to the error message.
    """
    columns = ["texto", "autor", "fecha", "uri"]

    # dict.fromkeys drops repeated terms while keeping first-seen order, so
    # the same query is never issued (and counted against the quota) twice.
    unique_terms = list(dict.fromkeys(t.strip() for t in (terms or ()) if t and t.strip()))

    aportes = {}
    errores = {}
    frames: List[pd.DataFrame] = []
    remaining = max_posts

    for i, term in enumerate(unique_terms):
        limit_i = None
        if remaining is not None:
            # Split what is left among the terms still pending (rounded up).
            limit_i = math.ceil(remaining / (len(unique_terms) - i))
            if limit_i <= 0:
                # Quota exhausted: no point in issuing a request.
                aportes[term] = 0
                continue

        try:
            df_i = _search_one_term(term, days_back, limit_i)
        except Exception as exc:
            # Best effort: one failing term must not abort the others, but
            # keep the message so the caller can surface it as a warning.
            errores[term] = str(exc)
            aportes[term] = 0
            continue

        aportes[term] = len(df_i)
        if not df_i.empty:
            frames.append(df_i)
        if remaining is not None:
            remaining = max(0, remaining - len(df_i))

    if frames:
        # Only non-empty frames are concatenated, which keeps column dtypes
        # stable and avoids pandas' empty-entry concat warning.
        df = pd.concat(frames, ignore_index=True)
        df = df.drop_duplicates(subset=["uri", "texto", "autor"], ignore_index=True)
        if max_posts is not None:
            df = df.head(max_posts)
    else:
        df = pd.DataFrame(columns=columns)

    # Stored as frame attributes so the UI can display them.
    df.attrs["aportes"] = aportes
    df.attrs["errores"] = errores
    return df
 
1
+ import math
2
+ from datetime import datetime, timedelta, timezone
3
+ from typing import Iterable, Optional, List
4
+
5
+ import pandas as pd
6
+ from atproto import models
7
+
8
+ from app.client_manager import get_client
9
+
10
+
11
+ def _iso_to_dt_utc_naive(iso: str) -> Optional[datetime]:
12
+ """Convierte ISO (posible 'Z') a datetime naive en UTC."""
13
+ if not iso:
14
+ return None
15
+ try:
16
+ iso = iso.replace("Z", "+00:00")
17
+ aware = datetime.fromisoformat(iso)
18
+ return aware.astimezone(timezone.utc).replace(tzinfo=None)
19
+ except Exception:
20
+ return None
21
+
22
+
23
+ def _search_one_term(
24
+ term: str,
25
+ days_back: int,
26
+ max_posts: Optional[int],
27
+ ) -> pd.DataFrame:
28
+
29
+ if not term or not term.strip():
30
+ return pd.DataFrame(columns=["texto", "autor", "fecha", "uri"])
31
+ client = get_client()
32
+ if client is None:
33
+ raise RuntimeError("No hay sesión de Bluesky.")
34
+
35
+ cutoff_aware = datetime.now(timezone.utc) - timedelta(days=days_back)
36
+
37
+ rows: List[dict] = []
38
+ cursor = None
39
+
40
+ while True:
41
+ remaining = None if max_posts is None else max(max_posts - len(rows), 0)
42
+ if remaining == 0:
43
+ break
44
+ limit = 100 if remaining is None else max(1, min(100, remaining))
45
+
46
+ params = models.AppBskyFeedSearchPosts.Params(q=term, limit=limit, cursor=cursor)
47
+ resp = client.app.bsky.feed.search_posts(params=params)
48
+
49
+ posts = resp.posts or []
50
+ if not posts:
51
+ break
52
+
53
+ # Si encontramos algún post más antiguo que el cutoff, paramos este término
54
+ stop_for_age = False
55
+
56
+ for p in posts:
57
+ created_raw = getattr(p.record, "created_at", "") or ""
58
+ # comparar con cutoff usando AWARE
59
+ try:
60
+ aware = datetime.fromisoformat(created_raw.replace("Z", "+00:00"))
61
+ if aware < cutoff_aware:
62
+ stop_for_age = True
63
+ break
64
+ except Exception:
65
+ pass
66
+
67
+ created_dt = _iso_to_dt_utc_naive(created_raw)
68
+
69
+ rows.append(
70
+ {
71
+ "texto": getattr(p.record, "text", "") or "",
72
+ "autor": getattr(p.author, "handle", "") or "",
73
+ "fecha": created_dt,
74
+ "uri": getattr(p, "uri", "") or "",
75
+ }
76
+ )
77
+
78
+ if max_posts is not None and len(rows) >= max_posts:
79
+ stop_for_age = True
80
+ break
81
+
82
+ if stop_for_age:
83
+ break
84
+
85
+ cursor = resp.cursor
86
+ if not cursor:
87
+ break
88
+
89
+ return pd.DataFrame(rows, columns=["texto", "autor", "fecha", "uri"])
90
+
91
+
92
def fetch_posts(topic: str, days_back: int, user_handle: str, max_posts: Optional[int] = None) -> pd.DataFrame:
    """Search posts matching ``topic`` as one single query.

    The whole ``topic`` string is handed to the search unchanged, so
    combining its words (AND-style matching) is left to the API.
    ``user_handle`` is accepted but not used by this function.
    """
    result = _search_one_term(term=topic, days_back=days_back, max_posts=max_posts)
    return result
97
+
98
+
99
def fetch_posts_or(terms: Iterable[str], days_back: int, user_handle: str, max_posts: Optional[int] = None) -> pd.DataFrame:
    """Search several terms independently (OR semantics) and merge the results.

    The ``max_posts`` quota is shared out across the terms: each term may use
    an equal share (rounded up) of whatever quota is still unused when its
    turn comes. Results are concatenated, de-duplicated on
    ``uri``/``texto``/``autor`` and truncated to ``max_posts``.

    Args:
        terms: Search terms; blank entries and repeated terms are ignored.
        days_back: Age limit in days, forwarded to each per-term search.
        user_handle: Accepted but not used by this function.
        max_posts: Overall row limit, or ``None`` for no limit.

    Returns:
        DataFrame with columns ``texto``, ``autor``, ``fecha``, ``uri``.
        ``df.attrs["aportes"]`` maps each term to the number of rows it
        contributed before de-duplication, and ``df.attrs["errores"]`` maps
        each term whose search raised to the error message.
    """
    columns = ["texto", "autor", "fecha", "uri"]

    # dict.fromkeys drops repeated terms while keeping first-seen order, so
    # the same query is never issued (and counted against the quota) twice.
    unique_terms = list(dict.fromkeys(t.strip() for t in (terms or ()) if t and t.strip()))

    aportes = {}
    errores = {}
    frames: List[pd.DataFrame] = []
    remaining = max_posts

    for i, term in enumerate(unique_terms):
        limit_i = None
        if remaining is not None:
            # Split what is left among the terms still pending (rounded up).
            limit_i = math.ceil(remaining / (len(unique_terms) - i))
            if limit_i <= 0:
                # Quota exhausted: no point in issuing a request.
                aportes[term] = 0
                continue

        try:
            df_i = _search_one_term(term, days_back, limit_i)
        except Exception as exc:
            # Best effort: one failing term must not abort the others, but
            # keep the message so the caller can surface it as a warning.
            errores[term] = str(exc)
            aportes[term] = 0
            continue

        aportes[term] = len(df_i)
        if not df_i.empty:
            frames.append(df_i)
        if remaining is not None:
            remaining = max(0, remaining - len(df_i))

    if frames:
        # Only non-empty frames are concatenated, which keeps column dtypes
        # stable and avoids pandas' empty-entry concat warning.
        df = pd.concat(frames, ignore_index=True)
        df = df.drop_duplicates(subset=["uri", "texto", "autor"], ignore_index=True)
        if max_posts is not None:
            df = df.head(max_posts)
    else:
        df = pd.DataFrame(columns=columns)

    # Stored as frame attributes so the UI can display them.
    df.attrs["aportes"] = aportes
    df.attrs["errores"] = errores
    return df