Spaces:
Sleeping
Sleeping
Commit ·
5571520
1
Parent(s): f58cab6
Project Uploaded
Browse files
final5.py
CHANGED
|
@@ -55,10 +55,9 @@ def build_gmail_service():
|
|
| 55 |
print(f"[GMAIL] Service account authentication failed in final5.py: {e}")
|
| 56 |
return None
|
| 57 |
|
|
|
|
| 58 |
def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
|
| 59 |
-
if not service:
|
| 60 |
-
print("[GMAIL] service not available; skipping email")
|
| 61 |
-
return 0
|
| 62 |
from email.message import EmailMessage
|
| 63 |
sent = 0
|
| 64 |
for to in to_list:
|
|
@@ -71,8 +70,6 @@ def send_html_email(service, sender: str, to_list: List[str], subject: str, html
|
|
| 71 |
raw = base64.urlsafe_b64encode(msg.as_bytes()).decode("utf-8")
|
| 72 |
service.users().messages().send(userId="me", body={"raw": raw}).execute()
|
| 73 |
sent += 1
|
| 74 |
-
except HttpError as e:
|
| 75 |
-
print(f"[GMAIL] http error to {to}: {e}")
|
| 76 |
except Exception as e:
|
| 77 |
print(f"[GMAIL] send error to {to}: {e}")
|
| 78 |
return sent
|
|
@@ -123,55 +120,30 @@ class GeminiManager:
|
|
| 123 |
else:
|
| 124 |
raise e
|
| 125 |
|
| 126 |
-
def parse_retry_seconds_from_error(err: Exception) -> int:
|
| 127 |
-
s = str(err)
|
| 128 |
-
m1 = re.search(r"retry[_ ]delay\s*\{\s*seconds:\s*(\d+)", s, re.IGNORECASE)
|
| 129 |
-
if m1: return int(m1.group(1))
|
| 130 |
-
m2 = re.search(r'"retryDelay"\s*:\s*"(\d+)s"', s)
|
| 131 |
-
if m2: return int(m2.group(1))
|
| 132 |
-
return 45
|
| 133 |
-
|
| 134 |
def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str,Any]:
|
| 135 |
fallback = {
|
| 136 |
-
"is_medical_seeking": False,
|
| 137 |
-
"confidence": "low",
|
| 138 |
"medical_summary": "Not a medical request (AI unavailable/throttled)",
|
| 139 |
-
"suggested_services": [],
|
| 140 |
-
"
|
| 141 |
-
"analysis": "Keyword-based fallback",
|
| 142 |
-
"reasoning": "short explanation",
|
| 143 |
-
"matched_keywords": found_keywords
|
| 144 |
}
|
| 145 |
-
if not gemini_manager or not gemini_manager.is_available():
|
| 146 |
-
return fallback
|
| 147 |
keywords_str = ", ".join(found_keywords) if found_keywords else "none"
|
| 148 |
prompt = f"""
|
| 149 |
Analyze this social post and decide if the author is genuinely seeking medical help, doctor/hospital recommendations, or healthcare services for PERSONAL HEALTH NEEDS (not business, donations, or casual mentions).
|
| 150 |
KEYWORDS FOUND IN POST: {keywords_str}
|
| 151 |
CRITICAL RULES:
|
| 152 |
1. ONLY flag posts where someone is seeking medical care for themselves or a loved one
|
| 153 |
-
2. IGNORE posts about:
|
| 154 |
-
- Business services (e.g., "Looking for a doctor for my clinic")
|
| 155 |
-
- Donations or fundraising (e.g., "Raising money for surgery")
|
| 156 |
-
- Selling medical products
|
| 157 |
-
- Job postings for medical professionals
|
| 158 |
-
- General health information sharing
|
| 159 |
-
- Research or academic inquiries
|
| 160 |
3. ONLY flag if it's a PERSONAL HEALTH NEED
|
| 161 |
Post: "{post_text}"
|
| 162 |
Return ONLY JSON:
|
| 163 |
{{
|
| 164 |
-
"is_medical_seeking": true/false,
|
| 165 |
-
"
|
| 166 |
-
"
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
"analysis": "why it's seeking help",
|
| 170 |
-
"reasoning": "short explanation",
|
| 171 |
-
"matched_keywords": ["keyword1", "keyword2"]
|
| 172 |
-
}}
|
| 173 |
-
"""
|
| 174 |
-
for attempt in range(1, 5):
|
| 175 |
try:
|
| 176 |
resp = gemini_manager.generate_content(prompt)
|
| 177 |
txt = (resp.text or "").strip()
|
|
@@ -179,36 +151,26 @@ Return ONLY JSON:
|
|
| 179 |
if s >= 0 and e > s:
|
| 180 |
result = json.loads(txt[s:e])
|
| 181 |
result["is_medical_seeking"] = bool(result.get("is_medical_seeking", False))
|
| 182 |
-
if "matched_keywords" not in result:
|
| 183 |
-
result["matched_keywords"] = found_keywords
|
| 184 |
return result
|
| 185 |
return fallback
|
| 186 |
-
except ResourceExhausted
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
time.sleep(wait_s)
|
| 190 |
-
if gemini_manager.is_available():
|
| 191 |
-
continue
|
| 192 |
-
else:
|
| 193 |
-
return fallback
|
| 194 |
except Exception as e:
|
| 195 |
print(f"[GEMINI] error: {e}")
|
| 196 |
gemini_manager.rotate_key()
|
| 197 |
-
if not gemini_manager.is_available():
|
| 198 |
-
return fallback
|
| 199 |
return fallback
|
| 200 |
|
| 201 |
MEDICAL_KEYWORDS = [
|
| 202 |
-
"doctor","physician","primary care","healthcare","medical","clinic","hospital",
|
| 203 |
-
"
|
| 204 |
-
"
|
| 205 |
-
"
|
| 206 |
-
"
|
| 207 |
-
"
|
| 208 |
-
"
|
| 209 |
-
"where to go","doctor recommendation",
|
| 210 |
-
"pregnancy","birth control","contraception","fertility",
|
| 211 |
-
"hillside","medical group","wellness center"
|
| 212 |
]
|
| 213 |
|
| 214 |
def contains_keywords(text: str) -> Tuple[bool, List[str]]:
|
|
@@ -216,67 +178,55 @@ def contains_keywords(text: str) -> Tuple[bool, List[str]]:
|
|
| 216 |
hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
|
| 217 |
return (len(hits) > 0, hits)
|
| 218 |
|
| 219 |
-
# --- FIX:
|
| 220 |
def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
|
| 221 |
options = webdriver.ChromeOptions()
|
| 222 |
|
|
|
|
| 223 |
cache_path = os.path.join(WRITABLE_DIR, "selenium")
|
| 224 |
os.makedirs(cache_path, exist_ok=True)
|
| 225 |
os.environ["SE_CACHE_PATH"] = cache_path
|
| 226 |
-
|
| 227 |
user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
|
| 228 |
-
options.add_argument(f"--user-data-dir={user_data_dir}")
|
| 229 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
options.add_argument("--disable-notifications")
|
| 231 |
-
options.add_argument("--disable-web-security")
|
| 232 |
-
options.add_argument("--disable-features=IsolateOrigins,site-per-process")
|
| 233 |
-
options.add_argument("--disable-blink-features=AutomationControlled")
|
| 234 |
-
options.add_experimental_option("useAutomationExtension", False)
|
| 235 |
-
options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
| 236 |
options.add_argument("--window-size=1920,1080")
|
| 237 |
-
options.add_argument("--lang=en-US,en")
|
| 238 |
-
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36")
|
| 239 |
-
if headless:
|
| 240 |
-
options.add_argument("--headless=new")
|
| 241 |
-
options.add_argument("--disable-gpu")
|
| 242 |
-
options.add_argument("--disable-dev-shm-usage")
|
| 243 |
-
options.add_argument("--no-sandbox")
|
| 244 |
-
options.add_argument("--disable-extensions")
|
| 245 |
-
options.add_argument("--disable-plugins")
|
| 246 |
-
options.add_argument("--disable-images")
|
| 247 |
|
| 248 |
driver = webdriver.Chrome(options=options)
|
| 249 |
-
|
| 250 |
-
try:
|
| 251 |
-
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
|
| 252 |
-
"source": "Object.defineProperty(navigator, 'webdriver', { get: () => undefined });"
|
| 253 |
-
})
|
| 254 |
-
except Exception:
|
| 255 |
-
pass
|
| 256 |
-
|
| 257 |
return driver, user_data_dir
|
| 258 |
|
|
|
|
| 259 |
def load_cookies(driver, cookies_file: str):
|
| 260 |
-
print("[FB]
|
| 261 |
driver.get("https://www.facebook.com")
|
| 262 |
-
time.sleep(
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
|
| 281 |
def wait_group_feed(driver, wait):
|
| 282 |
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
|
|
@@ -290,93 +240,68 @@ def wait_group_feed(driver, wait):
|
|
| 290 |
try:
|
| 291 |
driver.find_element(By.XPATH, "//div[@role='article']")
|
| 292 |
feed_loaded = True; break
|
| 293 |
-
except NoSuchElementException:
|
| 294 |
-
pass
|
| 295 |
time.sleep(1)
|
| 296 |
if not feed_loaded:
|
| 297 |
-
raise TimeoutException("Timed out waiting for group feed")
|
| 298 |
|
| 299 |
def find_message_nodes(driver):
|
| 300 |
-
|
| 301 |
-
if nodes: return nodes
|
| 302 |
-
nodes = driver.find_elements(By.XPATH, "//div[@data-ad-comet-preview='message']")
|
| 303 |
-
if nodes: return nodes
|
| 304 |
-
return driver.find_elements(By.XPATH, "//div[@role='article']//div[@dir='auto' and string-length(normalize-space())>0]")
|
| 305 |
|
| 306 |
def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
|
| 307 |
print(f"[SCRAPE] Navigating to group: {group_url}")
|
| 308 |
driver.get(group_url)
|
| 309 |
wait_group_feed(driver, wait)
|
| 310 |
-
posts, seen
|
| 311 |
-
total = 0
|
| 312 |
for s in range(max_scrolls):
|
| 313 |
print(f"[SCRAPE] --- Scroll {s+1}/{max_scrolls} ---")
|
| 314 |
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
| 315 |
-
try:
|
| 316 |
-
wait.until(lambda d: d.execute_script("return document.readyState") == "complete")
|
| 317 |
-
except Exception:
|
| 318 |
-
pass
|
| 319 |
time.sleep(pause)
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
print(f"[SCRAPE] find error: {e}")
|
| 325 |
-
continue
|
| 326 |
-
added = 0
|
| 327 |
-
for i, d in enumerate(divs):
|
| 328 |
-
try:
|
| 329 |
-
rect = (d.rect.get('x'), d.rect.get('y'), d.rect.get('width'), d.rect.get('height'))
|
| 330 |
-
if rect in rects: continue
|
| 331 |
-
rects.add(rect)
|
| 332 |
-
except Exception:
|
| 333 |
-
pass
|
| 334 |
try:
|
| 335 |
txt = (d.text or "").strip()
|
| 336 |
-
if len(txt) <
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
|
|
|
|
|
|
|
|
|
| 342 |
except StaleElementReferenceException:
|
| 343 |
continue
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
wc = len(re.findall(r"\b\w+\b", txt))
|
| 347 |
-
if wc > 7 and not any(j in txt for j in ["LikeCommentShare","Write a comment","View more comments"]):
|
| 348 |
-
seen.add(txt)
|
| 349 |
-
total += 1
|
| 350 |
-
posts.append({"id": total, "text": txt, "group_link": group_url})
|
| 351 |
-
added += 1
|
| 352 |
-
print(f"[SCRAPE] New posts this scroll: {added}")
|
| 353 |
-
print(f"[SCRAPE] Total unique posts: {total}")
|
| 354 |
return posts
|
| 355 |
|
| 356 |
-
# --- FIX:
|
| 357 |
def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
|
| 358 |
driver = None
|
| 359 |
user_data_dir = None
|
| 360 |
posts = []
|
| 361 |
try:
|
| 362 |
driver, user_data_dir = new_driver(headless=True)
|
| 363 |
-
wait = WebDriverWait(driver,
|
| 364 |
load_cookies(driver, cookies_file)
|
| 365 |
posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
|
| 366 |
except Exception as e:
|
| 367 |
-
print(f"[SCRAPE]
|
|
|
|
|
|
|
| 368 |
finally:
|
| 369 |
if driver:
|
| 370 |
-
try:
|
| 371 |
-
|
| 372 |
-
except Exception as e:
|
| 373 |
-
print(f"Error during driver.quit(): {e}")
|
| 374 |
if user_data_dir and os.path.exists(user_data_dir):
|
| 375 |
try:
|
| 376 |
shutil.rmtree(user_data_dir, ignore_errors=True)
|
| 377 |
-
print(f"Cleaned up user data directory: {user_data_dir}")
|
| 378 |
except Exception as e:
|
| 379 |
-
print(f"Error cleaning up
|
| 380 |
return posts
|
| 381 |
|
| 382 |
def main():
|
|
@@ -384,29 +309,15 @@ def main():
|
|
| 384 |
os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
|
| 385 |
os.makedirs(os.path.dirname(args.analysis_out) or ".", exist_ok=True)
|
| 386 |
|
| 387 |
-
gemini_keys = []
|
| 388 |
-
|
| 389 |
-
gemini_keys = [k.strip() for k in args.gemini_keys.split(",") if k.strip()]
|
| 390 |
-
else:
|
| 391 |
-
for i in range(1, 6):
|
| 392 |
-
key = os.environ.get(f"GEMINI_API_KEY_{i}")
|
| 393 |
-
if key:
|
| 394 |
-
gemini_keys.append(key)
|
| 395 |
-
gemini_manager = GeminiManager(gemini_keys) if gemini_keys else None
|
| 396 |
|
| 397 |
-
# This is not used to send mail, just to confirm auth is possible.
|
| 398 |
-
_ = build_gmail_service()
|
| 399 |
-
|
| 400 |
-
# Call the modified function which now returns only posts.
|
| 401 |
posts = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
|
| 402 |
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
print(f"::SCRAPE_SAVED::{args.out}")
|
| 408 |
-
except Exception as e:
|
| 409 |
-
print(f"[SCRAPE] Error saving posts: {e}")
|
| 410 |
|
| 411 |
keyword_hits, confirmed = [], []
|
| 412 |
for p in posts:
|
|
@@ -416,8 +327,7 @@ def main():
|
|
| 416 |
keyword_hits.append(p)
|
| 417 |
print(f"::KW_HIT::{json.dumps({'id': p['id'], 'found_keywords': hits}, ensure_ascii=False)}")
|
| 418 |
|
| 419 |
-
per_call_sleep =
|
| 420 |
-
analyzed_posts = []
|
| 421 |
for idx, p in enumerate(keyword_hits, start=1):
|
| 422 |
found_kws = p.get("found_keywords", [])
|
| 423 |
ai = ai_medical_intent(gemini_manager, p.get("text",""), found_kws)
|
|
@@ -425,32 +335,24 @@ def main():
|
|
| 425 |
print(f"::AI_RESULT::{json.dumps({'id': p['id'], 'ai': ai}, ensure_ascii=False)}")
|
| 426 |
if ai.get("is_medical_seeking"):
|
| 427 |
confirmed.append(p)
|
| 428 |
-
analyzed_posts.append(p)
|
| 429 |
if idx < len(keyword_hits):
|
| 430 |
time.sleep(per_call_sleep)
|
| 431 |
|
| 432 |
report = {
|
| 433 |
-
"analysis_date": datetime.now().isoformat(),
|
| 434 |
-
"
|
| 435 |
-
"
|
| 436 |
-
"keyword_hits": len(keyword_hits),
|
| 437 |
-
"confirmed_medical": len(confirmed),
|
| 438 |
-
"emails_sent": 0,
|
| 439 |
-
"posts": confirmed
|
| 440 |
}
|
| 441 |
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
print(f"::ANALYSIS_SAVED::{args.analysis_out}")
|
| 447 |
-
except Exception as e:
|
| 448 |
-
print(f"[ANALYSIS] Error saving analysis: {e}")
|
| 449 |
|
| 450 |
if __name__ == "__main__":
|
| 451 |
try:
|
| 452 |
main()
|
| 453 |
-
except Exception
|
| 454 |
-
|
| 455 |
-
print(
|
| 456 |
-
|
|
|
|
| 55 |
print(f"[GMAIL] Service account authentication failed in final5.py: {e}")
|
| 56 |
return None
|
| 57 |
|
| 58 |
+
# The send_html_email function is not used by main() but is kept for modularity
|
| 59 |
def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
|
| 60 |
+
if not service: return 0
|
|
|
|
|
|
|
| 61 |
from email.message import EmailMessage
|
| 62 |
sent = 0
|
| 63 |
for to in to_list:
|
|
|
|
| 70 |
raw = base64.urlsafe_b64encode(msg.as_bytes()).decode("utf-8")
|
| 71 |
service.users().messages().send(userId="me", body={"raw": raw}).execute()
|
| 72 |
sent += 1
|
|
|
|
|
|
|
| 73 |
except Exception as e:
|
| 74 |
print(f"[GMAIL] send error to {to}: {e}")
|
| 75 |
return sent
|
|
|
|
| 120 |
else:
|
| 121 |
raise e
|
| 122 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str,Any]:
|
| 124 |
fallback = {
|
| 125 |
+
"is_medical_seeking": False, "confidence": "low",
|
|
|
|
| 126 |
"medical_summary": "Not a medical request (AI unavailable/throttled)",
|
| 127 |
+
"suggested_services": [], "urgency_level": "low", "analysis": "Keyword-based fallback",
|
| 128 |
+
"reasoning": "short explanation", "matched_keywords": found_keywords
|
|
|
|
|
|
|
|
|
|
| 129 |
}
|
| 130 |
+
if not gemini_manager or not gemini_manager.is_available(): return fallback
|
|
|
|
| 131 |
keywords_str = ", ".join(found_keywords) if found_keywords else "none"
|
| 132 |
prompt = f"""
|
| 133 |
Analyze this social post and decide if the author is genuinely seeking medical help, doctor/hospital recommendations, or healthcare services for PERSONAL HEALTH NEEDS (not business, donations, or casual mentions).
|
| 134 |
KEYWORDS FOUND IN POST: {keywords_str}
|
| 135 |
CRITICAL RULES:
|
| 136 |
1. ONLY flag posts where someone is seeking medical care for themselves or a loved one
|
| 137 |
+
2. IGNORE posts about: business services, donations, selling products, job postings, general info sharing, or academic inquiries.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
3. ONLY flag if it's a PERSONAL HEALTH NEED
|
| 139 |
Post: "{post_text}"
|
| 140 |
Return ONLY JSON:
|
| 141 |
{{
|
| 142 |
+
"is_medical_seeking": true/false, "confidence": "high/medium/low", "medical_summary": "short summary",
|
| 143 |
+
"suggested_services": ["service1","service2"], "urgency_level": "high/medium/low",
|
| 144 |
+
"analysis": "why it's seeking help", "reasoning": "short explanation", "matched_keywords": ["keyword1", "keyword2"]
|
| 145 |
+
}}"""
|
| 146 |
+
for _ in range(1, 5):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
try:
|
| 148 |
resp = gemini_manager.generate_content(prompt)
|
| 149 |
txt = (resp.text or "").strip()
|
|
|
|
| 151 |
if s >= 0 and e > s:
|
| 152 |
result = json.loads(txt[s:e])
|
| 153 |
result["is_medical_seeking"] = bool(result.get("is_medical_seeking", False))
|
| 154 |
+
if "matched_keywords" not in result: result["matched_keywords"] = found_keywords
|
|
|
|
| 155 |
return result
|
| 156 |
return fallback
|
| 157 |
+
except ResourceExhausted:
|
| 158 |
+
gemini_manager.rotate_key()
|
| 159 |
+
if not gemini_manager.is_available(): return fallback
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
except Exception as e:
|
| 161 |
print(f"[GEMINI] error: {e}")
|
| 162 |
gemini_manager.rotate_key()
|
| 163 |
+
if not gemini_manager.is_available(): return fallback
|
|
|
|
| 164 |
return fallback
|
| 165 |
|
| 166 |
MEDICAL_KEYWORDS = [
|
| 167 |
+
"doctor","physician","primary care","healthcare","medical","clinic","hospital","urgent care",
|
| 168 |
+
"emergency","er","specialist","pediatrician","dentist","gynecologist","obgyn","women's health",
|
| 169 |
+
"health center","family doctor","maternity","prenatal","postnatal","labor","delivery",
|
| 170 |
+
"need doctor","looking for doctor","find doctor","recommend doctor","medical help","health help",
|
| 171 |
+
"appointment","checkup","treatment","prescription","medicine","surgery","best hospital",
|
| 172 |
+
"best clinic","where to go","doctor recommendation","pregnancy","birth control","contraception",
|
| 173 |
+
"fertility","hillside","medical group","wellness center"
|
|
|
|
|
|
|
|
|
|
| 174 |
]
|
| 175 |
|
| 176 |
def contains_keywords(text: str) -> Tuple[bool, List[str]]:
|
|
|
|
| 178 |
hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
|
| 179 |
return (len(hits) > 0, hits)
|
| 180 |
|
| 181 |
+
# --- FIX #1: The Definitive Solution for the Selenium Crash ---
|
| 182 |
def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
|
| 183 |
options = webdriver.ChromeOptions()
|
| 184 |
|
| 185 |
+
# Define writable paths inside /tmp for Selenium's cache and user data
|
| 186 |
cache_path = os.path.join(WRITABLE_DIR, "selenium")
|
| 187 |
os.makedirs(cache_path, exist_ok=True)
|
| 188 |
os.environ["SE_CACHE_PATH"] = cache_path
|
|
|
|
| 189 |
user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
|
|
|
|
| 190 |
|
| 191 |
+
# Add all necessary arguments for a stable headless run in Docker
|
| 192 |
+
options.add_argument(f"--user-data-dir={user_data_dir}")
|
| 193 |
+
options.add_argument("--headless=new")
|
| 194 |
+
options.add_argument("--no-sandbox")
|
| 195 |
+
options.add_argument("--disable-dev-shm-usage") # CRITICAL: THIS IS THE FIX
|
| 196 |
+
options.add_argument("--disable-gpu")
|
| 197 |
options.add_argument("--disable-notifications")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
options.add_argument("--window-size=1920,1080")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
driver = webdriver.Chrome(options=options)
|
| 201 |
+
print("[SELENIUM] WebDriver session created successfully.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
return driver, user_data_dir
|
| 203 |
|
| 204 |
+
# --- FIX #2: Add Better Logging to the Login Process ---
|
| 205 |
def load_cookies(driver, cookies_file: str):
|
| 206 |
+
print("[FB] Navigating to Facebook homepage to load cookies...")
|
| 207 |
driver.get("https://www.facebook.com")
|
| 208 |
+
time.sleep(2)
|
| 209 |
+
|
| 210 |
+
if not os.path.exists(cookies_file):
|
| 211 |
+
raise RuntimeError(f"[FB] FATAL: Cookies file not found at {cookies_file}")
|
| 212 |
+
|
| 213 |
+
with open(cookies_file, "rb") as f:
|
| 214 |
+
cookies = pickle.load(f)
|
| 215 |
+
|
| 216 |
+
for cookie in cookies:
|
| 217 |
+
if "sameSite" in cookie and cookie["sameSite"] not in ["Strict","Lax","None"]:
|
| 218 |
+
cookie["sameSite"] = "Lax"
|
| 219 |
+
driver.add_cookie(cookie)
|
| 220 |
+
|
| 221 |
+
print("[FB] All cookies loaded. Refreshing page to apply session...")
|
| 222 |
+
driver.refresh()
|
| 223 |
+
time.sleep(5)
|
| 224 |
+
|
| 225 |
+
# Check for login success by looking for a keyword in the title
|
| 226 |
+
if "log in" in driver.title.lower():
|
| 227 |
+
print(f"[FB] WARNING: Login may have failed. Page title is: '{driver.title}'")
|
| 228 |
+
else:
|
| 229 |
+
print(f"[FB] Login appears successful. Page title is: '{driver.title}'")
|
| 230 |
|
| 231 |
def wait_group_feed(driver, wait):
|
| 232 |
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
|
|
|
|
| 240 |
try:
|
| 241 |
driver.find_element(By.XPATH, "//div[@role='article']")
|
| 242 |
feed_loaded = True; break
|
| 243 |
+
except NoSuchElementException: pass
|
|
|
|
| 244 |
time.sleep(1)
|
| 245 |
if not feed_loaded:
|
| 246 |
+
raise TimeoutException("Timed out waiting for group feed to load.")
|
| 247 |
|
| 248 |
def find_message_nodes(driver):
|
| 249 |
+
return driver.find_elements(By.XPATH, "//div[@role='article']")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
|
| 251 |
def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
|
| 252 |
print(f"[SCRAPE] Navigating to group: {group_url}")
|
| 253 |
driver.get(group_url)
|
| 254 |
wait_group_feed(driver, wait)
|
| 255 |
+
posts, seen = [], set()
|
|
|
|
| 256 |
for s in range(max_scrolls):
|
| 257 |
print(f"[SCRAPE] --- Scroll {s+1}/{max_scrolls} ---")
|
| 258 |
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
time.sleep(pause)
|
| 260 |
+
|
| 261 |
+
divs = find_message_nodes(driver)
|
| 262 |
+
added_this_scroll = 0
|
| 263 |
+
for d in divs:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
try:
|
| 265 |
txt = (d.text or "").strip()
|
| 266 |
+
if len(txt) < 25 or txt in seen: continue
|
| 267 |
+
|
| 268 |
+
# Filter out common UI text that gets scraped as a post
|
| 269 |
+
if any(ui_text in txt for ui_text in ["Comment Share", "Write a comment...", "View more comments"]):
|
| 270 |
+
continue
|
| 271 |
+
|
| 272 |
+
seen.add(txt)
|
| 273 |
+
posts.append({"id": len(posts) + 1, "text": txt, "group_link": group_url})
|
| 274 |
+
added_this_scroll += 1
|
| 275 |
except StaleElementReferenceException:
|
| 276 |
continue
|
| 277 |
+
print(f"[SCRAPE] Found {added_this_scroll} new, unique posts this scroll.")
|
| 278 |
+
print(f"[SCRAPE] Finished scraping. Total unique posts found: {len(posts)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
return posts
|
| 280 |
|
| 281 |
+
# --- FIX #3: Make the Script Fail Properly on Critical Errors ---
|
| 282 |
def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
|
| 283 |
driver = None
|
| 284 |
user_data_dir = None
|
| 285 |
posts = []
|
| 286 |
try:
|
| 287 |
driver, user_data_dir = new_driver(headless=True)
|
| 288 |
+
wait = WebDriverWait(driver, 20)
|
| 289 |
load_cookies(driver, cookies_file)
|
| 290 |
posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
|
| 291 |
except Exception as e:
|
| 292 |
+
print(f"[SCRAPE] FATAL ERROR during scraping: {e}")
|
| 293 |
+
# Re-raise the exception to make the script exit with a non-zero code
|
| 294 |
+
raise
|
| 295 |
finally:
|
| 296 |
if driver:
|
| 297 |
+
try: driver.quit()
|
| 298 |
+
except Exception: pass
|
|
|
|
|
|
|
| 299 |
if user_data_dir and os.path.exists(user_data_dir):
|
| 300 |
try:
|
| 301 |
shutil.rmtree(user_data_dir, ignore_errors=True)
|
| 302 |
+
print(f"[SELENIUM] Cleaned up user data directory: {user_data_dir}")
|
| 303 |
except Exception as e:
|
| 304 |
+
print(f"[SELENIUM] Error cleaning up directory {user_data_dir}: {e}")
|
| 305 |
return posts
|
| 306 |
|
| 307 |
def main():
|
|
|
|
| 309 |
os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
|
| 310 |
os.makedirs(os.path.dirname(args.analysis_out) or ".", exist_ok=True)
|
| 311 |
|
| 312 |
+
gemini_keys = [k.strip() for k in args.gemini_keys.split(",") if k.strip()] if args.gemini_keys else []
|
| 313 |
+
gemini_manager = GeminiManager(gemini_keys)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
posts = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
|
| 316 |
|
| 317 |
+
with open(args.out, "w", encoding="utf-8") as f:
|
| 318 |
+
json.dump(posts, f, ensure_ascii=False, indent=2)
|
| 319 |
+
print(f"[SCRAPE] Saved {len(posts)} scraped posts to {args.out}")
|
| 320 |
+
print(f"::SCRAPE_SAVED::{args.out}")
|
|
|
|
|
|
|
|
|
|
| 321 |
|
| 322 |
keyword_hits, confirmed = [], []
|
| 323 |
for p in posts:
|
|
|
|
| 327 |
keyword_hits.append(p)
|
| 328 |
print(f"::KW_HIT::{json.dumps({'id': p['id'], 'found_keywords': hits}, ensure_ascii=False)}")
|
| 329 |
|
| 330 |
+
per_call_sleep = 5
|
|
|
|
| 331 |
for idx, p in enumerate(keyword_hits, start=1):
|
| 332 |
found_kws = p.get("found_keywords", [])
|
| 333 |
ai = ai_medical_intent(gemini_manager, p.get("text",""), found_kws)
|
|
|
|
| 335 |
print(f"::AI_RESULT::{json.dumps({'id': p['id'], 'ai': ai}, ensure_ascii=False)}")
|
| 336 |
if ai.get("is_medical_seeking"):
|
| 337 |
confirmed.append(p)
|
|
|
|
| 338 |
if idx < len(keyword_hits):
|
| 339 |
time.sleep(per_call_sleep)
|
| 340 |
|
| 341 |
report = {
|
| 342 |
+
"analysis_date": datetime.now().isoformat(), "group_link": args.group,
|
| 343 |
+
"total_posts": len(posts), "keyword_hits": len(keyword_hits),
|
| 344 |
+
"confirmed_medical": len(confirmed), "emails_sent": 0, "posts": confirmed
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
}
|
| 346 |
|
| 347 |
+
with open(args.analysis_out, "w", encoding="utf-8") as f:
|
| 348 |
+
json.dump(report, f, ensure_ascii=False, indent=2)
|
| 349 |
+
print(f"[ANALYSIS] Saved analysis to {args.analysis_out}")
|
| 350 |
+
print(f"::ANALYSIS_SAVED::{args.analysis_out}")
|
|
|
|
|
|
|
|
|
|
| 351 |
|
| 352 |
if __name__ == "__main__":
|
| 353 |
try:
|
| 354 |
main()
|
| 355 |
+
except Exception:
|
| 356 |
+
# The detailed traceback is already printed in try_scrape_with_fallback
|
| 357 |
+
print("Main execution failed. Exiting with error.")
|
| 358 |
+
sys.exit(1) # Ensure a non-zero exit code on failure
|