sonuprasad23 committed on
Commit
5571520
·
1 Parent(s): f58cab6

Project Uploaded

Browse files
Files changed (1) hide show
  1. final5.py +103 -201
final5.py CHANGED
@@ -55,10 +55,9 @@ def build_gmail_service():
55
  print(f"[GMAIL] Service account authentication failed in final5.py: {e}")
56
  return None
57
 
 
58
  def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
59
- if not service:
60
- print("[GMAIL] service not available; skipping email")
61
- return 0
62
  from email.message import EmailMessage
63
  sent = 0
64
  for to in to_list:
@@ -71,8 +70,6 @@ def send_html_email(service, sender: str, to_list: List[str], subject: str, html
71
  raw = base64.urlsafe_b64encode(msg.as_bytes()).decode("utf-8")
72
  service.users().messages().send(userId="me", body={"raw": raw}).execute()
73
  sent += 1
74
- except HttpError as e:
75
- print(f"[GMAIL] http error to {to}: {e}")
76
  except Exception as e:
77
  print(f"[GMAIL] send error to {to}: {e}")
78
  return sent
@@ -123,55 +120,30 @@ class GeminiManager:
123
  else:
124
  raise e
125
 
126
- def parse_retry_seconds_from_error(err: Exception) -> int:
127
- s = str(err)
128
- m1 = re.search(r"retry[_ ]delay\s*\{\s*seconds:\s*(\d+)", s, re.IGNORECASE)
129
- if m1: return int(m1.group(1))
130
- m2 = re.search(r'"retryDelay"\s*:\s*"(\d+)s"', s)
131
- if m2: return int(m2.group(1))
132
- return 45
133
-
134
  def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str,Any]:
135
  fallback = {
136
- "is_medical_seeking": False,
137
- "confidence": "low",
138
  "medical_summary": "Not a medical request (AI unavailable/throttled)",
139
- "suggested_services": [],
140
- "urgency_level": "low",
141
- "analysis": "Keyword-based fallback",
142
- "reasoning": "short explanation",
143
- "matched_keywords": found_keywords
144
  }
145
- if not gemini_manager or not gemini_manager.is_available():
146
- return fallback
147
  keywords_str = ", ".join(found_keywords) if found_keywords else "none"
148
  prompt = f"""
149
  Analyze this social post and decide if the author is genuinely seeking medical help, doctor/hospital recommendations, or healthcare services for PERSONAL HEALTH NEEDS (not business, donations, or casual mentions).
150
  KEYWORDS FOUND IN POST: {keywords_str}
151
  CRITICAL RULES:
152
  1. ONLY flag posts where someone is seeking medical care for themselves or a loved one
153
- 2. IGNORE posts about:
154
- - Business services (e.g., "Looking for a doctor for my clinic")
155
- - Donations or fundraising (e.g., "Raising money for surgery")
156
- - Selling medical products
157
- - Job postings for medical professionals
158
- - General health information sharing
159
- - Research or academic inquiries
160
  3. ONLY flag if it's a PERSONAL HEALTH NEED
161
  Post: "{post_text}"
162
  Return ONLY JSON:
163
  {{
164
- "is_medical_seeking": true/false,
165
- "confidence": "high/medium/low",
166
- "medical_summary": "short summary",
167
- "suggested_services": ["service1","service2"],
168
- "urgency_level": "high/medium/low",
169
- "analysis": "why it's seeking help",
170
- "reasoning": "short explanation",
171
- "matched_keywords": ["keyword1", "keyword2"]
172
- }}
173
- """
174
- for attempt in range(1, 5):
175
  try:
176
  resp = gemini_manager.generate_content(prompt)
177
  txt = (resp.text or "").strip()
@@ -179,36 +151,26 @@ Return ONLY JSON:
179
  if s >= 0 and e > s:
180
  result = json.loads(txt[s:e])
181
  result["is_medical_seeking"] = bool(result.get("is_medical_seeking", False))
182
- if "matched_keywords" not in result:
183
- result["matched_keywords"] = found_keywords
184
  return result
185
  return fallback
186
- except ResourceExhausted as e:
187
- wait_s = min(parse_retry_seconds_from_error(e) + 2, 120)
188
- print(f"[GEMINI] 429 rate limit; backoff {wait_s}s (attempt {attempt}/4)")
189
- time.sleep(wait_s)
190
- if gemini_manager.is_available():
191
- continue
192
- else:
193
- return fallback
194
  except Exception as e:
195
  print(f"[GEMINI] error: {e}")
196
  gemini_manager.rotate_key()
197
- if not gemini_manager.is_available():
198
- return fallback
199
  return fallback
200
 
201
  MEDICAL_KEYWORDS = [
202
- "doctor","physician","primary care","healthcare","medical","clinic","hospital",
203
- "urgent care","emergency","er","specialist","pediatrician","dentist",
204
- "gynecologist","obgyn","women's health","health center","family doctor",
205
- "maternity","prenatal","postnatal","labor","delivery",
206
- "need doctor","looking for doctor","find doctor","recommend doctor",
207
- "medical help","health help","appointment","checkup","treatment",
208
- "prescription","medicine","surgery","best hospital","best clinic",
209
- "where to go","doctor recommendation",
210
- "pregnancy","birth control","contraception","fertility",
211
- "hillside","medical group","wellness center"
212
  ]
213
 
214
  def contains_keywords(text: str) -> Tuple[bool, List[str]]:
@@ -216,67 +178,55 @@ def contains_keywords(text: str) -> Tuple[bool, List[str]]:
216
  hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
217
  return (len(hits) > 0, hits)
218
 
219
- # --- FIX: Return the user_data_dir for explicit cleanup ---
220
  def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
221
  options = webdriver.ChromeOptions()
222
 
 
223
  cache_path = os.path.join(WRITABLE_DIR, "selenium")
224
  os.makedirs(cache_path, exist_ok=True)
225
  os.environ["SE_CACHE_PATH"] = cache_path
226
-
227
  user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
228
- options.add_argument(f"--user-data-dir={user_data_dir}")
229
 
 
 
 
 
 
 
230
  options.add_argument("--disable-notifications")
231
- options.add_argument("--disable-web-security")
232
- options.add_argument("--disable-features=IsolateOrigins,site-per-process")
233
- options.add_argument("--disable-blink-features=AutomationControlled")
234
- options.add_experimental_option("useAutomationExtension", False)
235
- options.add_experimental_option("excludeSwitches", ["enable-automation"])
236
  options.add_argument("--window-size=1920,1080")
237
- options.add_argument("--lang=en-US,en")
238
- options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36")
239
- if headless:
240
- options.add_argument("--headless=new")
241
- options.add_argument("--disable-gpu")
242
- options.add_argument("--disable-dev-shm-usage")
243
- options.add_argument("--no-sandbox")
244
- options.add_argument("--disable-extensions")
245
- options.add_argument("--disable-plugins")
246
- options.add_argument("--disable-images")
247
 
248
  driver = webdriver.Chrome(options=options)
249
-
250
- try:
251
- driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
252
- "source": "Object.defineProperty(navigator, 'webdriver', { get: () => undefined });"
253
- })
254
- except Exception:
255
- pass
256
-
257
  return driver, user_data_dir
258
 
 
259
  def load_cookies(driver, cookies_file: str):
260
- print("[FB] Loading Facebook homepage...")
261
  driver.get("https://www.facebook.com")
262
- time.sleep(3)
263
- try:
264
- with open(cookies_file, "rb") as f:
265
- cookies = pickle.load(f)
266
- for cookie in cookies:
267
- if "sameSite" in cookie and cookie["sameSite"] not in ["Strict","Lax","None"]:
268
- cookie["sameSite"] = "Lax"
269
- try:
270
- driver.add_cookie(cookie)
271
- except Exception as e:
272
- print(f"Could not add cookie: {e}")
273
- print("[FB] Cookies loaded. Refreshing page...")
274
- driver.refresh()
275
- time.sleep(5)
276
- except FileNotFoundError:
277
- raise RuntimeError(f"[FB] Cookies file not found: {cookies_file}")
278
- except Exception as e:
279
- raise RuntimeError(f"[FB] Cookie load error: {e}")
 
 
 
 
280
 
281
  def wait_group_feed(driver, wait):
282
  wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
@@ -290,93 +240,68 @@ def wait_group_feed(driver, wait):
290
  try:
291
  driver.find_element(By.XPATH, "//div[@role='article']")
292
  feed_loaded = True; break
293
- except NoSuchElementException:
294
- pass
295
  time.sleep(1)
296
  if not feed_loaded:
297
- raise TimeoutException("Timed out waiting for group feed")
298
 
299
  def find_message_nodes(driver):
300
- nodes = driver.find_elements(By.XPATH, "//div[@data-ad-preview='message']")
301
- if nodes: return nodes
302
- nodes = driver.find_elements(By.XPATH, "//div[@data-ad-comet-preview='message']")
303
- if nodes: return nodes
304
- return driver.find_elements(By.XPATH, "//div[@role='article']//div[@dir='auto' and string-length(normalize-space())>0]")
305
 
306
  def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
307
  print(f"[SCRAPE] Navigating to group: {group_url}")
308
  driver.get(group_url)
309
  wait_group_feed(driver, wait)
310
- posts, seen, rects = [], set(), set()
311
- total = 0
312
  for s in range(max_scrolls):
313
  print(f"[SCRAPE] --- Scroll {s+1}/{max_scrolls} ---")
314
  driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
315
- try:
316
- wait.until(lambda d: d.execute_script("return document.readyState") == "complete")
317
- except Exception:
318
- pass
319
  time.sleep(pause)
320
- try:
321
- divs = find_message_nodes(driver)
322
- print(f"[SCRAPE] Nodes found: {len(divs)}")
323
- except Exception as e:
324
- print(f"[SCRAPE] find error: {e}")
325
- continue
326
- added = 0
327
- for i, d in enumerate(divs):
328
- try:
329
- rect = (d.rect.get('x'), d.rect.get('y'), d.rect.get('width'), d.rect.get('height'))
330
- if rect in rects: continue
331
- rects.add(rect)
332
- except Exception:
333
- pass
334
  try:
335
  txt = (d.text or "").strip()
336
- if len(txt) < 20:
337
- try:
338
- art = d.find_element(By.XPATH, "ancestor::div[@role='article']")
339
- txt = (art.text or "").strip()
340
- except Exception:
341
- pass
 
 
 
342
  except StaleElementReferenceException:
343
  continue
344
- if not txt or len(txt) < 20: continue
345
- if txt in seen: continue
346
- wc = len(re.findall(r"\b\w+\b", txt))
347
- if wc > 7 and not any(j in txt for j in ["LikeCommentShare","Write a comment","View more comments"]):
348
- seen.add(txt)
349
- total += 1
350
- posts.append({"id": total, "text": txt, "group_link": group_url})
351
- added += 1
352
- print(f"[SCRAPE] New posts this scroll: {added}")
353
- print(f"[SCRAPE] Total unique posts: {total}")
354
  return posts
355
 
356
- # --- FIX: Robust cleanup of the driver and its user data directory ---
357
  def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
358
  driver = None
359
  user_data_dir = None
360
  posts = []
361
  try:
362
  driver, user_data_dir = new_driver(headless=True)
363
- wait = WebDriverWait(driver, 15)
364
  load_cookies(driver, cookies_file)
365
  posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
366
  except Exception as e:
367
- print(f"[SCRAPE] Error in headless mode: {e}")
 
 
368
  finally:
369
  if driver:
370
- try:
371
- driver.quit()
372
- except Exception as e:
373
- print(f"Error during driver.quit(): {e}")
374
  if user_data_dir and os.path.exists(user_data_dir):
375
  try:
376
  shutil.rmtree(user_data_dir, ignore_errors=True)
377
- print(f"Cleaned up user data directory: {user_data_dir}")
378
  except Exception as e:
379
- print(f"Error cleaning up user data directory {user_data_dir}: {e}")
380
  return posts
381
 
382
  def main():
@@ -384,29 +309,15 @@ def main():
384
  os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
385
  os.makedirs(os.path.dirname(args.analysis_out) or ".", exist_ok=True)
386
 
387
- gemini_keys = []
388
- if args.gemini_keys:
389
- gemini_keys = [k.strip() for k in args.gemini_keys.split(",") if k.strip()]
390
- else:
391
- for i in range(1, 6):
392
- key = os.environ.get(f"GEMINI_API_KEY_{i}")
393
- if key:
394
- gemini_keys.append(key)
395
- gemini_manager = GeminiManager(gemini_keys) if gemini_keys else None
396
 
397
- # This is not used to send mail, just to confirm auth is possible.
398
- _ = build_gmail_service()
399
-
400
- # Call the modified function which now returns only posts.
401
  posts = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
402
 
403
- try:
404
- with open(args.out, "w", encoding="utf-8") as f:
405
- json.dump(posts, f, ensure_ascii=False, indent=2)
406
- print(f"[SCRAPE] Saved scraped posts to {args.out}")
407
- print(f"::SCRAPE_SAVED::{args.out}")
408
- except Exception as e:
409
- print(f"[SCRAPE] Error saving posts: {e}")
410
 
411
  keyword_hits, confirmed = [], []
412
  for p in posts:
@@ -416,8 +327,7 @@ def main():
416
  keyword_hits.append(p)
417
  print(f"::KW_HIT::{json.dumps({'id': p['id'], 'found_keywords': hits}, ensure_ascii=False)}")
418
 
419
- per_call_sleep = 7
420
- analyzed_posts = []
421
  for idx, p in enumerate(keyword_hits, start=1):
422
  found_kws = p.get("found_keywords", [])
423
  ai = ai_medical_intent(gemini_manager, p.get("text",""), found_kws)
@@ -425,32 +335,24 @@ def main():
425
  print(f"::AI_RESULT::{json.dumps({'id': p['id'], 'ai': ai}, ensure_ascii=False)}")
426
  if ai.get("is_medical_seeking"):
427
  confirmed.append(p)
428
- analyzed_posts.append(p)
429
  if idx < len(keyword_hits):
430
  time.sleep(per_call_sleep)
431
 
432
  report = {
433
- "analysis_date": datetime.now().isoformat(),
434
- "group_link": args.group,
435
- "total_posts": len(posts),
436
- "keyword_hits": len(keyword_hits),
437
- "confirmed_medical": len(confirmed),
438
- "emails_sent": 0,
439
- "posts": confirmed
440
  }
441
 
442
- try:
443
- with open(args.analysis_out, "w", encoding="utf-8") as f:
444
- json.dump(report, f, ensure_ascii=False, indent=2)
445
- print(f"[ANALYSIS] Saved analysis to {args.analysis_out}")
446
- print(f"::ANALYSIS_SAVED::{args.analysis_out}")
447
- except Exception as e:
448
- print(f"[ANALYSIS] Error saving analysis: {e}")
449
 
450
  if __name__ == "__main__":
451
  try:
452
  main()
453
- except Exception as e:
454
- print(f"Unhandled error in main: {e}")
455
- print(traceback.format_exc())
456
- raise
 
55
  print(f"[GMAIL] Service account authentication failed in final5.py: {e}")
56
  return None
57
 
58
+ # The send_html_email function is not used by main() but is kept for modularity
59
  def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
60
+ if not service: return 0
 
 
61
  from email.message import EmailMessage
62
  sent = 0
63
  for to in to_list:
 
70
  raw = base64.urlsafe_b64encode(msg.as_bytes()).decode("utf-8")
71
  service.users().messages().send(userId="me", body={"raw": raw}).execute()
72
  sent += 1
 
 
73
  except Exception as e:
74
  print(f"[GMAIL] send error to {to}: {e}")
75
  return sent
 
120
  else:
121
  raise e
122
 
 
 
 
 
 
 
 
 
123
  def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str,Any]:
124
  fallback = {
125
+ "is_medical_seeking": False, "confidence": "low",
 
126
  "medical_summary": "Not a medical request (AI unavailable/throttled)",
127
+ "suggested_services": [], "urgency_level": "low", "analysis": "Keyword-based fallback",
128
+ "reasoning": "short explanation", "matched_keywords": found_keywords
 
 
 
129
  }
130
+ if not gemini_manager or not gemini_manager.is_available(): return fallback
 
131
  keywords_str = ", ".join(found_keywords) if found_keywords else "none"
132
  prompt = f"""
133
  Analyze this social post and decide if the author is genuinely seeking medical help, doctor/hospital recommendations, or healthcare services for PERSONAL HEALTH NEEDS (not business, donations, or casual mentions).
134
  KEYWORDS FOUND IN POST: {keywords_str}
135
  CRITICAL RULES:
136
  1. ONLY flag posts where someone is seeking medical care for themselves or a loved one
137
+ 2. IGNORE posts about: business services, donations, selling products, job postings, general info sharing, or academic inquiries.
 
 
 
 
 
 
138
  3. ONLY flag if it's a PERSONAL HEALTH NEED
139
  Post: "{post_text}"
140
  Return ONLY JSON:
141
  {{
142
+ "is_medical_seeking": true/false, "confidence": "high/medium/low", "medical_summary": "short summary",
143
+ "suggested_services": ["service1","service2"], "urgency_level": "high/medium/low",
144
+ "analysis": "why it's seeking help", "reasoning": "short explanation", "matched_keywords": ["keyword1", "keyword2"]
145
+ }}"""
146
+ for _ in range(1, 5):
 
 
 
 
 
 
147
  try:
148
  resp = gemini_manager.generate_content(prompt)
149
  txt = (resp.text or "").strip()
 
151
  if s >= 0 and e > s:
152
  result = json.loads(txt[s:e])
153
  result["is_medical_seeking"] = bool(result.get("is_medical_seeking", False))
154
+ if "matched_keywords" not in result: result["matched_keywords"] = found_keywords
 
155
  return result
156
  return fallback
157
+ except ResourceExhausted:
158
+ gemini_manager.rotate_key()
159
+ if not gemini_manager.is_available(): return fallback
 
 
 
 
 
160
  except Exception as e:
161
  print(f"[GEMINI] error: {e}")
162
  gemini_manager.rotate_key()
163
+ if not gemini_manager.is_available(): return fallback
 
164
  return fallback
165
 
166
  MEDICAL_KEYWORDS = [
167
+ "doctor","physician","primary care","healthcare","medical","clinic","hospital","urgent care",
168
+ "emergency","er","specialist","pediatrician","dentist","gynecologist","obgyn","women's health",
169
+ "health center","family doctor","maternity","prenatal","postnatal","labor","delivery",
170
+ "need doctor","looking for doctor","find doctor","recommend doctor","medical help","health help",
171
+ "appointment","checkup","treatment","prescription","medicine","surgery","best hospital",
172
+ "best clinic","where to go","doctor recommendation","pregnancy","birth control","contraception",
173
+ "fertility","hillside","medical group","wellness center"
 
 
 
174
  ]
175
 
176
  def contains_keywords(text: str) -> Tuple[bool, List[str]]:
 
178
  hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
179
  return (len(hits) > 0, hits)
180
 
181
+ # --- FIX #1: The Definitive Solution for the Selenium Crash ---
182
  def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
183
  options = webdriver.ChromeOptions()
184
 
185
+ # Define writable paths inside /tmp for Selenium's cache and user data
186
  cache_path = os.path.join(WRITABLE_DIR, "selenium")
187
  os.makedirs(cache_path, exist_ok=True)
188
  os.environ["SE_CACHE_PATH"] = cache_path
 
189
  user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
 
190
 
191
+ # Add all necessary arguments for a stable headless run in Docker
192
+ options.add_argument(f"--user-data-dir={user_data_dir}")
193
+ options.add_argument("--headless=new")
194
+ options.add_argument("--no-sandbox")
195
+ options.add_argument("--disable-dev-shm-usage") # CRITICAL: THIS IS THE FIX
196
+ options.add_argument("--disable-gpu")
197
  options.add_argument("--disable-notifications")
 
 
 
 
 
198
  options.add_argument("--window-size=1920,1080")
 
 
 
 
 
 
 
 
 
 
199
 
200
  driver = webdriver.Chrome(options=options)
201
+ print("[SELENIUM] WebDriver session created successfully.")
 
 
 
 
 
 
 
202
  return driver, user_data_dir
203
 
204
+ # --- FIX #2: Add Better Logging to the Login Process ---
205
  def load_cookies(driver, cookies_file: str):
206
+ print("[FB] Navigating to Facebook homepage to load cookies...")
207
  driver.get("https://www.facebook.com")
208
+ time.sleep(2)
209
+
210
+ if not os.path.exists(cookies_file):
211
+ raise RuntimeError(f"[FB] FATAL: Cookies file not found at {cookies_file}")
212
+
213
+ with open(cookies_file, "rb") as f:
214
+ cookies = pickle.load(f)
215
+
216
+ for cookie in cookies:
217
+ if "sameSite" in cookie and cookie["sameSite"] not in ["Strict","Lax","None"]:
218
+ cookie["sameSite"] = "Lax"
219
+ driver.add_cookie(cookie)
220
+
221
+ print("[FB] All cookies loaded. Refreshing page to apply session...")
222
+ driver.refresh()
223
+ time.sleep(5)
224
+
225
+ # Check for login success by looking for a keyword in the title
226
+ if "log in" in driver.title.lower():
227
+ print(f"[FB] WARNING: Login may have failed. Page title is: '{driver.title}'")
228
+ else:
229
+ print(f"[FB] Login appears successful. Page title is: '{driver.title}'")
230
 
231
  def wait_group_feed(driver, wait):
232
  wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
 
240
  try:
241
  driver.find_element(By.XPATH, "//div[@role='article']")
242
  feed_loaded = True; break
243
+ except NoSuchElementException: pass
 
244
  time.sleep(1)
245
  if not feed_loaded:
246
+ raise TimeoutException("Timed out waiting for group feed to load.")
247
 
248
  def find_message_nodes(driver):
249
+ return driver.find_elements(By.XPATH, "//div[@role='article']")
 
 
 
 
250
 
251
  def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
252
  print(f"[SCRAPE] Navigating to group: {group_url}")
253
  driver.get(group_url)
254
  wait_group_feed(driver, wait)
255
+ posts, seen = [], set()
 
256
  for s in range(max_scrolls):
257
  print(f"[SCRAPE] --- Scroll {s+1}/{max_scrolls} ---")
258
  driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
 
 
 
 
259
  time.sleep(pause)
260
+
261
+ divs = find_message_nodes(driver)
262
+ added_this_scroll = 0
263
+ for d in divs:
 
 
 
 
 
 
 
 
 
 
264
  try:
265
  txt = (d.text or "").strip()
266
+ if len(txt) < 25 or txt in seen: continue
267
+
268
+ # Filter out common UI text that gets scraped as a post
269
+ if any(ui_text in txt for ui_text in ["Comment Share", "Write a comment...", "View more comments"]):
270
+ continue
271
+
272
+ seen.add(txt)
273
+ posts.append({"id": len(posts) + 1, "text": txt, "group_link": group_url})
274
+ added_this_scroll += 1
275
  except StaleElementReferenceException:
276
  continue
277
+ print(f"[SCRAPE] Found {added_this_scroll} new, unique posts this scroll.")
278
+ print(f"[SCRAPE] Finished scraping. Total unique posts found: {len(posts)}")
 
 
 
 
 
 
 
 
279
  return posts
280
 
281
+ # --- FIX #3: Make the Script Fail Properly on Critical Errors ---
282
  def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
283
  driver = None
284
  user_data_dir = None
285
  posts = []
286
  try:
287
  driver, user_data_dir = new_driver(headless=True)
288
+ wait = WebDriverWait(driver, 20)
289
  load_cookies(driver, cookies_file)
290
  posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
291
  except Exception as e:
292
+ print(f"[SCRAPE] FATAL ERROR during scraping: {e}")
293
+ # Re-raise the exception to make the script exit with a non-zero code
294
+ raise
295
  finally:
296
  if driver:
297
+ try: driver.quit()
298
+ except Exception: pass
 
 
299
  if user_data_dir and os.path.exists(user_data_dir):
300
  try:
301
  shutil.rmtree(user_data_dir, ignore_errors=True)
302
+ print(f"[SELENIUM] Cleaned up user data directory: {user_data_dir}")
303
  except Exception as e:
304
+ print(f"[SELENIUM] Error cleaning up directory {user_data_dir}: {e}")
305
  return posts
306
 
307
  def main():
 
309
  os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
310
  os.makedirs(os.path.dirname(args.analysis_out) or ".", exist_ok=True)
311
 
312
+ gemini_keys = [k.strip() for k in args.gemini_keys.split(",") if k.strip()] if args.gemini_keys else []
313
+ gemini_manager = GeminiManager(gemini_keys)
 
 
 
 
 
 
 
314
 
 
 
 
 
315
  posts = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
316
 
317
+ with open(args.out, "w", encoding="utf-8") as f:
318
+ json.dump(posts, f, ensure_ascii=False, indent=2)
319
+ print(f"[SCRAPE] Saved {len(posts)} scraped posts to {args.out}")
320
+ print(f"::SCRAPE_SAVED::{args.out}")
 
 
 
321
 
322
  keyword_hits, confirmed = [], []
323
  for p in posts:
 
327
  keyword_hits.append(p)
328
  print(f"::KW_HIT::{json.dumps({'id': p['id'], 'found_keywords': hits}, ensure_ascii=False)}")
329
 
330
+ per_call_sleep = 5
 
331
  for idx, p in enumerate(keyword_hits, start=1):
332
  found_kws = p.get("found_keywords", [])
333
  ai = ai_medical_intent(gemini_manager, p.get("text",""), found_kws)
 
335
  print(f"::AI_RESULT::{json.dumps({'id': p['id'], 'ai': ai}, ensure_ascii=False)}")
336
  if ai.get("is_medical_seeking"):
337
  confirmed.append(p)
 
338
  if idx < len(keyword_hits):
339
  time.sleep(per_call_sleep)
340
 
341
  report = {
342
+ "analysis_date": datetime.now().isoformat(), "group_link": args.group,
343
+ "total_posts": len(posts), "keyword_hits": len(keyword_hits),
344
+ "confirmed_medical": len(confirmed), "emails_sent": 0, "posts": confirmed
 
 
 
 
345
  }
346
 
347
+ with open(args.analysis_out, "w", encoding="utf-8") as f:
348
+ json.dump(report, f, ensure_ascii=False, indent=2)
349
+ print(f"[ANALYSIS] Saved analysis to {args.analysis_out}")
350
+ print(f"::ANALYSIS_SAVED::{args.analysis_out}")
 
 
 
351
 
352
  if __name__ == "__main__":
353
  try:
354
  main()
355
+ except Exception:
356
+ # The detailed traceback is already printed in try_scrape_with_fallback
357
+ print("Main execution failed. Exiting with error.")
358
+ sys.exit(1) # Ensure a non-zero exit code on failure