sonuprasad23 committed on
Commit
f58cab6
·
1 Parent(s): 66d5034

Project Uploaded

Browse files
Files changed (1) hide show
  1. final5.py +23 -35
final5.py CHANGED
@@ -1,4 +1,4 @@
1
- import os, re, sys, time, json, base64, pickle, argparse, traceback
2
  from typing import List, Dict, Any, Tuple
3
  from datetime import datetime
4
  import tempfile
@@ -41,28 +41,20 @@ def get_args():
41
 
42
  GMAIL_SCOPES = [ "https://www.googleapis.com/auth/gmail.send" ]
43
 
44
- # --- FIX: Simplify this function. It's not the primary auth method. ---
45
  def build_gmail_service():
46
- """Builds Gmail service if a service account file exists."""
47
  if os.path.exists(SERVICE_ACCOUNT_FILE):
48
  try:
49
  sender_email = os.environ.get("SENDER_EMAIL")
50
  if not sender_email:
51
  print("[GMAIL] SENDER_EMAIL environment variable not set.")
52
  return None
53
-
54
  credentials = service_account.Credentials.from_service_account_file(
55
  SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES).with_subject(sender_email)
56
-
57
- svc = build("gmail", "v1", credentials=credentials)
58
- return svc
59
  except Exception as e:
60
  print(f"[GMAIL] Service account authentication failed in final5.py: {e}")
61
- return None
62
- print("[GMAIL] Service account file not found in final5.py.")
63
  return None
64
 
65
- # The send_html_email function is kept for potential future direct use, but it's not called by main()
66
  def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
67
  if not service:
68
  print("[GMAIL] service not available; skipping email")
@@ -224,16 +216,14 @@ def contains_keywords(text: str) -> Tuple[bool, List[str]]:
224
  hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
225
  return (len(hits) > 0, hits)
226
 
227
- # --- FIX: Set a writable cache path for Selenium Manager and ensure unique user-data-dir ---
228
- def new_driver(headless: bool):
229
  options = webdriver.ChromeOptions()
230
 
231
- # Specify a writable directory for Selenium Manager's driver cache
232
  cache_path = os.path.join(WRITABLE_DIR, "selenium")
233
  os.makedirs(cache_path, exist_ok=True)
234
  os.environ["SE_CACHE_PATH"] = cache_path
235
 
236
- # Create a unique temporary directory for this specific Chrome instance's user data
237
  user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
238
  options.add_argument(f"--user-data-dir={user_data_dir}")
239
 
@@ -263,8 +253,8 @@ def new_driver(headless: bool):
263
  })
264
  except Exception:
265
  pass
266
- return driver
267
-
268
 
269
  def load_cookies(driver, cookies_file: str):
270
  print("[FB] Loading Facebook homepage...")
@@ -278,8 +268,8 @@ def load_cookies(driver, cookies_file: str):
278
  cookie["sameSite"] = "Lax"
279
  try:
280
  driver.add_cookie(cookie)
281
- except Exception:
282
- pass
283
  print("[FB] Cookies loaded. Refreshing page...")
284
  driver.refresh()
285
  time.sleep(5)
@@ -363,33 +353,31 @@ def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
363
  print(f"[SCRAPE] Total unique posts: {total}")
364
  return posts
365
 
 
366
  def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
367
  driver = None
 
 
368
  try:
369
- driver = new_driver(headless=True)
370
  wait = WebDriverWait(driver, 15)
371
  load_cookies(driver, cookies_file)
372
  posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
373
- return posts, driver
374
  except Exception as e:
375
  print(f"[SCRAPE] Error in headless mode: {e}")
376
- return [], None
377
  finally:
378
  if driver:
379
  try:
380
- # Also try to clean up the temporary user data directory
381
- user_data_dir = None
382
- for arg in driver.options.arguments:
383
- if arg.startswith('--user-data-dir='):
384
- user_data_dir = arg.split('=', 1)[1]
385
- break
386
  driver.quit()
387
- if user_data_dir and os.path.exists(user_data_dir):
388
- import shutil
389
- shutil.rmtree(user_data_dir, ignore_errors=True)
390
  except Exception as e:
391
- print(f"Error during driver cleanup: {e}")
392
-
 
 
 
 
 
 
393
 
394
  def main():
395
  args = get_args()
@@ -406,11 +394,11 @@ def main():
406
  gemini_keys.append(key)
407
  gemini_manager = GeminiManager(gemini_keys) if gemini_keys else None
408
 
409
- # This is not used to send mail, but just to check if auth is possible
410
  _ = build_gmail_service()
411
 
412
- posts, driver = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
413
- # Driver is cleaned up in try_scrape_with_fallback's finally block
414
 
415
  try:
416
  with open(args.out, "w", encoding="utf-8") as f:
 
1
+ import os, re, sys, time, json, base64, pickle, argparse, traceback, shutil
2
  from typing import List, Dict, Any, Tuple
3
  from datetime import datetime
4
  import tempfile
 
41
 
42
  GMAIL_SCOPES = [ "https://www.googleapis.com/auth/gmail.send" ]
43
 
 
44
  def build_gmail_service():
 
45
  if os.path.exists(SERVICE_ACCOUNT_FILE):
46
  try:
47
  sender_email = os.environ.get("SENDER_EMAIL")
48
  if not sender_email:
49
  print("[GMAIL] SENDER_EMAIL environment variable not set.")
50
  return None
 
51
  credentials = service_account.Credentials.from_service_account_file(
52
  SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES).with_subject(sender_email)
53
+ return build("gmail", "v1", credentials=credentials)
 
 
54
  except Exception as e:
55
  print(f"[GMAIL] Service account authentication failed in final5.py: {e}")
 
 
56
  return None
57
 
 
58
  def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
59
  if not service:
60
  print("[GMAIL] service not available; skipping email")
 
216
  hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
217
  return (len(hits) > 0, hits)
218
 
219
+ # --- FIX: Return the user_data_dir for explicit cleanup ---
220
+ def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
221
  options = webdriver.ChromeOptions()
222
 
 
223
  cache_path = os.path.join(WRITABLE_DIR, "selenium")
224
  os.makedirs(cache_path, exist_ok=True)
225
  os.environ["SE_CACHE_PATH"] = cache_path
226
 
 
227
  user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
228
  options.add_argument(f"--user-data-dir={user_data_dir}")
229
 
 
253
  })
254
  except Exception:
255
  pass
256
+
257
+ return driver, user_data_dir
258
 
259
  def load_cookies(driver, cookies_file: str):
260
  print("[FB] Loading Facebook homepage...")
 
268
  cookie["sameSite"] = "Lax"
269
  try:
270
  driver.add_cookie(cookie)
271
+ except Exception as e:
272
+ print(f"Could not add cookie: {e}")
273
  print("[FB] Cookies loaded. Refreshing page...")
274
  driver.refresh()
275
  time.sleep(5)
 
353
  print(f"[SCRAPE] Total unique posts: {total}")
354
  return posts
355
 
356
+ # --- FIX: Robust cleanup of the driver and its user data directory ---
357
  def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
358
  driver = None
359
+ user_data_dir = None
360
+ posts = []
361
  try:
362
+ driver, user_data_dir = new_driver(headless=True)
363
  wait = WebDriverWait(driver, 15)
364
  load_cookies(driver, cookies_file)
365
  posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
 
366
  except Exception as e:
367
  print(f"[SCRAPE] Error in headless mode: {e}")
 
368
  finally:
369
  if driver:
370
  try:
 
 
 
 
 
 
371
  driver.quit()
 
 
 
372
  except Exception as e:
373
+ print(f"Error during driver.quit(): {e}")
374
+ if user_data_dir and os.path.exists(user_data_dir):
375
+ try:
376
+ shutil.rmtree(user_data_dir, ignore_errors=True)
377
+ print(f"Cleaned up user data directory: {user_data_dir}")
378
+ except Exception as e:
379
+ print(f"Error cleaning up user data directory {user_data_dir}: {e}")
380
+ return posts
381
 
382
  def main():
383
  args = get_args()
 
394
  gemini_keys.append(key)
395
  gemini_manager = GeminiManager(gemini_keys) if gemini_keys else None
396
 
397
+ # This is not used to send mail, just to confirm auth is possible.
398
  _ = build_gmail_service()
399
 
400
+ # Call the modified function which now returns only posts.
401
+ posts = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
402
 
403
  try:
404
  with open(args.out, "w", encoding="utf-8") as f: