Spaces:
Sleeping
Sleeping
Commit
·
f58cab6
1
Parent(s):
66d5034
Project Uploaded
Browse files
final5.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import os, re, sys, time, json, base64, pickle, argparse, traceback
|
| 2 |
from typing import List, Dict, Any, Tuple
|
| 3 |
from datetime import datetime
|
| 4 |
import tempfile
|
|
@@ -41,28 +41,20 @@ def get_args():
|
|
| 41 |
|
| 42 |
GMAIL_SCOPES = [ "https://www.googleapis.com/auth/gmail.send" ]
|
| 43 |
|
| 44 |
-
# --- FIX: Simplify this function. It's not the primary auth method. ---
|
| 45 |
def build_gmail_service():
    """Return a Gmail API client built from the service-account file, or None.

    The client is only built when ``SERVICE_ACCOUNT_FILE`` exists on disk and
    the ``SENDER_EMAIL`` environment variable is set; the credentials are
    scoped to ``GMAIL_SCOPES`` and bound to that address via ``with_subject``.
    Every failure path prints a ``[GMAIL]`` diagnostic and yields ``None``
    instead of raising.
    """
    # Guard: without the key file there is nothing to authenticate with.
    if not os.path.exists(SERVICE_ACCOUNT_FILE):
        print("[GMAIL] Service account file not found in final5.py.")
        return None

    try:
        delegate = os.environ.get("SENDER_EMAIL")
        if not delegate:
            print("[GMAIL] SENDER_EMAIL environment variable not set.")
            return None

        base = service_account.Credentials.from_service_account_file(
            SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES
        )
        return build("gmail", "v1", credentials=base.with_subject(delegate))
    except Exception as exc:
        # Best-effort probe: report the failure and let the caller carry on.
        print(f"[GMAIL] Service account authentication failed in final5.py: {exc}")
        return None
|
| 64 |
|
| 65 |
-
# The send_html_email function is kept for potential future direct use, but it's not called by main()
|
| 66 |
def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
|
| 67 |
if not service:
|
| 68 |
print("[GMAIL] service not available; skipping email")
|
|
@@ -224,16 +216,14 @@ def contains_keywords(text: str) -> Tuple[bool, List[str]]:
|
|
| 224 |
hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
|
| 225 |
return (len(hits) > 0, hits)
|
| 226 |
|
| 227 |
-
# --- FIX:
|
| 228 |
-
def new_driver(headless: bool):
|
| 229 |
options = webdriver.ChromeOptions()
|
| 230 |
|
| 231 |
-
# Specify a writable directory for Selenium Manager's driver cache
|
| 232 |
cache_path = os.path.join(WRITABLE_DIR, "selenium")
|
| 233 |
os.makedirs(cache_path, exist_ok=True)
|
| 234 |
os.environ["SE_CACHE_PATH"] = cache_path
|
| 235 |
|
| 236 |
-
# Create a unique temporary directory for this specific Chrome instance's user data
|
| 237 |
user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
|
| 238 |
options.add_argument(f"--user-data-dir={user_data_dir}")
|
| 239 |
|
|
@@ -263,8 +253,8 @@ def new_driver(headless: bool):
|
|
| 263 |
})
|
| 264 |
except Exception:
|
| 265 |
pass
|
| 266 |
-
|
| 267 |
-
|
| 268 |
|
| 269 |
def load_cookies(driver, cookies_file: str):
|
| 270 |
print("[FB] Loading Facebook homepage...")
|
|
@@ -278,8 +268,8 @@ def load_cookies(driver, cookies_file: str):
|
|
| 278 |
cookie["sameSite"] = "Lax"
|
| 279 |
try:
|
| 280 |
driver.add_cookie(cookie)
|
| 281 |
-
except Exception:
|
| 282 |
-
|
| 283 |
print("[FB] Cookies loaded. Refreshing page...")
|
| 284 |
driver.refresh()
|
| 285 |
time.sleep(5)
|
|
@@ -363,33 +353,31 @@ def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
|
|
| 363 |
print(f"[SCRAPE] Total unique posts: {total}")
|
| 364 |
return posts
|
| 365 |
|
|
|
|
| 366 |
def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
|
| 367 |
driver = None
|
|
|
|
|
|
|
| 368 |
try:
|
| 369 |
-
driver = new_driver(headless=True)
|
| 370 |
wait = WebDriverWait(driver, 15)
|
| 371 |
load_cookies(driver, cookies_file)
|
| 372 |
posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
|
| 373 |
-
return posts, driver
|
| 374 |
except Exception as e:
|
| 375 |
print(f"[SCRAPE] Error in headless mode: {e}")
|
| 376 |
-
return [], None
|
| 377 |
finally:
|
| 378 |
if driver:
|
| 379 |
try:
|
| 380 |
-
# Also try to clean up the temporary user data directory
|
| 381 |
-
user_data_dir = None
|
| 382 |
-
for arg in driver.options.arguments:
|
| 383 |
-
if arg.startswith('--user-data-dir='):
|
| 384 |
-
user_data_dir = arg.split('=', 1)[1]
|
| 385 |
-
break
|
| 386 |
driver.quit()
|
| 387 |
-
if user_data_dir and os.path.exists(user_data_dir):
|
| 388 |
-
import shutil
|
| 389 |
-
shutil.rmtree(user_data_dir, ignore_errors=True)
|
| 390 |
except Exception as e:
|
| 391 |
-
print(f"Error during driver
|
| 392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
|
| 394 |
def main():
|
| 395 |
args = get_args()
|
|
@@ -406,11 +394,11 @@ def main():
|
|
| 406 |
gemini_keys.append(key)
|
| 407 |
gemini_manager = GeminiManager(gemini_keys) if gemini_keys else None
|
| 408 |
|
| 409 |
-
# This is not used to send mail,
|
| 410 |
_ = build_gmail_service()
|
| 411 |
|
| 412 |
-
|
| 413 |
-
|
| 414 |
|
| 415 |
try:
|
| 416 |
with open(args.out, "w", encoding="utf-8") as f:
|
|
|
|
| 1 |
+
import os, re, sys, time, json, base64, pickle, argparse, traceback, shutil
|
| 2 |
from typing import List, Dict, Any, Tuple
|
| 3 |
from datetime import datetime
|
| 4 |
import tempfile
|
|
|
|
| 41 |
|
| 42 |
GMAIL_SCOPES = [ "https://www.googleapis.com/auth/gmail.send" ]
|
| 43 |
|
|
|
|
| 44 |
def build_gmail_service():
    """Build a Gmail API client from the service-account key, if possible.

    Returns the object produced by ``build("gmail", "v1", ...)`` when
    ``SERVICE_ACCOUNT_FILE`` exists and ``SENDER_EMAIL`` is set in the
    environment. Returns ``None`` when the key file is missing (silently),
    when ``SENDER_EMAIL`` is unset, or when credential loading / client
    construction raises (the latter two print a ``[GMAIL]`` message).
    """
    if not os.path.exists(SERVICE_ACCOUNT_FILE):
        return None

    try:
        subject_address = os.environ.get("SENDER_EMAIL")
        if not subject_address:
            print("[GMAIL] SENDER_EMAIL environment variable not set.")
            return None

        key_credentials = service_account.Credentials.from_service_account_file(
            SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES
        )
        # Bind the credentials to the sender address before building the client.
        delegated_credentials = key_credentials.with_subject(subject_address)
        return build("gmail", "v1", credentials=delegated_credentials)
    except Exception as err:
        print(f"[GMAIL] Service account authentication failed in final5.py: {err}")

    return None
|
| 57 |
|
|
|
|
| 58 |
def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
|
| 59 |
if not service:
|
| 60 |
print("[GMAIL] service not available; skipping email")
|
|
|
|
| 216 |
hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
|
| 217 |
return (len(hits) > 0, hits)
|
| 218 |
|
| 219 |
+
# --- FIX: Return the user_data_dir for explicit cleanup ---
|
| 220 |
+
def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
|
| 221 |
options = webdriver.ChromeOptions()
|
| 222 |
|
|
|
|
| 223 |
cache_path = os.path.join(WRITABLE_DIR, "selenium")
|
| 224 |
os.makedirs(cache_path, exist_ok=True)
|
| 225 |
os.environ["SE_CACHE_PATH"] = cache_path
|
| 226 |
|
|
|
|
| 227 |
user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
|
| 228 |
options.add_argument(f"--user-data-dir={user_data_dir}")
|
| 229 |
|
|
|
|
| 253 |
})
|
| 254 |
except Exception:
|
| 255 |
pass
|
| 256 |
+
|
| 257 |
+
return driver, user_data_dir
|
| 258 |
|
| 259 |
def load_cookies(driver, cookies_file: str):
|
| 260 |
print("[FB] Loading Facebook homepage...")
|
|
|
|
| 268 |
cookie["sameSite"] = "Lax"
|
| 269 |
try:
|
| 270 |
driver.add_cookie(cookie)
|
| 271 |
+
except Exception as e:
|
| 272 |
+
print(f"Could not add cookie: {e}")
|
| 273 |
print("[FB] Cookies loaded. Refreshing page...")
|
| 274 |
driver.refresh()
|
| 275 |
time.sleep(5)
|
|
|
|
| 353 |
print(f"[SCRAPE] Total unique posts: {total}")
|
| 354 |
return posts
|
| 355 |
|
| 356 |
+
# --- FIX: Robust cleanup of the driver and its user data directory ---
|
| 357 |
def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
    """Scrape one group in a headless browser and clean up afterwards.

    Starts a driver with ``new_driver(headless=True)``, loads cookies from
    ``cookies_file`` and runs ``scrape_group`` against ``group_url``. Any
    exception raised along the way is printed and swallowed, so the caller
    gets a result back instead of a traceback.

    Args:
        group_url: Passed through to ``scrape_group``.
        cookies_file: Passed through to ``load_cookies``.
        max_scrolls: Passed through to ``scrape_group``.
        pause: Passed through to ``scrape_group``.

    Returns:
        Whatever ``scrape_group`` returned, or an empty list if any step
        before or during scraping failed.
    """
    driver = None
    user_data_dir = None
    posts = []
    try:
        driver, user_data_dir = new_driver(headless=True)
        wait = WebDriverWait(driver, 15)
        load_cookies(driver, cookies_file)
        posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
    except Exception as e:
        print(f"[SCRAPE] Error in headless mode: {e}")
    finally:
        # Quit the browser before deleting the profile directory it was given.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(f"Error during driver.quit(): {e}")
        if user_data_dir and os.path.exists(user_data_dir):
            # Let rmtree raise: with ignore_errors=True a failed delete was
            # silently ignored and still reported as "Cleaned up".
            try:
                shutil.rmtree(user_data_dir)
            except OSError as e:
                print(f"Error cleaning up user data directory {user_data_dir}: {e}")
            else:
                print(f"Cleaned up user data directory: {user_data_dir}")
    return posts
|
| 381 |
|
| 382 |
def main():
|
| 383 |
args = get_args()
|
|
|
|
| 394 |
gemini_keys.append(key)
|
| 395 |
gemini_manager = GeminiManager(gemini_keys) if gemini_keys else None
|
| 396 |
|
| 397 |
+
# This is not used to send mail, just to confirm auth is possible.
|
| 398 |
_ = build_gmail_service()
|
| 399 |
|
| 400 |
+
# Call the modified function which now returns only posts.
|
| 401 |
+
posts = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
|
| 402 |
|
| 403 |
try:
|
| 404 |
with open(args.out, "w", encoding="utf-8") as f:
|