Spaces:
Sleeping
Sleeping
Commit
·
7eb9383
1
Parent(s):
cbfc2e1
Project Uploaded
Browse files- api_server.py +28 -53
- final5.py +49 -65
api_server.py
CHANGED
|
@@ -4,17 +4,14 @@ from dataclasses import dataclass, field
|
|
| 4 |
from typing import List, Dict, Any, Optional
|
| 5 |
from flask import Flask, request, jsonify, send_from_directory
|
| 6 |
from flask_cors import CORS
|
| 7 |
-
from
|
| 8 |
-
from google.auth.transport.requests import Request
|
| 9 |
from googleapiclient.discovery import build
|
| 10 |
from googleapiclient.errors import HttpError
|
| 11 |
-
from google.oauth2 import service_account # Added import
|
| 12 |
from dotenv import load_dotenv
|
| 13 |
|
| 14 |
load_dotenv()
|
| 15 |
|
| 16 |
def decode_base64_with_padding(b64_string: str) -> bytes:
|
| 17 |
-
"""Decodes a Base64 string, adding missing padding if necessary."""
|
| 18 |
missing_padding = len(b64_string) % 4
|
| 19 |
if missing_padding:
|
| 20 |
b64_string += '=' * (4 - missing_padding)
|
|
@@ -26,29 +23,21 @@ def decode_base64_with_padding(b64_string: str) -> bytes:
|
|
| 26 |
|
| 27 |
# Define a writable directory for ALL runtime files
|
| 28 |
WRITABLE_DIR = "/tmp"
|
| 29 |
-
CREDENTIALS_PATH = os.path.join(WRITABLE_DIR, "credentials.json")
|
| 30 |
COOKIES_PATH = os.path.join(WRITABLE_DIR, "facebook_cookies.pkl")
|
| 31 |
-
|
| 32 |
-
SERVICE_ACCOUNT_PATH = os.path.join(WRITABLE_DIR, "service_account.json") # Added
|
| 33 |
|
| 34 |
# Decode secrets at startup into the /tmp directory
|
| 35 |
-
if 'CREDENTIALS_B64' in os.environ:
|
| 36 |
-
decoded_creds = decode_base64_with_padding(os.environ['CREDENTIALS_B64'])
|
| 37 |
-
if decoded_creds:
|
| 38 |
-
with open(CREDENTIALS_PATH, 'w') as f:
|
| 39 |
-
f.write(decoded_creds.decode('utf-8'))
|
| 40 |
-
|
| 41 |
if 'FB_COOKIES_B64' in os.environ:
|
| 42 |
decoded_cookies = decode_base64_with_padding(os.environ['FB_COOKIES_B64'])
|
| 43 |
if decoded_cookies:
|
| 44 |
with open(COOKIES_PATH, 'wb') as f:
|
| 45 |
f.write(decoded_cookies)
|
| 46 |
|
| 47 |
-
#
|
| 48 |
if 'SERVICE_ACCOUNT_B64' in os.environ:
|
| 49 |
decoded_service_account = decode_base64_with_padding(os.environ['SERVICE_ACCOUNT_B64'])
|
| 50 |
if decoded_service_account:
|
| 51 |
-
with open(
|
| 52 |
f.write(decoded_service_account.decode('utf-8'))
|
| 53 |
|
| 54 |
GROUPS_TXT = os.environ.get("GROUPS_TXT", "groups.txt")
|
|
@@ -56,12 +45,8 @@ FINAL5_PATH = os.environ.get("FINAL5_PATH", "final5.py")
|
|
| 56 |
PYTHON_BIN = os.environ.get("PYTHON_BIN", "python")
|
| 57 |
SENDER_EMAIL = os.environ.get("SENDER_EMAIL", "smahato@hillsidemedicalgroup.com")
|
| 58 |
|
| 59 |
-
# --- MODIFICATION START ---
|
| 60 |
-
# Point the output directories to the writable /tmp directory
|
| 61 |
SCRAPE_OUTDIR = os.path.join(WRITABLE_DIR, "scraped")
|
| 62 |
ANALYSIS_OUTDIR = os.path.join(WRITABLE_DIR, "analysis")
|
| 63 |
-
# --- MODIFICATION END ---
|
| 64 |
-
|
| 65 |
|
| 66 |
GEMINI_KEYS = []
|
| 67 |
for i in range(1, 6):
|
|
@@ -69,14 +54,32 @@ for i in range(1, 6):
|
|
| 69 |
if key:
|
| 70 |
GEMINI_KEYS.append(key)
|
| 71 |
|
| 72 |
-
|
| 73 |
-
GMAIL_SCOPES = [
|
| 74 |
-
"https://www.googleapis.com/auth/gmail.send",
|
| 75 |
-
"https://www.googleapis.com/auth/gmail.metadata",
|
| 76 |
-
]
|
| 77 |
os.makedirs(SCRAPE_OUTDIR, exist_ok=True)
|
| 78 |
os.makedirs(ANALYSIS_OUTDIR, exist_ok=True)
|
| 79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
@dataclass
|
| 81 |
class GroupRun:
|
| 82 |
link: str
|
|
@@ -199,35 +202,6 @@ def slugify(url: str) -> str:
|
|
| 199 |
s = re.sub(r"[^a-zA-Z0-9]+", "-", url)
|
| 200 |
return s.strip("-").lower()
|
| 201 |
|
| 202 |
-
def build_gmail_service():
|
| 203 |
-
# Try service account first
|
| 204 |
-
if os.path.exists(SERVICE_ACCOUNT_PATH):
|
| 205 |
-
try:
|
| 206 |
-
credentials = service_account.Credentials.from_service_account_file(
|
| 207 |
-
SERVICE_ACCOUNT_PATH, scopes=GMAIL_SCOPES)
|
| 208 |
-
return build("gmail", "v1", credentials=credentials)
|
| 209 |
-
except Exception as e:
|
| 210 |
-
log(f"Service account authentication failed: {e}", "error", "gmail")
|
| 211 |
-
|
| 212 |
-
# Fallback to token.pickle if exists
|
| 213 |
-
creds = None
|
| 214 |
-
if os.path.exists(TOKEN_PATH):
|
| 215 |
-
with open(TOKEN_PATH, "rb") as token: creds = pickle.load(token)
|
| 216 |
-
if not creds or not creds.valid:
|
| 217 |
-
if creds and creds.expired and creds.refresh_token:
|
| 218 |
-
creds.refresh(Request())
|
| 219 |
-
else:
|
| 220 |
-
log("No valid credentials available; Gmail unavailable", "warn", "gmail")
|
| 221 |
-
return None
|
| 222 |
-
with open(TOKEN_PATH, "wb") as token: pickle.dump(creds, token)
|
| 223 |
-
try:
|
| 224 |
-
return build("gmail", "v1", credentials=creds)
|
| 225 |
-
except Exception as e:
|
| 226 |
-
log(f"Gmail service build failed: {e}", "error", "gmail")
|
| 227 |
-
return None
|
| 228 |
-
|
| 229 |
-
gmail_service = build_gmail_service()
|
| 230 |
-
|
| 231 |
def send_html_email(to_emails: List[str], subject: str, html_content: str) -> int:
|
| 232 |
if not gmail_service:
|
| 233 |
log("Gmail not configured; skipping email", "warn", "gmail")
|
|
@@ -244,6 +218,7 @@ def send_html_email(to_emails: List[str], subject: str, html_content: str) -> in
|
|
| 244 |
raw = base64.urlsafe_b64encode(msg.as_bytes()).decode("utf-8")
|
| 245 |
gmail_service.users().messages().send(userId="me", body={"raw": raw}).execute()
|
| 246 |
sent += 1
|
|
|
|
| 247 |
except HttpError as e:
|
| 248 |
log(f"Gmail HTTP error to {to}: {e}", "error", "gmail")
|
| 249 |
except Exception as e:
|
|
|
|
| 4 |
from typing import List, Dict, Any, Optional
|
| 5 |
from flask import Flask, request, jsonify, send_from_directory
|
| 6 |
from flask_cors import CORS
|
| 7 |
+
from google.oauth2 import service_account
|
|
|
|
| 8 |
from googleapiclient.discovery import build
|
| 9 |
from googleapiclient.errors import HttpError
|
|
|
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
|
| 12 |
load_dotenv()
|
| 13 |
|
| 14 |
def decode_base64_with_padding(b64_string: str) -> bytes:
|
|
|
|
| 15 |
missing_padding = len(b64_string) % 4
|
| 16 |
if missing_padding:
|
| 17 |
b64_string += '=' * (4 - missing_padding)
|
|
|
|
| 23 |
|
| 24 |
# Define a writable directory for ALL runtime files
|
| 25 |
WRITABLE_DIR = "/tmp"
|
|
|
|
| 26 |
COOKIES_PATH = os.path.join(WRITABLE_DIR, "facebook_cookies.pkl")
|
| 27 |
+
SERVICE_ACCOUNT_FILE = os.path.join(WRITABLE_DIR, "service_account.json")
|
|
|
|
| 28 |
|
| 29 |
# Decode secrets at startup into the /tmp directory
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
if 'FB_COOKIES_B64' in os.environ:
|
| 31 |
decoded_cookies = decode_base64_with_padding(os.environ['FB_COOKIES_B64'])
|
| 32 |
if decoded_cookies:
|
| 33 |
with open(COOKIES_PATH, 'wb') as f:
|
| 34 |
f.write(decoded_cookies)
|
| 35 |
|
| 36 |
+
# --- FIX: Decode the Service Account secret ---
|
| 37 |
if 'SERVICE_ACCOUNT_B64' in os.environ:
|
| 38 |
decoded_service_account = decode_base64_with_padding(os.environ['SERVICE_ACCOUNT_B64'])
|
| 39 |
if decoded_service_account:
|
| 40 |
+
with open(SERVICE_ACCOUNT_FILE, 'w') as f:
|
| 41 |
f.write(decoded_service_account.decode('utf-8'))
|
| 42 |
|
| 43 |
GROUPS_TXT = os.environ.get("GROUPS_TXT", "groups.txt")
|
|
|
|
| 45 |
PYTHON_BIN = os.environ.get("PYTHON_BIN", "python")
|
| 46 |
SENDER_EMAIL = os.environ.get("SENDER_EMAIL", "smahato@hillsidemedicalgroup.com")
|
| 47 |
|
|
|
|
|
|
|
| 48 |
SCRAPE_OUTDIR = os.path.join(WRITABLE_DIR, "scraped")
|
| 49 |
ANALYSIS_OUTDIR = os.path.join(WRITABLE_DIR, "analysis")
|
|
|
|
|
|
|
| 50 |
|
| 51 |
GEMINI_KEYS = []
|
| 52 |
for i in range(1, 6):
|
|
|
|
| 54 |
if key:
|
| 55 |
GEMINI_KEYS.append(key)
|
| 56 |
|
| 57 |
+
GMAIL_SCOPES = [ "https://www.googleapis.com/auth/gmail.send" ]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
os.makedirs(SCRAPE_OUTDIR, exist_ok=True)
|
| 59 |
os.makedirs(ANALYSIS_OUTDIR, exist_ok=True)
|
| 60 |
|
| 61 |
+
# --- FIX: Implement non-interactive, service-account based authentication ---
|
| 62 |
+
def _gmail_log(message, level):
    """Report a Gmail-related message through ``log`` when it is available.

    NOTE(review): build_gmail_service() is invoked at import time directly
    after its definition, and no ``log`` helper is visible among the
    definitions that precede that call -- confirm where ``log`` is defined.
    Looking the name up lazily (and falling back to ``print``) keeps the
    module importable either way.

    Args:
        message: Text to report.
        level: Severity string passed to ``log`` (e.g. "info", "error").
    """
    logger = globals().get("log")
    if callable(logger):
        logger(message, level, "GMAIL")
    else:
        print(f"[GMAIL] [{level}] {message}")


def build_gmail_service():
    """Build a Gmail API client from the service-account key file.

    The credentials are loaded from SERVICE_ACCOUNT_FILE with GMAIL_SCOPES
    and delegated to SENDER_EMAIL via ``with_subject`` so that mail is sent
    as that user.

    Returns:
        The object returned by ``build("gmail", "v1", ...)``, or ``None``
        when the key file is missing or the client cannot be constructed.
    """
    if not os.path.exists(SERVICE_ACCOUNT_FILE):
        _gmail_log("Service account file not found, Gmail unavailable.", "error")
        return None

    try:
        # Impersonate the SENDER_EMAIL user
        creds = service_account.Credentials.from_service_account_file(
            SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES
        ).with_subject(SENDER_EMAIL)
        service = build("gmail", "v1", credentials=creds)
    except Exception as e:
        # Broad on purpose: this runs at import time and a Gmail problem
        # should disable email rather than prevent the server from starting.
        _gmail_log(f"Failed to build Gmail service: {e}", "error")
        # f-string so the configured sender address is actually interpolated
        # (the original literal logged the text "{SENDER_EMAIL}").
        _gmail_log(
            "CRITICAL: Ensure your service account has Domain-Wide Delegation "
            f"enabled and is authorized for the GMail API for the user {SENDER_EMAIL}",
            "error",
        )
        return None

    _gmail_log("Gmail service built successfully using service account.", "info")
    return service
|
| 78 |
+
|
| 79 |
+
gmail_service = build_gmail_service()
|
| 80 |
+
|
| 81 |
+
# The rest of api_server.py remains unchanged...
|
| 82 |
+
# (Full code omitted for brevity, just replace the top section and the build_gmail_service function)
|
| 83 |
@dataclass
|
| 84 |
class GroupRun:
|
| 85 |
link: str
|
|
|
|
| 202 |
s = re.sub(r"[^a-zA-Z0-9]+", "-", url)
|
| 203 |
return s.strip("-").lower()
|
| 204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
def send_html_email(to_emails: List[str], subject: str, html_content: str) -> int:
|
| 206 |
if not gmail_service:
|
| 207 |
log("Gmail not configured; skipping email", "warn", "gmail")
|
|
|
|
| 218 |
raw = base64.urlsafe_b64encode(msg.as_bytes()).decode("utf-8")
|
| 219 |
gmail_service.users().messages().send(userId="me", body={"raw": raw}).execute()
|
| 220 |
sent += 1
|
| 221 |
+
log(f"Successfully sent email to {to}", "info", "GMAIL")
|
| 222 |
except HttpError as e:
|
| 223 |
log(f"Gmail HTTP error to {to}: {e}", "error", "gmail")
|
| 224 |
except Exception as e:
|
final5.py
CHANGED
|
@@ -14,19 +14,15 @@ from selenium.webdriver.common.by import By
|
|
| 14 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 15 |
from selenium.webdriver.support import expected_conditions as EC
|
| 16 |
from selenium.common.exceptions import (
|
| 17 |
-
StaleElementReferenceException, NoSuchElementException, TimeoutException
|
| 18 |
)
|
| 19 |
-
from
|
| 20 |
-
from google.auth.transport.requests import Request
|
| 21 |
from googleapiclient.discovery import build
|
| 22 |
from googleapiclient.errors import HttpError
|
| 23 |
import google.generativeai as genai
|
| 24 |
from google.api_core.exceptions import ResourceExhausted
|
| 25 |
-
from google.oauth2 import service_account
|
| 26 |
|
| 27 |
WRITABLE_DIR = "/tmp"
|
| 28 |
-
CREDENTIALS_PATH = os.path.join(WRITABLE_DIR, "credentials.json")
|
| 29 |
-
TOKEN_PATH = os.path.join(WRITABLE_DIR, "token.pickle")
|
| 30 |
SERVICE_ACCOUNT_FILE = os.path.join(WRITABLE_DIR, "service_account.json")
|
| 31 |
|
| 32 |
def get_args():
|
|
@@ -43,48 +39,30 @@ def get_args():
|
|
| 43 |
p.add_argument("--headless", action="store_true", help="Prefer headless browser")
|
| 44 |
return p.parse_args()
|
| 45 |
|
| 46 |
-
GMAIL_SCOPES = [
|
| 47 |
-
"https://www.googleapis.com/auth/gmail.send",
|
| 48 |
-
"https://www.googleapis.com/auth/gmail.metadata",
|
| 49 |
-
]
|
| 50 |
|
|
|
|
| 51 |
def build_gmail_service():
|
|
|
|
| 52 |
if os.path.exists(SERVICE_ACCOUNT_FILE):
|
| 53 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
credentials = service_account.Credentials.from_service_account_file(
|
| 55 |
-
SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES)
|
|
|
|
| 56 |
svc = build("gmail", "v1", credentials=credentials)
|
| 57 |
-
_ = svc.users().getProfile(userId="me").execute()
|
| 58 |
return svc
|
| 59 |
except Exception as e:
|
| 60 |
-
print(f"[GMAIL] Service account authentication failed: {e}")
|
| 61 |
-
|
| 62 |
-
creds = None
|
| 63 |
-
if os.path.exists(TOKEN_PATH):
|
| 64 |
-
try:
|
| 65 |
-
with open(TOKEN_PATH, "rb") as token:
|
| 66 |
-
creds = pickle.load(token)
|
| 67 |
-
except Exception:
|
| 68 |
-
creds = None
|
| 69 |
-
if not creds or not creds.valid:
|
| 70 |
-
if creds and creds.expired and creds.refresh_token:
|
| 71 |
-
try:
|
| 72 |
-
creds.refresh(Request())
|
| 73 |
-
except Exception:
|
| 74 |
-
creds = None
|
| 75 |
-
if not creds:
|
| 76 |
-
print("[GMAIL] No valid credentials available or interactive auth required.")
|
| 77 |
return None
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
try:
|
| 81 |
-
svc = build("gmail", "v1", credentials=creds)
|
| 82 |
-
_ = svc.users().getProfile(userId="me").execute()
|
| 83 |
-
return svc
|
| 84 |
-
except Exception as e:
|
| 85 |
-
print(f"[GMAIL] service build failed: {e}")
|
| 86 |
-
return None
|
| 87 |
|
|
|
|
| 88 |
def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
|
| 89 |
if not service:
|
| 90 |
print("[GMAIL] service not available; skipping email")
|
|
@@ -246,23 +224,18 @@ def contains_keywords(text: str) -> Tuple[bool, List[str]]:
|
|
| 246 |
hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
|
| 247 |
return (len(hits) > 0, hits)
|
| 248 |
|
| 249 |
-
#
|
| 250 |
-
import tempfile
|
| 251 |
-
import os
|
| 252 |
-
|
| 253 |
def new_driver(headless: bool):
|
| 254 |
options = webdriver.ChromeOptions()
|
| 255 |
|
| 256 |
-
#
|
| 257 |
-
|
| 258 |
-
os.
|
| 259 |
-
|
| 260 |
-
os.makedirs("/tmp/.cache/selenium", exist_ok=True)
|
| 261 |
|
| 262 |
-
# Create a unique temporary directory for Chrome's user data
|
| 263 |
-
user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=
|
| 264 |
options.add_argument(f"--user-data-dir={user_data_dir}")
|
| 265 |
-
# --- MODIFICATION END ---
|
| 266 |
|
| 267 |
options.add_argument("--disable-notifications")
|
| 268 |
options.add_argument("--disable-web-security")
|
|
@@ -272,7 +245,6 @@ def new_driver(headless: bool):
|
|
| 272 |
options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
| 273 |
options.add_argument("--window-size=1920,1080")
|
| 274 |
options.add_argument("--lang=en-US,en")
|
| 275 |
-
# Consider removing the hardcoded user-agent or making it configurable
|
| 276 |
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36")
|
| 277 |
if headless:
|
| 278 |
options.add_argument("--headless=new")
|
|
@@ -282,7 +254,9 @@ def new_driver(headless: bool):
|
|
| 282 |
options.add_argument("--disable-extensions")
|
| 283 |
options.add_argument("--disable-plugins")
|
| 284 |
options.add_argument("--disable-images")
|
|
|
|
| 285 |
driver = webdriver.Chrome(options=options)
|
|
|
|
| 286 |
try:
|
| 287 |
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
|
| 288 |
"source": "Object.defineProperty(navigator, 'webdriver', { get: () => undefined });"
|
|
@@ -291,6 +265,7 @@ def new_driver(headless: bool):
|
|
| 291 |
pass
|
| 292 |
return driver
|
| 293 |
|
|
|
|
| 294 |
def load_cookies(driver, cookies_file: str):
|
| 295 |
print("[FB] Loading Facebook homepage...")
|
| 296 |
driver.get("https://www.facebook.com")
|
|
@@ -389,19 +364,32 @@ def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
|
|
| 389 |
return posts
|
| 390 |
|
| 391 |
def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
|
| 392 |
-
driver =
|
| 393 |
-
wait = WebDriverWait(driver, 15)
|
| 394 |
try:
|
|
|
|
|
|
|
| 395 |
load_cookies(driver, cookies_file)
|
| 396 |
posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
|
| 397 |
return posts, driver
|
| 398 |
except Exception as e:
|
| 399 |
-
try:
|
| 400 |
-
driver.quit()
|
| 401 |
-
except Exception:
|
| 402 |
-
pass
|
| 403 |
print(f"[SCRAPE] Error in headless mode: {e}")
|
| 404 |
return [], None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
|
| 406 |
def main():
|
| 407 |
args = get_args()
|
|
@@ -418,14 +406,11 @@ def main():
|
|
| 418 |
gemini_keys.append(key)
|
| 419 |
gemini_manager = GeminiManager(gemini_keys) if gemini_keys else None
|
| 420 |
|
| 421 |
-
|
|
|
|
| 422 |
|
| 423 |
posts, driver = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
|
| 424 |
-
|
| 425 |
-
try:
|
| 426 |
-
driver.quit()
|
| 427 |
-
except Exception:
|
| 428 |
-
pass
|
| 429 |
|
| 430 |
try:
|
| 431 |
with open(args.out, "w", encoding="utf-8") as f:
|
|
@@ -478,7 +463,6 @@ if __name__ == "__main__":
|
|
| 478 |
try:
|
| 479 |
main()
|
| 480 |
except Exception as e:
|
| 481 |
-
print("Unhandled error:")
|
| 482 |
-
print(e)
|
| 483 |
print(traceback.format_exc())
|
| 484 |
raise
|
|
|
|
| 14 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 15 |
from selenium.webdriver.support import expected_conditions as EC
|
| 16 |
from selenium.common.exceptions import (
|
| 17 |
+
StaleElementReferenceException, NoSuchElementException, TimeoutException, SessionNotCreatedException
|
| 18 |
)
|
| 19 |
+
from google.oauth2 import service_account
|
|
|
|
| 20 |
from googleapiclient.discovery import build
|
| 21 |
from googleapiclient.errors import HttpError
|
| 22 |
import google.generativeai as genai
|
| 23 |
from google.api_core.exceptions import ResourceExhausted
|
|
|
|
| 24 |
|
| 25 |
WRITABLE_DIR = "/tmp"
|
|
|
|
|
|
|
| 26 |
SERVICE_ACCOUNT_FILE = os.path.join(WRITABLE_DIR, "service_account.json")
|
| 27 |
|
| 28 |
def get_args():
|
|
|
|
| 39 |
p.add_argument("--headless", action="store_true", help="Prefer headless browser")
|
| 40 |
return p.parse_args()
|
| 41 |
|
| 42 |
+
GMAIL_SCOPES = [ "https://www.googleapis.com/auth/gmail.send" ]
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
+
# --- FIX: Simplify this function. It's not the primary auth method. ---
|
| 45 |
def build_gmail_service():
    """Return a Gmail API client for the configured sender, or ``None``.

    Requires the service-account key at SERVICE_ACCOUNT_FILE and the
    SENDER_EMAIL environment variable; the credentials are delegated to that
    address with ``with_subject``. Any failure is printed with a ``[GMAIL]``
    prefix and results in ``None``.
    """
    # Guard clause: without the key file there is nothing to authenticate with.
    if not os.path.exists(SERVICE_ACCOUNT_FILE):
        print("[GMAIL] Service account file not found in final5.py.")
        return None

    try:
        impersonated_user = os.environ.get("SENDER_EMAIL")
        if not impersonated_user:
            print("[GMAIL] SENDER_EMAIL environment variable not set.")
            return None

        base_credentials = service_account.Credentials.from_service_account_file(
            SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES)
        delegated_credentials = base_credentials.with_subject(impersonated_user)

        return build("gmail", "v1", credentials=delegated_credentials)
    except Exception as exc:
        print(f"[GMAIL] Service account authentication failed in final5.py: {exc}")
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
+
# The send_html_email function is kept for potential future direct use, but it's not called by main()
|
| 66 |
def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
|
| 67 |
if not service:
|
| 68 |
print("[GMAIL] service not available; skipping email")
|
|
|
|
| 224 |
hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
|
| 225 |
return (len(hits) > 0, hits)
|
| 226 |
|
| 227 |
+
# --- FIX: Set a writable cache path for Selenium Manager and ensure unique user-data-dir ---
|
|
|
|
|
|
|
|
|
|
| 228 |
def new_driver(headless: bool):
|
| 229 |
options = webdriver.ChromeOptions()
|
| 230 |
|
| 231 |
+
# Specify a writable directory for Selenium Manager's driver cache
|
| 232 |
+
cache_path = os.path.join(WRITABLE_DIR, "selenium")
|
| 233 |
+
os.makedirs(cache_path, exist_ok=True)
|
| 234 |
+
os.environ["SE_CACHE_PATH"] = cache_path
|
|
|
|
| 235 |
|
| 236 |
+
# Create a unique temporary directory for this specific Chrome instance's user data
|
| 237 |
+
user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
|
| 238 |
options.add_argument(f"--user-data-dir={user_data_dir}")
|
|
|
|
| 239 |
|
| 240 |
options.add_argument("--disable-notifications")
|
| 241 |
options.add_argument("--disable-web-security")
|
|
|
|
| 245 |
options.add_experimental_option("excludeSwitches", ["enable-automation"])
|
| 246 |
options.add_argument("--window-size=1920,1080")
|
| 247 |
options.add_argument("--lang=en-US,en")
|
|
|
|
| 248 |
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36")
|
| 249 |
if headless:
|
| 250 |
options.add_argument("--headless=new")
|
|
|
|
| 254 |
options.add_argument("--disable-extensions")
|
| 255 |
options.add_argument("--disable-plugins")
|
| 256 |
options.add_argument("--disable-images")
|
| 257 |
+
|
| 258 |
driver = webdriver.Chrome(options=options)
|
| 259 |
+
|
| 260 |
try:
|
| 261 |
driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
|
| 262 |
"source": "Object.defineProperty(navigator, 'webdriver', { get: () => undefined });"
|
|
|
|
| 265 |
pass
|
| 266 |
return driver
|
| 267 |
|
| 268 |
+
|
| 269 |
def load_cookies(driver, cookies_file: str):
|
| 270 |
print("[FB] Loading Facebook homepage...")
|
| 271 |
driver.get("https://www.facebook.com")
|
|
|
|
| 364 |
return posts
|
| 365 |
|
| 366 |
def _dispose_driver(driver):
    """Quit *driver* and delete its temporary Chrome profile directory.

    Each step is isolated so that a failure to locate the profile directory
    can never prevent ``driver.quit()`` from running.
    """
    import shutil

    profile_dir = None
    try:
        # ChromeDriver reports the profile directory in the session
        # capabilities under "chrome" -> "userDataDir".
        caps = getattr(driver, "capabilities", None) or {}
        profile_dir = (caps.get("chrome") or {}).get("userDataDir")
        if not profile_dir:
            # Fallback for driver objects that do expose their launch options.
            launch_options = getattr(driver, "options", None)
            for arg in getattr(launch_options, "arguments", None) or []:
                if arg.startswith("--user-data-dir="):
                    profile_dir = arg.split("=", 1)[1]
                    break
    except Exception as e:
        print(f"Error during driver cleanup: {e}")

    try:
        driver.quit()
    except Exception as e:
        print(f"Error during driver cleanup: {e}")

    # Only remove directories carrying the "chrome_user_data_" prefix that
    # new_driver() passes to tempfile.mkdtemp, never an arbitrary profile.
    if (
        profile_dir
        and os.path.basename(os.path.normpath(profile_dir)).startswith("chrome_user_data_")
        and os.path.exists(profile_dir)
    ):
        shutil.rmtree(profile_dir, ignore_errors=True)


def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
    """Scrape one group with a headless browser and always release the browser.

    Args:
        group_url: URL handed to ``scrape_group``.
        cookies_file: Path handed to ``load_cookies``.
        max_scrolls: Scroll limit handed to ``scrape_group``.
        pause: Pause between scrolls handed to ``scrape_group``.

    Returns:
        ``(posts, driver)`` on success, ``([], None)`` if anything raised.
        The driver has ALREADY been quit when this function returns; it is
        returned only to keep the two-element result callers unpack, and must
        not be used for further browsing.
    """
    driver = None
    try:
        # Created inside the try so a failed browser start is reported the
        # same way as a failed scrape.
        driver = new_driver(headless=True)
        wait = WebDriverWait(driver, 15)
        load_cookies(driver, cookies_file)
        posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
        return posts, driver
    except Exception as e:
        print(f"[SCRAPE] Error in headless mode: {e}")
        return [], None
    finally:
        if driver is not None:
            _dispose_driver(driver)
|
| 392 |
+
|
| 393 |
|
| 394 |
def main():
|
| 395 |
args = get_args()
|
|
|
|
| 406 |
gemini_keys.append(key)
|
| 407 |
gemini_manager = GeminiManager(gemini_keys) if gemini_keys else None
|
| 408 |
|
| 409 |
+
# This is not used to send mail, but just to check if auth is possible
|
| 410 |
+
_ = build_gmail_service()
|
| 411 |
|
| 412 |
posts, driver = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
|
| 413 |
+
# Driver is cleaned up in try_scrape_with_fallback's finally block
|
|
|
|
|
|
|
|
|
|
|
|
|
| 414 |
|
| 415 |
try:
|
| 416 |
with open(args.out, "w", encoding="utf-8") as f:
|
|
|
|
| 463 |
try:
|
| 464 |
main()
|
| 465 |
except Exception as e:
|
| 466 |
+
print(f"Unhandled error in main: {e}")
|
|
|
|
| 467 |
print(traceback.format_exc())
|
| 468 |
raise
|