sonuprasad23 committed on
Commit
7eb9383
·
1 Parent(s): cbfc2e1

Project Uploaded

Browse files
Files changed (2) hide show
  1. api_server.py +28 -53
  2. final5.py +49 -65
api_server.py CHANGED
@@ -4,17 +4,14 @@ from dataclasses import dataclass, field
4
  from typing import List, Dict, Any, Optional
5
  from flask import Flask, request, jsonify, send_from_directory
6
  from flask_cors import CORS
7
- from google_auth_oauthlib.flow import InstalledAppFlow
8
- from google.auth.transport.requests import Request
9
  from googleapiclient.discovery import build
10
  from googleapiclient.errors import HttpError
11
- from google.oauth2 import service_account # Added import
12
  from dotenv import load_dotenv
13
 
14
  load_dotenv()
15
 
16
  def decode_base64_with_padding(b64_string: str) -> bytes:
17
- """Decodes a Base64 string, adding missing padding if necessary."""
18
  missing_padding = len(b64_string) % 4
19
  if missing_padding:
20
  b64_string += '=' * (4 - missing_padding)
@@ -26,29 +23,21 @@ def decode_base64_with_padding(b64_string: str) -> bytes:
26
 
27
  # Define a writable directory for ALL runtime files
28
  WRITABLE_DIR = "/tmp"
29
- CREDENTIALS_PATH = os.path.join(WRITABLE_DIR, "credentials.json")
30
  COOKIES_PATH = os.path.join(WRITABLE_DIR, "facebook_cookies.pkl")
31
- TOKEN_PATH = os.path.join(WRITABLE_DIR, "token.pickle")
32
- SERVICE_ACCOUNT_PATH = os.path.join(WRITABLE_DIR, "service_account.json") # Added
33
 
34
  # Decode secrets at startup into the /tmp directory
35
- if 'CREDENTIALS_B64' in os.environ:
36
- decoded_creds = decode_base64_with_padding(os.environ['CREDENTIALS_B64'])
37
- if decoded_creds:
38
- with open(CREDENTIALS_PATH, 'w') as f:
39
- f.write(decoded_creds.decode('utf-8'))
40
-
41
  if 'FB_COOKIES_B64' in os.environ:
42
  decoded_cookies = decode_base64_with_padding(os.environ['FB_COOKIES_B64'])
43
  if decoded_cookies:
44
  with open(COOKIES_PATH, 'wb') as f:
45
  f.write(decoded_cookies)
46
 
47
- # Added: Decode service account credentials
48
  if 'SERVICE_ACCOUNT_B64' in os.environ:
49
  decoded_service_account = decode_base64_with_padding(os.environ['SERVICE_ACCOUNT_B64'])
50
  if decoded_service_account:
51
- with open(SERVICE_ACCOUNT_PATH, 'w') as f:
52
  f.write(decoded_service_account.decode('utf-8'))
53
 
54
  GROUPS_TXT = os.environ.get("GROUPS_TXT", "groups.txt")
@@ -56,12 +45,8 @@ FINAL5_PATH = os.environ.get("FINAL5_PATH", "final5.py")
56
  PYTHON_BIN = os.environ.get("PYTHON_BIN", "python")
57
  SENDER_EMAIL = os.environ.get("SENDER_EMAIL", "smahato@hillsidemedicalgroup.com")
58
 
59
- # --- MODIFICATION START ---
60
- # Point the output directories to the writable /tmp directory
61
  SCRAPE_OUTDIR = os.path.join(WRITABLE_DIR, "scraped")
62
  ANALYSIS_OUTDIR = os.path.join(WRITABLE_DIR, "analysis")
63
- # --- MODIFICATION END ---
64
-
65
 
66
  GEMINI_KEYS = []
67
  for i in range(1, 6):
@@ -69,14 +54,32 @@ for i in range(1, 6):
69
  if key:
70
  GEMINI_KEYS.append(key)
71
 
72
- # Fixed scopes (removed trailing spaces)
73
- GMAIL_SCOPES = [
74
- "https://www.googleapis.com/auth/gmail.send",
75
- "https://www.googleapis.com/auth/gmail.metadata",
76
- ]
77
  os.makedirs(SCRAPE_OUTDIR, exist_ok=True)
78
  os.makedirs(ANALYSIS_OUTDIR, exist_ok=True)
79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  @dataclass
81
  class GroupRun:
82
  link: str
@@ -199,35 +202,6 @@ def slugify(url: str) -> str:
199
  s = re.sub(r"[^a-zA-Z0-9]+", "-", url)
200
  return s.strip("-").lower()
201
 
202
- def build_gmail_service():
203
- # Try service account first
204
- if os.path.exists(SERVICE_ACCOUNT_PATH):
205
- try:
206
- credentials = service_account.Credentials.from_service_account_file(
207
- SERVICE_ACCOUNT_PATH, scopes=GMAIL_SCOPES)
208
- return build("gmail", "v1", credentials=credentials)
209
- except Exception as e:
210
- log(f"Service account authentication failed: {e}", "error", "gmail")
211
-
212
- # Fallback to token.pickle if exists
213
- creds = None
214
- if os.path.exists(TOKEN_PATH):
215
- with open(TOKEN_PATH, "rb") as token: creds = pickle.load(token)
216
- if not creds or not creds.valid:
217
- if creds and creds.expired and creds.refresh_token:
218
- creds.refresh(Request())
219
- else:
220
- log("No valid credentials available; Gmail unavailable", "warn", "gmail")
221
- return None
222
- with open(TOKEN_PATH, "wb") as token: pickle.dump(creds, token)
223
- try:
224
- return build("gmail", "v1", credentials=creds)
225
- except Exception as e:
226
- log(f"Gmail service build failed: {e}", "error", "gmail")
227
- return None
228
-
229
- gmail_service = build_gmail_service()
230
-
231
  def send_html_email(to_emails: List[str], subject: str, html_content: str) -> int:
232
  if not gmail_service:
233
  log("Gmail not configured; skipping email", "warn", "gmail")
@@ -244,6 +218,7 @@ def send_html_email(to_emails: List[str], subject: str, html_content: str) -> in
244
  raw = base64.urlsafe_b64encode(msg.as_bytes()).decode("utf-8")
245
  gmail_service.users().messages().send(userId="me", body={"raw": raw}).execute()
246
  sent += 1
 
247
  except HttpError as e:
248
  log(f"Gmail HTTP error to {to}: {e}", "error", "gmail")
249
  except Exception as e:
 
4
  from typing import List, Dict, Any, Optional
5
  from flask import Flask, request, jsonify, send_from_directory
6
  from flask_cors import CORS
7
+ from google.oauth2 import service_account
 
8
  from googleapiclient.discovery import build
9
  from googleapiclient.errors import HttpError
 
10
  from dotenv import load_dotenv
11
 
12
  load_dotenv()
13
 
14
  def decode_base64_with_padding(b64_string: str) -> bytes:
 
15
  missing_padding = len(b64_string) % 4
16
  if missing_padding:
17
  b64_string += '=' * (4 - missing_padding)
 
23
 
24
  # Define a writable directory for ALL runtime files
25
  WRITABLE_DIR = "/tmp"
 
26
  COOKIES_PATH = os.path.join(WRITABLE_DIR, "facebook_cookies.pkl")
27
+ SERVICE_ACCOUNT_FILE = os.path.join(WRITABLE_DIR, "service_account.json")
 
28
 
29
  # Decode secrets at startup into the /tmp directory
 
 
 
 
 
 
30
  if 'FB_COOKIES_B64' in os.environ:
31
  decoded_cookies = decode_base64_with_padding(os.environ['FB_COOKIES_B64'])
32
  if decoded_cookies:
33
  with open(COOKIES_PATH, 'wb') as f:
34
  f.write(decoded_cookies)
35
 
36
+ # --- FIX: Decode the Service Account secret ---
37
  if 'SERVICE_ACCOUNT_B64' in os.environ:
38
  decoded_service_account = decode_base64_with_padding(os.environ['SERVICE_ACCOUNT_B64'])
39
  if decoded_service_account:
40
+ with open(SERVICE_ACCOUNT_FILE, 'w') as f:
41
  f.write(decoded_service_account.decode('utf-8'))
42
 
43
  GROUPS_TXT = os.environ.get("GROUPS_TXT", "groups.txt")
 
45
  PYTHON_BIN = os.environ.get("PYTHON_BIN", "python")
46
  SENDER_EMAIL = os.environ.get("SENDER_EMAIL", "smahato@hillsidemedicalgroup.com")
47
 
 
 
48
  SCRAPE_OUTDIR = os.path.join(WRITABLE_DIR, "scraped")
49
  ANALYSIS_OUTDIR = os.path.join(WRITABLE_DIR, "analysis")
 
 
50
 
51
  GEMINI_KEYS = []
52
  for i in range(1, 6):
 
54
  if key:
55
  GEMINI_KEYS.append(key)
56
 
57
+ GMAIL_SCOPES = [ "https://www.googleapis.com/auth/gmail.send" ]
 
 
 
 
58
  os.makedirs(SCRAPE_OUTDIR, exist_ok=True)
59
  os.makedirs(ANALYSIS_OUTDIR, exist_ok=True)
60
 
61
# Non-interactive Gmail authentication: service-account credentials only,
# no browser-based OAuth flow.
def build_gmail_service():
    """Build a Gmail API client from the service-account key file.

    The credentials are loaded from SERVICE_ACCOUNT_FILE, limited to
    GMAIL_SCOPES, and delegated to SENDER_EMAIL via ``with_subject`` so that
    mail is sent as that user.

    Returns:
        The Gmail ``v1`` service object, or ``None`` when the key file is
        missing or the client could not be constructed. Failures are logged,
        never raised.
    """
    # NOTE(review): this function is called at module import time; `log` must
    # already be defined at that point or this raises NameError - confirm.
    if not os.path.exists(SERVICE_ACCOUNT_FILE):
        log("Service account file not found, Gmail unavailable.", "error", "GMAIL")
        return None
    try:
        # Impersonate the SENDER_EMAIL user
        creds = service_account.Credentials.from_service_account_file(
            SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES).with_subject(SENDER_EMAIL)

        service = build("gmail", "v1", credentials=creds)
        log("Gmail service built successfully using service account.", "info", "GMAIL")
        return service
    except Exception as e:
        log(f"Failed to build Gmail service: {e}", "error", "GMAIL")
        # f-string so the actual sender address is logged instead of the
        # literal text "{SENDER_EMAIL}".
        log(f"CRITICAL: Ensure your service account has Domain-Wide Delegation enabled and is authorized for the GMail API for the user {SENDER_EMAIL}", "error", "GMAIL")
        return None
78
+
79
+ gmail_service = build_gmail_service()
80
+
81
+ # The rest of api_server.py remains unchanged...
82
+ # (Full code omitted for brevity, just replace the top section and the build_gmail_service function)
83
  @dataclass
84
  class GroupRun:
85
  link: str
 
202
  s = re.sub(r"[^a-zA-Z0-9]+", "-", url)
203
  return s.strip("-").lower()
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  def send_html_email(to_emails: List[str], subject: str, html_content: str) -> int:
206
  if not gmail_service:
207
  log("Gmail not configured; skipping email", "warn", "gmail")
 
218
  raw = base64.urlsafe_b64encode(msg.as_bytes()).decode("utf-8")
219
  gmail_service.users().messages().send(userId="me", body={"raw": raw}).execute()
220
  sent += 1
221
+ log(f"Successfully sent email to {to}", "info", "GMAIL")
222
  except HttpError as e:
223
  log(f"Gmail HTTP error to {to}: {e}", "error", "gmail")
224
  except Exception as e:
final5.py CHANGED
@@ -14,19 +14,15 @@ from selenium.webdriver.common.by import By
14
  from selenium.webdriver.support.ui import WebDriverWait
15
  from selenium.webdriver.support import expected_conditions as EC
16
  from selenium.common.exceptions import (
17
- StaleElementReferenceException, NoSuchElementException, TimeoutException
18
  )
19
- from google_auth_oauthlib.flow import InstalledAppFlow
20
- from google.auth.transport.requests import Request
21
  from googleapiclient.discovery import build
22
  from googleapiclient.errors import HttpError
23
  import google.generativeai as genai
24
  from google.api_core.exceptions import ResourceExhausted
25
- from google.oauth2 import service_account
26
 
27
  WRITABLE_DIR = "/tmp"
28
- CREDENTIALS_PATH = os.path.join(WRITABLE_DIR, "credentials.json")
29
- TOKEN_PATH = os.path.join(WRITABLE_DIR, "token.pickle")
30
  SERVICE_ACCOUNT_FILE = os.path.join(WRITABLE_DIR, "service_account.json")
31
 
32
  def get_args():
@@ -43,48 +39,30 @@ def get_args():
43
  p.add_argument("--headless", action="store_true", help="Prefer headless browser")
44
  return p.parse_args()
45
 
46
- GMAIL_SCOPES = [
47
- "https://www.googleapis.com/auth/gmail.send",
48
- "https://www.googleapis.com/auth/gmail.metadata",
49
- ]
50
 
 
51
  def build_gmail_service():
 
52
  if os.path.exists(SERVICE_ACCOUNT_FILE):
53
  try:
 
 
 
 
 
54
  credentials = service_account.Credentials.from_service_account_file(
55
- SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES)
 
56
  svc = build("gmail", "v1", credentials=credentials)
57
- _ = svc.users().getProfile(userId="me").execute()
58
  return svc
59
  except Exception as e:
60
- print(f"[GMAIL] Service account authentication failed: {e}")
61
-
62
- creds = None
63
- if os.path.exists(TOKEN_PATH):
64
- try:
65
- with open(TOKEN_PATH, "rb") as token:
66
- creds = pickle.load(token)
67
- except Exception:
68
- creds = None
69
- if not creds or not creds.valid:
70
- if creds and creds.expired and creds.refresh_token:
71
- try:
72
- creds.refresh(Request())
73
- except Exception:
74
- creds = None
75
- if not creds:
76
- print("[GMAIL] No valid credentials available or interactive auth required.")
77
  return None
78
- with open(TOKEN_PATH, "wb") as token:
79
- pickle.dump(creds, token)
80
- try:
81
- svc = build("gmail", "v1", credentials=creds)
82
- _ = svc.users().getProfile(userId="me").execute()
83
- return svc
84
- except Exception as e:
85
- print(f"[GMAIL] service build failed: {e}")
86
- return None
87
 
 
88
  def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
89
  if not service:
90
  print("[GMAIL] service not available; skipping email")
@@ -246,23 +224,18 @@ def contains_keywords(text: str) -> Tuple[bool, List[str]]:
246
  hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
247
  return (len(hits) > 0, hits)
248
 
249
- # Add this import at the top if not already present
250
- import tempfile
251
- import os
252
-
253
  def new_driver(headless: bool):
254
  options = webdriver.ChromeOptions()
255
 
256
- # --- MODIFICATION START: Selenium Cache and User Data Dir ---
257
- # Specify a writable directory for Selenium Manager's cache
258
- os.environ["SE_CACHE_PATH"] = "/tmp/.cache/selenium"
259
- # Ensure the directory exists
260
- os.makedirs("/tmp/.cache/selenium", exist_ok=True)
261
 
262
- # Create a unique temporary directory for Chrome's user data
263
- user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir="/tmp")
264
  options.add_argument(f"--user-data-dir={user_data_dir}")
265
- # --- MODIFICATION END ---
266
 
267
  options.add_argument("--disable-notifications")
268
  options.add_argument("--disable-web-security")
@@ -272,7 +245,6 @@ def new_driver(headless: bool):
272
  options.add_experimental_option("excludeSwitches", ["enable-automation"])
273
  options.add_argument("--window-size=1920,1080")
274
  options.add_argument("--lang=en-US,en")
275
- # Consider removing the hardcoded user-agent or making it configurable
276
  options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36")
277
  if headless:
278
  options.add_argument("--headless=new")
@@ -282,7 +254,9 @@ def new_driver(headless: bool):
282
  options.add_argument("--disable-extensions")
283
  options.add_argument("--disable-plugins")
284
  options.add_argument("--disable-images")
 
285
  driver = webdriver.Chrome(options=options)
 
286
  try:
287
  driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
288
  "source": "Object.defineProperty(navigator, 'webdriver', { get: () => undefined });"
@@ -291,6 +265,7 @@ def new_driver(headless: bool):
291
  pass
292
  return driver
293
 
 
294
  def load_cookies(driver, cookies_file: str):
295
  print("[FB] Loading Facebook homepage...")
296
  driver.get("https://www.facebook.com")
@@ -389,19 +364,32 @@ def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
389
  return posts
390
 
391
  def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
392
- driver = new_driver(headless=True)
393
- wait = WebDriverWait(driver, 15)
394
  try:
 
 
395
  load_cookies(driver, cookies_file)
396
  posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
397
  return posts, driver
398
  except Exception as e:
399
- try:
400
- driver.quit()
401
- except Exception:
402
- pass
403
  print(f"[SCRAPE] Error in headless mode: {e}")
404
  return [], None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
405
 
406
  def main():
407
  args = get_args()
@@ -418,14 +406,11 @@ def main():
418
  gemini_keys.append(key)
419
  gemini_manager = GeminiManager(gemini_keys) if gemini_keys else None
420
 
421
- gmail = build_gmail_service()
 
422
 
423
  posts, driver = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
424
- if driver:
425
- try:
426
- driver.quit()
427
- except Exception:
428
- pass
429
 
430
  try:
431
  with open(args.out, "w", encoding="utf-8") as f:
@@ -478,7 +463,6 @@ if __name__ == "__main__":
478
  try:
479
  main()
480
  except Exception as e:
481
- print("Unhandled error:")
482
- print(e)
483
  print(traceback.format_exc())
484
  raise
 
14
  from selenium.webdriver.support.ui import WebDriverWait
15
  from selenium.webdriver.support import expected_conditions as EC
16
  from selenium.common.exceptions import (
17
+ StaleElementReferenceException, NoSuchElementException, TimeoutException, SessionNotCreatedException
18
  )
19
+ from google.oauth2 import service_account
 
20
  from googleapiclient.discovery import build
21
  from googleapiclient.errors import HttpError
22
  import google.generativeai as genai
23
  from google.api_core.exceptions import ResourceExhausted
 
24
 
25
  WRITABLE_DIR = "/tmp"
 
 
26
  SERVICE_ACCOUNT_FILE = os.path.join(WRITABLE_DIR, "service_account.json")
27
 
28
  def get_args():
 
39
  p.add_argument("--headless", action="store_true", help="Prefer headless browser")
40
  return p.parse_args()
41
 
42
+ GMAIL_SCOPES = [ "https://www.googleapis.com/auth/gmail.send" ]
 
 
 
43
 
44
# Simplified Gmail client factory; this script is not the primary auth path.
def build_gmail_service():
    """Return a Gmail API client delegated to SENDER_EMAIL, or None.

    ``None`` is returned (after printing a ``[GMAIL]`` diagnostic) when the
    service-account file is absent, when the SENDER_EMAIL environment
    variable is unset or empty, or when building the client raises.
    """
    # Guard clause: nothing to do without the key file.
    if not os.path.exists(SERVICE_ACCOUNT_FILE):
        print("[GMAIL] Service account file not found in final5.py.")
        return None

    try:
        delegate = os.environ.get("SENDER_EMAIL")
        if not delegate:
            print("[GMAIL] SENDER_EMAIL environment variable not set.")
            return None

        base_creds = service_account.Credentials.from_service_account_file(
            SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES)
        delegated_creds = base_creds.with_subject(delegate)
        return build("gmail", "v1", credentials=delegated_creds)
    except Exception as err:
        print(f"[GMAIL] Service account authentication failed in final5.py: {err}")
        return None
 
 
 
 
 
 
 
64
 
65
+ # The send_html_email function is kept for potential future direct use, but it's not called by main()
66
  def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
67
  if not service:
68
  print("[GMAIL] service not available; skipping email")
 
224
  hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
225
  return (len(hits) > 0, hits)
226
 
227
+ # --- FIX: Set a writable cache path for Selenium Manager and ensure unique user-data-dir ---
 
 
 
228
  def new_driver(headless: bool):
229
  options = webdriver.ChromeOptions()
230
 
231
+ # Specify a writable directory for Selenium Manager's driver cache
232
+ cache_path = os.path.join(WRITABLE_DIR, "selenium")
233
+ os.makedirs(cache_path, exist_ok=True)
234
+ os.environ["SE_CACHE_PATH"] = cache_path
 
235
 
236
+ # Create a unique temporary directory for this specific Chrome instance's user data
237
+ user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
238
  options.add_argument(f"--user-data-dir={user_data_dir}")
 
239
 
240
  options.add_argument("--disable-notifications")
241
  options.add_argument("--disable-web-security")
 
245
  options.add_experimental_option("excludeSwitches", ["enable-automation"])
246
  options.add_argument("--window-size=1920,1080")
247
  options.add_argument("--lang=en-US,en")
 
248
  options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36")
249
  if headless:
250
  options.add_argument("--headless=new")
 
254
  options.add_argument("--disable-extensions")
255
  options.add_argument("--disable-plugins")
256
  options.add_argument("--disable-images")
257
+
258
  driver = webdriver.Chrome(options=options)
259
+
260
  try:
261
  driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
262
  "source": "Object.defineProperty(navigator, 'webdriver', { get: () => undefined });"
 
265
  pass
266
  return driver
267
 
268
+
269
  def load_cookies(driver, cookies_file: str):
270
  print("[FB] Loading Facebook homepage...")
271
  driver.get("https://www.facebook.com")
 
364
  return posts
365
 
366
  def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
367
+ driver = None
 
368
  try:
369
+ driver = new_driver(headless=True)
370
+ wait = WebDriverWait(driver, 15)
371
  load_cookies(driver, cookies_file)
372
  posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
373
  return posts, driver
374
  except Exception as e:
 
 
 
 
375
  print(f"[SCRAPE] Error in headless mode: {e}")
376
  return [], None
377
+ finally:
378
+ if driver:
379
+ try:
380
+ # Also try to clean up the temporary user data directory
381
+ user_data_dir = None
382
+ for arg in driver.options.arguments:
383
+ if arg.startswith('--user-data-dir='):
384
+ user_data_dir = arg.split('=', 1)[1]
385
+ break
386
+ driver.quit()
387
+ if user_data_dir and os.path.exists(user_data_dir):
388
+ import shutil
389
+ shutil.rmtree(user_data_dir, ignore_errors=True)
390
+ except Exception as e:
391
+ print(f"Error during driver cleanup: {e}")
392
+
393
 
394
  def main():
395
  args = get_args()
 
406
  gemini_keys.append(key)
407
  gemini_manager = GeminiManager(gemini_keys) if gemini_keys else None
408
 
409
+ # This is not used to send mail, but just to check if auth is possible
410
+ _ = build_gmail_service()
411
 
412
  posts, driver = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
413
+ # Driver is cleaned up in try_scrape_with_fallback's finally block
 
 
 
 
414
 
415
  try:
416
  with open(args.out, "w", encoding="utf-8") as f:
 
463
  try:
464
  main()
465
  except Exception as e:
466
+ print(f"Unhandled error in main: {e}")
 
467
  print(traceback.format_exc())
468
  raise