sonuprasad23 committed on
Commit
00f0b39
·
1 Parent(s): 5571520

Project Uploaded

Browse files
Files changed (1) hide show
  1. final5.py +29 -86
final5.py CHANGED
@@ -14,7 +14,7 @@ from selenium.webdriver.common.by import By
14
  from selenium.webdriver.support.ui import WebDriverWait
15
  from selenium.webdriver.support import expected_conditions as EC
16
  from selenium.common.exceptions import (
17
- StaleElementReferenceException, NoSuchElementException, TimeoutException, SessionNotCreatedException
18
  )
19
  from google.oauth2 import service_account
20
  from googleapiclient.discovery import build
@@ -39,44 +39,22 @@ def get_args():
39
  p.add_argument("--headless", action="store_true", help="Prefer headless browser")
40
  return p.parse_args()
41
 
42
# OAuth scope requested for the delegated Gmail credentials (send-only).
GMAIL_SCOPES = [ "https://www.googleapis.com/auth/gmail.send" ]


def build_gmail_service():
    """Create a Gmail API client from the service-account key file.

    Returns:
        The object produced by ``build("gmail", "v1", ...)``, or ``None``
        when the key file does not exist, when the ``SENDER_EMAIL``
        environment variable is unset/empty, or when any exception is
        raised while building the credentials or the client.
    """
    if not os.path.exists(SERVICE_ACCOUNT_FILE):
        return None
    try:
        impersonated_user = os.environ.get("SENDER_EMAIL")
        if not impersonated_user:
            print("[GMAIL] SENDER_EMAIL environment variable not set.")
            return None
        base_credentials = service_account.Credentials.from_service_account_file(
            SERVICE_ACCOUNT_FILE,
            scopes=GMAIL_SCOPES,
        )
        # The credentials are re-issued with SENDER_EMAIL as their subject.
        delegated = base_credentials.with_subject(impersonated_user)
        return build("gmail", "v1", credentials=delegated)
    except Exception as auth_error:
        print(f"[GMAIL] Service account authentication failed in final5.py: {auth_error}")
        return None
57
 
58
# The send_html_email function is not used by main() but is kept for modularity
def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
    """Send one HTML message per recipient and count the successes.

    Args:
        service: Gmail API client; a falsy value short-circuits to 0.
        sender: Value placed in the "from" header.
        to_list: Recipient addresses; each gets its own message.
        subject: Value placed in the "subject" header.
        html: Message body, attached with the "html" subtype.

    Returns:
        Number of recipients whose send call completed without raising.
    """
    if not service:
        return 0
    from email.message import EmailMessage

    def _deliver(recipient: str) -> bool:
        # Best effort: a failure for one recipient is printed and does not
        # stop delivery to the remaining ones.
        try:
            message = EmailMessage()
            for header, value in (("to", recipient), ("from", sender), ("subject", subject)):
                message[header] = value
            message.set_content(html, subtype="html")
            encoded = base64.urlsafe_b64encode(message.as_bytes()).decode("utf-8")
            request = service.users().messages().send(userId="me", body={"raw": encoded})
            request.execute()
        except Exception as send_error:
            print(f"[GMAIL] send error to {recipient}: {send_error}")
            return False
        return True

    return sum(1 for recipient in to_list if _deliver(recipient))
76
-
77
  # Gemini model identifier; presumably consumed by GeminiManager (its body is only partly visible here) — confirm.
  GEMINI_MODEL = "gemini-1.5-flash"
78
-
79
  class GeminiManager:
 
80
  def __init__(self, api_keys: List[str]):
81
  self.api_keys = api_keys
82
  self.current_key_index = 0
@@ -120,30 +98,25 @@ class GeminiManager:
120
  else:
121
  raise e
122
 
 
123
  def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str,Any]:
124
- fallback = {
125
- "is_medical_seeking": False, "confidence": "low",
126
- "medical_summary": "Not a medical request (AI unavailable/throttled)",
127
- "suggested_services": [], "urgency_level": "low", "analysis": "Keyword-based fallback",
128
- "reasoning": "short explanation", "matched_keywords": found_keywords
129
- }
130
  if not gemini_manager or not gemini_manager.is_available(): return fallback
131
  keywords_str = ", ".join(found_keywords) if found_keywords else "none"
132
- prompt = f"""
133
- Analyze this social post and decide if the author is genuinely seeking medical help, doctor/hospital recommendations, or healthcare services for PERSONAL HEALTH NEEDS (not business, donations, or casual mentions).
134
- KEYWORDS FOUND IN POST: {keywords_str}
135
- CRITICAL RULES:
136
- 1. ONLY flag posts where someone is seeking medical care for themselves or a loved one
137
- 2. IGNORE posts about: business services, donations, selling products, job postings, general info sharing, or academic inquiries.
138
- 3. ONLY flag if it's a PERSONAL HEALTH NEED
139
  Post: "{post_text}"
140
  Return ONLY JSON:
141
  {{
142
  "is_medical_seeking": true/false, "confidence": "high/medium/low", "medical_summary": "short summary",
143
  "suggested_services": ["service1","service2"], "urgency_level": "high/medium/low",
144
- "analysis": "why it's seeking help", "reasoning": "short explanation", "matched_keywords": ["keyword1", "keyword2"]
145
  }}"""
146
- for _ in range(1, 5):
147
  try:
148
  resp = gemini_manager.generate_content(prompt)
149
  txt = (resp.text or "").strip()
@@ -154,31 +127,19 @@ Return ONLY JSON:
154
  if "matched_keywords" not in result: result["matched_keywords"] = found_keywords
155
  return result
156
  return fallback
157
- except ResourceExhausted:
158
- gemini_manager.rotate_key()
159
- if not gemini_manager.is_available(): return fallback
160
  except Exception as e:
161
- print(f"[GEMINI] error: {e}")
162
  gemini_manager.rotate_key()
163
- if not gemini_manager.is_available(): return fallback
164
  return fallback
165
 
166
# Lower-case words and phrases that mark a post as a candidate medical request.
MEDICAL_KEYWORDS = [
    "doctor","physician","primary care","healthcare","medical","clinic","hospital","urgent care",
    "emergency","er","specialist","pediatrician","dentist","gynecologist","obgyn","women's health",
    "health center","family doctor","maternity","prenatal","postnatal","labor","delivery",
    "need doctor","looking for doctor","find doctor","recommend doctor","medical help","health help",
    "appointment","checkup","treatment","prescription","medicine","surgery","best hospital",
    "best clinic","where to go","doctor recommendation","pregnancy","birth control","contraception",
    "fertility","hillside","medical group","wellness center"
]


def contains_keywords(text: str) -> Tuple[bool, List[str]]:
    """Report which MEDICAL_KEYWORDS occur in *text* as whole words or phrases.

    Matching is case-insensitive. A keyword must start and end on a word
    boundary, so short entries such as "er" no longer fire inside unrelated
    words ("computer", "never"). A trailing "s"/"es" is tolerated so plural
    forms ("doctors", "hospitals") still count.

    Args:
        text: Post text; ``None`` or an empty string yields no hits.

    Returns:
        ``(found, hits)`` where ``hits`` lists the matched keywords in
        MEDICAL_KEYWORDS order and ``found`` is True when it is non-empty.
    """
    # Local import: the file's import block is not visible from this section.
    import re

    tl = (text or "").lower()
    hits = [
        kw for kw in MEDICAL_KEYWORDS
        # re caches compiled patterns, so rebuilding the pattern string per
        # call does not recompile it each time.
        if re.search(r"(?<!\w)" + re.escape(kw) + r"(?:s|es)?(?!\w)", tl)
    ]
    return (len(hits) > 0, hits)
180
 
181
- # --- FIX #1: The Definitive Solution for the Selenium Crash ---
182
  def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
183
  options = webdriver.ChromeOptions()
184
 
@@ -192,7 +153,7 @@ def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
192
  options.add_argument(f"--user-data-dir={user_data_dir}")
193
  options.add_argument("--headless=new")
194
  options.add_argument("--no-sandbox")
195
- options.add_argument("--disable-dev-shm-usage") # CRITICAL: THIS IS THE FIX
196
  options.add_argument("--disable-gpu")
197
  options.add_argument("--disable-notifications")
198
  options.add_argument("--window-size=1920,1080")
@@ -201,7 +162,6 @@ def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
201
  print("[SELENIUM] WebDriver session created successfully.")
202
  return driver, user_data_dir
203
 
204
- # --- FIX #2: Add Better Logging to the Login Process ---
205
  def load_cookies(driver, cookies_file: str):
206
  print("[FB] Navigating to Facebook homepage to load cookies...")
207
  driver.get("https://www.facebook.com")
@@ -222,7 +182,6 @@ def load_cookies(driver, cookies_file: str):
222
  driver.refresh()
223
  time.sleep(5)
224
 
225
- # Check for login success by looking for a keyword in the title
226
  if "log in" in driver.title.lower():
227
  print(f"[FB] WARNING: Login may have failed. Page title is: '{driver.title}'")
228
  else:
@@ -230,24 +189,12 @@ def load_cookies(driver, cookies_file: str):
230
 
def wait_group_feed(driver, wait):
    """Block until the group page shows a feed container or an article node.

    First waits (via *wait*) for the <body> element, then polls the driver
    about once per second for up to 30 seconds.

    Raises:
        TimeoutException: if neither locator matched within the polling window.
    """
    wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    # Tried in order on every polling pass; the first hit ends the wait.
    probes = (
        "//div[@data-pagelet='GroupFeed' or @role='feed']",
        "//div[@role='article']",
    )
    began = time.time()
    while (time.time() - began) < 30:
        for xpath in probes:
            try:
                driver.find_element(By.XPATH, xpath)
            except NoSuchElementException:
                continue
            return
        time.sleep(1)
    raise TimeoutException("Timed out waiting for group feed to load.")
247
 
248
def find_message_nodes(driver):
    """Return every element currently matching the article XPath."""
    article_locator = (By.XPATH, "//div[@role='article']")
    return driver.find_elements(*article_locator)
250
-
251
  def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
252
  print(f"[SCRAPE] Navigating to group: {group_url}")
253
  driver.get(group_url)
@@ -258,17 +205,13 @@ def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
258
  driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
259
  time.sleep(pause)
260
 
261
- divs = find_message_nodes(driver)
262
  added_this_scroll = 0
263
  for d in divs:
264
  try:
265
  txt = (d.text or "").strip()
266
  if len(txt) < 25 or txt in seen: continue
267
-
268
- # Filter out common UI text that gets scraped as a post
269
- if any(ui_text in txt for ui_text in ["Comment Share", "Write a comment...", "View more comments"]):
270
- continue
271
-
272
  seen.add(txt)
273
  posts.append({"id": len(posts) + 1, "text": txt, "group_link": group_url})
274
  added_this_scroll += 1
@@ -278,7 +221,6 @@ def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
278
  print(f"[SCRAPE] Finished scraping. Total unique posts found: {len(posts)}")
279
  return posts
280
 
281
- # --- FIX #3: Make the Script Fail Properly on Critical Errors ---
282
  def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
283
  driver = None
284
  user_data_dir = None
@@ -290,8 +232,7 @@ def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int
290
  posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
291
  except Exception as e:
292
  print(f"[SCRAPE] FATAL ERROR during scraping: {e}")
293
- # Re-raise the exception to make the script exit with a non-zero code
294
- raise
295
  finally:
296
  if driver:
297
  try: driver.quit()
@@ -303,6 +244,8 @@ def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int
303
  except Exception as e:
304
  print(f"[SELENIUM] Error cleaning up directory {user_data_dir}: {e}")
305
  return posts
 
 
306
 
307
  def main():
308
  args = get_args()
 
14
  from selenium.webdriver.support.ui import WebDriverWait
15
  from selenium.webdriver.support import expected_conditions as EC
16
  from selenium.common.exceptions import (
17
+ StaleElementReferenceException, NoSuchElementException, TimeoutException
18
  )
19
  from google.oauth2 import service_account
20
  from googleapiclient.discovery import build
 
39
  p.add_argument("--headless", action="store_true", help="Prefer headless browser")
40
  return p.parse_args()
41
 
42
# This function is not called in the main flow but kept for modularity
def build_gmail_service():
    """Create a Gmail API client from the service-account key file.

    Returns:
        The object produced by ``build("gmail", "v1", ...)``, or ``None``
        when the key file does not exist, when the ``SENDER_EMAIL``
        environment variable is unset/empty, or when any exception is
        raised while building the credentials or the client.
    """
    if not os.path.exists(SERVICE_ACCOUNT_FILE):
        return None
    try:
        impersonated_user = os.environ.get("SENDER_EMAIL")
        if not impersonated_user:
            return None
        base_credentials = service_account.Credentials.from_service_account_file(
            SERVICE_ACCOUNT_FILE,
            scopes=["https://www.googleapis.com/auth/gmail.send"],
        )
        # The credentials are re-issued with SENDER_EMAIL as their subject.
        delegated = base_credentials.with_subject(impersonated_user)
        return build("gmail", "v1", credentials=delegated)
    except Exception as auth_error:
        print(f"[GMAIL] Auth failed in final5.py: {auth_error}")
        return None
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  # Gemini model identifier; presumably consumed by GeminiManager (its body is only partly visible here) — confirm.
  GEMINI_MODEL = "gemini-1.5-flash"
 
56
  class GeminiManager:
57
+ # ... (This class is correct, no changes needed)
58
  def __init__(self, api_keys: List[str]):
59
  self.api_keys = api_keys
60
  self.current_key_index = 0
 
98
  else:
99
  raise e
100
 
101
+
102
  def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str,Any]:
103
+ fallback = { "is_medical_seeking": False, "confidence": "low", "medical_summary": "AI unavailable", "suggested_services": [], "urgency_level": "low", "analysis": "Fallback", "reasoning": "AI error", "matched_keywords": found_keywords }
 
 
 
 
 
104
  if not gemini_manager or not gemini_manager.is_available(): return fallback
105
  keywords_str = ", ".join(found_keywords) if found_keywords else "none"
106
+ prompt = f"""Analyze this social post to determine if the author is seeking medical help for a personal health need.
107
+ KEYWORDS: {keywords_str}
108
+ RULES:
109
+ 1. Flag ONLY posts where someone seeks medical care for themselves or a loved one.
110
+ 2. IGNORE posts about business, donations, selling products, jobs, or general info.
111
+ 3. Flag ONLY if it is a PERSONAL HEALTH NEED.
 
112
  Post: "{post_text}"
113
  Return ONLY JSON:
114
  {{
115
  "is_medical_seeking": true/false, "confidence": "high/medium/low", "medical_summary": "short summary",
116
  "suggested_services": ["service1","service2"], "urgency_level": "high/medium/low",
117
+ "analysis": "why it's seeking help", "reasoning": "short explanation", "matched_keywords": ["keyword1"]
118
  }}"""
119
+ for _ in range(2): # Reduced retries for speed
120
  try:
121
  resp = gemini_manager.generate_content(prompt)
122
  txt = (resp.text or "").strip()
 
127
  if "matched_keywords" not in result: result["matched_keywords"] = found_keywords
128
  return result
129
  return fallback
 
 
 
130
  except Exception as e:
131
+ print(f"[GEMINI] Error: {e}")
132
  gemini_manager.rotate_key()
 
133
  return fallback
134
 
135
# Lower-case words and phrases that mark a post as a candidate medical request.
MEDICAL_KEYWORDS = [
    "doctor","physician","primary care","healthcare","medical","clinic","hospital","urgent care",
    "emergency","er","specialist","pediatrician","dentist","gynecologist","obgyn","women's health",
    "health center","family doctor","maternity","prenatal","postnatal","labor","delivery",
    "need doctor","looking for doctor","find doctor","recommend doctor","medical help","health help",
    "appointment","checkup","treatment","prescription","medicine","surgery","best hospital",
    "best clinic","where to go","doctor recommendation","pregnancy","birth control","contraception",
    "fertility","hillside","medical group","wellness center"
]


def contains_keywords(text: str) -> Tuple[bool, List[str]]:
    """Report which MEDICAL_KEYWORDS occur in *text* as whole words or phrases.

    Matching is case-insensitive. A keyword must start and end on a word
    boundary, so short entries such as "er" no longer fire inside unrelated
    words ("computer", "never"). A trailing "s"/"es" is tolerated so plural
    forms ("doctors", "hospitals") still count.

    Args:
        text: Post text; ``None`` or an empty string yields no hits.

    Returns:
        ``(found, hits)`` where ``hits`` lists the matched keywords in
        MEDICAL_KEYWORDS order and ``found`` is True when it is non-empty.
    """
    # Local import: the file's import block is not visible from this section.
    import re

    tl = (text or "").lower()
    hits = [
        kw for kw in MEDICAL_KEYWORDS
        # re caches compiled patterns, so rebuilding the pattern string per
        # call does not recompile it each time.
        if re.search(r"(?<!\w)" + re.escape(kw) + r"(?:s|es)?(?!\w)", tl)
    ]
    return (len(hits) > 0, hits)
141
 
142
+ # --- START: CRITICAL SELENIUM FIXES ---
143
  def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
144
  options = webdriver.ChromeOptions()
145
 
 
153
  options.add_argument(f"--user-data-dir={user_data_dir}")
154
  options.add_argument("--headless=new")
155
  options.add_argument("--no-sandbox")
156
+ options.add_argument("--disable-dev-shm-usage") # THIS IS THE KEY FIX
157
  options.add_argument("--disable-gpu")
158
  options.add_argument("--disable-notifications")
159
  options.add_argument("--window-size=1920,1080")
 
162
  print("[SELENIUM] WebDriver session created successfully.")
163
  return driver, user_data_dir
164
 
 
165
  def load_cookies(driver, cookies_file: str):
166
  print("[FB] Navigating to Facebook homepage to load cookies...")
167
  driver.get("https://www.facebook.com")
 
182
  driver.refresh()
183
  time.sleep(5)
184
 
 
185
  if "log in" in driver.title.lower():
186
  print(f"[FB] WARNING: Login may have failed. Page title is: '{driver.title}'")
187
  else:
 
189
 
def wait_group_feed(driver, wait):
    """Block until the group feed container is present on the page.

    Waits (via *wait*) first for the <body> element and then for a div that
    has role "feed" or data-pagelet "GroupFeed".

    Args:
        driver: WebDriver instance; not used directly here, kept so the
            signature stays the same for existing callers.
        wait: Object whose ``until`` method performs the waiting.

    Raises:
        TimeoutException: if the feed container does not appear; the
            original Selenium exception is attached as ``__cause__``.
    """
    wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
    feed_locator = (By.XPATH, "//div[@role='feed' or @data-pagelet='GroupFeed']")
    try:
        wait.until(EC.presence_of_element_located(feed_locator))
    except TimeoutException as exc:
        # Chain the cause so the underlying Selenium details are not lost
        # when the clearer message is raised.
        raise TimeoutException("Timed out waiting for group feed to load.") from exc
    print("[SCRAPE] Group feed detected.")
197
 
 
 
 
198
  def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
199
  print(f"[SCRAPE] Navigating to group: {group_url}")
200
  driver.get(group_url)
 
205
  driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
206
  time.sleep(pause)
207
 
208
+ divs = driver.find_elements(By.XPATH, "//div[@role='article']")
209
  added_this_scroll = 0
210
  for d in divs:
211
  try:
212
  txt = (d.text or "").strip()
213
  if len(txt) < 25 or txt in seen: continue
214
+ if any(ui in txt for ui in ["Comment Share", "Write a comment...", "View more comments"]): continue
 
 
 
 
215
  seen.add(txt)
216
  posts.append({"id": len(posts) + 1, "text": txt, "group_link": group_url})
217
  added_this_scroll += 1
 
221
  print(f"[SCRAPE] Finished scraping. Total unique posts found: {len(posts)}")
222
  return posts
223
 
 
224
  def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
225
  driver = None
226
  user_data_dir = None
 
232
  posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
233
  except Exception as e:
234
  print(f"[SCRAPE] FATAL ERROR during scraping: {e}")
235
+ raise # Re-raise the exception to make the script exit with a non-zero code
 
236
  finally:
237
  if driver:
238
  try: driver.quit()
 
244
  except Exception as e:
245
  print(f"[SELENIUM] Error cleaning up directory {user_data_dir}: {e}")
246
  return posts
247
+ # --- END: CRITICAL SELENIUM FIXES ---
248
+
249
 
250
  def main():
251
  args = get_args()