Spaces:
Sleeping
Sleeping
Commit ·
00f0b39
1
Parent(s): 5571520
Project Uploaded
Browse files
final5.py
CHANGED
|
@@ -14,7 +14,7 @@ from selenium.webdriver.common.by import By
|
|
| 14 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 15 |
from selenium.webdriver.support import expected_conditions as EC
|
| 16 |
from selenium.common.exceptions import (
|
| 17 |
-
StaleElementReferenceException, NoSuchElementException, TimeoutException
|
| 18 |
)
|
| 19 |
from google.oauth2 import service_account
|
| 20 |
from googleapiclient.discovery import build
|
|
@@ -39,44 +39,22 @@ def get_args():
|
|
| 39 |
p.add_argument("--headless", action="store_true", help="Prefer headless browser")
|
| 40 |
return p.parse_args()
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
def build_gmail_service():
|
| 45 |
if os.path.exists(SERVICE_ACCOUNT_FILE):
|
| 46 |
try:
|
| 47 |
sender_email = os.environ.get("SENDER_EMAIL")
|
| 48 |
-
if not sender_email:
|
| 49 |
-
print("[GMAIL] SENDER_EMAIL environment variable not set.")
|
| 50 |
-
return None
|
| 51 |
credentials = service_account.Credentials.from_service_account_file(
|
| 52 |
-
SERVICE_ACCOUNT_FILE, scopes=
|
| 53 |
return build("gmail", "v1", credentials=credentials)
|
| 54 |
except Exception as e:
|
| 55 |
-
print(f"[GMAIL]
|
| 56 |
return None
|
| 57 |
|
| 58 |
-
# The send_html_email function is not used by main() but is kept for modularity
|
| 59 |
-
def send_html_email(service, sender: str, to_list: List[str], subject: str, html: str) -> int:
|
| 60 |
-
if not service: return 0
|
| 61 |
-
from email.message import EmailMessage
|
| 62 |
-
sent = 0
|
| 63 |
-
for to in to_list:
|
| 64 |
-
try:
|
| 65 |
-
msg = EmailMessage()
|
| 66 |
-
msg["to"] = to
|
| 67 |
-
msg["from"] = sender
|
| 68 |
-
msg["subject"] = subject
|
| 69 |
-
msg.set_content(html, subtype="html")
|
| 70 |
-
raw = base64.urlsafe_b64encode(msg.as_bytes()).decode("utf-8")
|
| 71 |
-
service.users().messages().send(userId="me", body={"raw": raw}).execute()
|
| 72 |
-
sent += 1
|
| 73 |
-
except Exception as e:
|
| 74 |
-
print(f"[GMAIL] send error to {to}: {e}")
|
| 75 |
-
return sent
|
| 76 |
-
|
| 77 |
GEMINI_MODEL = "gemini-1.5-flash"
|
| 78 |
-
|
| 79 |
class GeminiManager:
|
|
|
|
| 80 |
def __init__(self, api_keys: List[str]):
|
| 81 |
self.api_keys = api_keys
|
| 82 |
self.current_key_index = 0
|
|
@@ -120,30 +98,25 @@ class GeminiManager:
|
|
| 120 |
else:
|
| 121 |
raise e
|
| 122 |
|
|
|
|
| 123 |
def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str,Any]:
|
| 124 |
-
fallback = {
|
| 125 |
-
"is_medical_seeking": False, "confidence": "low",
|
| 126 |
-
"medical_summary": "Not a medical request (AI unavailable/throttled)",
|
| 127 |
-
"suggested_services": [], "urgency_level": "low", "analysis": "Keyword-based fallback",
|
| 128 |
-
"reasoning": "short explanation", "matched_keywords": found_keywords
|
| 129 |
-
}
|
| 130 |
if not gemini_manager or not gemini_manager.is_available(): return fallback
|
| 131 |
keywords_str = ", ".join(found_keywords) if found_keywords else "none"
|
| 132 |
-
prompt = f"""
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
3. ONLY flag if it's a PERSONAL HEALTH NEED
|
| 139 |
Post: "{post_text}"
|
| 140 |
Return ONLY JSON:
|
| 141 |
{{
|
| 142 |
"is_medical_seeking": true/false, "confidence": "high/medium/low", "medical_summary": "short summary",
|
| 143 |
"suggested_services": ["service1","service2"], "urgency_level": "high/medium/low",
|
| 144 |
-
"analysis": "why it's seeking help", "reasoning": "short explanation", "matched_keywords": ["keyword1"
|
| 145 |
}}"""
|
| 146 |
-
for _ in range(
|
| 147 |
try:
|
| 148 |
resp = gemini_manager.generate_content(prompt)
|
| 149 |
txt = (resp.text or "").strip()
|
|
@@ -154,31 +127,19 @@ Return ONLY JSON:
|
|
| 154 |
if "matched_keywords" not in result: result["matched_keywords"] = found_keywords
|
| 155 |
return result
|
| 156 |
return fallback
|
| 157 |
-
except ResourceExhausted:
|
| 158 |
-
gemini_manager.rotate_key()
|
| 159 |
-
if not gemini_manager.is_available(): return fallback
|
| 160 |
except Exception as e:
|
| 161 |
-
print(f"[GEMINI]
|
| 162 |
gemini_manager.rotate_key()
|
| 163 |
-
if not gemini_manager.is_available(): return fallback
|
| 164 |
return fallback
|
| 165 |
|
| 166 |
-
MEDICAL_KEYWORDS = [
|
| 167 |
-
"doctor","physician","primary care","healthcare","medical","clinic","hospital","urgent care",
|
| 168 |
-
"emergency","er","specialist","pediatrician","dentist","gynecologist","obgyn","women's health",
|
| 169 |
-
"health center","family doctor","maternity","prenatal","postnatal","labor","delivery",
|
| 170 |
-
"need doctor","looking for doctor","find doctor","recommend doctor","medical help","health help",
|
| 171 |
-
"appointment","checkup","treatment","prescription","medicine","surgery","best hospital",
|
| 172 |
-
"best clinic","where to go","doctor recommendation","pregnancy","birth control","contraception",
|
| 173 |
-
"fertility","hillside","medical group","wellness center"
|
| 174 |
-
]
|
| 175 |
|
| 176 |
def contains_keywords(text: str) -> Tuple[bool, List[str]]:
|
| 177 |
tl = (text or "").lower()
|
| 178 |
hits = [kw for kw in MEDICAL_KEYWORDS if kw in tl]
|
| 179 |
return (len(hits) > 0, hits)
|
| 180 |
|
| 181 |
-
# ---
|
| 182 |
def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
|
| 183 |
options = webdriver.ChromeOptions()
|
| 184 |
|
|
@@ -192,7 +153,7 @@ def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
|
|
| 192 |
options.add_argument(f"--user-data-dir={user_data_dir}")
|
| 193 |
options.add_argument("--headless=new")
|
| 194 |
options.add_argument("--no-sandbox")
|
| 195 |
-
options.add_argument("--disable-dev-shm-usage") #
|
| 196 |
options.add_argument("--disable-gpu")
|
| 197 |
options.add_argument("--disable-notifications")
|
| 198 |
options.add_argument("--window-size=1920,1080")
|
|
@@ -201,7 +162,6 @@ def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
|
|
| 201 |
print("[SELENIUM] WebDriver session created successfully.")
|
| 202 |
return driver, user_data_dir
|
| 203 |
|
| 204 |
-
# --- FIX #2: Add Better Logging to the Login Process ---
|
| 205 |
def load_cookies(driver, cookies_file: str):
|
| 206 |
print("[FB] Navigating to Facebook homepage to load cookies...")
|
| 207 |
driver.get("https://www.facebook.com")
|
|
@@ -222,7 +182,6 @@ def load_cookies(driver, cookies_file: str):
|
|
| 222 |
driver.refresh()
|
| 223 |
time.sleep(5)
|
| 224 |
|
| 225 |
-
# Check for login success by looking for a keyword in the title
|
| 226 |
if "log in" in driver.title.lower():
|
| 227 |
print(f"[FB] WARNING: Login may have failed. Page title is: '{driver.title}'")
|
| 228 |
else:
|
|
@@ -230,24 +189,12 @@ def load_cookies(driver, cookies_file: str):
|
|
| 230 |
|
| 231 |
def wait_group_feed(driver, wait):
|
| 232 |
wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
driver.find_element(By.XPATH, "//div[@data-pagelet='GroupFeed' or @role='feed']")
|
| 238 |
-
feed_loaded = True; break
|
| 239 |
-
except NoSuchElementException:
|
| 240 |
-
try:
|
| 241 |
-
driver.find_element(By.XPATH, "//div[@role='article']")
|
| 242 |
-
feed_loaded = True; break
|
| 243 |
-
except NoSuchElementException: pass
|
| 244 |
-
time.sleep(1)
|
| 245 |
-
if not feed_loaded:
|
| 246 |
raise TimeoutException("Timed out waiting for group feed to load.")
|
| 247 |
|
| 248 |
-
def find_message_nodes(driver):
|
| 249 |
-
return driver.find_elements(By.XPATH, "//div[@role='article']")
|
| 250 |
-
|
| 251 |
def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
|
| 252 |
print(f"[SCRAPE] Navigating to group: {group_url}")
|
| 253 |
driver.get(group_url)
|
|
@@ -258,17 +205,13 @@ def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
|
|
| 258 |
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
| 259 |
time.sleep(pause)
|
| 260 |
|
| 261 |
-
divs =
|
| 262 |
added_this_scroll = 0
|
| 263 |
for d in divs:
|
| 264 |
try:
|
| 265 |
txt = (d.text or "").strip()
|
| 266 |
if len(txt) < 25 or txt in seen: continue
|
| 267 |
-
|
| 268 |
-
# Filter out common UI text that gets scraped as a post
|
| 269 |
-
if any(ui_text in txt for ui_text in ["Comment Share", "Write a comment...", "View more comments"]):
|
| 270 |
-
continue
|
| 271 |
-
|
| 272 |
seen.add(txt)
|
| 273 |
posts.append({"id": len(posts) + 1, "text": txt, "group_link": group_url})
|
| 274 |
added_this_scroll += 1
|
|
@@ -278,7 +221,6 @@ def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
|
|
| 278 |
print(f"[SCRAPE] Finished scraping. Total unique posts found: {len(posts)}")
|
| 279 |
return posts
|
| 280 |
|
| 281 |
-
# --- FIX #3: Make the Script Fail Properly on Critical Errors ---
|
| 282 |
def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
|
| 283 |
driver = None
|
| 284 |
user_data_dir = None
|
|
@@ -290,8 +232,7 @@ def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int
|
|
| 290 |
posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
|
| 291 |
except Exception as e:
|
| 292 |
print(f"[SCRAPE] FATAL ERROR during scraping: {e}")
|
| 293 |
-
# Re-raise the exception to make the script exit with a non-zero code
|
| 294 |
-
raise
|
| 295 |
finally:
|
| 296 |
if driver:
|
| 297 |
try: driver.quit()
|
|
@@ -303,6 +244,8 @@ def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int
|
|
| 303 |
except Exception as e:
|
| 304 |
print(f"[SELENIUM] Error cleaning up directory {user_data_dir}: {e}")
|
| 305 |
return posts
|
|
|
|
|
|
|
| 306 |
|
| 307 |
def main():
|
| 308 |
args = get_args()
|
|
|
|
| 14 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 15 |
from selenium.webdriver.support import expected_conditions as EC
|
| 16 |
from selenium.common.exceptions import (
|
| 17 |
+
StaleElementReferenceException, NoSuchElementException, TimeoutException
|
| 18 |
)
|
| 19 |
from google.oauth2 import service_account
|
| 20 |
from googleapiclient.discovery import build
|
|
|
|
| 39 |
p.add_argument("--headless", action="store_true", help="Prefer headless browser")
|
| 40 |
return p.parse_args()
|
| 41 |
|
| 42 |
+
# This function is not called in the main flow but kept for modularity
|
|
|
|
| 43 |
def build_gmail_service():
    """Create a Gmail API client from the service-account key file.

    Returns:
        The object returned by ``build("gmail", "v1", ...)``, or ``None`` when
        the key file does not exist, the ``SENDER_EMAIL`` environment variable
        is unset or empty, or any exception is raised while building the
        credentials or the client (the exception is printed, not propagated).
    """
    # Guard clause: without the key file there is nothing to authenticate with.
    if not os.path.exists(SERVICE_ACCOUNT_FILE):
        return None

    try:
        sender_email = os.environ.get("SENDER_EMAIL")
        if not sender_email:
            return None

        gmail_scopes = ["https://www.googleapis.com/auth/gmail.send"]
        base_credentials = service_account.Credentials.from_service_account_file(
            SERVICE_ACCOUNT_FILE, scopes=gmail_scopes
        )
        # Bind the credentials to the sender address before building the client.
        credentials = base_credentials.with_subject(sender_email)
        return build("gmail", "v1", credentials=credentials)
    except Exception as e:
        print(f"[GMAIL] Auth failed in final5.py: {e}")
        return None
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
GEMINI_MODEL = "gemini-1.5-flash"
|
|
|
|
| 56 |
class GeminiManager:
|
| 57 |
+
# ... (This class is correct, no changes needed)
|
| 58 |
def __init__(self, api_keys: List[str]):
|
| 59 |
self.api_keys = api_keys
|
| 60 |
self.current_key_index = 0
|
|
|
|
| 98 |
else:
|
| 99 |
raise e
|
| 100 |
|
| 101 |
+
|
| 102 |
def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str,Any]:
|
| 103 |
+
fallback = { "is_medical_seeking": False, "confidence": "low", "medical_summary": "AI unavailable", "suggested_services": [], "urgency_level": "low", "analysis": "Fallback", "reasoning": "AI error", "matched_keywords": found_keywords }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
if not gemini_manager or not gemini_manager.is_available(): return fallback
|
| 105 |
keywords_str = ", ".join(found_keywords) if found_keywords else "none"
|
| 106 |
+
prompt = f"""Analyze this social post to determine if the author is seeking medical help for a personal health need.
|
| 107 |
+
KEYWORDS: {keywords_str}
|
| 108 |
+
RULES:
|
| 109 |
+
1. Flag ONLY posts where someone seeks medical care for themselves or a loved one.
|
| 110 |
+
2. IGNORE posts about business, donations, selling products, jobs, or general info.
|
| 111 |
+
3. Flag ONLY if it is a PERSONAL HEALTH NEED.
|
|
|
|
| 112 |
Post: "{post_text}"
|
| 113 |
Return ONLY JSON:
|
| 114 |
{{
|
| 115 |
"is_medical_seeking": true/false, "confidence": "high/medium/low", "medical_summary": "short summary",
|
| 116 |
"suggested_services": ["service1","service2"], "urgency_level": "high/medium/low",
|
| 117 |
+
"analysis": "why it's seeking help", "reasoning": "short explanation", "matched_keywords": ["keyword1"]
|
| 118 |
}}"""
|
| 119 |
+
for _ in range(2): # Reduced retries for speed
|
| 120 |
try:
|
| 121 |
resp = gemini_manager.generate_content(prompt)
|
| 122 |
txt = (resp.text or "").strip()
|
|
|
|
| 127 |
if "matched_keywords" not in result: result["matched_keywords"] = found_keywords
|
| 128 |
return result
|
| 129 |
return fallback
|
|
|
|
|
|
|
|
|
|
| 130 |
except Exception as e:
|
| 131 |
+
print(f"[GEMINI] Error: {e}")
|
| 132 |
gemini_manager.rotate_key()
|
|
|
|
| 133 |
return fallback
|
| 134 |
|
| 135 |
+
MEDICAL_KEYWORDS = [
    "doctor", "physician", "primary care", "healthcare", "medical", "clinic",
    "hospital", "urgent care", "emergency", "er", "specialist", "pediatrician",
    "dentist", "gynecologist", "obgyn", "women's health", "health center",
    "family doctor", "maternity", "prenatal", "postnatal", "labor", "delivery",
    "need doctor", "looking for doctor", "find doctor", "recommend doctor",
    "medical help", "health help", "appointment", "checkup", "treatment",
    "prescription", "medicine", "surgery", "best hospital", "best clinic",
    "where to go", "doctor recommendation", "pregnancy", "birth control",
    "contraception", "fertility", "hillside", "medical group", "wellness center",
]

# Keywords this short also occur inside unrelated words ("er" is a substring of
# "weather", "better", "mother"), so they only count as a standalone word.
_WHOLE_WORD_MAX_LEN = 3


def contains_keywords(text: str) -> Tuple[bool, List[str]]:
    """Report which entries of MEDICAL_KEYWORDS occur in ``text``.

    Matching is case-insensitive. Keywords longer than ``_WHOLE_WORD_MAX_LEN``
    characters are matched as substrings, so inflected forms such as "doctors"
    still hit "doctor". Shorter keywords must appear as a whole word; otherwise
    a keyword like "er" would match nearly every post.

    Args:
        text: Text to scan. ``None`` or an empty string yields no hits.

    Returns:
        ``(found, hits)`` where ``hits`` lists the matched keywords in
        MEDICAL_KEYWORDS order and ``found`` is ``True`` when it is non-empty.
    """
    import re  # function-scope import keeps this block self-contained

    tl = (text or "").lower()
    hits = []
    for kw in MEDICAL_KEYWORDS:
        if len(kw) <= _WHOLE_WORD_MAX_LEN:
            # Lookarounds rather than \b so the check is explicit about
            # "no word character on either side".
            matched = re.search(rf"(?<!\w){re.escape(kw)}(?!\w)", tl) is not None
        else:
            matched = kw in tl
        if matched:
            hits.append(kw)
    return (len(hits) > 0, hits)
|
| 141 |
|
| 142 |
+
# --- START: CRITICAL SELENIUM FIXES ---
|
| 143 |
def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
|
| 144 |
options = webdriver.ChromeOptions()
|
| 145 |
|
|
|
|
| 153 |
options.add_argument(f"--user-data-dir={user_data_dir}")
|
| 154 |
options.add_argument("--headless=new")
|
| 155 |
options.add_argument("--no-sandbox")
|
| 156 |
+
options.add_argument("--disable-dev-shm-usage") # THIS IS THE KEY FIX
|
| 157 |
options.add_argument("--disable-gpu")
|
| 158 |
options.add_argument("--disable-notifications")
|
| 159 |
options.add_argument("--window-size=1920,1080")
|
|
|
|
| 162 |
print("[SELENIUM] WebDriver session created successfully.")
|
| 163 |
return driver, user_data_dir
|
| 164 |
|
|
|
|
| 165 |
def load_cookies(driver, cookies_file: str):
|
| 166 |
print("[FB] Navigating to Facebook homepage to load cookies...")
|
| 167 |
driver.get("https://www.facebook.com")
|
|
|
|
| 182 |
driver.refresh()
|
| 183 |
time.sleep(5)
|
| 184 |
|
|
|
|
| 185 |
if "log in" in driver.title.lower():
|
| 186 |
print(f"[FB] WARNING: Login may have failed. Page title is: '{driver.title}'")
|
| 187 |
else:
|
|
|
|
| 189 |
|
| 190 |
def wait_group_feed(driver, wait):
    """Block until the group feed container is present on the current page.

    Args:
        driver: Not used by this function; kept so the signature stays
            unchanged for existing callers.
        wait: Object whose ``until(condition)`` blocks until the condition
            holds (presumably a ``WebDriverWait`` -- confirm against caller).

    Raises:
        TimeoutException: If no ``div`` with ``role='feed'`` or
            ``data-pagelet='GroupFeed'`` appears before ``wait`` gives up.
            The original timeout is attached as ``__cause__``. Any exception
            from the initial ``<body>`` wait propagates unchanged.
    """
    wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))

    feed_locator = (By.XPATH, "//div[@role='feed' or @data-pagelet='GroupFeed']")
    try:
        wait.until(EC.presence_of_element_located(feed_locator))
        print("[SCRAPE] Group feed detected.")
    except TimeoutException as exc:
        # Chain explicitly so the traceback reads as one failure with a cause,
        # not as a second error raised while handling the first.
        raise TimeoutException("Timed out waiting for group feed to load.") from exc
|
| 197 |
|
|
|
|
|
|
|
|
|
|
| 198 |
def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
|
| 199 |
print(f"[SCRAPE] Navigating to group: {group_url}")
|
| 200 |
driver.get(group_url)
|
|
|
|
| 205 |
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
| 206 |
time.sleep(pause)
|
| 207 |
|
| 208 |
+
divs = driver.find_elements(By.XPATH, "//div[@role='article']")
|
| 209 |
added_this_scroll = 0
|
| 210 |
for d in divs:
|
| 211 |
try:
|
| 212 |
txt = (d.text or "").strip()
|
| 213 |
if len(txt) < 25 or txt in seen: continue
|
| 214 |
+
if any(ui in txt for ui in ["Comment Share", "Write a comment...", "View more comments"]): continue
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
seen.add(txt)
|
| 216 |
posts.append({"id": len(posts) + 1, "text": txt, "group_link": group_url})
|
| 217 |
added_this_scroll += 1
|
|
|
|
| 221 |
print(f"[SCRAPE] Finished scraping. Total unique posts found: {len(posts)}")
|
| 222 |
return posts
|
| 223 |
|
|
|
|
| 224 |
def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int, pause: float):
|
| 225 |
driver = None
|
| 226 |
user_data_dir = None
|
|
|
|
| 232 |
posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
|
| 233 |
except Exception as e:
|
| 234 |
print(f"[SCRAPE] FATAL ERROR during scraping: {e}")
|
| 235 |
+
raise # Re-raise the exception to make the script exit with a non-zero code
|
|
|
|
| 236 |
finally:
|
| 237 |
if driver:
|
| 238 |
try: driver.quit()
|
|
|
|
| 244 |
except Exception as e:
|
| 245 |
print(f"[SELENIUM] Error cleaning up directory {user_data_dir}: {e}")
|
| 246 |
return posts
|
| 247 |
+
# --- END: CRITICAL SELENIUM FIXES ---
|
| 248 |
+
|
| 249 |
|
| 250 |
def main():
|
| 251 |
args = get_args()
|