Spaces:
Sleeping
Sleeping
Commit
·
8a3c8d8
1
Parent(s):
9624e78
Project Uploaded
Browse files- Dockerfile +15 -18
- api_server.py +200 -92
- final5.py +89 -55
Dockerfile
CHANGED
|
@@ -1,44 +1,41 @@
|
|
| 1 |
-
# Use a base image with Python
|
| 2 |
FROM python:3.10-slim
|
| 3 |
|
| 4 |
-
# Set the working directory
|
| 5 |
WORKDIR /app
|
| 6 |
|
| 7 |
-
# Set environment variables to prevent interactive prompts during installation
|
| 8 |
ENV DEBIAN_FRONTEND=noninteractive
|
| 9 |
ENV PYTHONUNBUFFERED=1
|
| 10 |
ENV PYTHONIOENCODING=utf-8
|
| 11 |
ENV FLASK_APP=api_server.py
|
| 12 |
ENV FLASK_RUN_HOST=0.0.0.0
|
| 13 |
ENV FLASK_RUN_PORT=7860
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
# Install system dependencies needed for apt-add-repository and Chrome
|
| 16 |
RUN apt-get update && apt-get install -y \
|
| 17 |
wget \
|
| 18 |
gnupg \
|
| 19 |
curl \
|
| 20 |
unzip \
|
|
|
|
|
|
|
| 21 |
&& rm -rf /var/lib/apt/lists/*
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 30 |
-
# --- END: CORRECTED CHROME INSTALLATION ---
|
| 31 |
|
| 32 |
-
# Copy the requirements file and install Python dependencies
|
| 33 |
COPY requirements.txt .
|
| 34 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 35 |
|
| 36 |
-
# Copy the rest of the application files
|
| 37 |
COPY . .
|
| 38 |
|
| 39 |
-
|
|
|
|
| 40 |
EXPOSE 7860
|
| 41 |
|
| 42 |
-
|
| 43 |
-
# Use gunicorn for a more robust server in production if needed, but flask dev server is fine for spaces
|
| 44 |
-
CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]
|
|
|
|
|
|
|
| 1 |
FROM python:3.10-slim
|
| 2 |
|
|
|
|
| 3 |
WORKDIR /app
|
| 4 |
|
|
|
|
| 5 |
ENV DEBIAN_FRONTEND=noninteractive
|
| 6 |
ENV PYTHONUNBUFFERED=1
|
| 7 |
ENV PYTHONIOENCODING=utf-8
|
| 8 |
ENV FLASK_APP=api_server.py
|
| 9 |
ENV FLASK_RUN_HOST=0.0.0.0
|
| 10 |
ENV FLASK_RUN_PORT=7860
|
| 11 |
+
ENV HOME=/tmp
|
| 12 |
+
ENV WDM_LOCAL=1
|
| 13 |
+
ENV WDM_CACHE_DIR=/tmp/.wdm
|
| 14 |
+
ENV SE_MANAGER_DRIVER_CACHE=/tmp/selenium
|
| 15 |
|
|
|
|
| 16 |
RUN apt-get update && apt-get install -y \
|
| 17 |
wget \
|
| 18 |
gnupg \
|
| 19 |
curl \
|
| 20 |
unzip \
|
| 21 |
+
ca-certificates \
|
| 22 |
+
fonts-liberation \
|
| 23 |
&& rm -rf /var/lib/apt/lists/*
|
| 24 |
|
| 25 |
+
RUN mkdir -p /etc/apt/keyrings && \
|
| 26 |
+
curl -sS -o - https://dl.google.com/linux/linux_signing_key.pub | gpg --dearmor | tee /etc/apt/keyrings/google-chrome.gpg >/dev/null && \
|
| 27 |
+
echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/google-chrome.gpg] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \
|
| 28 |
+
apt-get update && \
|
| 29 |
+
apt-get install -y google-chrome-stable && \
|
| 30 |
+
rm -rf /var/lib/apt/lists/*
|
|
|
|
|
|
|
| 31 |
|
|
|
|
| 32 |
COPY requirements.txt .
|
| 33 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 34 |
|
|
|
|
| 35 |
COPY . .
|
| 36 |
|
| 37 |
+
RUN mkdir -p /tmp/.wdm /tmp/selenium
|
| 38 |
+
|
| 39 |
EXPOSE 7860
|
| 40 |
|
| 41 |
+
CMD ["flask", "run", "--host=0.0.0.0", "--port=7860"]
|
|
|
|
|
|
api_server.py
CHANGED
|
@@ -1,4 +1,14 @@
|
|
| 1 |
-
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from datetime import datetime
|
| 3 |
from dataclasses import dataclass, field
|
| 4 |
from typing import List, Dict, Any, Optional
|
|
@@ -11,9 +21,6 @@ from dotenv import load_dotenv
|
|
| 11 |
|
| 12 |
load_dotenv()
|
| 13 |
|
| 14 |
-
# --- START: CRITICAL DEFINITIONS ---
|
| 15 |
-
# This block is moved to the top to guarantee 'log' exists before it's ever called.
|
| 16 |
-
|
| 17 |
class LogBuffer:
|
| 18 |
def __init__(self, max_items: int = 10000):
|
| 19 |
self._buf: List[Dict[str, Any]] = []
|
|
@@ -24,57 +31,57 @@ class LogBuffer:
|
|
| 24 |
ts = datetime.now().strftime("%H:%M:%S")
|
| 25 |
line = {"id": self._next_id, "ts": ts, "level": level, "source": source, "msg": msg}
|
| 26 |
with self._lock:
|
| 27 |
-
self._buf.append(line)
|
| 28 |
-
|
|
|
|
|
|
|
| 29 |
def clear(self):
|
| 30 |
-
with self._lock:
|
|
|
|
| 31 |
def get_after(self, after_id: int, limit: int = 500):
|
| 32 |
with self._lock:
|
| 33 |
-
if after_id <= 0:
|
| 34 |
-
|
|
|
|
|
|
|
| 35 |
last_id = self._buf[-1]["id"] if self._buf else after_id
|
| 36 |
return data, last_id
|
| 37 |
|
| 38 |
logs = LogBuffer()
|
|
|
|
| 39 |
def log(msg: str, level: str = "info", source: str = "server"):
|
| 40 |
logs.append(msg, level, source)
|
| 41 |
print(f"[{level.upper()}][{source}] {msg}", flush=True)
|
| 42 |
|
| 43 |
def decode_base64_with_padding(b64_string: str) -> bytes:
|
| 44 |
-
"""Decodes a Base64 string, adding missing padding if necessary."""
|
| 45 |
missing_padding = len(b64_string) % 4
|
| 46 |
if missing_padding:
|
| 47 |
-
b64_string +=
|
| 48 |
try:
|
| 49 |
return base64.b64decode(b64_string)
|
| 50 |
except binascii.Error as e:
|
| 51 |
log(f"Error decoding base64 string: {e}", "error", "SERVER")
|
| 52 |
return b""
|
| 53 |
-
# --- END: CRITICAL DEFINITIONS ---
|
| 54 |
|
| 55 |
-
|
| 56 |
-
# Define a writable directory for ALL runtime files
|
| 57 |
WRITABLE_DIR = "/tmp"
|
| 58 |
COOKIES_PATH = os.path.join(WRITABLE_DIR, "facebook_cookies.pkl")
|
| 59 |
SERVICE_ACCOUNT_FILE = os.path.join(WRITABLE_DIR, "service_account.json")
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
decoded_cookies = decode_base64_with_padding(os.environ['FB_COOKIES_B64'])
|
| 64 |
if decoded_cookies:
|
| 65 |
-
with open(COOKIES_PATH,
|
| 66 |
f.write(decoded_cookies)
|
| 67 |
|
| 68 |
-
if
|
| 69 |
-
decoded_service_account = decode_base64_with_padding(os.environ[
|
| 70 |
if decoded_service_account:
|
| 71 |
-
with open(SERVICE_ACCOUNT_FILE,
|
| 72 |
-
f.write(decoded_service_account.decode(
|
| 73 |
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
PYTHON_BIN = os.environ.get("PYTHON_BIN", "python")
|
| 78 |
SENDER_EMAIL = os.environ.get("SENDER_EMAIL", "smahato@hillsidemedicalgroup.com")
|
| 79 |
|
| 80 |
SCRAPE_OUTDIR = os.path.join(WRITABLE_DIR, "scraped")
|
|
@@ -86,20 +93,18 @@ for i in range(1, 6):
|
|
| 86 |
if key:
|
| 87 |
GEMINI_KEYS.append(key)
|
| 88 |
|
| 89 |
-
GMAIL_SCOPES = [
|
| 90 |
os.makedirs(SCRAPE_OUTDIR, exist_ok=True)
|
| 91 |
os.makedirs(ANALYSIS_OUTDIR, exist_ok=True)
|
| 92 |
|
| 93 |
-
|
| 94 |
-
# Define the Gmail service builder function
|
| 95 |
def build_gmail_service():
|
| 96 |
if not os.path.exists(SERVICE_ACCOUNT_FILE):
|
| 97 |
log("Service account file not found, Gmail unavailable.", "error", "GMAIL")
|
| 98 |
return None
|
| 99 |
try:
|
| 100 |
creds = service_account.Credentials.from_service_account_file(
|
| 101 |
-
SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES
|
| 102 |
-
|
| 103 |
service = build("gmail", "v1", credentials=creds)
|
| 104 |
log("Gmail service built successfully using service account.", "info", "GMAIL")
|
| 105 |
return service
|
|
@@ -108,11 +113,8 @@ def build_gmail_service():
|
|
| 108 |
log(f"CRITICAL: Ensure your service account has Domain-Wide Delegation enabled for the user {SENDER_EMAIL}", "error", "GMAIL")
|
| 109 |
return None
|
| 110 |
|
| 111 |
-
# Now that all setup is done, build the service
|
| 112 |
gmail_service = build_gmail_service()
|
| 113 |
|
| 114 |
-
|
| 115 |
-
# --- The rest of the application code follows ---
|
| 116 |
@dataclass
|
| 117 |
class GroupRun:
|
| 118 |
link: str
|
|
@@ -135,7 +137,7 @@ class PipelineState:
|
|
| 135 |
recipients: List[str] = field(default_factory=list)
|
| 136 |
summary_path: str = ""
|
| 137 |
|
| 138 |
-
app = Flask(__name__, static_folder=
|
| 139 |
CORS(app)
|
| 140 |
|
| 141 |
live_lock = threading.Lock()
|
|
@@ -172,16 +174,19 @@ def load_scraped_into_live(path: str):
|
|
| 172 |
live_state["counts"]["total_posts"] = len(posts)
|
| 173 |
|
| 174 |
def handle_event_line(line: str):
|
| 175 |
-
if not line.startswith("::"):
|
|
|
|
| 176 |
try:
|
| 177 |
if "::SCRAPE_SAVED::" in line:
|
| 178 |
path = line.split("::SCRAPE_SAVED::", 1)[1].strip()
|
| 179 |
-
if path:
|
|
|
|
| 180 |
elif "::KW_HIT::" in line:
|
| 181 |
d = json.loads(line.split("::KW_HIT::", 1)[1].strip())
|
| 182 |
p = ensure_post_obj(int(d["id"]))
|
| 183 |
p["found_keywords"] = d.get("found_keywords", [])
|
| 184 |
-
with live_lock:
|
|
|
|
| 185 |
elif "::AI_RESULT::" in line:
|
| 186 |
d = json.loads(line.split("::AI_RESULT::", 1)[1].strip())
|
| 187 |
p = ensure_post_obj(int(d["id"]))
|
|
@@ -189,19 +194,22 @@ def handle_event_line(line: str):
|
|
| 189 |
p["ai"] = ai
|
| 190 |
with live_lock:
|
| 191 |
live_state["counts"]["ai_done"] += 1
|
| 192 |
-
if ai.get("is_medical_seeking"):
|
|
|
|
| 193 |
elif "::EMAIL_SENT::" in line:
|
| 194 |
d = json.loads(line.split("::EMAIL_SENT::", 1)[1].strip())
|
| 195 |
p = ensure_post_obj(int(d["id"]))
|
| 196 |
sent = int(d.get("sent", 0))
|
| 197 |
p["email_sent"] = sent > 0
|
| 198 |
if sent > 0:
|
| 199 |
-
with live_lock:
|
|
|
|
| 200 |
except Exception as e:
|
| 201 |
log(f"live parse error: {e}", "error", "LIVE")
|
| 202 |
|
| 203 |
def read_groups(path: str) -> List[str]:
|
| 204 |
-
if not os.path.exists(path):
|
|
|
|
| 205 |
with open(path, "r", encoding="utf-8") as f:
|
| 206 |
return [ln.strip() for ln in f.read().splitlines() if ln.strip()]
|
| 207 |
|
|
@@ -232,46 +240,103 @@ def send_html_email(to_emails: List[str], subject: str, html_content: str) -> in
|
|
| 232 |
log(f"Gmail send error to {to}: {e}", "error", "gmail")
|
| 233 |
return sent
|
| 234 |
|
| 235 |
-
def build_confirmed_posts_email(groups_run: List[GroupRun], all_confirmed_posts: List[Dict[str, Any]]) -> str:
|
| 236 |
-
total_groups
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
<tr>
|
| 239 |
<td style="padding: 8px; border-bottom: 1px solid #eee;"><a href="{g.link}" target="_blank">{g.link}</a></td>
|
| 240 |
<td style="padding: 8px; border-bottom: 1px solid #eee; text-align: center;">{g.scraped_posts}</td>
|
| 241 |
<td style="padding: 8px; border-bottom: 1px solid #eee; text-align: center;">{g.detected_posts}</td>
|
| 242 |
<td style="padding: 8px; border-bottom: 1px solid #eee;">{"OK" if g.stage == "done" else "ERROR"}</td>
|
| 243 |
-
</tr>"""
|
| 244 |
-
summary_table_html = f"""<h3>Group Summary</h3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
if all_confirmed_posts:
|
| 246 |
-
posts_html = "".join(
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
state = PipelineState()
|
| 258 |
|
| 259 |
def stream_process_lines(args: List[str], env: Optional[Dict[str, str]] = None, tag: str = "FINAL5") -> int:
|
| 260 |
log(f"Exec: {' '.join(args)}", "info", tag)
|
| 261 |
-
proc = subprocess.Popen(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 262 |
def pump(pipe, name):
|
| 263 |
for raw in pipe:
|
| 264 |
line = (raw or "").rstrip("\n")
|
| 265 |
-
if not line:
|
|
|
|
| 266 |
if line.startswith("::"):
|
| 267 |
-
try:
|
| 268 |
-
|
|
|
|
|
|
|
| 269 |
log(line, "info" if name == "stdout" else "warn", tag)
|
| 270 |
t1 = threading.Thread(target=pump, args=(proc.stdout, "stdout"), daemon=True)
|
| 271 |
t2 = threading.Thread(target=pump, args=(proc.stderr, "stderr"), daemon=True)
|
| 272 |
-
t1.start()
|
|
|
|
| 273 |
rc = proc.wait()
|
| 274 |
-
t1.join(timeout=0.2)
|
|
|
|
| 275 |
log(f"Exit code: {rc}", "info", tag)
|
| 276 |
return rc
|
| 277 |
|
|
@@ -286,10 +351,17 @@ def call_final5_for_group(group_url: str, out_json: str, analysis_json: str, rec
|
|
| 286 |
"--cookies-file", COOKIES_PATH,
|
| 287 |
"--headless"
|
| 288 |
]
|
| 289 |
-
if GEMINI_KEYS:
|
|
|
|
| 290 |
env = os.environ.copy()
|
| 291 |
env["PYTHONUNBUFFERED"] = "1"
|
| 292 |
env["PYTHONIOENCODING"] = "utf-8"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
rc = stream_process_lines(args, env=env, tag="FINAL5")
|
| 294 |
return {"ok": rc == 0, "code": rc}
|
| 295 |
|
|
@@ -297,44 +369,57 @@ def run_pipeline(recipients: List[str]):
|
|
| 297 |
try:
|
| 298 |
logs.clear()
|
| 299 |
log("Pipeline starting", "info", "ORCHESTRATOR")
|
| 300 |
-
state.running
|
|
|
|
|
|
|
|
|
|
| 301 |
state.groups.clear()
|
| 302 |
links = read_groups(GROUPS_TXT)
|
| 303 |
state.total = len(links)
|
| 304 |
if not links:
|
| 305 |
log("No groups found in groups.txt", "warn", "ORCHESTRATOR")
|
| 306 |
-
state.message
|
|
|
|
| 307 |
return
|
| 308 |
all_confirmed_posts = []
|
| 309 |
for i, link in enumerate(links, start=1):
|
| 310 |
reset_live_state(link)
|
| 311 |
g = GroupRun(link=link, stage="running")
|
| 312 |
state.groups.append(g)
|
| 313 |
-
state.current
|
|
|
|
|
|
|
| 314 |
log(f"[{i}/{state.total}] Processing group: {link}", "info", "ORCHESTRATOR")
|
| 315 |
slug = slugify(link)
|
| 316 |
-
out_json
|
| 317 |
-
|
|
|
|
|
|
|
| 318 |
result = call_final5_for_group(link, out_json, analysis_json, recipients)
|
| 319 |
if not result.get("ok"):
|
| 320 |
-
g.stage
|
|
|
|
| 321 |
log(f"final5 failed for {link}: code {result.get('code')}", "error", "ORCHESTRATOR")
|
| 322 |
else:
|
| 323 |
try:
|
| 324 |
if os.path.exists(out_json):
|
| 325 |
-
with open(out_json, "r", encoding="utf-8") as f:
|
|
|
|
| 326 |
if os.path.exists(analysis_json):
|
| 327 |
-
with open(analysis_json, "r", encoding="utf-8") as f:
|
|
|
|
| 328 |
g.detected_posts = a.get("confirmed_medical", 0)
|
| 329 |
g.emails_sent_by_final5 = a.get("emails_sent", 0)
|
| 330 |
confirmed_posts = a.get("posts", [])
|
| 331 |
for post in confirmed_posts:
|
| 332 |
-
if "group_link" not in post:
|
|
|
|
| 333 |
all_confirmed_posts.extend(confirmed_posts)
|
| 334 |
g.stage = "done"
|
| 335 |
log(f"Group done: scraped={g.scraped_posts}, confirmed={g.detected_posts}", "info", "ORCHESTRATOR")
|
| 336 |
except Exception as e:
|
| 337 |
-
g.stage
|
|
|
|
| 338 |
log(f"Parsing outputs failed for {link}: {e}", "error", "ORCHESTRATOR")
|
| 339 |
state.progress = int((i / max(1, state.total)) * 100)
|
| 340 |
try:
|
|
@@ -346,24 +431,33 @@ def run_pipeline(recipients: List[str]):
|
|
| 346 |
log(f"Error building or sending consolidated email: {e}", "error", "ORCHESTRATOR")
|
| 347 |
summary = {"run_date": datetime.now().isoformat(), "groups": [g.__dict__ for g in state.groups]}
|
| 348 |
summary_path = os.path.join(ANALYSIS_OUTDIR, "analysis_summary.json")
|
| 349 |
-
with open(summary_path, "w", encoding="utf-8") as f:
|
| 350 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
log("Pipeline finished", "info", "ORCHESTRATOR")
|
| 352 |
except Exception as e:
|
| 353 |
-
state.message
|
|
|
|
| 354 |
log(f"Pipeline error: {e}\n{traceback.format_exc()}", "error", "ORCHESTRATOR")
|
| 355 |
|
| 356 |
@app.route("/")
|
| 357 |
def index():
|
| 358 |
-
return send_from_directory(
|
| 359 |
|
| 360 |
@app.get("/api/system/status")
|
| 361 |
def system_status():
|
| 362 |
return jsonify({
|
| 363 |
-
"gmail": gmail_service is not None,
|
| 364 |
-
"
|
| 365 |
-
"
|
| 366 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
})
|
| 368 |
|
| 369 |
@app.get("/api/groups")
|
|
@@ -372,18 +466,26 @@ def api_groups():
|
|
| 372 |
|
| 373 |
@app.post("/api/process/start")
|
| 374 |
def api_process_start():
|
| 375 |
-
if state.running:
|
|
|
|
| 376 |
data = request.json or {}
|
| 377 |
recips = data.get("recipients") or [SENDER_EMAIL]
|
| 378 |
-
if isinstance(recips, str):
|
|
|
|
| 379 |
threading.Thread(target=run_pipeline, args=(recips,), daemon=True).start()
|
| 380 |
log(f"Start requested by client; recipients={recips}", "info", "API")
|
| 381 |
return jsonify({"success": True, "message": "Pipeline started", "recipients": recips})
|
| 382 |
|
| 383 |
@app.get("/api/process/status")
|
| 384 |
def api_process_status():
|
| 385 |
-
return jsonify({
|
| 386 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 387 |
|
| 388 |
@app.get("/api/process/logs")
|
| 389 |
def api_process_logs():
|
|
@@ -398,25 +500,31 @@ def api_clear_logs():
|
|
| 398 |
|
| 399 |
@app.get("/api/live/state")
|
| 400 |
def api_live_state():
|
| 401 |
-
with live_lock:
|
|
|
|
| 402 |
|
| 403 |
@app.get("/api/results/summary")
|
| 404 |
def api_results_summary():
|
| 405 |
p = state.summary_path or os.path.join(ANALYSIS_OUTDIR, "analysis_summary.json")
|
| 406 |
-
if not os.path.exists(p):
|
| 407 |
-
|
|
|
|
|
|
|
| 408 |
|
| 409 |
@app.get("/api/recipients")
|
| 410 |
def api_get_recipients():
|
| 411 |
recipients_path = "recipients.json"
|
| 412 |
-
if not os.path.exists(recipients_path):
|
|
|
|
| 413 |
try:
|
| 414 |
-
with open(recipients_path, "r", encoding="utf-8") as f:
|
| 415 |
-
|
|
|
|
|
|
|
| 416 |
return jsonify({"success": True, "data": data})
|
| 417 |
except Exception as e:
|
| 418 |
return jsonify({"success": False, "message": f"Error reading file: {str(e)}"}), 500
|
| 419 |
|
| 420 |
if __name__ == "__main__":
|
| 421 |
port = int(os.environ.get("PORT", 7860))
|
| 422 |
-
app.run(host="0.0.0.0", port=port)
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import json
|
| 4 |
+
import time
|
| 5 |
+
import base64
|
| 6 |
+
import pickle
|
| 7 |
+
import subprocess
|
| 8 |
+
import threading
|
| 9 |
+
import traceback
|
| 10 |
+
import html
|
| 11 |
+
import binascii
|
| 12 |
from datetime import datetime
|
| 13 |
from dataclasses import dataclass, field
|
| 14 |
from typing import List, Dict, Any, Optional
|
|
|
|
| 21 |
|
| 22 |
load_dotenv()
|
| 23 |
|
|
|
|
|
|
|
|
|
|
| 24 |
class LogBuffer:
|
| 25 |
def __init__(self, max_items: int = 10000):
|
| 26 |
self._buf: List[Dict[str, Any]] = []
|
|
|
|
| 31 |
ts = datetime.now().strftime("%H:%M:%S")
|
| 32 |
line = {"id": self._next_id, "ts": ts, "level": level, "source": source, "msg": msg}
|
| 33 |
with self._lock:
|
| 34 |
+
self._buf.append(line)
|
| 35 |
+
self._next_id += 1
|
| 36 |
+
if len(self._buf) > self._max:
|
| 37 |
+
self._buf = self._buf[-self._max:]
|
| 38 |
def clear(self):
|
| 39 |
+
with self._lock:
|
| 40 |
+
self._buf.clear()
|
| 41 |
def get_after(self, after_id: int, limit: int = 500):
|
| 42 |
with self._lock:
|
| 43 |
+
if after_id <= 0:
|
| 44 |
+
data = self._buf[-limit:]
|
| 45 |
+
else:
|
| 46 |
+
data = [x for x in self._buf if x["id"] > after_id][:limit]
|
| 47 |
last_id = self._buf[-1]["id"] if self._buf else after_id
|
| 48 |
return data, last_id
|
| 49 |
|
| 50 |
logs = LogBuffer()
|
| 51 |
+
|
| 52 |
def log(msg: str, level: str = "info", source: str = "server"):
|
| 53 |
logs.append(msg, level, source)
|
| 54 |
print(f"[{level.upper()}][{source}] {msg}", flush=True)
|
| 55 |
|
| 56 |
def decode_base64_with_padding(b64_string: str) -> bytes:
|
|
|
|
| 57 |
missing_padding = len(b64_string) % 4
|
| 58 |
if missing_padding:
|
| 59 |
+
b64_string += "=" * (4 - missing_padding)
|
| 60 |
try:
|
| 61 |
return base64.b64decode(b64_string)
|
| 62 |
except binascii.Error as e:
|
| 63 |
log(f"Error decoding base64 string: {e}", "error", "SERVER")
|
| 64 |
return b""
|
|
|
|
| 65 |
|
|
|
|
|
|
|
| 66 |
WRITABLE_DIR = "/tmp"
|
| 67 |
COOKIES_PATH = os.path.join(WRITABLE_DIR, "facebook_cookies.pkl")
|
| 68 |
SERVICE_ACCOUNT_FILE = os.path.join(WRITABLE_DIR, "service_account.json")
|
| 69 |
|
| 70 |
+
if "FB_COOKIES_B64" in os.environ:
|
| 71 |
+
decoded_cookies = decode_base64_with_padding(os.environ["FB_COOKIES_B64"])
|
|
|
|
| 72 |
if decoded_cookies:
|
| 73 |
+
with open(COOKIES_PATH, "wb") as f:
|
| 74 |
f.write(decoded_cookies)
|
| 75 |
|
| 76 |
+
if "SERVICE_ACCOUNT_B64" in os.environ:
|
| 77 |
+
decoded_service_account = decode_base64_with_padding(os.environ["SERVICE_ACCOUNT_B64"])
|
| 78 |
if decoded_service_account:
|
| 79 |
+
with open(SERVICE_ACCOUNT_FILE, "w") as f:
|
| 80 |
+
f.write(decoded_service_account.decode("utf-8"))
|
| 81 |
|
| 82 |
+
GROUPS_TXT = os.environ.get("GROUPS_TXT", "groups.txt")
|
| 83 |
+
FINAL5_PATH = os.environ.get("FINAL5_PATH", "final5.py")
|
| 84 |
+
PYTHON_BIN = os.environ.get("PYTHON_BIN", "python")
|
|
|
|
| 85 |
SENDER_EMAIL = os.environ.get("SENDER_EMAIL", "smahato@hillsidemedicalgroup.com")
|
| 86 |
|
| 87 |
SCRAPE_OUTDIR = os.path.join(WRITABLE_DIR, "scraped")
|
|
|
|
| 93 |
if key:
|
| 94 |
GEMINI_KEYS.append(key)
|
| 95 |
|
| 96 |
+
GMAIL_SCOPES = ["https://www.googleapis.com/auth/gmail.send"]
|
| 97 |
os.makedirs(SCRAPE_OUTDIR, exist_ok=True)
|
| 98 |
os.makedirs(ANALYSIS_OUTDIR, exist_ok=True)
|
| 99 |
|
|
|
|
|
|
|
| 100 |
def build_gmail_service():
|
| 101 |
if not os.path.exists(SERVICE_ACCOUNT_FILE):
|
| 102 |
log("Service account file not found, Gmail unavailable.", "error", "GMAIL")
|
| 103 |
return None
|
| 104 |
try:
|
| 105 |
creds = service_account.Credentials.from_service_account_file(
|
| 106 |
+
SERVICE_ACCOUNT_FILE, scopes=GMAIL_SCOPES
|
| 107 |
+
).with_subject(SENDER_EMAIL)
|
| 108 |
service = build("gmail", "v1", credentials=creds)
|
| 109 |
log("Gmail service built successfully using service account.", "info", "GMAIL")
|
| 110 |
return service
|
|
|
|
| 113 |
log(f"CRITICAL: Ensure your service account has Domain-Wide Delegation enabled for the user {SENDER_EMAIL}", "error", "GMAIL")
|
| 114 |
return None
|
| 115 |
|
|
|
|
| 116 |
gmail_service = build_gmail_service()
|
| 117 |
|
|
|
|
|
|
|
| 118 |
@dataclass
|
| 119 |
class GroupRun:
|
| 120 |
link: str
|
|
|
|
| 137 |
recipients: List[str] = field(default_factory=list)
|
| 138 |
summary_path: str = ""
|
| 139 |
|
| 140 |
+
app = Flask(__name__, static_folder=".", static_url_path="")
|
| 141 |
CORS(app)
|
| 142 |
|
| 143 |
live_lock = threading.Lock()
|
|
|
|
| 174 |
live_state["counts"]["total_posts"] = len(posts)
|
| 175 |
|
| 176 |
def handle_event_line(line: str):
|
| 177 |
+
if not line.startswith("::"):
|
| 178 |
+
return
|
| 179 |
try:
|
| 180 |
if "::SCRAPE_SAVED::" in line:
|
| 181 |
path = line.split("::SCRAPE_SAVED::", 1)[1].strip()
|
| 182 |
+
if path:
|
| 183 |
+
load_scraped_into_live(path)
|
| 184 |
elif "::KW_HIT::" in line:
|
| 185 |
d = json.loads(line.split("::KW_HIT::", 1)[1].strip())
|
| 186 |
p = ensure_post_obj(int(d["id"]))
|
| 187 |
p["found_keywords"] = d.get("found_keywords", [])
|
| 188 |
+
with live_lock:
|
| 189 |
+
live_state["counts"]["kw_hits"] += 1
|
| 190 |
elif "::AI_RESULT::" in line:
|
| 191 |
d = json.loads(line.split("::AI_RESULT::", 1)[1].strip())
|
| 192 |
p = ensure_post_obj(int(d["id"]))
|
|
|
|
| 194 |
p["ai"] = ai
|
| 195 |
with live_lock:
|
| 196 |
live_state["counts"]["ai_done"] += 1
|
| 197 |
+
if ai.get("is_medical_seeking"):
|
| 198 |
+
live_state["counts"]["confirmed"] += 1
|
| 199 |
elif "::EMAIL_SENT::" in line:
|
| 200 |
d = json.loads(line.split("::EMAIL_SENT::", 1)[1].strip())
|
| 201 |
p = ensure_post_obj(int(d["id"]))
|
| 202 |
sent = int(d.get("sent", 0))
|
| 203 |
p["email_sent"] = sent > 0
|
| 204 |
if sent > 0:
|
| 205 |
+
with live_lock:
|
| 206 |
+
live_state["counts"]["emails"] += sent
|
| 207 |
except Exception as e:
|
| 208 |
log(f"live parse error: {e}", "error", "LIVE")
|
| 209 |
|
| 210 |
def read_groups(path: str) -> List[str]:
|
| 211 |
+
if not os.path.exists(path):
|
| 212 |
+
return []
|
| 213 |
with open(path, "r", encoding="utf-8") as f:
|
| 214 |
return [ln.strip() for ln in f.read().splitlines() if ln.strip()]
|
| 215 |
|
|
|
|
| 240 |
log(f"Gmail send error to {to}: {e}", "error", "gmail")
|
| 241 |
return sent
|
| 242 |
|
| 243 |
+
def build_confirmed_posts_email(groups_run: List["GroupRun"], all_confirmed_posts: List[Dict[str, Any]]) -> str:
|
| 244 |
+
total_groups = len(groups_run)
|
| 245 |
+
total_scraped = sum(g.scraped_posts for g in groups_run)
|
| 246 |
+
total_confirmed = len(all_confirmed_posts)
|
| 247 |
+
rows = []
|
| 248 |
+
for g in groups_run:
|
| 249 |
+
rows.append(f"""
|
| 250 |
<tr>
|
| 251 |
<td style="padding: 8px; border-bottom: 1px solid #eee;"><a href="{g.link}" target="_blank">{g.link}</a></td>
|
| 252 |
<td style="padding: 8px; border-bottom: 1px solid #eee; text-align: center;">{g.scraped_posts}</td>
|
| 253 |
<td style="padding: 8px; border-bottom: 1px solid #eee; text-align: center;">{g.detected_posts}</td>
|
| 254 |
<td style="padding: 8px; border-bottom: 1px solid #eee;">{"OK" if g.stage == "done" else "ERROR"}</td>
|
| 255 |
+
</tr>""")
|
| 256 |
+
summary_table_html = f"""<h3>Group Summary</h3>
|
| 257 |
+
<table style="width: 100%; border-collapse: collapse; margin-top: 8px; border: 1px solid #ddd;">
|
| 258 |
+
<thead>
|
| 259 |
+
<tr style="background: #0f172a; color: #fff;">
|
| 260 |
+
<th style="text-align: left; padding: 8px;">Group Link</th>
|
| 261 |
+
<th style="text-align: center; padding: 8px;">Posts Scraped</th>
|
| 262 |
+
<th style="text-align: center; padding: 8px;">Confirmed Posts</th>
|
| 263 |
+
<th style="text-align: left; padding: 8px;">Status</th>
|
| 264 |
+
</tr>
|
| 265 |
+
</thead>
|
| 266 |
+
<tbody>
|
| 267 |
+
{''.join(rows)}
|
| 268 |
+
</tbody>
|
| 269 |
+
</table>"""
|
| 270 |
if all_confirmed_posts:
|
| 271 |
+
posts_html = "".join(
|
| 272 |
+
f"""
|
| 273 |
+
<div style="margin-bottom: 25px; padding: 12px; border: 1px solid #ddd; border-radius: 5px; background-color: #fafafa;">
|
| 274 |
+
<h4 style="margin-top: 0; margin-bottom: 8px;">Post ID: {p.get("id", "N/A")} | Urgency: {p.get("ai_analysis", {}).get("urgency_level", "N/A")} | Confidence: {p.get("ai_analysis", {}).get("confidence", "N/A")}</h4>
|
| 275 |
+
<p style="margin: 5px 0;"><strong>Summary:</strong> {html.escape(p.get("ai_analysis", {}).get("medical_summary", "N/A"))}</p>
|
| 276 |
+
<p style="margin: 5px 0;"><strong>Text:</strong></p>
|
| 277 |
+
<pre style="white-space: pre-wrap; background-color: #f0f0f0; padding: 8px; border: 1px solid #eee; border-radius: 3px; font-family: monospace; font-size: 0.9em;">{html.escape(p.get("text", "N/A"))}</pre>
|
| 278 |
+
<p style="margin: 5px 0;"><a href="{p.get("group_link", "#")}" target="_blank">View Group</a></p>
|
| 279 |
+
</div>"""
|
| 280 |
+
for p in all_confirmed_posts
|
| 281 |
+
)
|
| 282 |
+
else:
|
| 283 |
+
posts_html = "<p>No confirmed medical posts were found during this run.</p>"
|
| 284 |
+
return f"""<!DOCTYPE html>
|
| 285 |
+
<html>
|
| 286 |
+
<head><title>Hillside Medical Group - Confirmed Medical Posts Summary</title></head>
|
| 287 |
+
<body style="font-family: Arial, sans-serif; margin: 0; padding: 0; background-color: #f5f5f5;">
|
| 288 |
+
<div style="max-width: 900px; margin: 20px auto; padding: 20px; background-color: #ffffff; border: 1px solid #e0e0e0; border-radius: 8px;">
|
| 289 |
+
<div style="background: #1e3c72; color: #fff; padding: 16px 20px; border-radius: 6px 6px 0 0;">
|
| 290 |
+
<h2 style="margin: 0;">Hillside Medical Group - Confirmed Medical Posts</h2>
|
| 291 |
+
<div style="font-size: 0.9em;">Run completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
|
| 292 |
+
</div>
|
| 293 |
+
<div style="padding: 16px;">
|
| 294 |
+
<p><strong>Overall Summary:</strong> Processed {total_groups} groups, scraped {total_scraped} posts, found {total_confirmed} confirmed medical posts.</p>
|
| 295 |
+
<hr style="margin: 20px 0; border: 0; border-top: 1px solid #eee;">
|
| 296 |
+
{summary_table_html}
|
| 297 |
+
<hr style="margin: 20px 0; border: 0; border-top: 1px solid #eee;">
|
| 298 |
+
<h3>Confirmed Posts Details</h3>
|
| 299 |
+
{posts_html}
|
| 300 |
+
</div>
|
| 301 |
+
<div style="margin-top: 20px; padding: 10px; font-size: 0.8em; color: #666; border-top: 1px solid #eee;">
|
| 302 |
+
<p>This email contains posts identified as potentially seeking personal medical help. Please review and take appropriate action.</p>
|
| 303 |
+
<p><em>Note: The link provided is to the group. Direct post links are not currently extracted.</em></p>
|
| 304 |
+
</div>
|
| 305 |
+
</div>
|
| 306 |
+
</body>
|
| 307 |
+
</html>"""
|
| 308 |
|
| 309 |
state = PipelineState()
|
| 310 |
|
| 311 |
def stream_process_lines(args: List[str], env: Optional[Dict[str, str]] = None, tag: str = "FINAL5") -> int:
|
| 312 |
log(f"Exec: {' '.join(args)}", "info", tag)
|
| 313 |
+
proc = subprocess.Popen(
|
| 314 |
+
args,
|
| 315 |
+
stdout=subprocess.PIPE,
|
| 316 |
+
stderr=subprocess.PIPE,
|
| 317 |
+
text=True,
|
| 318 |
+
bufsize=1,
|
| 319 |
+
universal_newlines=True,
|
| 320 |
+
env=env or os.environ.copy()
|
| 321 |
+
)
|
| 322 |
def pump(pipe, name):
|
| 323 |
for raw in pipe:
|
| 324 |
line = (raw or "").rstrip("\n")
|
| 325 |
+
if not line:
|
| 326 |
+
continue
|
| 327 |
if line.startswith("::"):
|
| 328 |
+
try:
|
| 329 |
+
handle_event_line(line)
|
| 330 |
+
except Exception as e:
|
| 331 |
+
log(f"event parse error: {e}", "error", tag)
|
| 332 |
log(line, "info" if name == "stdout" else "warn", tag)
|
| 333 |
t1 = threading.Thread(target=pump, args=(proc.stdout, "stdout"), daemon=True)
|
| 334 |
t2 = threading.Thread(target=pump, args=(proc.stderr, "stderr"), daemon=True)
|
| 335 |
+
t1.start()
|
| 336 |
+
t2.start()
|
| 337 |
rc = proc.wait()
|
| 338 |
+
t1.join(timeout=0.2)
|
| 339 |
+
t2.join(timeout=0.2)
|
| 340 |
log(f"Exit code: {rc}", "info", tag)
|
| 341 |
return rc
|
| 342 |
|
|
|
|
| 351 |
"--cookies-file", COOKIES_PATH,
|
| 352 |
"--headless"
|
| 353 |
]
|
| 354 |
+
if GEMINI_KEYS:
|
| 355 |
+
args.extend(["--gemini-keys", ",".join(GEMINI_KEYS)])
|
| 356 |
env = os.environ.copy()
|
| 357 |
env["PYTHONUNBUFFERED"] = "1"
|
| 358 |
env["PYTHONIOENCODING"] = "utf-8"
|
| 359 |
+
env.setdefault("HOME", WRITABLE_DIR)
|
| 360 |
+
env.setdefault("WDM_LOCAL", "1")
|
| 361 |
+
env.setdefault("WDM_CACHE_DIR", os.path.join(WRITABLE_DIR, ".wdm"))
|
| 362 |
+
env.setdefault("SE_MANAGER_DRIVER_CACHE", os.path.join(WRITABLE_DIR, "selenium"))
|
| 363 |
+
os.makedirs(env["WDM_CACHE_DIR"], exist_ok=True)
|
| 364 |
+
os.makedirs(env["SE_MANAGER_DRIVER_CACHE"], exist_ok=True)
|
| 365 |
rc = stream_process_lines(args, env=env, tag="FINAL5")
|
| 366 |
return {"ok": rc == 0, "code": rc}
|
| 367 |
|
|
|
|
| 369 |
try:
|
| 370 |
logs.clear()
|
| 371 |
log("Pipeline starting", "info", "ORCHESTRATOR")
|
| 372 |
+
state.running = True
|
| 373 |
+
state.message = "initializing"
|
| 374 |
+
state.progress = 0
|
| 375 |
+
state.recipients = recipients
|
| 376 |
state.groups.clear()
|
| 377 |
links = read_groups(GROUPS_TXT)
|
| 378 |
state.total = len(links)
|
| 379 |
if not links:
|
| 380 |
log("No groups found in groups.txt", "warn", "ORCHESTRATOR")
|
| 381 |
+
state.message = "No groups"
|
| 382 |
+
state.running = False
|
| 383 |
return
|
| 384 |
all_confirmed_posts = []
|
| 385 |
for i, link in enumerate(links, start=1):
|
| 386 |
reset_live_state(link)
|
| 387 |
g = GroupRun(link=link, stage="running")
|
| 388 |
state.groups.append(g)
|
| 389 |
+
state.current = i
|
| 390 |
+
state.message = f"Processing {link}"
|
| 391 |
+
state.progress = int(((i - 1) / max(1, state.total)) * 100)
|
| 392 |
log(f"[{i}/{state.total}] Processing group: {link}", "info", "ORCHESTRATOR")
|
| 393 |
slug = slugify(link)
|
| 394 |
+
out_json = os.path.join(SCRAPE_OUTDIR, f"{slug}.json")
|
| 395 |
+
analysis_json = os.path.join(ANALYSIS_OUTDIR, f"analysis_{slug}.json")
|
| 396 |
+
g.scraped_json = out_json
|
| 397 |
+
g.analysis_json = analysis_json
|
| 398 |
result = call_final5_for_group(link, out_json, analysis_json, recipients)
|
| 399 |
if not result.get("ok"):
|
| 400 |
+
g.stage = "error"
|
| 401 |
+
g.error = f"final5 exit code {result.get('code')}"
|
| 402 |
log(f"final5 failed for {link}: code {result.get('code')}", "error", "ORCHESTRATOR")
|
| 403 |
else:
|
| 404 |
try:
|
| 405 |
if os.path.exists(out_json):
|
| 406 |
+
with open(out_json, "r", encoding="utf-8") as f:
|
| 407 |
+
g.scraped_posts = len(json.load(f))
|
| 408 |
if os.path.exists(analysis_json):
|
| 409 |
+
with open(analysis_json, "r", encoding="utf-8") as f:
|
| 410 |
+
a = json.load(f)
|
| 411 |
g.detected_posts = a.get("confirmed_medical", 0)
|
| 412 |
g.emails_sent_by_final5 = a.get("emails_sent", 0)
|
| 413 |
confirmed_posts = a.get("posts", [])
|
| 414 |
for post in confirmed_posts:
|
| 415 |
+
if "group_link" not in post:
|
| 416 |
+
post["group_link"] = link
|
| 417 |
all_confirmed_posts.extend(confirmed_posts)
|
| 418 |
g.stage = "done"
|
| 419 |
log(f"Group done: scraped={g.scraped_posts}, confirmed={g.detected_posts}", "info", "ORCHESTRATOR")
|
| 420 |
except Exception as e:
|
| 421 |
+
g.stage = "error"
|
| 422 |
+
g.error = f"parse_error: {e}"
|
| 423 |
log(f"Parsing outputs failed for {link}: {e}", "error", "ORCHESTRATOR")
|
| 424 |
state.progress = int((i / max(1, state.total)) * 100)
|
| 425 |
try:
|
|
|
|
| 431 |
log(f"Error building or sending consolidated email: {e}", "error", "ORCHESTRATOR")
|
| 432 |
summary = {"run_date": datetime.now().isoformat(), "groups": [g.__dict__ for g in state.groups]}
|
| 433 |
summary_path = os.path.join(ANALYSIS_OUTDIR, "analysis_summary.json")
|
| 434 |
+
with open(summary_path, "w", encoding="utf-8") as f:
|
| 435 |
+
json.dump(summary, f, ensure_ascii=False, indent=2)
|
| 436 |
+
state.summary_path = summary_path
|
| 437 |
+
state.message = "All groups processed"
|
| 438 |
+
state.progress = 100
|
| 439 |
+
state.running = False
|
| 440 |
log("Pipeline finished", "info", "ORCHESTRATOR")
|
| 441 |
except Exception as e:
|
| 442 |
+
state.message = f"pipeline_error: {e}"
|
| 443 |
+
state.running = False
|
| 444 |
log(f"Pipeline error: {e}\n{traceback.format_exc()}", "error", "ORCHESTRATOR")
|
| 445 |
|
| 446 |
@app.route("/")
|
| 447 |
def index():
|
| 448 |
+
return send_from_directory(".", "index.html")
|
| 449 |
|
| 450 |
@app.get("/api/system/status")
|
| 451 |
def system_status():
|
| 452 |
return jsonify({
|
| 453 |
+
"gmail": gmail_service is not None,
|
| 454 |
+
"groups_file_exists": os.path.exists(GROUPS_TXT),
|
| 455 |
+
"groups_count": len(read_groups(GROUPS_TXT)),
|
| 456 |
+
"scrape_outdir": SCRAPE_OUTDIR,
|
| 457 |
+
"analysis_outdir": ANALYSIS_OUTDIR,
|
| 458 |
+
"sender_email": SENDER_EMAIL,
|
| 459 |
+
"final5_exists": os.path.exists(FINAL5_PATH),
|
| 460 |
+
"gemini_keys_count": len(GEMINI_KEYS)
|
| 461 |
})
|
| 462 |
|
| 463 |
@app.get("/api/groups")
|
|
|
|
| 466 |
|
| 467 |
@app.post("/api/process/start")
|
| 468 |
def api_process_start():
|
| 469 |
+
if state.running:
|
| 470 |
+
return jsonify({"success": False, "message": "Already running"}), 409
|
| 471 |
data = request.json or {}
|
| 472 |
recips = data.get("recipients") or [SENDER_EMAIL]
|
| 473 |
+
if isinstance(recips, str):
|
| 474 |
+
recips = [e.strip() for e in recips.split(",") if e.strip()]
|
| 475 |
threading.Thread(target=run_pipeline, args=(recips,), daemon=True).start()
|
| 476 |
log(f"Start requested by client; recipients={recips}", "info", "API")
|
| 477 |
return jsonify({"success": True, "message": "Pipeline started", "recipients": recips})
|
| 478 |
|
| 479 |
@app.get("/api/process/status")
|
| 480 |
def api_process_status():
|
| 481 |
+
return jsonify({
|
| 482 |
+
"running": state.running,
|
| 483 |
+
"message": state.message,
|
| 484 |
+
"progress": state.progress,
|
| 485 |
+
"current": state.current,
|
| 486 |
+
"total": state.total,
|
| 487 |
+
"groups": [g.__dict__ for g in state.groups]
|
| 488 |
+
})
|
| 489 |
|
| 490 |
@app.get("/api/process/logs")
|
| 491 |
def api_process_logs():
|
|
|
|
| 500 |
|
| 501 |
@app.get("/api/live/state")
|
| 502 |
def api_live_state():
|
| 503 |
+
with live_lock:
|
| 504 |
+
return jsonify({"success": True, "data": live_state})
|
| 505 |
|
| 506 |
@app.get("/api/results/summary")
|
| 507 |
def api_results_summary():
|
| 508 |
p = state.summary_path or os.path.join(ANALYSIS_OUTDIR, "analysis_summary.json")
|
| 509 |
+
if not os.path.exists(p):
|
| 510 |
+
return jsonify({"success": False, "message": "No summary yet"}), 404
|
| 511 |
+
with open(p, "r", encoding="utf-8") as f:
|
| 512 |
+
return jsonify({"success": True, "data": json.load(f)})
|
| 513 |
|
| 514 |
@app.get("/api/recipients")
|
| 515 |
def api_get_recipients():
|
| 516 |
recipients_path = "recipients.json"
|
| 517 |
+
if not os.path.exists(recipients_path):
|
| 518 |
+
return jsonify({"success": False, "message": "recipients.json not found"}), 404
|
| 519 |
try:
|
| 520 |
+
with open(recipients_path, "r", encoding="utf-8") as f:
|
| 521 |
+
data = json.load(f)
|
| 522 |
+
if not isinstance(data, list):
|
| 523 |
+
return jsonify({"success": False, "message": "Invalid format"}), 500
|
| 524 |
return jsonify({"success": True, "data": data})
|
| 525 |
except Exception as e:
|
| 526 |
return jsonify({"success": False, "message": f"Error reading file: {str(e)}"}), 500
|
| 527 |
|
| 528 |
if __name__ == "__main__":
|
| 529 |
port = int(os.environ.get("PORT", 7860))
|
| 530 |
+
app.run(host="0.0.0.0", port=port)
|
final5.py
CHANGED
|
@@ -1,5 +1,13 @@
|
|
| 1 |
-
|
| 2 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from typing import List, Dict, Any, Tuple
|
| 4 |
from datetime import datetime
|
| 5 |
import tempfile
|
|
@@ -11,16 +19,16 @@ except Exception:
|
|
| 11 |
pass
|
| 12 |
|
| 13 |
from selenium import webdriver
|
| 14 |
-
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 15 |
from selenium.webdriver.common.by import By
|
|
|
|
| 16 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 17 |
from selenium.webdriver.support import expected_conditions as EC
|
| 18 |
-
from selenium.common.exceptions import
|
| 19 |
-
|
| 20 |
-
)
|
| 21 |
from google.oauth2 import service_account
|
| 22 |
from googleapiclient.discovery import build
|
| 23 |
from googleapiclient.errors import HttpError
|
|
|
|
| 24 |
import google.generativeai as genai
|
| 25 |
from google.api_core.exceptions import ResourceExhausted
|
| 26 |
|
|
@@ -35,20 +43,18 @@ def get_args():
|
|
| 35 |
p.add_argument("--recipients", default="")
|
| 36 |
p.add_argument("--sender", default=os.environ.get("SENDER_EMAIL", ""))
|
| 37 |
p.add_argument("--cookies-file", default=os.path.join(WRITABLE_DIR, "facebook_cookies.pkl"))
|
| 38 |
-
p.add_argument("--max-scrolls", type=int, default=int(os.environ.get("MAX_SCROLLS","5")))
|
| 39 |
-
p.add_argument("--scroll-pause", type=float, default=float(os.environ.get("SCROLL_PAUSE","3")))
|
| 40 |
p.add_argument("--gemini-keys", default="")
|
| 41 |
-
p.add_argument("--headless", action="store_true"
|
| 42 |
return p.parse_args()
|
| 43 |
|
| 44 |
def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
|
| 45 |
options = webdriver.ChromeOptions()
|
| 46 |
user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
|
| 47 |
-
|
| 48 |
-
options.binary_location = "/usr/bin/google-chrome"
|
| 49 |
-
|
| 50 |
options.add_argument(f"--user-data-dir={user_data_dir}")
|
| 51 |
-
|
|
|
|
| 52 |
options.add_argument("--no-sandbox")
|
| 53 |
options.add_argument("--disable-dev-shm-usage")
|
| 54 |
options.add_argument("--disable-gpu")
|
|
@@ -57,27 +63,34 @@ def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
|
|
| 57 |
options.add_argument("--disable-extensions")
|
| 58 |
options.add_argument("--remote-debugging-port=9222")
|
| 59 |
options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
driver = webdriver.Chrome(service=service, options=options)
|
| 64 |
-
|
| 65 |
-
print("[SELENIUM] WebDriver session created successfully using system binaries.")
|
| 66 |
return driver, user_data_dir
|
| 67 |
|
| 68 |
def build_gmail_service():
|
| 69 |
if os.path.exists(SERVICE_ACCOUNT_FILE):
|
| 70 |
try:
|
| 71 |
sender_email = os.environ.get("SENDER_EMAIL")
|
| 72 |
-
if not sender_email:
|
|
|
|
| 73 |
credentials = service_account.Credentials.from_service_account_file(
|
| 74 |
-
SERVICE_ACCOUNT_FILE,
|
|
|
|
|
|
|
| 75 |
return build("gmail", "v1", credentials=credentials)
|
| 76 |
except Exception as e:
|
| 77 |
print(f"[GMAIL] Auth failed in final5.py: {e}")
|
| 78 |
return None
|
| 79 |
|
| 80 |
GEMINI_MODEL = "gemini-1.5-flash"
|
|
|
|
| 81 |
class GeminiManager:
|
| 82 |
def __init__(self, api_keys: List[str]):
|
| 83 |
self.api_keys = api_keys
|
|
@@ -122,9 +135,19 @@ class GeminiManager:
|
|
| 122 |
else:
|
| 123 |
raise e
|
| 124 |
|
| 125 |
-
def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str,Any]:
|
| 126 |
-
fallback = {
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
keywords_str = ", ".join(found_keywords) if found_keywords else "none"
|
| 129 |
prompt = f"""Analyze this social post to determine if the author is seeking medical help for a personal health need.
|
| 130 |
KEYWORDS: {keywords_str}
|
|
@@ -135,19 +158,25 @@ RULES:
|
|
| 135 |
Post: "{post_text}"
|
| 136 |
Return ONLY JSON:
|
| 137 |
{{
|
| 138 |
-
"is_medical_seeking": true/false,
|
| 139 |
-
"
|
| 140 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
}}"""
|
| 142 |
for _ in range(2):
|
| 143 |
try:
|
| 144 |
resp = gemini_manager.generate_content(prompt)
|
| 145 |
-
txt = (resp
|
| 146 |
s, e = txt.find("{"), txt.rfind("}") + 1
|
| 147 |
if s >= 0 and e > s:
|
| 148 |
result = json.loads(txt[s:e])
|
| 149 |
result["is_medical_seeking"] = bool(result.get("is_medical_seeking", False))
|
| 150 |
-
if "matched_keywords" not in result:
|
|
|
|
| 151 |
return result
|
| 152 |
return fallback
|
| 153 |
except Exception as e:
|
|
@@ -155,7 +184,14 @@ Return ONLY JSON:
|
|
| 155 |
gemini_manager.rotate_key()
|
| 156 |
return fallback
|
| 157 |
|
| 158 |
-
MEDICAL_KEYWORDS = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
def contains_keywords(text: str) -> Tuple[bool, List[str]]:
|
| 161 |
tl = (text or "").lower()
|
|
@@ -166,22 +202,20 @@ def load_cookies(driver, cookies_file: str):
|
|
| 166 |
print("[FB] Navigating to Facebook homepage to load cookies...")
|
| 167 |
driver.get("https://www.facebook.com")
|
| 168 |
time.sleep(2)
|
| 169 |
-
|
| 170 |
if not os.path.exists(cookies_file):
|
| 171 |
raise RuntimeError(f"[FB] FATAL: Cookies file not found at {cookies_file}")
|
| 172 |
-
|
| 173 |
with open(cookies_file, "rb") as f:
|
| 174 |
cookies = pickle.load(f)
|
| 175 |
-
|
| 176 |
for cookie in cookies:
|
| 177 |
-
if "sameSite" in cookie and cookie["sameSite"] not in ["Strict","Lax","None"]:
|
| 178 |
cookie["sameSite"] = "Lax"
|
| 179 |
-
|
| 180 |
-
|
|
|
|
|
|
|
| 181 |
print("[FB] All cookies loaded. Refreshing page to apply session...")
|
| 182 |
driver.refresh()
|
| 183 |
time.sleep(5)
|
| 184 |
-
|
| 185 |
if "log in" in driver.title.lower():
|
| 186 |
print(f"[FB] WARNING: Login may have failed. Page title is: '{driver.title}'")
|
| 187 |
else:
|
|
@@ -204,14 +238,15 @@ def scrape_group(driver, wait, group_url: str, max_scrolls: int, pause: float):
|
|
| 204 |
print(f"[SCRAPE] --- Scroll {s+1}/{max_scrolls} ---")
|
| 205 |
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
| 206 |
time.sleep(pause)
|
| 207 |
-
|
| 208 |
divs = driver.find_elements(By.XPATH, "//div[@role='article']")
|
| 209 |
added_this_scroll = 0
|
| 210 |
for d in divs:
|
| 211 |
try:
|
| 212 |
txt = (d.text or "").strip()
|
| 213 |
-
if len(txt) < 25 or txt in seen:
|
| 214 |
-
|
|
|
|
|
|
|
| 215 |
seen.add(txt)
|
| 216 |
posts.append({"id": len(posts) + 1, "text": txt, "group_link": group_url})
|
| 217 |
added_this_scroll += 1
|
|
@@ -227,7 +262,7 @@ def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int
|
|
| 227 |
posts = []
|
| 228 |
try:
|
| 229 |
driver, user_data_dir = new_driver(headless=True)
|
| 230 |
-
wait = WebDriverWait(driver,
|
| 231 |
load_cookies(driver, cookies_file)
|
| 232 |
posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
|
| 233 |
except Exception as e:
|
|
@@ -235,8 +270,10 @@ def try_scrape_with_fallback(group_url: str, cookies_file: str, max_scrolls: int
|
|
| 235 |
raise
|
| 236 |
finally:
|
| 237 |
if driver:
|
| 238 |
-
try:
|
| 239 |
-
|
|
|
|
|
|
|
| 240 |
if user_data_dir and os.path.exists(user_data_dir):
|
| 241 |
try:
|
| 242 |
shutil.rmtree(user_data_dir, ignore_errors=True)
|
|
@@ -249,42 +286,39 @@ def main():
|
|
| 249 |
args = get_args()
|
| 250 |
os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
|
| 251 |
os.makedirs(os.path.dirname(args.analysis_out) or ".", exist_ok=True)
|
| 252 |
-
|
| 253 |
gemini_keys = [k.strip() for k in args.gemini_keys.split(",") if k.strip()] if args.gemini_keys else []
|
| 254 |
gemini_manager = GeminiManager(gemini_keys)
|
| 255 |
-
|
| 256 |
-
posts = try_scrape_with_fallback(args.group, args.cookies_file, args.max_scrolls, args.scroll_pause)
|
| 257 |
-
|
| 258 |
with open(args.out, "w", encoding="utf-8") as f:
|
| 259 |
json.dump(posts, f, ensure_ascii=False, indent=2)
|
| 260 |
print(f"[SCRAPE] Saved {len(posts)} scraped posts to {args.out}")
|
| 261 |
print(f"::SCRAPE_SAVED::{args.out}")
|
| 262 |
-
|
| 263 |
keyword_hits, confirmed = [], []
|
| 264 |
for p in posts:
|
| 265 |
-
has, hits = contains_keywords(p.get("text",""))
|
| 266 |
if has:
|
| 267 |
p["found_keywords"] = hits
|
| 268 |
keyword_hits.append(p)
|
| 269 |
print(f"::KW_HIT::{json.dumps({'id': p['id'], 'found_keywords': hits}, ensure_ascii=False)}")
|
| 270 |
-
|
| 271 |
per_call_sleep = 5
|
| 272 |
for idx, p in enumerate(keyword_hits, start=1):
|
| 273 |
found_kws = p.get("found_keywords", [])
|
| 274 |
-
ai = ai_medical_intent(gemini_manager, p.get("text",""), found_kws)
|
| 275 |
p["ai_analysis"] = ai
|
| 276 |
print(f"::AI_RESULT::{json.dumps({'id': p['id'], 'ai': ai}, ensure_ascii=False)}")
|
| 277 |
if ai.get("is_medical_seeking"):
|
| 278 |
confirmed.append(p)
|
| 279 |
if idx < len(keyword_hits):
|
| 280 |
time.sleep(per_call_sleep)
|
| 281 |
-
|
| 282 |
report = {
|
| 283 |
-
"analysis_date": datetime.now().isoformat(),
|
| 284 |
-
"
|
| 285 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
}
|
| 287 |
-
|
| 288 |
with open(args.analysis_out, "w", encoding="utf-8") as f:
|
| 289 |
json.dump(report, f, ensure_ascii=False, indent=2)
|
| 290 |
print(f"[ANALYSIS] Saved analysis to {args.analysis_out}")
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import re
|
| 3 |
+
import sys
|
| 4 |
+
import time
|
| 5 |
+
import json
|
| 6 |
+
import base64
|
| 7 |
+
import pickle
|
| 8 |
+
import argparse
|
| 9 |
+
import traceback
|
| 10 |
+
import shutil
|
| 11 |
from typing import List, Dict, Any, Tuple
|
| 12 |
from datetime import datetime
|
| 13 |
import tempfile
|
|
|
|
| 19 |
pass
|
| 20 |
|
| 21 |
from selenium import webdriver
|
|
|
|
| 22 |
from selenium.webdriver.common.by import By
|
| 23 |
+
from selenium.webdriver.chrome.service import Service as ChromeService
|
| 24 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 25 |
from selenium.webdriver.support import expected_conditions as EC
|
| 26 |
+
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException, TimeoutException
|
| 27 |
+
|
|
|
|
| 28 |
from google.oauth2 import service_account
|
| 29 |
from googleapiclient.discovery import build
|
| 30 |
from googleapiclient.errors import HttpError
|
| 31 |
+
|
| 32 |
import google.generativeai as genai
|
| 33 |
from google.api_core.exceptions import ResourceExhausted
|
| 34 |
|
|
|
|
| 43 |
p.add_argument("--recipients", default="")
|
| 44 |
p.add_argument("--sender", default=os.environ.get("SENDER_EMAIL", ""))
|
| 45 |
p.add_argument("--cookies-file", default=os.path.join(WRITABLE_DIR, "facebook_cookies.pkl"))
|
| 46 |
+
p.add_argument("--max-scrolls", type=int, default=int(os.environ.get("MAX_SCROLLS", "5")))
|
| 47 |
+
p.add_argument("--scroll-pause", type=float, default=float(os.environ.get("SCROLL_PAUSE", "3")))
|
| 48 |
p.add_argument("--gemini-keys", default="")
|
| 49 |
+
p.add_argument("--headless", action="store_true")
|
| 50 |
return p.parse_args()
|
| 51 |
|
| 52 |
def new_driver(headless: bool) -> Tuple[webdriver.Chrome, str]:
|
| 53 |
options = webdriver.ChromeOptions()
|
| 54 |
user_data_dir = tempfile.mkdtemp(prefix="chrome_user_data_", dir=WRITABLE_DIR)
|
|
|
|
|
|
|
|
|
|
| 55 |
options.add_argument(f"--user-data-dir={user_data_dir}")
|
| 56 |
+
if headless:
|
| 57 |
+
options.add_argument("--headless=new")
|
| 58 |
options.add_argument("--no-sandbox")
|
| 59 |
options.add_argument("--disable-dev-shm-usage")
|
| 60 |
options.add_argument("--disable-gpu")
|
|
|
|
| 63 |
options.add_argument("--disable-extensions")
|
| 64 |
options.add_argument("--remote-debugging-port=9222")
|
| 65 |
options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
|
| 66 |
+
os.environ.setdefault("HOME", WRITABLE_DIR)
|
| 67 |
+
os.environ.setdefault("WDM_LOCAL", "1")
|
| 68 |
+
os.environ.setdefault("WDM_CACHE_DIR", os.path.join(WRITABLE_DIR, ".wdm"))
|
| 69 |
+
os.environ.setdefault("SE_MANAGER_DRIVER_CACHE", os.path.join(WRITABLE_DIR, "selenium"))
|
| 70 |
+
os.makedirs(os.environ["WDM_CACHE_DIR"], exist_ok=True)
|
| 71 |
+
os.makedirs(os.environ["SE_MANAGER_DRIVER_CACHE"], exist_ok=True)
|
| 72 |
+
service = ChromeService()
|
| 73 |
driver = webdriver.Chrome(service=service, options=options)
|
| 74 |
+
print("[SELENIUM] WebDriver session created successfully.")
|
|
|
|
| 75 |
return driver, user_data_dir
|
| 76 |
|
| 77 |
def build_gmail_service():
    """Build a Gmail API client using a domain-delegated service account.

    Returns None when the service-account file is missing, SENDER_EMAIL is
    unset, or authentication fails (the error is printed, never raised).
    """
    if not os.path.exists(SERVICE_ACCOUNT_FILE):
        return None
    try:
        sender_email = os.environ.get("SENDER_EMAIL")
        if not sender_email:
            # with_subject() needs a mailbox to impersonate; bail out early.
            return None
        creds = service_account.Credentials.from_service_account_file(
            SERVICE_ACCOUNT_FILE,
            scopes=["https://www.googleapis.com/auth/gmail.send"],
        )
        delegated = creds.with_subject(sender_email)
        return build("gmail", "v1", credentials=delegated)
    except Exception as e:
        print(f"[GMAIL] Auth failed in final5.py: {e}")
    return None
|
| 91 |
|
| 92 |
GEMINI_MODEL = "gemini-1.5-flash"
|
| 93 |
+
|
| 94 |
class GeminiManager:
|
| 95 |
def __init__(self, api_keys: List[str]):
|
| 96 |
self.api_keys = api_keys
|
|
|
|
| 135 |
else:
|
| 136 |
raise e
|
| 137 |
|
| 138 |
+
def ai_medical_intent(gemini_manager: GeminiManager, post_text: str, found_keywords: List[str]) -> Dict[str, Any]:
|
| 139 |
+
fallback = {
|
| 140 |
+
"is_medical_seeking": False,
|
| 141 |
+
"confidence": "low",
|
| 142 |
+
"medical_summary": "AI unavailable",
|
| 143 |
+
"suggested_services": [],
|
| 144 |
+
"urgency_level": "low",
|
| 145 |
+
"analysis": "Fallback",
|
| 146 |
+
"reasoning": "AI error",
|
| 147 |
+
"matched_keywords": found_keywords
|
| 148 |
+
}
|
| 149 |
+
if not gemini_manager or not gemini_manager.is_available():
|
| 150 |
+
return fallback
|
| 151 |
keywords_str = ", ".join(found_keywords) if found_keywords else "none"
|
| 152 |
prompt = f"""Analyze this social post to determine if the author is seeking medical help for a personal health need.
|
| 153 |
KEYWORDS: {keywords_str}
|
|
|
|
| 158 |
Post: "{post_text}"
|
| 159 |
Return ONLY JSON:
|
| 160 |
{{
|
| 161 |
+
"is_medical_seeking": true/false,
|
| 162 |
+
"confidence": "high/medium/low",
|
| 163 |
+
"medical_summary": "short summary",
|
| 164 |
+
"suggested_services": ["service1","service2"],
|
| 165 |
+
"urgency_level": "high/medium/low",
|
| 166 |
+
"analysis": "why it's seeking help",
|
| 167 |
+
"reasoning": "short explanation",
|
| 168 |
+
"matched_keywords": ["keyword1"]
|
| 169 |
}}"""
|
| 170 |
for _ in range(2):
|
| 171 |
try:
|
| 172 |
resp = gemini_manager.generate_content(prompt)
|
| 173 |
+
txt = (getattr(resp, "text", "") or "").strip()
|
| 174 |
s, e = txt.find("{"), txt.rfind("}") + 1
|
| 175 |
if s >= 0 and e > s:
|
| 176 |
result = json.loads(txt[s:e])
|
| 177 |
result["is_medical_seeking"] = bool(result.get("is_medical_seeking", False))
|
| 178 |
+
if "matched_keywords" not in result:
|
| 179 |
+
result["matched_keywords"] = found_keywords
|
| 180 |
return result
|
| 181 |
return fallback
|
| 182 |
except Exception as e:
|
|
|
|
| 184 |
gemini_manager.rotate_key()
|
| 185 |
return fallback
|
| 186 |
|
| 187 |
+
# Lower-cased trigger phrases for the keyword pre-filter; presumably matched
# as substrings of the lower-cased post text by contains_keywords() — verify
# there, its body is defined below.
MEDICAL_KEYWORDS = [
    "doctor", "physician", "primary care", "healthcare", "medical", "clinic",
    "hospital", "urgent care", "emergency", "er", "specialist", "pediatrician",
    "dentist", "gynecologist", "obgyn", "women's health", "health center",
    "family doctor", "maternity", "prenatal", "postnatal", "labor", "delivery",
    "need doctor", "looking for doctor", "find doctor", "recommend doctor",
    "medical help", "health help", "appointment", "checkup", "treatment",
    "prescription", "medicine", "surgery", "best hospital", "best clinic",
    "where to go", "doctor recommendation", "pregnancy", "birth control",
    "contraception", "fertility", "hillside", "medical group", "wellness center",
]
|
| 195 |
|
| 196 |
def contains_keywords(text: str) -> Tuple[bool, List[str]]:
|
| 197 |
tl = (text or "").lower()
|
|
|
|
| 202 |
print("[FB] Navigating to Facebook homepage to load cookies...")
|
| 203 |
driver.get("https://www.facebook.com")
|
| 204 |
time.sleep(2)
|
|
|
|
| 205 |
if not os.path.exists(cookies_file):
|
| 206 |
raise RuntimeError(f"[FB] FATAL: Cookies file not found at {cookies_file}")
|
|
|
|
| 207 |
with open(cookies_file, "rb") as f:
|
| 208 |
cookies = pickle.load(f)
|
|
|
|
| 209 |
for cookie in cookies:
|
| 210 |
+
if "sameSite" in cookie and cookie["sameSite"] not in ["Strict", "Lax", "None"]:
|
| 211 |
cookie["sameSite"] = "Lax"
|
| 212 |
+
try:
|
| 213 |
+
driver.add_cookie(cookie)
|
| 214 |
+
except Exception:
|
| 215 |
+
pass
|
| 216 |
print("[FB] All cookies loaded. Refreshing page to apply session...")
|
| 217 |
driver.refresh()
|
| 218 |
time.sleep(5)
|
|
|
|
| 219 |
if "log in" in driver.title.lower():
|
| 220 |
print(f"[FB] WARNING: Login may have failed. Page title is: '{driver.title}'")
|
| 221 |
else:
|
|
|
|
| 238 |
print(f"[SCRAPE] --- Scroll {s+1}/{max_scrolls} ---")
|
| 239 |
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
|
| 240 |
time.sleep(pause)
|
|
|
|
| 241 |
divs = driver.find_elements(By.XPATH, "//div[@role='article']")
|
| 242 |
added_this_scroll = 0
|
| 243 |
for d in divs:
|
| 244 |
try:
|
| 245 |
txt = (d.text or "").strip()
|
| 246 |
+
if len(txt) < 25 or txt in seen:
|
| 247 |
+
continue
|
| 248 |
+
if any(ui in txt for ui in ["Comment Share", "Write a comment...", "View more comments"]):
|
| 249 |
+
continue
|
| 250 |
seen.add(txt)
|
| 251 |
posts.append({"id": len(posts) + 1, "text": txt, "group_link": group_url})
|
| 252 |
added_this_scroll += 1
|
|
|
|
| 262 |
posts = []
|
| 263 |
try:
|
| 264 |
driver, user_data_dir = new_driver(headless=True)
|
| 265 |
+
wait = WebDriverWait(driver, 30)
|
| 266 |
load_cookies(driver, cookies_file)
|
| 267 |
posts = scrape_group(driver, wait, group_url, max_scrolls, pause)
|
| 268 |
except Exception as e:
|
|
|
|
| 270 |
raise
|
| 271 |
finally:
|
| 272 |
if driver:
|
| 273 |
+
try:
|
| 274 |
+
driver.quit()
|
| 275 |
+
except Exception:
|
| 276 |
+
pass
|
| 277 |
if user_data_dir and os.path.exists(user_data_dir):
|
| 278 |
try:
|
| 279 |
shutil.rmtree(user_data_dir, ignore_errors=True)
|
|
|
|
| 286 |
args = get_args()
|
| 287 |
os.makedirs(os.path.dirname(args.out) or ".", exist_ok=True)
|
| 288 |
os.makedirs(os.path.dirname(args.analysis_out) or ".", exist_ok=True)
|
|
|
|
| 289 |
gemini_keys = [k.strip() for k in args.gemini_keys.split(",") if k.strip()] if args.gemini_keys else []
|
| 290 |
gemini_manager = GeminiManager(gemini_keys)
|
| 291 |
+
posts = try_scrape_with_fallback(args.group.strip(), args.cookies_file, args.max_scrolls, args.scroll_pause)
|
|
|
|
|
|
|
| 292 |
with open(args.out, "w", encoding="utf-8") as f:
|
| 293 |
json.dump(posts, f, ensure_ascii=False, indent=2)
|
| 294 |
print(f"[SCRAPE] Saved {len(posts)} scraped posts to {args.out}")
|
| 295 |
print(f"::SCRAPE_SAVED::{args.out}")
|
|
|
|
| 296 |
keyword_hits, confirmed = [], []
|
| 297 |
for p in posts:
|
| 298 |
+
has, hits = contains_keywords(p.get("text", ""))
|
| 299 |
if has:
|
| 300 |
p["found_keywords"] = hits
|
| 301 |
keyword_hits.append(p)
|
| 302 |
print(f"::KW_HIT::{json.dumps({'id': p['id'], 'found_keywords': hits}, ensure_ascii=False)}")
|
|
|
|
| 303 |
per_call_sleep = 5
|
| 304 |
for idx, p in enumerate(keyword_hits, start=1):
|
| 305 |
found_kws = p.get("found_keywords", [])
|
| 306 |
+
ai = ai_medical_intent(gemini_manager, p.get("text", ""), found_kws)
|
| 307 |
p["ai_analysis"] = ai
|
| 308 |
print(f"::AI_RESULT::{json.dumps({'id': p['id'], 'ai': ai}, ensure_ascii=False)}")
|
| 309 |
if ai.get("is_medical_seeking"):
|
| 310 |
confirmed.append(p)
|
| 311 |
if idx < len(keyword_hits):
|
| 312 |
time.sleep(per_call_sleep)
|
|
|
|
| 313 |
report = {
|
| 314 |
+
"analysis_date": datetime.now().isoformat(),
|
| 315 |
+
"group_link": args.group,
|
| 316 |
+
"total_posts": len(posts),
|
| 317 |
+
"keyword_hits": len(keyword_hits),
|
| 318 |
+
"confirmed_medical": len(confirmed),
|
| 319 |
+
"emails_sent": 0,
|
| 320 |
+
"posts": confirmed
|
| 321 |
}
|
|
|
|
| 322 |
with open(args.analysis_out, "w", encoding="utf-8") as f:
|
| 323 |
json.dump(report, f, ensure_ascii=False, indent=2)
|
| 324 |
print(f"[ANALYSIS] Saved analysis to {args.analysis_out}")
|