ADM-Purchasing-Tools

Running

App Files Community

abdulsalam2121 committed on May 21

Commit

2cd7bd9

1 Parent(s): 38a89cd

Enhance error handling and logging during authentication and navigation processes

Browse files

Files changed (6) hide show

app/auth_handler.py +24 -0
app/bot.py +4 -2
app/browser_session.py +54 -47
app/listing_scraper.py +46 -121
app/product_scraper.py +25 -89
app/services/run_bot.py +12 -4

app/auth_handler.py CHANGED Viewed

@@ -162,6 +162,25 @@ class AuthHandler:
         except Exception:
             return False
     def login(self) -> bool:
         page = self.page
         logger.info("Attempting login")
@@ -236,6 +255,11 @@ class AuthHandler:
         deadline = time.time() + 20
         while time.time() < deadline:
             self.handle_age_gate()
             if self.is_logged_in():
                 logger.info("Login successful")

         except Exception:
             return False
+    def _login_failed_visible(self) -> bool:
+        page = self.page
+        try:
+            body_text = page.locator("body").inner_text(timeout=1500).lower()
+        except Exception:
+            try:
+                body_text = page.content().lower()
+            except Exception:
+                return False
+        failure_markers = [
+            "login failed",
+            "unable to authenticate",
+            "please try your login again",
+            "invalid username",
+            "incorrect username or password",
+        ]
+        return any(marker in body_text for marker in failure_markers)
     def login(self) -> bool:
         page = self.page
         logger.info("Attempting login")
         deadline = time.time() + 20
         while time.time() < deadline:
+            if self._login_failed_visible():
+                logger.error("Login failed page detected")
+                self.session.screenshot("login_failed")
+                return False
             self.handle_age_gate()
             if self.is_logged_in():
                 logger.info("Login successful")

app/bot.py CHANGED Viewed

@@ -88,12 +88,14 @@ class BotRunner:
                 studio_url = navigator._ensure_price_sort(studio_url)
                 self._log(f"Direct studio URL detected, navigating directly: {studio_url}")
                 if not navigator.navigate_to_studio_url(studio_url):
-                    raise RuntimeError("Could not open studio page")
             else:
                 self._log(f"Studio name detected, searching directory: {studio_input}")
                 studio_url = navigator.find_studio_by_name(studio_input)
                 if not studio_url:
-                    raise RuntimeError(f"Could not find studio: {studio_input}")
             self._log("Phase 3 ▶ Listing scan")
             listing = ListingScraper(

                 studio_url = navigator._ensure_price_sort(studio_url)
                 self._log(f"Direct studio URL detected, navigating directly: {studio_url}")
                 if not navigator.navigate_to_studio_url(studio_url):
+                    self._log("Could not open studio page")
+                    return
             else:
                 self._log(f"Studio name detected, searching directory: {studio_input}")
                 studio_url = navigator.find_studio_by_name(studio_input)
                 if not studio_url:
+                    self._log(f"Could not find studio: {studio_input}")
+                    return
             self._log("Phase 3 ▶ Listing scan")
             listing = ListingScraper(

app/browser_session.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import logging
 from pathlib import Path
 from playwright.sync_api import sync_playwright, Browser, BrowserContext, Page
 from config import HOME_URL
@@ -22,6 +23,17 @@ class BrowserSession:
         self._context: BrowserContext = None
         self.page: Page = None
     def start(self):
         self.state_path.parent.mkdir(exist_ok=True)
         self.logs_dir.mkdir(exist_ok=True, parents=True)
@@ -75,60 +87,55 @@ class BrowserSession:
         logger.info("Browser session closed")
     def goto(self, url: str, wait_until: str = "domcontentloaded", timeout_override: int = None) -> bool:
-        try:
-            # Use override timeout or default
-            nav_timeout = timeout_override if timeout_override else self.timeout
-            response = self.page.goto(url, wait_until=wait_until, timeout=nav_timeout)
-            # If we have a response object, check HTTP status
-            if response is not None:
-                status = response.status
-                if status >= 400:
-                    logger.error(f"Navigation returned HTTP {status} for {url}")
-                    # Save context for debugging
-                    try:
-                        self.screenshot(f"http_{status}")
-                        html = self.page.content()
-                        Path("logs").mkdir(exist_ok=True)
-                        Path(f"logs/http_{status}.html").write_text(html, encoding="utf-8")
-                    except Exception:
-                        pass
-                    return False
-            try:
-                # Update the latest screenshot after successful navigation for debugging/UI
-                self.screenshot("latest")
-            except Exception:
-                pass
-            return True
-        except Exception as e:
-            logger.error(f"Navigation failed [{url}]: {e}")
             try:
-                self.screenshot("navigation_error")
-            except Exception:
-                pass
-            return False
     def cleanup_page(self):
         """
-        Cleanup page resources to prevent memory accumulation during long pagination runs.
-        Clears local storage, session storage, and forces garbage collection.
         """
         try:
             if self.page:
-                # Clear storages and cleanup DOM
-                self.page.evaluate("""
-                    () => {
-                        localStorage.clear();
-                        sessionStorage.clear();
-                        // Purge cache
-                        if (window.caches) {
-                            caches.keys().then(names => {
-                                names.forEach(name => caches.delete(name));
-                            });
-                        }
-                    }
-                """)
-                logger.debug("Cleared page storage and cache")
         except Exception as e:
             logger.debug(f"Page cleanup warning (non-critical): {e}")

 import logging
+import time
 from pathlib import Path
 from playwright.sync_api import sync_playwright, Browser, BrowserContext, Page
 from config import HOME_URL
         self._context: BrowserContext = None
         self.page: Page = None
+    def _ensure_page(self) -> Page:
+        try:
+            if self.page is None or self.page.is_closed():
+                self.page = self._context.new_page()
+        except Exception:
+            try:
+                self.page = self._context.new_page()
+            except Exception:
+                return None
+        return self.page
     def start(self):
         self.state_path.parent.mkdir(exist_ok=True)
         self.logs_dir.mkdir(exist_ok=True, parents=True)
         logger.info("Browser session closed")
     def goto(self, url: str, wait_until: str = "domcontentloaded", timeout_override: int = None) -> bool:
+        nav_timeout = timeout_override if timeout_override else self.timeout
+        for attempt in range(1, 3):
             try:
+                page = self._ensure_page()
+                if page is None:
+                    raise RuntimeError("Browser page is not available")
+                response = page.goto(url, wait_until=wait_until, timeout=nav_timeout)
+                if response is not None:
+                    status = response.status
+                    if status >= 400:
+                        logger.error(f"Navigation returned HTTP {status} for {url}")
+                        try:
+                            self.screenshot(f"http_{status}")
+                            html = page.content()
+                            Path("logs").mkdir(exist_ok=True)
+                            Path(f"logs/http_{status}.html").write_text(html, encoding="utf-8")
+                        except Exception:
+                            pass
+                        return False
+                try:
+                    self.screenshot("latest")
+                except Exception:
+                    pass
+                return True
+            except Exception as e:
+                logger.error(f"Navigation failed [{url}] attempt {attempt}/2: {e}")
+                try:
+                    self.screenshot("navigation_error")
+                except Exception:
+                    pass
+                if attempt < 2:
+                    time.sleep(1)
+                    continue
+                return False
     def cleanup_page(self):
         """
+        Lightweight cleanup hook for long runs.
+        The previous implementation cleared browser storage on every page, which
+        can invalidate authenticated site state. Keep this method non-destructive
+        so it cannot disrupt the session mid-run.
         """
         try:
             if self.page:
+                logger.debug("Page cleanup skipped to preserve authenticated state")
         except Exception as e:
             logger.debug(f"Page cleanup warning (non-critical): {e}")

app/listing_scraper.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import logging
 import re
-import threading
 import time
 from typing import Callable, Any, Dict, List, Optional
 from urllib.parse import urljoin
@@ -79,84 +78,48 @@ class ListingScraper:
         logger.debug("Extracting products from div.row structure")
         try:
-            # Timeout extraction with a thread-based approach
-            extraction_timeout = self.page_timeout - 5  # Leave 5s buffer
-            result_container = {"products": [], "error": None, "done": False}
-            def extract_products():
                 try:
-                    rows = page.locator("div.row").all()
-                    logger.debug(f"Found {len(rows)} potential product rows")
-                    for idx, row in enumerate(rows):
-                        try:
-                            # Extract title from div.caption h4 a
-                            try:
-                                title_elem = row.locator("div.caption h4 a").first
-                                title = (title_elem.inner_text(timeout=500) or "").strip()
-                                link = title_elem.get_attribute("href") or ""
-                            except Exception:
-                                logger.debug(f"Row {idx}: Could not extract title")
-                                continue
-                            if not title or not link:
-                                logger.debug(f"Row {idx}: Missing title or link")
-                                continue
-                            # Extract price from div.price strong
-                            price = None
-                            try:
-                                price_elem = row.locator("div.price strong").first
-                                price_text = (price_elem.inner_text(timeout=500) or "").strip()
-                                price = _parse_price(price_text)
-                                logger.debug(f"Row {idx}: Extracted price text: {price_text} -> ${price}")
-                            except Exception as e:
-                                logger.debug(f"Row {idx}: Could not extract price: {e}")
-                            # Skip if price is unknown
-                            if price is None:
-                                self.skipped_unknown_price += 1
-                                logger.debug(f"Row {idx}: Skipping '{title}' - no price found")
-                                continue
-                            # Filter by minimum price - ONLY collect items >= min_price
-                            if price < self.min_price:
-                                self.skipped_below_threshold += 1
-                                logger.debug(f"Row {idx}: Skip '{title}' (${price:.2f} < ${self.min_price:.2f})")
-                                continue
-                            # This product qualifies
-                            logger.debug(f"Row {idx}: ✓ QUALIFY '{title}' @ ${price:.2f}")
-                            result_container["products"].append({
-                                "url": link,
-                                "title": title,
-                                "price": price
-                            })
-                        except Exception as e:
-                            logger.debug(f"Row {idx}: Error processing row: {e}")
-                            continue
                 except Exception as e:
-                    result_container["error"] = str(e)
-                finally:
-                    result_container["done"] = True
-            # Run extraction in thread with timeout
-            thread = threading.Thread(target=extract_products, daemon=True)
-            thread.start()
-            thread.join(timeout=extraction_timeout)
-            if result_container["done"]:
-                if result_container["error"]:
-                    logger.error(f"Product extraction error: {result_container['error']}")
-                products = result_container["products"]
-            else:
-                logger.warning(f"Product extraction timed out after {extraction_timeout}s")
-                try:
-                    self.session.screenshot("extraction_timeout")
-                except Exception:
-                    pass
             logger.info(f"Extracted {len(products)} qualifying product(s) from page")
             return products
@@ -292,27 +255,11 @@ class ListingScraper:
         return next_url
     def _resolve_next_page_url_with_timeout(self, current_url: str, timeout_ms: int = 5000) -> Optional[str]:
-        """Resolve next page URL with timeout protection to prevent hangs."""
-        result = {"url": None, "done": False, "error": None}
-        def resolve_thread():
-            try:
-                result["url"] = self._resolve_next_page_url(current_url)
-            except Exception as e:
-                logger.warning(f"Error resolving next page URL: {e}")
-                result["error"] = str(e)
-            finally:
-                result["done"] = True
-        thread = threading.Thread(target=resolve_thread, daemon=True)
-        thread.daemon = True
-        thread.start()
-        thread.join(timeout=timeout_ms / 1000.0)  # Convert to seconds
-        if result["done"]:
-            return result["url"]
-        else:
-            logger.warning(f"Next page URL resolution timed out after {timeout_ms}ms - stopping pagination")
             return None
     def _load_page_with_retry(self, url: str, page_num: int, max_retries: int = 3) -> bool:
@@ -325,38 +272,16 @@ class ListingScraper:
         for attempt in range(1, max_retries + 1):
             try:
                 logger.info(f"Page load attempt {attempt}/{max_retries} for page {page_num}: {url}")
-                # Load page with timeout thread
-                load_result = {"success": False, "done": False}
-                def load_thread():
-                    try:
-                        load_result["success"] = self.session.goto(url)
-                    except Exception as e:
-                        logger.warning(f"Navigation exception: {e}")
-                    finally:
-                        load_result["done"] = True
-                thread = threading.Thread(target=load_thread, daemon=True)
-                thread.start()
-                thread.join(timeout=self.page_timeout)
-                if load_result["done"] and load_result["success"]:
                     logger.info(f"✓ Successfully loaded page {page_num} on attempt {attempt}")
-                    # Aggressive cleanup after successful navigation
                     try:
                         self.session.cleanup_page()
-                        # Force garbage collection
                         import gc
                         gc.collect()
                     except Exception as e:
                         logger.debug(f"Cleanup warning (non-critical): {e}")
                     return True
-                else:
-                    if not load_result["done"]:
-                        logger.warning(f"✗ Page load timed out after {self.page_timeout}s for page {page_num}, attempt {attempt}")
-                    else:
-                        logger.warning(f"✗ Page load failed for page {page_num}, attempt {attempt}")
             except Exception as e:
                 logger.warning(f"✗ Exception during page load for page {page_num}, attempt {attempt}: {e}")

 import logging
 import re
 import time
 from typing import Callable, Any, Dict, List, Optional
 from urllib.parse import urljoin
         logger.debug("Extracting products from div.row structure")
         try:
+            rows = page.locator("div.row").all()
+            logger.debug(f"Found {len(rows)} potential product rows")
+            for idx, row in enumerate(rows):
                 try:
+                    try:
+                        title_elem = row.locator("div.caption h4 a").first
+                        title = (title_elem.inner_text(timeout=500) or "").strip()
+                        link = title_elem.get_attribute("href") or ""
+                    except Exception:
+                        logger.debug(f"Row {idx}: Could not extract title")
+                        continue
+                    if not title or not link:
+                        logger.debug(f"Row {idx}: Missing title or link")
+                        continue
+                    price = None
+                    try:
+                        price_elem = row.locator("div.price strong").first
+                        price_text = (price_elem.inner_text(timeout=500) or "").strip()
+                        price = _parse_price(price_text)
+                        logger.debug(f"Row {idx}: Extracted price text: {price_text} -> ${price}")
+                    except Exception as e:
+                        logger.debug(f"Row {idx}: Could not extract price: {e}")
+                    if price is None:
+                        self.skipped_unknown_price += 1
+                        logger.debug(f"Row {idx}: Skipping '{title}' - no price found")
+                        continue
+                    if price < self.min_price:
+                        self.skipped_below_threshold += 1
+                        logger.debug(f"Row {idx}: Skip '{title}' (${price:.2f} < ${self.min_price:.2f})")
+                        continue
+                    logger.debug(f"Row {idx}: ✓ QUALIFY '{title}' @ ${price:.2f}")
+                    products.append({"url": link, "title": title, "price": price})
                 except Exception as e:
+                    logger.debug(f"Row {idx}: Error processing row: {e}")
+                    continue
             logger.info(f"Extracted {len(products)} qualifying product(s) from page")
             return products
         return next_url
     def _resolve_next_page_url_with_timeout(self, current_url: str, timeout_ms: int = 5000) -> Optional[str]:
+        """Resolve next page URL without crossing thread boundaries."""
+        try:
+            return self._resolve_next_page_url(current_url)
+        except Exception as e:
+            logger.warning(f"Error resolving next page URL: {e}")
             return None
     def _load_page_with_retry(self, url: str, page_num: int, max_retries: int = 3) -> bool:
         for attempt in range(1, max_retries + 1):
             try:
                 logger.info(f"Page load attempt {attempt}/{max_retries} for page {page_num}: {url}")
+                if self.session.goto(url):
                     logger.info(f"✓ Successfully loaded page {page_num} on attempt {attempt}")
                     try:
                         self.session.cleanup_page()
                         import gc
                         gc.collect()
                     except Exception as e:
                         logger.debug(f"Cleanup warning (non-critical): {e}")
                     return True
+                logger.warning(f"✗ Page load failed for page {page_num}, attempt {attempt}")
             except Exception as e:
                 logger.warning(f"✗ Exception during page load for page {page_num}, attempt {attempt}: {e}")

app/product_scraper.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import logging
 import re
 import time
-import threading
 from datetime import datetime
 from typing import Any, Dict, Optional
 from urllib.parse import urljoin
@@ -25,85 +24,36 @@ class ProductScraper:
         return self.session.page
     def _meta_value_safe(self, label: str, timeout: int = 8) -> Optional[str]:
-        """Extract metadata value with timeout protection."""
-        result = {"value": None, "done": False}
-        def extract():
-            try:
-                result["value"] = self._meta_value(label)
-            except Exception as e:
-                logger.debug(f"Meta value extraction error for '{label}': {e}")
-            finally:
-                result["done"] = True
-        thread = threading.Thread(target=extract, daemon=True)
-        thread.start()
-        thread.join(timeout=timeout)
-        if not result["done"]:
-            logger.warning(f"Meta value extraction timed out for '{label}'")
             return None
-        return result["value"]
     def _get_title_safe(self, timeout: int = 8) -> str:
-        """Extract title with timeout protection."""
-        result = {"value": "", "done": False}
-        def extract():
-            try:
-                result["value"] = self._get_title()
-            except Exception as e:
-                logger.debug(f"Title extraction error: {e}")
-            finally:
-                result["done"] = True
-        thread = threading.Thread(target=extract, daemon=True)
-        thread.start()
-        thread.join(timeout=timeout)
-        if not result["done"]:
-            logger.warning("Title extraction timed out")
-        return result["value"]
     def _get_price_safe(self, timeout: int = 8) -> str:
-        """Extract price with timeout protection."""
-        result = {"value": "", "done": False}
-        def extract():
-            try:
-                result["value"] = self._get_price()
-            except Exception as e:
-                logger.debug(f"Price extraction error: {e}")
-            finally:
-                result["done"] = True
-        thread = threading.Thread(target=extract, daemon=True)
-        thread.start()
-        thread.join(timeout=timeout)
-        if not result["done"]:
-            logger.warning("Price extraction timed out")
-        return result["value"]
     def _get_category_safe(self, timeout: int = 8) -> str:
-        """Extract category with timeout protection."""
-        result = {"value": "", "done": False}
-        def extract():
-            try:
-                result["value"] = self._get_category()
-            except Exception as e:
-                logger.debug(f"Category extraction error: {e}")
-            finally:
-                result["done"] = True
-        thread = threading.Thread(target=extract, daemon=True)
-        thread.start()
-        thread.join(timeout=timeout)
-        if not result["done"]:
-            logger.warning("Category extraction timed out")
-        return result["value"]
     # ------------------------------------------------------------------
     # Metadata extraction
@@ -237,22 +187,8 @@ class ProductScraper:
                     return result
                 logger.debug(f"Scraping (attempt {attempt}): {url}")
-                # Navigate with timeout protection
-                nav_success = False
-                nav_result = {"success": False, "done": False}
-                def navigate():
-                    try:
-                        nav_result["success"] = self.session.goto(url)
-                    finally:
-                        nav_result["done"] = True
-                thread = threading.Thread(target=navigate, daemon=True)
-                thread.start()
-                thread.join(timeout=self.product_timeout)
-                if not nav_result["done"] or not nav_result["success"]:
                     raise RuntimeError("Navigation failed or timed out")
                 time.sleep(0.4)

 import logging
 import re
 import time
 from datetime import datetime
 from typing import Any, Dict, Optional
 from urllib.parse import urljoin
         return self.session.page
     def _meta_value_safe(self, label: str, timeout: int = 8) -> Optional[str]:
+        """Extract metadata value with direct Playwright calls."""
+        try:
+            return self._meta_value(label)
+        except Exception as e:
+            logger.debug(f"Meta value extraction error for '{label}': {e}")
             return None
     def _get_title_safe(self, timeout: int = 8) -> str:
+        """Extract title with direct Playwright calls."""
+        try:
+            return self._get_title()
+        except Exception as e:
+            logger.debug(f"Title extraction error: {e}")
+            return ""
     def _get_price_safe(self, timeout: int = 8) -> str:
+        """Extract price with direct Playwright calls."""
+        try:
+            return self._get_price()
+        except Exception as e:
+            logger.debug(f"Price extraction error: {e}")
+            return ""
     def _get_category_safe(self, timeout: int = 8) -> str:
+        """Extract category with direct Playwright calls."""
+        try:
+            return self._get_category()
+        except Exception as e:
+            logger.debug(f"Category extraction error: {e}")
+            return ""
     # ------------------------------------------------------------------
     # Metadata extraction
                     return result
                 logger.debug(f"Scraping (attempt {attempt}): {url}")
+                if not self.session.goto(url):
                     raise RuntimeError("Navigation failed or timed out")
                 time.sleep(0.4)

app/services/run_bot.py CHANGED Viewed

@@ -196,7 +196,9 @@ class AutomationController:
             self._set_state(progress=10, current_state="Logging in")
             auth = AuthHandler(session, payload["username"], payload["password"])
             if not auth.ensure_authenticated():
-                raise RuntimeError("Login failed")
             if self._stop_event.is_set():
                 self._set_state(current_state="Stopped by user")
@@ -214,7 +216,9 @@ class AutomationController:
                 if not session.goto(studio_url):
                     logger.warning("Direct navigation failed, falling back to navigator")
                     if not navigator.navigate_to_studio_url(studio_url):
-                        raise RuntimeError("Could not open studio page")
                 else:
                     time.sleep(1)
                     logger.info("Direct studio page loaded")
@@ -227,7 +231,9 @@ class AutomationController:
                     if not session.goto(studio_url):
                         logger.warning("Direct navigation failed, falling back to navigator")
                         if not navigator.navigate_to_studio_url(studio_url):
-                            raise RuntimeError("Could not open studio page")
                     else:
                         time.sleep(1)
                         logger.info("Direct studio page loaded")
@@ -235,7 +241,9 @@ class AutomationController:
                     # Treat input as a studio name and search the directory
                     studio_url = navigator.find_studio_by_name(studio_input)
                     if not studio_url:
-                        raise RuntimeError(f"Could not find studio: {studio_input}")
             if self._stop_event.is_set():
                 self._set_state(current_state="Stopped by user")

             self._set_state(progress=10, current_state="Logging in")
             auth = AuthHandler(session, payload["username"], payload["password"])
             if not auth.ensure_authenticated():
+                self._set_state(last_error="Login failed", current_state="Login failed")
+                self.append_log("Login failed")
+                return
             if self._stop_event.is_set():
                 self._set_state(current_state="Stopped by user")
                 if not session.goto(studio_url):
                     logger.warning("Direct navigation failed, falling back to navigator")
                     if not navigator.navigate_to_studio_url(studio_url):
+                        self._set_state(last_error="Could not open studio page", current_state="Studio navigation failed")
+                        self.append_log("Could not open studio page")
+                        return
                 else:
                     time.sleep(1)
                     logger.info("Direct studio page loaded")
                     if not session.goto(studio_url):
                         logger.warning("Direct navigation failed, falling back to navigator")
                         if not navigator.navigate_to_studio_url(studio_url):
+                            self._set_state(last_error="Could not open studio page", current_state="Studio navigation failed")
+                            self.append_log("Could not open studio page")
+                            return
                     else:
                         time.sleep(1)
                         logger.info("Direct studio page loaded")
                     # Treat input as a studio name and search the directory
                     studio_url = navigator.find_studio_by_name(studio_input)
                     if not studio_url:
+                        self._set_state(last_error=f"Could not find studio: {studio_input}", current_state="Studio not found")
+                        self.append_log(f"Could not find studio: {studio_input}")
+                        return
             if self._stop_event.is_set():
                 self._set_state(current_state="Stopped by user")