import json
from pathlib import Path
from typing import Set

from apify import Actor


class StateManager:
    """Manage the persistent state of the actor, specifically for deduplication.

    The state is a mapping of the form ``{"seen_urls": [...]}``. It is stored
    under the ``"STATE"`` key via ``Actor.set_value`` and mirrored to a
    ``local_state.json`` file in the current working directory, which is used
    as a fallback source when ``Actor.get_value`` yields nothing.
    """

    _STATE_KEY = "STATE"
    _LOCAL_STATE_FILENAME = "local_state.json"

    def __init__(self) -> None:
        """Start with an empty set of seen URLs."""
        self.seen_urls: Set[str] = set()

    def _local_state_path(self) -> Path:
        """Return the path of the local state file.

        Resolved at call time so it follows the process's current working
        directory, as the original inline computation did.
        """
        return Path.cwd() / self._LOCAL_STATE_FILENAME

    def _read_local_state(self):
        """Read the local state file and return its decoded JSON content.

        Returns an empty dict when the file does not exist or cannot be read
        or parsed. The decoded value is returned unvalidated; the caller is
        responsible for checking its shape.
        """
        local_state = self._local_state_path()
        Actor.log.info(f"Checking for local state at: {local_state}")
        if not local_state.exists():
            return {}
        try:
            state = json.loads(local_state.read_text(encoding="utf-8"))
        except Exception as e:
            # Deliberately broad: the local file is a best-effort fallback and
            # must never prevent the actor from starting with empty state.
            Actor.log.warning(f"Failed to load local state: {e}")
            return {}
        Actor.log.info(f"Loaded state from local file: {local_state}")
        return state

    @staticmethod
    def _extract_seen_urls(state) -> Set[str]:
        """Return the set of seen URLs contained in a loaded state payload.

        A payload that is not a dict, or whose ``"seen_urls"`` entry is not a
        list/tuple/set, is treated as empty (with a warning) instead of
        raising. Non-string entries are dropped so the result matches the
        ``Set[str]`` contract and unhashable items cannot crash loading.
        """
        if not isinstance(state, dict):
            Actor.log.warning(
                f"Ignoring state of unexpected type: {type(state).__name__}"
            )
            return set()

        urls = state.get("seen_urls")
        if urls is None:
            # Key missing or explicitly null: nothing has been seen yet.
            return set()
        if not isinstance(urls, (list, tuple, set)):
            # In particular, a bare string must not be split into characters.
            Actor.log.warning(
                f"Ignoring 'seen_urls' of unexpected type: {type(urls).__name__}"
            )
            return set()
        return {url for url in urls if isinstance(url, str)}

    async def load_state(self) -> None:
        """Load the state from the default key-value store.

        When the store returns no (or an empty) value, fall back to the local
        state file, which supports local development. Malformed state from
        either source results in an empty set of seen URLs rather than an
        exception.
        """
        state = await Actor.get_value(self._STATE_KEY)
        if not state:
            # Fallback for local development.
            state = self._read_local_state()

        self.seen_urls = self._extract_seen_urls(state)
        Actor.log.info(f"Loaded state: {len(self.seen_urls)} seen URLs.")

    async def save_state(self) -> None:
        """Save the current state to the default key-value store.

        After the store write, the same payload is written to the local state
        file as a backup. A failure of the backup is logged and otherwise
        ignored; a failure of the store write propagates to the caller.
        """
        state = {"seen_urls": list(self.seen_urls)}
        await Actor.set_value(self._STATE_KEY, state)

        # Backup for local development.
        local_state = self._local_state_path()
        try:
            local_state.write_text(json.dumps(state, indent=2), encoding="utf-8")
            Actor.log.info(f"Backed up state to local file: {local_state}")
        except Exception as e:
            # Deliberately broad: the backup is best-effort and must not fail
            # a save that already succeeded against the key-value store.
            Actor.log.warning(f"Failed to backup local state: {e}")

        Actor.log.info(f"Saved state: {len(self.seen_urls)} seen URLs.")

    def is_seen(self, url: str) -> bool:
        """Return True if *url* has already been recorded as seen."""
        return url in self.seen_urls

    def add_seen(self, url: str) -> None:
        """Record *url* in the set of seen URLs."""
        self.seen_urls.add(url)