Update alisto_project/backend/ingest_reddit.py
alisto_project/backend/ingest_reddit.py CHANGED
@@ -385,10 +385,9 @@ def is_credible_user(post):
     # allows posts to pass if Reddit API fails to get user info
     return True
 
-#
-# 4. Main Scraper Loop (LIVE MODE ONLY)
+# 4. Main Scraper Loop (POLLING MODE - BYPASSES CLOUD BLOCK)
 async def scrape_reddit():
-    print("Connecting to Reddit API...")
+    print("Connecting to Reddit API (Polling Mode)...")
 
     # Load credentials
     client_id = os.getenv("REDDIT_CLIENT_ID")
@@ -406,28 +405,34 @@ async def scrape_reddit():
         password=os.getenv("REDDIT_PASSWORD")
     )
 
-
-    subreddit = await reddit.subreddit(SUBREDDITS)
-    print(f"👁️ ALISTO ACTIVE: Monitoring r/{SUBREDDITS}...")
-
-    # --- DELETED PHASE 1 (History Scan) ---
-    # We removed the code that caused the 403 error.
-    print("⚠️ Skipped history scan to avoid Cloud Ban.")
-
-    # --- PHASE 2: START REAL-TIME STREAM ---
-    print("📡 Starting real-time stream for new submissions...")
-
-    # skip_existing=True is VITAL. It ignores old data and only waits for NEW posts.
-    async for post in subreddit.stream.submissions(skip_existing=True):
-        print(f"📥 New Post Detected: {post.title}")
-        await process_post(post)
+    print(f"👁️ ALISTO ACTIVE: Polling r/{SUBREDDITS} every 60 seconds...")
 
-
-
-
-
-
+    # Keep track of the last post we saw so we don't duplicate
+    last_processed_id = None
+
+    while True:
+        try:
+            subreddit = await reddit.subreddit(SUBREDDITS)
+
+            # Fetch ONLY the single newest post
+            async for post in subreddit.new(limit=1):
+                if post.id != last_processed_id:
+                    print(f"📥 New Post Detected: {post.title}")
+                    await process_post(post)
+                    last_processed_id = post.id
+                else:
+                    print(" (No new posts, waiting...)")
+
+            # Disconnect and sleep for 60 seconds (This prevents the 403 Ban)
+            await asyncio.sleep(60)
+
+        except Exception as e:
+            print(f"⚠️ Connection glitch: {e}")
+            print(" Waiting 2 minutes before retry...")
+            await asyncio.sleep(120)
 
+    # Note: We technically never reach this, but good practice to close
+    await reddit.close()
 
 # executes the main scraping loop when the script is run
 if __name__ == "__main__":
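Taken together, the additions replace the old long-lived submissions stream with a poll-and-sleep loop. Below is a minimal standalone sketch of that same pattern, assuming asyncpraw and the same REDDIT_* environment variables the file already reads; the function name poll_subreddit and the stub process_post are illustrative, not part of this commit.

# Minimal sketch of the polling pattern used above. Assumptions: asyncpraw
# is installed, REDDIT_* credentials are set in the environment, and
# process_post is a stand-in for the project's real handler.
import asyncio
import os

import asyncpraw


async def process_post(post):
    # Stub: ingest_reddit.py runs its own processing pipeline here.
    print(f"Processing: {post.title}")


async def poll_subreddit(name, interval=60):
    reddit = asyncpraw.Reddit(
        client_id=os.getenv("REDDIT_CLIENT_ID"),
        client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
        username=os.getenv("REDDIT_USERNAME"),
        password=os.getenv("REDDIT_PASSWORD"),
        user_agent="polling-sketch/0.1",  # illustrative value
    )
    last_processed_id = None
    try:
        while True:
            subreddit = await reddit.subreddit(name)
            # One short-lived request per cycle instead of a held-open stream.
            async for post in subreddit.new(limit=1):
                if post.id != last_processed_id:
                    await process_post(post)
                    last_processed_id = post.id
            await asyncio.sleep(interval)
    finally:
        await reddit.close()


if __name__ == "__main__":
    asyncio.run(poll_subreddit("test"))

The trade-off is the one the in-diff comments point to: the commit attributes the 403 ban to the always-on connection pattern of stream.submissions(), whereas a single new(limit=1) request per minute goes quiet between cycles. The cost is up to 60 seconds of latency, and because only the single newest post is fetched each cycle, a post can be missed when two or more arrive within one interval.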