Commit 42deb05 (verified) by Quivara
Parent(s): 97ebfdc

Update alisto_project/backend/ingest_reddit.py

alisto_project/backend/ingest_reddit.py CHANGED
@@ -385,10 +385,9 @@ def is_credible_user(post):
     # allows posts to pass if Reddit API fails to get user info
     return True
 
-# orchestrates the entire scraping process (historical scan + real-time stream)
-# 4. Main Scraper Loop (LIVE MODE ONLY)
+# 4. Main Scraper Loop (POLLING MODE - BYPASSES CLOUD BLOCK)
 async def scrape_reddit():
-    print("Connecting to Reddit API...")
+    print("Connecting to Reddit API (Polling Mode)...")
 
     # Load credentials
     client_id = os.getenv("REDDIT_CLIENT_ID")
@@ -406,28 +405,34 @@ async def scrape_reddit():
         password=os.getenv("REDDIT_PASSWORD")
     )
 
-    try:
-        subreddit = await reddit.subreddit(SUBREDDITS)
-        print(f"👁️ ALISTO ACTIVE: Monitoring r/{SUBREDDITS}...")
-
-        # --- DELETED PHASE 1 (History Scan) ---
-        # We removed the code that caused the 403 error.
-        print("⚠️ Skipped history scan to avoid Cloud Ban.")
-
-        # --- PHASE 2: START REAL-TIME STREAM ---
-        print("📡 Starting real-time stream for new submissions...")
-
-        # skip_existing=True is VITAL. It ignores old data and only waits for NEW posts.
-        async for post in subreddit.stream.submissions(skip_existing=True):
-            print(f"📥 New Post Detected: {post.title}")
-            await process_post(post)
+    print(f"👁️ ALISTO ACTIVE: Polling r/{SUBREDDITS} every 60 seconds...")
 
-    except Exception as e:
-        print(f"Global Scraper Error: {e}")
-    finally:
-        await reddit.close()
-        print("Scraper stopped")
+    # Keep track of the last post we saw so we don't duplicate
+    last_processed_id = None
+
+    while True:
+        try:
+            subreddit = await reddit.subreddit(SUBREDDITS)
+
+            # Fetch ONLY the single newest post
+            async for post in subreddit.new(limit=1):
+                if post.id != last_processed_id:
+                    print(f"📥 New Post Detected: {post.title}")
+                    await process_post(post)
+                    last_processed_id = post.id
+                else:
+                    print(" (No new posts, waiting...)")
+
+            # Disconnect and sleep for 60 seconds (This prevents the 403 Ban)
+            await asyncio.sleep(60)
+
+        except Exception as e:
+            print(f"⚠️ Connection glitch: {e}")
+            print(" Waiting 2 minutes before retry...")
+            await asyncio.sleep(120)
 
+    # Note: We technically never reach this, but good practice to close
+    await reddit.close()
 
 # executes the main scraping loop when the script is run
 if __name__ == "__main__":
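
For readers skimming the diff: the commit replaces the long-lived `subreddit.stream.submissions(...)` connection (which was drawing 403 responses on the cloud host) with a poll-and-sleep loop over `subreddit.new(limit=1)`. Below is a minimal, self-contained sketch of that same pattern, assuming the module uses asyncpraw (implied by `await reddit.subreddit(...)` and `await reddit.close()` in this file). `handle_post`, the `"test"` subreddit name, and the user agent string are hypothetical stand-ins for the module's own `process_post`, `SUBREDDITS`, and credential setup. The sketch also trades `limit=1` plus a single `last_processed_id` for a small batch and a set of seen IDs, and seeds that set on the first pass to mirror the old stream's `skip_existing=True` behavior.

import asyncio
import os

import asyncpraw  # assumption: the async Reddit client this module appears to use


async def handle_post(post):
    """Hypothetical stand-in for this module's process_post()."""
    print(f"would process {post.id}: {post.title}")


async def poll_new_posts(subreddit_name: str) -> None:
    reddit = asyncpraw.Reddit(
        client_id=os.getenv("REDDIT_CLIENT_ID"),
        client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
        username=os.getenv("REDDIT_USERNAME"),
        password=os.getenv("REDDIT_PASSWORD"),
        user_agent="alisto-poll-sketch/0.1",  # hypothetical user agent
    )
    try:
        subreddit = await reddit.subreddit(subreddit_name)
        seen_ids: set[str] = set()  # survives bursts of more than one post per cycle
        first_cycle = True
        while True:
            try:
                # Fetch a small batch instead of limit=1 so that several posts
                # arriving within one sleep window are all picked up.
                async for post in subreddit.new(limit=10):
                    if post.id in seen_ids:
                        continue
                    seen_ids.add(post.id)
                    if first_cycle:
                        continue  # seed with existing posts, don't reprocess history
                    await handle_post(post)
                first_cycle = False
                await asyncio.sleep(60)  # pause between polls, well under rate limits
            except Exception as exc:  # CancelledError still propagates (BaseException)
                print(f"poll failed: {exc}; retrying in 120s")
                await asyncio.sleep(120)
    finally:
        await reddit.close()  # runs on cancellation, unlike the unreachable close above


if __name__ == "__main__":
    try:
        asyncio.run(poll_new_posts("test"))  # "test" is a placeholder subreddit
    except KeyboardInterrupt:
        print("scraper stopped")

Compared with the committed version, the set-based dedup costs a little memory but avoids the `limit=1` blind spot, where a second post published inside the same 60-second window would never be seen.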