Quivara committed (verified)
Commit 97ebfdc · Parent(s): 387a925

Update alisto_project/backend/ingest_reddit.py

alisto_project/backend/ingest_reddit.py CHANGED
@@ -385,46 +385,41 @@ def is_credible_user(post):
     # allows posts to pass if Reddit API fails to get user info
     return True
 
-
-# 4. Main Scraper Loop
 # orchestrates the entire scraping process (historical scan + real-time stream)
+# 4. Main Scraper Loop (LIVE MODE ONLY)
 async def scrape_reddit():
     print("Connecting to Reddit API...")
-
-    # --- MISSING LINES ADDED HERE ---
+
+    # Load credentials
     client_id = os.getenv("REDDIT_CLIENT_ID")
     client_secret = os.getenv("REDDIT_CLIENT_SECRET")
-    # --------------------------------
 
     if not client_id or not client_secret:
         print("❌ Error: Client ID or Secret missing in .env")
         return
 
     reddit = asyncpraw.Reddit(
-        client_id=os.getenv("REDDIT_CLIENT_ID"),
-        client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
+        client_id=client_id,
+        client_secret=client_secret,
         user_agent=os.getenv("REDDIT_USER_AGENT"),
        username=os.getenv("REDDIT_USERNAME"),
         password=os.getenv("REDDIT_PASSWORD")
-    )
+    )
 
     try:
         subreddit = await reddit.subreddit(SUBREDDITS)
-        print(f"πŸ‘οΈ ALISTO ACTIVE: Monitoring r/{SUBREDDITS}...")
-
-        # --- PHASE 1: FETCH LATEST EXISTING POSTS (e.g., last 500) ---
-        print("πŸ” Scanning last 5 posts for missed alerts...")
-        # iterates over the last 5 posts asynchronously
-        async for post in subreddit.new(limit=5):
-            await process_post(post)
+        print(f"πŸ‘οΈ ALISTO ACTIVE: Monitoring r/{SUBREDDITS}...")
 
-        print("βœ… Historical scan complete")
+        # --- DELETED PHASE 1 (History Scan) ---
+        # We removed the code that caused the 403 error.
+        print("⚠️ Skipped history scan to avoid Cloud Ban.")
 
-        # --- PHASE 2: START REAL-TIME STREAM (Forever Loop) ---
+        # --- PHASE 2: START REAL-TIME STREAM ---
         print("πŸ“‘ Starting real-time stream for new submissions...")
 
-        # starts the continuous loop to monitor for new submissions
-        async for post in subreddit.stream.submissions(skip_existing=False):
+        # skip_existing=True is VITAL. It ignores old data and only waits for NEW posts.
+        async for post in subreddit.stream.submissions(skip_existing=True):
+            print(f"πŸ“₯ New Post Detected: {post.title}")
             await process_post(post)
 
     except Exception as e:
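
For reference, a minimal runnable sketch of what scrape_reddit() amounts to after this commit. The SUBREDDITS value, the process_post stub, and the load_dotenv() call are stand-ins for definitions that live elsewhere in ingest_reddit.py; only the asyncpraw calls mirror the diff above.

# Minimal sketch of the post-commit live-only scraper. SUBREDDITS,
# process_post, and load_dotenv() are placeholders for code defined
# elsewhere in ingest_reddit.py.
import asyncio
import os

import asyncpraw
from dotenv import load_dotenv

load_dotenv()  # reads the REDDIT_* variables from .env

SUBREDDITS = "test"  # hypothetical; asyncpraw accepts "a+b+c" for several subs

async def process_post(post):
    # placeholder for the real pipeline (credibility check, storage, alerts)
    print(f"processing: {post.title}")

async def scrape_reddit():
    client_id = os.getenv("REDDIT_CLIENT_ID")
    client_secret = os.getenv("REDDIT_CLIENT_SECRET")
    if not client_id or not client_secret:
        print("❌ Error: Client ID or Secret missing in .env")
        return

    reddit = asyncpraw.Reddit(
        client_id=client_id,
        client_secret=client_secret,
        user_agent=os.getenv("REDDIT_USER_AGENT"),
        username=os.getenv("REDDIT_USERNAME"),
        password=os.getenv("REDDIT_PASSWORD"),
    )
    try:
        subreddit = await reddit.subreddit(SUBREDDITS)
        # skip_existing=True makes the stream drop the backlog it would
        # otherwise replay at startup, so only posts created after launch
        # reach process_post -- the behavior this commit switches to
        async for post in subreddit.stream.submissions(skip_existing=True):
            await process_post(post)
    finally:
        await reddit.close()  # release the HTTP session cleanly

if __name__ == "__main__":
    asyncio.run(scrape_reddit())

One caveat on the design choice: asyncpraw's stream still polls the subreddit's new listing under the hood, so skip_existing=True only suppresses yielding the initial backlog; it does not change which requests are sent.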