Update alisto_project/backend/ingest_reddit.py
alisto_project/backend/ingest_reddit.py CHANGED
@@ -385,10 +385,9 @@ def is_credible_user(post):
     # allows posts to pass if Reddit API fails to get user info
     return True
 
-#
-# 4. Main Scraper Loop (LIVE MODE ONLY)
+# 4. Main Scraper Loop (POLLING MODE - BYPASSES CLOUD BLOCK)
 async def scrape_reddit():
-    print("Connecting to Reddit API...")
+    print("Connecting to Reddit API (Polling Mode)...")
 
     # Load credentials
     client_id = os.getenv("REDDIT_CLIENT_ID")
@@ -406,28 +405,34 @@ async def scrape_reddit():
         password=os.getenv("REDDIT_PASSWORD")
     )
 
-
-    subreddit = await reddit.subreddit(SUBREDDITS)
-    print(f"👁️ ALISTO ACTIVE: Monitoring r/{SUBREDDITS}...")
-
-    # --- DELETED PHASE 1 (History Scan) ---
-    # We removed the code that caused the 403 error.
-    print("⚠️ Skipped history scan to avoid Cloud Ban.")
-
-    # --- PHASE 2: START REAL-TIME STREAM ---
-    print("📡 Starting real-time stream for new submissions...")
-
-    # skip_existing=True is VITAL. It ignores old data and only waits for NEW posts.
-    async for post in subreddit.stream.submissions(skip_existing=True):
-        print(f"📥 New Post Detected: {post.title}")
-        await process_post(post)
+    print(f"👁️ ALISTO ACTIVE: Polling r/{SUBREDDITS} every 60 seconds...")
 
-
-
-
-
-
+    # Keep track of the last post we saw so we don't duplicate
+    last_processed_id = None
+
+    while True:
+        try:
+            subreddit = await reddit.subreddit(SUBREDDITS)
+
+            # Fetch ONLY the single newest post
+            async for post in subreddit.new(limit=1):
+                if post.id != last_processed_id:
+                    print(f"📥 New Post Detected: {post.title}")
+                    await process_post(post)
+                    last_processed_id = post.id
+                else:
+                    print(" (No new posts, waiting...)")
+
+            # Disconnect and sleep for 60 seconds (This prevents the 403 Ban)
+            await asyncio.sleep(60)
+
+        except Exception as e:
+            print(f"⚠️ Connection glitch: {e}")
+            print(" Waiting 2 minutes before retry...")
+            await asyncio.sleep(120)
 
+    # Note: We technically never reach this, but good practice to close
+    await reddit.close()
 
 # executes the main scraping loop when the script is run
 if __name__ == "__main__":
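Taken together, the additions replace the old long-lived submissions stream with a poll-and-sleep loop. Below is a minimal standalone sketch of that same pattern, assuming asyncpraw and the same REDDIT_* environment variables the file already reads; the function name poll_subreddit and the stub process_post are illustrative, not part of this commit.

# Minimal sketch of the polling pattern used above. Assumptions: asyncpraw
# is installed, REDDIT_* credentials are set in the environment, and
# process_post is a stand-in for the project's real handler.
import asyncio
import os

import asyncpraw


async def process_post(post):
    # Stub: ingest_reddit.py runs its own processing pipeline here.
    print(f"Processing: {post.title}")


async def poll_subreddit(name, interval=60):
    reddit = asyncpraw.Reddit(
        client_id=os.getenv("REDDIT_CLIENT_ID"),
        client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
        username=os.getenv("REDDIT_USERNAME"),
        password=os.getenv("REDDIT_PASSWORD"),
        user_agent="polling-sketch/0.1",  # illustrative value
    )
    last_processed_id = None
    try:
        while True:
            subreddit = await reddit.subreddit(name)
            # One short-lived request per cycle instead of a held-open stream.
            async for post in subreddit.new(limit=1):
                if post.id != last_processed_id:
                    await process_post(post)
                    last_processed_id = post.id
            await asyncio.sleep(interval)
    finally:
        await reddit.close()


if __name__ == "__main__":
    asyncio.run(poll_subreddit("test"))

The trade-off is the one the in-diff comments point to: the commit attributes the 403 ban to the always-on connection pattern of stream.submissions(), whereas a single new(limit=1) request per minute goes quiet between cycles. The cost is up to 60 seconds of latency, and because only the single newest post is fetched each cycle, a post can be missed when two or more arrive within one interval.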