Spaces:
Sleeping
Sleeping
# scripts/fetch_posts.py
from datetime import datetime, timedelta, timezone

import pandas as pd
import praw
import requests

from config import settings
from utils.helpers import ensure_folder, logger
def fetch_posts(days=settings.FETCH_DAYS, limit=None):
    """Fetch recent posts from the configured subreddits via the Reddit API.

    Args:
        days: Size of the lookback window in days (default: settings.FETCH_DAYS).
        limit: Maximum posts to request per subreddit; None lets PRAW fetch
            as many as the listing allows.

    Returns:
        pandas.DataFrame of post records, also written to
        ``{settings.RAW_DATA_PATH}reddit_posts.csv``.
    """
    ensure_folder(settings.RAW_DATA_PATH)
    reddit = praw.Reddit(
        client_id=settings.REDDIT_CLIENT_ID,
        client_secret=settings.REDDIT_CLIENT_SECRET,
        user_agent=settings.REDDIT_USER_AGENT,
    )
    posts_data = []
    # datetime.utcnow()/utcfromtimestamp() are deprecated; compute in aware
    # UTC and strip tzinfo so stored values match the original naive format.
    end_time = datetime.now(timezone.utc).replace(tzinfo=None)
    start_time = end_time - timedelta(days=days)
    for subreddit_name in settings.SUBREDDITS:
        try:
            subreddit = reddit.subreddit(subreddit_name)
            logger.info(f"Fetching posts from r/{subreddit_name}")
            for post in subreddit.new(limit=limit):
                post_time = datetime.fromtimestamp(
                    post.created_utc, tz=timezone.utc
                ).replace(tzinfo=None)
                if post_time < start_time:
                    # new() yields newest-first, so every remaining post is
                    # older than the window — stop instead of iterating on.
                    break
                posts_data.append({
                    "id": post.id,
                    "subreddit": subreddit_name,
                    "title": post.title,
                    "text": post.selftext,
                    "author": str(post.author),  # author may be None (deleted)
                    "created_utc": post_time,
                    "score": post.score,
                    "num_comments": post.num_comments,
                    "permalink": f"https://reddit.com{post.permalink}",
                })
        except Exception as e:
            # Best-effort per subreddit: one failure must not abort the rest.
            logger.error(f"Failed to fetch posts from r/{subreddit_name}: {e}")
    df = pd.DataFrame(posts_data)
    # NOTE(review): assumes RAW_DATA_PATH ends with a path separator — confirm.
    file_path = f"{settings.RAW_DATA_PATH}reddit_posts.csv"
    df.to_csv(file_path, index=False)
    logger.info(f"Saved {len(df)} posts to {file_path}")
    return df
def fetch_posts_pushshift(subreddit, start_epoch, end_epoch, limit=500):
    """Fetch posts via the Pushshift API as a fallback data source.

    Args:
        subreddit: Subreddit name (without the ``r/`` prefix).
        start_epoch: Window start as a Unix timestamp (seconds, UTC).
        end_epoch: Window end as a Unix timestamp (seconds, UTC).
        limit: Maximum number of submissions to request.

    Returns:
        List of post dicts matching the Reddit-API record shape; empty list
        on any request/parse failure (logged, never raised).
    """
    url = (
        "https://api.pushshift.io/reddit/submission/search/"
        f"?subreddit={subreddit}&after={start_epoch}&before={end_epoch}&size={limit}"
    )
    try:
        # Explicit timeout: without one, a stalled connection hangs forever.
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        data = resp.json()["data"]
        posts = []
        for p in data:
            ts = p.get("created_utc")
            if ts is None:
                # Skip malformed records instead of raising inside the loop,
                # which would discard every post fetched so far.
                continue
            posts.append({
                "id": p.get("id"),
                "subreddit": subreddit,
                "title": p.get("title"),
                "text": p.get("selftext"),
                "author": p.get("author"),
                "created_utc": datetime.fromtimestamp(
                    ts, tz=timezone.utc
                ).replace(tzinfo=None),
                "score": p.get("score"),
                "num_comments": p.get("num_comments"),
                "permalink": f"https://reddit.com{p.get('permalink')}",
            })
        return posts
    except Exception as e:
        # Best-effort fallback source: log and return nothing on failure.
        logger.error(f"Pushshift fetch failed for {subreddit}: {e}")
        return []
def fetch_posts_with_fallback(subreddit, days=settings.FETCH_DAYS, limit=None):
    """Fetch posts from the Reddit API, topping up from Pushshift if short.

    Args:
        subreddit: Subreddit name used for the Pushshift fallback query.
        days: Size of the lookback window in days.
        limit: Target number of posts; when the Reddit API returns fewer,
            the remainder is requested from Pushshift.

    Returns:
        pandas.DataFrame of deduplicated post records, also written to
        ``{settings.RAW_DATA_PATH}reddit_posts.csv``.
    """
    # Use timezone-aware datetimes: calling .timestamp() on a naive
    # utcnow() result interprets it as LOCAL time, producing epochs that
    # are off by the UTC offset on any non-UTC machine.
    end_time = datetime.now(timezone.utc)
    start_time = end_time - timedelta(days=days)
    start_epoch = int(start_time.timestamp())
    end_epoch = int(end_time.timestamp())
    df_posts = fetch_posts(days=days, limit=limit)
    posts = df_posts.to_dict("records")
    if limit and len(posts) < limit:
        remaining = limit - len(posts)
        logger.info(
            f"Reddit API returned {len(posts)} posts, fetching {remaining} more from Pushshift..."
        )
        pushshift_posts = fetch_posts_pushshift(
            subreddit, start_epoch, end_epoch, limit=remaining
        )
        posts += pushshift_posts
        # Deduplicate by id, keeping the FIRST occurrence so the live
        # Reddit-API record wins over a stale Pushshift duplicate.
        posts_dict = {}
        for p in posts:
            posts_dict.setdefault(p["id"], p)
        posts = list(posts_dict.values())
    df_posts = pd.DataFrame(posts)
    ensure_folder(settings.RAW_DATA_PATH)
    file_path = f"{settings.RAW_DATA_PATH}reddit_posts.csv"
    df_posts.to_csv(file_path, index=False)
    logger.info(f"Saved {len(df_posts)} combined posts to {file_path}")
    return df_posts