# scripts/fetch_comments.py
"""Fetch recent comments from the configured subreddits and save them to CSV."""
import os
from datetime import datetime, timedelta, timezone

import pandas as pd
import praw

from config import settings
from utils.helpers import logger, ensure_folder


def fetch_comments(days=settings.FETCH_DAYS, limit_posts=None, max_comments_per_post=None):
    """Collect comments from posts created within the last *days* days.

    Args:
        days: Look-back window in days (default: ``settings.FETCH_DAYS``).
        limit_posts: Maximum posts to scan per subreddit; ``None`` uses
            PRAW's listing default.
        max_comments_per_post: Cap on comments kept per post; ``None``
            keeps all (``list[:None]`` is the full list).

    Returns:
        pandas.DataFrame of comment records, also written to
        ``<RAW_DATA_PATH>/reddit_comments.csv``.
    """
    ensure_folder(settings.RAW_DATA_PATH)
    reddit = praw.Reddit(
        client_id=settings.REDDIT_CLIENT_ID,
        client_secret=settings.REDDIT_CLIENT_SECRET,
        user_agent=settings.REDDIT_USER_AGENT,
    )

    comments_data = []
    # Timezone-aware UTC throughout: utcnow()/utcfromtimestamp() are
    # deprecated (3.12+) and naive/aware datetimes must never be mixed.
    end_time = datetime.now(timezone.utc)
    start_time = end_time - timedelta(days=days)

    for subreddit_name in settings.SUBREDDITS:
        try:
            subreddit = reddit.subreddit(subreddit_name)
            logger.info(f"Fetching comments from r/{subreddit_name}")
            for post in subreddit.new(limit=limit_posts):
                post_time = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)
                if post_time < start_time:
                    continue  # post is older than the look-back window
                # replace_more(limit=0) drops unexpanded "MoreComments" stubs;
                # .list() then flattens ALL loaded comments at every depth
                # (NOT just top-level, as a previous comment claimed).
                post.comments.replace_more(limit=0)
                for comment in post.comments.list()[:max_comments_per_post]:
                    comments_data.append({
                        "post_id": post.id,
                        "comment_id": comment.id,
                        "author": str(comment.author),  # "None" for deleted accounts
                        "body": comment.body,
                        "created_utc": datetime.fromtimestamp(
                            comment.created_utc, tz=timezone.utc
                        ),
                        "score": comment.score,
                        "subreddit": subreddit_name,
                    })
        except Exception as e:
            # Best-effort per subreddit: log and continue with the rest.
            logger.error(f"Failed fetching comments for r/{subreddit_name}: {e}")

    df = pd.DataFrame(comments_data)
    # os.path.join is correct whether or not RAW_DATA_PATH ends with a separator
    # (the old f-string concatenation silently broke without a trailing slash).
    file_path = os.path.join(settings.RAW_DATA_PATH, "reddit_comments.csv")
    df.to_csv(file_path, index=False)
    logger.info(f"Saved {len(df)} comments to {file_path}")
    return df


if __name__ == "__main__":
    fetch_comments()