Spaces:
Sleeping
Sleeping
| #scripts/fetch_comments.py | |
| import praw | |
| import pandas as pd | |
| from datetime import datetime, timedelta | |
| from config import settings | |
| from utils.helpers import logger, ensure_folder | |
def fetch_comments(days=settings.FETCH_DAYS, limit_posts=None, max_comments_per_post=None):
    """Fetch recent Reddit comments for the configured subreddits and save them as CSV.

    For every subreddit in ``settings.SUBREDDITS`` the ``new`` listing is
    scanned; for each post created within the last ``days`` days the loaded
    comments are collected. The combined result is written to
    ``reddit_comments.csv`` inside ``settings.RAW_DATA_PATH`` and returned.

    A failure while processing one subreddit is logged and the remaining
    subreddits are still processed; comments collected before the failure
    are kept.

    Args:
        days: Size of the look-back window in days, measured from the
            current UTC time.
        limit_posts: Passed as ``limit`` to ``subreddit.new``.
        max_comments_per_post: Maximum number of comments kept per post;
            ``None`` keeps every loaded comment.

    Returns:
        pandas.DataFrame with one row per comment and the columns
        ``post_id``, ``comment_id``, ``author``, ``body``, ``created_utc``
        (naive datetime in UTC), ``score`` and ``subreddit``. The columns
        are present even when no comment was collected.
    """
    # Local imports: these names are not part of the module-level imports.
    import os
    from datetime import timezone

    columns = [
        "post_id",
        "comment_id",
        "author",
        "body",
        "created_utc",
        "score",
        "subreddit",
    ]

    def _naive_utc(timestamp):
        """Convert a POSIX timestamp to a naive datetime expressed in UTC."""
        # Same value as the deprecated datetime.utcfromtimestamp(timestamp).
        return datetime.fromtimestamp(timestamp, tz=timezone.utc).replace(tzinfo=None)

    ensure_folder(settings.RAW_DATA_PATH)

    reddit = praw.Reddit(
        client_id=settings.REDDIT_CLIENT_ID,
        client_secret=settings.REDDIT_CLIENT_SECRET,
        user_agent=settings.REDDIT_USER_AGENT,
    )

    comments_data = []
    # Same value as the deprecated datetime.utcnow(): naive "now" in UTC.
    end_time = datetime.now(timezone.utc).replace(tzinfo=None)
    start_time = end_time - timedelta(days=days)

    for subreddit_name in settings.SUBREDDITS:
        try:
            subreddit = reddit.subreddit(subreddit_name)
            logger.info(f"Fetching comments from r/{subreddit_name}")

            for post in subreddit.new(limit=limit_posts):
                if _naive_utc(post.created_utc) < start_time:
                    # The "new" listing is ordered newest first, so every
                    # remaining post is outside the window as well.
                    break

                # limit=0 drops the "load more comments" placeholders
                # without issuing extra requests for them.
                post.comments.replace_more(limit=0)

                # .list() flattens the comment forest, so replies at every
                # depth are included (not only top-level comments).
                for comment in post.comments.list()[:max_comments_per_post]:
                    comments_data.append({
                        "post_id": post.id,
                        "comment_id": comment.id,
                        # str() turns a missing (deleted) author into "None".
                        "author": str(comment.author),
                        "body": comment.body,
                        "created_utc": _naive_utc(comment.created_utc),
                        "score": comment.score,
                        "subreddit": subreddit_name,
                    })
        except Exception as e:
            # Best effort: one failing subreddit must not abort the others.
            logger.error(f"Failed fetching comments for r/{subreddit_name}: {e}")

    # Explicit columns keep the CSV header intact when nothing was collected.
    df = pd.DataFrame(comments_data, columns=columns)

    # os.path.join works whether or not RAW_DATA_PATH ends with a separator.
    file_path = os.path.join(settings.RAW_DATA_PATH, "reddit_comments.csv")
    df.to_csv(file_path, index=False)
    logger.info(f"Saved {len(df)} comments to {file_path}")
    return df
# Script entry point: when executed directly, run one fetch with the
# default arguments of fetch_comments.
if __name__ == "__main__":
    fetch_comments()