File size: 2,017 Bytes
f6c54d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# scripts/fetch_comments.py
import os
from datetime import datetime, timedelta, timezone

import pandas as pd
import praw

from config import settings
from utils.helpers import logger, ensure_folder

def fetch_comments(days=settings.FETCH_DAYS, limit_posts=None, max_comments_per_post=None):
    """Fetch recent Reddit comments for the configured subreddits and save them as CSV.

    For every subreddit in ``settings.SUBREDDITS`` the newest posts are
    listed, posts created within the last ``days`` days are kept, and the
    comments already loaded for each kept post are flattened into one row
    per comment. All rows are written to ``reddit_comments.csv`` inside
    ``settings.RAW_DATA_PATH`` and also returned.

    A failure while processing one subreddit is logged and does not abort
    the run; rows collected before the failure are kept.

    Args:
        days: Size of the look-back window in days, measured from the
            current UTC time.
        limit_posts: Forwarded as ``limit`` to ``subreddit.new``.
        max_comments_per_post: Maximum number of comments kept per post;
            ``None`` keeps every loaded comment.

    Returns:
        A ``pandas.DataFrame`` with the columns ``post_id``, ``comment_id``,
        ``author``, ``body``, ``created_utc``, ``score`` and ``subreddit``.
        The columns are present even when no comment was collected.
    """
    ensure_folder(settings.RAW_DATA_PATH)
    reddit = praw.Reddit(
        client_id=settings.REDDIT_CLIENT_ID,
        client_secret=settings.REDDIT_CLIENT_SECRET,
        user_agent=settings.REDDIT_USER_AGENT,
    )

    # Fixed column order so an empty result still yields a CSV with a header
    # row (a column-less CSV cannot be read back with pandas.read_csv).
    columns = [
        "post_id",
        "comment_id",
        "author",
        "body",
        "created_utc",
        "score",
        "subreddit",
    ]

    # Compare raw epoch seconds instead of naive datetimes; this avoids the
    # deprecated datetime.utcnow()/utcfromtimestamp() helpers.
    cutoff_ts = (datetime.now(timezone.utc) - timedelta(days=days)).timestamp()

    comments_data = []
    for subreddit_name in settings.SUBREDDITS:
        try:
            subreddit = reddit.subreddit(subreddit_name)
            logger.info(f"Fetching comments from r/{subreddit_name}")
            for post in subreddit.new(limit=limit_posts):
                if post.created_utc < cutoff_ts:
                    continue  # post is older than the look-back window

                # limit=0 drops the unexpanded "more comments" placeholders
                # without fetching them. list() then flattens the whole
                # loaded tree, so replies are included, not just top-level
                # comments.
                post.comments.replace_more(limit=0)
                for comment in post.comments.list()[:max_comments_per_post]:
                    created = datetime.fromtimestamp(
                        comment.created_utc, tz=timezone.utc
                    )
                    comments_data.append({
                        "post_id": post.id,
                        "comment_id": comment.id,
                        "author": str(comment.author),
                        "body": comment.body,
                        # Stored as a naive UTC datetime so the CSV format
                        # stays the same as before (no "+00:00" suffix).
                        "created_utc": created.replace(tzinfo=None),
                        "score": comment.score,
                        "subreddit": subreddit_name,
                    })
        except Exception as e:
            # Best-effort per subreddit: log and move on to the next one.
            logger.error(f"Failed fetching comments for r/{subreddit_name}: {e}")

    df = pd.DataFrame(comments_data, columns=columns)
    # os.path.join works whether or not RAW_DATA_PATH ends with a separator.
    file_path = os.path.join(settings.RAW_DATA_PATH, "reddit_comments.csv")
    df.to_csv(file_path, index=False)
    logger.info(f"Saved {len(df)} comments to {file_path}")
    return df

# Script entry point: when this file is executed directly (not imported),
# run one fetch using fetch_comments' default arguments.
if __name__ == "__main__":
    fetch_comments()