# scripts/fetch_comments.py
import os
from datetime import datetime, timedelta, timezone

import pandas as pd
import praw

from config import settings
from utils.helpers import ensure_folder, logger
def fetch_comments(days=settings.FETCH_DAYS, limit_posts=None, max_comments_per_post=None):
    """Fetch recent comments from the configured subreddits and save them to CSV.

    Scans the newest posts of each subreddit in ``settings.SUBREDDITS``,
    collects comments from posts created within the look-back window, and
    writes the result to ``reddit_comments.csv`` under
    ``settings.RAW_DATA_PATH``.

    Args:
        days: Look-back window in days; posts older than this are skipped.
            (Note: the default is evaluated once at import time from
            ``settings.FETCH_DAYS``.)
        limit_posts: Max posts to scan per subreddit (None = PRAW's default
            listing behavior).
        max_comments_per_post: Cap on comments collected per post
            (None = no cap).

    Returns:
        pandas.DataFrame with one row per collected comment.
    """
    ensure_folder(settings.RAW_DATA_PATH)
    reddit = praw.Reddit(
        client_id=settings.REDDIT_CLIENT_ID,
        client_secret=settings.REDDIT_CLIENT_SECRET,
        user_agent=settings.REDDIT_USER_AGENT,
    )

    comments_data = []
    # Timezone-aware UTC replaces the deprecated datetime.utcnow() and keeps
    # the window comparison unambiguous.
    end_time = datetime.now(timezone.utc)
    start_time = end_time - timedelta(days=days)

    for subreddit_name in settings.SUBREDDITS:
        try:
            subreddit = reddit.subreddit(subreddit_name)
            logger.info(f"Fetching comments from r/{subreddit_name}")
            for post in subreddit.new(limit=limit_posts):
                post_time = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)
                if post_time >= start_time:
                    # limit=0 drops unresolved "load more" stubs without extra
                    # API calls; .list() then flattens the remaining comment
                    # tree at ALL depths (the original "only top-level" comment
                    # was wrong).
                    post.comments.replace_more(limit=0)
                    for comment in post.comments.list()[:max_comments_per_post]:
                        comments_data.append({
                            "post_id": post.id,
                            "comment_id": comment.id,
                            # str() turns a deleted author (None) into "None".
                            "author": str(comment.author),
                            "body": comment.body,
                            "created_utc": datetime.fromtimestamp(
                                comment.created_utc, tz=timezone.utc
                            ),
                            "score": comment.score,
                            "subreddit": subreddit_name,
                        })
        except Exception as e:
            # Best-effort per subreddit: one failure must not abort the rest.
            logger.error(f"Failed fetching comments for r/{subreddit_name}: {e}")

    df = pd.DataFrame(comments_data)
    # os.path.join is correct whether or not RAW_DATA_PATH ends with a
    # separator; the original f-string concatenation silently broke without one.
    file_path = os.path.join(settings.RAW_DATA_PATH, "reddit_comments.csv")
    df.to_csv(file_path, index=False)
    logger.info(f"Saved {len(df)} comments to {file_path}")
    return df
# Script entry point: run a fetch with all defaults from settings.
if __name__ == "__main__":
    fetch_comments()