Spaces:
Sleeping
Sleeping
# scripts/fetch_posts.py
from datetime import datetime, timedelta, timezone

import pandas as pd
import praw
import requests

from config import settings
from utils.helpers import ensure_folder, logger
def fetch_posts(days=settings.FETCH_DAYS, limit=None):
    """Fetch recent posts from the configured subreddits via the Reddit API.

    Args:
        days: Size of the lookback window in days (default: settings.FETCH_DAYS).
        limit: Maximum posts to request per subreddit; None lets PRAW fetch
            as many as the listing allows.

    Returns:
        pandas.DataFrame of post records, also written to
        ``{settings.RAW_DATA_PATH}reddit_posts.csv``.
    """
    ensure_folder(settings.RAW_DATA_PATH)
    reddit = praw.Reddit(
        client_id=settings.REDDIT_CLIENT_ID,
        client_secret=settings.REDDIT_CLIENT_SECRET,
        user_agent=settings.REDDIT_USER_AGENT,
    )
    posts_data = []
    # datetime.utcnow()/utcfromtimestamp() are deprecated; compute in aware
    # UTC and strip tzinfo so stored values match the original naive format.
    end_time = datetime.now(timezone.utc).replace(tzinfo=None)
    start_time = end_time - timedelta(days=days)
    for subreddit_name in settings.SUBREDDITS:
        try:
            subreddit = reddit.subreddit(subreddit_name)
            logger.info(f"Fetching posts from r/{subreddit_name}")
            for post in subreddit.new(limit=limit):
                post_time = datetime.fromtimestamp(
                    post.created_utc, tz=timezone.utc
                ).replace(tzinfo=None)
                if post_time < start_time:
                    # new() yields newest-first, so every remaining post is
                    # older than the window — stop instead of iterating on.
                    break
                posts_data.append({
                    "id": post.id,
                    "subreddit": subreddit_name,
                    "title": post.title,
                    "text": post.selftext,
                    "author": str(post.author),  # author may be None (deleted)
                    "created_utc": post_time,
                    "score": post.score,
                    "num_comments": post.num_comments,
                    "permalink": f"https://reddit.com{post.permalink}",
                })
        except Exception as e:
            # Best-effort per subreddit: one failure must not abort the rest.
            logger.error(f"Failed to fetch posts from r/{subreddit_name}: {e}")
    df = pd.DataFrame(posts_data)
    # NOTE(review): assumes RAW_DATA_PATH ends with a path separator — confirm.
    file_path = f"{settings.RAW_DATA_PATH}reddit_posts.csv"
    df.to_csv(file_path, index=False)
    logger.info(f"Saved {len(df)} posts to {file_path}")
    return df
def fetch_posts_pushshift(subreddit, start_epoch, end_epoch, limit=500):
    """Fetch posts via the Pushshift API as a fallback data source.

    Args:
        subreddit: Subreddit name (without the ``r/`` prefix).
        start_epoch: Window start as a Unix timestamp (seconds, UTC).
        end_epoch: Window end as a Unix timestamp (seconds, UTC).
        limit: Maximum number of submissions to request.

    Returns:
        List of post dicts matching the Reddit-API record shape; empty list
        on any request/parse failure (logged, never raised).
    """
    url = (
        "https://api.pushshift.io/reddit/submission/search/"
        f"?subreddit={subreddit}&after={start_epoch}&before={end_epoch}&size={limit}"
    )
    try:
        # Explicit timeout: without one, a stalled connection hangs forever.
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        data = resp.json()["data"]
        posts = []
        for p in data:
            ts = p.get("created_utc")
            if ts is None:
                # Skip malformed records instead of raising inside the loop,
                # which would discard every post fetched so far.
                continue
            posts.append({
                "id": p.get("id"),
                "subreddit": subreddit,
                "title": p.get("title"),
                "text": p.get("selftext"),
                "author": p.get("author"),
                "created_utc": datetime.fromtimestamp(
                    ts, tz=timezone.utc
                ).replace(tzinfo=None),
                "score": p.get("score"),
                "num_comments": p.get("num_comments"),
                "permalink": f"https://reddit.com{p.get('permalink')}",
            })
        return posts
    except Exception as e:
        # Best-effort fallback source: log and return nothing on failure.
        logger.error(f"Pushshift fetch failed for {subreddit}: {e}")
        return []
def fetch_posts_with_fallback(subreddit, days=settings.FETCH_DAYS, limit=None):
    """Fetch posts from the Reddit API, topping up from Pushshift if short.

    Args:
        subreddit: Subreddit name used for the Pushshift fallback query.
        days: Size of the lookback window in days.
        limit: Target number of posts; when the Reddit API returns fewer,
            the remainder is requested from Pushshift.

    Returns:
        pandas.DataFrame of deduplicated post records, also written to
        ``{settings.RAW_DATA_PATH}reddit_posts.csv``.
    """
    # Use timezone-aware datetimes: calling .timestamp() on a naive
    # utcnow() result interprets it as LOCAL time, producing epochs that
    # are off by the UTC offset on any non-UTC machine.
    end_time = datetime.now(timezone.utc)
    start_time = end_time - timedelta(days=days)
    start_epoch = int(start_time.timestamp())
    end_epoch = int(end_time.timestamp())
    df_posts = fetch_posts(days=days, limit=limit)
    posts = df_posts.to_dict("records")
    if limit and len(posts) < limit:
        remaining = limit - len(posts)
        logger.info(
            f"Reddit API returned {len(posts)} posts, fetching {remaining} more from Pushshift..."
        )
        pushshift_posts = fetch_posts_pushshift(
            subreddit, start_epoch, end_epoch, limit=remaining
        )
        posts += pushshift_posts
        # Deduplicate by id, keeping the FIRST occurrence so the live
        # Reddit-API record wins over a stale Pushshift duplicate.
        posts_dict = {}
        for p in posts:
            posts_dict.setdefault(p["id"], p)
        posts = list(posts_dict.values())
    df_posts = pd.DataFrame(posts)
    ensure_folder(settings.RAW_DATA_PATH)
    file_path = f"{settings.RAW_DATA_PATH}reddit_posts.csv"
    df_posts.to_csv(file_path, index=False)
    logger.info(f"Saved {len(df_posts)} combined posts to {file_path}")
    return df_posts