|
|
|
|
| import praw
|
| from prawcore.exceptions import RequestException, ServerError, ResponseException, Forbidden
|
| import os
|
| import time
|
| from dotenv import load_dotenv
|
|
|
|
|
# Run python-dotenv's loader before the credentials below are read from the
# process environment.
load_dotenv()

# Module-level PRAW client shared by the functions in this file. Only the
# application credentials and user agent are supplied (no username/password).
# A variable that is not set is passed to PRAW as None.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET"),
    user_agent=os.environ.get("REDDIT_USER_AGENT"),
)
|
|
|
def wait_and_retry(func, *args, retries=5, backoff_factor=2, **kwargs):
    """Call ``func(*args, **kwargs)``, retrying on transient Reddit API errors.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        retries: Maximum number of attempts. With ``0`` the callable is never
            invoked and ``None`` is returned.
        backoff_factor: Base of the exponential backoff; the wait before
            retry ``k`` is ``backoff_factor ** k`` seconds.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns on the first successful attempt, or ``None``
        when access is forbidden or every attempt failed. A ``None`` result is
        therefore ambiguous if ``func`` itself can legitimately return ``None``.

    Exceptions other than the prawcore ones handled below propagate to the
    caller unchanged.
    """
    for attempt in range(1, retries + 1):
        try:
            return func(*args, **kwargs)
        except Forbidden:
            # Must come before the ResponseException handler: Forbidden is a
            # subclass of ResponseException, so listing it afterwards would
            # make this branch unreachable and a 403 would be retried.
            print("Access forbidden. Skipping...")
            return None
        except (RequestException, ServerError, ResponseException) as e:
            if attempt >= retries:
                # Out of attempts: report the error but do not sleep, since
                # no further retry will follow.
                print(f"Error: {e}.")
                break
            wait_time = backoff_factor ** attempt
            print(f"Error: {e}. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)

    print(f"Failed after {retries} attempts.")
    return None
|
|
|
def _format_post(post):
    """Render one submission as a Markdown section."""
    return (
        f"### Title: {post.title}\n"
        f"**Subreddit:** {post.subreddit}\n"
        f"**URL:** {post.url}\n"
        f"**Content:** {post.selftext or 'No Content'}\n\n"
    )


def _format_comment(comment):
    """Render one comment, plus its parent comment when it has one, as Markdown."""
    section = (
        f"### Comment:\n{comment.body}\n"
        f"**Subreddit:** {comment.subreddit}\n"
        f"**Post:** {comment.submission.title}\n"
    )
    if not comment.is_root:
        parent = wait_and_retry(comment.parent)
        # Only a parent that is itself a comment is quoted; any other parent
        # object (or a failed lookup returning None) is skipped.
        if isinstance(parent, praw.models.Comment):
            section += f"**Parent Comment:** {parent.body}\n"
    return section + "\n"


def scrape_reddit_user(username, task_id, tasks):
    """Scrape a Reddit user's posts and comments into a Markdown string.

    Progress is reported by mutating ``tasks[task_id]`` in place: the keys
    ``progress``, ``total_posts``, ``total_comments``, ``scraped_posts``,
    ``scraped_comments`` and ``status`` are written as the scrape advances.

    Args:
        username: Reddit username to scrape.
        task_id: Key of this job's entry in ``tasks``.
        tasks: Mapping of task id to a mutable per-task dict.

    Returns:
        The Markdown document on success (``status`` is set to
        ``'Processing'``), or ``None`` on failure (``status`` is set to
        ``'Failed'`` when the task entry is reachable).
    """
    try:
        task = tasks[task_id]
        task['progress'] = 'Fetching user information...'

        # NOTE(review): PRAW appears to build Redditor objects lazily, so this
        # check presumably cannot detect a missing user by itself - confirm.
        user = wait_and_retry(reddit.redditor, username)
        if not user:
            print(f"Unable to fetch data for user: {username}")
            task['progress'] = 'Failed to fetch user data.'
            task['status'] = 'Failed'
            return None

        task['progress'] = 'Counting total posts and comments...'
        # Materialise each listing once, inside the retry wrapper. Iterating
        # the listing is what performs the requests, so this both puts the
        # network I/O under retry protection and avoids fetching every
        # listing twice (once to count, once to scrape).
        posts = wait_and_retry(lambda: list(user.submissions.new(limit=None)))
        comments = wait_and_retry(lambda: list(user.comments.new(limit=None)))
        if posts is None or comments is None:
            print(f"Unable to fetch data for user: {username}")
            task['progress'] = 'Failed to fetch user data.'
            task['status'] = 'Failed'
            return None

        total_posts = len(posts)
        total_comments = len(comments)
        task['total_posts'] = total_posts
        task['total_comments'] = total_comments
        task['progress'] = f"Total Posts: {total_posts}, Total Comments: {total_comments}\n"

        task['scraped_posts'] = 0
        task['scraped_comments'] = 0

        # NOTE(review): the heading glyphs below look like mis-encoded emoji;
        # they are kept byte-for-byte until the intended characters are known.
        parts = [f"# Reddit User: {username}\n\n## π Posts:\n\n"]

        task['progress'] = 'Scraping posts...'
        for post in posts:
            try:
                parts.append(_format_post(post))
            except Exception as post_error:
                # Best effort: one unreadable post must not abort the scrape.
                print(f"Error with post: {post_error}")
                continue
            task['scraped_posts'] += 1
            task['progress'] = f"Scraping posts... ({task['scraped_posts']}/{total_posts})"

        parts.append("\n## π¬ Comments:\n\n")

        task['progress'] = 'Scraping comments...'
        for comment in comments:
            try:
                parts.append(_format_comment(comment))
            except Exception as comment_error:
                # Best effort: one unreadable comment must not abort the scrape.
                print(f"Error with comment: {comment_error}")
                continue
            task['scraped_comments'] += 1
            task['progress'] = f"Scraping comments... ({task['scraped_comments']}/{total_comments})"

        print("\nScraping completed!")
        task['progress'] = 'Scraping completed. Processing data...'
        task['status'] = 'Processing'
        return "".join(parts)

    except Exception as error:
        # Report the failure instead of silently swallowing it; callers still
        # receive None, as before. KeyboardInterrupt/SystemExit now propagate.
        print(f"Scraping failed for user {username}: {error}")
        try:
            tasks[task_id]['progress'] = 'Failed to fetch user data.'
            tasks[task_id]['status'] = 'Failed'
        except (KeyError, TypeError):
            # No usable task entry to report into; the None return value is
            # the only failure signal left.
            pass
        return None