# import gradio as gr
import numpy as np
import pandas as pd
import praw
from huggingface_hub import HfApi, HfFolder
import time
import os
from datetime import datetime
# from tqdm import tqdm

HfFolder.save_token(os.getenv("HF_TOKEN"))
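
# Expected environment variables (configured as Space secrets):
#   HF_TOKEN           - Hugging Face write token for the dataset repo
#   PRAW_CLIENT_ID     - Reddit API client ID
#   PRAW_CLIENT_SECRET - Reddit API client secret
#   RPAW_AGENT         - Reddit API user agent string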

try:
    reddit = praw.Reddit(
        client_id=os.getenv("PRAW_CLIENT_ID"),
        client_secret=os.getenv("PRAW_CLIENT_SECRET"),
        user_agent=os.getenv("RPAW_AGENT"),
        check_for_async=False
    )
except praw.exceptions.PRAWException as e:
    print(f"PRAW Exception: {str(e)}")
except Exception as e:
    print(f"An error occurred: {str(e)}")
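
# Optional sanity check (praw exposes a read_only flag; with only client
# credentials the instance is read-only, which is enough for scraping):
# print(reddit.read_only)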

def scrape_reddit(subreddit_name=None, keywords=None, limit=1000):
    """Scrape posts from a subreddit (or all of Reddit) into a DataFrame."""
    posts_data = []

    if subreddit_name:
        subreddit = reddit.subreddit(subreddit_name)
        if keywords:
            posts = subreddit.search(keywords, limit=limit)
        else:
            posts = subreddit.hot(limit=limit)
    else:
        if not keywords:
            raise ValueError("Provide keywords when no subreddit_name is given")
        posts = reddit.subreddit("all").search(keywords, limit=limit)

    for post in posts:
        try:
            post_data = {
                "title": post.title,
                "score": post.score,
                "id": post.id,
                "url": post.url,
                "num_comments": post.num_comments,
                "created": datetime.fromtimestamp(post.created),
                "body": post.selftext,
                "subreddit": post.subreddit.display_name
            }
            posts_data.append(post_data)
            # Add a small delay to avoid hitting rate limits
            time.sleep(0.1)
        except praw.exceptions.PRAWException as e:
            print(f"Error processing post {post.id}: {str(e)}")
            continue

    df = pd.DataFrame(posts_data)
    if not df.empty:
        df['content'] = df['title'] + '\n' + df['body']
    return df
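
# Illustrative call (hypothetical subreddit and keyword, not used by the
# pipeline below):
#   df = scrape_reddit(subreddit_name="politics", keywords="election", limit=50)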

def get_comments(reddit, post_id, limit=100):
    """
    Get top comments from a specific post.

    Args:
        reddit: Reddit instance
        post_id (str): ID of the post to get comments from
        limit (int): Maximum number of comments to retrieve (default 100)

    Returns:
        pd.DataFrame: DataFrame containing top comments data
    """
    try:
        submission = reddit.submission(id=post_id)
        comments_data = []

        # Drop "MoreComments" placeholders instead of fetching them, to save time
        submission.comments.replace_more(limit=0)

        # Flatten the comment tree into a single list
        all_comments = submission.comments.list()

        # Sort comments by score and take the top ones
        sorted_comments = sorted(all_comments, key=lambda x: x.score, reverse=True)[:limit]

        for comment in sorted_comments:
            try:
                comment_data = {
                    'comment_id': comment.id,
                    'post_id': post_id,
                    'post_title': submission.title,
                    # 'author': str(comment.author) if comment.author else '[deleted]',
                    'body': comment.body,
                    'score': comment.score,
                    'created_utc': datetime.fromtimestamp(comment.created_utc)
                    # 'parent_id': comment.parent_id,
                    # 'is_submitter': comment.is_submitter
                }
                comments_data.append(comment_data)
            except Exception as e:
                print(f"Error processing comment {comment.id}: {str(e)}")
                continue

        # Create DataFrame and sort by score (highest first)
        df = pd.DataFrame(comments_data)
        if not df.empty:
            df = df.sort_values('score', ascending=False)
        return df

    except praw.exceptions.PRAWException as e:
        print(f"PRAW Exception while getting comments: {str(e)}")
        return pd.DataFrame()
    except Exception as e:
        print(f"Error getting comments: {str(e)}")
        return pd.DataFrame()
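
# Illustrative call (hypothetical post id):
#   comments_df = get_comments(reddit, "abc123", limit=50)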

def get_comments_and_upload(df, dataset_repo_id):
    # Initialize the Hugging Face API
    api = HfApi()
    existing_files = api.list_repo_files(repo_id=dataset_repo_id, repo_type="dataset")

    # Iterate over each submission in the DataFrame
    for index, row in df.iterrows():
        csv_file_path = f"comments_{row['id']}.csv"
        repo_csv_path = f"comments/{csv_file_path}"

        # Optionally skip files that already exist in the Hugging Face dataset
        # if repo_csv_path in existing_files:
        #     print(f"{csv_file_path} already exists in the dataset. Skipping upload.")
        #     continue

        # Fetch comments for the current submission
        comments_df = get_comments(reddit, row['id'])

        if len(comments_df) == 0:
            print(f"No comments found for {row['id']}")
            continue

        # Save the comments DataFrame as a CSV file
        comments_df.to_csv(csv_file_path, index=False)

        # Upload the CSV file to the Hugging Face dataset repository,
        # under a 'comments' folder in the dataset repo
        api.upload_file(
            path_or_fileobj=csv_file_path,
            path_in_repo=repo_csv_path,
            repo_id=dataset_repo_id,
            repo_type="dataset"
        )
        print(f"Uploaded {csv_file_path} to Hugging Face.")

        # Delete the local CSV file to save space
        os.remove(csv_file_path)

    print("All comments CSV files uploaded successfully!")

def main():
    try:
        # Search for election posts across Reddit
        df = scrape_reddit(keywords="election")

        if df is not None and not df.empty:
            print(f"Successfully scraped {len(df)} posts")

            # 'created' already holds datetimes; normalize and sort chronologically
            df['created'] = pd.to_datetime(df['created'])
            df = df.sort_values(by='created', ascending=True)

            # Keep only posts from 2024 onward
            df_24 = df[df['created'] > '2024-01-01'].reset_index(drop=True)

            dataset_repo_id = "Vera-ZWY/reddite2024elections_submissions"

            # Create the dataset repo if it doesn't exist yet
            api = HfApi()
            try:
                api.dataset_info(dataset_repo_id)
                print(f"Dataset {dataset_repo_id} already exists.")
            except Exception:
                print(f"Dataset {dataset_repo_id} will be created.")
                api.create_repo(repo_id=dataset_repo_id, repo_type="dataset")

            # Upload today's submissions as a dated CSV
            today_date = datetime.now().strftime('%Y%m%d')
            filename = f"df_24_{today_date}.csv"
            df_24.to_csv(filename, index=False)

            api.upload_file(
                path_or_fileobj=filename,
                path_in_repo=f"submissions/{filename}",
                repo_id=dataset_repo_id,
                repo_type="dataset"
            )

            get_comments_and_upload(df_24, dataset_repo_id)
        else:
            print("No data was retrieved")

    except Exception as e:
        print(f"Error in main: {str(e)}")


if __name__ == '__main__':
    main()