| | import gradio as gr |
| | from datasets import load_dataset |
| | import pandas as pd |
| | import sys |
| | import subprocess |
| | from datetime import datetime |
| | from huggingface_hub import HfApi |
| |
|
def get_newest_file(repo_id, prefix):
    """Return the newest dated file with the given prefix from a HF dataset repo.

    Filenames are expected to carry a ``_YYYYMMDD`` stamp as their last
    underscore-separated token before the extension (e.g.
    ``submissions/df_20241105.csv``); files that do not parse are skipped.

    Args:
        repo_id: HuggingFace dataset repository id.
        prefix: Path prefix used to filter the repo file listing.

    Returns:
        The newest matching filename, or ``None`` when no parseable match exists.
    """
    api = HfApi()
    files = api.list_repo_files(repo_id, repo_type="dataset")
    relevant_files = [f for f in files if f.startswith(prefix)]

    if not relevant_files:
        return None

    file_dates = []
    for filename in relevant_files:
        try:
            # Date stamp is the last '_' token with the extension stripped.
            date_str = filename.split('_')[-1].split('.')[0]
            date = datetime.strptime(date_str, '%Y%m%d')
            file_dates.append((date, filename))
        except (IndexError, ValueError):
            continue  # not a dated file; ignore it

    if not file_dates:
        return None

    # max() is O(n); no need to sort the whole list just to take the top entry.
    # Keying on the date only preserves the original tie-breaking (first seen).
    return max(file_dates, key=lambda x: x[0])[1]
| |
|
def load_data(repo_id, file_path, rows=3):
    """Load a data file from a HuggingFace dataset repo as a DataFrame preview.

    Args:
        repo_id: HuggingFace dataset repository id.
        file_path: Path of the data file inside the repo.
        rows: Number of leading rows to return (default 3, matching the UI's
            "Top 3 Rows" tables; previously hard-coded).

    Returns:
        A DataFrame with the first ``rows`` rows on success, or a
        single-column ``Error`` DataFrame describing the failure.
    """
    try:
        dataset = load_dataset(repo_id, data_files={'train': file_path}, split='train')
        return pd.DataFrame(dataset).head(rows)
    except Exception as e:
        # Surface the failure in the UI table instead of crashing the app.
        return pd.DataFrame({'Error': [str(e)]})
| |
|
def praw_new_data():
    """Run ``praw.py`` to crawl new Reddit data, then refresh both previews.

    Returns:
        Tuple of (status message, crawled DataFrame, merged DataFrame),
        matching the Gradio outputs ``[status_text, crawled_table, merged_table]``.
    """
    try:
        # Argument-list form (shell=False); sys.executable keeps the same venv.
        subprocess.run([sys.executable, "praw.py"], check=True)
        success_message = "✅ Successfully crawled new data!"
    except Exception as e:
        # CalledProcessError (non-zero exit) or FileNotFoundError (missing script).
        success_message = f"❌ Error executing praw.py: {str(e)}"

    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    newest_file = get_newest_file(repo_id, "submissions/df_")

    if newest_file:
        df = load_data(repo_id, newest_file)
        return success_message, df, load_merged_data()[1]
    else:
        # NOTE(review): a crawl error message is discarded on this path.
        return "No crawled data files found", pd.DataFrame(), load_merged_data()[1]
| |
|
def merge_data():
    """Run ``merge.py`` to merge the latest datasets, then refresh both previews.

    Returns:
        Tuple of (status message, crawled DataFrame, merged DataFrame),
        matching the Gradio outputs ``[status_text, crawled_table, merged_table]``.
    """
    try:
        # Argument-list form (shell=False); sys.executable keeps the same venv.
        subprocess.run([sys.executable, "merge.py"], check=True)
        success_message = "✅ Successfully merged data!"
    except Exception as e:
        success_message = f"❌ Error executing merge.py: {str(e)}"

    # Reload both tables so the UI reflects the post-merge state.
    merged_df = load_merged_data()[1]
    crawled_df = load_crawled_data()[1]
    return success_message, crawled_df, merged_df
| |
|
def load_crawled_data():
    """Fetch the newest crawled-submissions preview.

    Returns:
        A ``(label, DataFrame)`` pair; the DataFrame is empty when no
        crawled file is available.
    """
    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    newest_file = get_newest_file(repo_id, "submissions/df_24")

    # Guard clause: bail out early when nothing matched.
    if not newest_file:
        return "No crawled data available", pd.DataFrame()

    return f"Latest crawled data ({newest_file}):", load_data(repo_id, newest_file)
| |
|
def load_merged_data():
    """Fetch the merged-data preview from its fixed CSV path.

    NOTE(review): the path uses ``submission/`` (singular) while crawled files
    live under ``submissions/`` — confirm this matches the repo layout.

    Returns:
        A ``(label, DataFrame)`` pair; on load failure ``load_data`` returns
        an ``Error`` DataFrame rather than raising.
    """
    repo_id = "Vera-ZWY/reddite2024elections_submissions"
    merged_path = "submission/merged_reddit_data.csv"

    # The path is a constant non-empty string, so the original
    # "if newest_merged:" guard could never be false; the unreachable
    # "No merged data available" branch has been removed.
    return f"Latest merged data ({merged_path}):", load_data(repo_id, merged_path)
| |
|
| | |
| | with gr.Blocks(title="Reddit Data Processing") as iface: |
| | gr.Markdown("# Reddit Data Processing Interface") |
| | |
| | |
| | status_text = gr.Textbox(label="Status", interactive=False) |
| | |
| | with gr.Row(): |
| | with gr.Column(): |
| | praw_button = gr.Button("Crawl New Data", variant="primary") |
| | with gr.Column(): |
| | merge_button = gr.Button("Merge Data", variant="primary") |
| | |
| | with gr.Row(): |
| | with gr.Column(): |
| | gr.Markdown("### Latest Crawled Data (Top 3 Rows)") |
| | crawled_table = gr.Dataframe( |
| | headers=["title", "score", "id", "url", "comms_num", "created", "body", "subreddit"], |
| | value=load_crawled_data()[1], |
| | wrap=True |
| | ) |
| | |
| | with gr.Row(): |
| | with gr.Column(): |
| | gr.Markdown("### Latest Merged Data (Top 3 Rows)") |
| | merged_table = gr.Dataframe( |
| | headers=["title", "score", "id", "url", "num_comments", "created", "body", "content", "subreddit"], |
| | value=load_merged_data()[1], |
| | wrap=True |
| | ) |
| | |
| | |
| | praw_button.click( |
| | fn=praw_new_data, |
| | outputs=[status_text, crawled_table, merged_table] |
| | ) |
| | |
| | merge_button.click( |
| | fn=merge_data, |
| | outputs=[status_text, crawled_table, merged_table] |
| | ) |
| | |
| | gr.Markdown(""" |
| | ## The full dataset storage at https://huggingface.co/datasets/Vera-ZWY/reddite2024elections_submissions/ |
| | ### Instructions: |
| | 1. Click 'Crawl New Data' to fetch new Reddit data |
| | 2. Click 'Merge Data' to merge the latest datasets |
| | 3. Tables will automatically update to show the latest data |
| | """) |
| |
|
| | |
| | if __name__ == "__main__": |
| | iface.launch() |