Hugging Face Spaces log: the Space failed with a runtime error (source of the app shown below).
| from datasets import load_dataset | |
| import pandas as pd | |
| from datetime import datetime | |
| from huggingface_hub import HfApi, HfFolder | |
| import time | |
| import logging | |
| from tqdm.auto import tqdm | |
| import os | |
# Authenticate against the Hugging Face Hub, then configure logging.
# NOTE(review): assumes the HF_TOKEN environment variable is set;
# os.getenv returns None otherwise and an invalid token gets saved — confirm upstream.
HfFolder.save_token(os.getenv("HF_TOKEN"))

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def load_huggingface_data(dataset_name, file1_name, file2_name):
    """Fetch two CSV files from a Hugging Face dataset repo as DataFrames.

    Args:
        dataset_name: Repository id of the dataset on the Hub.
        file1_name: Repo path of the first CSV file.
        file2_name: Repo path of the second CSV file.

    Returns:
        Tuple of two pandas DataFrames, one per file, in argument order.
    """
    # Same module-level logger the rest of the file uses.
    log = logging.getLogger(__name__)
    log.info("Loading datasets from Hugging Face...")

    # Each CSV is exposed as a single 'train' split.
    split_one = load_dataset(dataset_name,
                             data_files={'train': file1_name},
                             split='train')
    split_two = load_dataset(dataset_name,
                             data_files={'train': file2_name},
                             split='train')

    frame_one = pd.DataFrame(split_one)
    frame_two = pd.DataFrame(split_two)

    log.info(f"Loaded {len(frame_one)} rows from {file1_name}")
    log.info(f"Loaded {len(frame_two)} rows from {file2_name}")
    return frame_one, frame_two
def merge_newest(df1, df2):
    """Full-outer-join two submission frames on 'id' and reconcile columns.

    Columns present in both frames are collapsed into one: df1's value wins,
    df2's value fills the gaps (via combine_first). The result keeps only a
    fixed set of known columns, in canonical order.

    Args:
        df1: Preferred (newer) DataFrame; must contain an 'id' column.
        df2: Fallback (older) DataFrame; must contain an 'id' column.

    Returns:
        Merged DataFrame restricted to the known columns that exist.
    """
    # Equivalent to the file's module-level logger.
    logging.getLogger(__name__).info("Processing datasets...")

    # Full outer join on id; overlapping df2 columns get a '_y' suffix.
    merged_df = pd.merge(df1, df2,
                         on='id',
                         how='outer',
                         suffixes=('', '_y'))

    # BUG FIX (idiom): collect the suffixed columns up front instead of
    # dropping them one-by-one while iterating merged_df.columns — the old
    # loop mutated the frame it was iterating and worked only by accident.
    suffixed = [col for col in merged_df.columns if col.endswith('_y')]
    for col in suffixed:
        original_col = col[:-2]  # strip the '_y' suffix
        # Prefer df1's value; fall back to df2's where df1 is null.
        merged_df[original_col] = merged_df[original_col].combine_first(merged_df[col])
    merged_df = merged_df.drop(columns=suffixed)

    # Canonical output order; columns absent from the merge are skipped.
    desired_columns = ['title', 'score', 'id', 'url', 'num_comments',
                       'created', 'body', 'content', 'subreddit']
    final_columns = [col for col in desired_columns if col in merged_df.columns]
    return merged_df[final_columns]
def save_to_huggingface(df, repo_id):
    """Write *df* to a local CSV and upload it to a Hugging Face dataset repo.

    Args:
        df: DataFrame to persist.
        repo_id: Target dataset repository id on the Hub.

    Returns:
        str: The local CSV filename that was uploaded.
    """
    logger.info("Saving to Hugging Face...")

    # NOTE(review): get_newes_file() expects dated names
    # (merged_reddit_data_YYYYMMDD.csv); the dating code was deliberately
    # commented out here, so this overwrites one rolling file — confirm intent.
    filename = "merged_reddit_data.csv"

    # Save locally first.
    df.to_csv(filename, index=False)

    # Upload to Hugging Face under the submission/ folder.
    api = HfApi()
    api.upload_file(
        path_or_fileobj=filename,
        # BUG FIX: path_in_repo was a broken literal ("submission/(unknown)");
        # use the real filename so the repo path matches the reader side
        # ("submission/merged_reddit_data.csv").
        path_in_repo=f"submission/{filename}",
        repo_id=repo_id,
        repo_type="dataset",
    )
    return filename
def get_newes_file(repo_id):
    """
    Get the newest dated merged file from the HuggingFace repository.

    Scans the dataset repo for files whose basename matches
    merged_reddit_data_YYYYMMDD.csv, at any directory depth.

    Args:
        repo_id (str): The repository ID on HuggingFace.

    Returns:
        str: The repo path of the newest merged file.

    Raises:
        ValueError: If no merged file (or none with a parseable date) exists.
    """
    api = HfApi()
    # List all files in the repository (paths are repo-root-relative,
    # e.g. "submission/merged_reddit_data_20241105.csv").
    files = api.list_repo_files(repo_id, repo_type="dataset")

    # BUG FIX: match on the basename — startswith() on the full path never
    # matched files stored under a folder such as submission/.
    merged_files = [f for f in files
                    if os.path.basename(f).startswith('merged_reddit_data_')]
    if not merged_files:
        raise ValueError("No merged files found in repository")

    # Pair each file with its parsed date; skip undated/garbled names.
    file_dates = []
    for filename in merged_files:
        try:
            # Date is the last underscore token: ..._YYYYMMDD.csv
            date_str = filename.split('_')[-1].split('.')[0]
            date = datetime.strptime(date_str, '%Y%m%d')
            file_dates.append((date, filename))
        except (IndexError, ValueError):
            continue

    if not file_dates:
        raise ValueError("No valid dated files found")

    # Newest date wins; max() avoids a full sort.
    return max(file_dates, key=lambda pair: pair[0])[1]
def main():
    """Merge the newest dated submission file into the rolling master CSV."""
    # Source/target dataset repository on the Hub.
    repo_id = "Vera-ZWY/reddite2024elections_submissions"

    file_new = get_newes_file(repo_id)
    file_old = "submission/merged_reddit_data.csv"

    df1, df2 = load_huggingface_data(repo_id, file_new, file_old)
    print(f"Newest dataset shape: {df1.shape}")
    # NOTE(review): label says "Old" but this prints df1 (the newest frame);
    # message text kept as-is — confirm which frame was intended.
    print(f"Old dataset columns: {df1.columns.tolist()}")

    # BUG FIX: was process_data(df1, df2) — no such function exists
    # (NameError at runtime); the merge routine defined above is merge_newest.
    merged_df = merge_newest(df1, df2)

    output_file = save_to_huggingface(merged_df, repo_id)
    logger.info(f"Processing complete. File saved as {output_file}")
    return f"Processing complete. File saved as {output_file}. Old dataset columns: {merged_df.columns.tolist()}"


if __name__ == "__main__":
    main()