Spaces:
Sleeping
Sleeping
Update rss_processor.py
Browse files- rss_processor.py +37 -20
rss_processor.py
CHANGED
|
@@ -12,9 +12,7 @@ import json
|
|
| 12 |
import re
|
| 13 |
import requests
|
| 14 |
import pandas as pd
|
| 15 |
-
from datasets import Dataset
|
| 16 |
-
|
| 17 |
-
|
| 18 |
|
| 19 |
|
| 20 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
@@ -296,29 +294,48 @@ def upload_to_hf_hub():
|
|
| 296 |
except Exception as e:
|
| 297 |
logger.error(f"Error uploading raw feeds to Hugging Face Hub: {e}", exc_info=True)
|
| 298 |
|
|
|
|
|
|
|
| 299 |
try:
|
| 300 |
-
logger.info(f"
|
| 301 |
|
| 302 |
-
#
|
| 303 |
-
with open('local_rss_store.json','r') as f:
|
| 304 |
-
|
| 305 |
-
f.close()
|
| 306 |
-
json_list = json.loads(json_data) # json_data is your JSON string
|
| 307 |
-
timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
|
| 308 |
-
local_filename = f'{timestamp}.parquet'
|
| 309 |
|
| 310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
|
| 312 |
-
|
| 313 |
-
path_or_fileobj=local_filename,
|
| 314 |
-
path_in_repo=f"data/{timestamp}.parquet", # Recommended to keep in a 'data/' folder
|
| 315 |
-
repo_id=DATASET_REPO_ID,
|
| 316 |
-
repo_type="dataset"
|
| 317 |
-
)
|
| 318 |
|
| 319 |
-
logger.info(f"Raw feeds folder 'local' uploaded to: {DATASET_REPO_ID}")
|
| 320 |
except Exception as e:
|
| 321 |
-
logger.error(f"Error
|
| 322 |
|
| 323 |
|
| 324 |
|
|
|
|
| 12 |
import re
|
| 13 |
import requests
|
| 14 |
import pandas as pd
|
| 15 |
+
from datasets import Dataset, load_dataset, concatenate_datasets
|
|
|
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
|
|
|
| 294 |
except Exception as e:
|
| 295 |
logger.error(f"Error uploading raw feeds to Hugging Face Hub: {e}", exc_info=True)
|
| 296 |
|
| 297 |
+
|
| 298 |
+
|
| 299 |
try:
|
| 300 |
+
logger.info(f"Processing RSS feeds for {DATASET_REPO_ID}...")
|
| 301 |
|
| 302 |
+
# 1. Load Local JSON
|
| 303 |
+
with open('local_rss_store.json', 'r') as f:
|
| 304 |
+
json_list = json.load(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
|
| 306 |
+
if not json_list:
|
| 307 |
+
logger.info("No local RSS data to upload.")
|
| 308 |
+
# return # Optional: Exit if empty
|
| 309 |
+
else:
|
| 310 |
+
# Create a HF Dataset object from the new local data
|
| 311 |
+
new_dataset = Dataset.from_list(json_list)
|
| 312 |
+
|
| 313 |
+
# 2. Try to Load Existing Dataset from the Hub
|
| 314 |
+
try:
|
| 315 |
+
# We load the existing dataset to append to it
|
| 316 |
+
existing_dataset = load_dataset(DATASET_REPO_ID, split="train")
|
| 317 |
+
logger.info(f"Found existing dataset with {len(existing_dataset)} rows.")
|
| 318 |
+
|
| 319 |
+
# OPTIONAL: Align features (columns) if RSS structure changes
|
| 320 |
+
# new_dataset = new_dataset.cast(existing_dataset.features)
|
| 321 |
+
|
| 322 |
+
# 3. Concatenate (Append)
|
| 323 |
+
final_dataset = concatenate_datasets([existing_dataset, new_dataset])
|
| 324 |
+
logger.info(f"Appending {len(new_dataset)} new rows. Total size: {len(final_dataset)}")
|
| 325 |
+
|
| 326 |
+
except Exception as e:
|
| 327 |
+
# Any exception while loading (not only a missing dataset) lands here, and the upload starts fresh from the new rows alone
|
| 328 |
+
logger.info(f"No existing dataset found (or error loading). Creating new. Details: {e}")
|
| 329 |
+
final_dataset = new_dataset
|
| 330 |
+
|
| 331 |
+
# 4. Push the Unified Dataset back to Hub
|
| 332 |
+
# This updates the main parquet file(s) cleanly
|
| 333 |
+
final_dataset.push_to_hub(DATASET_REPO_ID)
|
| 334 |
|
| 335 |
+
logger.info(f"Successfully pushed updated dataset to {DATASET_REPO_ID}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
|
|
|
|
| 337 |
except Exception as e:
|
| 338 |
+
logger.error(f"Error appending RSS feeds to Hugging Face Hub: {e}", exc_info=True)
|
| 339 |
|
| 340 |
|
| 341 |
|