broadfield-dev committed on
Commit
880db9b
·
verified ·
1 Parent(s): e2b7a0a

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +37 -20
rss_processor.py CHANGED
@@ -12,9 +12,7 @@ import json
12
  import re
13
  import requests
14
  import pandas as pd
15
- from datasets import Dataset
16
-
17
-
18
 
19
 
20
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
@@ -296,29 +294,48 @@ def upload_to_hf_hub():
296
  except Exception as e:
297
  logger.error(f"Error uploading raw feeds to Hugging Face Hub: {e}", exc_info=True)
298
 
 
 
299
  try:
300
- logger.info(f"Uploading raw RSS feeds from 'local' to {DATASET_REPO_ID}...")
301
 
302
- # 2. Convert list to a Hugging Face Dataset
303
- with open('local_rss_store.json','r') as f:
304
- json_data=f.read()
305
- f.close()
306
- json_list = json.loads(json_data) # json_data is your JSON string
307
- timestamp = datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
308
- local_filename = f'{timestamp}.parquet'
309
 
310
- Dataset.from_list(json_list).to_parquet(local_filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
312
- hf_api.upload_file(
313
- path_or_fileobj=local_filename,
314
- path_in_repo=f"data/{timestamp}.parquet", # Recommended to keep in a 'data/' folder
315
- repo_id=DATASET_REPO_ID,
316
- repo_type="dataset"
317
- )
318
 
319
- logger.info(f"Raw feeds folder 'local' uploaded to: {DATASET_REPO_ID}")
320
  except Exception as e:
321
- logger.error(f"Error uploading raw feeds to Hugging Face Hub: {e}", exc_info=True)
322
 
323
 
324
 
 
12
  import re
13
  import requests
14
  import pandas as pd
15
+ from datasets import Dataset, load_dataset, concatenate_datasets
 
 
16
 
17
 
18
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
294
  except Exception as e:
295
  logger.error(f"Error uploading raw feeds to Hugging Face Hub: {e}", exc_info=True)
296
 
297
+
298
+
299
  try:
300
+ logger.info(f"Processing RSS feeds for {DATASET_REPO_ID}...")
301
 
302
+ # 1. Load Local JSON
303
+ with open('local_rss_store.json', 'r') as f:
304
+ json_list = json.load(f)
 
 
 
 
305
 
306
+ if not json_list:
307
+ logger.info("No local RSS data to upload.")
308
+ # return # Optional: Exit if empty
309
+ else:
310
+ # Create a HF Dataset object from the new local data
311
+ new_dataset = Dataset.from_list(json_list)
312
+
313
+ # 2. Try to Load Existing Dataset from the Hub
314
+ try:
315
+ # We load the existing dataset to append to it
316
+ existing_dataset = load_dataset(DATASET_REPO_ID, split="train")
317
+ logger.info(f"Found existing dataset with {len(existing_dataset)} rows.")
318
+
319
+ # OPTIONAL: Align features (columns) if RSS structure changes
320
+ # new_dataset = new_dataset.cast(existing_dataset.features)
321
+
322
+ # 3. Concatenate (Append)
323
+ final_dataset = concatenate_datasets([existing_dataset, new_dataset])
324
+ logger.info(f"Appending {len(new_dataset)} new rows. Total size: {len(final_dataset)}")
325
+
326
+ except Exception as e:
327
+ # If dataset doesn't exist yet, start fresh
328
+ logger.info(f"No existing dataset found (or error loading). Creating new. Details: {e}")
329
+ final_dataset = new_dataset
330
+
331
+ # 4. Push the Unified Dataset back to Hub
332
+ # This updates the main parquet file(s) cleanly
333
+ final_dataset.push_to_hub(DATASET_REPO_ID)
334
 
335
+ logger.info(f"Successfully pushed updated dataset to {DATASET_REPO_ID}")
 
 
 
 
 
336
 
 
337
  except Exception as e:
338
+ logger.error(f"Error appending RSS feeds to Hugging Face Hub: {e}", exc_info=True)
339
 
340
 
341