broadfield-dev committed on
Commit
d3c98a4
·
verified ·
1 Parent(s): 7072ceb

Update rss_processor.py

Browse files
Files changed (1) hide show
  1. rss_processor.py +28 -2
rss_processor.py CHANGED
@@ -10,6 +10,7 @@ import dateutil.parser
10
  import hashlib
11
  import json
12
  import re
 
13
 
14
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
15
  logger = logging.getLogger(__name__)
@@ -20,6 +21,7 @@ COLLECTION_NAME = "news_articles"
20
  HF_API_TOKEN = os.getenv("HF_TOKEN")
21
  REPO_ID = "broadfield-dev/news-rag-db"
22
  MAX_ARTICLES_PER_FEED = 1000
 
23
 
24
  def initialize_hf_api():
25
  if not HF_API_TOKEN:
@@ -45,6 +47,22 @@ def clean_text(text):
45
  text = ' '.join(text.split())
46
  return text.strip()
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def fetch_rss_feeds():
49
  articles = []
50
  seen_links = set()
@@ -65,7 +83,13 @@ def fetch_rss_feeds():
65
 
66
  try:
67
  logger.info(f"Fetching {feed_url}")
68
- feed = feedparser.parse(feed_url)
 
 
 
 
 
 
69
  if feed.bozo:
70
  logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
71
  continue
@@ -118,8 +142,10 @@ def fetch_rss_feeds():
118
  "category": category,
119
  "image": image,
120
  })
 
 
121
  except Exception as e:
122
- logger.error(f"Error fetching or parsing {feed_url}: {e}")
123
 
124
  logger.info(f"Total unique articles fetched: {len(articles)}")
125
  return articles
 
10
  import hashlib
11
  import json
12
  import re
13
+ import requests
14
 
15
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
16
  logger = logging.getLogger(__name__)
 
21
  HF_API_TOKEN = os.getenv("HF_TOKEN")
22
  REPO_ID = "broadfield-dev/news-rag-db"
23
  MAX_ARTICLES_PER_FEED = 1000
24
+ RAW_FEEDS_DIR = "raw_rss_feeds"
25
 
26
  def initialize_hf_api():
27
  if not HF_API_TOKEN:
 
47
  text = ' '.join(text.split())
48
  return text.strip()
49
 
50
def save_raw_rss_to_file(feed_url, content):
    """Save the raw RSS content of a feed to a local XML file.

    Args:
        feed_url: URL the content was fetched from; used only to derive
            a filesystem-safe filename.
        content: Raw RSS/XML text to persist.

    The file is written into RAW_FEEDS_DIR with a name derived from the
    URL (every non-alphanumeric character replaced by an underscore, plus
    a ".xml" suffix). Write failures are logged and swallowed so that
    feed processing can continue — persisting the raw feed is best-effort.
    """
    # exist_ok=True avoids the TOCTOU race of the exists()-then-makedirs()
    # pattern (another process/thread could create the dir in between).
    os.makedirs(RAW_FEEDS_DIR, exist_ok=True)

    # Derive a filesystem-safe filename from the URL.
    filename = re.sub(r'[^a-zA-Z0-9]', '_', feed_url) + ".xml"
    filepath = os.path.join(RAW_FEEDS_DIR, filename)

    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        logger.info(f"Saved raw RSS from {feed_url} to {filepath}")
    except OSError as e:
        # Narrowed from bare Exception: only filesystem errors are expected
        # here; keep the original best-effort semantics (log, don't raise).
        logger.error(f"Could not save raw RSS from {feed_url}: {e}")
66
  def fetch_rss_feeds():
67
  articles = []
68
  seen_links = set()
 
83
 
84
  try:
85
  logger.info(f"Fetching {feed_url}")
86
+ # Fetch raw content first to save it
87
+ response = requests.get(feed_url, headers={'User-Agent': 'Mozilla/5.0'})
88
+ response.raise_for_status()
89
+ raw_content = response.text
90
+ save_raw_rss_to_file(feed_url, raw_content)
91
+
92
+ feed = feedparser.parse(raw_content)
93
  if feed.bozo:
94
  logger.warning(f"Parse error for {feed_url}: {feed.bozo_exception}")
95
  continue
 
142
  "category": category,
143
  "image": image,
144
  })
145
+ except requests.exceptions.RequestException as e:
146
+ logger.error(f"Error fetching {feed_url}: {e}")
147
  except Exception as e:
148
+ logger.error(f"Error processing {feed_url}: {e}")
149
 
150
  logger.info(f"Total unique articles fetched: {len(articles)}")
151
  return articles