embeddinggemma-tuning-lab / src /data_fetcher.py
bebechien's picture
Upload folder using huggingface_hub
beabfb7 verified
import feedparser
import pickle
import os
import time
from datetime import datetime
from typing import Tuple, Any, Optional
# Assuming AppConfig is passed in via dependency injection in the refactored main app.
def format_published_time(published_parsed: Optional[time.struct_time]) -> str:
    """Format a feedparser time struct as ``'YYYY-MM-DD HH:MM'`` (UTC).

    Args:
        published_parsed: A ``time.struct_time`` (or equivalent 9-tuple) such
            as a feed entry's ``published_parsed`` value, or ``None`` when the
            entry carries no date.

    Returns:
        The formatted timestamp, or ``'N/A'`` when the value is missing or
        cannot be converted.
    """
    # Function-scope imports: the module header does not import these names.
    import calendar
    from datetime import timezone

    if not published_parsed:
        return 'N/A'

    try:
        # feedparser documents its *_parsed structs as UTC. calendar.timegm
        # reads the struct as UTC, whereas time.mktime would read it as local
        # time and, with tm_isdst=0, skew the result by an hour during DST.
        utc_seconds = calendar.timegm(published_parsed)
        moment = datetime.fromtimestamp(utc_seconds, tz=timezone.utc)
        return moment.strftime('%Y-%m-%d %H:%M')
    except (TypeError, ValueError, OverflowError, OSError):
        # Malformed or out-of-range structs are reported as "no date" rather
        # than propagated; this helper is used for display only.
        return 'N/A'
def load_feed_from_cache(config: Any) -> Tuple[Optional[Any], str]:
    """Return the cached feed when the cache file exists and is still fresh.

    Reads ``config.CACHE_FILE`` (path of the pickle file) and
    ``config.CACHE_DURATION_SECONDS`` (maximum accepted age).

    Returns:
        A ``(feed, message)`` pair. ``feed`` is ``None`` on any miss (file
        absent, file too old, or file unreadable) and ``message`` says which
        case occurred.

    NOTE(review): the cache is deserialised with ``pickle``, which can execute
    code embedded in the file. Confirm the cache path is only writable by this
    application.
    """
    if not os.path.exists(config.CACHE_FILE):
        return None, "Cache file not found."

    try:
        # Age is measured from the file's last-modification time.
        age = time.time() - os.path.getmtime(config.CACHE_FILE)
        limit = config.CACHE_DURATION_SECONDS
        if age > limit:
            return None, f"Cache expired ({age:.0f}s old, limit is {limit}s)."

        with open(config.CACHE_FILE, 'rb') as cache_fh:
            cached_feed = pickle.load(cache_fh)
        return cached_feed, f"Loaded successfully from cache (Age: {age:.0f}s)."
    except Exception as err:
        # Any failure counts as a miss; the unusable file is removed so the
        # next call does not trip over it again.
        print(f"Warning: Failed to load cache file. Deleting corrupted cache. Reason: {err}")
        try:
            os.remove(config.CACHE_FILE)
        except OSError:
            pass  # Best effort: a failed cleanup must not mask the miss.
        return None, "Cache file corrupted or invalid. Will re-fetch."
def save_feed_to_cache(config: Any, feed: Any) -> None:
    """Pickle ``feed`` to ``config.CACHE_FILE`` without clobbering a good cache.

    The data is first written to a temporary sibling file and only swapped
    into place once serialisation has fully succeeded. If pickling fails
    part-way, any existing cache file is left untouched instead of being
    truncated.

    Args:
        config: Object exposing ``CACHE_FILE``, the destination path.
        feed: The object to cache; it must be picklable.

    Returns:
        None. Failures are reported on stdout and never raised, so a broken
        cache cannot abort the caller.
    """
    tmp_path = None
    try:
        cache_path = config.CACHE_FILE
        # The PID suffix keeps concurrent processes from sharing a temp file.
        tmp_path = f"{cache_path}.{os.getpid()}.tmp"
        with open(tmp_path, 'wb') as f:
            pickle.dump(feed, f)
        # os.replace overwrites the destination in a single rename step.
        os.replace(tmp_path, cache_path)
        print(f"Successfully saved new feed data to cache: {cache_path}")
    except Exception as e:
        print(f"Error saving to cache: {e}")
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # Temp file was never created or is already gone.
def read_hacker_news_rss(config: Any) -> Tuple[Optional[Any], str]:
    """Return the Hacker News RSS feed, preferring a fresh cached copy.

    The cache is consulted first via ``load_feed_from_cache``. On a miss the
    feed at ``config.HN_RSS_URL`` is fetched with ``feedparser.parse`` and,
    when it contains entries, stored with ``save_feed_to_cache``.

    Args:
        config: Object exposing ``HN_RSS_URL`` plus the cache settings read by
            the cache helpers.

    Returns:
        A ``(feed, status_message)`` pair. ``feed`` is the feedparser result,
        or ``None`` when the fetch failed or the feed had no entries.
    """
    url = config.HN_RSS_URL
    print(f"Attempting to fetch and parse RSS feed from: {url}")
    print("-" * 50)

    # 1. Attempt to load from cache
    feed, cache_status = load_feed_from_cache(config)
    print(f"Cache Status: {cache_status}")
    if feed is not None:
        return feed, cache_status

    # 2. Cache miss or stale: fetch from the web
    print("Starting network fetch...")
    try:
        feed = feedparser.parse(url)

        # feedparser only sets `status` when an HTTP response was received;
        # reading it as an attribute raises AttributeError after a network
        # failure, which would hide the real cause from the caller.
        http_status = feed.get('status')
        if http_status is not None and http_status >= 400:
            status_msg = f"Error fetching the feed. HTTP Status: {http_status}"
            print(status_msg)
            return None, status_msg

        if http_status is None and not feed.entries:
            # No response and nothing parsed: surface the underlying error.
            status_msg = (
                "Error fetching the feed. No HTTP response received. "
                f"Reason: {feed.get('bozo_exception')}"
            )
            print(status_msg)
            return None, status_msg

        if feed.bozo:
            # Bozo is set if any error occurred, even non-critical ones.
            print(f"Warning: Failed to fully parse the feed. Reason: {feed.get('bozo_exception')}")

        # 3. If fetch successful, save new data to cache
        if feed.entries:
            save_feed_to_cache(config, feed)
            status_msg = f"Successfully fetched and cached {len(feed.entries)} entries."
        else:
            status_msg = "Fetch successful, but no entries found in the feed."
            print(status_msg)
            feed = None  # Callers treat None as "nothing usable".
    except Exception as e:
        # Boundary handler: the caller always gets a (None, message) result.
        status_msg = f"An unexpected error occurred during network processing: {e}"
        print(status_msg)
        return None, status_msg

    return feed, status_msg
# Manual smoke test; not part of the module's public behaviour.
if __name__ == '__main__':
    # The relative import needs package context to resolve.
    from .config import AppConfig

    feed, status = read_hacker_news_rss(AppConfig)
    if not (feed and feed.entries):
        print(f"Could not fetch the feed. Status: {status}")
    else:
        print(f"\nFetched {len(feed.entries)} entries. Top 3 titles:")
        for entry in feed.entries[:3]:
            print(f"- {entry.title}")