# suppfactsdaily / dev_main.py
# RidhiD.
# Add multi-stage scraper fallback and force re-scrape for pre-sources cache entries
# 068fb63
"""
Dev flow orchestration script for content scraping and email delivery.

This module coordinates the dev workflow:
1. Scrape article content (or load it from the scrape cache)
2. Generate SEO metadata and tags, then rewrite the title and body
3. Fetch featured image
4. Generate HTML webpage
5. Send HTML email to recipient (currently disabled: the step is commented out)
6. Summary
"""
import html
import os
import sys

from logger import logger
from config import (
    validate_config,
    DEV_EMAIL_RECIPIENT,
    DEV_CACHE_CONTENT,
    DEV_CACHE_SEO,
    DEV_CACHE_IMAGE,
)
from index_manager import IndexManager
from scraper import ArticleScraper
from content_processor import ContentProcessor
from image_handler import ImageHandler
from html_handler import HTMLHandler
from email_handler import EmailHandler
from status_tracker import StatusTracker


def get_or_scrape_content(url):
    """
    Return article data for a URL, preferring the scrape cache.

    A cached entry is reused only when it is non-empty and carries a
    'sources' key; an entry without that key is treated as old-format and
    the article is scraped again. Freshly scraped data is written back to
    the cache before being returned.

    Args:
        url (str): Article URL
    Returns:
        dict: Article data; 'title' and 'body' are read from it here, and
            'sources' is stored alongside them when present.
    Raises:
        Exception: Any error from the cache or the scraper is logged and
            re-raised unchanged.
    """
    try:
        cache = IndexManager()

        if cache.url_exists(url):
            logger.info(f"URL found in cache: {url}")
            cached = cache.load_scraped_data(url)
            if cached:
                if "sources" in cached:
                    return cached
                # Non-empty entry without 'sources': fall through and re-scrape.
                logger.info("Cached entry is missing 'sources' key (old format) — re-scraping to refresh")

        logger.info(f"Scraping fresh content: {url}")
        fresh = ArticleScraper().scrape(url)

        # Persist the fresh result so the next run can hit the cache.
        cache.save_scraped_data(
            url,
            fresh["title"],
            fresh["body"],
            sources=fresh.get("sources", []),
        )
        return fresh
    except Exception as exc:
        logger.error(f"Failed to get/scrape content: {exc}")
        raise
def _collect_focus_words(seo_data):
    """Return the focus keyword followed by any secondary keywords in ``seo_data``."""
    return [seo_data.get("focus_keyword", "")] + seo_data.get("secondary_keywords", [])


def _build_featured_image_html(image_info, img_alt):
    """
    Build the featured-image figure followed by its photo-credit paragraph.

    Args:
        image_info (dict): Must contain 'url'; 'credit' and 'credit_url' are optional.
        img_alt (str): Alternative text for the image.
    Returns:
        str: HTML fragment (figure, then credit paragraph).
    """
    def esc(value):
        # Values come from outside this module (image API / AI output), so
        # escape &, <, > and quotes before placing them in text or attributes.
        return html.escape(str(value), quote=True)

    image_src = esc(image_info["url"])
    credit_name = image_info.get("credit", "")
    credit_url = image_info.get("credit_url", "https://unsplash.com")

    if credit_name:
        photo_credit_html = (
            '<p style="font-size:0.75rem;color:#888888;text-align:left;margin-top:0.25rem;padding-left:0;">'
            f'Photo by <a href="{esc(credit_url)}" target="_blank" rel="noopener">{esc(credit_name)}</a> on '
            '<a href="https://unsplash.com" target="_blank" rel="noopener">Unsplash</a>'
            '</p>'
        )
    else:
        photo_credit_html = (
            '<p style="font-size:0.75rem;color:#888888;text-align:left;margin-top:0.25rem;padding-left:0;">'
            'Photo via <a href="https://unsplash.com" target="_blank" rel="noopener">Unsplash</a>'
            '</p>'
        )

    return (
        '<figure style="margin:0;padding:0;width:100%;">'
        f'<img src="{image_src}" alt="{esc(img_alt)}" '
        'style="width:100%;height:240px;object-fit:cover;object-position:center;display:block;">'
        '</figure>'
        + photo_credit_html
    )


def process_article_dev(url, email_recipient=None):
    """
    Dev workflow to process an article and generate its HTML page.

    Steps: retrieve content, generate SEO data, rewrite title/body, fetch a
    featured image, generate the HTML file, and record the outcome with the
    status tracker. The email step is currently disabled (kept commented out).

    Args:
        url (str): Article URL to process
        email_recipient (str, optional): Email recipient. Defaults to config value.
            Only relevant to the disabled email step.
    Returns:
        dict: 'title' (rewritten title), 'tags', and 'html_file' (path of the
            generated HTML file).
    Raises:
        Exception: If any step fails. The failure is recorded via the status
            tracker (best effort) and the original exception is re-raised.
    """
    if email_recipient is None:
        email_recipient = DEV_EMAIL_RECIPIENT
    status_tracker = StatusTracker("dev")

    # Pre-bound so the failure handler can use them without probing locals().
    article_data = None
    # Name of the step currently running; None while outside a tracked step.
    current_step = None
    try:
        logger.info("=" * 70)
        logger.info(f"Starting DEV flow - article processing: {url}")
        logger.info("=" * 70)
        status_tracker.mark_in_progress(url)

        # Step 1: Get or scrape content
        current_step = "step_1_scrape"
        logger.info("\n[1/6] Retrieving article content...")
        article_data = get_or_scrape_content(url)
        original_title = article_data["title"]
        original_body = article_data["body"]
        logger.info(f"✓ Content retrieved: {original_title} and size is {len(original_body)} characters")
        logger.debug(f"Original body: {original_body[:300000]}")

        # Decide which parts of the dev cache can be reused. A part counts as
        # cached only if every key read from it below is actually present.
        current_step = None
        index_manager = IndexManager()
        dev_cache = index_manager.load_dev_cache(url)
        has_cache = dev_cache is not None
        cached_content = (
            DEV_CACHE_CONTENT
            and has_cache
            and bool(dev_cache.get("title"))
            and bool(dev_cache.get("body"))
        )
        cached_seo = DEV_CACHE_SEO and has_cache and bool(dev_cache.get("seo_data"))
        cached_image = (
            DEV_CACHE_IMAGE
            and has_cache
            and bool((dev_cache.get("image_info") or {}).get("url"))
        )

        # Step 2: Generate SEO data first
        # SEO generation and rewriting share the "step_2_rewrite" failure label
        # so the set of failure-step values stays the same as before.
        current_step = "step_2_rewrite"
        processor = None
        if cached_seo:
            logger.info("\n[2/6] Loading cached SEO data...")
            seo_data = dev_cache["seo_data"]
            seo_focus_words = _collect_focus_words(seo_data)
            tags = seo_data.get("tags", [])
            logger.info(f"✓ Reused cached SEO: focus_keyword='{seo_data.get('focus_keyword')}', {len(tags)} tags")
        else:
            logger.info("\n[2/6] Generating SEO metadata with AI...")
            processor = ContentProcessor()
            seo_data = processor.generate_seo_data(original_title, original_body)
            seo_focus_words = _collect_focus_words(seo_data)
            tags = seo_data.get("tags", [])
            logger.info(f"✓ SEO generated: focus_keyword='{seo_data.get('focus_keyword')}', {len(tags)} tags")

        # Step 2b: Rewrite content (title + body)
        if cached_content:
            logger.info("\n[2b/6] Loading cached content (title + body)...")
            new_title = dev_cache["title"]
            new_body = dev_cache["body"]
            logger.info(f"✓ Reused cached content: {new_title}")
        else:
            logger.info("\n[2b/6] Rewriting content with AI...")
            if processor is None:
                processor = ContentProcessor()
            processed_data = processor.rewrite_content(
                original_title,
                original_body,
                url,
                seo_focus_words,
                sources=article_data.get("sources", []),
            )
            new_title = processed_data["title"]
            new_body = processed_data["body"]
            logger.info(f"✓ Content rewritten: {new_title}")

        # Step 3: Fetch featured image
        current_step = "step_3_image"
        image_handler = ImageHandler()
        if cached_image:
            logger.info("\n[3/6] Reusing cached featured image for dev flow...")
            image_info = dev_cache["image_info"]
            # Return value is not used in this flow; the call is kept as in the
            # original (presumably it validates/stores the image — TODO confirm).
            image_handler.download_image(image_info["url"])
            logger.info(f"✓ Reused image: {image_info.get('url')}")
        else:
            logger.info("\n[3/6] Fetching featured image...")
            image_info = image_handler.fetch_image(tags)
            image_handler.download_image(image_info["url"])
            # 'credit' is optional further down, so do not let the log line raise.
            logger.info(f"✓ Image fetched from {image_info.get('credit', '')}")

        current_step = None
        if DEV_CACHE_CONTENT or DEV_CACHE_SEO or DEV_CACHE_IMAGE:
            index_manager.save_dev_cache(
                url,
                title=new_title,
                body=new_body,
                tags=tags,
                image_info=image_info,
                seo_data=seo_data,
            )

        # Step 4: Generate HTML webpage
        current_step = "step_4_html"
        logger.info("\n[4/6] Generating HTML webpage...")
        html_handler = HTMLHandler()

        # Create featured image HTML. The focus-word list always has a first
        # element, but it may be empty; fall back to the title in that case.
        focus_keyword = seo_focus_words[0] if seo_focus_words else ""
        img_alt = focus_keyword or new_title
        featured_image_html = _build_featured_image_html(image_info, img_alt)
        enhanced_body = (
            featured_image_html
            + '\n<div style="max-width:100%;overflow-x:hidden;box-sizing:border-box;word-break:break-word;overflow-wrap:break-word;">'
            + new_body
            + "</div>"
        )

        # Generate and save HTML
        html_file_path = html_handler.generate_and_save(
            title=new_title,
            body=enhanced_body,
            tags=tags,
            image_url=image_info["url"],
            original_url=url,
        )
        logger.info(f"✓ HTML file generated: {html_file_path}")

        # Extract filename from path
        html_filename = os.path.basename(html_file_path)
        current_step = None

        # NOTE: Step 5 is disabled; the code below is kept for reference.
        # # Step 5: Send HTML email
        # logger.info("\n[5/6] Saving email file...")
        # try:
        #     email_handler = EmailHandler()
        #     # Read HTML content from file
        #     with open(html_file_path, "r", encoding="utf-8") as f:
        #         html_content = f.read()
        #     # Save email file
        #     email_file_path = email_handler.save_article_email(
        #         recipient=email_recipient,
        #         article_title=new_title,
        #         html_content=html_content,
        #     )
        #     logger.info(f"✓ Email saved to {email_file_path}")
        # except ValueError as e:
        #     logger.warning(f"Email not configured: {e}")
        #     logger.info("Skipping email step - configure EMAIL_SENDER to enable")
        #     email_file_path = None
        # except (ConnectionRefusedError, OSError) as e:
        #     logger.warning(f"Mail server not available: {e}")
        #     logger.info("Skipping email step - mail server may not be running. Install Postfix: sudo apt install postfix")
        # except Exception as e:
        #     logger.warning(f"Failed to send email (continuing anyway): {e}")

        # Step 6: Summary
        logger.info("\n[6/6] Processing complete!")
        logger.info("=" * 70)
        logger.info("✅ SUCCESS - DEV flow completed")
        logger.info(f" Title: {new_title}")
        logger.info(f" Tags: {', '.join(tags)}")
        logger.info(f" HTML File: {html_file_path}")
        logger.info("=" * 70 + "\n")

        # Mark as successful
        status_tracker.mark_successful(
            url=url,
            filename=html_filename,
            title=original_title,
            new_title=new_title,
        )
        return {
            "title": new_title,
            "tags": tags,
            "html_file": html_file_path,
        }
    except Exception as e:
        logger.error("=" * 70)
        logger.error(f"❌ FAILED - DEV flow failed: {e}")
        logger.error("=" * 70)

        # Use the tracked step; outside a tracked step fall back to a
        # truncated copy of the error message.
        failure_step = current_step or f"unknown_error: {str(e)[:50]}"

        # Mark as failed (best effort: a tracker problem must not mask the
        # original error, but it should not vanish silently either).
        try:
            original_title = article_data.get("title", "") if article_data else ""
            status_tracker.mark_failed(
                url=url,
                failure_step=failure_step,
                title=original_title,
            )
        except Exception as tracking_error:
            logger.error(f"Failed to record failure status for {url}: {tracking_error}")
        raise
def main(urls=None):
    """
    Main entry point for dev flow.

    Validates the configuration, then processes each URL in order. The first
    failure is logged and terminates the process with exit status 1.

    Args:
        urls (list[str], optional): Article URLs to process. Defaults to the
            built-in list below.
    """
    # Validate configuration. Only a ValueError raised here is reported as a
    # configuration error; errors raised while processing articles are
    # reported as fatal errors further down.
    try:
        logger.info("Validating configuration...")
        validate_config()
        logger.info("✓ Configuration valid")
    except ValueError as e:
        logger.error(f"Configuration error: {e}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        sys.exit(1)

    # Process article
    # url = "https://www.healthline.com/nutrition/12-omega-3-rich-foods"
    # url = "https://www.healthline.com/health/10-gut-foods"
    if urls is None:
        urls = [
            # "https://www.healthline.com/health/the-benefits-of-biotin",
            # "https://www.healthline.com/nutrition/12-omega-3-rich-foods",
            # "https://www.healthline.com/nutrition/how-much-collagen-per-day",
            # "https://www.healthline.com/health/beauty-skin-care/supplements-for-better-skin",
            # "https://www.healthline.com/nutrition/ashwagandha",
            # "https://www.healthline.com/health/10-gut-foods",
            # "https://indianexpress.com/article/lifestyle/food-wine/icmr-protein-supplements-powder-health-kidney-bones-9318508/",
            # "https://www.verywellhealth.com/keto-diet-long-term-risks-5197991",
            # "https://www.verywellhealth.com/best-time-of-day-to-eat-your-fiber-11945630",
            # "https://www.webmd.com/vitamins/ai/ingredientmono-464/gamma-aminobutyric-acid-gaba#overview",
            # "https://www.webmd.com/vitamins-supplements/activated-charcoal",
            # "https://www.webmd.com/vitamins-supplements/5-htp",
            # "https://www.webmd.com/vitamins/ai/ingredientmono-1101/holy-basil",
            # "https://www.webmd.com/vitamins/ai/ingredientmono-1062/hyaluronic-acid",
            # "https://www.webmd.com/vitamins/ai/ingredientmono-875/l-arginine",
            # "https://www.webmd.com/vitamins-supplements/evening-primrose-oil",
            # "https://www.webmd.com/vitamins/ai/ingredientmono-1242/moringa",
            # "https://www.webmd.com/vitamins-supplements/xylitol",
            # "https://www.healthline.com/nutrition/best-testosterone-booster-supplements#our-picks",
            # "https://www.verywellhealth.com/keto-diet-long-term-risks-5197991",
            # Batch2
            # "https://www.healthline.com/nutrition/best-testosterone-booster-supplements#our-picks",
            # "https://www.medicalnewstoday.com/articles/how-much-protein-do-you-need-to-build-muscle#How-much-protein-do-you-need",
            # to completed
            # "https://www.medicalnewstoday.com/articles/is-it-better-to-eat-several-small-meals-or-fewer-larger-ones",
            # "https://www.medicalnewstoday.com/articles/not-all-plant-based-diets-are-the-same-junk-veggie-food-and-its-impact-on-health",
            # "https://www.medicalnewstoday.com/articles/intermittent-fasting-is-it-all-its-cracked-up-to-be",
            # "https://www.who.int/health-topics/nutrition#tab=tab_1",
            # failing
            # "https://publichealth.jhu.edu/2025/the-evidence-behind-seed-oils-health-effects",
            # "https://nutritionsource.hsph.harvard.edu/what-should-you-eat/vegetables-and-fruits",
            # "https://health.clevelandclinic.org/plant-based-milk-options",
            # "https://www.nhs.uk/live-well/eat-well/food-guidelines-and-food-labels/the-eatwell-guide/",
            # "https://www.futureoffood.ox.ac.uk/improving-diet-and-nutrition",
            # "https://medlineplus.gov/ency/article/002465.htm",
            # "https://www.bbc.com/future/article/20260424-diet-why-enjoying-your-food-is-key-to-weight-loss",
            # "https://www.fda.gov/food/nutrition-food-labeling-and-critical-foods/fdas-nutrition-initiatives",
            # "https://www.medindia.net/news/healthwatch/can-peanut-butter-keep-you-stronger-as-you-age-223205-1.htm",
            # "https://www.healthline.com/health/type-2-diabetes/basal-insulin-types-benefits-dosage-side-effects",
            "https://www.webmd.com/vitamins/ai/ingredientmono-707/java-tea#overview",
        ]

    # Loop through the list and run the dev flow for each URL. The first
    # failure stops the batch, as before.
    try:
        for url in urls:
            process_article_dev(url)
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        sys.exit(1)
# Run the dev flow only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()