Spaces:

ginntonicfun
/

suppfactsdaily

Sleeping

RidhiD.

Add multi-stage scraper fallback and force re-scrape for pre-sources cache entries

068fb63 18 days ago

15.2 kB

	"""
	Dev flow orchestration script for content scraping and email delivery.

	This module coordinates the dev workflow:
	1. Scrape article content
	2. Rewrite and generate tags
	3. Fetch featured image
	4. Generate HTML webpage
	5. Send HTML email to recipient (currently disabled: this step is commented out in process_article_dev)
	6. Summary
	"""

	import sys
	import os
	from logger import logger
	from config import (
	validate_config,
	DEV_EMAIL_RECIPIENT,
	DEV_CACHE_CONTENT,
	DEV_CACHE_SEO,
	DEV_CACHE_IMAGE,
	)
	from index_manager import IndexManager
	from scraper import ArticleScraper
	from content_processor import ContentProcessor
	from image_handler import ImageHandler
	from html_handler import HTMLHandler
	from email_handler import EmailHandler
	from status_tracker import StatusTracker


	def get_or_scrape_content(url):
	    """
	    Return article data for a URL, preferring the scrape cache.

	    A cached entry is reused only when it contains a "sources" key. A cached
	    entry without that key is treated as an old format and the URL is scraped
	    again. Freshly scraped data is written to the cache before being returned.

	    Args:
	        url (str): Article URL

	    Returns:
	        dict: The cached entry, or the result of ArticleScraper.scrape().
	            This function reads 'title', 'body' and (optionally) 'sources'
	            from a fresh scrape result.

	    Raises:
	        Exception: Any error is logged and then re-raised unchanged.
	    """
	    try:
	        cache_index = IndexManager()

	        if cache_index.url_exists(url):
	            logger.info(f"URL found in cache: {url}")
	            cached_entry = cache_index.load_scraped_data(url)
	            if cached_entry:
	                if "sources" in cached_entry:
	                    return cached_entry
	                # Entry predates the 'sources' field: fall through and refresh it.
	                logger.info("Cached entry is missing 'sources' key (old format) — re-scraping to refresh")

	        logger.info(f"Scraping fresh content: {url}")
	        scraped = ArticleScraper().scrape(url)

	        # Persist the fresh result so the next run can be served from cache.
	        cache_index.save_scraped_data(
	            url,
	            scraped["title"],
	            scraped["body"],
	            sources=scraped.get("sources", []),
	        )
	        return scraped

	    except Exception as e:
	        logger.error(f"Failed to get/scrape content: {e}")
	        raise


	def _build_featured_image_html(image_info, img_alt):
	    """
	    Build the featured-image <figure> markup followed by the photo credit line.

	    Every interpolated value is HTML-escaped so that quotes, ampersands or
	    angle brackets in the image URL, credit fields or alt text cannot break
	    the generated markup.

	    Args:
	        image_info (dict): Must contain 'url'; may contain 'credit' and 'credit_url'.
	        img_alt (str): Alt text for the image.

	    Returns:
	        str: HTML fragment (figure + credit paragraph).
	    """
	    # Local import keeps this block self-contained; the file does not import html.
	    from html import escape

	    image_url = escape(str(image_info["url"]), quote=True)
	    alt_text = escape(str(img_alt), quote=True)
	    credit_name = image_info.get("credit", "")
	    credit_url = image_info.get("credit_url", "https://unsplash.com")

	    if credit_name:
	        photo_credit_html = (
	            f'<p style="font-size:0.75rem;color:#888888;text-align:left;margin-top:0.25rem;padding-left:0;">'
	            f'Photo by <a href="{escape(str(credit_url), quote=True)}" target="_blank" rel="noopener">'
	            f'{escape(str(credit_name))}</a> on '
	            f'<a href="https://unsplash.com" target="_blank" rel="noopener">Unsplash</a>'
	            f'</p>'
	        )
	    else:
	        photo_credit_html = (
	            '<p style="font-size:0.75rem;color:#888888;text-align:left;margin-top:0.25rem;padding-left:0;">'
	            'Photo via <a href="https://unsplash.com" target="_blank" rel="noopener">Unsplash</a>'
	            '</p>'
	        )

	    return (
	        f'<figure style="margin:0;padding:0;width:100%;">'
	        f'<img src="{image_url}" alt="{alt_text}" '
	        f'style="width:100%;height:240px;object-fit:cover;object-position:center;display:block;">'
	        f'</figure>'
	        + photo_credit_html
	    )


	def process_article_dev(url, email_recipient=None):
	    """
	    Dev workflow to process an article and generate its HTML page.

	    Steps: retrieve content (cache or scrape), generate SEO data, rewrite the
	    content, fetch a featured image, generate the HTML file, and record the
	    outcome in the status tracker. The email step is currently commented out.

	    Args:
	        url (str): Article URL to process
	        email_recipient (str, optional): Email recipient. Defaults to config value.
	            Only used by the email step, which is currently disabled.

	    Returns:
	        dict: 'title' (rewritten title), 'tags' and 'html_file' (path of the
	            generated HTML file).

	    Raises:
	        Exception: If any step fails. The failure is recorded through
	            StatusTracker.mark_failed (best effort) before re-raising.
	    """
	    if email_recipient is None:
	        email_recipient = DEV_EMAIL_RECIPIENT

	    status_tracker = StatusTracker("dev")

	    # Label of the tracked step currently running. It is read by the except
	    # handler; None means "outside the tracked steps". Exception messages do
	    # not contain the log phrases, so the step has to be tracked explicitly.
	    current_step = None
	    original_title = ""

	    try:
	        logger.info("=" * 70)
	        logger.info(f"Starting DEV flow - article processing: {url}")
	        logger.info("=" * 70)

	        status_tracker.mark_in_progress(url)

	        # Step 1: Get or scrape content
	        current_step = "step_1_scrape"
	        logger.info("\n[1/6] Retrieving article content...")
	        article_data = get_or_scrape_content(url)
	        original_title = article_data["title"]
	        original_body = article_data["body"]
	        logger.info(f"✓ Content retrieved: {original_title} and size is {len(original_body)} characters")
	        logger.debug(f"Original body: {original_body[:300000]}")

	        # Dev-cache lookup is bookkeeping, not one of the tracked steps.
	        current_step = None
	        index_manager = IndexManager()
	        dev_cache = index_manager.load_dev_cache(url)
	        has_dev_cache = dev_cache is not None
	        # Content is reused only when both title and body are cached, because
	        # both are read from the cache below.
	        cached_content = bool(
	            DEV_CACHE_CONTENT
	            and has_dev_cache
	            and dev_cache.get("title")
	            and dev_cache.get("body")
	        )
	        cached_seo = bool(DEV_CACHE_SEO and has_dev_cache and dev_cache.get("seo_data"))
	        cached_image = bool(DEV_CACHE_IMAGE and has_dev_cache and dev_cache.get("image_info"))

	        # Step 2: Generate SEO data first (its keywords feed the rewrite)
	        current_step = "step_2_rewrite"
	        processor = None
	        if cached_seo:
	            logger.info("\n[2/6] Loading cached SEO data...")
	            seo_data = dev_cache["seo_data"]
	            seo_outcome = "Reused cached SEO"
	        else:
	            logger.info("\n[2/6] Generating SEO metadata with AI...")
	            processor = ContentProcessor()
	            seo_data = processor.generate_seo_data(original_title, original_body)
	            seo_outcome = "SEO generated"
	        seo_focus_words = [seo_data.get("focus_keyword", "")] + seo_data.get("secondary_keywords", [])
	        tags = seo_data.get("tags", [])
	        logger.info(f"✓ {seo_outcome}: focus_keyword='{seo_data.get('focus_keyword')}', {len(tags)} tags")

	        # Step 2b: Rewrite content (title + body)
	        if cached_content:
	            logger.info("\n[2b/6] Loading cached content (title + body)...")
	            new_title = dev_cache["title"]
	            new_body = dev_cache["body"]
	            logger.info(f"✓ Reused cached content: {new_title}")
	        else:
	            logger.info("\n[2b/6] Rewriting content with AI...")
	            if processor is None:
	                processor = ContentProcessor()
	            processed_data = processor.rewrite_content(
	                original_title,
	                original_body,
	                url,
	                seo_focus_words,
	                sources=article_data.get("sources", []),
	            )
	            new_title = processed_data["title"]
	            new_body = processed_data["body"]
	            logger.info(f"✓ Content rewritten: {new_title}")

	        # Step 3: Fetch featured image
	        current_step = "step_3_image"
	        image_handler = ImageHandler()
	        # NOTE(review): the value returned by download_image() is not used
	        # below. The call is kept because it may validate the URL or have
	        # side effects - confirm before removing.
	        if cached_image:
	            logger.info("\n[3/6] Reusing cached featured image for dev flow...")
	            image_info = dev_cache["image_info"]
	            image_handler.download_image(image_info["url"])
	            logger.info(f"✓ Reused image: {image_info.get('url')}")
	        else:
	            logger.info("\n[3/6] Fetching featured image...")
	            image_info = image_handler.fetch_image(tags)
	            image_handler.download_image(image_info["url"])
	            logger.info(f"✓ Image fetched from {image_info['credit']}")

	        current_step = None
	        if DEV_CACHE_CONTENT or DEV_CACHE_SEO or DEV_CACHE_IMAGE:
	            index_manager.save_dev_cache(
	                url,
	                title=new_title,
	                body=new_body,
	                tags=tags,
	                image_info=image_info,
	                seo_data=seo_data,
	            )

	        # Step 4: Generate HTML webpage
	        current_step = "step_4_html"
	        logger.info("\n[4/6] Generating HTML webpage...")
	        html_handler = HTMLHandler()

	        # seo_focus_words always has at least one element, so test the value
	        # itself: an empty focus keyword falls back to the rewritten title.
	        img_alt = seo_focus_words[0] or new_title
	        featured_image_html = _build_featured_image_html(image_info, img_alt)
	        enhanced_body = (
	            featured_image_html
	            + '\n<div style="max-width:100%;overflow-x:hidden;box-sizing:border-box;word-break:break-word;overflow-wrap:break-word;">'
	            + new_body
	            + "</div>"
	        )

	        # Generate and save HTML
	        html_file_path = html_handler.generate_and_save(
	            title=new_title,
	            body=enhanced_body,
	            tags=tags,
	            image_url=image_info["url"],
	            original_url=url,
	        )
	        logger.info(f"✓ HTML file generated: {html_file_path}")
	        current_step = None

	        # Extract filename from path
	        html_filename = os.path.basename(html_file_path)

	        # # Step 5: Send HTML email
	        # logger.info("\n[5/6] Saving email file...")
	        # try:
	        #     email_handler = EmailHandler()
	        #
	        #     # Read HTML content from file
	        #     with open(html_file_path, "r", encoding="utf-8") as f:
	        #         html_content = f.read()
	        #
	        #     # Save email file
	        #     email_file_path = email_handler.save_article_email(
	        #         recipient=email_recipient,
	        #         article_title=new_title,
	        #         html_content=html_content,
	        #     )
	        #     logger.info(f"✓ Email saved to {email_file_path}")
	        #
	        # except ValueError as e:
	        #     logger.warning(f"Email not configured: {e}")
	        #     logger.info("Skipping email step - configure EMAIL_SENDER to enable")
	        #     email_file_path = None
	        # except (ConnectionRefusedError, OSError) as e:
	        #     logger.warning(f"Mail server not available: {e}")
	        #     logger.info("Skipping email step - mail server may not be running. Install Postfix: sudo apt install postfix")
	        # except Exception as e:
	        #     logger.warning(f"Failed to send email (continuing anyway): {e}")

	        # Step 6: Summary
	        logger.info("\n[6/6] Processing complete!")
	        logger.info("=" * 70)
	        logger.info("✅ SUCCESS - DEV flow completed")
	        logger.info(f" Title: {new_title}")
	        logger.info(f" Tags: {', '.join(tags)}")
	        logger.info(f" HTML File: {html_file_path}")
	        logger.info("=" * 70 + "\n")

	        # Mark as successful
	        status_tracker.mark_successful(
	            url=url,
	            filename=html_filename,
	            title=original_title,
	            new_title=new_title,
	        )

	        return {
	            "title": new_title,
	            "tags": tags,
	            "html_file": html_file_path,
	        }

	    except Exception as e:
	        logger.error("=" * 70)
	        logger.error(f"❌ FAILED - DEV flow failed: {e}")
	        logger.error("=" * 70)

	        # Use the tracked step label; outside the tracked steps keep the
	        # original "unknown_error" format with a truncated message.
	        failure_step = current_step or f"unknown_error: {str(e)[:50]}"

	        # Status tracking is best effort: a tracker failure must not mask the
	        # original error, but it is logged and no longer silently discarded.
	        try:
	            status_tracker.mark_failed(
	                url=url,
	                failure_step=failure_step,
	                title=original_title,
	            )
	        except Exception as tracking_error:
	            logger.warning(f"Could not record failure status: {tracking_error}")

	        raise


	def main():
	    """
	    Main entry point for dev flow.

	    Validates the configuration, then runs process_article_dev() for each URL
	    in the hard-coded list below. Exits the process with status 1 when the
	    configuration is invalid or when processing any URL raises.
	    """
	    try:
	        # Validate configuration. The ValueError handler is scoped to this
	        # call only, so a ValueError raised later while processing an article
	        # is not misreported as a configuration error.
	        logger.info("Validating configuration...")
	        try:
	            validate_config()
	        except ValueError as e:
	            logger.error(f"Configuration error: {e}")
	            sys.exit(1)
	        logger.info("✓ Configuration valid")

	        # Process article
	        # url = "https://www.healthline.com/nutrition/12-omega-3-rich-foods"
	        # url = "https://www.healthline.com/health/10-gut-foods"

	        # Every entry ends with a comma so that uncommenting a line can never
	        # silently concatenate two adjacent string literals into one URL.
	        urls = [
	            # "https://www.healthline.com/health/the-benefits-of-biotin",
	            # "https://www.healthline.com/nutrition/12-omega-3-rich-foods",
	            # "https://www.healthline.com/nutrition/how-much-collagen-per-day",
	            # "https://www.healthline.com/health/beauty-skin-care/supplements-for-better-skin",
	            # "https://www.healthline.com/nutrition/ashwagandha",
	            # "https://www.healthline.com/health/10-gut-foods",

	            # "https://indianexpress.com/article/lifestyle/food-wine/icmr-protein-supplements-powder-health-kidney-bones-9318508/",
	            # "https://www.verywellhealth.com/keto-diet-long-term-risks-5197991",
	            # "https://www.verywellhealth.com/best-time-of-day-to-eat-your-fiber-11945630",
	            # "https://www.webmd.com/vitamins/ai/ingredientmono-464/gamma-aminobutyric-acid-gaba#overview",
	            # "https://www.webmd.com/vitamins-supplements/activated-charcoal",
	            # "https://www.webmd.com/vitamins-supplements/5-htp",
	            # "https://www.webmd.com/vitamins/ai/ingredientmono-1101/holy-basil",
	            # "https://www.webmd.com/vitamins/ai/ingredientmono-1062/hyaluronic-acid",
	            # "https://www.webmd.com/vitamins/ai/ingredientmono-875/l-arginine",
	            # "https://www.webmd.com/vitamins-supplements/evening-primrose-oil",
	            # "https://www.webmd.com/vitamins/ai/ingredientmono-1242/moringa",
	            # "https://www.webmd.com/vitamins-supplements/xylitol",

	            # "https://www.healthline.com/nutrition/best-testosterone-booster-supplements#our-picks",

	            # "https://www.verywellhealth.com/keto-diet-long-term-risks-5197991",

	            # Batch2
	            # "https://www.healthline.com/nutrition/best-testosterone-booster-supplements#our-picks",
	            # "https://www.medicalnewstoday.com/articles/how-much-protein-do-you-need-to-build-muscle#How-much-protein-do-you-need",

	            # to completed
	            # "https://www.medicalnewstoday.com/articles/is-it-better-to-eat-several-small-meals-or-fewer-larger-ones",
	            # "https://www.medicalnewstoday.com/articles/not-all-plant-based-diets-are-the-same-junk-veggie-food-and-its-impact-on-health",
	            # "https://www.medicalnewstoday.com/articles/intermittent-fasting-is-it-all-its-cracked-up-to-be",

	            # "https://www.who.int/health-topics/nutrition#tab=tab_1",
	            # failing
	            # "https://publichealth.jhu.edu/2025/the-evidence-behind-seed-oils-health-effects",
	            # "https://nutritionsource.hsph.harvard.edu/what-should-you-eat/vegetables-and-fruits",
	            # "https://health.clevelandclinic.org/plant-based-milk-options",
	            # "https://www.nhs.uk/live-well/eat-well/food-guidelines-and-food-labels/the-eatwell-guide/",
	            # "https://www.futureoffood.ox.ac.uk/improving-diet-and-nutrition",
	            # "https://medlineplus.gov/ency/article/002465.htm",
	            # "https://www.bbc.com/future/article/20260424-diet-why-enjoying-your-food-is-key-to-weight-loss",
	            # "https://www.fda.gov/food/nutrition-food-labeling-and-critical-foods/fdas-nutrition-initiatives",
	            # "https://www.medindia.net/news/healthwatch/can-peanut-butter-keep-you-stronger-as-you-age-223205-1.htm",
	            # "https://www.healthline.com/health/type-2-diabetes/basal-insulin-types-benefits-dosage-side-effects",
	            "https://www.webmd.com/vitamins/ai/ingredientmono-707/java-tea#overview",
	        ]

	        # Run the dev flow for each URL; the first failure aborts the batch.
	        for url in urls:
	            process_article_dev(url)

	    except Exception as e:
	        # sys.exit() above raises SystemExit, which is not an Exception
	        # subclass, so it passes through this handler untouched.
	        logger.error(f"Fatal error: {e}")
	        sys.exit(1)


	# Entry-point guard: run the dev flow only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
	    main()