"""
Dev flow orchestration script for content scraping and email delivery.

This module coordinates the dev workflow:
1. Scrape article content
2. Rewrite and generate tags
3. Fetch featured image
4. Generate HTML webpage
5. Send HTML email to recipient (currently disabled in process_article_dev)
6. Summary
"""

import sys
import os
import html

from logger import logger
from config import (
    validate_config,
    DEV_EMAIL_RECIPIENT,
    DEV_CACHE_CONTENT,
    DEV_CACHE_SEO,
    DEV_CACHE_IMAGE,
)
from index_manager import IndexManager
from scraper import ArticleScraper
from content_processor import ContentProcessor
from image_handler import ImageHandler
from html_handler import HTMLHandler
from email_handler import EmailHandler
from status_tracker import StatusTracker

# Default link target for photo credits when the image record has none.
_UNSPLASH_URL = "https://unsplash.com"


def get_or_scrape_content(url):
    """
    Get content from cache or scrape it fresh.

    A cached entry is only reused when it contains a 'sources' key; entries
    without it are treated as an old format and re-scraped. Freshly scraped
    content is written back to the cache before being returned.

    Args:
        url (str): Article URL

    Returns:
        dict: Contains 'title' and 'body' keys (and 'sources' when available)

    Raises:
        Exception: Any cache or scraper error is logged and re-raised.
    """
    try:
        index_manager = IndexManager()

        if index_manager.url_exists(url):
            logger.info(f"URL found in cache: {url}")
            cached_data = index_manager.load_scraped_data(url)
            if cached_data and "sources" in cached_data:
                return cached_data
            if cached_data:
                logger.info("Cached entry is missing 'sources' key (old format) — re-scraping to refresh")

        logger.info(f"Scraping fresh content: {url}")
        scraper = ArticleScraper()
        article_data = scraper.scrape(url)

        # Save to cache
        index_manager.save_scraped_data(
            url,
            article_data["title"],
            article_data["body"],
            sources=article_data.get("sources", []),
        )
        return article_data
    except Exception as e:
        logger.error(f"Failed to get/scrape content: {e}")
        raise


def _build_featured_image_html(image_info, img_alt):
    """
    Build the featured-image markup followed by its photo-credit line.

    All interpolated values are HTML-escaped because they originate from
    scraped or third-party data.

    NOTE(review): the tag names and attributes below were reconstructed;
    confirm them against the markup the HTML template expects.

    Args:
        image_info (dict): Must contain 'url'; may contain 'credit' and
            'credit_url'.
        img_alt (str): Alternative text for the image.

    Returns:
        str: HTML fragment for the image and its credit.
    """
    credit_name = image_info.get("credit", "")
    credit_url = image_info.get("credit_url", _UNSPLASH_URL)

    safe_src = html.escape(image_info["url"], quote=True)
    safe_alt = html.escape(img_alt, quote=True)

    if credit_name:
        photo_credit_html = (
            '<p class="photo-credit">'
            f'Photo by <a href="{html.escape(credit_url, quote=True)}">{html.escape(credit_name)}</a> on '
            f'<a href="{_UNSPLASH_URL}">Unsplash</a>'
            '</p>'
        )
    else:
        photo_credit_html = (
            '<p class="photo-credit">'
            f'Photo via <a href="{_UNSPLASH_URL}">Unsplash</a>'
            '</p>'
        )

    return (
        '<figure class="featured-image">'
        f'<img src="{safe_src}" alt="{safe_alt}">'
        '</figure>'
        + photo_credit_html
    )


def process_article_dev(url, email_recipient=None):
    """
    Dev workflow to process an article and send via email.

    Runs the steps in order: retrieve content, generate SEO data, rewrite
    content, fetch the featured image, generate the HTML page, and record the
    outcome in the status tracker. SEO data, rewritten content and image info
    are reused from the dev cache when the matching DEV_CACHE_* flag is set
    and the cache holds a usable value.

    Args:
        url (str): Article URL to process
        email_recipient (str, optional): Email recipient. Defaults to config
            value. Only used by the email step, which is currently disabled.

    Returns:
        dict: 'title' (rewritten title), 'tags' and 'html_file' (path of the
            generated HTML file).

    Raises:
        Exception: If any step fails. The failure is recorded in the status
            tracker (best effort) before the exception is re-raised.
    """
    if email_recipient is None:
        email_recipient = DEV_EMAIL_RECIPIENT

    status_tracker = StatusTracker("dev")

    # Initialised up front so the failure handler can inspect them safely.
    article_data = None
    current_step = "unknown"

    try:
        logger.info("=" * 70)
        logger.info(f"Starting DEV flow - article processing: {url}")
        logger.info("=" * 70)

        status_tracker.mark_in_progress(url)

        # Step 1: Get or scrape content
        current_step = "step_1_scrape"
        logger.info("\n[1/6] Retrieving article content...")
        article_data = get_or_scrape_content(url)
        original_title = article_data["title"]
        original_body = article_data["body"]
        logger.info(f"✓ Content retrieved: {original_title} and size is {len(original_body)} characters")
        logger.debug(f"Original body: {original_body[:300000]}")

        index_manager = IndexManager()
        dev_cache = index_manager.load_dev_cache(url)
        has_cache = dev_cache is not None
        # Rewritten content is only reusable when both title and body exist.
        cached_content = (
            DEV_CACHE_CONTENT
            and has_cache
            and bool(dev_cache.get("title"))
            and bool(dev_cache.get("body"))
        )
        cached_seo = DEV_CACHE_SEO and has_cache and bool(dev_cache.get("seo_data"))
        cached_image = DEV_CACHE_IMAGE and has_cache and bool(dev_cache.get("image_info"))

        # Step 2: Generate SEO data first
        current_step = "step_2_seo"
        processor = None
        if cached_seo:
            logger.info("\n[2/6] Loading cached SEO data...")
            seo_data = dev_cache["seo_data"]
            seo_done_label = "✓ Reused cached SEO"
        else:
            logger.info("\n[2/6] Generating SEO metadata with AI...")
            processor = ContentProcessor()
            seo_data = processor.generate_seo_data(original_title, original_body)
            seo_done_label = "✓ SEO generated"
        seo_focus_words = [seo_data.get("focus_keyword", "")] + seo_data.get("secondary_keywords", [])
        tags = seo_data.get("tags", [])
        logger.info(f"{seo_done_label}: focus_keyword='{seo_data.get('focus_keyword')}', {len(tags)} tags")

        # Step 2b: Rewrite content (title + body)
        current_step = "step_2_rewrite"
        if cached_content:
            logger.info("\n[2b/6] Loading cached content (title + body)...")
            new_title = dev_cache["title"]
            new_body = dev_cache["body"]
            logger.info(f"✓ Reused cached content: {new_title}")
        else:
            logger.info("\n[2b/6] Rewriting content with AI...")
            if processor is None:
                processor = ContentProcessor()
            processed_data = processor.rewrite_content(
                original_title,
                original_body,
                url,
                seo_focus_words,
                sources=article_data.get("sources", []),
            )
            new_title = processed_data["title"]
            new_body = processed_data["body"]
            logger.info(f"✓ Content rewritten: {new_title}")

        # Step 3: Fetch featured image
        current_step = "step_3_image"
        image_handler = ImageHandler()
        # The download result is not used in this flow; the call is kept in
        # case the download has side effects (NOTE(review): confirm).
        if cached_image:
            logger.info("\n[3/6] Reusing cached featured image for dev flow...")
            image_info = dev_cache["image_info"]
            image_handler.download_image(image_info["url"])
            logger.info(f"✓ Reused image: {image_info.get('url')}")
        else:
            logger.info("\n[3/6] Fetching featured image...")
            image_info = image_handler.fetch_image(tags)
            image_handler.download_image(image_info["url"])
            logger.info(f"✓ Image fetched from {image_info['credit']}")

        if DEV_CACHE_CONTENT or DEV_CACHE_SEO or DEV_CACHE_IMAGE:
            index_manager.save_dev_cache(
                url,
                title=new_title,
                body=new_body,
                tags=tags,
                image_info=image_info,
                seo_data=seo_data,
            )

        # Step 4: Generate HTML webpage
        current_step = "step_4_html"
        logger.info("\n[4/6] Generating HTML webpage...")
        html_handler = HTMLHandler()

        # Create featured image HTML. seo_focus_words always has a first
        # element, which may be an empty string, so fall back on emptiness.
        focus_keyword = seo_focus_words[0] if seo_focus_words else ""
        img_alt = focus_keyword or new_title.replace('"', "")
        featured_image_html = _build_featured_image_html(image_info, img_alt)
        enhanced_body = (
            featured_image_html
            + '\n<div class="article-body">'
            + new_body
            + "</div>"
        )

        # Generate and save HTML
        html_file_path = html_handler.generate_and_save(
            title=new_title,
            body=enhanced_body,
            tags=tags,
            image_url=image_info["url"],
            original_url=url,
        )
        logger.info(f"✓ HTML file generated: {html_file_path}")

        # Extract filename from path
        html_filename = os.path.basename(html_file_path)

        # NOTE: Step 5 (email delivery) is currently disabled; the original
        # implementation is kept below for reference.
        # # Step 5: Send HTML email
        # logger.info("\n[5/6] Saving email file...")
        # try:
        #     email_handler = EmailHandler()
        #     # Read HTML content from file
        #     with open(html_file_path, "r", encoding="utf-8") as f:
        #         html_content = f.read()
        #     # Save email file
        #     email_file_path = email_handler.save_article_email(
        #         recipient=email_recipient,
        #         article_title=new_title,
        #         html_content=html_content,
        #     )
        #     logger.info(f"✓ Email saved to {email_file_path}")
        # except ValueError as e:
        #     logger.warning(f"Email not configured: {e}")
        #     logger.info("Skipping email step - configure EMAIL_SENDER to enable")
        #     email_file_path = None
        # except (ConnectionRefusedError, OSError) as e:
        #     logger.warning(f"Mail server not available: {e}")
        #     logger.info("Skipping email step - mail server may not be running. Install Postfix: sudo apt install postfix")
        # except Exception as e:
        #     logger.warning(f"Failed to send email (continuing anyway): {e}")

        # Step 6: Summary
        current_step = "step_6_summary"
        logger.info("\n[6/6] Processing complete!")
        logger.info("=" * 70)
        logger.info("✅ SUCCESS - DEV flow completed")
        logger.info(f" Title: {new_title}")
        logger.info(f" Tags: {', '.join(tags)}")
        logger.info(f" HTML File: {html_file_path}")
        logger.info("=" * 70 + "\n")

        # Mark as successful
        status_tracker.mark_successful(
            url=url,
            filename=html_filename,
            title=original_title,
            new_title=new_title,
        )

        return {
            "title": new_title,
            "tags": tags,
            "html_file": html_file_path,
        }

    except Exception as e:
        logger.error("=" * 70)
        logger.error(f"❌ FAILED - DEV flow failed: {e}")
        logger.error("=" * 70)

        # Determine which step failed: the most recently started step, or an
        # "unknown_error" label when the failure happened before step 1.
        if current_step == "unknown":
            failure_step = f"unknown_error: {str(e)[:50]}"
        else:
            failure_step = current_step

        # Mark as failed (best effort: a tracker problem must not mask the
        # original exception).
        try:
            original_title = article_data.get("title", "") if article_data else ""
            status_tracker.mark_failed(
                url=url,
                failure_step=failure_step,
                title=original_title,
            )
        except Exception as tracker_error:
            logger.warning(f"Could not record failure status: {tracker_error}")

        raise


def main():
    """
    Main entry point for dev flow.

    Validates the configuration, then processes every URL in the hard-coded
    list in order. Exits with status 1 on a configuration error or on the
    first article that fails.
    """
    try:
        # Validate configuration. Only a ValueError raised here is reported
        # as a configuration error.
        logger.info("Validating configuration...")
        try:
            validate_config()
        except ValueError as e:
            logger.error(f"Configuration error: {e}")
            sys.exit(1)
        logger.info("✓ Configuration valid")

        # Process article
        #url = "https://www.healthline.com/nutrition/12-omega-3-rich-foods"
        #url = "https://www.healthline.com/health/10-gut-foods"
        urls = [
            # "https://www.healthline.com/health/the-benefits-of-biotin",
            # "https://www.healthline.com/nutrition/12-omega-3-rich-foods",
            #"https://www.healthline.com/nutrition/how-much-collagen-per-day",
            ##"https://www.healthline.com/health/beauty-skin-care/supplements-for-better-skin",
            #"https://www.healthline.com/nutrition/ashwagandha",
            #"https://www.healthline.com/health/10-gut-foods"
            # "https://indianexpress.com/article/lifestyle/food-wine/icmr-protein-supplements-powder-health-kidney-bones-9318508/",
            #"https://www.verywellhealth.com/keto-diet-long-term-risks-5197991"
            #"https://www.verywellhealth.com/best-time-of-day-to-eat-your-fiber-11945630"
            #"https://www.webmd.com/vitamins/ai/ingredientmono-464/gamma-aminobutyric-acid-gaba#overview"
            # "https://www.webmd.com/vitamins-supplements/activated-charcoal",
            # "https://www.webmd.com/vitamins-supplements/5-htp",
            # "https://www.webmd.com/vitamins/ai/ingredientmono-1101/holy-basil",
            # "https://www.webmd.com/vitamins/ai/ingredientmono-1062/hyaluronic-acid",
            # "https://www.webmd.com/vitamins/ai/ingredientmono-875/l-arginine",
            # "https://www.webmd.com/vitamins-supplements/evening-primrose-oil",
            # "https://www.webmd.com/vitamins/ai/ingredientmono-1242/moringa",
            # "https://www.webmd.com/vitamins-supplements/xylitol"
            # "https://www.healthline.com/nutrition/best-testosterone-booster-supplements#our-picks"
            #"https://www.verywellhealth.com/keto-diet-long-term-risks-5197991"
            #Batch2
            #"https://www.healthline.com/nutrition/best-testosterone-booster-supplements#our-picks"
            #"https://www.medicalnewstoday.com/articles/how-much-protein-do-you-need-to-build-muscle#How-much-protein-do-you-need", # to completed
            # "https://www.medicalnewstoday.com/articles/is-it-better-to-eat-several-small-meals-or-fewer-larger-ones",
            # "https://www.medicalnewstoday.com/articles/not-all-plant-based-diets-are-the-same-junk-veggie-food-and-its-impact-on-health",
            # "https://www.medicalnewstoday.com/articles/intermittent-fasting-is-it-all-its-cracked-up-to-be",
            # "https://www.who.int/health-topics/nutrition#tab=tab_1", #failing
            # "https://publichealth.jhu.edu/2025/the-evidence-behind-seed-oils-health-effects",
            # "https://nutritionsource.hsph.harvard.edu/what-should-you-eat/vegetables-and-fruits",
            # "https://health.clevelandclinic.org/plant-based-milk-options",
            # "https://www.nhs.uk/live-well/eat-well/food-guidelines-and-food-labels/the-eatwell-guide/",
            # "https://www.futureoffood.ox.ac.uk/improving-diet-and-nutrition",
            # "https://medlineplus.gov/ency/article/002465.htm",
            # "https://www.bbc.com/future/article/20260424-diet-why-enjoying-your-food-is-key-to-weight-loss",
            # "https://www.fda.gov/food/nutrition-food-labeling-and-critical-foods/fdas-nutrition-initiatives",
            # "https://www.medindia.net/news/healthwatch/can-peanut-butter-keep-you-stronger-as-you-age-223205-1.htm"
            # "https://www.healthline.com/health/type-2-diabetes/basal-insulin-types-benefits-dosage-side-effects"
            "https://www.webmd.com/vitamins/ai/ingredientmono-707/java-tea#overview",
        ]

        # 2. Loop through the list and run the function for each one
        for url in urls:
            process_article_dev(url)

    except Exception as e:
        logger.error(f"Fatal error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()