Spaces:
Sleeping
Sleeping
| """ | |
| Dev flow orchestration script for content scraping and email delivery. | |
| This module coordinates the dev workflow: | |
| 1. Scrape article content | |
| 2. Rewrite and generate tags | |
| 3. Fetch featured image | |
| 4. Generate HTML webpage | |
| 5. Send HTML email to recipient (currently disabled in process_article_dev) | |
| 6. Summary | |
| """ | |
| import sys | |
| import os | |
| from logger import logger | |
| from config import ( | |
| validate_config, | |
| DEV_EMAIL_RECIPIENT, | |
| DEV_CACHE_CONTENT, | |
| DEV_CACHE_SEO, | |
| DEV_CACHE_IMAGE, | |
| ) | |
| from index_manager import IndexManager | |
| from scraper import ArticleScraper | |
| from content_processor import ContentProcessor | |
| from image_handler import ImageHandler | |
| from html_handler import HTMLHandler | |
| from email_handler import EmailHandler | |
| from status_tracker import StatusTracker | |
def get_or_scrape_content(url):
    """
    Return article data for ``url``, preferring the scraped-data cache.

    A cached entry is reused only when it is non-empty and carries a
    'sources' key; otherwise the article is scraped again and the cache
    entry is rewritten.

    Args:
        url (str): Article URL

    Returns:
        dict: Article data; the freshly scraped form is read via its
            'title' and 'body' keys and an optional 'sources' key

    Raises:
        Exception: Any error from the cache or the scraper, logged and re-raised
    """
    try:
        cache = IndexManager()

        if cache.url_exists(url):
            logger.info(f"URL found in cache: {url}")
            cached_entry = cache.load_scraped_data(url)
            if cached_entry:
                if "sources" in cached_entry:
                    return cached_entry
                # Old-format entry: fall through and scrape again.
                logger.info("Cached entry is missing 'sources' key (old format) — re-scraping to refresh")

        logger.info(f"Scraping fresh content: {url}")
        fresh_article = ArticleScraper().scrape(url)

        # Persist the fresh result so the next run can reuse it.
        cache.save_scraped_data(
            url,
            fresh_article["title"],
            fresh_article["body"],
            sources=fresh_article.get("sources", []),
        )
        return fresh_article
    except Exception as err:
        logger.error(f"Failed to get/scrape content: {err}")
        raise
def _build_featured_image_html(image_info, alt_text):
    """
    Build the featured-image ``<figure>`` followed by its photo-credit line.

    Args:
        image_info (dict): Must contain 'url'; 'credit' and 'credit_url'
            are optional.
        alt_text (str): Alternative text for the image.

    Returns:
        str: HTML snippet in which every interpolated value is escaped.
    """
    from html import escape  # function-scope import keeps the block self-contained

    credit_style = (
        "font-size:0.75rem;color:#888888;text-align:left;"
        "margin-top:0.25rem;padding-left:0;"
    )
    unsplash_link = '<a href="https://unsplash.com" target="_blank" rel="noopener">Unsplash</a>'

    credit_name = image_info.get("credit", "")
    # `or` also covers a key that is present but empty/None.
    credit_url = image_info.get("credit_url") or "https://unsplash.com"

    if credit_name:
        credit_text = (
            f'Photo by <a href="{escape(str(credit_url), quote=True)}" '
            f'target="_blank" rel="noopener">{escape(str(credit_name))}</a> on '
            f"{unsplash_link}"
        )
    else:
        credit_text = f"Photo via {unsplash_link}"

    return (
        '<figure style="margin:0;padding:0;width:100%;">'
        f'<img src="{escape(str(image_info["url"]), quote=True)}" '
        f'alt="{escape(str(alt_text), quote=True)}" '
        'style="width:100%;height:240px;object-fit:cover;object-position:center;display:block;">'
        "</figure>"
        f'<p style="{credit_style}">{credit_text}</p>'
    )


def process_article_dev(url, email_recipient=None):
    """
    Run the dev workflow for one article and save the result as an HTML page.

    The flow retrieves the article (cache or scrape), generates SEO data,
    rewrites the content, fetches a featured image, renders the HTML file and
    records the outcome with the status tracker.  Email delivery (step 5) is
    currently disabled, so ``email_recipient`` is accepted but not used.

    Args:
        url (str): Article URL to process
        email_recipient (str, optional): Email recipient. Defaults to config value.

    Returns:
        dict: 'title' (rewritten title), 'tags' (list) and 'html_file'
            (path of the generated HTML file)

    Raises:
        Exception: If any step fails; the failure is recorded with the
            status tracker (best effort) before the error is re-raised
    """
    if email_recipient is None:
        email_recipient = DEV_EMAIL_RECIPIENT  # unused while step 5 is disabled

    status_tracker = StatusTracker("dev")

    # Both are read by the failure handler, so they must exist before any step runs.
    original_title = ""
    current_step = None  # None => failure is reported as "unknown_error: <message>"

    try:
        logger.info("=" * 70)
        logger.info(f"Starting DEV flow - article processing: {url}")
        logger.info("=" * 70)
        status_tracker.mark_in_progress(url)

        # Step 1: Get or scrape content
        current_step = "step_1_scrape"
        logger.info("\n[1/6] Retrieving article content...")
        article_data = get_or_scrape_content(url)
        original_title = article_data["title"]
        original_body = article_data["body"]
        logger.info(f"✓ Content retrieved: {original_title} and size is {len(original_body)} characters")
        logger.debug(f"Original body: {original_body[:300000]}")

        # Dev-cache lookup is bookkeeping, not one of the tracked steps.
        current_step = None
        index_manager = IndexManager()
        dev_cache = index_manager.load_dev_cache(url)
        # Content is reusable only when both title and body are cached,
        # because both are read below.
        cached_content = bool(
            DEV_CACHE_CONTENT
            and dev_cache
            and dev_cache.get("title")
            and dev_cache.get("body")
        )
        cached_seo = bool(DEV_CACHE_SEO and dev_cache and dev_cache.get("seo_data"))
        cached_image = bool(DEV_CACHE_IMAGE and dev_cache and dev_cache.get("image_info"))

        # Step 2: Generate SEO data first (SEO and rewrite share one tracked step)
        current_step = "step_2_rewrite"
        processor = None
        if cached_seo:
            logger.info("\n[2/6] Loading cached SEO data...")
            seo_data = dev_cache["seo_data"]
            seo_origin = "Reused cached SEO"
        else:
            logger.info("\n[2/6] Generating SEO metadata with AI...")
            processor = ContentProcessor()
            seo_data = processor.generate_seo_data(original_title, original_body)
            seo_origin = "SEO generated"
        seo_focus_words = [seo_data.get("focus_keyword", "")] + seo_data.get("secondary_keywords", [])
        tags = seo_data.get("tags", [])
        logger.info(f"✓ {seo_origin}: focus_keyword='{seo_data.get('focus_keyword')}', {len(tags)} tags")

        # Step 2b: Rewrite content (title + body)
        if cached_content:
            logger.info("\n[2b/6] Loading cached content (title + body)...")
            new_title = dev_cache["title"]
            new_body = dev_cache["body"]
            logger.info(f"✓ Reused cached content: {new_title}")
        else:
            logger.info("\n[2b/6] Rewriting content with AI...")
            if processor is None:
                processor = ContentProcessor()
            processed_data = processor.rewrite_content(
                original_title,
                original_body,
                url,
                seo_focus_words,
                sources=article_data.get("sources", []),
            )
            new_title = processed_data["title"]
            new_body = processed_data["body"]
            logger.info(f"✓ Content rewritten: {new_title}")

        # Step 3: Fetch featured image
        current_step = "step_3_image"
        image_handler = ImageHandler()
        if cached_image:
            logger.info("\n[3/6] Reusing cached featured image for dev flow...")
            image_info = dev_cache["image_info"]
        else:
            logger.info("\n[3/6] Fetching featured image...")
            image_info = image_handler.fetch_image(tags)
        # The downloaded data is not used further in this flow; the call is
        # kept because the original performed it on both paths.
        # NOTE(review): presumably validates/stores the image - confirm.
        image_handler.download_image(image_info["url"])
        if cached_image:
            logger.info(f"✓ Reused image: {image_info.get('url')}")
        else:
            logger.info(f"✓ Image fetched from {image_info.get('credit')}")

        current_step = None
        if DEV_CACHE_CONTENT or DEV_CACHE_SEO or DEV_CACHE_IMAGE:
            index_manager.save_dev_cache(
                url,
                title=new_title,
                body=new_body,
                tags=tags,
                image_info=image_info,
                seo_data=seo_data,
            )

        # Step 4: Generate HTML webpage
        current_step = "step_4_html"
        logger.info("\n[4/6] Generating HTML webpage...")
        html_handler = HTMLHandler()

        # seo_focus_words always has a first element, but it may be empty;
        # fall back to the title in that case.
        focus_keyword = seo_focus_words[0] if seo_focus_words else ""
        img_alt = focus_keyword or new_title
        enhanced_body = (
            _build_featured_image_html(image_info, img_alt)
            + '\n<div style="max-width:100%;overflow-x:hidden;box-sizing:border-box;word-break:break-word;overflow-wrap:break-word;">'
            + new_body
            + "</div>"
        )

        # Generate and save HTML
        html_file_path = html_handler.generate_and_save(
            title=new_title,
            body=enhanced_body,
            tags=tags,
            image_url=image_info["url"],
            original_url=url,
        )
        logger.info(f"✓ HTML file generated: {html_file_path}")
        html_filename = os.path.basename(html_file_path)

        # Step 5 (email) is disabled. It used to read the generated HTML file
        # and pass it to EmailHandler().save_article_email(recipient=...,
        # article_title=..., html_content=...), logging a warning and
        # continuing when the mail setup was unavailable.

        # Step 6: Summary
        current_step = None
        logger.info("\n[6/6] Processing complete!")
        logger.info("=" * 70)
        logger.info("✅ SUCCESS - DEV flow completed")
        logger.info(f" Title: {new_title}")
        logger.info(f" Tags: {', '.join(tags)}")
        logger.info(f" HTML File: {html_file_path}")
        logger.info("=" * 70 + "\n")

        # Mark as successful
        status_tracker.mark_successful(
            url=url,
            filename=html_filename,
            title=original_title,
            new_title=new_title,
        )
        return {
            "title": new_title,
            "tags": tags,
            "html_file": html_file_path,
        }
    except Exception as e:
        logger.error("=" * 70)
        logger.error(f"❌ FAILED - DEV flow failed: {e}")
        logger.error("=" * 70)

        # Use the step marker; exception text does not name the step.
        failure_step = current_step or f"unknown_error: {str(e)[:50]}"

        # Status tracking is best effort: a tracker failure must not mask
        # the original error, but it should not vanish silently either.
        try:
            status_tracker.mark_failed(
                url=url,
                failure_step=failure_step,
                title=original_title,
            )
        except Exception as tracking_error:
            logger.error(f"Could not record failure status: {tracking_error}")
        raise
def main():
    """
    Main entry point for dev flow.

    Validates the configuration, then processes each URL in the hard-coded
    list in order.  Exits with status 1 on a configuration error or on the
    first article that fails; remaining URLs are not processed.
    """
    # Validate configuration.  ValueError is handled here only, so that a
    # ValueError raised while processing an article is not misreported as a
    # configuration problem.
    try:
        logger.info("Validating configuration...")
        validate_config()
        logger.info("✓ Configuration valid")
    except ValueError as e:
        logger.error(f"Configuration error: {e}")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        sys.exit(1)

    # URLs to process.  Commented entries are kept as a scratch list; each
    # ends with a comma so it can be uncommented without accidentally
    # concatenating adjacent string literals.
    urls = [
        # "https://www.healthline.com/nutrition/12-omega-3-rich-foods",
        # "https://www.healthline.com/health/10-gut-foods",
        # "https://www.healthline.com/health/the-benefits-of-biotin",
        # "https://www.healthline.com/nutrition/12-omega-3-rich-foods",
        # "https://www.healthline.com/nutrition/how-much-collagen-per-day",
        # "https://www.healthline.com/health/beauty-skin-care/supplements-for-better-skin",
        # "https://www.healthline.com/nutrition/ashwagandha",
        # "https://www.healthline.com/health/10-gut-foods",
        # "https://indianexpress.com/article/lifestyle/food-wine/icmr-protein-supplements-powder-health-kidney-bones-9318508/",
        # "https://www.verywellhealth.com/keto-diet-long-term-risks-5197991",
        # "https://www.verywellhealth.com/best-time-of-day-to-eat-your-fiber-11945630",
        # "https://www.webmd.com/vitamins/ai/ingredientmono-464/gamma-aminobutyric-acid-gaba#overview",
        # "https://www.webmd.com/vitamins-supplements/activated-charcoal",
        # "https://www.webmd.com/vitamins-supplements/5-htp",
        # "https://www.webmd.com/vitamins/ai/ingredientmono-1101/holy-basil",
        # "https://www.webmd.com/vitamins/ai/ingredientmono-1062/hyaluronic-acid",
        # "https://www.webmd.com/vitamins/ai/ingredientmono-875/l-arginine",
        # "https://www.webmd.com/vitamins-supplements/evening-primrose-oil",
        # "https://www.webmd.com/vitamins/ai/ingredientmono-1242/moringa",
        # "https://www.webmd.com/vitamins-supplements/xylitol",
        # "https://www.healthline.com/nutrition/best-testosterone-booster-supplements#our-picks",
        # "https://www.verywellhealth.com/keto-diet-long-term-risks-5197991",
        # Batch 2
        # "https://www.healthline.com/nutrition/best-testosterone-booster-supplements#our-picks",
        # "https://www.medicalnewstoday.com/articles/how-much-protein-do-you-need-to-build-muscle#How-much-protein-do-you-need",
        # Still to be completed
        # "https://www.medicalnewstoday.com/articles/is-it-better-to-eat-several-small-meals-or-fewer-larger-ones",
        # "https://www.medicalnewstoday.com/articles/not-all-plant-based-diets-are-the-same-junk-veggie-food-and-its-impact-on-health",
        # "https://www.medicalnewstoday.com/articles/intermittent-fasting-is-it-all-its-cracked-up-to-be",
        # "https://www.who.int/health-topics/nutrition#tab=tab_1",
        # Failing
        # "https://publichealth.jhu.edu/2025/the-evidence-behind-seed-oils-health-effects",
        # "https://nutritionsource.hsph.harvard.edu/what-should-you-eat/vegetables-and-fruits",
        # "https://health.clevelandclinic.org/plant-based-milk-options",
        # "https://www.nhs.uk/live-well/eat-well/food-guidelines-and-food-labels/the-eatwell-guide/",
        # "https://www.futureoffood.ox.ac.uk/improving-diet-and-nutrition",
        # "https://medlineplus.gov/ency/article/002465.htm",
        # "https://www.bbc.com/future/article/20260424-diet-why-enjoying-your-food-is-key-to-weight-loss",
        # "https://www.fda.gov/food/nutrition-food-labeling-and-critical-foods/fdas-nutrition-initiatives",
        # "https://www.medindia.net/news/healthwatch/can-peanut-butter-keep-you-stronger-as-you-age-223205-1.htm",
        # "https://www.healthline.com/health/type-2-diabetes/basal-insulin-types-benefits-dosage-side-effects",
        "https://www.webmd.com/vitamins/ai/ingredientmono-707/java-tea#overview",
    ]

    # Process every URL in order; the first failure aborts the run.
    try:
        for url in urls:
            process_article_dev(url)
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        sys.exit(1)


# Run the dev flow only when executed as a script, not on import.
if __name__ == "__main__":
    main()