"""Scraper for The Batch (deeplearning.ai): collects article links per tag category
with Selenium, parses each article with requests/BeautifulSoup, and saves the
results to a JSON file."""

import json
import os
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager

BASE_TAG_URL = "https://www.deeplearning.ai/the-batch/tag/"
VALID_CATEGORIES = [
    "letters", "data-points", "research", "business",
    "science", "culture", "hardware", "ai-careers",
]


def initialize_driver():
    """Create a headless Chrome driver managed by webdriver_manager."""
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver


def load_all_articles(driver, url):
    """Open a category page and collect all article links, paginating until exhausted.

    The 'letters' category uses an 'Older Posts' button that navigates to a new URL;
    every other category uses a 'Load More' button that appends articles in place.
    """
    wait = WebDriverWait(driver, 20)
    driver.get(url)
    time.sleep(3)
    category = url.split('/')[-2]
    all_articles_links = set()

    if category == "letters":
        last_url = ""
        while True:
            current_links = get_article_links_from_page(driver)
            all_articles_links.update(current_links)
            print(f"Collected {len(current_links)} articles on the current page in '{category}'")
            try:
                older_button = wait.until(
                    EC.element_to_be_clickable((By.CLASS_NAME, "justify-self-end"))
                )
                driver.execute_script("arguments[0].scrollIntoView({block: 'end'});", older_button)
                time.sleep(1)
                older_button.click()
                print(f"Clicked 'Older Posts' in '{category}'...")
                time.sleep(2)
                # If the URL did not change, the click had no effect and pagination is done.
                current_url = driver.current_url
                if current_url == last_url:
                    print("The URL did not change after the click; stopping the 'Older Posts' pagination.")
                    break
                last_url = current_url
            except (TimeoutException, NoSuchElementException):
                print("There is no 'Older Posts' button. Moving on to the next category.")
                break
    else:
        while True:
            current_links = get_article_links_from_page(driver)
            all_articles_links.update(current_links)
            print(f"Collected {len(current_links)} articles on the current page in '{category}'")
            try:
                load_more_button = wait.until(
                    EC.element_to_be_clickable((By.CLASS_NAME, "buttons_secondary__8o9u6"))
                )
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", load_more_button)
                time.sleep(1)
                # Click via JavaScript to avoid interception by overlapping elements.
                driver.execute_script("arguments[0].click();", load_more_button)
                print(f"Clicked 'Load More' in '{category}'...")
                time.sleep(2)
            except (TimeoutException, NoSuchElementException):
                print(f"The 'Load More' button is unavailable or missing in '{category}'. Moving to the next category.")
                break

    return list(all_articles_links)


def get_article_links_from_page(driver):
    """Extract article URLs from the currently rendered page, skipping tag and issue pages."""
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    all_links = set()
    for a in soup.find_all("a", href=True):
        href = a['href']
        if href.startswith("/the-batch/") and not href.startswith("/the-batch/tag/"):
            full_url = "https://www.deeplearning.ai" + href
            if "issue" not in full_url:
                all_links.add(full_url)
    return list(all_links)


def get_article_links():
    """Collect unique article links across all categories using a single driver."""
    driver = initialize_driver()
    all_links = set()
    for category in VALID_CATEGORIES:
        url = f"{BASE_TAG_URL}{category}/"
        print(f"Loading the category: {url}")
        category_links = load_all_articles(driver, url)
        print(f"Found {len(category_links)} articles in category '{category}'")
        all_links.update(category_links)
    driver.quit()
    return list(all_links)


def parse_article(url, max_retries=3, delay=2):
    """Download and parse a single article, retrying with backoff on failure.

    Returns a dict with title, description, image URL, publication date, body text,
    and source URL, or None if all attempts fail.
    """
    attempts = 0
    while attempts < max_retries:
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            # The <h1> holds the title; an optional nested <span> holds the description.
            h1 = soup.find("h1")
            title = h1.get_text(strip=True) if h1 else ""
            description = ""
            if h1:
                span = h1.find("span")
                if span:
                    description = span.get_text(strip=True)
                    span.extract()
                    title = h1.get_text(strip=True)

            image_tag = soup.find("meta", attrs={"property": "og:image"})
            image_url = image_tag["content"] if image_tag else None

            # Normalize the publication date to YYYY-MM-DD; fall back to the raw value.
            date_meta = soup.find("meta", attrs={"property": "article:published_time"})
            date_str = ""
            if date_meta:
                try:
                    date_raw = date_meta["content"]
                    date_str = datetime.fromisoformat(date_raw.split("T")[0]).strftime("%Y-%m-%d")
                except Exception:
                    date_str = date_meta["content"]

            # The article body lives in a div with the "prose--styled" class.
            content = ""
            main_content = soup.find("div", class_="prose--styled")
            if main_content:
                paragraphs = main_content.find_all(["p", "li"])
                content_lines = [p.get_text(strip=True) for p in paragraphs]
                content = "\n".join(content_lines)

            # Politeness delay between successful requests.
            time.sleep(delay)
            return {
                "title": title.strip(),
                "description": description.strip(),
                "image_url": image_url,
                "date": date_str,
                "content": content.strip(),
                "source_url": url,
            }
        except Exception as e:
            attempts += 1
            print(f"Error parsing URL {url} (attempt {attempts}/{max_retries}): {e}")
            time.sleep(delay * attempts)

    print(f"Article skipped due to repeated errors: {url}")
    return None


def run_parser_and_save_to_json(output_filename="data/articles_export.json"):
    """Collect links, parse every article, save to JSON, then filter out image-only entries."""
    print("Starting to collect article links...")
    all_article_urls = get_article_links()
    print(f"{len(all_article_urls)} unique article links collected.")

    parsed_articles = []
    print("\nStarting to parse article content...")
    for i, url in enumerate(all_article_urls):
        print(f"Parsing article {i + 1}/{len(all_article_urls)}: {url}")
        article_data = parse_article(url)
        if article_data:
            parsed_articles.append(article_data)

    print(f"\nParsing completed. {len(parsed_articles)} articles collected.")
    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(parsed_articles, f, ensure_ascii=False, indent=4)
    print(f"All articles are saved in '{output_filename}'")

    # Filter out entries whose content is just an "[image]" placeholder.
    print("\nStarting to filter articles...")
    try:
        with open(output_filename, "r", encoding="utf-8") as f:
            articles_to_filter = json.load(f)
    except FileNotFoundError:
        print(f"File '{output_filename}' not found for filtering.")
        articles_to_filter = []

    initial_count = len(articles_to_filter)
    filtered_articles = [a for a in articles_to_filter if a.get("content") != "[image]"]
    filtered_count = len(filtered_articles)
    print(f"Articles before filtering: {initial_count}")
    print(f"Articles after filtering: {filtered_count}")

    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(filtered_articles, f, ensure_ascii=False, indent=4)
    print(f"Filtered articles saved in '{output_filename}'")


if __name__ == "__main__":
    os.makedirs("data", exist_ok=True)
    run_parser_and_save_to_json()