import os
import time
import re
import json
import io
import hashlib

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from pypdf import PdfReader
from docx import Document

# Paths
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
DATA_DIR = os.path.join(PROJECT_ROOT, 'data', 'raw_selenium')
BASE_URL = "https://nihe.org.vn"


class DeepNiheCrawler:
    """Selenium-driven crawler for nihe.org.vn.

    Collects article pages (and linked PDF/DOCX documents) reachable from a
    seed URL and saves each one as a plain-text file under ``DATA_DIR``.
    """

    def __init__(self, headless=True):
        """Initialize crawler state.

        Args:
            headless: Run Chrome without a visible window when True.
        """
        self.visited_urls = set()   # URLs already processed this session
        self.article_count = 0      # number of articles written to disk
        self.driver = None          # Selenium WebDriver, created lazily
        self.headless = headless
        # exist_ok avoids a TOCTOU race vs. the previous exists()-then-makedirs check
        os.makedirs(DATA_DIR, exist_ok=True)

    def setup_driver(self):
        """Create the Chrome WebDriver (auto-downloading a matching driver)."""
        options = Options()
        if self.headless:
            options.add_argument('--headless=new')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--window-size=1920,1080')
        options.add_argument(
            'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=options)

    def close_driver(self):
        """Quit the WebDriver if one was started."""
        if self.driver:
            self.driver.quit()

    def extract_text_from_file(self, url):
        """Download a PDF or DOCX from *url* and return its plain text.

        Returns:
            The extracted text if longer than 100 characters, else None
            (short extractions are treated as boilerplate/noise).
        """
        try:
            # SECURITY NOTE(review): verify=False disables TLS certificate
            # checking — kept for parity with the site's broken cert chain,
            # but confirm whether a CA bundle could be used instead.
            response = requests.get(url, timeout=15, verify=False)
            if response.status_code != 200:
                return None
            f = io.BytesIO(response.content)
            text = ""
            if url.lower().endswith('.pdf'):
                reader = PdfReader(f)
                for page in reader.pages:
                    # extract_text() may return None for image-only pages
                    text += (page.extract_text() or "") + "\n"
            elif url.lower().endswith('.docx'):
                doc = Document(f)
                for para in doc.paragraphs:
                    text += para.text + "\n"
            stripped = text.strip()
            return stripped if len(stripped) > 100 else None
        except Exception as e:
            # Best-effort download: log and skip rather than abort the crawl,
            # but never swallow KeyboardInterrupt/SystemExit (bare except did).
            print(f" ! Failed to extract {url}: {e}")
            return None

    def save_article(self, article):
        """Write one article dict (title/url/content) to DATA_DIR as .txt."""
        if not article:
            return
        safe_title = re.sub(r'[\\/*?:"<>|]', "", article['title'])[:80].strip()
        # md5 gives a filename suffix that is stable across runs, unlike
        # built-in hash() which is salted per process (PYTHONHASHSEED) and
        # would produce duplicate files on every re-crawl.
        url_hash = hashlib.md5(article['url'].encode('utf-8')).hexdigest()[:8]
        filename = f"{safe_title}_{url_hash}.txt"
        with open(os.path.join(DATA_DIR, filename), 'w', encoding='utf-8') as f:
            f.write(f"Title: {article['title']}\n")
            f.write(f"URL: {article['url']}\n\n")
            f.write(article['content'])
        self.article_count += 1
        print(f" ✓ Saved: {filename}")

    def run_crawl(self, seed_url, max_pages=20):
        """Crawl up to *max_pages* article/document links found on *seed_url*.

        Args:
            seed_url: Page whose anchors are harvested as crawl targets.
            max_pages: Maximum number of links to process.
        """
        print(f"Starting deep crawl from: {seed_url}")
        self.setup_driver()
        try:
            self.driver.get(seed_url)
            time.sleep(3)  # crude wait for JS-rendered links to appear

            links = set()
            for a in self.driver.find_elements(By.TAG_NAME, "a"):
                href = a.get_attribute('href')
                # keep only same-site Vietnamese-section URLs
                if href and BASE_URL in href and '/vi/' in href:
                    links.add(href)

            print(f"Found {len(links)} links. Processing...")
            for link in list(links)[:max_pages]:
                if link in self.visited_urls:
                    continue

                if any(link.lower().endswith(ext) for ext in ['.pdf', '.docx']):
                    # Binary document: fetch directly, bypassing the browser.
                    content = self.extract_text_from_file(link)
                    if content:
                        self.save_article({
                            'title': link.split('/')[-1],
                            'url': link,
                            'content': content,
                        })
                else:
                    self.driver.get(link)
                    time.sleep(2)  # allow the article page to render
                    try:
                        title = self.driver.find_element(By.TAG_NAME, "h1").text
                        content = self.driver.find_element(By.TAG_NAME, "body").text
                        # skip near-empty pages (menus, redirects)
                        if len(content) > 300:
                            self.save_article({
                                'title': title,
                                'url': link,
                                'content': content,
                            })
                    except Exception as e:
                        # Missing <h1>/stale element: skip this page, keep crawling.
                        print(f" ! Skipping {link}: {e}")
                self.visited_urls.add(link)
        finally:
            # Always release the browser, even if the crawl raised.
            self.close_driver()


if __name__ == "__main__":
    crawler = DeepNiheCrawler()
    crawler.run_crawl("https://nihe.org.vn/vi/tin-tuc-su-kien", max_pages=5)