"""Simple article crawler for https://nihe.org.vn.

Scans a handful of category pages, extracts article title/body text with
BeautifulSoup, and saves each article as a .txt file under data/raw/.
"""

import hashlib
import os
import re
import time

import requests
from bs4 import BeautifulSoup

# Resolve paths relative to the project root (two levels above this file).
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
DATA_DIR = os.path.join(PROJECT_ROOT, 'data', 'raw')

BASE_URL = "https://nihe.org.vn"

# Headers to mimic a real browser to avoid 403 Forbidden responses.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}


def clean_filename(title, url):
    """Build a filesystem-safe filename stem from an article title and URL.

    The title is stripped of characters illegal on common filesystems and
    truncated to 50 chars; an 8-character digest of the URL is appended so
    distinct articles with similar titles do not collide.
    """
    safe_title = re.sub(r'[\\/*?:"<>|]', "", title)[:50].strip()
    if not safe_title:
        safe_title = "untitled"
    # BUGFIX: the built-in hash() is salted per process (PYTHONHASHSEED),
    # so the same URL produced a different filename on every run and
    # re-crawls silently duplicated articles. Use a stable md5 digest.
    url_hash = hashlib.md5(url.encode('utf-8')).hexdigest()[:8]
    return f"{safe_title}_{url_hash}"


def get_article_content(url):
    """Fetch one article page and extract its title and body text.

    Relative URLs are resolved against BASE_URL. Returns a dict with
    'title', 'content', and 'url' keys, or None when the page cannot be
    fetched or yields no usable text.
    """
    try:
        if not url.startswith('http'):
            url = BASE_URL + url
        print(f"Crawling: {url}")
        response = requests.get(url, headers=HEADERS, timeout=15)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: {response.status_code}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')

        # Title extraction: prefer the article heading, fall back to <title>.
        title = soup.find('h1') or soup.find('h2', class_='title') or soup.find('title')
        title_text = title.get_text(strip=True) if title else "No Title"

        # Content extraction with multiple candidate containers.
        content_text = ""
        content_div = soup.select_one('div.article-detail, div.content, div.post-content, article')
        if content_div:
            paragraphs = content_div.find_all(['p', 'div', 'span'])
            # Length threshold filters out nav/boilerplate fragments.
            content_text = "\n".join(
                [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 20]
            )
        else:
            # Fallback: take all long paragraphs in the document body.
            # BUGFIX: soup.body can be None on malformed pages; previously
            # this raised AttributeError instead of returning None.
            body = soup.body or soup
            paras = body.find_all('p')
            content_text = "\n".join(
                [p.get_text(strip=True) for p in paras if len(p.get_text(strip=True)) > 30]
            )

        if not content_text.strip():
            return None
        return {
            "title": title_text,
            "content": content_text,
            "url": url,
        }
    except Exception as e:
        # Best-effort crawler: log and skip this URL rather than abort the run.
        print(f"Error processing {url}: {e}")
        return None


def save_article(article):
    """Write one extracted article to DATA_DIR as a UTF-8 .txt file.

    Accepts None (a failed extraction) and silently does nothing in that case.
    """
    if not article:
        return
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(DATA_DIR, exist_ok=True)
    filename = clean_filename(article['title'], article['url']) + ".txt"
    filepath = os.path.join(DATA_DIR, filename)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(f"Title: {article['title']}\n")
        f.write(f"URL: {article['url']}\n\n")
        f.write(article['content'])
    # BUGFIX: previously printed the literal placeholder "(unknown)" instead
    # of the actual destination path.
    print(f"Saved: {filepath}")


def crawl_category(category_url, limit=10):
    """Scan one category listing page and crawl up to `limit` article links.

    Links are heuristically filtered (length, same-site, and a blacklist of
    non-article paths) before being fetched one by one with a polite delay.
    """
    print(f"Scanning category: {category_url}")
    try:
        response = requests.get(category_url, headers=HEADERS, timeout=15)
        soup = BeautifulSoup(response.content, 'html.parser')

        links = []
        for a in soup.find_all('a', href=True):
            href = a['href']
            # Basic validation: long enough and pointing at this site.
            if len(href) > 20 and (href.startswith('/') or BASE_URL in href):
                # Skip navigation/utility pages that are not articles.
                if any(x in href for x in ['contact', 'login', 'register', 'search', 'category', 'danh-muc']):
                    continue
                links.append(href)

        # Dedupe; sorted() gives a deterministic crawl order (bare set
        # iteration order varies between runs).
        links = sorted(set(links))
        print(f"Found {len(links)} potential articles. Processing up to {limit}...")

        count = 0
        for link in links:
            if count >= limit:
                break
            article = get_article_content(link)
            if article:
                save_article(article)
                count += 1
            # Polite rate limit between requests.
            time.sleep(1)
    except Exception as e:
        print(f"Error crawling {category_url}: {e}")


if __name__ == "__main__":
    seed_urls = [
        "https://nihe.org.vn/vi/tin-tuc-su-kien",
        "https://nihe.org.vn/vi/y-hoc-du-phong",
        "https://nihe.org.vn/vi/thong-tin-suc-khoe",
        "https://nihe.org.vn/vi/vac-xin-tiem-chung",
    ]
    for url in seed_urls:
        crawl_category(url, limit=5)