import os
import time
import re
import json
import io
import hashlib

import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from pypdf import PdfReader
from docx import Document

# Paths
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
DATA_DIR = os.path.join(PROJECT_ROOT, 'data', 'raw_selenium')
BASE_URL = "https://nihe.org.vn"


class DeepNiheCrawler:
    """Selenium-driven crawler for nihe.org.vn.

    Collects article pages (and linked PDF/DOCX documents) reachable from a
    seed URL and saves each one as a plain-text file under ``DATA_DIR``.
    """

    def __init__(self, headless=True):
        """Initialize crawler state.

        Args:
            headless: Run Chrome without a visible window when True.
        """
        self.visited_urls = set()   # URLs already processed this session
        self.article_count = 0      # number of articles written to disk
        self.driver = None          # Selenium WebDriver, created lazily
        self.headless = headless
        # exist_ok avoids a TOCTOU race vs. the previous exists()-then-makedirs check
        os.makedirs(DATA_DIR, exist_ok=True)

    def setup_driver(self):
        """Create the Chrome WebDriver (auto-downloading a matching driver)."""
        options = Options()
        if self.headless:
            options.add_argument('--headless=new')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--window-size=1920,1080')
        options.add_argument(
            'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=options)

    def close_driver(self):
        """Quit the WebDriver if one was started."""
        if self.driver:
            self.driver.quit()

    def extract_text_from_file(self, url):
        """Download a PDF or DOCX from *url* and return its plain text.

        Returns:
            The extracted text if longer than 100 characters, else None
            (short extractions are treated as boilerplate/noise).
        """
        try:
            # SECURITY NOTE(review): verify=False disables TLS certificate
            # checking — kept for parity with the site's broken cert chain,
            # but confirm whether a CA bundle could be used instead.
            response = requests.get(url, timeout=15, verify=False)
            if response.status_code != 200:
                return None
            f = io.BytesIO(response.content)
            text = ""
            if url.lower().endswith('.pdf'):
                reader = PdfReader(f)
                for page in reader.pages:
                    # extract_text() may return None for image-only pages
                    text += (page.extract_text() or "") + "\n"
            elif url.lower().endswith('.docx'):
                doc = Document(f)
                for para in doc.paragraphs:
                    text += para.text + "\n"
            stripped = text.strip()
            return stripped if len(stripped) > 100 else None
        except Exception as e:
            # Best-effort download: log and skip rather than abort the crawl,
            # but never swallow KeyboardInterrupt/SystemExit (bare except did).
            print(f" ! Failed to extract {url}: {e}")
            return None

    def save_article(self, article):
        """Write one article dict (title/url/content) to DATA_DIR as .txt."""
        if not article:
            return
        safe_title = re.sub(r'[\\/*?:"<>|]', "", article['title'])[:80].strip()
        # md5 gives a filename suffix that is stable across runs, unlike
        # built-in hash() which is salted per process (PYTHONHASHSEED) and
        # would produce duplicate files on every re-crawl.
        url_hash = hashlib.md5(article['url'].encode('utf-8')).hexdigest()[:8]
        filename = f"{safe_title}_{url_hash}.txt"
        with open(os.path.join(DATA_DIR, filename), 'w', encoding='utf-8') as f:
            f.write(f"Title: {article['title']}\n")
            f.write(f"URL: {article['url']}\n\n")
            f.write(article['content'])
        self.article_count += 1
        print(f" ✓ Saved: {filename}")

    def run_crawl(self, seed_url, max_pages=20):
        """Crawl up to *max_pages* article/document links found on *seed_url*.

        Args:
            seed_url: Page whose anchors are harvested as crawl targets.
            max_pages: Maximum number of links to process.
        """
        print(f"Starting deep crawl from: {seed_url}")
        self.setup_driver()
        try:
            self.driver.get(seed_url)
            time.sleep(3)  # crude wait for JS-rendered links to appear

            links = set()
            for a in self.driver.find_elements(By.TAG_NAME, "a"):
                href = a.get_attribute('href')
                # keep only same-site Vietnamese-section URLs
                if href and BASE_URL in href and '/vi/' in href:
                    links.add(href)

            print(f"Found {len(links)} links. Processing...")
            for link in list(links)[:max_pages]:
                if link in self.visited_urls:
                    continue

                if any(link.lower().endswith(ext) for ext in ['.pdf', '.docx']):
                    # Binary document: fetch directly, bypassing the browser.
                    content = self.extract_text_from_file(link)
                    if content:
                        self.save_article({
                            'title': link.split('/')[-1],
                            'url': link,
                            'content': content,
                        })
                else:
                    self.driver.get(link)
                    time.sleep(2)  # allow the article page to render
                    try:
                        title = self.driver.find_element(By.TAG_NAME, "h1").text
                        content = self.driver.find_element(By.TAG_NAME, "body").text
                        # skip near-empty pages (menus, redirects)
                        if len(content) > 300:
                            self.save_article({
                                'title': title,
                                'url': link,
                                'content': content,
                            })
                    except Exception as e:
                        # Missing <h1>/stale element: skip this page, keep crawling.
                        print(f" ! Skipping {link}: {e}")
                self.visited_urls.add(link)
        finally:
            # Always release the browser, even if the crawl raised.
            self.close_driver()


if __name__ == "__main__":
    crawler = DeepNiheCrawler()
    crawler.run_crawl("https://nihe.org.vn/vi/tin-tuc-su-kien", max_pages=5)