# chatbot_nihe/src/crawling/deep_crawler.py
# Auto-deploy script (deployed from local machine, commit f9b0dca).
import hashlib
import io
import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup
from docx import Document
from pypdf import PdfReader
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Paths
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
DATA_DIR = os.path.join(PROJECT_ROOT, 'data', 'raw_selenium')
BASE_URL = "https://nihe.org.vn"
class DeepNiheCrawler:
    """Selenium-based crawler that saves article text from nihe.org.vn.

    HTML pages are rendered with a headless Chrome driver; linked .pdf and
    .docx documents are downloaded and parsed directly. Each article is
    written as a .txt file under DATA_DIR.
    """

    def __init__(self, headless=True):
        # URLs already processed in this run, to avoid re-fetching.
        self.visited_urls = set()
        # Number of articles successfully written to disk.
        self.article_count = 0
        self.driver = None
        self.headless = headless
        os.makedirs(DATA_DIR, exist_ok=True)

    def setup_driver(self):
        """Create and store the Chrome WebDriver used for page rendering."""
        options = Options()
        if self.headless:
            options.add_argument('--headless=new')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--window-size=1920,1080')
        # Plain-browser user agent; some sites block the default Selenium UA.
        options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=options)

    def close_driver(self):
        """Quit the WebDriver if one was started."""
        if self.driver:
            self.driver.quit()

    def extract_text_from_file(self, url):
        """Download the .pdf/.docx at *url* and return its extracted text.

        Returns None on any download/parse failure, for unsupported
        extensions, or when at most 100 characters were extracted
        (treated as an empty or image-only document).
        """
        try:
            # NOTE(security): verify=False disables TLS certificate checks;
            # kept for parity with existing behavior — review before production use.
            response = requests.get(url, timeout=15, verify=False)
            if response.status_code != 200:
                return None
            buffer = io.BytesIO(response.content)
            text = ""
            if url.lower().endswith('.pdf'):
                reader = PdfReader(buffer)
                for page in reader.pages:
                    # extract_text() may return None for image-only pages;
                    # the original code raised TypeError on "+".
                    text += (page.extract_text() or "") + "\n"
            elif url.lower().endswith('.docx'):
                doc = Document(buffer)
                for para in doc.paragraphs:
                    text += para.text + "\n"
            stripped = text.strip()
            return stripped if len(stripped) > 100 else None
        except Exception:
            # Best-effort: one bad document must not stop the crawl.
            return None

    def save_article(self, article):
        """Write one article dict ({'title', 'url', 'content'}) to DATA_DIR."""
        if not article:
            return
        # Strip characters that are illegal in filenames; cap title length.
        safe_title = re.sub(r'[\\/*?:"<>|]', "", article['title'])[:80].strip()
        # Stable 8-char digest of the URL. The builtin hash() is randomized
        # per process (PYTHONHASHSEED), which produced different filenames —
        # and therefore duplicate files — on every run.
        url_hash = hashlib.md5(article['url'].encode('utf-8')).hexdigest()[:8]
        filename = f"{safe_title}_{url_hash}.txt"
        with open(os.path.join(DATA_DIR, filename), 'w', encoding='utf-8') as f:
            f.write(f"Title: {article['title']}\n")
            f.write(f"URL: {article['url']}\n\n")
            f.write(article['content'])
        self.article_count += 1
        # Original message printed a literal "(unknown)" — report the file.
        print(f" ✓ Saved: {filename}")

    def run_crawl(self, seed_url, max_pages=20):
        """Crawl up to *max_pages* internal links found on *seed_url*.

        HTML pages are rendered with Selenium; .pdf/.docx links go through
        extract_text_from_file(). The driver is always closed on exit.
        """
        print(f"Starting deep crawl from: {seed_url}")
        self.setup_driver()
        try:
            self.driver.get(seed_url)
            time.sleep(3)  # allow JS-rendered content to settle
            links = set()
            for a in self.driver.find_elements(By.TAG_NAME, "a"):
                href = a.get_attribute('href')
                # Keep only internal links in the Vietnamese section.
                if href and BASE_URL in href and '/vi/' in href:
                    links.add(href)
            print(f"Found {len(links)} links. Processing...")
            for link in list(links)[:max_pages]:
                if link in self.visited_urls:
                    continue
                if any(link.lower().endswith(ext) for ext in ['.pdf', '.docx']):
                    content = self.extract_text_from_file(link)
                    if content:
                        self.save_article({'title': link.split('/')[-1], 'url': link, 'content': content})
                else:
                    self.driver.get(link)
                    time.sleep(2)
                    try:
                        title = self.driver.find_element(By.TAG_NAME, "h1").text
                        content = self.driver.find_element(By.TAG_NAME, "body").text
                        # Skip thin/navigation-only pages.
                        if len(content) > 300:
                            self.save_article({'title': title, 'url': link, 'content': content})
                    except Exception:
                        # Page without an <h1>, or stale DOM — skip it.
                        pass
                self.visited_urls.add(link)
        finally:
            self.close_driver()
if __name__ == "__main__":
    # Entry point: crawl the NIHE news-and-events listing with a small page budget.
    seed = "https://nihe.org.vn/vi/tin-tuc-su-kien"
    DeepNiheCrawler().run_crawl(seed, max_pages=5)