Spaces:

hung2903
/

chatbot_nihe

Sleeping

chatbot_nihe / src /crawling /deep_crawler.py

Auto Deploy Script

Auto deploy from local machine

f9b0dca about 1 month ago

4.56 kB

	import os
	import time
	import re
	import json
	import requests
	import io
	from selenium import webdriver
	from selenium.webdriver.chrome.service import Service
	from selenium.webdriver.chrome.options import Options
	from selenium.webdriver.common.by import By
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	from webdriver_manager.chrome import ChromeDriverManager
	from bs4 import BeautifulSoup
	from pypdf import PdfReader
	from docx import Document

	# Paths
	PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
	DATA_DIR = os.path.join(PROJECT_ROOT, 'data', 'raw_selenium')
	BASE_URL = "https://nihe.org.vn"

	class DeepNiheCrawler:
	def __init__(self, headless=True):
	self.visited_urls = set()
	self.article_count = 0
	self.driver = None
	self.headless = headless
	if not os.path.exists(DATA_DIR):
	os.makedirs(DATA_DIR)

	def setup_driver(self):
	options = Options()
	if self.headless:
	options.add_argument('--headless=new')
	options.add_argument('--no-sandbox')
	options.add_argument('--disable-dev-shm-usage')
	options.add_argument('--window-size=1920,1080')
	options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')

	service = Service(ChromeDriverManager().install())
	self.driver = webdriver.Chrome(service=service, options=options)

	def close_driver(self):
	if self.driver:
	self.driver.quit()

	def extract_text_from_file(self, url):
	try:
	response = requests.get(url, timeout=15, verify=False)
	if response.status_code != 200: return None

	f = io.BytesIO(response.content)
	text = ""
	if url.lower().endswith('.pdf'):
	reader = PdfReader(f)
	for page in reader.pages:
	text += page.extract_text() + "\n"
	elif url.lower().endswith('.docx'):
	doc = Document(f)
	for para in doc.paragraphs:
	text += para.text + "\n"
	return text.strip() if len(text.strip()) > 100 else None
	except:
	return None

	def save_article(self, article):
	if not article: return
	safe_title = re.sub(r'[\\/*?:"<>\|]', "", article['title'])[:80].strip()
	url_hash = str(abs(hash(article['url'])))[:8]
	filename = f"{safe_title}_{url_hash}.txt"

	with open(os.path.join(DATA_DIR, filename), 'w', encoding='utf-8') as f:
	f.write(f"Title: {article['title']}\n")
	f.write(f"URL: {article['url']}\n\n")
	f.write(article['content'])
	self.article_count += 1
	print(f" ✓ Saved: {filename}")

	def run_crawl(self, seed_url, max_pages=20):
	print(f"Starting deep crawl from: {seed_url}")
	self.setup_driver()
	try:
	self.driver.get(seed_url)
	time.sleep(3)

	links = set()
	a_elements = self.driver.find_elements(By.TAG_NAME, "a")
	for a in a_elements:
	href = a.get_attribute('href')
	if href and BASE_URL in href and '/vi/' in href:
	links.add(href)

	print(f"Found {len(links)} links. Processing...")
	for link in list(links)[:max_pages]:
	if link in self.visited_urls: continue

	if any(link.lower().endswith(ext) for ext in ['.pdf', '.docx']):
	content = self.extract_text_from_file(link)
	if content:
	self.save_article({'title': link.split('/')[-1], 'url': link, 'content': content})
	else:
	self.driver.get(link)
	time.sleep(2)
	try:
	title = self.driver.find_element(By.TAG_NAME, "h1").text
	content = self.driver.find_element(By.TAG_NAME, "body").text
	if len(content) > 300:
	self.save_article({'title': title, 'url': link, 'content': content})
	except:
	pass
	self.visited_urls.add(link)
	finally:
	self.close_driver()

	if __name__ == "__main__":
	crawler = DeepNiheCrawler()
	crawler.run_crawl("https://nihe.org.vn/vi/tin-tuc-su-kien", max_pages=5)