# NOTE: the lines below were scraped with Hugging Face Spaces status residue
# ("Spaces: Sleeping") and markdown-table pipes around each code line.
import hashlib
import io
import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup
from docx import Document
from pypdf import PdfReader
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
| # Paths | |
| PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) | |
| DATA_DIR = os.path.join(PROJECT_ROOT, 'data', 'raw_selenium') | |
| BASE_URL = "https://nihe.org.vn" | |
class DeepNiheCrawler:
    """Selenium-driven crawler for nihe.org.vn.

    Collects on-site links from a seed page, extracts text from HTML pages
    (via the rendered DOM) and from linked .pdf/.docx files, and saves each
    article as a .txt file under DATA_DIR.
    """

    def __init__(self, headless=True):
        """Prepare crawler state; the browser is started lazily by run_crawl.

        :param headless: run Chrome without a visible window.
        """
        self.visited_urls = set()   # absolute URLs already processed
        self.article_count = 0      # number of articles written to disk
        self.driver = None          # Selenium WebDriver, created in setup_driver
        self.headless = headless
        # exist_ok avoids the check-then-create race of the original code
        os.makedirs(DATA_DIR, exist_ok=True)

    def setup_driver(self):
        """Start a Chrome WebDriver (driver binary auto-managed)."""
        options = Options()
        if self.headless:
            options.add_argument('--headless=new')
        # Flags commonly required to run Chrome inside containers/CI.
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--window-size=1920,1080')
        options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=options)

    def close_driver(self):
        """Quit the browser if it is running."""
        if self.driver:
            self.driver.quit()
            # Drop the reference so a second close_driver() is a no-op and
            # the crawler can be restarted cleanly.
            self.driver = None

    def extract_text_from_file(self, url):
        """Download a .pdf/.docx at *url* and return its text.

        Returns None for unreachable, unparseable, or too-short (<=100 chars)
        documents; this method is deliberately best-effort.
        """
        try:
            # NOTE(review): verify=False disables TLS certificate checks —
            # presumably needed for this site's certificate; confirm.
            response = requests.get(url, timeout=15, verify=False)
            if response.status_code != 200:
                return None
            buffer = io.BytesIO(response.content)
            text = ""
            lowered = url.lower()
            if lowered.endswith('.pdf'):
                reader = PdfReader(buffer)
                for page in reader.pages:
                    # extract_text() may return None (e.g. image-only pages);
                    # the original code crashed on that with a TypeError.
                    text += (page.extract_text() or "") + "\n"
            elif lowered.endswith('.docx'):
                doc = Document(buffer)
                for para in doc.paragraphs:
                    text += para.text + "\n"
            stripped = text.strip()
            return stripped if len(stripped) > 100 else None
        except Exception:
            # Narrowed from a bare except: keep best-effort behavior but do
            # not swallow KeyboardInterrupt/SystemExit.
            return None

    def save_article(self, article):
        """Write one article dict ({'title', 'url', 'content'}) to DATA_DIR."""
        if not article:
            return
        # Strip characters that are illegal in filenames; cap title length.
        safe_title = re.sub(r'[\\/*?:"<>|]', "", article['title'])[:80].strip()
        # md5 instead of builtin hash(): str hashing is salted per process
        # (PYTHONHASHSEED), so re-runs would generate new names and duplicate
        # every previously saved article.
        url_hash = hashlib.md5(article['url'].encode('utf-8')).hexdigest()[:8]
        filename = f"{safe_title}_{url_hash}.txt"
        with open(os.path.join(DATA_DIR, filename), 'w', encoding='utf-8') as f:
            f.write(f"Title: {article['title']}\n")
            f.write(f"URL: {article['url']}\n\n")
            f.write(article['content'])
        self.article_count += 1
        # Bug fix: the original printed the literal text "(unknown)".
        print(f" ✓ Saved: {filename}")

    def run_crawl(self, seed_url, max_pages=20):
        """Crawl up to *max_pages* links discovered on *seed_url*.

        HTML pages are saved from the rendered DOM (h1 title + body text,
        bodies under 300 chars are skipped); .pdf/.docx links go through
        extract_text_from_file. The driver is always closed on exit.
        """
        print(f"Starting deep crawl from: {seed_url}")
        self.setup_driver()
        try:
            self.driver.get(seed_url)
            time.sleep(3)  # allow JS-rendered links to appear
            links = set()
            for anchor in self.driver.find_elements(By.TAG_NAME, "a"):
                href = anchor.get_attribute('href')
                # Keep only on-site links in the Vietnamese section.
                if href and BASE_URL in href and '/vi/' in href:
                    links.add(href)
            print(f"Found {len(links)} links. Processing...")
            for link in list(links)[:max_pages]:
                if link in self.visited_urls:
                    continue
                if any(link.lower().endswith(ext) for ext in ('.pdf', '.docx')):
                    content = self.extract_text_from_file(link)
                    if content:
                        self.save_article({'title': link.split('/')[-1], 'url': link, 'content': content})
                else:
                    self.driver.get(link)
                    time.sleep(2)
                    try:
                        title = self.driver.find_element(By.TAG_NAME, "h1").text
                        content = self.driver.find_element(By.TAG_NAME, "body").text
                        if len(content) > 300:
                            self.save_article({'title': title, 'url': link, 'content': content})
                    except Exception:
                        # Narrowed from bare except: pages without an <h1>
                        # (or stale driver state) are simply skipped.
                        pass
                self.visited_urls.add(link)
        finally:
            self.close_driver()
if __name__ == "__main__":
    # Script entry point: crawl the news/events section, capped at 5 pages.
    seed = "https://nihe.org.vn/vi/tin-tuc-su-kien"
    DeepNiheCrawler(headless=True).run_crawl(seed, max_pages=5)