"""Simple article crawler for https://nihe.org.vn.

Scans a handful of category pages, extracts article title/body text with
BeautifulSoup, and saves each article as a .txt file under data/raw/.
"""

import hashlib
import os
import re
import time

import requests
from bs4 import BeautifulSoup

# Resolve paths relative to the project root (two levels above this file).
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
DATA_DIR = os.path.join(PROJECT_ROOT, 'data', 'raw')

BASE_URL = "https://nihe.org.vn"

# Headers to mimic a real browser to avoid 403 Forbidden responses.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}


def clean_filename(title, url):
    """Build a filesystem-safe filename stem from an article title and URL.

    The title is stripped of characters illegal on common filesystems and
    truncated to 50 chars; an 8-character digest of the URL is appended so
    distinct articles with similar titles do not collide.
    """
    safe_title = re.sub(r'[\\/*?:"<>|]', "", title)[:50].strip()
    if not safe_title:
        safe_title = "untitled"
    # BUGFIX: the built-in hash() is salted per process (PYTHONHASHSEED),
    # so the same URL produced a different filename on every run and
    # re-crawls silently duplicated articles. Use a stable md5 digest.
    url_hash = hashlib.md5(url.encode('utf-8')).hexdigest()[:8]
    return f"{safe_title}_{url_hash}"


def get_article_content(url):
    """Fetch one article page and extract its title and body text.

    Relative URLs are resolved against BASE_URL. Returns a dict with
    'title', 'content', and 'url' keys, or None when the page cannot be
    fetched or yields no usable text.
    """
    try:
        if not url.startswith('http'):
            url = BASE_URL + url
        print(f"Crawling: {url}")
        response = requests.get(url, headers=HEADERS, timeout=15)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: {response.status_code}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')

        # Title extraction: prefer the article heading, fall back to <title>.
        title = soup.find('h1') or soup.find('h2', class_='title') or soup.find('title')
        title_text = title.get_text(strip=True) if title else "No Title"

        # Content extraction with multiple candidate containers.
        content_text = ""
        content_div = soup.select_one('div.article-detail, div.content, div.post-content, article')
        if content_div:
            paragraphs = content_div.find_all(['p', 'div', 'span'])
            # Length threshold filters out nav/boilerplate fragments.
            content_text = "\n".join(
                [p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 20]
            )
        else:
            # Fallback: take all long paragraphs in the document body.
            # BUGFIX: soup.body can be None on malformed pages; previously
            # this raised AttributeError instead of returning None.
            body = soup.body or soup
            paras = body.find_all('p')
            content_text = "\n".join(
                [p.get_text(strip=True) for p in paras if len(p.get_text(strip=True)) > 30]
            )

        if not content_text.strip():
            return None
        return {
            "title": title_text,
            "content": content_text,
            "url": url,
        }
    except Exception as e:
        # Best-effort crawler: log and skip this URL rather than abort the run.
        print(f"Error processing {url}: {e}")
        return None


def save_article(article):
    """Write one extracted article to DATA_DIR as a UTF-8 .txt file.

    Accepts None (a failed extraction) and silently does nothing in that case.
    """
    if not article:
        return
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(DATA_DIR, exist_ok=True)
    filename = clean_filename(article['title'], article['url']) + ".txt"
    filepath = os.path.join(DATA_DIR, filename)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(f"Title: {article['title']}\n")
        f.write(f"URL: {article['url']}\n\n")
        f.write(article['content'])
    # BUGFIX: previously printed the literal placeholder "(unknown)" instead
    # of the actual destination path.
    print(f"Saved: {filepath}")


def crawl_category(category_url, limit=10):
    """Scan one category listing page and crawl up to `limit` article links.

    Links are heuristically filtered (length, same-site, and a blacklist of
    non-article paths) before being fetched one by one with a polite delay.
    """
    print(f"Scanning category: {category_url}")
    try:
        response = requests.get(category_url, headers=HEADERS, timeout=15)
        soup = BeautifulSoup(response.content, 'html.parser')

        links = []
        for a in soup.find_all('a', href=True):
            href = a['href']
            # Basic validation: long enough and pointing at this site.
            if len(href) > 20 and (href.startswith('/') or BASE_URL in href):
                # Skip navigation/utility pages that are not articles.
                if any(x in href for x in ['contact', 'login', 'register', 'search', 'category', 'danh-muc']):
                    continue
                links.append(href)

        # Dedupe; sorted() gives a deterministic crawl order (bare set
        # iteration order varies between runs).
        links = sorted(set(links))
        print(f"Found {len(links)} potential articles. Processing up to {limit}...")

        count = 0
        for link in links:
            if count >= limit:
                break
            article = get_article_content(link)
            if article:
                save_article(article)
                count += 1
            # Polite rate limit between requests.
            time.sleep(1)
    except Exception as e:
        print(f"Error crawling {category_url}: {e}")


if __name__ == "__main__":
    seed_urls = [
        "https://nihe.org.vn/vi/tin-tuc-su-kien",
        "https://nihe.org.vn/vi/y-hoc-du-phong",
        "https://nihe.org.vn/vi/thong-tin-suc-khoe",
        "https://nihe.org.vn/vi/vac-xin-tiem-chung",
    ]
    for url in seed_urls:
        crawl_category(url, limit=5)