# chatbot_nihe/src/crawling/base_crawler.py
# Auto Deploy Script — auto deploy from local machine (commit f9b0dca)
import hashlib
import os
import re
import time

import requests
from bs4 import BeautifulSoup
# Resolve paths correctly
# PROJECT_ROOT is two directories above this file (src/crawling/ -> repo root).
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
# Crawled articles are written here, one .txt file per article.
DATA_DIR = os.path.join(PROJECT_ROOT, 'data', 'raw')
# Site root; relative hrefs found on pages are resolved against it.
BASE_URL = "https://nihe.org.vn"
# Headers to mimic a real browser to avoid 403 Forbidden
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
def clean_filename(title, url):
    """Build a filesystem-safe filename stem from an article title and URL.

    The title is stripped of characters that are invalid on Windows/POSIX
    filesystems and truncated to 50 characters; a short URL digest is
    appended so distinct articles with the same title do not collide.

    Args:
        title: Article title (may be empty or contain unsafe characters).
        url: Article URL, used to derive a disambiguating suffix.

    Returns:
        A string like "Some Title_1a2b3c4d" (no extension).
    """
    safe_title = re.sub(r'[\\/*?:"<>|]', "", title)[:50].strip()
    if not safe_title:
        safe_title = "untitled"
    # Use a stable digest: builtin hash() is randomized per process
    # (PYTHONHASHSEED), so the same URL would get a different filename on
    # every run, defeating deduplication. md5 is fine here — not security.
    url_hash = hashlib.md5(url.encode("utf-8")).hexdigest()[:8]
    return f"{safe_title}_{url_hash}"
def get_article_content(url):
    """Download one article page and extract its title and main body text.

    Relative links are resolved against BASE_URL before fetching.

    Args:
        url: Absolute URL, or a site-relative path starting with "/".

    Returns:
        A dict with "title", "content" and "url" keys, or None when the
        page cannot be fetched or yields no usable text.
    """
    try:
        if not url.startswith('http'):
            url = BASE_URL + url
        print(f"Crawling: {url}")

        response = requests.get(url, headers=HEADERS, timeout=15)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: {response.status_code}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')

        # Title: prefer an <h1>, then a titled <h2>, then the page <title>.
        heading = (soup.find('h1')
                   or soup.find('h2', class_='title')
                   or soup.find('title'))
        title_text = heading.get_text(strip=True) if heading else "No Title"

        # Body: try the known article containers first; otherwise fall back
        # to harvesting every reasonably long <p> in the page body.
        container = soup.select_one('div.article-detail, div.content, div.post-content, article')
        if container is not None:
            texts = [el.get_text(strip=True)
                     for el in container.find_all(['p', 'div', 'span'])]
            content_text = "\n".join(t for t in texts if len(t) > 20)
        else:
            # Fallback to all long paragraphs in body
            texts = [p.get_text(strip=True) for p in soup.body.find_all('p')]
            content_text = "\n".join(t for t in texts if len(t) > 30)

        if not content_text.strip():
            return None

        return {
            "title": title_text,
            "content": content_text,
            "url": url,
        }
    except Exception as e:
        print(f"Error processing {url}: {e}")
        return None
def save_article(article):
    """Write a crawled article to DATA_DIR as a UTF-8 text file.

    The file contains a "Title:" line, a "URL:" line, a blank line, then
    the article body. Silently returns when given a falsy article (the
    crawler passes None on fetch failure).

    Args:
        article: Dict with "title", "content" and "url" keys, or None.
    """
    if not article:
        return
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists / os.makedirs pair.
    os.makedirs(DATA_DIR, exist_ok=True)
    filename = clean_filename(article['title'], article['url']) + ".txt"
    filepath = os.path.join(DATA_DIR, filename)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(f"Title: {article['title']}\n")
        f.write(f"URL: {article['url']}\n\n")
        f.write(article['content'])
    # Bug fix: the original printed the literal "(unknown)" instead of the
    # destination path.
    print(f"Saved: {filepath}")
def crawl_category(category_url, limit=10):
    """Scan one category listing page and crawl articles linked from it.

    Collects candidate article hrefs (deduplicated), skips obvious
    non-article pages, and fetches/saves up to ``limit`` articles with a
    1-second pause between requests.

    Args:
        category_url: Absolute URL of the category listing page.
        limit: Maximum number of articles to save from this page.
    """
    print(f"Scanning category: {category_url}")
    try:
        response = requests.get(category_url, headers=HEADERS, timeout=15)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Hrefs containing these tokens are navigation/utility pages,
        # not articles.
        skip_tokens = ('contact', 'login', 'register', 'search',
                       'category', 'danh-muc')
        candidates = set()
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            # Basic validation: long enough and pointing at this site.
            if len(href) <= 20:
                continue
            if not (href.startswith('/') or BASE_URL in href):
                continue
            if any(token in href for token in skip_tokens):
                continue
            candidates.add(href)

        links = list(candidates)
        print(f"Found {len(links)} potential articles. Processing up to {limit}...")

        saved = 0
        for link in links:
            if saved >= limit:
                break
            article = get_article_content(link)
            if article:
                save_article(article)
                saved += 1
            # Throttle between requests to be polite to the server.
            time.sleep(1)
    except Exception as e:
        print(f"Error crawling {category_url}: {e}")
if __name__ == "__main__":
    # Seed category pages on nihe.org.vn to harvest articles from.
    seed_urls = (
        "https://nihe.org.vn/vi/tin-tuc-su-kien",
        "https://nihe.org.vn/vi/y-hoc-du-phong",
        "https://nihe.org.vn/vi/thong-tin-suc-khoe",
        "https://nihe.org.vn/vi/vac-xin-tiem-chung",
    )
    for seed in seed_urls:
        crawl_category(seed, limit=5)