Spaces:
Sleeping
Sleeping
import hashlib
import os
import re
import time

import requests
from bs4 import BeautifulSoup
# Resolve paths correctly
# Project root is assumed to be two directory levels above this file
# (i.e. the script lives in <root>/src/<pkg>/ — TODO confirm layout).
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
# Crawled articles are written here, one .txt file per article.
DATA_DIR = os.path.join(PROJECT_ROOT, 'data', 'raw')
# Target site root; relative hrefs found while crawling are joined onto this.
BASE_URL = "https://nihe.org.vn"
# Headers to mimic a real browser to avoid 403 Forbidden
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
def clean_filename(title, url):
    """Build a filesystem-safe, collision-resistant file stem for an article.

    Parameters
    ----------
    title : str
        Article title; characters illegal in Windows file names are stripped
        and the result is truncated to 50 characters.
    url : str
        Article URL, hashed to disambiguate articles with identical titles.

    Returns
    -------
    str
        ``"<safe_title>_<8-hex-char url hash>"`` (``"untitled"`` when the
        title is empty after sanitizing).
    """
    safe_title = re.sub(r'[\\/*?:"<>|]', "", title)[:50].strip()
    if not safe_title:
        safe_title = "untitled"
    # BUG FIX: the original used built-in hash(), which is randomized per
    # interpreter run (PYTHONHASHSEED), so the same article produced a
    # different file name on every run — re-crawls created duplicates instead
    # of overwriting. An md5 digest is stable across runs and platforms.
    url_hash = hashlib.md5(url.encode('utf-8')).hexdigest()[:8]
    return f"{safe_title}_{url_hash}"
def get_article_content(url):
    """Fetch one article page and extract its title and main body text.

    Parameters
    ----------
    url : str
        Absolute URL, or a site-relative path that is joined onto BASE_URL.

    Returns
    -------
    dict | None
        ``{"title", "content", "url"}`` on success; ``None`` when the fetch
        returns a non-200 status, no usable text is found, or any error
        occurs (best-effort crawler: problems are printed, never raised).
    """
    try:
        if not url.startswith('http'):
            url = BASE_URL + url
        print(f"Crawling: {url}")
        response = requests.get(url, headers=HEADERS, timeout=15)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: {response.status_code}")
            return None
        soup = BeautifulSoup(response.content, 'html.parser')
        # Title extraction: prefer the article <h1>, then a titled <h2>,
        # finally the page <title>.
        title = soup.find('h1') or soup.find('h2', class_='title') or soup.find('title')
        title_text = title.get_text(strip=True) if title else "No Title"
        # Content extraction with multiple selectors (first match wins).
        content_text = ""
        content_div = soup.select_one('div.article-detail, div.content, div.post-content, article')
        if content_div:
            paragraphs = content_div.find_all(['p', 'div', 'span'])
            content_text = "\n".join(
                p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 20
            )
        else:
            # Fallback: collect all long paragraphs in the document body.
            # BUG FIX: soup.body is None on malformed/partial HTML, which made
            # the original raise AttributeError into the catch-all below and
            # log a spurious "Error processing"; fall back to the whole tree.
            body = soup.body or soup
            paras = body.find_all('p')
            content_text = "\n".join(
                p.get_text(strip=True) for p in paras if len(p.get_text(strip=True)) > 30
            )
        if not content_text.strip():
            return None
        return {
            "title": title_text,
            "content": content_text,
            "url": url
        }
    except Exception as e:  # best-effort: one bad page must not abort the crawl
        print(f"Error processing {url}: {e}")
        return None
def save_article(article):
    """Write one crawled article to DATA_DIR as a UTF-8 ``.txt`` file.

    Parameters
    ----------
    article : dict | None
        Mapping with ``'title'``, ``'content'`` and ``'url'`` keys, as
        produced by ``get_article_content``. Falsy values are ignored so the
        caller can pass failed crawls straight through.
    """
    if not article:
        return
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists() guard.
    os.makedirs(DATA_DIR, exist_ok=True)
    filename = clean_filename(article['title'], article['url']) + ".txt"
    filepath = os.path.join(DATA_DIR, filename)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(f"Title: {article['title']}\n")
        f.write(f"URL: {article['url']}\n\n")
        f.write(article['content'])
    # BUG FIX: the original printed a literal placeholder ("Saved: (unknown)")
    # instead of reporting which file was written.
    print(f"Saved: {filename}")
def crawl_category(category_url, limit=10):
    """Scan a category/listing page and crawl up to *limit* linked articles.

    Parameters
    ----------
    category_url : str
        Absolute URL of a listing page on the target site.
    limit : int, optional
        Maximum number of articles to fetch and save (default 10).

    Side effects: saves article files via ``save_article`` and prints
    progress. Errors are printed, never raised (best-effort crawler).
    """
    print(f"Scanning category: {category_url}")
    try:
        response = requests.get(category_url, headers=HEADERS, timeout=15)
        soup = BeautifulSoup(response.content, 'html.parser')
        links = []
        for a in soup.find_all('a', href=True):
            href = a['href']
            # Basic validation: long enough to be an article slug, on-site,
            # and not an obvious navigation/utility page.
            if len(href) > 20 and (href.startswith('/') or BASE_URL in href):
                if any(x in href for x in ['contact', 'login', 'register', 'search', 'category', 'danh-muc']):
                    continue
                links.append(href)
        # BUG FIX: list(set(...)) has hash-randomized order, so which articles
        # made it under `limit` changed nondeterministically between runs;
        # sorting makes the selection reproducible while still deduplicating.
        links = sorted(set(links))
        print(f"Found {len(links)} potential articles. Processing up to {limit}...")
        count = 0
        for link in links:
            if count >= limit:
                break
            article = get_article_content(link)
            if article:
                save_article(article)
                count += 1
            time.sleep(1)  # throttle: be polite to the server
    except Exception as e:  # best-effort: log and move on to the next category
        print(f"Error crawling {category_url}: {e}")
| if __name__ == "__main__": | |
| seed_urls = [ | |
| "https://nihe.org.vn/vi/tin-tuc-su-kien", | |
| "https://nihe.org.vn/vi/y-hoc-du-phong", | |
| "https://nihe.org.vn/vi/thong-tin-suc-khoe", | |
| "https://nihe.org.vn/vi/vac-xin-tiem-chung" | |
| ] | |
| for url in seed_urls: | |
| crawl_category(url, limit=5) | |