Spaces:
Sleeping
Sleeping
import hashlib
import os
import re
import time

import requests
from bs4 import BeautifulSoup
# Resolve paths correctly
# Project root is assumed to be two directory levels above this file
# (i.e. the script lives in <root>/src/<pkg>/ — TODO confirm layout).
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
# Crawled articles are written here, one .txt file per article.
DATA_DIR = os.path.join(PROJECT_ROOT, 'data', 'raw')
# Target site root; relative hrefs found while crawling are joined onto this.
BASE_URL = "https://nihe.org.vn"
# Headers to mimic a real browser to avoid 403 Forbidden
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
}
def clean_filename(title, url):
    """Build a filesystem-safe, collision-resistant file stem for an article.

    Parameters
    ----------
    title : str
        Article title; characters illegal in Windows file names are stripped
        and the result is truncated to 50 characters.
    url : str
        Article URL, hashed to disambiguate articles with identical titles.

    Returns
    -------
    str
        ``"<safe_title>_<8-hex-char url hash>"`` (``"untitled"`` when the
        title is empty after sanitizing).
    """
    safe_title = re.sub(r'[\\/*?:"<>|]', "", title)[:50].strip()
    if not safe_title:
        safe_title = "untitled"
    # BUG FIX: the original used built-in hash(), which is randomized per
    # interpreter run (PYTHONHASHSEED), so the same article produced a
    # different file name on every run — re-crawls created duplicates instead
    # of overwriting. An md5 digest is stable across runs and platforms.
    url_hash = hashlib.md5(url.encode('utf-8')).hexdigest()[:8]
    return f"{safe_title}_{url_hash}"
def get_article_content(url):
    """Fetch one article page and extract its title and main body text.

    Parameters
    ----------
    url : str
        Absolute URL, or a site-relative path that is joined onto BASE_URL.

    Returns
    -------
    dict | None
        ``{"title", "content", "url"}`` on success; ``None`` when the fetch
        returns a non-200 status, no usable text is found, or any error
        occurs (best-effort crawler: problems are printed, never raised).
    """
    try:
        if not url.startswith('http'):
            url = BASE_URL + url
        print(f"Crawling: {url}")
        response = requests.get(url, headers=HEADERS, timeout=15)
        if response.status_code != 200:
            print(f"Failed to fetch {url}: {response.status_code}")
            return None
        soup = BeautifulSoup(response.content, 'html.parser')
        # Title extraction: prefer the article <h1>, then a titled <h2>,
        # finally the page <title>.
        title = soup.find('h1') or soup.find('h2', class_='title') or soup.find('title')
        title_text = title.get_text(strip=True) if title else "No Title"
        # Content extraction with multiple selectors (first match wins).
        content_text = ""
        content_div = soup.select_one('div.article-detail, div.content, div.post-content, article')
        if content_div:
            paragraphs = content_div.find_all(['p', 'div', 'span'])
            content_text = "\n".join(
                p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 20
            )
        else:
            # Fallback: collect all long paragraphs in the document body.
            # BUG FIX: soup.body is None on malformed/partial HTML, which made
            # the original raise AttributeError into the catch-all below and
            # log a spurious "Error processing"; fall back to the whole tree.
            body = soup.body or soup
            paras = body.find_all('p')
            content_text = "\n".join(
                p.get_text(strip=True) for p in paras if len(p.get_text(strip=True)) > 30
            )
        if not content_text.strip():
            return None
        return {
            "title": title_text,
            "content": content_text,
            "url": url
        }
    except Exception as e:  # best-effort: one bad page must not abort the crawl
        print(f"Error processing {url}: {e}")
        return None
def save_article(article):
    """Write one crawled article to DATA_DIR as a UTF-8 ``.txt`` file.

    Parameters
    ----------
    article : dict | None
        Mapping with ``'title'``, ``'content'`` and ``'url'`` keys, as
        produced by ``get_article_content``. Falsy values are ignored so the
        caller can pass failed crawls straight through.
    """
    if not article:
        return
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists() guard.
    os.makedirs(DATA_DIR, exist_ok=True)
    filename = clean_filename(article['title'], article['url']) + ".txt"
    filepath = os.path.join(DATA_DIR, filename)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(f"Title: {article['title']}\n")
        f.write(f"URL: {article['url']}\n\n")
        f.write(article['content'])
    # BUG FIX: the original printed a literal placeholder ("Saved: (unknown)")
    # instead of reporting which file was written.
    print(f"Saved: {filename}")
def crawl_category(category_url, limit=10):
    """Scan a category/listing page and crawl up to *limit* linked articles.

    Parameters
    ----------
    category_url : str
        Absolute URL of a listing page on the target site.
    limit : int, optional
        Maximum number of articles to fetch and save (default 10).

    Side effects: saves article files via ``save_article`` and prints
    progress. Errors are printed, never raised (best-effort crawler).
    """
    print(f"Scanning category: {category_url}")
    try:
        response = requests.get(category_url, headers=HEADERS, timeout=15)
        soup = BeautifulSoup(response.content, 'html.parser')
        links = []
        for a in soup.find_all('a', href=True):
            href = a['href']
            # Basic validation: long enough to be an article slug, on-site,
            # and not an obvious navigation/utility page.
            if len(href) > 20 and (href.startswith('/') or BASE_URL in href):
                if any(x in href for x in ['contact', 'login', 'register', 'search', 'category', 'danh-muc']):
                    continue
                links.append(href)
        # BUG FIX: list(set(...)) has hash-randomized order, so which articles
        # made it under `limit` changed nondeterministically between runs;
        # sorting makes the selection reproducible while still deduplicating.
        links = sorted(set(links))
        print(f"Found {len(links)} potential articles. Processing up to {limit}...")
        count = 0
        for link in links:
            if count >= limit:
                break
            article = get_article_content(link)
            if article:
                save_article(article)
                count += 1
            time.sleep(1)  # throttle: be polite to the server
    except Exception as e:  # best-effort: log and move on to the next category
        print(f"Error crawling {category_url}: {e}")
| if __name__ == "__main__": | |
| seed_urls = [ | |
| "https://nihe.org.vn/vi/tin-tuc-su-kien", | |
| "https://nihe.org.vn/vi/y-hoc-du-phong", | |
| "https://nihe.org.vn/vi/thong-tin-suc-khoe", | |
| "https://nihe.org.vn/vi/vac-xin-tiem-chung" | |
| ] | |
| for url in seed_urls: | |
| crawl_category(url, limit=5) | |