Spaces:
Sleeping
Sleeping
import asyncio
import os
from pathlib import Path

from crawl4ai import AsyncWebCrawler
# HCMUT pages to crawl; append more URLs here as needed.
# Every entry must end with a comma: adjacent string literals without one
# are silently concatenated by Python into a single (invalid) URL.
URLS = [
    "https://hcmut.edu.vn/thong-diep-cua-hieu-truong",
    "https://hcmut.edu.vn/gioi-thieu/ke-hoach-chien-luoc",
    "https://hcmut.edu.vn/tong-quan",
    "https://hcmut.edu.vn/gioi-thieu/xep-hang-dai-hoc",
    "https://hcmut.edu.vn/tuyen-sinh-dh/dai-hoc-chinh-quy",
    "https://hcmut.edu.vn/co-cau-to-chuc",
]

# Text file that receives the crawled content of every page.
OUTPUT_FILE = "Data/hcmut_crawl4ai.txt"
async def crawl_hcmut():
    """Crawl every page in ``URLS`` and save the markdown to ``OUTPUT_FILE``.

    Each page that yields markdown is written as a
    ``===== URL: <url> =====`` header line, followed by the page's markdown
    and a blank line. Results flagged as unsuccessful, or carrying no
    markdown, are reported on stdout and skipped, so a single bad page does
    not abort the run with a ``TypeError`` from writing ``None``.

    Returns:
        None. The output is the file written at ``OUTPUT_FILE``.
    """
    # Create the output directory if it is missing. It is derived from
    # OUTPUT_FILE so the two cannot drift apart when the path is changed.
    Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)

    # Ignore blank entries so a stray "" in URLS is never sent to the crawler.
    urls = [url for url in URLS if url]

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls=urls)

    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        for result in results:
            if not result.success or not result.markdown:
                reason = result.error_message or "không có nội dung"
                print(f"⚠️ Bỏ qua {result.url}: {reason}")
                continue
            f.write(f"===== URL: {result.url} =====\n")
            f.write(result.markdown)  # cleaned page content
            f.write("\n\n")

    print(f"✅ Đã lưu dữ liệu vào {OUTPUT_FILE}")
if __name__ == "__main__":
    # Script entry point: build the crawl coroutine, then run it to
    # completion on a fresh event loop.
    crawl_job = crawl_hcmut()
    asyncio.run(crawl_job)