Spaces:
Sleeping
Sleeping
File size: 1,059 Bytes
936810c 3381d43 936810c 3381d43 936810c 3381d43 936810c 3381d43 936810c 3381d43 936810c 3381d43 936810c 3381d43 936810c 3381d43 936810c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 | import os
import asyncio
from crawl4ai import AsyncWebCrawler
# HCMUT pages to crawl (more pages can be appended to this list).
# Every entry needs a trailing comma: adjacent string literals without one
# are silently concatenated by Python into a single, invalid URL.
URLS = [
    "https://hcmut.edu.vn/thong-diep-cua-hieu-truong",
    "https://hcmut.edu.vn/gioi-thieu/ke-hoach-chien-luoc",
    "https://hcmut.edu.vn/tong-quan",
    "https://hcmut.edu.vn/gioi-thieu/xep-hang-dai-hoc",
    "https://hcmut.edu.vn/tuyen-sinh-dh/dai-hoc-chinh-quy",
    "https://hcmut.edu.vn/co-cau-to-chuc",
]

# Text file that receives the crawled content of all pages.
OUTPUT_FILE = "Data/hcmut_crawl4ai.txt"
async def crawl_hcmut():
    """Crawl every page listed in ``URLS`` and save the text to ``OUTPUT_FILE``.

    Each successfully crawled page is written as a
    ``===== URL: <url> =====`` header line followed by the page's markdown
    and a blank separator. Pages that fail to crawl, or that come back with
    no markdown, are skipped with a console warning instead of aborting the
    whole export.
    """
    # Create the output directory if it does not exist yet. It is derived
    # from OUTPUT_FILE so the two can never drift apart.
    out_dir = os.path.dirname(OUTPUT_FILE)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Drop blank entries so the crawler is never asked to fetch an empty URL.
    urls = [url for url in URLS if url]

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls=urls)
        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            for result in results:
                # NOTE(review): per the crawl4ai docs a failed crawl has
                # success=False and may carry no markdown; writing that
                # straight to the file would raise TypeError.
                if not result.success or not result.markdown:
                    reason = result.error_message or "không có nội dung"
                    print(f"⚠️ Bỏ qua {result.url}: {reason}")
                    continue
                f.write(f"===== URL: {result.url} =====\n")
                f.write(str(result.markdown))  # cleaned page content
                f.write("\n\n")

    print(f"✅ Đã lưu dữ liệu vào {OUTPUT_FILE}")
# Script entry point: run the crawl only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    crawl_job = crawl_hcmut()
    asyncio.run(crawl_job)
|