"""Crawl a fixed list of HCMUT web pages with crawl4ai and save their markdown to a text file."""

import os
import asyncio
from crawl4ai import AsyncWebCrawler

# HCMUT URLs to crawl (more pages can be appended).
# Every entry ends with a comma: without it, Python silently concatenates
# adjacent string literals into one (invalid) URL.
URLS = [
    "https://hcmut.edu.vn/thong-diep-cua-hieu-truong",
    "https://hcmut.edu.vn/gioi-thieu/ke-hoach-chien-luoc",
    "https://hcmut.edu.vn/tong-quan",
    "https://hcmut.edu.vn/gioi-thieu/xep-hang-dai-hoc",
    "https://hcmut.edu.vn/tuyen-sinh-dh/dai-hoc-chinh-quy",
    "https://hcmut.edu.vn/co-cau-to-chuc",
]

# Destination file for the combined crawl output.
OUTPUT_FILE = "Data/hcmut_crawl4ai.txt"

async def crawl_hcmut() -> None:
    """Crawl every URL in ``URLS`` and write the page markdown to ``OUTPUT_FILE``.

    Each successfully crawled page is written as a ``===== URL: ... =====``
    header line followed by its markdown and a blank line. Pages whose crawl
    failed (or produced no markdown) are skipped with a warning on stdout
    instead of aborting the whole run.

    The output file is overwritten on every call.
    """
    # Create the output directory if needed. It is derived from OUTPUT_FILE so
    # the two cannot drift apart; an empty dirname means the current directory.
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls=URLS)

        with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
            for result in results:
                # A failed crawl has no markdown; writing None would raise
                # TypeError and lose every page after it.
                if not result.success or result.markdown is None:
                    print(f"⚠️ Bỏ qua {result.url}: {result.error_message}")
                    continue

                f.write(f"===== URL: {result.url} =====\n")
                f.write(result.markdown)  # cleaned page content
                f.write("\n\n")

    print(f"✅ Đã lưu dữ liệu vào {OUTPUT_FILE}")

# Run the crawl only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    # asyncio.run drives the crawl_hcmut coroutine to completion.
    asyncio.run(crawl_hcmut())