DoAn

File size: 1,619 Bytes

b91b0a5
 
4ff2e4d
 
 
b91b0a5
 
 
c429a2d
4ff2e4d
b91b0a5
943f176
4ff2e4d
 
9681056
4ff2e4d
 
b91b0a5
9681056
4ff2e4d
b91b0a5
4ff2e4d
b91b0a5
9681056
 
 
 
4ff2e4d
9681056
b91b0a5
 
9681056
 
4ff2e4d
b91b0a5
 
9681056
 
4ff2e4d
 
 
b91b0a5
4ff2e4d
9681056
 
 
 
 
b91b0a5
4ff2e4d
9681056
4ff2e4d
b91b0a5
9681056
4ff2e4d
 
b91b0a5

"""Script test chunking markdown file."""

import sys
sys.path.insert(0, "/home/bahung/DoAn")

from dotenv import load_dotenv
load_dotenv()  # Load biến môi trường từ .env

from core.rag.chunk import chunk_markdown_file

# Markdown document used as the chunking sample.
test_file = "data/data_process/chuong_trinh_dao_tao/1.1. Kỹ thuật Cơ điện tử.md"

# Header banner announcing which file is being processed.
banner = "=" * 70
print(banner)
print(f" File: {test_file}")
print(banner)

# Split the markdown file into nodes.
nodes = chunk_markdown_file(test_file)

print(f"\n Tổng số nodes: {len(nodes)}\n")

# Console report: type, length, metadata and a short preview of every node.
rule = "─" * 70
for idx, node in enumerate(nodes):
    text = node.get_content()
    meta = node.metadata

    print(f"\n{rule}")
    print(f" NODE #{idx}")
    print(f"   Loại: {type(node).__name__}")
    print(f"   Độ dài: {len(text)} ký tự")
    if meta:
        print(f"   Metadata: {meta}")
    print(rule)

    # Show at most 200 characters; an ellipsis marks truncated content.
    print(text if len(text) <= 200 else text[:200] + "...")

# Dump every node to a markdown file so the result is easier to inspect.
with open("test_chunk.md", "w", encoding="utf-8") as out:
    for idx, node in enumerate(nodes):
        text = node.get_content()
        meta = node.metadata

        # Per-node heading, node type, and the metadata section title.
        out.write(
            f"# NODE {idx}\n"
            f"**Loại:** {type(node).__name__}\n\n"
            "**Metadata:**\n"
        )
        # One bullet per metadata entry, in the mapping's iteration order.
        out.writelines(f"- {name}: {val}\n" for name, val in meta.items())
        # Node content followed by a horizontal-rule separator.
        out.write("\n**Nội dung:**\n")
        out.write(text)
        out.write("\n\n---\n\n")

print("\n Hoàn tất!")