File size: 2,247 Bytes
b347548
c03c816
 
b347548
 
 
c03c816
b347548
 
c03c816
b347548
c03c816
b347548
 
c03c816
b347548
c03c816
 
 
 
 
 
b347548
c03c816
 
 
 
 
 
 
 
b347548
c03c816
b347548
c03c816
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b347548
 
 
 
 
843c84e
c03c816
843c84e
 
 
 
c03c816
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""
Document Loader Module
Loads markdown files from the data/ directory with metadata enrichment.
"""

from pathlib import Path
from langchain_community.document_loaders import DirectoryLoader, TextLoader


def load_upb_documents(show_progress=True):
    """
    Load all markdown files from data/ directory and subdirectories.

    Every ``.md`` file found recursively under ``data/`` is read as UTF-8
    text. Each resulting document then receives a ``category`` metadata
    entry derived from the first path component of the file relative to
    ``data/``; anything that does not match a known subdirectory is
    labelled ``'general'``.

    Args:
        show_progress: Whether to show progress bar (default: True)

    Returns:
        list: List of LangChain Document objects with content and metadata
    """
    # First path component under data/ -> category label stored in metadata.
    # NOTE(review): 'engineerings' is kept verbatim from the original
    # mapping -- confirm it matches the actual directory name on disk.
    category_by_subdir = {
        'engineerings': 'engineering',
        'contact': 'contact',
        'enroll': 'enrollment',
        'scholarships': 'scholarships',
    }

    # data/ sits two levels above the directory containing this module.
    current_dir = Path(__file__).resolve().parent
    data_dir = current_dir.parent.parent / "data"

    # Load all .md files recursively
    loader = DirectoryLoader(
        str(data_dir),
        glob="**/*.md",
        loader_cls=TextLoader,
        # Decode explicitly as UTF-8. Without this, TextLoader uses the
        # platform's locale encoding, which makes loading non-ASCII markdown
        # fail or produce garbled text on systems whose default is not UTF-8.
        loader_kwargs={"encoding": "utf-8"},
        show_progress=show_progress,
        use_multithreading=True,
    )

    documents = loader.load()

    # Add source category to metadata based on subdirectory
    for doc in documents:
        relative_path = Path(doc.metadata['source']).relative_to(data_dir)
        # For a file placed directly in data/, parts[0] is the file name
        # itself, which is not in the table and therefore maps to 'general'.
        doc.metadata['category'] = category_by_subdir.get(
            relative_path.parts[0], 'general'
        )

    return documents


if __name__ == "__main__":
    # Manual smoke run: load everything and print a short summary.
    print(" Loading markdown files from data/ directory...\n")
    documents = load_upb_documents()

    doc_total = len(documents)
    print(f"\n Loaded {doc_total} documents")
    char_total = sum(len(entry.page_content) for entry in documents)
    print(f" Total characters: {char_total:,}")

    # Tally how many documents carry each category label.
    tally = {}
    for entry in documents:
        label = entry.metadata.get('category', 'unknown')
        if label in tally:
            tally[label] += 1
        else:
            tally[label] = 1

    print("\n📚 Documents by category:")
    # Labels are unique dict keys, so ordering by label alone is sufficient.
    for label in sorted(tally):
        print(f"  - {label}: {tally[label]} documents")