rosvend's picture
style: changed formatting of logging print statements
843c84e
"""
Document Loader Module
Loads markdown files from the data/ directory with metadata enrichment.
"""
from pathlib import Path
from langchain_community.document_loaders import DirectoryLoader, TextLoader
def load_upb_documents(show_progress=True):
"""
Load all markdown files from data/ directory and subdirectories.
Args:
show_progress: Whether to show progress bar (default: True)
Returns:
list: List of LangChain Document objects with content and metadata
"""
# Get data directory path
current_dir = Path(__file__).resolve().parent
data_dir = current_dir.parent.parent / "data"
# Load all .md files recursively
loader = DirectoryLoader(
str(data_dir),
glob="**/*.md",
loader_cls=TextLoader,
show_progress=show_progress,
use_multithreading=True
)
documents = loader.load()
# Add source category to metadata based on subdirectory
for doc in documents:
source_path = Path(doc.metadata['source'])
relative_path = source_path.relative_to(data_dir)
# Determine category from subdirectory
if relative_path.parts[0] == 'engineerings':
doc.metadata['category'] = 'engineering'
elif relative_path.parts[0] == 'contact':
doc.metadata['category'] = 'contact'
elif relative_path.parts[0] == 'enroll':
doc.metadata['category'] = 'enrollment'
elif relative_path.parts[0] == 'scholarships':
doc.metadata['category'] = 'scholarships'
else:
doc.metadata['category'] = 'general'
return documents
if __name__ == "__main__":
print(" Loading markdown files from data/ directory...\n")
documents = load_upb_documents()
print(f"\n Loaded {len(documents)} documents")
print(f" Total characters: {sum(len(doc.page_content) for doc in documents):,}")
# Group by category
categories = {}
for doc in documents:
cat = doc.metadata.get('category', 'unknown')
categories[cat] = categories.get(cat, 0) + 1
print("\n๐Ÿ“š Documents by category:")
for cat, count in sorted(categories.items()):
print(f" - {cat}: {count} documents")