Spaces:
Runtime error
Runtime error
File size: 2,247 Bytes
b347548 c03c816 b347548 c03c816 b347548 c03c816 b347548 c03c816 b347548 c03c816 b347548 c03c816 b347548 c03c816 b347548 c03c816 b347548 c03c816 b347548 843c84e c03c816 843c84e c03c816 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
"""
Document Loader Module
Loads markdown files from the data/ directory with metadata enrichment.
"""
from pathlib import Path
from langchain_community.document_loaders import DirectoryLoader, TextLoader
# Maps the first-level subdirectory of data/ to the category stored in each
# document's metadata. Anything not listed here is categorised as 'general'.
# NOTE(review): the key 'engineerings' (plural) maps to 'engineering' --
# confirm it matches the actual folder name on disk.
_CATEGORY_BY_SUBDIR = {
    'engineerings': 'engineering',
    'contact': 'contact',
    'enroll': 'enrollment',
    'scholarships': 'scholarships',
}


def load_upb_documents(show_progress=True):
    """
    Load all markdown files from data/ directory and subdirectories.

    The data/ directory is resolved relative to this file: it is the
    ``data`` folder two levels above the directory containing this module.
    Every ``*.md`` file found recursively is read with ``TextLoader`` and
    then tagged with a ``category`` metadata entry derived from the first
    path component below data/.

    Args:
        show_progress: Whether to show progress bar (default: True)

    Returns:
        list: List of LangChain Document objects with content and metadata.
            Each document's metadata gains a ``'category'`` key whose value
            is one of 'engineering', 'contact', 'enrollment', 'scholarships'
            or 'general'.
    """
    # Get data directory path
    current_dir = Path(__file__).resolve().parent
    data_dir = current_dir.parent.parent / "data"

    # Load all .md files recursively
    loader = DirectoryLoader(
        str(data_dir),
        glob="**/*.md",
        loader_cls=TextLoader,
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding, which differs between operating systems.
        loader_kwargs={"encoding": "utf-8"},
        show_progress=show_progress,
        use_multithreading=True,
    )
    documents = loader.load()

    # Add source category to metadata based on subdirectory
    for doc in documents:
        source_path = Path(doc.metadata['source'])
        relative_path = source_path.relative_to(data_dir)

        # For a file sitting directly in data/, the first component is the
        # file name itself, which matches no key and so falls back to
        # 'general'.
        parts = relative_path.parts
        top_level = parts[0] if parts else ''
        doc.metadata['category'] = _CATEGORY_BY_SUBDIR.get(top_level, 'general')

    return documents
if __name__ == "__main__":
    # Manual smoke test: load every document and print summary statistics.
    # Imported locally because only this script entry point needs it.
    from collections import Counter

    print(" Loading markdown files from data/ directory...\n")
    documents = load_upb_documents()

    print(f"\n Loaded {len(documents)} documents")
    print(f" Total characters: {sum(len(doc.page_content) for doc in documents):,}")

    # Group by category; documents without a 'category' entry are counted
    # under 'unknown'.
    categories = Counter(
        doc.metadata.get('category', 'unknown') for doc in documents
    )

    print("\n📚 Documents by category:")
    # Sorted so the report is listed alphabetically by category name.
    for cat, count in sorted(categories.items()):
        print(f" - {cat}: {count} documents")