|
|
|
|
|
""" |
|
|
Document loader for RAG ingestion. |
|
|
|
|
|
Provides: |
|
|
- load_markdown_docs(): Legacy markdown-only loader |
|
|
- load_documents(): Unified loader (uses Docling if available, falls back to markdown) |
|
|
|
|
|
CLI: |
|
|
> python3 load_docs.py /full/path/to/your/docs/folder |
|
|
prints a summary table for each file and exits with code 0. |
|
|
""" |
|
|
|
|
|
import os |
|
|
import glob |
|
|
import argparse |
|
|
import re |
|
|
import logging |
|
|
from typing import List, Dict, Optional |
|
|
|
|
|
# Module-level logger named after this module; load_documents() reports the chosen loader through it.
logger = logging.getLogger(__name__)
|
|
|
|
|
def _clean_markdown(text: str) -> str: |
|
|
""" |
|
|
Clean markdown text by removing code blocks, HTML tags, and other non-content elements. |
|
|
|
|
|
Args: |
|
|
text: Raw markdown text to clean |
|
|
|
|
|
Returns: |
|
|
Cleaned text with markdown syntax removed |
|
|
""" |
|
|
|
|
|
text = re.sub(r"```.*?```", " ", text, flags=re.DOTALL) |
|
|
|
|
|
text = re.sub(r"<[^>]+>", " ", text) |
|
|
|
|
|
text = re.sub(r"!\[([^\]]*)\]\([^\)]*\)", r"\1", text) |
|
|
text = re.sub(r"\[([^\]]+)\]\([^\)]*\)", r"\1", text) |
|
|
|
|
|
text = re.sub(r"^---.*?---\s*", " ", text, flags=re.DOTALL) |
|
|
|
|
|
text = re.sub(r"\s+", " ", text).strip() |
|
|
return text |
|
|
|
|
|
def _doc_record(fp: str, text: Optional[str], chars: int, words: int, status: str) -> Dict:
    """Build the per-file result dict shared by every outcome of load_markdown_docs()."""
    return {
        "filename": os.path.basename(fp),
        "path": fp,
        "text": text,
        "chars": chars,
        "words": words,
        "status": status,
    }


def load_markdown_docs(dir_path: str, ext: str = ".md", max_chars: int = 20000) -> List[Dict]:
    """
    Load markdown files from dir_path (non-recursive) and return cleaned text plus metadata.

    Files are processed in sorted path order. Each returned dict has the keys
    ``filename``, ``path``, ``text``, ``chars``, ``words`` and ``status``, where
    ``status`` is one of:

    - ``"OK"``: ``text`` holds the cleaned content.
    - ``"SKIPPED_TOO_LARGE"``: the cleaned content exceeds max_chars (a rough
      way to enforce an 'under 5 pages' rule); ``text`` is None but the
      measured ``chars``/``words`` are reported.
    - ``"ERROR_READING_FILE: <reason>"``: the file could not be opened or
      decoded as UTF-8; ``text`` is None and the counts are 0.

    Files whose cleaned content is empty are omitted from the result entirely.

    Args:
        dir_path: Path to directory containing markdown files (``~`` is expanded)
        ext: File name suffix to look for (default: ".md")
        max_chars: Maximum number of cleaned characters to accept (default: 20000)

    Returns:
        List of document dictionaries with metadata and cleaned text

    Raises:
        FileNotFoundError: If directory does not exist
        ValueError: If max_chars is not positive
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")

    path = os.path.expanduser(dir_path)
    if not os.path.isdir(path):
        raise FileNotFoundError(f"Directory not found: {path}")

    # Escape the directory part so glob metacharacters in its name
    # (e.g. "notes[v1]") are matched literally instead of as a pattern.
    pattern = os.path.join(glob.escape(path), f"*{ext}")

    docs = []
    for fp in sorted(glob.glob(pattern)):
        try:
            with open(fp, "r", encoding="utf-8") as f:
                raw = f.read()
        except (OSError, UnicodeError) as e:
            # Best effort: one unreadable file must not abort the whole batch,
            # so the failure is reported through the status field instead.
            docs.append(_doc_record(fp, None, 0, 0, f"ERROR_READING_FILE: {e}"))
            continue

        cleaned = _clean_markdown(raw)
        chars = len(cleaned)
        words = len(cleaned.split())

        if chars == 0:
            # Nothing left after cleaning; the file is dropped without a record.
            continue

        if chars > max_chars:
            docs.append(_doc_record(fp, None, chars, words, "SKIPPED_TOO_LARGE"))
            continue

        docs.append(_doc_record(fp, cleaned, chars, words, "OK"))
    return docs
|
|
|
|
|
def print_summary(docs: List[Dict]):
    """
    Print a fixed-width table describing the loaded documents to stdout.

    One row is printed per entry (filename truncated to 40 characters, status,
    character count, word count), followed by a totals line. Every entry whose
    status is not exactly "OK" is counted as skipped. When docs is empty a
    single notice line is printed instead of the table.

    Args:
        docs: Document dicts; the keys "filename", "status", "chars" and
            "words" are read, each with a fallback when missing
    """
    if not docs:
        print("No markdown files found or all were skipped.")
        return

    rule = "-" * 80
    row_template = "{:40} {:15} {:8d} {:8d}"

    print("{:40} {:15} {:>8} {:>8}".format("FILENAME", "STATUS", "CHARS", "WORDS"))
    print(rule)

    ok_count = 0
    for entry in docs:
        status = entry.get("status", "")
        print(
            row_template.format(
                entry.get("filename", "")[:40],
                status,
                entry.get("chars", 0),
                entry.get("words", 0),
            )
        )
        if status == "OK":
            ok_count += 1

    total = len(docs)
    print(rule)
    print(f"Total files: {total} OK: {ok_count} Skipped: {total - ok_count}")
|
|
|
|
|
|
|
|
# Optional Docling integration. DOCLING_AVAILABLE starts out False and is only
# flipped once every name below has been imported successfully; callers must
# check it before touching the Docling helpers, which are undefined otherwise.
DOCLING_AVAILABLE = False
try:
    from src.ingestion.docling_loader import (
        load_documents_with_docling,
        convert_to_legacy_format,
        print_summary as docling_print_summary,
        SUPPORTED_EXTENSIONS,
    )
except ImportError:
    # Docling loader not importable: only markdown extensions are advertised.
    SUPPORTED_EXTENSIONS = {".md", ".markdown"}
else:
    DOCLING_AVAILABLE = True
|
|
|
|
|
|
|
|
def load_documents(
    dir_path: str,
    extensions: Optional[List[str]] = None,
    max_chars: int = 50000,
    use_docling: bool = True,
    recursive: bool = False
) -> List[Dict]:
    """
    Unified document loader - uses Docling if available, falls back to markdown.

    When use_docling is true and the Docling loader was importable, all
    arguments are forwarded to it and its result is converted to the legacy
    dict format. Otherwise the legacy markdown loader is run once per requested
    extension (a missing leading dot is added); the results are merged,
    de-duplicated by path and returned in sorted path order.

    Args:
        dir_path: Path to directory containing documents
        extensions: File extensions to process. None or an empty list lets the
            Docling loader pick its own default and makes the fallback loader
            look for ".md" only
        max_chars: Maximum characters per document
        use_docling: Prefer Docling if available
        recursive: Search subdirectories. Only honoured by the Docling loader;
            the fallback loader scans the top-level directory and logs a
            warning when this is set

    Returns:
        List of document dicts with text and metadata
    """
    if use_docling and DOCLING_AVAILABLE:
        logger.info("Using Docling multi-format loader")
        parsed = load_documents_with_docling(
            dir_path,
            extensions=extensions,
            max_chars=max_chars,
            recursive=recursive,
        )
        return convert_to_legacy_format(parsed)

    logger.info("Using legacy markdown loader")
    if recursive:
        # Make the limitation visible instead of silently ignoring the flag.
        logger.warning(
            "recursive=True is not supported by the legacy markdown loader; "
            "only the top-level directory is scanned"
        )

    # Normalise to dotted suffixes, dropping repeats while keeping order.
    wanted: List[str] = []
    for raw_ext in extensions or [".md"]:
        ext = raw_ext if raw_ext.startswith(".") else f".{raw_ext}"
        if ext not in wanted:
            wanted.append(ext)

    docs: List[Dict] = []
    seen_paths = set()
    for ext in wanted:
        for doc in load_markdown_docs(dir_path, ext=ext, max_chars=max_chars):
            # Overlapping suffixes (".md" and ".txt.md") can match the same file.
            if doc["path"] not in seen_paths:
                seen_paths.add(doc["path"])
                docs.append(doc)

    # With a single extension this is already the loader's order; with several
    # it interleaves them into one deterministic, path-sorted list.
    docs.sort(key=lambda d: d["path"])
    return docs
|
|
|
|
|
|
|
|
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the document summary CLI."""
    parser = argparse.ArgumentParser(
        description="Load and summarize documents for RAG ingestion."
    )
    parser.add_argument("dir", help="Directory containing documents")
    parser.add_argument(
        "--ext", "-e",
        nargs="+",
        default=None,
        help="File extensions to load (default: all supported)"
    )
    parser.add_argument(
        "--max-chars",
        type=int,
        default=50000,
        help="Max characters to accept (default 50k)"
    )
    parser.add_argument(
        "--no-docling",
        action="store_true",
        help="Disable Docling, use markdown-only loader"
    )
    parser.add_argument(
        "--recursive", "-r",
        action="store_true",
        help="Search subdirectories"
    )
    return parser


def main(argv: Optional[List[str]] = None) -> int:
    """
    Run the CLI: load documents from a directory and print a summary table.

    Args:
        argv: Argument list to parse; None makes argparse read sys.argv[1:]

    Returns:
        0 after the summary has been printed. Loader exceptions (for example a
        missing directory) are not caught here and propagate to the caller.
    """
    args = _build_arg_parser().parse_args(argv)

    if args.no_docling or not DOCLING_AVAILABLE:
        # Go through load_documents() rather than calling the markdown loader
        # directly, so the CLI shares its extension handling (e.g. "md" is
        # treated as ".md") instead of duplicating it.
        docs = load_documents(
            args.dir,
            extensions=args.ext,
            max_chars=args.max_chars,
            use_docling=False,
            recursive=args.recursive,
        )
        print_summary(docs)
    else:
        # The Docling summary printer is fed the loader's own result, not the
        # legacy dict format, so the loader is called directly here.
        parsed = load_documents_with_docling(
            args.dir,
            extensions=args.ext,
            max_chars=args.max_chars,
            recursive=args.recursive,
        )
        docling_print_summary(parsed)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())