EXAM_RAG_API / ingestion /loaders /md_loader.py
MinaNasser's picture
1st
1bc3f18
import os
import re
from typing import List
from langchain_core.documents import Document
from ingestion.loaders.normalization import normalize_text
# A run of consecutive Markdown table rows. Each row must begin at the start of
# a line (optionally indented) and end with a pipe; the last row may be closed
# by end-of-file instead of a newline, and trailing blanks after the closing
# pipe are tolerated.
_TABLE_BLOCK_RE = re.compile(
    r"((?:^[ \t]*\|.*\|[ \t]*(?:\n|\Z))+)",
    re.MULTILINE,
)

# Fenced code blocks and inline code spans (removed together with their
# content), plus the bold markers ``**`` and ``__``.
# NOTE(review): dropping the code *content* mirrors the previous behaviour;
# confirm that this is intended for retrieval quality.
_CODE_AND_EMPHASIS_RE = re.compile(r"(```.*?```|`.*?`|\*\*|__)", re.DOTALL)

# ATX heading markers ("# ", "## ", ...) at the start of a line only, so that
# a '#' inside ordinary text (e.g. "C#", "issue #12") is left alone.
_HEADING_MARKER_RE = re.compile(r"^[ \t]{0,3}#{1,6}(?:[ \t]+|$)", re.MULTILINE)


def load_md(file_path: str) -> List[Document]:
    """Load a Markdown file and split it into text and table documents.

    The file is read as UTF-8, falling back to latin-1 if decoding fails.
    Its content is split into alternating segments of plain text and
    pipe-delimited tables; each non-empty segment is stripped of basic
    Markdown markup, passed through ``normalize_text`` and wrapped in a
    ``Document`` whose metadata holds ``"source"`` (the file path) and
    ``"type"`` (``"text"`` or ``"table"``).

    Args:
        file_path: Path of the Markdown file to load.

    Returns:
        The documents in file order. An empty list is returned (and a
        message printed) when the file does not exist, cannot be read, or
        an error occurs while parsing.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return []

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    except UnicodeDecodeError:
        # latin-1 maps every byte to a character, so this retry cannot fail
        # on decoding; it can still fail on I/O.
        try:
            with open(file_path, "r", encoding="latin-1") as f:
                text = f.read()
        except Exception as e:
            print(f"Failed to read Markdown file ({file_path}): {e}")
            return []
    except Exception as e:
        print(f"Could not open Markdown file ({file_path}): {e}")
        return []

    docs: List[Document] = []
    try:
        # re.split with a single capture group yields
        # [text, table, text, table, ..., text]: the captured table runs are
        # always at the odd indices, so no second regex test is needed.
        segments = _TABLE_BLOCK_RE.split(text)
        for index, segment in enumerate(segments):
            if not segment.strip():
                continue
            content_type = "table" if index % 2 else "text"

            # Strip Markdown markup but keep the remaining text layout.
            stripped = _CODE_AND_EMPHASIS_RE.sub("", segment)
            stripped = _HEADING_MARKER_RE.sub("", stripped)
            cleaned = normalize_text(stripped)

            # normalize_text may reduce a segment to nothing; skip those.
            if cleaned:
                docs.append(
                    Document(
                        page_content=cleaned,
                        metadata={"source": file_path, "type": content_type},
                    )
                )
    except Exception as e:
        # Best-effort loader: a parsing failure discards the whole file
        # rather than propagating to the caller.
        print(f"Error parsing Markdown file ({file_path}): {e}")
        return []

    return docs