import os
import re
from typing import List

from langchain_core.documents import Document

from ingestion.loaders.normalization import normalize_text

# One or more consecutive pipe-delimited rows.  A row may carry trailing
# blanks and may end at end-of-input, so the last row of a table that closes
# the file without a newline is still captured.  The single capture group
# makes re.split() return the table runs interleaved with the plain text.
_TABLE_BLOCK_RE = re.compile(r"((?:\|.*\|[ \t]*(?:\n|\Z))+)")

# Fenced code blocks, inline code spans (both removed together with their
# content) and bold markers.  DOTALL lets a code span cross line breaks.
_CODE_AND_EMPHASIS_RE = re.compile(r"```.*?```|`.*?`|\*\*|__", re.DOTALL)

# ATX heading line: up to three blanks of indent, 1-6 '#', then either
# blank-separated heading text or nothing, plus an optional closing '#' run.
# Only the markers are stripped; a '#' elsewhere ("C#", "#12", URL fragments)
# is left alone.
_ATX_HEADING_RE = re.compile(
    r"^[ \t]{0,3}#{1,6}(?:[ \t]+(.*?))?(?:[ \t]+#+)?[ \t]*$",
    re.MULTILINE,
)


def _strip_markup(segment: str) -> str:
    """Remove code spans, bold markers and heading markers from *segment*."""
    without_code = _CODE_AND_EMPHASIS_RE.sub("", segment)
    # An unmatched group 1 (empty heading) is substituted as an empty string.
    return _ATX_HEADING_RE.sub(r"\1", without_code)


def load_md(file_path: str) -> List[Document]:
    """Load a Markdown file as a list of text and table documents.

    The file is read as UTF-8, falling back to latin-1 when UTF-8 decoding
    fails.  The content is split into alternating runs of plain text and
    pipe-delimited table rows; each non-empty run is stripped of Markdown
    markup, passed through ``normalize_text`` and wrapped in a ``Document``
    whose metadata holds ``source`` (the given path) and ``type``
    (``"table"`` or ``"text"``).

    Args:
        file_path: Path of the Markdown file to load.

    Returns:
        The documents in file order.  An empty list is returned when the
        file does not exist, cannot be read, or parsing raises; in each of
        those cases a message is printed instead of raising.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return []

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: retry with latin-1, which accepts any byte value.
        try:
            with open(file_path, "r", encoding="latin-1") as f:
                text = f.read()
        except Exception as e:
            print(f"Failed to read Markdown file ({file_path}): {e}")
            return []
    except Exception as e:
        print(f"Could not open Markdown file ({file_path}): {e}")
        return []

    docs: List[Document] = []
    try:
        # With one capture group, re.split() yields
        # [text, table, text, table, ..., text]: odd positions are tables.
        for index, part in enumerate(_TABLE_BLOCK_RE.split(text)):
            if not part.strip():
                continue
            content_type = "table" if index % 2 else "text"
            cleaned = normalize_text(_strip_markup(part))
            if cleaned:
                docs.append(
                    Document(
                        page_content=cleaned,
                        metadata={"source": file_path, "type": content_type},
                    )
                )
    except Exception as e:
        # Best-effort loader: a parsing failure discards the whole file.
        print(f"Error parsing Markdown file ({file_path}): {e}")
        return []
    return docs