Spaces:
Paused
Paused
| import os | |
| import re | |
| from typing import List | |
| from langchain_core.documents import Document | |
| from ingestion.loaders.normalization import normalize_text | |
# One or more consecutive Markdown table rows. Each row must occupy a whole
# line (optionally padded with blanks) and may be terminated either by a
# newline or by end-of-file, so a table that ends the file without a trailing
# newline keeps its last row.
_MD_TABLE_BLOCK = re.compile(r"((?:^[ \t]*\|.*\|[ \t]*(?:\n|\Z))+)", re.MULTILINE)

# ATX heading line: up to three spaces of indent, 1-6 '#', then a blank or
# end-of-line. Group 2 is the heading text; an optional closing run of '#'
# preceded by a blank is dropped. '#' anywhere else (e.g. "C#", "#123") is
# not matched.
_MD_ATX_HEADING = re.compile(
    r"^( {0,3})#{1,6}(?=[ \t]|$)(.*?)(?:(?<=[ \t])#+)?([ \t]*)$",
    re.MULTILINE,
)

# Fenced code blocks and inline code spans (removed together with their
# content) plus the bare '**' / '__' emphasis delimiters.
_MD_CODE_AND_EMPHASIS = re.compile(r"(```.*?```|`.*?`|\*\*|__)", re.DOTALL)


def _read_md_text(file_path):
    """Return the file's text (UTF-8, falling back to Latin-1), or None on failure."""
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: retry below with Latin-1, which accepts any byte.
        pass
    except Exception as e:
        # Deliberately broad: the loader reports the problem and never raises.
        print(f"Could not open Markdown file ({file_path}): {e}")
        return None

    try:
        with open(file_path, "r", encoding="latin-1") as f:
            return f.read()
    except Exception as e:
        print(f"Failed to read Markdown file ({file_path}): {e}")
        return None


def _strip_md_markup(segment):
    """Remove heading markers, code spans/fences and bold delimiters from *segment*."""
    # Headings first, so that only line-leading '#' runs are touched; the
    # leading indent and the heading text itself are kept unchanged.
    without_headings = _MD_ATX_HEADING.sub(r"\1\2\3", segment)
    return _MD_CODE_AND_EMPHASIS.sub("", without_headings)


def load_md(file_path: str) -> List[Document]:
    """Load a Markdown file as a list of "text" and "table" documents.

    The file is read as UTF-8, falling back to Latin-1 when it is not valid
    UTF-8. The text is split into runs of table rows (lines that start and
    end with ``|``) and the prose between them. Each non-blank segment has
    its heading markers, code spans/fences and ``**`` / ``__`` delimiters
    removed, is passed through ``normalize_text``, and, if the result is
    non-empty, becomes one ``Document``.

    Args:
        file_path: Path of the Markdown file to load.

    Returns:
        Documents in file order, each with metadata ``{"source": file_path,
        "type": "table" | "text"}``. An empty list is returned (after
        printing a message) when the file does not exist, cannot be read,
        or any segment raises while being processed.
    """
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return []

    text = _read_md_text(file_path)
    if text is None:
        return []

    docs = []
    try:
        # re.split with a single capturing group alternates
        # [prose, table, prose, table, ...], so odd indices are tables.
        for index, part in enumerate(_MD_TABLE_BLOCK.split(text)):
            if not part.strip():
                continue
            content_type = "table" if index % 2 else "text"
            cleaned = normalize_text(_strip_md_markup(part))
            if cleaned:
                docs.append(
                    Document(
                        page_content=cleaned,
                        metadata={"source": file_path, "type": content_type},
                    )
                )
    except Exception as e:
        # Any failure while parsing discards the partial result.
        print(f"Error parsing Markdown file ({file_path}): {e}")
        return []
    return docs