Spaces:

MinaNasser
/

EXAM_RAG_API

Paused

1st

1bc3f18 about 2 months ago

1.66 kB

	import os
	import re
	from typing import List
	from langchain_core.documents import Document
	from ingestion.loaders.normalization import normalize_text


	def load_md(file_path: str) -> List[Document]:
	    """Load a Markdown file as a list of text and table documents.

	    The file is read as UTF-8, falling back to latin-1 when it is not valid
	    UTF-8. The text is then split into alternating prose segments and
	    pipe-delimited table segments (runs of consecutive lines that start and
	    end with ``|``). Each non-blank segment is stripped of code fences,
	    inline code, bold markers and ``#`` characters, passed through
	    ``normalize_text``, and wrapped in a ``Document``.

	    Args:
	        file_path: Path of the Markdown file to load.

	    Returns:
	        One ``Document`` per non-empty segment, in file order, with metadata
	        ``{"source": file_path, "type": "table" | "text"}``. An empty list is
	        returned when the file is missing, cannot be read, or when any
	        segment fails to parse (partial results are discarded).
	    """
	    if not os.path.exists(file_path):
	        print(f"File not found: {file_path}")
	        return []

	    try:
	        with open(file_path, "r", encoding="utf-8") as f:
	            text = f.read()
	    except UnicodeDecodeError:
	        # latin-1 maps every byte to a code point, so the retry can only
	        # fail for I/O reasons, never for decoding reasons.
	        try:
	            with open(file_path, "r", encoding="latin-1") as f:
	                text = f.read()
	        except Exception as e:
	            print(f"Failed to read Markdown file ({file_path}): {e}")
	            return []
	    except Exception as e:
	        print(f"Could not open Markdown file ({file_path}): {e}")
	        return []

	    # A table block is one or more consecutive lines of the form "| ... |".
	    # Rows are anchored to line starts so prose containing two pipes is not
	    # cut mid-line, and the last row may end at end-of-file without "\n".
	    table_block = re.compile(
	        r"((?:^[ \t]*\|.*\|[ \t]*(?:\n|\Z))+)", re.MULTILINE
	    )
	    # Markup to drop: fenced code, inline code, bold markers, heading hashes.
	    # DOTALL lets a fenced block span several lines.
	    markup = re.compile(r"```.*?```|`.*?`|\*\*|__|#", re.DOTALL)

	    docs = []
	    try:
	        # re.split with a single capturing group yields
	        # [text, table, text, table, ...]: odd positions are table blocks.
	        for index, part in enumerate(table_block.split(text)):
	            if not part.strip():
	                continue

	            content_type = "table" if index % 2 else "text"

	            cleaned = normalize_text(markup.sub("", part))
	            if cleaned:
	                docs.append(
	                    Document(
	                        page_content=cleaned,
	                        metadata={"source": file_path, "type": content_type},
	                    )
	                )
	    except Exception as e:
	        print(f"Error parsing Markdown file ({file_path}): {e}")
	        return []

	    return docs