Spaces:

ziadsameh32
/

ContiAI-v4

Sleeping

ContiAI-v4 / rag /preprocess.py

Initial FastAPI CrewAI setup

41027b6 about 1 month ago

1.73 kB

	import re
	from typing import List
	from rapidfuzz import fuzz

	# =========================
	# Arabic Utilities
	# =========================

	# Character class of the Arabic marks that normalize_arabic deletes:
	# U+0617-U+061A, U+064B-U+0652 (fathatan through sukun), U+0670 (superscript
	# alef) and U+06D6-U+06ED.
	# NOTE(review): U+06D6-U+06ED also spans a few non-combining code points
	# (e.g. U+06DD end of ayah, U+06E5/U+06E6 small waw/yeh) - confirm that
	# dropping them is intended.
	_AR_DIACRITICS = re.compile(r"[\u0617-\u061A\u064B-\u0652\u0670\u06D6-\u06ED]")
	# Tatweel / kashida (U+0640): the elongation character, removed outright.
	_AR_TATWEEL = "\u0640"


	def normalize_arabic(text: str) -> str:
	    """Return *text* with Arabic presentation noise removed.

	    Three clean-ups are applied, in this order:

	    1. every tatweel character (``_AR_TATWEEL``) is dropped;
	    2. every code point matched by ``_AR_DIACRITICS`` is dropped;
	    3. each run of whitespace collapses to a single space and the result
	       is trimmed at both ends.

	    Falsy input (for example ``""`` or ``None``) yields ``""``.
	    """
	    if text:
	        without_marks = _AR_DIACRITICS.sub("", text.replace(_AR_TATWEEL, ""))
	        return re.sub(r"\s+", " ", without_marks).strip()
	    return ""


	def drop_common_headers_footers(
	pages: List[str], min_similarity: int = 92
	) -> List[str]:
	"""
	Detect and remove repeated headers / footers across pages.
	"""
	if not pages:
	return pages

	first_lines, last_lines = [], []

	for p in pages:
	lines = [l.strip() for l in p.splitlines() if l.strip()]
	first_lines.append("\n".join(lines[:2]) if len(lines) >= 2 else "")
	last_lines.append("\n".join(lines[-2:]) if len(lines) >= 2 else "")

	def detect(candidates: List[str]) -> str \| None:
	candidates = sorted([c for c in candidates if c], key=len, reverse=True)
	if not candidates:
	return None
	base = candidates[0]
	hits = sum(1 for c in candidates if fuzz.ratio(base, c) >= min_similarity)
	return base if hits >= max(3, int(0.4 * len(candidates))) else None

	header = detect(first_lines)
	footer = detect(last_lines)

	cleaned_pages = []
	for p in pages:
	if header:
	p = p.replace(header, "")
	if footer:
	p = p.replace(footer, "")
	cleaned_pages.append(p.strip())

	return cleaned_pages