Spaces:

snygginghani
/

kasitbot

Running

App Files Files Community

kasitbot / rag_preprocessor.py

snygginghani

Deploy KASITBot RAG chatbot

71e1c4b 1 day ago

raw

history blame contribute delete

21.1 kB

	"""
	================================================================================
	RAG Document Preprocessing Pipeline — v4 (Structural + Semantic Awareness)
	University-Level NLP System — KASIT Faculty Assistant
	================================================================================

	KEY IMPROVEMENTS vs v3:
	✅ Section/heading-aware DOCX chunking — Heading styles mark section
	boundaries; the section title is injected into every chunk so the LLM
	always knows which part of the document a passage belongs to.
	✅ Table-aware extraction — detects the header row and prepends column names
	to every data row, making each row self-contained and searchable.
	E.g. "Date: March 11 | Time: 9:00 AM | Course Code: 1902214 | ..."
	This is critical for exam schedules, office-hours tables and fee tables.
	✅ Document-type detection — filename-based routing assigns a doc_type label
	(exam_schedule, office_hours, study_plan, scholarship, regulation, …)
	to every chunk so the LLM can interpret context correctly.
	✅ Arabic-aware chunk sizing — 700 chars for Arabic (denser script),
	500 chars for English, matching proportional reading units.
	✅ Semantic split for regulation docs — splits at article markers
	(المادة X / Article X) before falling back to char-based chunking,
	so each article stays together and is not truncated mid-clause.
	✅ Minimum chunk length filter — drops noise fragments shorter than 60 chars.
	✅ Rich per-chunk metadata: doc_type + section_title in every record.
	================================================================================
	"""

	import json
	import re
	import unicodedata
	from collections import Counter
	from pathlib import Path
	from typing import Dict, List

	import fitz # PyMuPDF
	from docx import Document
	from docx.oxml.ns import qn
	from docx.table import Table as DocxTable
	from docx.text.paragraph import Paragraph as DocxParagraph
	from langdetect import LangDetectException, detect

	# ── Paths & tunables ──────────────────────────────────────────────────────────
	# Directory that main() scans for source documents, and the JSON file that
	# receives the resulting list of chunk records.
	INPUT_DIR = Path("input_documents")
	OUTPUT_FILE = Path("rag_dataset.json")

	# Chunk length and overlap between consecutive chunks, both in characters.
	# chunk_semantic() picks the *_AR pair for Arabic-dominant text and the
	# *_EN pair otherwise.
	CHUNK_SIZE_EN = 500 # chars — English (lower density)
	CHUNK_SIZE_AR = 700 # chars — Arabic (higher glyph density per char)
	OVERLAP_EN = 100
	OVERLAP_AR = 150
	# Chunks and table rows shorter than this many characters are discarded.
	MIN_CHUNK_LEN = 60 # drop fragments shorter than this


	# ══════════════════════════════════════════════════════════════════════════════
	# Language helpers
	# ══════════════════════════════════════════════════════════════════════════════

	def detect_language(text: str) -> str:
	if not text or not text.strip():
	return "Unknown"
	arabic_chars = len(re.findall(r"[؀-ۿ]", text))
	latin_chars = len(re.findall(r"[A-Za-z]", text))
	total = arabic_chars + latin_chars
	if total == 0:
	return "Unknown"
	ratio = arabic_chars / total
	if ratio > 0.6:
	return "Arabic"
	if ratio < 0.1:
	try:
	code = detect(text)
	return "English" if code == "en" else code.upper()
	except LangDetectException:
	return "English"
	return "Mixed"


	def _arabic_dominant(text: str) -> bool:
	alpha = [c for c in text if c.isalpha()]
	if not alpha:
	return False
	return sum(1 for c in alpha if "؀" <= c <= "ۿ") / len(alpha) > 0.4


	# ══════════════════════════════════════════════════════════════════════════════
	# Document-type detection (filename-based)
	# ══════════════════════════════════════════════════════════════════════════════

	# Ordered routing table: (doc_type label, filename substrings). The first
	# entry with a substring found in the lower-cased filename wins, so order
	# matters when several entries could match.
	_DOC_TYPE_MAP: List[tuple] = [
	    ("exam_schedule", ["mid_exam", "exam_schedul", "final_exam"]),
	    ("office_hours", ["office_hours", "office hours", "proffs"]),
	    ("academic_calendar", ["calendar", "uni_cal", "academic_cal"]),
	    ("study_plan", ["study plan", "study_plan"]),
	    ("course_records", ["course record", "course_record"]),
	    ("departments", ["department", "majors", "departments nad"]),
	    ("admissions_fees", ["admission", "fees_rag", "admissions_fees"]),
	    (
	        "scholarship",
	        ["makruma", "مكرمة", "teachers_grant", "ashaer", "الجيش", "ثلاث", "moalim"],
	    ),
	    ("regulation", ["تعليمات", "قانون", "دليل_اعضاء", "دليل اعضاء"]),
	    ("knowledge_base", ["knowledge_base", "kasit_knowledge"]),
	    ("faculty_info", ["faculty_it", "faculty_infor"]),
	    ("curriculum", ["curriculum", "ai-english", "ds-english", "ai_curriculum"]),
	    ("careers", ["career"]),
	    ("contacts", ["email", "docs_email"]),
	    ("english_system", ["english_sys"]),
	]


	def detect_doc_type(filename: str) -> str:
	    """Map a filename to its doc_type label, or "general" when nothing matches.

	    Matching is a case-insensitive substring test against _DOC_TYPE_MAP,
	    evaluated in table order.
	    """
	    lowered = filename.lower()
	    return next(
	        (
	            label
	            for label, needles in _DOC_TYPE_MAP
	            if any(needle in lowered for needle in needles)
	        ),
	        "general",
	    )


	# ══════════════════════════════════════════════════════════════════════════════
	# Text cleaning
	# ══════════════════════════════════════════════════════════════════════════════

	# Matches every character that is NOT on the keep-list. The Arabic ranges are
	# written as explicit escapes (Arabic, Arabic Supplement, Presentation
	# Forms-A, Presentation Forms-B) so the class cannot be corrupted by
	# invisible or mis-encoded literal characters.
	_KEEP = re.compile(
	    r"[^\u0600-\u06FF\u0750-\u077F\uFB50-\uFDFF\uFE70-\uFEFF"
	    r"A-Za-z0-9\s\.,;:!?\-\(\)\[\]\"\'،؟؛/\\@#%&*+=<>|_]"
	)


	def clean_text(text: str) -> str:
	    """Normalise *text* and strip characters outside the supported alphabet.

	    Steps, in order:
	    1. NFC Unicode normalisation.
	    2. Control characters (except tab, LF and CR) become spaces.
	    3. Any character not matched by the keep-list becomes a space.
	    4. Runs of three or more spaces collapse to one space.
	    5. Runs of three or more newlines collapse to a blank line.

	    Returns the result with leading/trailing whitespace removed, or "" for
	    empty input.
	    """
	    if not text:
	        return ""
	    text = unicodedata.normalize("NFC", text)
	    text = re.sub(r"[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]", " ", text)
	    text = _KEEP.sub(" ", text)
	    text = re.sub(r" {3,}", " ", text)
	    text = re.sub(r"\n{3,}", "\n\n", text)
	    return text.strip()


	# ══════════════════════════════════════════════════════════════════════════════
	# DOCX: structured block extraction (body-order traversal)
	# ══════════════════════════════════════════════════════════════════════════════

	_DATA_VALUE_RE = re.compile(
	r"\d{2,4}[:/]\d{2}" # times 09:00 or 1:30
	r"\|\d{1,2}[-/]\d{1,2}" # short dates 3/11
	r"\|صباح\|مساء\|\bAM\b\|\bPM\b" # AM / PM in either script
	)


	def _first_row_is_header(cells: List[str]) -> bool:
	"""Heuristic: the first table row is a header when its cells are short
	labels (< 35 chars average) and none of them contain a data-value pattern
	(times, dates, AM/PM)."""
	if not cells:
	return False
	if any(_DATA_VALUE_RE.search(c) for c in cells):
	return False
	return (sum(len(c) for c in cells) / len(cells)) < 35


	def _table_to_blocks(table: DocxTable, section: str) -> List[Dict]:
	    """Convert a DOCX table into self-contained text blocks, one per data row.

	    When the first row is detected as a header, each data row is rendered as
	    "ColName: value | ColName: value | ..." so that it can be retrieved on
	    its own. Without a header the row's values are simply joined with " | ".

	    Args:
	        table: The python-docx table to convert.
	        section: Title of the section the table appears in; copied into
	            every block's "section_title".

	    Returns:
	        Blocks of the form {text, section_title, is_table_row, is_heading}.
	        Rows whose cleaned text is shorter than MIN_CHUNK_LEN are dropped.
	    """
	    rows: List[List[str]] = []
	    for row in table.rows:
	        seen: set = set()
	        cells: List[str] = []
	        for cell in row.cells:
	            t = cell.text.strip()
	            if t and t not in seen:
	                cells.append(t)
	                seen.add(t)
	            else:
	                # Empty cell, or text already seen in this row (merged cells
	                # are reported once per spanned column). Keep a placeholder
	                # so later values stay aligned with their column header.
	                cells.append("")
	        if any(cells):
	            rows.append(cells)

	    if not rows:
	        return []

	    has_header = _first_row_is_header([c for c in rows[0] if c])
	    headers = rows[0] if has_header else []
	    data_rows = rows[1:] if has_header else rows

	    blocks = []
	    for row_cells in data_rows:
	        if has_header:
	            parts = []
	            for i, val in enumerate(row_cells):
	                if not val:
	                    continue
	                # Fall back to a positional name when the header row has no
	                # label for this column.
	                col = headers[i] if i < len(headers) and headers[i] else f"col{i + 1}"
	                parts.append(f"{col}: {val}")
	        else:
	            parts = [val for val in row_cells if val]

	        text = clean_text(" | ".join(parts))
	        if len(text) >= MIN_CHUNK_LEN:
	            blocks.append({
	                "text": text,
	                "section_title": section,
	                "is_table_row": True,
	                "is_heading": False,
	            })

	    return blocks


	def extract_docx_blocks(filepath: Path) -> List[Dict]:
	    """Read a DOCX file into an ordered list of raw content blocks.

	    The document body is traversed in document order, so paragraphs and
	    tables stay interleaved as written. A paragraph whose style name starts
	    with "heading" (case-insensitive) becomes the current section title for
	    everything that follows it.

	    Returns:
	        Blocks of the form {text, section_title, is_table_row, is_heading};
	        an empty list when the file cannot be opened (the error is printed).
	    """
	    try:
	        document = Document(str(filepath))
	    except Exception as exc:
	        print(f" [ERROR] Cannot open DOCX '{filepath.name}': {exc}")
	        return []

	    paragraph_tag = qn("w:p")
	    table_tag = qn("w:tbl")
	    collected: List[Dict] = []
	    current_section = ""

	    for element in document.element.body:
	        if element.tag == table_tag:
	            collected.extend(
	                _table_to_blocks(DocxTable(element, document), current_section)
	            )
	            continue
	        if element.tag != paragraph_tag:
	            continue

	        paragraph = DocxParagraph(element, document)
	        content = paragraph.text.strip()
	        if not content:
	            continue

	        # Best effort: a paragraph whose style cannot be resolved is treated
	        # as ordinary body text.
	        heading = False
	        try:
	            heading = (paragraph.style.name or "").lower().startswith("heading")
	        except Exception:
	            pass

	        if heading:
	            current_section = content
	        collected.append({
	            "text": content,
	            "section_title": current_section,
	            "is_table_row": False,
	            "is_heading": heading,
	        })

	    return collected


	# ══════════════════════════════════════════════════════════════════════════════
	# PDF extraction
	# ══════════════════════════════════════════════════════════════════════════════

	def extract_text_from_pdf(filepath: Path) -> str:
	    """Extract the text of a PDF, page by page, in top-to-bottom reading order.

	    Each page's blocks are sorted by their top edge and then their left edge.
	    Only text blocks are kept; image blocks are skipped so that image
	    placeholder strings do not end up in the corpus. A page that fails to
	    extract is reported and skipped.

	    Returns:
	        The non-blank block texts joined with newlines, or "" when the file
	        cannot be opened (the error is printed).
	    """
	    parts: List[str] = []
	    try:
	        doc = fitz.open(str(filepath))
	    except Exception as exc:
	        print(f" [ERROR] Cannot open PDF '{filepath.name}': {exc}")
	        return ""

	    try:
	        for page_num, page in enumerate(doc, start=1):
	            try:
	                blocks = sorted(page.get_text("blocks"), key=lambda b: (b[1], b[0]))
	                for block in blocks:
	                    # Block tuples end with (..., text, block_no, block_type);
	                    # a non-zero block_type marks a non-text (image) block.
	                    if len(block) > 6 and block[6] != 0:
	                        continue
	                    if block[4].strip():
	                        parts.append(block[4])
	            except Exception as exc:
	                print(f" [WARN] Page {page_num} of '{filepath.name}' skipped: {exc}")
	    finally:
	        # Release the file handle even if iterating the pages itself fails.
	        doc.close()
	    return "\n".join(parts)


	# ══════════════════════════════════════════════════════════════════════════════
	# Semantic chunking
	# ══════════════════════════════════════════════════════════════════════════════

	_ARTICLE_MARKER = re.compile(r"(?:^\|\n)((?:المادة\|Article)\s+\d+)", re.IGNORECASE)
	_SENT_END = re.compile(r"[.!?؟\n]")


	def _char_chunk(text: str, size: int, overlap: int) -> List[str]:
	if not text:
	return []
	chunks: List[str] = []
	start, n = 0, len(text)
	while start < n:
	end = min(start + size, n)
	if end < n:
	m = list(_SENT_END.finditer(text, start, end))
	if m:
	end = m[-1].end()
	else:
	sp = text.rfind(" ", start, end)
	if sp > start:
	end = sp
	chunk = text[start:end].strip()
	if chunk:
	chunks.append(chunk)
	start = end - overlap if end - overlap > start else end
	return chunks


	def chunk_semantic(text: str, is_arabic: bool = False) -> List[str]:
	    """Split *text* into chunks while respecting article boundaries.

	    1. If at least two article markers ("المادة N" / "Article N") are found,
	       the text is cut at each marker and every segment is chunked on its
	       own, so a chunk never spans two articles. Text preceding the first
	       marker (title, preamble) is kept as a segment of its own.
	    2. Otherwise the whole text goes through overlapping character-based
	       chunking with a preference for sentence ends.

	    Args:
	        text: Cleaned text to split.
	        is_arabic: Selects the Arabic chunk size/overlap instead of the
	            English ones.

	    Returns:
	        Chunks of at least MIN_CHUNK_LEN characters, in document order.
	    """
	    size = CHUNK_SIZE_AR if is_arabic else CHUNK_SIZE_EN
	    overlap = OVERLAP_AR if is_arabic else OVERLAP_EN

	    markers = list(_ARTICLE_MARKER.finditer(text))
	    if len(markers) >= 2:
	        bounds = [m.start() for m in markers] + [len(text)]
	        # Preamble first, then one segment per article.
	        segments = [text[:bounds[0]].strip()]
	        segments.extend(text[a:b].strip() for a, b in zip(bounds, bounds[1:]))
	        chunks: List[str] = []
	        for seg in segments:
	            chunks.extend(_char_chunk(seg, size, overlap))
	    else:
	        chunks = _char_chunk(text, size, overlap)

	    return [c for c in chunks if len(c) >= MIN_CHUNK_LEN]


	# ══════════════════════════════════════════════════════════════════════════════
	# Record builder
	# ══════════════════════════════════════════════════════════════════════════════

	def _record(text: str, source: str, chunk_id: int,
	            doc_type: str, section_title: str) -> Dict:
	    """Assemble the output record for a single chunk.

	    The language label is computed from *text*; "was_translated" is always
	    False.
	    """
	    record: Dict = {"text": text, "source": source, "chunk_id": chunk_id}
	    record["language"] = detect_language(text)
	    record["was_translated"] = False
	    record["doc_type"] = doc_type
	    record["section_title"] = section_title
	    return record


	# ══════════════════════════════════════════════════════════════════════════════
	# File processors
	# ══════════════════════════════════════════════════════════════════════════════

	def process_docx(filepath: Path, doc_type: str) -> List[Dict]:
	    """Turn a DOCX file into chunk records using its headings and tables.

	    - A heading closes the pending paragraph run, becomes the current
	      section, and its text opens the next run.
	    - A table row closes the pending run and is emitted as one record of its
	      own, labelled with the section stored on the row.
	    - Ordinary paragraphs accumulate per section; each run is cleaned and
	      chunked as one text when it is closed.

	    Chunk ids are assigned sequentially from 1 in emission order.
	    """
	    blocks = extract_docx_blocks(filepath)
	    if not blocks:
	        return []

	    records: List[Dict] = []
	    pending: List[str] = []  # paragraph texts waiting to be chunked together
	    current_section = ""
	    source = filepath.name

	    def emit(text: str, section: str) -> None:
	        # One id is consumed per record, so the next id is always count + 1.
	        records.append(_record(text, source, len(records) + 1, doc_type, section))

	    def flush() -> None:
	        if not pending:
	            return
	        merged = clean_text("\n".join(pending))
	        pending.clear()
	        if not merged:
	            return
	        arabic = _arabic_dominant(merged)
	        for piece in chunk_semantic(merged, is_arabic=arabic):
	            if len(piece) >= MIN_CHUNK_LEN:
	                emit(piece, current_section)

	    for blk in blocks:
	        if blk["is_heading"]:
	            flush()
	            current_section = blk["section_title"]
	            pending.append(blk["text"])  # heading text opens the next run
	            continue

	        if blk["is_table_row"]:
	            flush()
	            row_text = blk["text"]
	            if len(row_text) >= MIN_CHUNK_LEN:
	                emit(row_text, blk.get("section_title", ""))
	            continue

	        # Ordinary paragraph: close the run when the section changed.
	        if blk["section_title"] != current_section:
	            flush()
	            current_section = blk["section_title"]
	        pending.append(blk["text"])

	    flush()
	    return records


	def process_pdf(filepath: Path, doc_type: str) -> List[Dict]:
	    """Turn a PDF file into chunk records.

	    The extracted text is cleaned and chunked as a single document. Records
	    carry an empty section title, and each chunk id is the chunk's 1-based
	    position in the chunker's output. Returns an empty list when no text
	    could be extracted (a warning is printed) or nothing survives cleaning.
	    """
	    extracted = extract_text_from_pdf(filepath)
	    if not extracted.strip():
	        print(f" [WARN] No text extracted from '{filepath.name}'.")
	        return []

	    body = clean_text(extracted)
	    if not body:
	        return []

	    pieces = chunk_semantic(body, is_arabic=_arabic_dominant(body))
	    return [
	        _record(piece, filepath.name, position, doc_type, "")
	        for position, piece in enumerate(pieces, start=1)
	        if len(piece) >= MIN_CHUNK_LEN
	    ]


	def process_file(filepath: Path) -> List[Dict]:
	    """Dispatch one file to the matching processor and report the chunk count.

	    ".pdf" goes to process_pdf; ".docx" and ".doc" go to process_docx. Any
	    other extension is reported as skipped and yields an empty list.
	    """
	    extension = filepath.suffix.lower()
	    doc_type = detect_doc_type(filepath.name)
	    print(f" → [{doc_type:<22}] '{filepath.name}' ...")

	    if extension not in (".pdf", ".docx", ".doc"):
	        print(f" [SKIP] Unsupported format: {extension}")
	        return []

	    handler = process_pdf if extension == ".pdf" else process_docx
	    records = handler(filepath, doc_type)

	    print(f" ✓ {len(records)} chunks")
	    return records


	# ══════════════════════════════════════════════════════════════════════════════
	# Main
	# ══════════════════════════════════════════════════════════════════════════════

	def main() -> None:
	    """Process every supported file in INPUT_DIR and write OUTPUT_FILE.

	    Creates INPUT_DIR and stops if it does not exist yet, stops when it
	    holds no .pdf/.docx/.doc file or when no record was produced, and
	    otherwise writes all records as UTF-8 JSON and prints a summary by
	    language and document type.
	    """
	    print("=" * 70)
	    print(" RAG Preprocessor v4 — Section + Table-aware + Semantic Chunking")
	    print(f" English chunks: {CHUNK_SIZE_EN} chars | Arabic: {CHUNK_SIZE_AR} chars")
	    print("=" * 70)

	    if not INPUT_DIR.exists():
	        INPUT_DIR.mkdir(parents=True)
	        print(f"\n[INFO] Created '{INPUT_DIR}/' — add your documents and re-run.\n")
	        return

	    files = [
	        f for f in INPUT_DIR.iterdir()
	        if f.is_file() and f.suffix.lower() in {".pdf", ".docx", ".doc"}
	    ]
	    if not files:
	        print(f"\n[INFO] No supported files found in '{INPUT_DIR}/'.\n")
	        return

	    print(f"\nFound {len(files)} file(s):\n")
	    all_records: List[Dict] = []
	    # Sorted so that the output order is reproducible between runs.
	    for f in sorted(files):
	        print(f"[FILE] {f.name}")
	        all_records.extend(process_file(f))
	        print()

	    if not all_records:
	        print("[WARN] No records produced. Exiting.")
	        return

	    with open(OUTPUT_FILE, "w", encoding="utf-8") as fh:
	        # ensure_ascii=False keeps Arabic text readable in the JSON file.
	        json.dump(all_records, fh, ensure_ascii=False, indent=2)

	    ar = sum(1 for r in all_records if r["language"] == "Arabic")
	    en = sum(1 for r in all_records if r["language"] == "English")
	    mx = sum(1 for r in all_records if r["language"] == "Mixed")
	    dtypes = Counter(r.get("doc_type", "general") for r in all_records)

	    print("=" * 70)
	    print(f" ✅ {len(all_records)} total chunks → '{OUTPUT_FILE}'")
	    print(f" Arabic: {ar} | English: {en} | Mixed: {mx}")
	    print("\n Breakdown by document type:")
	    for dt, cnt in dtypes.most_common():
	        print(f" {dt:<22}: {cnt:>4} chunks")
	    print("=" * 70)


	if __name__ == "__main__":
	    main()