Spaces:

rishach
/

math-chatbot-v2

Sleeping

math-chatbot-v2 / src /edurag_math_bot /pdf_processing.py

pranshu dhiman

Deploy MathSutra Space

7fab45b 25 days ago

6.77 kB

	from __future__ import annotations

	import hashlib
	import re
	from dataclasses import dataclass
	from io import BytesIO
	from pathlib import Path

	try:
	from pypdf import PdfReader
	except ImportError:
	PdfReader = None

	from .catalog import CHAPTER_CATALOG, clean_title, parse_chapter_number, pdf_metadata


	@dataclass
	class ChunkRecord:
	    """A single chunk of extracted text together with its provenance."""

	    # Identifier of the chunk (built elsewhere in this module from the
	    # source key, page number and per-page chunk index).
	    chunk_id: str
	    # The chunk's text content.
	    text: str
	    # Chapter number the chunk is attributed to.
	    chapter_number: int
	    # Human-readable chapter title.
	    chapter_name: str
	    # Heading-like label guessed for this chunk.
	    topic: str
	    # Page the chunk was taken from.
	    page_number: int
	    # Name of the file the chunk originated from.
	    source_file: str


	# Upload file extensions this module can chunk; they are listed in the error
	# raised by extract_chunks_from_uploaded_file for any other file type.
	SUPPORTED_UPLOAD_EXTENSIONS = {
	    ".pdf",
	    ".txt",
	    ".md",
	}


	def get_pdf_reader(source: str | Path | BytesIO) -> object:
	    """Open ``source`` with pypdf and return the resulting reader object.

	    Args:
	        source: A path (string or ``Path``) or an in-memory ``BytesIO`` buffer
	            that is handed straight to ``PdfReader``.

	    Returns:
	        The ``PdfReader`` instance created for ``source``.

	    Raises:
	        RuntimeError: If ``pypdf`` could not be imported, in which case the
	            module-level ``PdfReader`` name is ``None``.
	    """
	    if PdfReader is None:
	        raise RuntimeError(
	            "PDF support needs the `pypdf` package. Run `python3 -m pip install -r requirements.txt`."
	        )
	    return PdfReader(source)


	def discover_pdfs(root_dir: Path) -> list[Path]:
	    """Return the PDF files located directly in ``root_dir``, sorted by path.

	    Only entries matching ``*.pdf`` at the top level of ``root_dir`` are
	    considered, and anything that is not a regular file is skipped.
	    """
	    pdf_files = [entry for entry in root_dir.glob("*.pdf") if entry.is_file()]
	    pdf_files.sort()
	    return pdf_files


	def clean_text(text: str) -> str:
	    """Normalize raw extracted text before it is split into chunks.

	    Steps, in order:
	      1. NUL characters become spaces.
	      2. A hyphen at the end of a line (optionally followed by whitespace) is
	         removed together with the line break, re-joining hyphenated words.
	      3. Any whitespace surrounding a line break, including runs of blank
	         lines, collapses to a single newline.
	      4. Runs of spaces/tabs collapse to a single space.

	    Args:
	        text: Raw text, e.g. as returned by a PDF page extractor.

	    Returns:
	        The cleaned text with leading and trailing whitespace stripped.
	    """
	    text = text.replace("\x00", " ")
	    text = re.sub(r"-\s*\n", "", text)
	    # The quantifiers matter: without them the pattern only matched when exactly
	    # one whitespace character sat on each side of the newline, so trailing
	    # spaces such as "foo  \nbar" were left in place.
	    text = re.sub(r"\s*\n\s*", "\n", text)
	    text = re.sub(r"[ \t]+", " ", text)
	    return text.strip()


	def candidate_topic(text: str, fallback: str) -> str:
	    """Guess a heading-like topic from the first suitable line of ``text``.

	    Each line is whitespace-normalized, stripped of surrounding `` .:-``
	    characters and of a trailing number. A line is skipped when it is empty,
	    longer than 80 characters, starts with "mathematics" (case-insensitive),
	    or contains no alphabetic character (this also covers lines made up only
	    of digits, dots and spaces).

	    Args:
	        text: Chunk text to scan line by line.
	        fallback: Value returned when no line qualifies.

	    Returns:
	        The first qualifying line in title case, or ``fallback``.
	    """
	    for raw_line in text.splitlines():
	        line = re.sub(r"\s+", " ", raw_line).strip(" .:-")
	        line = re.sub(r"\d+$", "", line).strip(" .:-")
	        if not line or len(line) > 80:
	            continue
	        if line.lower().startswith("mathematics"):
	            continue
	        if any(char.isalpha() for char in line):
	            # str.title() treats an apostrophe as a word boundary and yields
	            # "Euclid'S"; capitalize whole words (letter runs with an optional
	            # apostrophe suffix) instead.
	            return re.sub(
	                r"[^\W\d_]+(?:['\u2019][^\W\d_]+)?",
	                lambda match: match.group(0).capitalize(),
	                line,
	            )
	    return fallback


	def split_text(text: str, chunk_size: int, chunk_overlap: int) -> list[str]:
	    """Split ``text`` into overlapping chunks of at most ``chunk_size`` characters.

	    A chunk that is not the last one is cut back to its final newline when that
	    newline lies past the midpoint of the window, so chunks tend to end on a
	    line boundary. The next chunk starts ``chunk_overlap`` characters before
	    the end of the previous one, provided that still moves forward.

	    Args:
	        text: The text to split.
	        chunk_size: Maximum number of characters per chunk; must be positive.
	        chunk_overlap: Number of characters shared between consecutive chunks;
	            must not be negative.

	    Returns:
	        ``[text]`` unchanged when the text already fits into one chunk;
	        otherwise the stripped, non-empty chunks in document order.

	    Raises:
	        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
	            negative.
	    """
	    if chunk_size <= 0:
	        raise ValueError("chunk_size must be a positive integer.")
	    if chunk_overlap < 0:
	        raise ValueError("chunk_overlap must not be negative.")
	    if len(text) <= chunk_size:
	        return [text]

	    text_length = len(text)
	    chunks: list[str] = []
	    start = 0
	    while start < text_length:
	        end = min(start + chunk_size, text_length)
	        chunk = text[start:end]
	        if end < text_length:
	            split_at = chunk.rfind("\n")
	            if split_at > chunk_size // 2:
	                chunk = chunk[:split_at]
	                end = start + split_at
	        chunks.append(chunk.strip())
	        if end == text_length:
	            break
	        next_start = end - chunk_overlap
	        # Guarantee forward progress: an overlap at least as long as the chunk
	        # just emitted would otherwise keep (or move) the window at or before
	        # its current start and loop forever. In that case drop the overlap
	        # for this step.
	        start = next_start if next_start > start else end
	    return [chunk for chunk in chunks if chunk]


	def build_chunk_records(
	    page_texts: list[tuple[int, str]],
	    *,
	    source_key: str,
	    chapter_number: int,
	    chapter_name: str,
	    source_file: str,
	    chunk_size: int,
	    chunk_overlap: int,
	) -> list[ChunkRecord]:
	    """Turn per-page raw text into a flat list of ``ChunkRecord`` objects.

	    Every page is cleaned with ``clean_text`` and pages that end up empty are
	    skipped. The remaining text is split with ``split_text`` and each piece
	    becomes one record whose id is ``"{source_key}-p{page}-c{index}"`` (the
	    index is 1-based within the page) and whose topic comes from
	    ``candidate_topic``, falling back to ``chapter_name``.

	    Args:
	        page_texts: ``(page_number, raw_text)`` pairs.
	        source_key: Prefix used when building chunk ids.
	        chapter_number: Chapter number stored on every record.
	        chapter_name: Chapter title stored on every record.
	        source_file: Source file name stored on every record.
	        chunk_size: Forwarded to ``split_text``.
	        chunk_overlap: Forwarded to ``split_text``.

	    Returns:
	        The records for all pages, in page order and then chunk order.
	    """
	    records: list[ChunkRecord] = []

	    for page_number, raw_text in page_texts:
	        cleaned = clean_text(raw_text)
	        if cleaned:
	            pieces = split_text(cleaned, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
	            records.extend(
	                ChunkRecord(
	                    chunk_id=f"{source_key}-p{page_number}-c{position}",
	                    text=piece,
	                    chapter_number=chapter_number,
	                    chapter_name=chapter_name,
	                    topic=candidate_topic(piece, fallback=chapter_name),
	                    page_number=page_number,
	                    source_file=source_file,
	                )
	                for position, piece in enumerate(pieces, start=1)
	            )

	    return records


	def extract_chunks_from_pdf(
	    file_path: Path,
	    chunk_size: int,
	    chunk_overlap: int,
	) -> list[ChunkRecord]:
	    """Read the PDF at ``file_path`` and return its text as chunk records.

	    Chapter number, chapter name and source file name are taken from
	    ``pdf_metadata(file_path)``; the file stem is used as the chunk-id prefix.
	    Pages are numbered from 1, and a page without extractable text contributes
	    an empty string.

	    Args:
	        file_path: Location of the PDF on disk.
	        chunk_size: Forwarded to ``build_chunk_records``.
	        chunk_overlap: Forwarded to ``build_chunk_records``.

	    Returns:
	        The chunk records produced by ``build_chunk_records``.
	    """
	    reader = get_pdf_reader(str(file_path))
	    meta = pdf_metadata(file_path)

	    page_texts = []
	    for page_index, page in enumerate(reader.pages, start=1):
	        extracted = page.extract_text() or ""
	        page_texts.append((page_index, extracted))

	    return build_chunk_records(
	        page_texts,
	        source_key=file_path.stem,
	        chapter_number=int(meta["chapter_number"]),
	        chapter_name=str(meta["chapter_name"]),
	        source_file=str(meta["source_file"]),
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    )


	def uploaded_file_metadata(file_name: str) -> dict[str, str | int]:
	    """Derive chapter metadata for an uploaded file from its name alone.

	    Args:
	        file_name: Name (or path) of the uploaded file.

	    Returns:
	        A dict with the keys ``"chapter_number"``, ``"chapter_name"`` and
	        ``"source_file"``. When ``parse_chapter_number`` finds no chapter
	        number, ``chapter_number`` is ``-1`` and the name is the cleaned file
	        stem; otherwise the name comes from ``CHAPTER_CATALOG``, falling back
	        to the cleaned file stem for chapters missing from the catalog.
	    """
	    path = Path(file_name)
	    chapter_number = parse_chapter_number(path)
	    stem_title = clean_title(path.stem)

	    if chapter_number is None:
	        # -1 marks "no chapter number recognised in the file name".
	        chapter_number = -1
	        chapter_name = stem_title
	    else:
	        chapter_name = CHAPTER_CATALOG.get(chapter_number, stem_title)

	    return {
	        "chapter_number": chapter_number,
	        "chapter_name": chapter_name,
	        "source_file": path.name,
	    }


	def extract_chunks_from_pdf_bytes(
	    file_name: str,
	    file_bytes: bytes,
	    chunk_size: int,
	    chunk_overlap: int,
	) -> list[ChunkRecord]:
	    """Chunk an uploaded PDF that is held in memory.

	    Metadata is derived from ``file_name`` via ``uploaded_file_metadata``.
	    Chunk ids are prefixed with ``upload-`` followed by the first 12 hex
	    digits of the SHA-1 of ``file_bytes``. Pages are numbered from 1, and a
	    page without extractable text contributes an empty string.

	    Args:
	        file_name: Name of the uploaded file.
	        file_bytes: Raw PDF content.
	        chunk_size: Forwarded to ``build_chunk_records``.
	        chunk_overlap: Forwarded to ``build_chunk_records``.

	    Returns:
	        The chunk records produced by ``build_chunk_records``.
	    """
	    meta = uploaded_file_metadata(file_name)
	    reader = get_pdf_reader(BytesIO(file_bytes))
	    digest_prefix = hashlib.sha1(file_bytes).hexdigest()[:12]

	    page_texts = []
	    for page_index, page in enumerate(reader.pages, start=1):
	        extracted = page.extract_text() or ""
	        page_texts.append((page_index, extracted))

	    return build_chunk_records(
	        page_texts,
	        source_key=f"upload-{digest_prefix}",
	        chapter_number=int(meta["chapter_number"]),
	        chapter_name=str(meta["chapter_name"]),
	        source_file=str(meta["source_file"]),
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    )


	def extract_chunks_from_text_bytes(
	    file_name: str,
	    file_bytes: bytes,
	    chunk_size: int,
	    chunk_overlap: int,
	) -> list[ChunkRecord]:
	    """Chunk an uploaded plain-text or Markdown file that is held in memory.

	    The whole file is treated as a single page numbered 1. Metadata is derived
	    from ``file_name`` via ``uploaded_file_metadata``. Chunk ids are prefixed
	    with ``upload-`` followed by the first 12 hex digits of the SHA-1 of
	    ``file_bytes``.

	    Args:
	        file_name: Name of the uploaded file.
	        file_bytes: Raw file content, decoded as UTF-8 with undecodable bytes
	            dropped.
	        chunk_size: Forwarded to ``build_chunk_records``.
	        chunk_overlap: Forwarded to ``build_chunk_records``.

	    Returns:
	        The chunk records produced by ``build_chunk_records``.
	    """
	    meta = uploaded_file_metadata(file_name)
	    source_hash = hashlib.sha1(file_bytes).hexdigest()[:12]
	    # "utf-8-sig" decodes exactly like UTF-8 but drops a leading byte-order
	    # mark, which would otherwise survive str.strip() and end up inside the
	    # first chunk's text and topic.
	    text = file_bytes.decode("utf-8-sig", errors="ignore")
	    return build_chunk_records(
	        [(1, text)],
	        source_key=f"upload-{source_hash}",
	        chapter_number=int(meta["chapter_number"]),
	        chapter_name=str(meta["chapter_name"]),
	        source_file=str(meta["source_file"]),
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    )


	def extract_chunks_from_uploaded_file(
	    file_name: str,
	    file_bytes: bytes,
	    chunk_size: int,
	    chunk_overlap: int,
	) -> list[ChunkRecord]:
	    """Chunk an uploaded file, choosing the extractor by file extension.

	    The extension of ``file_name`` is compared case-insensitively: ``.pdf``
	    goes to ``extract_chunks_from_pdf_bytes``; ``.txt`` and ``.md`` go to
	    ``extract_chunks_from_text_bytes``.

	    Args:
	        file_name: Name of the uploaded file.
	        file_bytes: Raw file content.
	        chunk_size: Forwarded to the selected extractor.
	        chunk_overlap: Forwarded to the selected extractor.

	    Returns:
	        The chunk records produced by the selected extractor.

	    Raises:
	        ValueError: If the extension is none of the above.
	    """
	    suffix = Path(file_name).suffix.lower()

	    if suffix == ".pdf":
	        extractor = extract_chunks_from_pdf_bytes
	    elif suffix in {".txt", ".md"}:
	        extractor = extract_chunks_from_text_bytes
	    else:
	        supported = ", ".join(sorted(SUPPORTED_UPLOAD_EXTENSIONS))
	        raise ValueError(f"Unsupported file type for {file_name}. Use one of: {supported}.")

	    return extractor(
	        file_name=file_name,
	        file_bytes=file_bytes,
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	    )