math-chatbot-v2 / src /edurag_math_bot /pdf_processing.py
pranshu dhiman
Deploy MathSutra Space
7fab45b
Raw
History Blame Contribute Delete
6.77 kB
from __future__ import annotations
import hashlib
import re
from dataclasses import dataclass
from io import BytesIO
from pathlib import Path
try:
from pypdf import PdfReader
except ImportError:
PdfReader = None
from .catalog import CHAPTER_CATALOG, clean_title, parse_chapter_number, pdf_metadata
@dataclass
class ChunkRecord:
    """One chunk of extracted text plus the metadata describing where it came from."""

    # Identifier of the form "<source_key>-p<page_number>-c<chunk_index>".
    chunk_id: str
    # Cleaned text content of the chunk.
    text: str
    # Chapter number; -1 when it could not be parsed from an uploaded file name.
    chapter_number: int
    # Chapter title (catalog entry or a cleaned-up file stem).
    chapter_name: str
    # Heading-like line picked from the chunk, or the chapter name as fallback.
    topic: str
    # 1-based page number the chunk was taken from.
    page_number: int
    # Name of the file the chunk originates from.
    source_file: str
# Lower-case file suffixes accepted for uploads; listed in the error message
# raised when an uploaded file has an unsupported type.
SUPPORTED_UPLOAD_EXTENSIONS = {".pdf", ".txt", ".md"}
def get_pdf_reader(source: str | Path | BytesIO) -> object:
    """Open ``source`` with pypdf and return the resulting reader.

    Args:
        source: A file path or an in-memory byte stream holding the PDF.

    Returns:
        The ``PdfReader`` instance created for ``source``.

    Raises:
        RuntimeError: If the optional ``pypdf`` dependency could not be
            imported when this module was loaded.
    """
    if PdfReader is not None:
        return PdfReader(source)
    raise RuntimeError(
        "PDF support needs the `pypdf` package. Run `python3 -m pip install -r requirements.txt`."
    )
def discover_pdfs(root_dir: Path) -> list[Path]:
    """Return the PDF files directly inside ``root_dir`` in sorted order.

    Only entries matching ``*.pdf`` that are regular files are returned;
    sub-directories are not searched.
    """
    pdf_files = [entry for entry in root_dir.glob("*.pdf") if entry.is_file()]
    pdf_files.sort()
    return pdf_files
def clean_text(text: str) -> str:
    """Normalise whitespace in text extracted from a document.

    NUL characters become spaces, words hyphenated across a line break are
    re-joined, whitespace around newlines is collapsed to a single newline,
    runs of spaces/tabs become one space, and the result is stripped.
    """
    cleaned = text.replace("\x00", " ")
    # The passes are order-sensitive: de-hyphenation must see the original
    # line breaks before they are normalised.
    substitutions = (
        (r"-\s*\n", ""),
        (r"\s*\n\s*", "\n"),
        (r"[ \t]+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def candidate_topic(text: str, fallback: str) -> str:
    """Pick the first heading-like line of ``text`` as a topic label.

    Each line has its whitespace collapsed, surrounding `` .:-`` characters
    trimmed and trailing digits removed. Lines that end up empty, are longer
    than 80 characters, consist only of digits/dots/spaces, or start with
    "mathematics" (case-insensitive) are skipped. The first remaining line
    containing a letter is returned in title case.

    Returns:
        The title-cased topic, or ``fallback`` when no line qualifies.
    """
    trim_chars = " .:-"
    for raw_line in text.splitlines():
        heading = re.sub(r"\s+", " ", raw_line).strip(trim_chars)
        heading = re.sub(r"\d+$", "", heading).strip(trim_chars)
        if not heading or len(heading) > 80:
            continue
        numeric_only = re.fullmatch(r"[0-9. ]+", heading) is not None
        if numeric_only or heading.lower().startswith("mathematics"):
            continue
        if any(map(str.isalpha, heading)):
            return heading.title()
    return fallback
def split_text(text: str, chunk_size: int, chunk_overlap: int) -> list[str]:
    """Split ``text`` into overlapping chunks of at most ``chunk_size`` characters.

    A chunk that is not the last one is cut back to its final newline when
    that newline lies past the midpoint of the window, so chunks tend to end
    on line boundaries. Consecutive chunks share up to ``chunk_overlap``
    characters.

    Args:
        text: The text to split.
        chunk_size: Maximum chunk length in characters; must be positive.
        chunk_overlap: Number of characters repeated at the start of the next
            chunk. Negative values are treated as zero. When the overlap is
            as large as the chunk just produced, it is dropped for that step
            so the loop always terminates.

    Returns:
        The stripped, non-empty chunks in order. Text that already fits in
        one chunk is returned unchanged as a single-element list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    if len(text) <= chunk_size:
        return [text]

    overlap = max(chunk_overlap, 0)
    text_length = len(text)
    chunks: list[str] = []
    start = 0
    while start < text_length:
        end = min(start + chunk_size, text_length)
        chunk = text[start:end]
        if end < text_length:
            # Prefer ending on a line break, but only if that keeps more than
            # half of the window; otherwise cut mid-line.
            split_at = chunk.rfind("\n")
            if split_at > chunk_size // 2:
                chunk = chunk[:split_at]
                end = start + split_at
        chunks.append(chunk.strip())
        if end == text_length:
            break
        next_start = end - overlap
        # An overlap at least as long as the consumed span would stall or
        # rewind the window and loop forever; continue without overlap.
        if next_start <= start:
            next_start = end
        start = next_start
    return [chunk for chunk in chunks if chunk]
def build_chunk_records(
    page_texts: list[tuple[int, str]],
    *,
    source_key: str,
    chapter_number: int,
    chapter_name: str,
    source_file: str,
    chunk_size: int,
    chunk_overlap: int,
) -> list[ChunkRecord]:
    """Turn per-page raw text into a flat list of ``ChunkRecord`` objects.

    Each page is cleaned with ``clean_text``; pages that are empty after
    cleaning are skipped. The remaining text is split with ``split_text`` and
    every chunk becomes one record.

    Args:
        page_texts: ``(page_number, raw_text)`` pairs.
        source_key: Prefix used when building each ``chunk_id``.
        chapter_number: Chapter number stored on every record.
        chapter_name: Chapter title stored on every record; also the topic
            fallback.
        source_file: File name stored on every record.
        chunk_size: Maximum chunk length passed to ``split_text``.
        chunk_overlap: Overlap passed to ``split_text``.

    Returns:
        Records in page order, then chunk order. Chunk indices restart at 1
        on every page.
    """
    records: list[ChunkRecord] = []
    for page_number, raw_text in page_texts:
        cleaned = clean_text(raw_text)
        if cleaned:
            pieces = split_text(cleaned, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
            records.extend(
                ChunkRecord(
                    chunk_id=f"{source_key}-p{page_number}-c{position}",
                    text=piece,
                    chapter_number=chapter_number,
                    chapter_name=chapter_name,
                    topic=candidate_topic(piece, fallback=chapter_name),
                    page_number=page_number,
                    source_file=source_file,
                )
                for position, piece in enumerate(pieces, start=1)
            )
    return records
def extract_chunks_from_pdf(
    file_path: Path,
    chunk_size: int,
    chunk_overlap: int,
) -> list[ChunkRecord]:
    """Extract chunk records from a PDF stored on disk.

    Page text is read with pypdf (pages numbered from 1; pages without
    extractable text contribute an empty string). Chapter metadata comes from
    ``pdf_metadata(file_path)`` and the file stem is used as the chunk-id
    prefix.

    Raises:
        RuntimeError: If ``pypdf`` is not installed.
    """
    pdf = get_pdf_reader(str(file_path))
    details = pdf_metadata(file_path)

    numbered_pages: list[tuple[int, str]] = []
    for number, page in enumerate(pdf.pages, start=1):
        numbered_pages.append((number, page.extract_text() or ""))

    return build_chunk_records(
        numbered_pages,
        source_key=file_path.stem,
        chapter_number=int(details["chapter_number"]),
        chapter_name=str(details["chapter_name"]),
        source_file=str(details["source_file"]),
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
def uploaded_file_metadata(file_name: str) -> dict[str, str | int]:
    """Derive chapter metadata for an uploaded file from its name alone.

    Returns:
        A dict with keys ``chapter_number``, ``chapter_name`` and
        ``source_file``. When no chapter number can be parsed from the name,
        ``chapter_number`` is -1 and ``chapter_name`` is the cleaned file
        stem; otherwise the name comes from ``CHAPTER_CATALOG``, falling back
        to the cleaned stem for numbers missing from the catalog.
    """
    upload_path = Path(file_name)
    number = parse_chapter_number(upload_path)
    stem_title = clean_title(upload_path.stem)

    if number is None:
        number, name = -1, stem_title
    else:
        name = CHAPTER_CATALOG.get(number, stem_title)

    return {
        "chapter_number": number,
        "chapter_name": name,
        "source_file": upload_path.name,
    }
def extract_chunks_from_pdf_bytes(
    file_name: str,
    file_bytes: bytes,
    chunk_size: int,
    chunk_overlap: int,
) -> list[ChunkRecord]:
    """Extract chunk records from an uploaded PDF held in memory.

    Metadata is derived from ``file_name``. The chunk-id prefix is
    ``upload-`` followed by the first 12 hex digits of the SHA-1 of the file
    content, so it depends on the bytes rather than the file name.

    Raises:
        RuntimeError: If ``pypdf`` is not installed.
    """
    details = uploaded_file_metadata(file_name)
    pdf = get_pdf_reader(BytesIO(file_bytes))
    # SHA-1 serves only as a short content fingerprint for chunk ids.
    digest = hashlib.sha1(file_bytes).hexdigest()
    upload_key = f"upload-{digest[:12]}"

    numbered_pages: list[tuple[int, str]] = []
    for number, page in enumerate(pdf.pages, start=1):
        extracted = page.extract_text()
        numbered_pages.append((number, extracted or ""))

    return build_chunk_records(
        numbered_pages,
        source_key=upload_key,
        chapter_number=int(details["chapter_number"]),
        chapter_name=str(details["chapter_name"]),
        source_file=str(details["source_file"]),
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
def extract_chunks_from_text_bytes(
    file_name: str,
    file_bytes: bytes,
    chunk_size: int,
    chunk_overlap: int,
) -> list[ChunkRecord]:
    """Extract chunk records from an uploaded plain-text or Markdown file.

    The whole file is treated as a single page numbered 1. Metadata is
    derived from ``file_name``; the chunk-id prefix is ``upload-`` followed
    by the first 12 hex digits of the SHA-1 of the raw bytes.

    Args:
        file_name: Original name of the upload.
        file_bytes: Raw file content, expected to be UTF-8.
        chunk_size: Maximum chunk length passed on to the splitter.
        chunk_overlap: Overlap passed on to the splitter.

    Returns:
        The chunk records built from the decoded text.
    """
    meta = uploaded_file_metadata(file_name)
    source_hash = hashlib.sha1(file_bytes).hexdigest()[:12]
    # "utf-8-sig" behaves like UTF-8 but drops a leading byte-order mark, so
    # BOM-prefixed files do not leak "\ufeff" into the first chunk. Undecodable
    # bytes are still dropped on a best-effort basis.
    text = file_bytes.decode("utf-8-sig", errors="ignore")
    return build_chunk_records(
        [(1, text)],
        source_key=f"upload-{source_hash}",
        chapter_number=int(meta["chapter_number"]),
        chapter_name=str(meta["chapter_name"]),
        source_file=str(meta["source_file"]),
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
def extract_chunks_from_uploaded_file(
    file_name: str,
    file_bytes: bytes,
    chunk_size: int,
    chunk_overlap: int,
) -> list[ChunkRecord]:
    """Dispatch an uploaded file to the extractor matching its extension.

    The extension is compared case-insensitively: ``.pdf`` goes to the PDF
    extractor, ``.txt`` and ``.md`` go to the text extractor.

    Raises:
        ValueError: If the extension is none of the above; the message lists
            the entries of ``SUPPORTED_UPLOAD_EXTENSIONS``.
    """
    suffix = Path(file_name).suffix.lower()

    if suffix == ".pdf":
        extractor = extract_chunks_from_pdf_bytes
    elif suffix in (".txt", ".md"):
        extractor = extract_chunks_from_text_bytes
    else:
        supported = ", ".join(sorted(SUPPORTED_UPLOAD_EXTENSIONS))
        raise ValueError(f"Unsupported file type for {file_name}. Use one of: {supported}.")

    return extractor(
        file_name=file_name,
        file_bytes=file_bytes,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )