"""
pdf_utils.py — PDF text extraction and cleaning for Research Draft.
Handles:
- Extracting raw text from uploaded PDF files using PyMuPDF (fitz).
- Cleaning extracted text (removing noise, fixing whitespace).
- Truncating long documents to fit within LLM context limits.
"""
import os
import re

import fitz  # PyMuPDF

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Safe token-approximation: ~4 characters per token.
# For a 4096-token context with system prompt overhead, keep paper text
# under ~12 000 characters (~3 000 tokens), leaving room for instructions
# and the generated abstract.
MAX_TEXT_CHARS = 12_000
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """
    Open a PDF file and return the concatenated text of all pages.

    Args:
        file_path: Absolute or relative path to a .pdf file.

    Returns:
        Raw extracted text as a single string.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
        ValueError: If the file cannot be opened as a PDF, or if it
            contains no extractable text (e.g. scanned images only).
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"No such file: {file_path}")

    try:
        doc = fitz.open(file_path)
    except Exception as exc:
        raise ValueError(f"Could not open PDF: {exc}") from exc

    # The context manager guarantees the document is closed even if
    # text extraction raises partway through.
    with doc:
        pages_text = [page.get_text() for page in doc]

    full_text = "\n".join(pages_text)
    if not full_text.strip():
        raise ValueError("The PDF appears to be empty or contains only images/scans.")
    return full_text
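
# Usage sketch ("paper.pdf" is a placeholder filename, not a file that
# ships with this module):
#
#     raw = extract_text_from_pdf("paper.pdf")
#     print(f"Extracted {len(raw):,} characters")
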
def clean_text(raw_text: str) -> str:
    """
    Clean raw PDF-extracted text for LLM consumption.

    Steps:
        1. Replace form-feed and vertical-tab characters with newlines.
        2. Strip standalone page-number lines (e.g. "12", "Page 5").
        3. Drop separator lines made only of dashes/underscores/equals.
        4. Turn single line breaks inside paragraphs into spaces, keeping
           blank lines as paragraph separators.
        5. Collapse runs of spaces/tabs and excess blank lines, and strip
           per-line whitespace.

    Args:
        raw_text: The unprocessed text from *extract_text_from_pdf*.

    Returns:
        Cleaned text ready for prompt construction.
    """
    text = raw_text

    # Replace form-feed / vertical-tab characters with newlines
    text = text.replace("\f", "\n").replace("\v", "\n")

    # Remove standalone page-number lines (e.g. "\n12\n", "\nPage 5\n").
    # [ \t]* rather than \s* so the match cannot swallow the blank lines
    # around a page number and merge the surrounding paragraphs.
    text = re.sub(r"\n[ \t]*(?:Page[ \t]*)?\d{1,4}[ \t]*\n", "\n", text, flags=re.IGNORECASE)

    # Remove separator lines that are only dashes/underscores/equals.
    # This runs before line breaks are rewritten, so the separators are
    # still on lines of their own.
    text = re.sub(r"\n[-_=]{3,}\n", "\n", text)

    # Turn single line-breaks inside paragraphs into spaces, but keep
    # double line-breaks as paragraph separators.
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)

    # Collapse runs of spaces/tabs into a single space
    text = re.sub(r"[ \t]+", " ", text)

    # Collapse 3+ consecutive newlines into 2
    text = re.sub(r"\n{3,}", "\n\n", text)

    # Strip leading/trailing whitespace on every line, then overall
    text = "\n".join(line.strip() for line in text.split("\n"))
    return text.strip()
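
# Example (sketch): clean_text("line one\nline two\n\nnew para") returns
# "line one line two\n\nnew para": the single line break becomes a space,
# while the blank line survives as a paragraph separator.
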
def truncate_text(text: str, max_chars: int = MAX_TEXT_CHARS) -> str:
    """
    Truncate *text* to at most *max_chars* characters, breaking at a
    sentence boundary when possible so the LLM receives coherent input.

    Args:
        text: Cleaned paper text.
        max_chars: Maximum character count (default: MAX_TEXT_CHARS).

    Returns:
        Truncated text. If no truncation was needed, the original text
        is returned unchanged.
    """
    if len(text) <= max_chars:
        return text

    truncated = text[:max_chars]
    # Prefer to cut at the last sentence-ending period, but only when it
    # falls past the halfway point; otherwise keep the hard cut rather
    # than discarding most of the character budget.
    last_period = max(truncated.rfind(". "), truncated.rfind(".\n"))
    if last_period > max_chars * 0.5:
        truncated = truncated[: last_period + 1]
    return truncated
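
# Example (sketch): truncate_text("One. Two. Three sentences here.", max_chars=12)
# first hard-cuts to "One. Two. Th", then backtracks to the last ". "
# past the halfway mark and returns "One. Two.".
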
def process_pdf(file_path: str) -> str:
    """
    End-to-end convenience function: extract → clean → truncate.

    Args:
        file_path: Path to the uploaded PDF.

    Returns:
        Cleaned and (if necessary) truncated paper text ready for the LLM.
    """
    raw = extract_text_from_pdf(file_path)
    cleaned = clean_text(raw)
    return truncate_text(cleaned)
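

if __name__ == "__main__":
    # Minimal manual test of the full pipeline; "sample.pdf" is a
    # placeholder default, not a file shipped with this module.
    import sys

    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "sample.pdf"
    try:
        prepared = process_pdf(pdf_path)
    except (FileNotFoundError, ValueError) as err:
        print(f"Error: {err}")
    else:
        print(f"Prepared {len(prepared):,} characters for the LLM.")
        print(prepared[:500])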