# File size: 4,444 Bytes
"""
pdf_utils.py — PDF text extraction and cleaning for Research Draft.

Handles:
  - Extracting raw text from uploaded PDF files using PyMuPDF (fitz).
  - Cleaning extracted text (removing noise, fixing whitespace).
  - Truncating long documents to fit within LLM context limits.
"""

import re
import fitz  # PyMuPDF


# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Character budget for the paper text handed to the LLM; used as the default
# limit of truncate_text().
# Rationale: assuming roughly 4 characters per token, 12 000 characters is
# about 3 000 tokens, which leaves headroom in a 4096-token context window
# for the system prompt, the instructions, and the generated abstract.
MAX_TEXT_CHARS = 12_000


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def extract_text_from_pdf(file_path: str) -> str:
    """
    Open a PDF file and return the concatenated text of all pages.

    Page texts are joined with a single newline, in document order.

    Args:
        file_path: Absolute or relative path to a .pdf file.

    Returns:
        Raw extracted text as a single string.

    Raises:
        ValueError: If the file cannot be opened as a PDF. Every error
            raised by ``fitz.open`` (including one for a missing path) is
            wrapped in ``ValueError`` with the original as ``__cause__``.
            Also raised when the extracted text is empty or whitespace
            only, e.g. for a scanned, image-only document.
    """
    try:
        doc = fitz.open(file_path)
    except Exception as exc:
        raise ValueError(f"Could not open PDF: {exc}") from exc

    # Release the document handle even if extraction fails on some page.
    try:
        pages_text = [page.get_text() for page in doc]
    finally:
        doc.close()

    full_text = "\n".join(pages_text)
    if not full_text.strip():
        raise ValueError("The PDF appears to be empty or contains only images/scans.")
    return full_text


# A line consisting solely of a page number, e.g. "12" or "Page 5".
_PAGE_NUMBER_LINE = re.compile(r"(?:Page\s*)?\d{1,4}", re.IGNORECASE)
# A horizontal-rule line made only of dashes, underscores, or equals signs.
_RULE_LINE = re.compile(r"[-_=]{3,}")


def clean_text(raw_text: str) -> str:
    """
    Clean raw PDF-extracted text for LLM consumption.

    Steps:
      1. Normalise line endings; treat form-feed and vertical-tab
         characters as line breaks.
      2. On every line, collapse runs of spaces/tabs into one space and
         strip leading/trailing whitespace.
      3. Drop artefact lines: standalone page numbers ("12", "Page 5") and
         rule lines made of three or more ``-``, ``_`` or ``=``.
      4. Join single line breaks inside a paragraph with a space, keeping
         blank lines as paragraph separators.
      5. Collapse three or more consecutive newlines into two.

    Non-ASCII characters are left untouched.

    Args:
        raw_text: The unprocessed text from *extract_text_from_pdf*.

    Returns:
        Cleaned text ready for prompt construction.
    """
    text = raw_text.replace("\r\n", "\n").replace("\r", "\n")
    text = text.replace("\f", "\n").replace("\v", "\n")

    # Work line by line *before* joining paragraphs: once single newlines
    # have become spaces, artefact lines can no longer be told apart from
    # body text, and whitespace-only lines no longer read as blank lines.
    kept_lines = []
    for line in text.split("\n"):
        line = re.sub(r"[ \t]+", " ", line).strip()
        if _PAGE_NUMBER_LINE.fullmatch(line) or _RULE_LINE.fullmatch(line):
            continue
        kept_lines.append(line)
    text = "\n".join(kept_lines)

    # A newline with no newline on either side is a soft wrap inside a
    # paragraph; blank lines (two or more newlines) are left alone.
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)

    # Dropping artefact lines can leave extra blank lines behind.
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()


def truncate_text(text: str, max_chars: int = MAX_TEXT_CHARS) -> str:
    """
    Truncate *text* to at most *max_chars* characters, breaking at a
    sentence boundary when possible so the LLM receives coherent input.

    A sentence boundary is a ``.``, ``!`` or ``?`` followed by a space or
    a newline. The boundary is only used when it lies beyond half of
    *max_chars*; otherwise the text is cut hard at *max_chars* so that a
    lone early sentence end does not discard most of the budget.

    Args:
        text: Cleaned paper text.
        max_chars: Maximum character count (default: 12 000). Values of
            zero or less yield an empty string when truncation is needed.

    Returns:
        Truncated text. If no truncation was needed the original text is
        returned unchanged.
    """
    if len(text) <= max_chars:
        return text
    if max_chars <= 0:
        # A negative bound would otherwise slice from the end (text[:-n])
        # and return almost the whole text.
        return ""

    # Look one character past the limit so that a sentence ending exactly
    # at the cut is recognised by the whitespace that follows it.
    window = text[: max_chars + 1]
    last_end = max(
        window.rfind(mark + gap) for mark in ".!?" for gap in " \n"
    )
    if last_end > max_chars * 0.5:
        return text[: last_end + 1]

    return text[:max_chars]


def process_pdf(file_path: str) -> str:
    """
    Run the full pipeline on one PDF: extract, then clean, then truncate.

    Args:
        file_path: Path to the uploaded PDF.

    Returns:
        The paper text after cleaning, truncated to the default character
        limit of *truncate_text* when it is longer than that.
    """
    return truncate_text(clean_text(extract_text_from_pdf(file_path)))