""" pdf_utils.py — PDF text extraction and cleaning for Research Draft. Handles: - Extracting raw text from uploaded PDF files using PyMuPDF (fitz). - Cleaning extracted text (removing noise, fixing whitespace). - Truncating long documents to fit within LLM context limits. """ import re import fitz # PyMuPDF # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- # Safe token-approximation: ~4 characters per token. # For a 4096-token context with system prompt overhead, keep paper text # under ~12 000 characters (~3 000 tokens), leaving room for instructions # and the generated abstract. MAX_TEXT_CHARS = 12_000 # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- def extract_text_from_pdf(file_path: str) -> str: """ Open a PDF file and return the concatenated text of all pages. Args: file_path: Absolute or relative path to a .pdf file. Returns: Raw extracted text as a single string. Raises: FileNotFoundError: If *file_path* does not exist. ValueError: If the file cannot be opened as a PDF. """ try: doc = fitz.open(file_path) except Exception as exc: raise ValueError(f"Could not open PDF: {exc}") from exc pages_text = [] for page in doc: pages_text.append(page.get_text()) doc.close() full_text = "\n".join(pages_text) if not full_text.strip(): raise ValueError("The PDF appears to be empty or contains only images/scans.") return full_text def clean_text(raw_text: str) -> str: """ Clean raw PDF-extracted text for LLM consumption. Steps: 1. Replace form-feed and vertical-tab characters. 2. Normalise line breaks (single newlines inside paragraphs → spaces). 3. Collapse multiple whitespace characters. 4. Strip common PDF artefacts (page numbers, headers/footers patterns). 5. Remove non-ASCII characters that are not standard punctuation. Args: raw_text: The unprocessed text from *extract_text_from_pdf*. Returns: Cleaned text ready for prompt construction. """ text = raw_text # Replace form-feed / vertical-tab text = text.replace("\f", "\n").replace("\v", "\n") # Remove standalone page-number lines (e.g. "\n12\n", "\nPage 5\n") text = re.sub(r"\n\s*(?:Page\s*)?\d{1,4}\s*\n", "\n", text, flags=re.IGNORECASE) # Turn single line-breaks inside paragraphs into spaces, but keep # double line-breaks as paragraph separators. text = re.sub(r"(? str: """ Truncate *text* to at most *max_chars* characters, breaking at a sentence boundary when possible so the LLM receives coherent input. Args: text: Cleaned paper text. max_chars: Maximum character count (default: 12 000). Returns: Truncated text. If no truncation was needed the original text is returned unchanged. """ if len(text) <= max_chars: return text truncated = text[:max_chars] # Try to cut at the last sentence-ending punctuation last_period = max(truncated.rfind(". "), truncated.rfind(".\n")) if last_period > max_chars * 0.5: truncated = truncated[: last_period + 1] return truncated def process_pdf(file_path: str) -> str: """ End-to-end convenience function: extract → clean → truncate. Args: file_path: Path to the uploaded PDF. Returns: Cleaned and (if necessary) truncated paper text ready for the LLM. """ raw = extract_text_from_pdf(file_path) cleaned = clean_text(raw) final = truncate_text(cleaned) return final