| """ |
| pdf_utils.py — PDF text extraction and cleaning for Research Draft. |
| |
| Handles: |
| - Extracting raw text from uploaded PDF files using PyMuPDF (fitz). |
| - Cleaning extracted text (removing noise, fixing whitespace). |
| - Truncating long documents to fit within LLM context limits. |
| """ |
|
|
import os
import re

import fitz
|
|
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| MAX_TEXT_CHARS = 12_000 |
|
|
|
|
| |
| |
| |
|
|
def extract_text_from_pdf(file_path: str) -> str:
    """
    Open a PDF file and return the concatenated text of all pages.

    Args:
        file_path: Absolute or relative path to a .pdf file.

    Returns:
        Raw extracted text as a single string (pages joined by newlines).

    Raises:
        FileNotFoundError: If *file_path* does not exist.
        ValueError: If the file cannot be opened as a PDF, or contains
            no extractable text (e.g. a pure image scan).
    """
    # Check existence up front so a missing file surfaces as the documented
    # FileNotFoundError instead of being swallowed into ValueError below.
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"No such file: {file_path}")

    try:
        doc = fitz.open(file_path)
    except Exception as exc:
        raise ValueError(f"Could not open PDF: {exc}") from exc

    # Ensure the document handle is released even if text extraction fails
    # mid-way (the original leaked the handle on a get_text() error).
    try:
        pages_text = [page.get_text() for page in doc]
    finally:
        doc.close()

    full_text = "\n".join(pages_text)
    if not full_text.strip():
        raise ValueError("The PDF appears to be empty or contains only images/scans.")
    return full_text
|
|
|
|
def clean_text(raw_text: str) -> str:
    """
    Clean raw PDF-extracted text for LLM consumption.

    Steps:
        1. Convert form-feed / vertical-tab page markers to newlines.
        2. Strip common PDF artefacts: standalone page-number lines and
           horizontal separator lines ("----", "====", "____").
        3. Normalise line breaks (single newlines inside paragraphs → spaces).
        4. Collapse runs of spaces/tabs and excess blank lines, then trim
           per-line and outer whitespace.

    Args:
        raw_text: The unprocessed text from *extract_text_from_pdf*.

    Returns:
        Cleaned text ready for prompt construction.
    """
    text = raw_text

    # Page-break control characters become plain newlines.
    text = text.replace("\f", "\n").replace("\v", "\n")

    # Drop standalone page-number lines ("12", "Page 12", ...).
    text = re.sub(r"\n\s*(?:Page\s*)?\d{1,4}\s*\n", "\n", text, flags=re.IGNORECASE)

    # Drop horizontal separator lines BEFORE newline normalisation.
    # (Previously this ran last, after single newlines had been turned
    # into spaces, so the pattern almost never matched and could leave
    # a triple-newline run behind when it did.)
    text = re.sub(r"\n[-_=]{3,}\n", "\n", text)

    # A single newline inside a paragraph is a soft wrap — join with a space.
    # Double newlines (paragraph breaks) are preserved.
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)

    # Collapse runs of spaces/tabs into a single space.
    text = re.sub(r"[ \t]+", " ", text)

    # Allow at most one blank line between paragraphs.
    text = re.sub(r"\n{3,}", "\n\n", text)

    # Trim residual leading/trailing whitespace on each line, then overall.
    text = "\n".join(line.strip() for line in text.split("\n"))
    return text.strip()
|
|
|
|
def truncate_text(text: str, max_chars: int = MAX_TEXT_CHARS) -> str:
    """
    Shorten *text* to at most *max_chars* characters.

    Tries to end the result at a sentence boundary (". " or ".\\n") so the
    LLM receives coherent input, but only if that boundary lies in the
    second half of the clipped text; otherwise a hard cut is used.

    Args:
        text: Cleaned paper text.
        max_chars: Maximum character count (default: 12 000).

    Returns:
        Truncated text. If no truncation was needed the original text is
        returned unchanged.
    """
    if len(text) <= max_chars:
        return text

    clipped = text[:max_chars]

    # Locate the last sentence-ending period within the clipped window.
    cut = max(clipped.rfind(". "), clipped.rfind(".\n"))

    # Only honour the boundary when it keeps at least half the budget;
    # rfind() returning -1 (no period at all) also fails this check.
    if cut > max_chars * 0.5:
        return clipped[: cut + 1]
    return clipped
|
|
|
|
def process_pdf(file_path: str) -> str:
    """
    Run the full pipeline on one PDF: extract raw text, clean it, and
    truncate it to the configured character budget.

    Args:
        file_path: Path to the uploaded PDF.

    Returns:
        Cleaned and (if necessary) truncated paper text ready for the LLM.
    """
    return truncate_text(clean_text(extract_text_from_pdf(file_path)))
|
|