"""
pdf_utils.py — PDF text extraction and cleaning for Research Draft.
Handles:
- Extracting raw text from uploaded PDF files using PyMuPDF (fitz).
- Cleaning extracted text (removing noise, fixing whitespace).
- Truncating long documents to fit within LLM context limits.
"""
import re
import fitz # PyMuPDF
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Character budget for the paper text sent to the LLM; used as the default
# limit of truncate_text().
# Rough token estimate: ~4 characters per token. For a 4096-token context
# with system-prompt overhead, keeping the paper text under ~12 000
# characters (~3 000 tokens) leaves room for the instructions and the
# generated abstract.
MAX_TEXT_CHARS = 12_000
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """
    Open a PDF file and return the concatenated text of all pages.

    Args:
        file_path: Absolute or relative path to a .pdf file.

    Returns:
        Raw extracted text as a single string, pages joined by a newline.

    Raises:
        ValueError: If the file cannot be opened as a PDF. This includes a
            missing or unreadable file, because every failure of
            ``fitz.open`` is re-raised as ``ValueError``. Also raised when
            no text could be extracted from any page (empty or scanned PDF).
    """
    try:
        doc = fitz.open(file_path)
    except Exception as exc:
        raise ValueError(f"Could not open PDF: {exc}") from exc

    # Close the document even when extraction fails part-way through, so
    # the opened document is always released.
    try:
        pages_text = [page.get_text() for page in doc]
    finally:
        doc.close()

    full_text = "\n".join(pages_text)
    if not full_text.strip():
        raise ValueError("The PDF appears to be empty or contains only images/scans.")
    return full_text
def clean_text(raw_text: str) -> str:
    """
    Clean raw PDF-extracted text for LLM consumption.

    Steps:
        1. Normalise form-feed, vertical-tab and carriage-return characters
           to newlines.
        2. Strip every line, then drop lines that consist solely of a page
           number ("12", "Page 5") or a horizontal rule ("---", "___",
           "===").
        3. Join single line breaks inside a paragraph with a space while
           keeping blank lines as paragraph separators.
        4. Collapse runs of spaces/tabs into one space and runs of three or
           more newlines into exactly one blank line.

    Artefact lines are removed line by line *before* paragraphs are joined.
    Doing it afterwards would miss rules that had already been merged into
    the surrounding sentence, and a newline-spanning pattern would swallow
    the blank lines around a page number and glue two paragraphs together.

    Args:
        raw_text: The unprocessed text from *extract_text_from_pdf*.

    Returns:
        Cleaned text ready for prompt construction.
    """
    page_number_line = re.compile(r"(?:Page\s*)?\d{1,4}", re.IGNORECASE)
    rule_line = re.compile(r"[-_=]{3,}")

    # Normalise every kind of line/page break to "\n".
    text = (
        raw_text.replace("\r\n", "\n")
        .replace("\r", "\n")
        .replace("\f", "\n")
        .replace("\v", "\n")
    )

    # Stripping first turns whitespace-only lines into real blank lines, so
    # they keep acting as paragraph separators.
    kept_lines = []
    for line in text.split("\n"):
        line = line.strip()
        if page_number_line.fullmatch(line) or rule_line.fullmatch(line):
            continue
        kept_lines.append(line)
    text = "\n".join(kept_lines)

    # Turn single line-breaks inside paragraphs into spaces, but keep
    # double line-breaks as paragraph separators.
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
    # Collapse runs of spaces/tabs into a single space.
    text = re.sub(r"[ \t]+", " ", text)
    # Collapse 3+ consecutive newlines (e.g. left behind by a removed
    # artefact line) into a single paragraph break.
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()
def truncate_text(text: str, max_chars: int = MAX_TEXT_CHARS) -> str:
    """
    Truncate *text* to at most *max_chars* characters, breaking at a
    sentence boundary when possible so the LLM receives coherent input.

    A sentence boundary is a ".", "!" or "?" that is followed by
    whitespace. The boundary is only used when it lies in the second half
    of the allowed window; otherwise the text is cut hard at *max_chars*
    so that not too much content is thrown away.

    Args:
        text: Cleaned paper text.
        max_chars: Maximum character count (default: 12 000). Values of
            zero or less yield an empty string.

    Returns:
        Truncated text. If no truncation was needed the original text is
        returned unchanged.
    """
    # A negative limit would otherwise slice from the end of the string and
    # return more than the caller asked for.
    if max_chars <= 0:
        return ""
    if len(text) <= max_chars:
        return text

    # Look one character past the limit so that a sentence ending exactly
    # at the limit (punctuation at max_chars - 1, whitespace right after)
    # is still recognised as a boundary.
    window = text[: max_chars + 1]
    last_boundary = -1
    for match in re.finditer(r"[.!?](?=\s)", window):
        last_boundary = match.start()

    if last_boundary > max_chars * 0.5:
        # Keep the punctuation mark itself, drop everything after it.
        return text[: last_boundary + 1]
    return text[:max_chars]
def process_pdf(file_path: str) -> str:
    """
    Run the full pipeline on one PDF: extract → clean → truncate.

    Args:
        file_path: Path to the uploaded PDF.

    Returns:
        The paper text after cleaning, shortened to the default
        ``truncate_text`` limit when necessary, ready for the LLM.
    """
    return truncate_text(clean_text(extract_text_from_pdf(file_path)))
|