"""
pdf_utils.py — PDF text extraction and cleaning for Research Draft.
Handles:
- Extracting raw text from uploaded PDF files using PyMuPDF (fitz).
- Cleaning extracted text (removing noise, fixing whitespace).
- Truncating long documents to fit within LLM context limits.
"""
import os
import re

import fitz  # PyMuPDF

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Safe token-approximation: ~4 characters per token.
# For a 4096-token context with system prompt overhead, keep paper text
# under ~12 000 characters (~3 000 tokens), leaving room for instructions
# and the generated abstract.
MAX_TEXT_CHARS = 12_000
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """
    Open a PDF file and return the concatenated text of all pages.

    Args:
        file_path: Absolute or relative path to a .pdf file.

    Returns:
        Raw extracted text as a single string.

    Raises:
        FileNotFoundError: If *file_path* does not exist.
        ValueError: If the file cannot be opened as a PDF, or if it
            contains no extractable text (e.g. scanned images only).
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"No such file: {file_path}")

    try:
        doc = fitz.open(file_path)
    except Exception as exc:
        raise ValueError(f"Could not open PDF: {exc}") from exc

    # The context manager guarantees the document is closed even if
    # text extraction raises partway through.
    with doc:
        pages_text = [page.get_text() for page in doc]

    full_text = "\n".join(pages_text)
    if not full_text.strip():
        raise ValueError("The PDF appears to be empty or contains only images/scans.")
    return full_text
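
# Usage sketch ("paper.pdf" is a placeholder filename, not a file that
# ships with this module):
#
#     raw = extract_text_from_pdf("paper.pdf")
#     print(f"Extracted {len(raw):,} characters")
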
def clean_text(raw_text: str) -> str:
    """
    Clean raw PDF-extracted text for LLM consumption.

    Steps:
        1. Replace form-feed and vertical-tab characters with newlines.
        2. Strip standalone page-number lines (e.g. "12", "Page 5").
        3. Drop separator lines made only of dashes/underscores/equals.
        4. Turn single line breaks inside paragraphs into spaces, keeping
           blank lines as paragraph separators.
        5. Collapse runs of spaces/tabs and excess blank lines, and strip
           per-line whitespace.

    Args:
        raw_text: The unprocessed text from *extract_text_from_pdf*.

    Returns:
        Cleaned text ready for prompt construction.
    """
    text = raw_text

    # Replace form-feed / vertical-tab characters with newlines
    text = text.replace("\f", "\n").replace("\v", "\n")

    # Remove standalone page-number lines (e.g. "\n12\n", "\nPage 5\n").
    # [ \t]* rather than \s* so the match cannot swallow the blank lines
    # around a page number and merge the surrounding paragraphs.
    text = re.sub(r"\n[ \t]*(?:Page[ \t]*)?\d{1,4}[ \t]*\n", "\n", text, flags=re.IGNORECASE)

    # Remove separator lines that are only dashes/underscores/equals.
    # This runs before line breaks are rewritten, so the separators are
    # still on lines of their own.
    text = re.sub(r"\n[-_=]{3,}\n", "\n", text)

    # Turn single line-breaks inside paragraphs into spaces, but keep
    # double line-breaks as paragraph separators.
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)

    # Collapse runs of spaces/tabs into a single space
    text = re.sub(r"[ \t]+", " ", text)

    # Collapse 3+ consecutive newlines into 2
    text = re.sub(r"\n{3,}", "\n\n", text)

    # Strip leading/trailing whitespace on every line, then overall
    text = "\n".join(line.strip() for line in text.split("\n"))
    return text.strip()
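
# Example (sketch): clean_text("line one\nline two\n\nnew para") returns
# "line one line two\n\nnew para": the single line break becomes a space,
# while the blank line survives as a paragraph separator.
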
def truncate_text(text: str, max_chars: int = MAX_TEXT_CHARS) -> str:
    """
    Truncate *text* to at most *max_chars* characters, breaking at a
    sentence boundary when possible so the LLM receives coherent input.

    Args:
        text: Cleaned paper text.
        max_chars: Maximum character count (default: MAX_TEXT_CHARS).

    Returns:
        Truncated text. If no truncation was needed, the original text
        is returned unchanged.
    """
    if len(text) <= max_chars:
        return text

    truncated = text[:max_chars]
    # Prefer to cut at the last sentence-ending period, but only when it
    # falls past the halfway point; otherwise keep the hard cut rather
    # than discarding most of the character budget.
    last_period = max(truncated.rfind(". "), truncated.rfind(".\n"))
    if last_period > max_chars * 0.5:
        truncated = truncated[: last_period + 1]
    return truncated
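
# Example (sketch): truncate_text("One. Two. Three sentences here.", max_chars=12)
# first hard-cuts to "One. Two. Th", then backtracks to the last ". "
# past the halfway mark and returns "One. Two.".
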
def process_pdf(file_path: str) -> str:
    """
    End-to-end convenience function: extract → clean → truncate.

    Args:
        file_path: Path to the uploaded PDF.

    Returns:
        Cleaned and (if necessary) truncated paper text ready for the LLM.
    """
    raw = extract_text_from_pdf(file_path)
    cleaned = clean_text(raw)
    return truncate_text(cleaned)
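

if __name__ == "__main__":
    # Minimal manual test of the full pipeline; "sample.pdf" is a
    # placeholder default, not a file shipped with this module.
    import sys

    pdf_path = sys.argv[1] if len(sys.argv) > 1 else "sample.pdf"
    try:
        prepared = process_pdf(pdf_path)
    except (FileNotFoundError, ValueError) as err:
        print(f"Error: {err}")
    else:
        print(f"Prepared {len(prepared):,} characters for the LLM.")
        print(prepared[:500])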