File size: 4,444 Bytes
0193035
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
"""
pdf_utils.py — PDF text extraction and cleaning for Research Draft.

Handles:
  - Extracting raw text from uploaded PDF files using PyMuPDF (fitz).
  - Cleaning extracted text (removing noise, fixing whitespace).
  - Truncating long documents to fit within LLM context limits.
"""

import re
import fitz  # PyMuPDF


# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Character budget for the paper text handed to the LLM; used as the
# default ``max_chars`` of ``truncate_text``.
#
# Safe token-approximation: ~4 characters per token.
# For a 4096-token context with system prompt overhead, keep paper text
# under ~12 000 characters (~3 000 tokens), leaving room for instructions
# and the generated abstract.
MAX_TEXT_CHARS = 12_000


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def extract_text_from_pdf(file_path: str) -> str:
    """
    Open a PDF file and return the concatenated text of all pages.

    Args:
        file_path: Absolute or relative path to a .pdf file.

    Returns:
        Raw extracted text as a single string, with the text of
        consecutive pages separated by a newline.

    Raises:
        ValueError: If the file cannot be opened as a PDF. Any error raised
            while opening (a missing path included) is re-raised as
            ValueError with the original exception chained as its cause.
        ValueError: If no page yields any non-whitespace text (empty or
            image-only/scanned document).
    """
    try:
        doc = fitz.open(file_path)
    except Exception as exc:
        raise ValueError(f"Could not open PDF: {exc}") from exc

    # Release the document handle even when extraction fails part-way
    # through, so a broken page does not leak the open file.
    try:
        pages_text = [page.get_text() for page in doc]
    finally:
        doc.close()

    full_text = "\n".join(pages_text)
    if not full_text.strip():
        raise ValueError("The PDF appears to be empty or contains only images/scans.")
    return full_text


# Whole-line patterns; matched with ``fullmatch`` against stripped lines.
_PAGE_NUMBER_LINE = re.compile(r"(?:Page\s*)?\d{1,4}", flags=re.IGNORECASE)
_RULE_LINE = re.compile(r"[-_=]{3,}")


def clean_text(raw_text: str) -> str:
    """
    Clean raw PDF-extracted text for LLM consumption.

    Steps:
      1. Treat form-feed and vertical-tab characters as line breaks.
      2. Strip leading/trailing whitespace on every line, so lines holding
         only spaces count as blank lines.
      3. Drop artefact lines: standalone page numbers ("12", "Page 5") and
         horizontal rules made only of dashes, underscores or equals signs.
      4. Join the lines of a paragraph with spaces; blank lines remain
         paragraph separators and runs of them collapse to exactly one.
      5. Collapse runs of spaces/tabs into a single space.

    Artefact lines are removed line by line *before* paragraphs are joined.
    Removing them afterwards (or with a pattern that spans newlines) either
    misses them or swallows the blank lines around them and merges the
    neighbouring paragraphs.

    Args:
        raw_text: The unprocessed text from *extract_text_from_pdf*.

    Returns:
        Cleaned text ready for prompt construction. Paragraphs are
        separated by exactly one blank line; the result has no leading or
        trailing whitespace.
    """
    # Form-feed / vertical-tab act as line breaks.
    text = raw_text.replace("\f", "\n").replace("\v", "\n")

    # Work on stripped lines so the whole-line artefact patterns and the
    # blank-line (paragraph) detection are not defeated by stray spaces.
    kept_lines = []
    for line in text.split("\n"):
        line = line.strip()
        if _PAGE_NUMBER_LINE.fullmatch(line) or _RULE_LINE.fullmatch(line):
            continue
        kept_lines.append(line)
    text = "\n".join(kept_lines)

    # Turn single line-breaks inside paragraphs into spaces, but keep
    # double line-breaks as paragraph separators.
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)

    # Collapse runs of whitespace (spaces/tabs) into a single space.
    text = re.sub(r"[ \t]+", " ", text)

    # Collapse 3+ consecutive newlines into 2.
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()


def truncate_text(text: str, max_chars: int = MAX_TEXT_CHARS) -> str:
    """
    Truncate *text* to at most *max_chars* characters, breaking at a
    sentence boundary when possible so the LLM receives coherent input.

    A sentence boundary is a ".", "!" or "?" immediately followed by a
    space or newline. The boundary is used only when it lies in the second
    half of the allowed window; otherwise the text is cut hard at
    *max_chars* so that most of the budget is not thrown away.

    Args:
        text: Cleaned paper text.
        max_chars: Maximum character count (default: 12 000). Values of
            zero or below yield an empty string.

    Returns:
        Truncated text. If no truncation was needed the original text is
        returned unchanged.
    """
    # A non-positive budget must never return text: a negative slice bound
    # would otherwise count from the end and exceed the limit.
    if max_chars <= 0:
        return ""
    if len(text) <= max_chars:
        return text

    truncated = text[:max_chars]

    # Look one character past the limit so a sentence ending exactly at
    # the limit (punctuation at index max_chars - 1) is still recognised.
    window = text[: max_chars + 1]
    boundary = max(
        window.rfind(mark + gap) for mark in ".!?" for gap in (" ", "\n")
    )
    if boundary > max_chars * 0.5:
        truncated = truncated[: boundary + 1]

    return truncated


def process_pdf(file_path: str) -> str:
    """
    Run the full pipeline on one PDF: extract, then clean, then truncate.

    Args:
        file_path: Path to the uploaded PDF.

    Returns:
        The paper text after cleaning, shortened to the default character
        limit of *truncate_text* when it is longer than that.

    Raises:
        ValueError: Propagated from *extract_text_from_pdf* when the file
            cannot be opened or contains no extractable text.
    """
    return truncate_text(clean_text(extract_text_from_pdf(file_path)))