File size: 4,070 Bytes
35646e4
275cb5c
 
35646e4
6b0c8b8
35646e4
275cb5c
 
6b0c8b8
 
35646e4
275cb5c
 
 
6b0c8b8
275cb5c
 
6b0c8b8
 
 
 
 
 
dd8eaa7
6b0c8b8
 
 
 
 
 
 
275cb5c
 
 
35646e4
dd8eaa7
35646e4
ad0cd92
275cb5c
dd8eaa7
 
 
275cb5c
 
6b0c8b8
35646e4
dd8eaa7
275cb5c
 
6b0c8b8
35646e4
dd8eaa7
35646e4
 
dd8eaa7
 
 
35646e4
dd8eaa7
35646e4
dd8eaa7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35646e4
dd8eaa7
 
 
 
 
 
 
 
 
 
 
 
 
 
35646e4
 
 
 
 
6b0c8b8
35646e4
 
 
dd8eaa7
 
 
 
275cb5c
dd8eaa7
6b0c8b8
35646e4
dd8eaa7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import re
import fitz  # PyMuPDF

# -----------------------------
# TEXT EXTRACTION (Robust)
# -----------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extracts and cleans text from a PDF using PyMuPDF.

    Each page is first read with ``page.get_text("text")``. If that yields
    nothing, the page's "blocks" output is used instead and the string
    payloads of its blocks are joined with spaces. No OCR is performed here,
    so a page without any text layer contributes no text.

    Args:
        file_path (str): Path to the PDF file.
    Returns:
        str: Text of all pages, with every run of whitespace collapsed to a
            single space and leading/trailing whitespace removed.
    Raises:
        RuntimeError: If opening or reading the PDF fails for any reason; the
            original exception is attached as ``__cause__``.
    """
    page_texts = []
    try:
        with fitz.open(file_path) as pdf:
            for page in pdf:
                page_text = page.get_text("text").strip()
                if not page_text:
                    # Fallback: extract raw blocks (helps with weird PDFs).
                    # This must be called on the page: get_text is a Page
                    # method, so calling it on the document object fails.
                    blocks = page.get_text("blocks")
                    # Index 4 of each block tuple holds the block's text.
                    page_text = " ".join(
                        block[4] for block in blocks if isinstance(block[4], str)
                    )
                page_texts.append(page_text)
    except Exception as e:
        # Keep the public RuntimeError contract but preserve the root cause.
        raise RuntimeError(f"❌ PDF extraction failed: {e}") from e

    # Pages are separated by a newline, then all whitespace runs (including
    # those separators) are collapsed to single spaces.
    return re.sub(r'\s+', ' ', "\n".join(page_texts)).strip()


# -----------------------------
# SMART CHUNKING (Step-Aware + Context Aware)
# -----------------------------
# Zero-width split point in front of a step header such as "Step 1:",
# "STEP 2." or "step 3 ". The leading \b keeps words that merely end in
# "step" (e.g. "doorstep 1 ") from being mistaken for a header.
_STEP_BOUNDARY = re.compile(r'(?=\bStep\s*\d+[:.\s])', re.IGNORECASE)

# Whitespace that follows sentence-ending punctuation.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def _pack_sentences(text: str, chunk_size: int, overlap: int) -> list:
    """Greedily pack the sentences of *text* into chunks of about *chunk_size* characters."""
    chunks = []
    current = ""
    for sentence in _SENTENCE_BOUNDARY.split(text):
        # The +1 accounts for the joining space added below.
        if len(current) + len(sentence) + 1 <= chunk_size:
            current += " " + sentence
            continue
        if current.strip():
            chunks.append(current.strip())
        # Seed the next chunk with the tail of the previous one for continuity.
        # The tail is cut by character count, so it may start mid-word.
        tail = current[-overlap:] if overlap > 0 else ""
        current = tail + " " + sentence
    if current.strip():
        chunks.append(current.strip())
    return chunks


def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
    """
    Splits text into overlapping, structured chunks.
    Detects procedural steps (e.g., 'Step 1:', 'STEP 2.') and keeps them intact.
    Falls back to sentence-based chunking for normal paragraphs.

    Whitespace is normalized first (every run becomes a single space). If the
    text splits into more than one piece at step headers, each piece becomes
    its own chunk; a piece longer than ``chunk_size`` is further split on
    sentence boundaries. Otherwise the whole text is split on sentence
    boundaries.

    ``chunk_size`` is a target rather than a hard cap: sentences are never
    cut, so a single sentence longer than ``chunk_size``, or a sentence plus
    the carried-over overlap, can produce a longer chunk.

    Args:
        text (str): Input text.
        chunk_size (int): Target max characters per chunk (default: 1000).
        overlap (int): Trailing characters of the previous chunk that are
            prepended to the next one when sentence-based splitting starts a
            new chunk (default: 80). Values <= 0 disable the overlap.

    Returns:
        list[str]: Chunked text segments (empty list for blank input).
    """
    # Clean and normalize text
    text = re.sub(r'\s+', ' ', text.strip())

    # Try to detect "Step" patterns
    steps = [piece.strip() for piece in _STEP_BOUNDARY.split(text) if piece.strip()]

    # No visible "Step" structure: plain sentence-based chunking.
    if len(steps) <= 1:
        return _pack_sentences(text, chunk_size, overlap)

    # Step-structured document: one chunk per step, splitting only the steps
    # that are too long to fit.
    chunks = []
    for step in steps:
        if len(step) > chunk_size:
            chunks.extend(_pack_sentences(step, chunk_size, overlap))
        else:
            chunks.append(step)
    return chunks


# -----------------------------
# DEBUGGING (Manual Run)
# -----------------------------
if __name__ == "__main__":
    # Manual smoke test: three short steps followed by a plain sentence,
    # chunked with a small size so the sentence-splitting path is exercised.
    demo_text = """
    Step 1: Open the application.
    Step 2: Navigate to the dashboard.
    Step 3: Review the summary and click ‘Export’.
    If the steps are missing, the function should still chunk by sentences.
    """
    chunks = chunk_text(demo_text, chunk_size=100, overlap=20)

    # Report how many chunks were produced, then dump each one (1-based).
    print(f"✅ Chunks created: {len(chunks)}")
    for i, c in enumerate(chunks, start=1):
        print(f"\n--- Chunk {i} ---\n{c}")