import re
import fitz # PyMuPDF
# -----------------------------
# TEXT EXTRACTION (Robust)
# -----------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract and clean text from a PDF using PyMuPDF.

    Pages whose plain "text" extraction comes back empty fall back to
    block-level extraction, which helps with oddly structured PDFs.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: Combined extracted text, whitespace-normalized.

    Raises:
        RuntimeError: If the PDF cannot be opened or read.
    """
    pages = []
    try:
        with fitz.open(file_path) as pdf:
            for page in pdf:
                page_text = page.get_text("text").strip()
                if not page_text:
                    # Fallback: extract raw blocks (helps with weird PDFs).
                    # BUG FIX: blocks must come from the *page*, not the
                    # Document — fitz.Document has no get_text() method.
                    blocks = page.get_text("blocks")
                    page_text = " ".join(block[4] for block in blocks if isinstance(block[4], str))
                pages.append(page_text)
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"❌ PDF extraction failed: {e}") from e
    # Collapse extra whitespace / control characters into single spaces.
    # (join + single regex pass avoids the quadratic `text +=` loop)
    return re.sub(r'\s+', ' ', "\n".join(pages)).strip()
# -----------------------------
# SMART CHUNKING (Step-Aware + Context Aware)
# -----------------------------
def _pack_sentences(segment: str, chunk_size: int, overlap: int) -> list:
    """Pack sentences of *segment* into chunks of at most ~chunk_size chars.

    Sentences are split on terminal punctuation; consecutive chunks share
    the last *overlap* characters of the previous chunk for continuity.
    """
    sentences = re.split(r'(?<=[.!?])\s+', segment)
    chunks = []
    current = ""
    for sent in sentences:
        # +1 accounts for the joining space.
        if len(current) + len(sent) + 1 <= chunk_size:
            current += " " + sent
        else:
            if current.strip():
                chunks.append(current.strip())
            # Carry a tail of the previous chunk forward as context.
            overlap_part = current[-overlap:] if overlap > 0 else ""
            current = overlap_part + " " + sent
    if current.strip():
        chunks.append(current.strip())
    return chunks


def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
    """
    Split text into overlapping, structured chunks.

    Detects procedural steps (e.g., 'Step 1:', 'STEP 2.') and keeps them
    intact; falls back to sentence-based chunking for normal paragraphs.

    Args:
        text (str): Input text.
        chunk_size (int): Max characters per chunk (default: 1000).
        overlap (int): Overlapping characters for continuity (default: 80).

    Returns:
        list[str]: Chunked text segments.
    """
    # Clean and normalize whitespace first.
    text = re.sub(r'\s+', ' ', text.strip())
    # Split *before* each "Step N" marker so markers stay with their text.
    step_splits = re.split(r'(?=(?:Step\s*\d+[:.\s]))', text, flags=re.IGNORECASE)
    step_splits = [s.strip() for s in step_splits if s.strip()]
    chunks = []
    # Case 1: document has a visible "Step" structure.
    if len(step_splits) > 1:
        for step in step_splits:
            if len(step) > chunk_size:
                # An oversized step is sub-chunked by sentence.
                chunks.extend(_pack_sentences(step, chunk_size, overlap))
            else:
                chunks.append(step.strip())
    # Case 2: no "Step" keywords — sentence-based chunking of the whole text.
    else:
        chunks = _pack_sentences(text, chunk_size, overlap)
    return chunks
# -----------------------------
# DEBUGGING (Manual Run)
# -----------------------------
if __name__ == "__main__":
sample_text = """
Step 1: Open the application.
Step 2: Navigate to the dashboard.
Step 3: Review the summary and click ‘Export’.
If the steps are missing, the function should still chunk by sentences.
"""
chunks = chunk_text(sample_text, chunk_size=100, overlap=20)
print(f"✅ Chunks created: {len(chunks)}")
for i, c in enumerate(chunks, 1):
print(f"\n--- Chunk {i} ---\n{c}")