|
|
import re |
|
|
import fitz |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract and clean the text of a PDF using PyMuPDF.

    Each page is read with ``page.get_text("text")``. When that yields
    nothing, the page's blocks are read instead and their string payloads
    are joined with spaces. No OCR is performed here, so a page without any
    extractable text simply contributes nothing to the result.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: Text of all pages combined, with every run of whitespace
        collapsed to a single space and both ends stripped.

    Raises:
        RuntimeError: If opening or reading the PDF fails. The underlying
            exception is chained as ``__cause__``.
    """
    page_texts = []
    try:
        with fitz.open(file_path) as pdf:
            for page in pdf:
                page_text = page.get_text("text").strip()
                if not page_text:
                    # Text extraction is a per-page operation, so the
                    # fallback must query `page`, not the document.
                    blocks = page.get_text("blocks")
                    page_text = " ".join(
                        block[4] for block in blocks if isinstance(block[4], str)
                    )
                page_texts.append(page_text)
    except Exception as e:
        # Keep the original traceback reachable for callers and logs.
        raise RuntimeError(f"❌ PDF extraction failed: {e}") from e

    # Page breaks and layout whitespace are flattened to single spaces.
    return re.sub(r'\s+', ' ', "\n".join(page_texts)).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Zero-width split point in front of every "Step <n>" marker. The leading \b
# keeps words that merely end in "step" (e.g. "misstep 1.") from matching.
_STEP_BOUNDARY = re.compile(r'(?=(?:\bStep\s*\d+[:.\s]))', flags=re.IGNORECASE)

# Whitespace that follows sentence-ending punctuation.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def _chunk_sentences(text, chunk_size, overlap):
    """Pack the sentences of ``text`` into chunks of about ``chunk_size`` characters."""
    chunks = []
    current = ""
    for sentence in _SENTENCE_BOUNDARY.split(text):
        # +1 accounts for the joining space.
        if len(current) + len(sentence) + 1 <= chunk_size:
            current += " " + sentence
            continue
        if current.strip():
            chunks.append(current.strip())
        # Seed the next chunk with the tail of the previous one for continuity.
        tail = current[-overlap:] if overlap > 0 else ""
        current = tail + " " + sentence
    if current.strip():
        chunks.append(current.strip())
    return chunks


def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
    """
    Split text into overlapping, structured chunks.

    Whitespace is normalised to single spaces first. If the text contains
    procedural step markers (e.g. 'Step 1:', 'STEP 2.'), it is split in front
    of each marker and every step is kept as one chunk; only a step longer
    than ``chunk_size`` is broken up by sentences. Text without step markers
    is chunked by sentences throughout.

    Args:
        text (str): Input text.
        chunk_size (int): Target maximum characters per chunk (default: 1000).
            A chunk can exceed it when a single sentence is longer than the
            limit or when the overlap prefix is added to a long sentence.
        overlap (int): Number of trailing characters of the previous chunk
            repeated at the start of the next sentence-based chunk
            (default: 80). Values <= 0 disable the overlap. Whole-step chunks
            never carry an overlap.

    Returns:
        list[str]: Chunked text segments; empty for empty or blank input.
    """
    text = re.sub(r'\s+', ' ', text.strip())

    steps = [part.strip() for part in _STEP_BOUNDARY.split(text) if part.strip()]

    # Zero or one segment means there is no step structure to preserve.
    if len(steps) <= 1:
        return _chunk_sentences(text, chunk_size, overlap)

    chunks = []
    for step in steps:
        if len(step) > chunk_size:
            chunks.extend(_chunk_sentences(step, chunk_size, overlap))
        else:
            chunks.append(step)
    return chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Small manual demo: three procedural steps followed by a plain sentence,
    # chunked with a deliberately small size so the splitting is visible.
    demo_text = """
    Step 1: Open the application.
    Step 2: Navigate to the dashboard.
    Step 3: Review the summary and click ‘Export’.
    If the steps are missing, the function should still chunk by sentences.
    """

    demo_chunks = chunk_text(demo_text, chunk_size=100, overlap=20)
    print(f"✅ Chunks created: {len(demo_chunks)}")

    # Number the chunks from 1 for display.
    for number, chunk in enumerate(demo_chunks, start=1):
        print(f"\n--- Chunk {number} ---\n{chunk}")
|
|
|