# Source: src/ingestion.py (Hugging Face file view, raw / history / blame)
# Last commit: "Update src/ingestion.py" (ad0cd92, verified) by Shubham170793
# File size: 4.07 kB
import re
import fitz # PyMuPDF
# -----------------------------
# TEXT EXTRACTION (Robust)
# -----------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract the text of every page of a PDF with PyMuPDF and normalize whitespace.

    Each page is read with ``page.get_text("text")``. When that yields nothing,
    the page's ``"blocks"`` output is used instead and the string payloads of
    the blocks are joined. No OCR is performed here, so a page that exposes no
    text through either call contributes nothing to the result.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: Text of all pages combined, with every run of whitespace
        collapsed to a single space and the ends stripped.

    Raises:
        RuntimeError: If the file cannot be opened or read. The underlying
            exception is attached as ``__cause__``.
    """
    page_texts = []
    try:
        with fitz.open(file_path) as pdf:
            for page in pdf:
                page_text = page.get_text("text").strip()
                if not page_text:
                    # Fallback: extract raw blocks (helps with weird PDFs).
                    # get_text must be called on the page, not the document.
                    blocks = page.get_text("blocks")
                    page_text = " ".join(
                        block[4] for block in blocks if isinstance(block[4], str)
                    )
                page_texts.append(page_text)
    except Exception as e:
        # Keep the single RuntimeError contract for callers, but chain the
        # original exception so the real cause stays visible in tracebacks.
        raise RuntimeError(f"❌ PDF extraction failed: {e}") from e

    # Collapse newlines, tabs and repeated spaces into single spaces.
    return re.sub(r'\s+', ' ', "\n".join(page_texts)).strip()
# -----------------------------
# SMART CHUNKING (Step-Aware + Context Aware)
# -----------------------------
def _pack_sentences(text: str, chunk_size: int, overlap: int) -> list:
    """Greedily pack the sentences of *text* into chunks with character overlap."""
    chunks = []
    current = ""
    # A sentence boundary is whitespace that follows '.', '!' or '?'.
    for sent in re.split(r'(?<=[.!?])\s+', text):
        if len(current) + len(sent) + 1 <= chunk_size:
            current += " " + sent
            continue
        if current.strip():
            chunks.append(current.strip())
        # Seed the next chunk with the tail of the previous one for continuity.
        # The tail is a raw character slice, so it may start mid-word.
        overlap_part = current[-overlap:] if overlap > 0 else ""
        current = overlap_part + " " + sent
    if current.strip():
        chunks.append(current.strip())
    return chunks


def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
    """
    Split text into overlapping, structure-aware chunks.

    Whitespace is normalized first. If the text contains procedural markers
    such as 'Step 1:' or 'STEP 2.' (case-insensitive) that yield more than one
    segment, each segment becomes its own chunk; a segment longer than
    ``chunk_size`` is further split sentence by sentence. Otherwise the whole
    text is chunked sentence by sentence.

    Args:
        text (str): Input text.
        chunk_size (int): Target number of characters per chunk
            (default: 1000). This is not a hard limit: a single sentence
            longer than ``chunk_size`` is kept whole, and the carried-over
            overlap is added on top of the sentence that starts a new chunk.
        overlap (int): Number of trailing characters of the previous chunk
            repeated at the start of the next one when sentence-based
            splitting is used (default: 80). Values <= 0 disable overlap.

    Returns:
        list[str]: Chunked text segments; empty list for empty/blank input.
    """
    # Clean and normalize text
    text = re.sub(r'\s+', ' ', text.strip())

    # Zero-width split in front of every "Step <n>" marker so the marker
    # stays attached to the text that follows it.
    step_splits = re.split(r'(?=(?:Step\s*\d+[:.\s]))', text, flags=re.IGNORECASE)
    step_splits = [s.strip() for s in step_splits if s.strip()]

    # No visible "Step" structure: fall back to sentence-based chunking.
    if len(step_splits) <= 1:
        return _pack_sentences(text, chunk_size, overlap)

    chunks = []
    for step in step_splits:
        if len(step) > chunk_size:
            # Step is too long: split by sentences within that step.
            chunks.extend(_pack_sentences(step, chunk_size, overlap))
        else:
            chunks.append(step)
    return chunks
# -----------------------------
# DEBUGGING (Manual Run)
# -----------------------------
if __name__ == "__main__":
    # Manual smoke test: three short steps followed by a plain sentence,
    # chunked with a small size so the splitting is visible in the output.
    sample_text = """
Step 1: Open the application.
Step 2: Navigate to the dashboard.
Step 3: Review the summary and click ‘Export’.
If the steps are missing, the function should still chunk by sentences.
"""
    demo_chunks = chunk_text(sample_text, chunk_size=100, overlap=20)
    print(f"✅ Chunks created: {len(demo_chunks)}")
    for index, piece in enumerate(demo_chunks, start=1):
        print(f"\n--- Chunk {index} ---\n{piece}")