# Source: src/ingestion.py
# Uploaded by Shubham170793 -- commit 6b0c8b8 ("Update src/ingestion.py"), 2.98 kB
import re
import fitz # PyMuPDF
# -----------------------------
# TEXT EXTRACTION (Robust)
# -----------------------------
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extract the text of every page of a PDF with PyMuPDF and normalize whitespace.

    Each page is read with ``get_text("text")``. When that yields nothing,
    the page's raw blocks are joined as a fallback. No OCR is performed
    here, so pages that contain only images contribute no real text.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: Text of all pages combined, with every run of whitespace
        collapsed to a single space and the ends stripped.

    Raises:
        RuntimeError: If opening or reading the PDF fails for any reason.
            The original exception is attached as ``__cause__``.
    """
    page_texts = []
    try:
        with fitz.open(file_path) as pdf:
            for page in pdf:
                page_text = page.get_text("text").strip()
                if not page_text:
                    # Fallback: extract raw blocks (helps with weird PDFs).
                    # Index 4 of a block tuple holds its text payload.
                    blocks = page.get_text("blocks")
                    page_text = " ".join(
                        block[4] for block in blocks if isinstance(block[4], str)
                    )
                page_texts.append(page_text)
    except Exception as e:
        # Chain the original error so the real traceback is preserved.
        raise RuntimeError(f"❌ PDF extraction failed: {e}") from e

    # Join once (avoids quadratic string building), then collapse all
    # whitespace, including the page separators, to single spaces.
    return re.sub(r"\s+", " ", "\n".join(page_texts)).strip()
# -----------------------------
# SMART CHUNKING (Context Aware)
# -----------------------------
def _split_oversized(sentence: str, limit: int) -> list:
    """Break a sentence longer than ``limit`` into word-aligned pieces of at most ``limit`` chars."""
    if len(sentence) <= limit:
        return [sentence]

    pieces = []
    current = ""
    for word in sentence.split(" "):
        # A single token longer than the limit cannot be kept whole: hard-cut it.
        while len(word) > limit:
            if current:
                pieces.append(current)
                current = ""
            pieces.append(word[:limit])
            word = word[limit:]
        if not word:
            continue
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= limit:
            current = candidate
        else:
            # ``current`` is non-empty here because ``word`` alone fits the limit.
            pieces.append(current)
            current = word
    if current:
        pieces.append(current)
    return pieces


def _overlap_tail(chunk: str, size: int) -> str:
    """Return up to ``size`` trailing chars of ``chunk``, starting on a word boundary when possible."""
    if size <= 0 or not chunk:
        return ""
    if len(chunk) <= size:
        return chunk

    tail = chunk[-size:]
    # If the cut landed inside a word, drop the partial word so the overlap
    # starts cleanly. A tail with no space at all is kept as-is.
    if chunk[-size - 1] != " " and tail[0] != " ":
        _, sep, rest = tail.partition(" ")
        if sep:
            tail = rest
    return tail.strip()


def chunk_text(text: str, chunk_size: int = 800, overlap: int = 150) -> list:
    """
    Split text into overlapping, sentence-based chunks.

    Sentences are packed greedily into chunks. When a chunk is full, the
    next chunk starts with the tail of the previous one (the overlap)
    followed by the next sentence. No returned chunk is longer than
    ``chunk_size``: sentences that are too long are split on word
    boundaries (or hard-cut for a single oversized word), and the overlap
    is shortened or dropped when it would not fit.

    Args:
        text (str): Input text.
        chunk_size (int): Max characters per chunk (default: 800).
        overlap (int): Max overlapping characters carried over from the
            previous chunk (default: 150). Values <= 0 disable overlap.

    Returns:
        list[str]: Chunked text segments; empty for blank input.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    overlap = max(0, overlap)

    # Clean text once
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return []

    # Sentence segmentation (simple rule-based, fast)
    sentences = re.split(r"(?<=[.!?])\s+", text)

    chunks = []
    current = ""
    for sentence in sentences:
        for piece in _split_oversized(sentence, chunk_size):
            candidate = f"{current} {piece}" if current else piece
            if len(candidate) <= chunk_size:
                current = candidate
                continue

            # ``current`` is non-empty here: ``piece`` alone always fits.
            chunks.append(current)

            # Only carry over as much overlap as still leaves room for the
            # new piece plus the joining space.
            budget = chunk_size - len(piece) - 1
            tail = _overlap_tail(current, min(overlap, budget))
            current = f"{tail} {piece}" if tail else piece

    # Append the last chunk
    if current:
        chunks.append(current)
    return chunks
# -----------------------------
# DEBUGGING (Manual Run)
# -----------------------------
if __name__ == "__main__":
    # Manual smoke test: chunk a short sample and print each chunk with its
    # length so the chunk boundaries and overlap can be inspected by eye.
    sample_text = """
    Artificial Intelligence is transforming industries.
    Machine learning is a key subfield, driving automation and predictive analytics.
    Neural networks power most modern AI applications today.
    This technology is reshaping healthcare, finance, and manufacturing.
    """
    chunks = chunk_text(sample_text, chunk_size=80, overlap=20)
    # The check-mark emoji was mis-encoded (mojibake) in the original message.
    print(f"✅ Chunks created: {len(chunks)}")
    for i, c in enumerate(chunks, 1):
        print(f"\n--- Chunk {i} ({len(c)} chars) ---\n{c}")