Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Running

App Files Files Community

enterprise-knowledge-assistant / src /ingestion.py

Shubham170793

Update src/ingestion.py

ad0cd92 verified 2 months ago

raw

history blame

4.07 kB

	import re
	import fitz # PyMuPDF

	# -----------------------------
	# TEXT EXTRACTION (Robust)
	# -----------------------------
	def extract_text_from_pdf(file_path: str) -> str:
	    """
	    Extract the text of every page of a PDF with PyMuPDF and normalize it.

	    Each page is read with ``page.get_text("text")``. When that yields
	    nothing, the page's raw text blocks are joined as a fallback. No OCR is
	    performed, so pages without a text layer contribute an empty string.

	    Args:
	        file_path (str): Path to the PDF file.
	    Returns:
	        str: Text of all pages, with every run of whitespace collapsed to a
	        single space and leading/trailing whitespace removed.
	    Raises:
	        RuntimeError: If the file cannot be opened or read; the original
	        exception is chained as ``__cause__``.
	    """
	    page_texts = []
	    try:
	        with fitz.open(file_path) as pdf:
	            for page in pdf:
	                page_text = page.get_text("text").strip()
	                if not page_text:
	                    # Fallback: extract raw blocks (helps with weird PDFs).
	                    # get_text() is a Page method; calling it on the
	                    # Document raised AttributeError and aborted the run.
	                    blocks = page.get_text("blocks")
	                    # Index 4 of a block tuple holds the block's text.
	                    page_text = " ".join(
	                        block[4] for block in blocks if isinstance(block[4], str)
	                    )
	                page_texts.append(page_text)
	    except Exception as e:
	        # Keep the original traceback attached for debugging.
	        raise RuntimeError(f"❌ PDF extraction failed: {e}") from e

	    # Page separators are whitespace too, so they collapse with the rest.
	    return re.sub(r'\s+', ' ', "\n".join(page_texts)).strip()


	# -----------------------------
	# SMART CHUNKING (Step-Aware + Context Aware)
	# -----------------------------
	def _chunk_by_sentences(segment: str, chunk_size: int, overlap: int) -> list:
	    """
	    Pack the sentences of ``segment`` into chunks of about ``chunk_size`` chars.

	    Sentences are split after '.', '!' or '?' followed by whitespace and
	    appended to the current chunk while they fit. When a sentence does not
	    fit, the current chunk is emitted and the next one starts with the last
	    ``overlap`` characters of the emitted chunk followed by that sentence.

	    A chunk can exceed ``chunk_size`` when a single sentence is longer than
	    the limit or when the overlap prefix pushes it over.
	    """
	    chunks = []
	    current = ""
	    for sent in re.split(r'(?<=[.!?])\s+', segment):
	        # +1 accounts for the joining space.
	        if len(current) + len(sent) + 1 <= chunk_size:
	            current += " " + sent
	            continue
	        if current.strip():
	            chunks.append(current.strip())
	        # The overlap is a raw character tail, so it may begin mid-word.
	        overlap_part = current[-overlap:] if overlap > 0 else ""
	        current = overlap_part + " " + sent
	    if current.strip():
	        chunks.append(current.strip())
	    return chunks


	def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 80) -> list:
	    """
	    Splits text into overlapping, structured chunks.

	    Detects procedural steps (e.g., 'Step 1:', 'STEP 2.') and keeps each one
	    as its own chunk. A step longer than ``chunk_size`` is split further by
	    sentences. Text without step markers is chunked by sentences only.

	    Args:
	        text (str): Input text.
	        chunk_size (int): Target max characters per chunk (default: 1000).
	        overlap (int): Characters carried over from the previous chunk when
	            a sentence-based split occurs (default: 80). Values <= 0
	            disable the overlap.

	    Returns:
	        list[str]: Chunked text segments (empty list for blank input).
	    """
	    # Clean and normalize text
	    text = re.sub(r'\s+', ' ', text.strip())

	    # Split in front of every "Step <n>" marker. The \b keeps words that
	    # merely end in "step" (e.g. "footstep 1") from being treated as markers.
	    step_splits = re.split(r'(?=\bStep\s*\d+[:.\s])', text, flags=re.IGNORECASE)
	    step_splits = [s.strip() for s in step_splits if s.strip()]

	    # No visible step structure: fall back to sentence-based chunking.
	    if len(step_splits) <= 1:
	        return _chunk_by_sentences(text, chunk_size, overlap)

	    chunks = []
	    for step in step_splits:
	        if len(step) > chunk_size:
	            # Step is too long: split by sentences within that step.
	            chunks.extend(_chunk_by_sentences(step, chunk_size, overlap))
	        else:
	            chunks.append(step)
	    return chunks


	# -----------------------------
	# DEBUGGING (Manual Run)
	# -----------------------------
	if __name__ == "__main__":
	    # Manual smoke run: chunk a short procedural sample and print each piece.
	    demo_text = """
	Step 1: Open the application.
	Step 2: Navigate to the dashboard.
	Step 3: Review the summary and click ‘Export’.
	If the steps are missing, the function should still chunk by sentences.
	"""
	    # Small limits so the short sample still exercises the splitting logic.
	    demo_chunks = chunk_text(demo_text, chunk_size=100, overlap=20)
	    total = len(demo_chunks)
	    print(f"✅ Chunks created: {total}")
	    for number, piece in enumerate(demo_chunks, start=1):
	        print(f"\n--- Chunk {number} ---\n{piece}")