Spaces:

Shubham170793
/

enterprise-knowledge-assistant

Sleeping

App Files Files Community

enterprise-knowledge-assistant / src /ingestion.py

Shubham170793

Update src/ingestion.py

6b0c8b8 verified 3 months ago

raw

history blame

2.98 kB

	import re
	import fitz # PyMuPDF

	# -----------------------------
	# TEXT EXTRACTION (Robust)
	# -----------------------------
	def extract_text_from_pdf(file_path: str) -> str:
	    """
	    Extract the text of a PDF with PyMuPDF and collapse it to a single line.

	    Each page is read with ``page.get_text("text")``. When that yields
	    nothing, the page's text blocks are joined instead. Image blocks are
	    skipped so that their metadata placeholder never leaks into the output.
	    No OCR is performed: a purely scanned page contributes no text.

	    Args:
	        file_path (str): Path to the PDF file.

	    Returns:
	        str: Text of all pages with every run of whitespace collapsed to a
	        single space; empty when nothing could be extracted.

	    Raises:
	        RuntimeError: If opening or reading the PDF fails. The underlying
	        exception is chained as ``__cause__``.
	    """
	    page_texts = []
	    try:
	        with fitz.open(file_path) as pdf:
	            for page in pdf:
	                page_text = page.get_text("text").strip()
	                if not page_text:
	                    # Fallback: rebuild the page from raw blocks (helps with
	                    # weird PDFs). Block tuples are
	                    # (x0, y0, x1, y1, text, block_no, block_type); only
	                    # type 0 is real text. Image blocks (type 1) carry an
	                    # "<image: ...>" description in the text slot, which
	                    # must not be indexed as document content.
	                    page_text = " ".join(
	                        block[4]
	                        for block in page.get_text("blocks")
	                        if block[6] == 0 and isinstance(block[4], str)
	                    )
	                page_texts.append(page_text)
	    except Exception as e:
	        # Keep the RuntimeError contract callers rely on, but chain the cause
	        # so the original traceback is not lost.
	        raise RuntimeError(f"❌ PDF extraction failed: {e}") from e

	    # Join once (instead of repeated +=) and normalise all whitespace.
	    return re.sub(r'\s+', ' ', "\n".join(page_texts)).strip()


	# -----------------------------
	# SMART CHUNKING (Context Aware)
	# -----------------------------
	def chunk_text(text: str, chunk_size: int = 800, overlap: int = 150) -> list:
	    """
	    Split text into overlapping, sentence-based chunks of bounded size.

	    Whitespace is normalised, the text is split after ``.``, ``!`` or ``?``,
	    and sentences are packed greedily into chunks. When a chunk is full, the
	    next one starts with up to ``overlap`` trailing characters of the
	    previous chunk, trimmed to begin at a word boundary. The overlap is
	    shortened whenever it would push the new chunk past ``chunk_size``, and
	    a sentence longer than ``chunk_size`` is split at word boundaries (or
	    hard-cut if a single word is too long), so no chunk exceeds the limit.

	    Args:
	        text (str): Input text.
	        chunk_size (int): Max characters per chunk (default: 800).
	        overlap (int): Max overlapping characters carried into the next
	            chunk (default: 150). Values <= 0 disable the overlap.

	    Returns:
	        list[str]: Chunked text segments; empty for blank input.

	    Raises:
	        ValueError: If ``chunk_size`` is smaller than 1.
	    """
	    if chunk_size < 1:
	        raise ValueError("chunk_size must be a positive integer")

	    # Clean text once; afterwards every separator is a single space.
	    text = re.sub(r'\s+', ' ', text).strip()
	    if not text:
	        return []

	    overlap = max(overlap, 0)
	    chunks, current = [], ""

	    # Sentence segmentation (simple rule-based, fast)
	    for sentence in re.split(r'(?<=[.!?])\s+', text):
	        for piece in _split_oversized(sentence, chunk_size):
	            candidate = f"{current} {piece}" if current else piece
	            if len(candidate) <= chunk_size:
	                current = candidate
	                continue

	            # `current` is non-empty here: a lone piece always fits.
	            chunks.append(current)

	            # Only carry as much overlap as still leaves room for the piece
	            # plus the joining space.
	            room = chunk_size - len(piece) - 1
	            tail = _overlap_tail(current, min(overlap, room))
	            current = f"{tail} {piece}" if tail else piece

	    # Append the last chunk
	    if current:
	        chunks.append(current)

	    return chunks


	def _split_oversized(sentence: str, limit: int) -> list:
	    """Break a sentence into pieces of at most ``limit`` characters."""
	    if len(sentence) <= limit:
	        return [sentence]

	    pieces, current = [], ""
	    for word in sentence.split(" "):
	        # A single word longer than the limit has to be cut hard.
	        while len(word) > limit:
	            if current:
	                pieces.append(current)
	                current = ""
	            pieces.append(word[:limit])
	            word = word[limit:]
	        if not word:
	            continue
	        candidate = f"{current} {word}" if current else word
	        if len(candidate) <= limit:
	            current = candidate
	        else:
	            pieces.append(current)
	            current = word
	    if current:
	        pieces.append(current)
	    return pieces


	def _overlap_tail(chunk: str, limit: int) -> str:
	    """Return at most ``limit`` trailing characters of ``chunk``, starting on a word."""
	    if limit <= 0:
	        return ""
	    if len(chunk) <= limit:
	        return chunk
	    tail = chunk[-limit:]
	    if chunk[-limit - 1] != " ":
	        # The cut landed inside a word: drop that partial word. If the tail
	        # holds no space at all, nothing usable remains.
	        tail = tail.partition(" ")[2]
	    return tail.strip()


	# -----------------------------
	# DEBUGGING (Manual Run)
	# -----------------------------
	if __name__ == "__main__":
	    # Manual smoke run: chunk a short paragraph with a deliberately small
	    # window and print every resulting chunk together with its length.
	    demo_text = """
	Artificial Intelligence is transforming industries.
	Machine learning is a key subfield, driving automation and predictive analytics.
	Neural networks power most modern AI applications today.
	This technology is reshaping healthcare, finance, and manufacturing.
	"""
	    pieces = chunk_text(demo_text, chunk_size=80, overlap=20)
	    total = len(pieces)
	    print(f"✅ Chunks created: {total}")
	    for number, piece in enumerate(pieces, start=1):
	        size = len(piece)
	        print(f"\n--- Chunk {number} ({size} chars) ---\n{piece}")