Spaces:

AdithyaVardan
/

GodSpeed

Sleeping

GodSpeed / src / confluence_agent / chunker.py

Ananth Shyam

feat: Implement file and JIRA agents with parsing, processing, and ingestion capabilities

9d8a0cf 23 days ago

6.2 kB

	from __future__ import annotations

	import hashlib
	import logging
	from typing import Optional

	from ingestion.models import DocumentChunk, RawDocument

	logger = logging.getLogger(__name__)

	try:
	from bs4 import BeautifulSoup, Tag

	_BS4_AVAILABLE = True
	except ImportError:
	_BS4_AVAILABLE = False
	logger.warning("confluence_chunker: beautifulsoup4 not installed; falling back to plain strip")

	_HEADING_TAGS = {"h1", "h2", "h3", "h4", "h5", "h6"}
	_MAX_CHUNK_CHARS = 4000


	def _build_breadcrumb(raw_doc: RawDocument) -> str:
	parts: list[str] = []
	space_key = raw_doc.metadata.get("space_key", "")
	if space_key:
	parts.append(space_key)
	for anc in raw_doc.metadata.get("ancestors", []):
	if anc:
	parts.append(anc)
	parts.append(raw_doc.title)
	return " > ".join(parts)


	def _format_table(table_tag) -> str:
	rows = []
	for tr in table_tag.find_all("tr"):
	cells = [td.get_text(" ", strip=True) for td in tr.find_all(["th", "td"])]
	rows.append(" \| ".join(cells))
	return "\n".join(rows)


	def _make_chunk(
	    raw_doc: RawDocument,
	    text: str,
	    heading: str,
	    breadcrumb: str,
	    chunk_index: int,
	    section_idx: int,
	    part_idx: int,
	    extra_meta: Optional[dict] = None,
	) -> DocumentChunk:
	    """Assemble one ``DocumentChunk`` for a section part of a Confluence page.

	    The chunk text is the stripped ``text`` preceded by a context header: the
	    breadcrumb in square brackets and, when ``heading`` is non-empty, a
	    ``Section:`` line. The chunk id is the SHA-256 hex digest of the page id
	    combined with ``section_idx`` and ``part_idx``.

	    Args:
	        raw_doc: Source page; supplies ``metadata``, ``title``, ``doc_id``,
	            ``source_url`` and ``team_id``.
	        text: Body text of the chunk.
	        heading: Section heading; an empty string omits the ``Section:`` line.
	        breadcrumb: Location trail placed at the top of the chunk text.
	        chunk_index: Value stored as the chunk's ``chunk_index``.
	        section_idx: Section number used in the chunk id.
	        part_idx: Part number within the section, used in the chunk id.
	        extra_meta: Optional entries merged over the default metadata.

	    Returns:
	        The populated ``DocumentChunk``.

	    Raises:
	        KeyError: If ``raw_doc.metadata`` has no ``"page_id"`` entry.
	    """
	    if heading:
	        header = f"[{breadcrumb}]\nSection: {heading}\n\n"
	    else:
	        header = f"[{breadcrumb}]\n\n"
	    body = header + text.strip()

	    # The id depends only on page id, section and part, not on chunk_index
	    # or on the text itself.
	    id_seed = f"confluence:{raw_doc.metadata['page_id']}:section_{section_idx}_part_{part_idx}"
	    digest = hashlib.sha256(id_seed.encode()).hexdigest()

	    page_meta = raw_doc.metadata
	    chunk_meta = {
	        "page_id": page_meta.get("page_id"),
	        "space_key": page_meta.get("space_key"),
	        "title": raw_doc.title,
	        "section_heading": heading,
	        "breadcrumb": breadcrumb,
	    }
	    if extra_meta:
	        # Caller-supplied entries win over the defaults above.
	        chunk_meta.update(extra_meta)

	    return DocumentChunk(
	        chunk_id=digest,
	        doc_id=raw_doc.doc_id,
	        text=body,
	        source=raw_doc.source_url,
	        source_type="confluence",
	        team_id=raw_doc.team_id,
	        chunk_index=chunk_index,
	        metadata=chunk_meta,
	    )


	def _split_long_text(text: str, max_chars: int = _MAX_CHUNK_CHARS) -> list[str]:
	"""Split text into parts of at most max_chars, breaking on word boundaries."""
	if len(text) <= max_chars:
	return [text]
	parts = []
	while len(text) > max_chars:
	cut = text.rfind(" ", 0, max_chars)
	if cut == -1:
	cut = max_chars
	parts.append(text[:cut].strip())
	text = text[cut:].strip()
	if text:
	parts.append(text)
	return parts


	def chunk_confluence_page(raw_doc: RawDocument) -> list[DocumentChunk]:
	    """
	    Parse Confluence Storage Format HTML using BeautifulSoup.
	    Produces heading-split section chunks + one chunk per table.
	    Falls back to single-chunk plain strip if BS4 is unavailable.

	    Tables are rendered with ``_format_table`` and removed from the DOM. The
	    remaining top-level nodes are grouped into sections that start at each
	    ``h1``-``h6`` tag, and each section is cut into parts by
	    ``_split_long_text``. Only top-level headings open a section; a section
	    without any text produces no chunk.

	    Args:
	        raw_doc: Page to chunk; ``content`` holds the HTML.

	    Returns:
	        Text chunks in document order followed by the table chunks, or an
	        empty list when the page has no content.
	    """
	    html = raw_doc.content
	    # Checked before the BS4 switch so that missing content (None) never
	    # reaches the regex-based fallback, which cannot handle it.
	    if not html or not html.strip():
	        return []

	    if not _BS4_AVAILABLE:
	        return _fallback_single_chunk(raw_doc)

	    # Imported here because bs4 is optional at module import time; this line
	    # is only reached when the module-level import succeeded.
	    from bs4 import Comment, Declaration, Doctype, ProcessingInstruction

	    # String node types that are markup bookkeeping rather than page text.
	    non_content_strings = (Comment, Declaration, Doctype, ProcessingInstruction)

	    soup = BeautifulSoup(html, "html.parser")
	    breadcrumb = _build_breadcrumb(raw_doc)
	    chunks: list[DocumentChunk] = []
	    chunk_index = 0
	    section_idx = 0

	    # Extract tables first and remove from DOM so they don't bleed into text chunks
	    # NOTE(review): table chunks therefore receive the lowest chunk_index and
	    # section_idx values although they are appended after the text chunks, so
	    # list order and chunk_index order differ on pages that contain tables.
	    tables = soup.find_all("table")
	    table_chunks = []
	    for t_idx, table in enumerate(tables):
	        table_text = _format_table(table)
	        table.decompose()
	        if table_text.strip():
	            tbl_chunk = _make_chunk(
	                raw_doc, table_text, "Table", breadcrumb, chunk_index, section_idx, 0,
	                extra_meta={"type": "table", "table_index": t_idx},
	            )
	            table_chunks.append(tbl_chunk)
	            chunk_index += 1
	            section_idx += 1

	    # Walk top-level elements, break on heading tags
	    current_heading = ""
	    current_text_parts: list[str] = []

	    def _flush(heading: str, parts: list[str], s_idx: int) -> int:
	        """Emit chunks for one section and return the next section index."""
	        nonlocal chunk_index
	        text = " ".join(parts).strip()
	        if not text:
	            # Nothing to emit: the section index is not consumed.
	            return s_idx
	        for p_idx, segment in enumerate(_split_long_text(text)):
	            c = _make_chunk(raw_doc, segment, heading, breadcrumb, chunk_index, s_idx, p_idx)
	            chunks.append(c)
	            chunk_index += 1
	        return s_idx + 1

	    for element in soup.body.children if soup.body else soup.children:
	        if isinstance(element, non_content_strings):
	            # Comments, doctypes and similar nodes are not page text;
	            # get_text() already ignores them inside nested elements.
	            continue
	        if not hasattr(element, "name") or element.name is None:
	            # NavigableString
	            t = str(element).strip()
	            if t:
	                current_text_parts.append(t)
	            continue
	        if element.name in _HEADING_TAGS:
	            # A heading closes the running section and opens the next one.
	            section_idx = _flush(current_heading, current_text_parts, section_idx)
	            current_heading = element.get_text(" ", strip=True)
	            current_text_parts = []
	        else:
	            t = element.get_text(" ", strip=True)
	            if t:
	                current_text_parts.append(t)

	    # Emit whatever follows the last heading (or the whole page if none).
	    section_idx = _flush(current_heading, current_text_parts, section_idx)

	    # Append table chunks after text chunks
	    chunks.extend(table_chunks)

	    logger.debug("confluence_chunker: %s -> %d chunks", raw_doc.doc_id, len(chunks))
	    return chunks


	def _fallback_single_chunk(raw_doc: RawDocument) -> list[DocumentChunk]:
	import re
	text = re.sub(r"<[^>]+>", " ", raw_doc.content)
	text = re.sub(r"\s{2,}", " ", text).strip()
	if not text:
	return []
	breadcrumb = _build_breadcrumb(raw_doc)
	chunk_id = hashlib.sha256(
	f"confluence:{raw_doc.metadata['page_id']}:section_0_part_0".encode()
	).hexdigest()
	return [
	DocumentChunk(
	chunk_id=chunk_id,
	doc_id=raw_doc.doc_id,
	text=f"[{breadcrumb}]\n\n{text}",
	source=raw_doc.source_url,
	source_type="confluence",
	team_id=raw_doc.team_id,
	chunk_index=0,
	metadata={
	"page_id": raw_doc.metadata.get("page_id"),
	"space_key": raw_doc.metadata.get("space_key"),
	"title": raw_doc.title,
	"breadcrumb": breadcrumb,
	},
	)
	]