| | """ |
| | Agent Adapter for Document Intelligence |
| | |
| | Bridges the DocumentAgent with the new document_intelligence subsystem. |
| | Provides enhanced tools and capabilities. |
| | """ |
| |
|
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union

from .chunks.models import (
    DocumentChunk,
    EvidenceRef,
    ParseResult,
    ExtractionResult,
    ClassificationResult,
    DocumentType,
)
from .parsing import DocumentParser, ParserConfig
from .extraction import (
    ExtractionSchema,
    FieldExtractor,
    ExtractionConfig,
    ExtractionValidator,
)
from .grounding import EvidenceBuilder, EvidenceTracker, CropManager
from .tools import get_tool, list_tools, ToolResult
| |
|
| | logger = logging.getLogger(__name__) |
| |
|
| |
|
@dataclass
class AgentConfig:
    """Configuration for the document agent adapter.

    Groups parsing, extraction, grounding, and agent-loop settings into a
    single object consumed by DocumentIntelligenceAdapter.
    """

    # --- Parsing ---
    render_dpi: int = 200                 # DPI used when rasterizing pages
    max_pages: Optional[int] = None       # cap on pages parsed (None = all)
    # Use default_factory so each instance gets its own list (the original
    # `List[str] = None` mis-typed the field and leaned on __post_init__).
    ocr_languages: List[str] = field(default_factory=lambda: ["en"])

    # --- Extraction ---
    min_confidence: float = 0.5           # fields below this are low-confidence
    abstain_on_low_confidence: bool = True

    # --- Grounding / evidence crops ---
    enable_crops: bool = True
    crop_output_dir: Optional[Path] = None  # crops persisted only when set

    # --- Agent loop ---
    max_iterations: int = 10
    verbose: bool = False

    def __post_init__(self):
        # Preserve the original contract: an explicit ocr_languages=None
        # still falls back to English.
        if self.ocr_languages is None:
            self.ocr_languages = ["en"]
| |
|
| |
|
class DocumentIntelligenceAdapter:
    """
    Adapter connecting DocumentAgent with the document_intelligence subsystem.

    Provides:
    - Document loading and parsing
    - Schema-driven extraction
    - Evidence-grounded results
    - Tool execution

    The adapter is stateful: load_document() stores the parse result and
    (optionally) rendered page images; extraction, Q&A, and search operate
    on that current document.
    """

    def __init__(
        self,
        config: Optional[AgentConfig] = None,
        llm_client: Optional[Any] = None,
    ):
        """
        Args:
            config: Adapter configuration; defaults to AgentConfig().
            llm_client: Optional LLM client forwarded to tools that need it.
        """
        self.config = config or AgentConfig()
        self.llm_client = llm_client

        # Parsing pipeline configured from the adapter config.
        self.parser = DocumentParser(
            config=ParserConfig(
                render_dpi=self.config.render_dpi,
                max_pages=self.config.max_pages,
                ocr_languages=self.config.ocr_languages,
            )
        )

        self.extractor = FieldExtractor(
            config=ExtractionConfig(
                min_field_confidence=self.config.min_confidence,
                abstain_on_low_confidence=self.config.abstain_on_low_confidence,
            )
        )

        self.validator = ExtractionValidator(
            min_confidence=self.config.min_confidence,
        )

        self.evidence_builder = EvidenceBuilder()

        # Crops are persisted only when enabled AND an output dir is given.
        if self.config.enable_crops and self.config.crop_output_dir:
            self.crop_manager = CropManager(self.config.crop_output_dir)
        else:
            self.crop_manager = None

        # Per-document state, (re)populated by load_document().
        self._current_parse_result: Optional[ParseResult] = None
        self._page_images: Dict[int, Any] = {}

        logger.info("Initialized DocumentIntelligenceAdapter")

    def load_document(
        self,
        path: Union[str, Path],
        render_pages: bool = True,
    ) -> ParseResult:
        """
        Load and parse a document, replacing any previously loaded one.

        Args:
            path: Path to document file
            render_pages: Whether to keep rendered page images

        Returns:
            ParseResult with chunks and metadata
        """
        path = Path(path)
        logger.info(f"Loading document: {path}")

        # Reset per-document state so stale page images from a previously
        # loaded document are never served for the new one.
        self._page_images = {}

        self._current_parse_result = self.parser.parse(path)

        if render_pages:
            from .io import load_document, RenderOptions
            loader, renderer = load_document(path)
            try:
                for page_num in range(1, self._current_parse_result.num_pages + 1):
                    self._page_images[page_num] = renderer.render_page(
                        page_num,
                        RenderOptions(dpi=self.config.render_dpi),
                    )
            finally:
                # Release the loader even if rendering a page raises.
                loader.close()

        return self._current_parse_result

    def extract_fields(
        self,
        schema: Union[ExtractionSchema, Dict[str, Any]],
        validate: bool = True,
    ) -> ExtractionResult:
        """
        Extract fields from the loaded document.

        Args:
            schema: Extraction schema (or a JSON-schema-style dict)
            validate: Whether to validate results

        Returns:
            ExtractionResult with values and evidence

        Raises:
            RuntimeError: If no document has been loaded.
        """
        if not self._current_parse_result:
            raise RuntimeError("No document loaded. Call load_document() first.")

        # Accept plain dicts by converting them to an ExtractionSchema.
        if isinstance(schema, dict):
            schema = ExtractionSchema.from_json_schema(schema)

        result = self.extractor.extract(self._current_parse_result, schema)

        if validate:
            validation = self.validator.validate(result, schema)
            if not validation.is_valid:
                logger.warning(f"Extraction validation failed: {validation.error_count} errors")
                # Surface issues on the result so callers can inspect them.
                result.metadata = result.metadata or {}
                result.metadata["validation_issues"] = [
                    {"field": i.field_name, "type": i.issue_type, "message": i.message}
                    for i in validation.issues
                ]

        return result

    def answer_question(
        self,
        question: str,
        use_llm: bool = True,
    ) -> Tuple[str, List[EvidenceRef], float]:
        """
        Answer a question about the document.

        Args:
            question: Question to answer
            use_llm: Whether to use LLM for generation

        Returns:
            Tuple of (answer, evidence, confidence); on tool failure the
            answer is an "Error: ..." string with empty evidence and 0.0.

        Raises:
            RuntimeError: If no document has been loaded.
        """
        if not self._current_parse_result:
            raise RuntimeError("No document loaded")

        tool = get_tool("answer_question", llm_client=self.llm_client)
        result = tool.execute(
            parse_result=self._current_parse_result,
            question=question,
            use_rag=False,
        )

        if not result.success:
            return f"Error: {result.error}", [], 0.0

        data = result.data
        answer = data.get("answer", "")
        confidence = data.get("confidence", 0.5)

        # Convert the tool's evidence dicts into typed EvidenceRef objects.
        # Import hoisted out of the loop (was re-executed per evidence item).
        from .chunks.models import BoundingBox
        evidence = []
        for ev_dict in result.evidence:
            evidence.append(EvidenceRef(
                chunk_id=ev_dict["chunk_id"],
                doc_id=self._current_parse_result.doc_id,
                page=ev_dict["page"],
                bbox=BoundingBox(
                    x_min=ev_dict["bbox"][0],
                    y_min=ev_dict["bbox"][1],
                    x_max=ev_dict["bbox"][2],
                    y_max=ev_dict["bbox"][3],
                    normalized=True,
                ),
                source_type="text",
                snippet=ev_dict.get("snippet", ""),
                confidence=confidence,
            ))

        return answer, evidence, confidence

    def search_chunks(
        self,
        query: str,
        chunk_types: Optional[List[str]] = None,
        top_k: int = 10,
    ) -> List[Dict[str, Any]]:
        """
        Search for chunks matching a query.

        Args:
            query: Search query
            chunk_types: Optional chunk type filter
            top_k: Maximum results

        Returns:
            List of matching chunks with scores (empty on tool failure).

        Raises:
            RuntimeError: If no document has been loaded.
        """
        if not self._current_parse_result:
            raise RuntimeError("No document loaded")

        tool = get_tool("search_chunks")
        result = tool.execute(
            parse_result=self._current_parse_result,
            query=query,
            chunk_types=chunk_types,
            top_k=top_k,
        )

        if not result.success:
            return []

        return result.data.get("results", [])

    def get_chunk(self, chunk_id: str) -> Optional[DocumentChunk]:
        """Get a chunk by ID, or None if absent / no document loaded."""
        if not self._current_parse_result:
            return None

        # Linear scan; chunk counts per document are expected to be small.
        for chunk in self._current_parse_result.chunks:
            if chunk.chunk_id == chunk_id:
                return chunk
        return None

    def get_page_image(self, page: int) -> Optional[Any]:
        """Get the rendered image for a page, or None if not rendered."""
        return self._page_images.get(page)

    def crop_chunk(
        self,
        chunk: DocumentChunk,
        padding_percent: float = 0.02,
    ) -> Optional[Any]:
        """Crop the region of a chunk from its page (None if page not rendered)."""
        page_image = self.get_page_image(chunk.page)
        if page_image is None:
            return None

        from .grounding import crop_region
        return crop_region(page_image, chunk.bbox, padding_percent)

    def get_tools_description(self) -> str:
        """Get a newline-separated description of available tools for prompts."""
        tools = list_tools()
        lines = []
        for tool in tools:
            lines.append(f"- {tool['name']}: {tool['description']}")
        return "\n".join(lines)

    def execute_tool(
        self,
        tool_name: str,
        **kwargs
    ) -> ToolResult:
        """
        Execute a document tool.

        Args:
            tool_name: Name of tool to execute
            **kwargs: Tool arguments

        Returns:
            ToolResult
        """
        # Inject the current parse result unless the caller supplied one.
        if "parse_result" not in kwargs and self._current_parse_result:
            kwargs["parse_result"] = self._current_parse_result

        tool = get_tool(tool_name, llm_client=self.llm_client)
        return tool.execute(**kwargs)

    @property
    def parse_result(self) -> Optional[ParseResult]:
        """Get current parse result."""
        return self._current_parse_result

    @property
    def document_id(self) -> Optional[str]:
        """Get current document ID, or None if nothing is loaded."""
        if self._current_parse_result:
            return self._current_parse_result.doc_id
        return None
| |
|
| |
|
def create_enhanced_document_agent(
    llm_client: Any,
    config: Optional[AgentConfig] = None,
) -> "EnhancedDocumentAgent":
    """
    Build an EnhancedDocumentAgent wired to the document_intelligence stack.

    Args:
        llm_client: LLM client used for reasoning.
        config: Optional agent configuration; defaults are applied by the agent.

    Returns:
        A ready-to-use EnhancedDocumentAgent instance.
    """
    agent = EnhancedDocumentAgent(config=config, llm_client=llm_client)
    return agent
| |
|
| |
|
class EnhancedDocumentAgent:
    """
    Enhanced DocumentAgent using the document_intelligence subsystem.

    Extends the ReAct-style agent with:
    - Better parsing and chunking
    - Schema-driven extraction
    - Visual grounding
    - Evidence tracking

    Thin async facade over DocumentIntelligenceAdapter; the adapter holds
    all per-document state.
    """

    def __init__(
        self,
        llm_client: Any,
        config: Optional[AgentConfig] = None,
    ):
        """
        Args:
            llm_client: LLM client for reasoning.
            config: Optional agent configuration; defaults to AgentConfig().
        """
        self.adapter = DocumentIntelligenceAdapter(
            config=config,
            llm_client=llm_client,
        )
        self.llm_client = llm_client
        self.config = config or AgentConfig()

    async def load_document(self, path: Union[str, Path]) -> ParseResult:
        """Load a document (with page rendering) for processing."""
        return self.adapter.load_document(path, render_pages=True)

    async def extract_fields(
        self,
        schema: Union[ExtractionSchema, Dict],
    ) -> ExtractionResult:
        """Extract and validate fields using the given schema."""
        return self.adapter.extract_fields(schema, validate=True)

    async def answer_question(
        self,
        question: str,
    ) -> Tuple[str, List[EvidenceRef]]:
        """Answer a question about the document (confidence is dropped here)."""
        answer, evidence, _confidence = self.adapter.answer_question(question)
        return answer, evidence

    async def classify(self) -> ClassificationResult:
        """
        Classify the document type using a keyword heuristic.

        Scans the first few chunks of page 1 and picks the type whose
        keywords match the most. (The previous version stopped at the first
        type with ANY match, biasing results toward types listed earlier
        in the keyword table.)

        Returns:
            ClassificationResult with the best-matching type, or OTHER at
            confidence 0.5 when nothing matches.

        Raises:
            RuntimeError: If no document has been loaded.
        """
        if not self.adapter.parse_result:
            raise RuntimeError("No document loaded")

        # Only the start of the document is sampled: first 5 chunks of page 1.
        first_page_chunks = [
            c for c in self.adapter.parse_result.chunks
            if c.page == 1
        ][:5]

        # Guard against chunks whose text is None so classification never crashes.
        content = " ".join((c.text or "")[:200] for c in first_page_chunks)

        type_keywords = {
            DocumentType.INVOICE: ["invoice", "bill", "payment due", "amount due"],
            DocumentType.CONTRACT: ["agreement", "contract", "party", "whereas"],
            DocumentType.RECEIPT: ["receipt", "paid", "transaction", "thank you"],
            DocumentType.FORM: ["form", "fill in", "checkbox", "signature line"],
            DocumentType.LETTER: ["dear", "sincerely", "regards"],
            DocumentType.REPORT: ["report", "findings", "conclusion", "summary"],
            DocumentType.PATENT: ["patent", "claims", "invention", "embodiment"],
        }

        # Score every type and keep the best, instead of breaking on the
        # first type with a nonzero match count.
        content_lower = content.lower()
        doc_type = DocumentType.OTHER
        best_matches = 0
        for dtype, keywords in type_keywords.items():
            matches = sum(1 for k in keywords if k in content_lower)
            if matches > best_matches:
                best_matches = matches
                doc_type = dtype

        # 0.5 base + 0.15 per keyword hit, capped at 0.9; 0.5 when no hits.
        if best_matches > 0:
            confidence = min(0.9, 0.5 + best_matches * 0.15)
        else:
            confidence = 0.5

        return ClassificationResult(
            doc_id=self.adapter.document_id,
            document_type=doc_type,
            confidence=confidence,
            secondary_types=[],
        )

    def search(
        self,
        query: str,
        top_k: int = 10,
    ) -> List[Dict[str, Any]]:
        """Search document content, returning up to top_k scored chunks."""
        return self.adapter.search_chunks(query, top_k=top_k)

    @property
    def current_document(self) -> Optional[ParseResult]:
        """The currently loaded ParseResult, or None."""
        return self.adapter.parse_result
| |
|