Spaces:

stellar413
/

masterllm

Sleeping

App Files Files Community

masterllm / services /agents /extract_text_agent.py

stellar413

Added fixed agent to agent communication

6df13ef 29 days ago

raw

history blame contribute delete

2.03 kB

	# services/agents/extract_text_agent.py
	"""
	Text Extraction Agent - Wraps utilities/extract_text.py
	"""
	from typing import Dict, Any
	from services.agents.base_agent import BaseUtilityAgent
	from utilities.extract_text import extract_text_remote


	class ExtractTextAgent(BaseUtilityAgent):
	    """
	    Agent that validates text extracted from documents.

	    Registers ``extract_text_remote`` as the utility function of
	    ``BaseUtilityAgent`` and supplies the prompt that asks for a quality
	    assessment and confidence score. How the base class invokes the
	    utility and consumes the prompt is defined in ``BaseUtilityAgent``.
	    """

	    def __init__(self):
	        """Configure the agent's identity and bind the extraction utility."""
	        super().__init__(
	            name="extract_text",
	            role="Text Extraction Specialist",
	            goal="Extract all text content from documents with maximum accuracy and completeness",
	            backstory="""You are an expert in optical character recognition (OCR) and PDF text extraction.
	You have processed millions of documents and can identify text quality issues, encoding problems,
	and extraction artifacts. You validate extraction results for completeness and accuracy.""",
	            utility_function=extract_text_remote
	        )

	    def _prepare_task_description(self, input_data: Dict[str, Any]) -> str:
	        """
	        Build the validation prompt for one extraction request.

	        Args:
	            input_data: Request parameters. Reads the optional keys
	                ``filename`` (default ``"document"``), ``start_page``
	                (default ``1``) and ``end_page`` (default: the resolved
	                ``start_page``). A value of ``None`` is treated the same
	                as a missing key for the two page fields.

	        Returns:
	            The prompt text naming the file and page (or page range) and
	            listing the criteria for the confidence score.
	        """
	        filename = input_data.get("filename", "document")

	        start_page = input_data.get("start_page")
	        if start_page is None:
	            start_page = 1

	        # Fall back to start_page rather than a constant 1: a request that
	        # only names a start page describes a single page, not "pages 5-1".
	        end_page = input_data.get("end_page")
	        if end_page is None:
	            end_page = start_page

	        if start_page == end_page:
	            page_desc = f"page {start_page}"
	        else:
	            page_desc = f"pages {start_page}-{end_page}"

	        return f"""Validate the text extraction from {filename} ({page_desc}).

	Assess the extraction quality and provide a confidence score (0.0-1.0) based on:
	- Completeness: Is all text likely captured?
	- Accuracy: Are there obvious OCR errors or artifacts?
	- Encoding: Is the text properly decoded?
	- Structure: Is formatting preserved where appropriate?

	Return your assessment with a confidence score."""