# masterllm/services/agents/extract_text_agent.py
# stellar413's picture
# Added fixed agent-to-agent communication
# 6df13ef
# services/agents/extract_text_agent.py
"""
Text Extraction Agent - Wraps utilities/extract_text.py
"""
from typing import Dict, Any
from services.agents.base_agent import BaseUtilityAgent
from utilities.extract_text import extract_text_remote
class ExtractTextAgent(BaseUtilityAgent):
    """
    Agent persona for validating text extracted from documents.

    Hands ``extract_text_remote`` to ``BaseUtilityAgent`` as the utility
    function, together with a fixed name, role, goal and backstory, and
    builds the prompt that asks for a quality assessment and a confidence
    score for an extraction.
    """

    def __init__(self):
        """Initialise the base agent with the text-extraction persona."""
        # Kept as a single literal so the persona text stays exactly as
        # authored, line breaks included.
        backstory = """You are an expert in optical character recognition (OCR) and PDF text extraction.
You have processed millions of documents and can identify text quality issues, encoding problems,
and extraction artifacts. You validate extraction results for completeness and accuracy."""

        super().__init__(
            name="extract_text",
            role="Text Extraction Specialist",
            goal="Extract all text content from documents with maximum accuracy and completeness",
            backstory=backstory,
            utility_function=extract_text_remote,
        )

    def _prepare_task_description(self, input_data: Dict[str, Any]) -> str:
        """
        Build the validation prompt for one extraction request.

        Args:
            input_data: Request mapping. Reads ``"filename"`` (defaults to
                ``"document"``), ``"start_page"`` and ``"end_page"``
                (each defaults to ``1``).

        Returns:
            The prompt text, naming the file and either a single page
            (when both page values are equal) or a page range.
        """
        filename = input_data.get("filename", "document")
        start_page = input_data.get("start_page", 1)
        end_page = input_data.get("end_page", 1)

        # Equal bounds read better as a single page than as a range "N-N".
        page_desc = (
            f"page {start_page}"
            if start_page == end_page
            else f"pages {start_page}-{end_page}"
        )

        return f"""Validate the text extraction from {filename} ({page_desc}).
Assess the extraction quality and provide a confidence score (0.0-1.0) based on:
- Completeness: Is all text likely captured?
- Accuracy: Are there obvious OCR errors or artifacts?
- Encoding: Is the text properly decoded?
- Structure: Is formatting preserved where appropriate?
Return your assessment with a confidence score."""