Spaces:
Sleeping
Sleeping
| # services/agents/extract_text_agent.py | |
| """ | |
| Text Extraction Agent - Wraps utilities/extract_text.py | |
| """ | |
| from typing import Dict, Any | |
| from services.agents.base_agent import BaseUtilityAgent | |
| from utilities.extract_text import extract_text_remote | |
class ExtractTextAgent(BaseUtilityAgent):
    """
    Autonomous agent for text extraction from documents.

    Wraps the existing extract_text_remote utility while adding
    AI-powered validation and confidence scoring.
    """

    def __init__(self):
        """Register the agent's persona and bind it to ``extract_text_remote``."""
        super().__init__(
            name="extract_text",
            role="Text Extraction Specialist",
            goal="Extract all text content from documents with maximum accuracy and completeness",
            # Adjacent literals are joined with explicit newlines so the
            # persona text does not depend on source-code indentation.
            backstory=(
                "You are an expert in optical character recognition (OCR) and PDF text extraction.\n"
                "You have processed millions of documents and can identify text quality issues, encoding problems,\n"
                "and extraction artifacts. You validate extraction results for completeness and accuracy."
            ),
            utility_function=extract_text_remote,
        )

    def _prepare_task_description(self, input_data: Dict[str, Any]) -> str:
        """Build the validation prompt for one extraction request.

        Args:
            input_data: Request parameters. The optional keys read here are
                ``"filename"`` (defaults to ``"document"``), ``"start_page"``
                (defaults to ``1``) and ``"end_page"`` (defaults to the
                start page).

        Returns:
            A multi-line prompt naming the file and page range and listing
            the criteria for the 0.0-1.0 confidence score.
        """
        filename = input_data.get("filename", "document")
        start_page = input_data.get("start_page", 1)
        # Fall back to the start page rather than a fixed 1: a request that
        # names only start_page=5 means "page 5", not the inverted "pages 5-1".
        end_page = input_data.get("end_page", start_page)

        if start_page == end_page:
            page_desc = f"page {start_page}"
        else:
            page_desc = f"pages {start_page}-{end_page}"

        return (
            f"Validate the text extraction from {filename} ({page_desc}).\n"
            "Assess the extraction quality and provide a confidence score (0.0-1.0) based on:\n"
            "- Completeness: Is all text likely captured?\n"
            "- Accuracy: Are there obvious OCR errors or artifacts?\n"
            "- Encoding: Is the text properly decoded?\n"
            "- Structure: Is formatting preserved where appropriate?\n"
            "Return your assessment with a confidence score."
        )