"""
HF Agent client with proper environment variable support.

Synchronous helpers that POST files or text to the configured agent
endpoints (text/table extraction, NER, classification and optional
summarization).  Every public helper reports its outcome as a
``(success, payload, error)`` tuple instead of raising.
"""
import httpx
import os
import time
import logging
from typing import Dict, Optional, Tuple
from pathlib import Path

logger = logging.getLogger(__name__)

# Fallback used when AGENT_TIMEOUT_SECONDS is unset or not a valid integer.
_DEFAULT_TIMEOUT_SECONDS = 30


def _read_timeout_seconds() -> int:
    """Read AGENT_TIMEOUT_SECONDS, falling back to the default if malformed.

    A typo in the environment (e.g. ``"30s"``) would otherwise raise
    ``ValueError`` while the module is being imported.
    """
    raw = os.getenv('AGENT_TIMEOUT_SECONDS', str(_DEFAULT_TIMEOUT_SECONDS))
    try:
        return int(raw)
    except ValueError:
        logger.warning(
            "Invalid AGENT_TIMEOUT_SECONDS=%r; using %ds",
            raw,
            _DEFAULT_TIMEOUT_SECONDS,
        )
        return _DEFAULT_TIMEOUT_SECONDS


# Load from environment
TEXT_EXTRACTOR_URL = os.getenv('TEXT_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/text')
TABLE_EXTRACTOR_URL = os.getenv('TABLE_EXTRACTOR_URL', 'https://point9-extract-text-and-table.hf.space/api/tables')
NER_URL = os.getenv('NER_URL', 'https://point9-ner.hf.space/api/ner')
CLASSIFY_URL = os.getenv('CLASSIFY_URL', 'https://point9-classify.hf.space/api/classify')
SUMMARIZER_URL = os.getenv('SUMMARIZER_URL', '')  # Optional
AGENT_BEARER_TOKEN = os.getenv('AGENT_BEARER_TOKEN', '')
AGENT_TIMEOUT_SECONDS = _read_timeout_seconds()


def _error_text(exc: BaseException) -> str:
    """Return a non-empty description of *exc*.

    ``str(exc)`` is empty for exceptions raised without a message; the class
    name is used in that case so callers never receive a blank error.
    """
    return str(exc) or type(exc).__name__


def get_headers() -> Dict[str, str]:
    """Get headers with optional bearer token.

    Returns:
        A dict containing an ``Authorization: Bearer ...`` entry when
        ``AGENT_BEARER_TOKEN`` is non-empty, otherwise an empty dict.
    """
    headers: Dict[str, str] = {}
    if AGENT_BEARER_TOKEN:
        headers['Authorization'] = f'Bearer {AGENT_BEARER_TOKEN}'
    return headers


def call_agent_with_retry(
    url: str,
    files: Optional[Dict] = None,
    data: Optional[Dict] = None,
    json: Optional[Dict] = None,
    max_retries: int = 1,
) -> Tuple[bool, Optional[Dict], Optional[str]]:
    """Call agent with retry logic.

    The request is sent as a multipart upload when ``files`` is non-empty,
    as a JSON body when ``json`` is provided, and as form data otherwise.

    Args:
        url: Endpoint to POST to.
        files: Multipart file mapping passed to ``httpx`` (sent with ``data``).
        data: Form fields.
        json: JSON payload; used only when ``files`` is empty.
        max_retries: Number of additional attempts after the first one.

    Returns:
        ``(True, parsed_json, None)`` on HTTP 200, otherwise
        ``(False, None, error_message)``.

    Retry policy: HTTP 429 waits 2 seconds, timeouts and any other exception
    wait 1 second; every other non-200 status fails immediately.
    """
    # NOTE(review): 5xx responses are not retried; confirm that is intended.
    headers = get_headers()

    for attempt in range(max_retries + 1):
        can_retry = attempt < max_retries
        try:
            with httpx.Client(timeout=AGENT_TIMEOUT_SECONDS) as client:
                if files:
                    response = client.post(url, headers=headers, files=files, data=data)
                elif json is not None:
                    # ``is not None`` so that an empty payload is still sent as JSON.
                    response = client.post(url, headers=headers, json=json)
                else:
                    response = client.post(url, headers=headers, data=data)

                status = response.status_code
                if status == 200:
                    # A non-JSON body raises here and is handled (and retried)
                    # by the generic handler below.
                    return True, response.json(), None

                if status == 429:
                    logger.warning(
                        "Agent %s rate limited (attempt %d/%d)",
                        url, attempt + 1, max_retries + 1,
                    )
                    if can_retry:
                        time.sleep(2)
                        continue
                    return False, None, "Rate limited"

                logger.warning("Agent %s returned HTTP %d", url, status)
                return False, None, f"HTTP {status}: {response.text[:200]}"

        except httpx.TimeoutException:
            logger.warning(
                "Agent %s timed out (attempt %d/%d)",
                url, attempt + 1, max_retries + 1,
            )
            if can_retry:
                time.sleep(1)
                continue
            return False, None, f"Timeout after {AGENT_TIMEOUT_SECONDS}s"

        except Exception as e:
            # Boundary handler: callers rely on a tuple result, never a raise.
            message = _error_text(e)
            logger.warning(
                "Agent %s call failed (attempt %d/%d): %s",
                url, attempt + 1, max_retries + 1, message,
            )
            if can_retry:
                time.sleep(1)
                continue
            return False, None, message

    # Only reachable when max_retries is negative and the loop never runs.
    return False, None, "Max retries exceeded"


def _upload_file(url: str, file_path: Path) -> Tuple[bool, Optional[Dict], Optional[str]]:
    """POST *file_path* to *url* as a multipart upload and return the raw result.

    May raise ``OSError`` if the file cannot be opened; callers convert that
    into an error tuple.
    """
    file_path = Path(file_path)  # tolerate plain string paths
    with open(file_path, 'rb') as f:
        # NOTE(review): the content type is always application/pdf, whatever
        # the file extension is; confirm the agents only receive PDFs.
        files = {'file': (file_path.name, f, 'application/pdf')}
        data = {'filename': file_path.name}
        # The request must complete while the file handle is still open.
        return call_agent_with_retry(url, files=files, data=data)


def extract_text_from_file(file_path: Path) -> Tuple[bool, Optional[str], Optional[str]]:
    """Extract text using HF agent.

    Returns:
        ``(True, text, None)`` when the agent returns at least 10
        non-whitespace-trimmed characters under the ``text`` key, otherwise
        ``(False, None, error_message)``.
    """
    try:
        success, response, error = _upload_file(TEXT_EXTRACTOR_URL, file_path)
        if not (success and response):
            return False, None, error or "Text extraction failed"

        text = response.get('text', '')
        if not text or len(text.strip()) < 10:
            return False, None, "No text extracted"
        return True, text, None
    except Exception as e:
        message = _error_text(e)
        logger.warning("Text extraction failed for %s: %s", file_path, message)
        return False, None, message


def extract_tables_from_file(file_path: Path) -> Tuple[bool, Optional[list], Optional[str]]:
    """Extract tables using HF agent.

    Returns:
        ``(True, tables, None)`` where ``tables`` is the response's ``tables``
        entry (an empty list when the key is missing), otherwise
        ``(False, None, error_message)``.
    """
    try:
        success, response, error = _upload_file(TABLE_EXTRACTOR_URL, file_path)
        if success and response:
            return True, response.get('tables', []), None
        return False, None, error or "Table extraction failed"
    except Exception as e:
        message = _error_text(e)
        logger.warning("Table extraction failed for %s: %s", file_path, message)
        return False, None, message


def extract_entities_from_text(text: str) -> Tuple[bool, Optional[list], Optional[str]]:
    """Extract entities using NER agent.

    Returns:
        ``(True, entities, None)`` where ``entities`` is the response's
        ``entities`` entry (an empty list when the key is missing), otherwise
        ``(False, None, error_message)``.
    """
    try:
        success, response, error = call_agent_with_retry(NER_URL, json={'text': text})
        if success and response:
            return True, response.get('entities', []), None
        return False, None, error or "NER failed"
    except Exception as e:
        message = _error_text(e)
        logger.warning("NER failed: %s", message)
        return False, None, message


def classify_document(text: str) -> Tuple[bool, Optional[Dict], Optional[str]]:
    """Classify document using classifier agent.

    Only the first 2000 characters of *text* are sent.

    Returns:
        ``(True, response, None)`` with the agent's parsed JSON response on
        success, otherwise ``(False, None, error_message)``.
    """
    try:
        success, response, error = call_agent_with_retry(CLASSIFY_URL, json={'text': text[:2000]})
        if success and response:
            return True, response, None
        return False, None, error or "Classification failed"
    except Exception as e:
        message = _error_text(e)
        logger.warning("Classification failed: %s", message)
        return False, None, message


def summarize_text(text: str) -> Tuple[bool, Optional[str], Optional[str]]:
    """Summarize text (optional).

    Only the first 5000 characters of *text* are sent.

    Returns:
        ``(True, None, None)`` when ``SUMMARIZER_URL`` is not configured,
        ``(True, summary, None)`` on success (``summary`` is an empty string
        when the response has no ``summary`` key), otherwise
        ``(False, None, error_message)``.
    """
    if not SUMMARIZER_URL:
        # Summarization is optional: an unset URL is a successful no-op.
        return True, None, None

    try:
        success, response, error = call_agent_with_retry(SUMMARIZER_URL, json={'text': text[:5000]})
        if success and response:
            return True, response.get('summary', ''), None
        return False, None, error or "Summarization failed"
    except Exception as e:
        message = _error_text(e)
        logger.warning("Summarization failed: %s", message)
        return False, None, message