| | """ |
| | Configuration settings for Data Extractor Using Gemini |
| | Optimized for Gemini-only model usage with robust directory management |
| | """ |
| |
|
| | import os |
| | from pathlib import Path |
| | from dotenv import load_dotenv |
| | import logging |
| |
|
| | |
# Load environment variables from a .env file (if present) so the Settings
# class below can pick them up via os.getenv at class-definition time.
load_dotenv()

# Module-level logger; handler/level configuration is left to the application.
logger = logging.getLogger(__name__)
| |
|
| |
|
class Settings:
    """Configuration settings with Gemini-only model support and robust directory management.

    All values are read from environment variables (with defaults) at
    class-definition time. State lives entirely on the class, so every
    operation is a classmethod and no instance state is required.
    """

    # Gemini API key; obtained from https://aistudio.google.com/app/apikey
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

    # Model name for each pipeline stage; validate_config() enforces that
    # these are Gemini models (name starts with "gemini-").
    DATA_EXTRACTOR_MODEL = os.getenv("DATA_EXTRACTOR_MODEL", "gemini-2.5-pro")
    DATA_ARRANGER_MODEL = os.getenv("DATA_ARRANGER_MODEL", "gemini-2.5-pro")
    CODE_GENERATOR_MODEL = os.getenv("CODE_GENERATOR_MODEL", "gemini-2.5-flash")

    # Per-model "thinking" token budgets.
    # NOTE(review): the env var names drop the "_MODEL" infix (e.g.
    # DATA_EXTRACTOR_THINKING_BUDGET) while the attribute names keep it —
    # confirm this asymmetry is intentional before renaming either side.
    DATA_EXTRACTOR_MODEL_THINKING_BUDGET = int(os.getenv("DATA_EXTRACTOR_THINKING_BUDGET", "4096"))
    DATA_ARRANGER_MODEL_THINKING_BUDGET = int(os.getenv("DATA_ARRANGER_THINKING_BUDGET", "4096"))
    CODE_GENERATOR_MODEL_THINKING_BUDGET = int(os.getenv("CODE_GENERATOR_THINKING_BUDGET", "4096"))

    # File-handling limits and accepted input extensions (lowercase, no dot).
    MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "50"))
    SUPPORTED_FILE_TYPES = [
        "pdf", "txt", "docx", "xlsx", "csv", "md", "json", "xml", "html",
        "png", "jpg", "jpeg", "doc", "xls", "ppt", "pptx"
    ]

    # Base working directory and its fixed sub-directories. These are
    # computed once here; initialize_directories() actually creates them.
    WORKING_DIR = Path(os.getenv("WORKING_DIR", "/tmp/data_extractor_gemini"))
    TEMP_DIR = WORKING_DIR / "temp"
    INPUT_DIR = WORKING_DIR / "input"
    OUTPUT_DIR = WORKING_DIR / "output"
    CACHE_DIR = WORKING_DIR / "cache"
    LOGS_DIR = WORKING_DIR / "logs"

    # Retry / timeout behaviour for agent calls.
    MAX_RETRIES = int(os.getenv("MAX_RETRIES", "3"))
    RETRY_DELAY_SECONDS = int(os.getenv("RETRY_DELAY_SECONDS", "5"))
    AGENT_TIMEOUT_SECONDS = int(os.getenv("AGENT_TIMEOUT_SECONDS", "300"))

    # Result caching toggle and time-to-live.
    ENABLE_CACHING = os.getenv("ENABLE_CACHING", "true").lower() == "true"
    CACHE_TTL_HOURS = int(os.getenv("CACHE_TTL_HOURS", "24"))

    @classmethod
    def _probe_directory(cls, directory):
        """Create *directory* (with parents) and prove it is writable.

        Writes and removes a throwaway ``.write_test`` file. Any OSError
        propagates to the caller, which wraps it with context-specific
        wording. Shared by initialize_directories() and
        create_session_directories() to avoid duplicating the probe logic.
        """
        directory.mkdir(parents=True, exist_ok=True)
        probe = directory / ".write_test"
        probe.write_text("test")
        probe.unlink()

    @classmethod
    def initialize_directories(cls):
        """Initialize all required directories with proper permissions.

        Returns:
            list[str]: string paths of the directories created/verified.

        Raises:
            RuntimeError: if any directory cannot be created or written to.
        """
        directories = [
            cls.WORKING_DIR,
            cls.TEMP_DIR,
            cls.INPUT_DIR,
            cls.OUTPUT_DIR,
            cls.CACHE_DIR,
            cls.LOGS_DIR
        ]

        created_dirs = []
        for directory in directories:
            try:
                cls._probe_directory(directory)
                created_dirs.append(str(directory))
                logger.debug(f"Directory initialized: {directory}")

            except Exception as e:
                logger.error(f"Failed to initialize directory {directory}: {e}")
                raise RuntimeError(f"Cannot create or write to directory {directory}: {e}")

        logger.info(f"Successfully initialized {len(created_dirs)} directories")
        return created_dirs

    @classmethod
    def validate_config(cls):
        """Comprehensive configuration validation with detailed error reporting.

        Collects hard errors and soft warnings separately so every problem
        is reported in a single pass rather than failing on the first one.

        Returns:
            bool: True when validation succeeds.

        Raises:
            ValueError: listing all errors (and warnings) found.
        """
        errors = []
        warnings = []

        # API key must be present; a short key is merely suspicious.
        if not cls.GOOGLE_API_KEY:
            errors.append("GOOGLE_API_KEY is required. Get it from https://aistudio.google.com/app/apikey")
        elif len(cls.GOOGLE_API_KEY) < 30:
            warnings.append("GOOGLE_API_KEY appears to be too short - verify it's correct")

        # Each configured model must be a non-empty Gemini model name.
        # Pair each value with its setting name up front instead of
        # rebuilding a parallel name list on every loop iteration.
        model_settings = [
            (cls.DATA_EXTRACTOR_MODEL, "DATA_EXTRACTOR_MODEL"),
            (cls.DATA_ARRANGER_MODEL, "DATA_ARRANGER_MODEL"),
            (cls.CODE_GENERATOR_MODEL, "CODE_GENERATOR_MODEL"),
        ]
        for model, name in model_settings:
            if not model:
                errors.append(f"{name} cannot be empty")
            elif not model.startswith("gemini-"):
                errors.append(f"{name} must be a Gemini model (starts with 'gemini-'), got: {model}")

        # Directory creation doubles as a filesystem-permission check.
        try:
            cls.initialize_directories()
        except Exception as e:
            errors.append(f"Directory initialization failed: {e}")

        if cls.MAX_FILE_SIZE_MB <= 0:
            errors.append("MAX_FILE_SIZE_MB must be positive")
        elif cls.MAX_FILE_SIZE_MB > 100:
            warnings.append(f"MAX_FILE_SIZE_MB ({cls.MAX_FILE_SIZE_MB}) is very large - may cause memory issues")

        if not cls.SUPPORTED_FILE_TYPES:
            errors.append("SUPPORTED_FILE_TYPES cannot be empty")

        # Thinking budgets outside [1024, 8192] only warn, never fail.
        budgets = [
            (cls.DATA_EXTRACTOR_MODEL_THINKING_BUDGET, "DATA_EXTRACTOR_MODEL_THINKING_BUDGET"),
            (cls.DATA_ARRANGER_MODEL_THINKING_BUDGET, "DATA_ARRANGER_MODEL_THINKING_BUDGET"),
            (cls.CODE_GENERATOR_MODEL_THINKING_BUDGET, "CODE_GENERATOR_MODEL_THINKING_BUDGET")
        ]

        for budget, name in budgets:
            if budget < 1024:
                warnings.append(f"{name} ({budget}) is quite low - may affect model performance")
            elif budget > 8192:
                warnings.append(f"{name} ({budget}) is very high - may be unnecessary")

        if cls.MAX_RETRIES < 1:
            warnings.append("MAX_RETRIES should be at least 1")
        elif cls.MAX_RETRIES > 10:
            warnings.append("MAX_RETRIES is very high - may cause long delays")

        # Errors abort with everything (including warnings) in one message.
        if errors:
            error_msg = "❌ Configuration validation failed:\n"
            error_msg += "\n".join(f" • {error}" for error in errors)

            if warnings:
                error_msg += "\n\n⚠️ Warnings:\n"
                error_msg += "\n".join(f" • {warning}" for warning in warnings)

            raise ValueError(error_msg)

        if warnings:
            logger.warning("Configuration warnings detected:")
            for warning in warnings:
                logger.warning(f" • {warning}")

        logger.info("✅ Configuration validation successful")
        return True

    @classmethod
    def get_session_directories(cls, session_id: str):
        """Get session-specific directory structure.

        Pure path computation — nothing is created on disk here; use
        create_session_directories() for that.
        """
        session_base = cls.WORKING_DIR / session_id

        return {
            "base": session_base,
            "input": session_base / "input",
            "output": session_base / "output",
            "temp": session_base / "temp",
            "cache": session_base / "cache"
        }

    @classmethod
    def create_session_directories(cls, session_id: str):
        """Create and validate session-specific directories.

        Returns:
            dict: the same mapping produced by get_session_directories().

        Raises:
            RuntimeError: if any session directory cannot be created or
            written to.
        """
        session_dirs = cls.get_session_directories(session_id)

        created = []
        for name, directory in session_dirs.items():
            try:
                cls._probe_directory(directory)
                created.append(str(directory))

            except Exception as e:
                logger.error(f"Failed to create session directory {name}: {e}")
                raise RuntimeError(f"Cannot create session directory {directory}: {e}")

        logger.info(f"Created {len(created)} session directories for {session_id}")
        return session_dirs

    @classmethod
    def cleanup_session(cls, session_id: str, keep_output: bool = True):
        """Clean up session directories with option to preserve output.

        Args:
            session_id: session whose directories are removed.
            keep_output: when True (default), the "output" directory is kept.

        Returns:
            list[str]: directories actually removed. Removal failures are
            logged as warnings, never raised — cleanup is best-effort.
        """
        session_dirs = cls.get_session_directories(session_id)

        import shutil  # local import keeps module import time lean
        cleaned = []

        for name, directory in session_dirs.items():
            if keep_output and name == "output":
                continue

            if directory.exists():
                try:
                    shutil.rmtree(directory)
                    cleaned.append(str(directory))
                except Exception as e:
                    logger.warning(f"Could not clean {name} directory: {e}")

        logger.info(f"Cleaned {len(cleaned)} session directories for {session_id}")
        return cleaned

    @classmethod
    def get_debug_info(cls):
        """Get comprehensive debug information about current configuration.

        The API key is reported only as presence and length — never its
        value — so the output is safe to include in logs or bug reports.
        """
        import platform
        import sys

        return {
            "python_version": sys.version,
            "platform": platform.platform(),
            "temp_dir": str(cls.TEMP_DIR),
            "temp_dir_exists": cls.TEMP_DIR.exists(),
            "models": {
                "data_extractor": cls.DATA_EXTRACTOR_MODEL,
                "data_arranger": cls.DATA_ARRANGER_MODEL,
                "code_generator": cls.CODE_GENERATOR_MODEL,
            },
            "api_keys": {
                "google_api_key_present": bool(cls.GOOGLE_API_KEY),
                "google_api_key_length": len(cls.GOOGLE_API_KEY) if cls.GOOGLE_API_KEY else 0
            }
        }
| |
|
| |
|
| | |
# Module-level singleton. All state lives on the class (classmethods and
# class attributes only), so this instance is just a convenient import
# handle for callers (`from config import settings`).
settings = Settings()

# Best-effort eager directory setup at import time. Failures are logged but
# deliberately not re-raised here — validate_config() surfaces them later
# with full context, and importing this module must not crash the app.
try:
    settings.initialize_directories()
    logger.debug("Settings initialized successfully")
except Exception as e:
    logger.error(f"Failed to initialize settings: {e}")
| | |
| |
|
| |
|