| """ | |
| Centralized configuration management for Markit application. | |
| """ | |
| import os | |
| from typing import Optional, Dict, Any | |
| from dataclasses import dataclass | |


@dataclass
class APIConfig:
    """Configuration for external API services."""
    google_api_key: Optional[str] = None
    openai_api_key: Optional[str] = None
    mistral_api_key: Optional[str] = None

    def __post_init__(self):
        """Load API keys from environment variables."""
        self.google_api_key = os.getenv("GOOGLE_API_KEY")
        self.openai_api_key = os.getenv("OPENAI_API_KEY")
        self.mistral_api_key = os.getenv("MISTRAL_API_KEY")


@dataclass
class OCRConfig:
    """Configuration for OCR-related settings."""
    tesseract_path: Optional[str] = None
    tessdata_path: Optional[str] = None
    default_language: str = "eng"

    def __post_init__(self):
        """Load OCR configuration from environment variables."""
        self.tesseract_path = os.getenv("TESSERACT_PATH")
        self.tessdata_path = os.getenv("TESSDATA_PATH", "./tessdata")


@dataclass
class ModelConfig:
    """Configuration for AI model settings."""
    gemini_model: str = "gemini-2.5-flash"
    mistral_model: str = "mistral-ocr-latest"
    got_ocr_model: str = "stepfun-ai/GOT-OCR2_0"
    temperature: float = 0.1
    max_tokens: int = 32768

    def __post_init__(self):
        """Load model configuration from environment variables."""
        self.gemini_model = os.getenv("GEMINI_MODEL", self.gemini_model)
        self.mistral_model = os.getenv("MISTRAL_MODEL", self.mistral_model)
        self.got_ocr_model = os.getenv("GOT_OCR_MODEL", self.got_ocr_model)
        self.temperature = float(os.getenv("MODEL_TEMPERATURE", self.temperature))
        self.max_tokens = int(os.getenv("MODEL_MAX_TOKENS", self.max_tokens))


@dataclass
class DoclingConfig:
    """Configuration for Docling parser."""
    artifacts_path: Optional[str] = None
    enable_remote_services: bool = False
    enable_tables: bool = True
    enable_code_enrichment: bool = False
    enable_formula_enrichment: bool = False
    enable_picture_classification: bool = False
    generate_picture_images: bool = False
    ocr_cpu_threads: int = 4

    def __post_init__(self):
        """Load Docling configuration from environment variables."""
        self.artifacts_path = os.getenv("DOCLING_ARTIFACTS_PATH")
        self.enable_remote_services = os.getenv("DOCLING_ENABLE_REMOTE_SERVICES", "false").lower() == "true"
        self.enable_tables = os.getenv("DOCLING_ENABLE_TABLES", "true").lower() == "true"
        self.enable_code_enrichment = os.getenv("DOCLING_ENABLE_CODE_ENRICHMENT", "false").lower() == "true"
        self.enable_formula_enrichment = os.getenv("DOCLING_ENABLE_FORMULA_ENRICHMENT", "false").lower() == "true"
        self.enable_picture_classification = os.getenv("DOCLING_ENABLE_PICTURE_CLASSIFICATION", "false").lower() == "true"
        self.generate_picture_images = os.getenv("DOCLING_GENERATE_PICTURE_IMAGES", "false").lower() == "true"
        self.ocr_cpu_threads = int(os.getenv("OMP_NUM_THREADS", self.ocr_cpu_threads))


@dataclass
class RAGConfig:
    """Configuration for RAG (Retrieval-Augmented Generation) functionality."""
    # Vector store settings
    vector_store_path: str = "./data/vector_store"
    collection_name: str = "markit_documents"

    # Chat history settings
    chat_history_path: str = "./data/chat_history"

    # Embedding settings
    embedding_model: str = "models/text-embedding-004"
    embedding_chunk_size: int = 1000

    # Chunking settings
    chunk_size: int = 1000
    chunk_overlap: int = 200

    # Chat limits
    max_messages_per_session: int = 50
    max_messages_per_hour: int = 100

    # Retrieval settings
    retrieval_k: int = 4
    retrieval_score_threshold: float = 0.5

    # LLM settings for RAG
    rag_model: str = "gemini-2.5-flash"
    rag_temperature: float = 0.1
    rag_max_tokens: int = 32768

    def __post_init__(self):
        """Load RAG configuration from environment variables."""
        # On HF Spaces, pick a writable base path for the data directories
        if os.getenv("SPACE_ID"):  # HF Spaces environment
            base_data_path = "/tmp/data" if not os.access("./data", os.W_OK) else "./data"
            self.vector_store_path = os.getenv("VECTOR_STORE_PATH", f"{base_data_path}/vector_store")
            self.chat_history_path = os.getenv("CHAT_HISTORY_PATH", f"{base_data_path}/chat_history")
        else:
            self.vector_store_path = os.getenv("VECTOR_STORE_PATH", self.vector_store_path)
            self.chat_history_path = os.getenv("CHAT_HISTORY_PATH", self.chat_history_path)
        self.collection_name = os.getenv("VECTOR_STORE_COLLECTION", self.collection_name)
        self.embedding_model = os.getenv("EMBEDDING_MODEL", self.embedding_model)
        self.embedding_chunk_size = int(os.getenv("EMBEDDING_CHUNK_SIZE", self.embedding_chunk_size))
        self.chunk_size = int(os.getenv("CHUNK_SIZE", self.chunk_size))
        self.chunk_overlap = int(os.getenv("CHUNK_OVERLAP", self.chunk_overlap))
        self.max_messages_per_session = int(os.getenv("MAX_MESSAGES_PER_SESSION", self.max_messages_per_session))
        self.max_messages_per_hour = int(os.getenv("MAX_MESSAGES_PER_HOUR", self.max_messages_per_hour))
        self.retrieval_k = int(os.getenv("RETRIEVAL_K", self.retrieval_k))
        self.retrieval_score_threshold = float(os.getenv("RETRIEVAL_SCORE_THRESHOLD", self.retrieval_score_threshold))
        self.rag_model = os.getenv("RAG_MODEL", self.rag_model)
        self.rag_temperature = float(os.getenv("RAG_TEMPERATURE", self.rag_temperature))
        self.rag_max_tokens = int(os.getenv("RAG_MAX_TOKENS", self.rag_max_tokens))


@dataclass
class AppConfig:
    """Main application configuration."""
    debug: bool = False
    max_file_size: int = 10 * 1024 * 1024  # 10MB
    allowed_extensions: tuple = (".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".webp", ".tex", ".xlsx", ".docx", ".pptx", ".html", ".xhtml", ".md", ".csv")
    temp_dir: str = "./temp"

    # Multi-document batch processing settings
    max_batch_files: int = 5
    max_batch_size: int = 20 * 1024 * 1024  # 20MB combined
    batch_processing_types: tuple = ("combined", "individual", "summary", "comparison")

    def __post_init__(self):
        """Load application configuration from environment variables."""
        self.debug = os.getenv("DEBUG", "false").lower() == "true"
        self.max_file_size = int(os.getenv("MAX_FILE_SIZE", self.max_file_size))
        self.temp_dir = os.getenv("TEMP_DIR", self.temp_dir)
        # Load batch processing configuration
        self.max_batch_files = int(os.getenv("MAX_BATCH_FILES", self.max_batch_files))
        self.max_batch_size = int(os.getenv("MAX_BATCH_SIZE", self.max_batch_size))


class Config:
    """Main configuration container."""

    def __init__(self):
        self.api = APIConfig()
        self.ocr = OCRConfig()
        self.model = ModelConfig()
        self.docling = DoclingConfig()
        self.app = AppConfig()
        self.rag = RAGConfig()

    def validate(self) -> Dict[str, Any]:
        """Validate configuration and return validation results."""
        validation_results = {
            "valid": True,
            "warnings": [],
            "errors": []
        }

        # Check API keys
        if not self.api.google_api_key:
            validation_results["warnings"].append("Google API key not found - Gemini parser will be unavailable")
        if not self.api.mistral_api_key:
            validation_results["warnings"].append("Mistral API key not found - Mistral parser will be unavailable")

        # Check RAG dependencies (embeddings and chat both rely on the Google API key)
        if not self.api.google_api_key:
            validation_results["warnings"].append("Google API key not found - RAG embeddings and chat will be unavailable")

        # Check tesseract setup
        if not self.ocr.tesseract_path and not shutil.which("tesseract"):
            validation_results["warnings"].append("Tesseract not found in system PATH - OCR functionality may be limited")

        # Check temp directory
        try:
            os.makedirs(self.app.temp_dir, exist_ok=True)
        except Exception as e:
            validation_results["errors"].append(f"Cannot create temp directory {self.app.temp_dir}: {e}")
            validation_results["valid"] = False

        # Check RAG directories
        try:
            os.makedirs(self.rag.vector_store_path, exist_ok=True)
            os.makedirs(self.rag.chat_history_path, exist_ok=True)
        except Exception as e:
            validation_results["errors"].append(f"Cannot create RAG directories: {e}")
            validation_results["valid"] = False

        return validation_results

    def get_available_parsers(self) -> list:
        """Get list of available parsers based on current configuration."""
        available = ["markitdown"]  # Always available

        if self.api.google_api_key:
            available.append("gemini_flash")
        if self.api.mistral_api_key:
            available.append("mistral_ocr")

        # GOT-OCR is available if we have GPU or can use ZeroGPU
        available.append("got_ocr")

        # Docling is available if the package is installed
        try:
            import docling
            available.append("docling")
        except ImportError:
            pass

        return available


# Global configuration instance
config = Config()
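

# Example usage (illustrative sketch, not part of the original module): running this
# file directly would print the validation report and the parsers detected from the
# current environment. The printed keys mirror the dictionary built in validate().
if __name__ == "__main__":
    results = config.validate()
    print("Configuration valid:", results["valid"])
    for warning in results["warnings"]:
        print("WARNING:", warning)
    for error in results["errors"]:
        print("ERROR:", error)
    print("Available parsers:", ", ".join(config.get_available_parsers()))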