"""Configuration module: paths, logging, LLM, and embedding model setup.

Loads environment variables from a .env file, configures logging, defines
cross-platform data paths, and eagerly initializes the chat LLM and the
embedding model used elsewhere in the project. Misconfiguration fails fast
at import time.
"""

import logging
import os
from pathlib import Path

from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI

# Initialize environment variables from a local .env file, if present.
load_dotenv()

# Module-level logging setup (configured once at import).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- File Path Configuration (cross-platform via pathlib) ---
PROJECT_ROOT = Path(__file__).parent.parent.absolute()
DATA_DIR = PROJECT_ROOT / "data"
COMPANY_INFO_DIR = DATA_DIR / "raw_company_info"
PROCESSED_DIR = DATA_DIR / "processed"
CHUNKS_PATH = PROCESSED_DIR / "company_chunks.pkl"
VECTOR_STORE_DIR = PROCESSED_DIR / "vector_store"

# Ensure directories exist before any reads/writes elsewhere in the app.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
COMPANY_INFO_DIR.mkdir(parents=True, exist_ok=True)


# --- LLM Configuration with error handling ---
def create_llm():
    """Create the ChatOpenAI LLM with proper error handling.

    Returns:
        ChatOpenAI: a configured chat model instance (gpt-4o, streaming,
        deterministic temperature).

    Raises:
        ValueError: if OPENAI_API_KEY is not set in the environment.
        Exception: re-raised from ChatOpenAI initialization failures.
    """
    openai_key = os.getenv("OPENAI_API_KEY")
    if not openai_key:
        logger.error("OPENAI_API_KEY not found in environment variables")
        # BUG FIX: the original broke this string literal across a physical
        # line (a SyntaxError). Implicit concatenation keeps the message
        # identical in a single, valid literal.
        raise ValueError(
            "OpenAI API key is required. "
            "Please set OPENAI_API_KEY environment variable."
        )
    try:
        return ChatOpenAI(
            model="gpt-4o",
            api_key=openai_key,
            base_url="https://models.inference.ai.azure.com",  # Optional custom endpoint
            temperature=0.0,
            max_tokens=1024,
            request_timeout=30,  # Increased timeout for stability
            max_retries=2,
            streaming=True,
        )
    except Exception as e:
        logger.error(f"Failed to initialize LLM: {e}")
        raise


LLM = create_llm()


# --- Embedding Model Configuration with error handling ---
def create_embedding_model():
    """Create the embedding model, falling back to a simpler one on failure.

    Tries the multilingual E5 model first; if it fails to load, falls back
    to all-MiniLM-L6-v2. Both run on CPU with normalized embeddings.

    Returns:
        HuggingFaceEmbeddings: a configured embedding model instance.

    Raises:
        Exception: if both the primary and fallback models fail to load.
    """
    try:
        return HuggingFaceEmbeddings(
            model_name="intfloat/multilingual-e5-small",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True},
        )
    except Exception as e:
        logger.error(f"Failed to load embedding model: {e}")
        # Fallback to a simpler model
        try:
            return HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2",
                model_kwargs={'device': 'cpu'},
                encode_kwargs={'normalize_embeddings': True},
            )
        except Exception as e2:
            logger.error(f"Fallback embedding model also failed: {e2}")
            raise


EMBEDDING_MODEL = create_embedding_model()


# Configuration validation
def validate_config():
    """Validate all required configurations.

    Raises:
        ValueError: if any required environment variable is missing.
    """
    required_env_vars = ["OPENAI_API_KEY"]
    missing_vars = [var for var in required_env_vars if not os.getenv(var)]
    if missing_vars:
        raise ValueError(f"Missing required environment variables: {missing_vars}")

    # A missing data directory is only a warning — it may be populated later.
    if not COMPANY_INFO_DIR.exists():
        logger.warning(f"Company info directory not found: {COMPANY_INFO_DIR}")

    logger.info("Configuration validation completed")


# Run validation on import so misconfiguration fails fast.
try:
    validate_config()
except Exception as e:
    logger.error(f"Configuration validation failed: {e}")
    raise  # bare raise preserves the original traceback (was `raise e`)