File size: 3,259 Bytes
c5e1945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0dfb119
c5e1945
4a48a45
0dfb119
c5e1945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import os
from pathlib import Path
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import logging

# Load variables from a local .env file into the process environment.
load_dotenv()

# Module-wide logger, configured once at import time.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# --- File path configuration (pathlib keeps this cross-platform) ---
# Layout assumes this file lives one level below the project root.
PROJECT_ROOT = Path(__file__).parent.parent.absolute()
DATA_DIR = PROJECT_ROOT / "data"
COMPANY_INFO_DIR = DATA_DIR / "raw_company_info"
PROCESSED_DIR = DATA_DIR / "processed"
CHUNKS_PATH = PROCESSED_DIR / "company_chunks.pkl"
VECTOR_STORE_DIR = PROCESSED_DIR / "vector_store"

# Create the writable directories up front so downstream code can assume they exist.
for _required_dir in (PROCESSED_DIR, COMPANY_INFO_DIR):
    _required_dir.mkdir(parents=True, exist_ok=True)

# --- LLM Configuration with error handling ---
def create_llm():
    """Build and return the shared chat LLM client.

    Fails fast with a clear message when credentials are missing, and logs
    any initialization failure before propagating it.

    Returns:
        ChatOpenAI: client configured against the Azure-hosted inference endpoint.

    Raises:
        ValueError: when OPENAI_API_KEY is absent from the environment.
        Exception: whatever ChatOpenAI raises during construction, re-raised after logging.
    """
    api_key = os.getenv("OPENAI_API_KEY")

    # Guard clause: no key means no point attempting client construction.
    if not api_key:
        logger.error("OPENAI_API_KEY not found in environment variables")
        raise ValueError("OpenAI API key is required. Please set OPENAI_API_KEY environment variable.")

    try:
        client = ChatOpenAI(
            model="gpt-4o",
            api_key=api_key,
            base_url="https://models.inference.ai.azure.com",  # Optional custom endpoint
            temperature=0.0,
            max_tokens=1024,
            request_timeout=30,  # Increased timeout for stability
            max_retries=2,
            streaming=True,
        )
    except Exception as e:
        logger.error(f"Failed to initialize LLM: {e}")
        raise
    return client

# Shared module-level LLM instance, constructed eagerly at import time.
LLM = create_llm()

# --- Embedding Model Configuration with error handling ---
def create_embedding_model():
    """Load the preferred multilingual embedding model, with a smaller fallback.

    Tries intfloat/multilingual-e5-small first; if that fails (e.g. download
    or memory error), falls back to all-MiniLM-L6-v2. Both run on CPU with
    normalized embeddings so they are interchangeable for cosine similarity.

    Returns:
        HuggingFaceEmbeddings: the first model that loads successfully.

    Raises:
        Exception: the fallback's error, when both models fail to load.
    """
    # Both candidates share the same runtime configuration.
    shared_kwargs = {
        "model_kwargs": {"device": "cpu"},
        "encode_kwargs": {"normalize_embeddings": True},
    }
    try:
        return HuggingFaceEmbeddings(
            model_name="intfloat/multilingual-e5-small", **shared_kwargs
        )
    except Exception as primary_err:
        logger.error(f"Failed to load embedding model: {primary_err}")
        # Fallback to a simpler model
        try:
            return HuggingFaceEmbeddings(
                model_name="sentence-transformers/all-MiniLM-L6-v2", **shared_kwargs
            )
        except Exception as fallback_err:
            logger.error(f"Fallback embedding model also failed: {fallback_err}")
            raise

# Shared module-level embedding model, constructed eagerly at import time.
EMBEDDING_MODEL = create_embedding_model()

# Configuration validation
def validate_config():
    """Verify required env vars and warn about missing data directories.

    Raises:
        ValueError: when any required environment variable is unset or empty.
    """
    missing_vars = [
        name for name in ("OPENAI_API_KEY",) if not os.getenv(name)
    ]
    if missing_vars:
        raise ValueError(f"Missing required environment variables: {missing_vars}")

    # A missing data directory is only a warning -- it may be populated later.
    if not COMPANY_INFO_DIR.exists():
        logger.warning(f"Company info directory not found: {COMPANY_INFO_DIR}")

    logger.info("Configuration validation completed")

# Run validation on import so a misconfigured deployment fails fast.
try:
    validate_config()
except Exception as e:
    logger.error(f"Configuration validation failed: {e}")
    # Bare `raise` re-raises the active exception with its traceback and
    # context intact; `raise e` would needlessly re-bind the same object.
    raise