File size: 3,700 Bytes
c5e1945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a48a45
c5e1945
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
from pathlib import Path
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from huggingface_hub import login
from dotenv import load_dotenv
import logging

# Initialize environment (python-dotenv)
load_dotenv()

# Configure root logging at INFO level and create this module's logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Best-effort HuggingFace login: any failure is logged and never propagated,
# so the rest of the module still loads without a working token.
try:
    hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
    if not hf_token:
        logger.warning("No HuggingFace token found - some features may be limited")
    else:
        login(hf_token)
        logger.info("Successfully logged into HuggingFace")
except Exception as login_error:
    logger.error(f"HuggingFace login failed: {login_error}")
# --- File Path Configuration (Cross-platform compatible) ---
# Make the module path absolute *before* walking up two levels. With a
# relative ``__file__`` such as "config.py", ``.parent.parent`` collapses to
# "." and the project root would wrongly resolve to the current working
# directory instead of its parent.
PROJECT_ROOT = Path(__file__).absolute().parent.parent
DATA_DIR = PROJECT_ROOT / "data"
COMPANY_INFO_DIR = DATA_DIR / "raw_company_info"  # raw input documents
PROCESSED_DIR = DATA_DIR / "processed"  # generated artifacts
CHUNKS_PATH = PROCESSED_DIR / "company_chunks.pkl"
VECTOR_STORE_DIR = PROCESSED_DIR / "vector_store"

# Ensure directories exist (creating DATA_DIR implicitly via parents=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
COMPANY_INFO_DIR.mkdir(parents=True, exist_ok=True)

# --- LLM Configuration with error handling ---
def create_llm():
    """Build the chat model client.

    The API key is read from the ``OPENAI_API_KEY`` environment variable.

    Returns:
        A ``ChatOpenAI`` instance configured with the settings below.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` is unset or empty.
        Exception: Any error raised while constructing ``ChatOpenAI`` is
            logged and then re-raised unchanged.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        logger.error("OPENAI_API_KEY not found in environment variables")
        raise ValueError("OpenAI API key is required. Please set OPENAI_API_KEY environment variable.")

    settings = {
        "model": "gpt-4o-mini",  # More cost-effective for demos
        "api_key": api_key,
        "base_url": "https://models.inference.ai.azure.com",  # Custom endpoint
        "temperature": 0.1,
        "max_tokens": 1024,
        "request_timeout": 30,  # Increased timeout for stability
        "max_retries": 2,
        "streaming": True,
    }

    try:
        return ChatOpenAI(**settings)
    except Exception as init_error:
        logger.error(f"Failed to initialize LLM: {init_error}")
        raise

# Shared chat-model instance, built once when this module is imported.
# If create_llm() raises (e.g. OPENAI_API_KEY is missing), the import fails.
LLM = create_llm()

# --- Embedding Model Configuration with error handling ---
def create_embedding_model():
    """Load the embedding model, trying a second model if the first fails.

    Returns:
        A ``HuggingFaceEmbeddings`` instance running on CPU with normalized
        embeddings: ``intfloat/multilingual-e5-small`` when it loads,
        otherwise ``sentence-transformers/all-MiniLM-L6-v2``.

    Raises:
        Exception: The fallback model's error, after logging, when both
            models fail to load.
    """
    def _load(model_name):
        # Both candidates share the same device and encoding options.
        return HuggingFaceEmbeddings(
            model_name=model_name,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True},
        )

    try:
        return _load("intfloat/multilingual-e5-small")
    except Exception as primary_error:
        logger.error(f"Failed to load embedding model: {primary_error}")
        # Fallback to a simpler model; stays nested inside the handler so the
        # primary failure remains attached as the exception context.
        try:
            return _load("sentence-transformers/all-MiniLM-L6-v2")
        except Exception as fallback_error:
            logger.error(f"Fallback embedding model also failed: {fallback_error}")
            raise

# Shared embedding-model instance, built once when this module is imported.
# If both the primary and the fallback model fail to load, the import fails.
EMBEDDING_MODEL = create_embedding_model()

# Configuration validation
def validate_config():
    """Check that the mandatory settings are present.

    A missing company-info directory only produces a warning; it does not
    abort validation.

    Raises:
        ValueError: If any required environment variable is unset or empty.
    """
    missing_vars = []
    for name in ("OPENAI_API_KEY",):
        if not os.environ.get(name):
            missing_vars.append(name)

    if missing_vars:
        raise ValueError(f"Missing required environment variables: {missing_vars}")

    # Check if data directories exist
    data_dir_present = COMPANY_INFO_DIR.exists()
    if not data_dir_present:
        logger.warning(f"Company info directory not found: {COMPANY_INFO_DIR}")

    logger.info("Configuration validation completed")

# Run validation on import: a failure is logged and then aborts the import
try:
    validate_config()
except Exception as e:
    logger.error(f"Configuration validation failed: {e}")
    # Bare ``raise`` re-raises the active exception with its original
    # traceback, without adding a redundant frame for this line.
    raise