File size: 12,726 Bytes
bae99be
 
 
d18c374
bae99be
 
 
 
 
 
 
d18c374
bae99be
 
 
 
d18c374
bae99be
 
 
 
 
 
d18c374
 
bae99be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e120e6b
bae99be
 
 
53c63d4
 
bae99be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a79d041
bae99be
 
a79d041
 
 
294354d
a79d041
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e120e6b
a79d041
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bae99be
 
c4f7836
bae99be
d18c374
c4f7836
bae99be
c4f7836
 
 
 
 
bae99be
c4f7836
 
bae99be
 
 
 
e120e6b
 
bae99be
d18c374
 
bae99be
 
 
 
 
 
 
 
d18c374
bae99be
 
d18c374
bae99be
 
 
 
 
 
e120e6b
 
53c63d4
 
bae99be
 
 
 
 
e120e6b
bae99be
 
c4f7836
bae99be
c4f7836
d18c374
c4f7836
 
 
 
bae99be
 
 
 
 
 
 
53c63d4
 
bae99be
 
 
 
 
 
 
 
 
 
d18c374
bae99be
 
 
d18c374
bae99be
 
e120e6b
bae99be
d18c374
bae99be
 
e120e6b
bae99be
d18c374
bae99be
 
d18c374
bae99be
 
d18c374
bae99be
 
 
 
 
 
e120e6b
 
53c63d4
 
bae99be
e120e6b
bae99be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e120e6b
bae99be
 
 
 
 
 
c4f7836
 
 
 
bae99be
c4f7836
 
bae99be
 
e120e6b
bae99be
c4f7836
e120e6b
bae99be
 
 
 
 
 
53c63d4
 
 
bae99be
 
 
 
 
e120e6b
d18c374
 
 
e120e6b
bae99be
 
 
 
 
 
 
 
 
 
 
e120e6b
bae99be
 
 
e120e6b
bae99be
 
 
 
 
e120e6b
53c63d4
bae99be
 
d18c374
 
bae99be
 
 
e120e6b
bae99be
 
e120e6b
bae99be
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
#!/usr/bin/env python3
"""
Setup script for SHL Assessment Recommender System

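This script automates the initialization process:
0. Checks dependencies (missing packages are only reported as a warning)
1. Loads the SHL catalog (existing CSV, else Excel, else web scrape)
2. Preprocesses training data
3. Generates embeddings and builds the FAISS index
4. Runs evaluation
Finally, it verifies that the expected artifacts exist and can be loaded.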
"""

import sys
import os
import logging
import pandas as pd

# Configure logging once for the whole setup run (INFO level, timestamped
# lines); the step functions in this script write through `logger`.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


def check_dependencies():
    """Report which required third-party packages cannot be imported.

    Every package in the list is imported by its module name
    (``beautifulsoup4`` is imported as ``bs4``). Packages that fail with
    ``ImportError`` are collected and logged as a warning; the check is
    advisory only and never aborts the setup.

    Returns:
        True in every case, whether or not packages are missing.
    """
    # Distribution names whose importable module name differs from the
    # name shown to the user.
    module_aliases = {'beautifulsoup4': 'bs4'}

    wanted = (
        'pandas',
        'numpy',
        'torch',
        'transformers',
        'sentence_transformers',
        'faiss',
        'sklearn',
        'beautifulsoup4',
        'requests',
        'fastapi',
        'uvicorn',
        'streamlit',
    )

    unavailable = []
    for dist_name in wanted:
        module_name = module_aliases.get(dist_name, dist_name)
        try:
            __import__(module_name)
        except ImportError:
            # Report the user-facing package name, not the module alias.
            unavailable.append(dist_name)

    if not unavailable:
        logger.info("βœ“ All dependencies installed")
    else:
        logger.warning(f"Missing packages: {', '.join(unavailable)}")
        logger.info("Attempting to continue anyway...")

    return True


def _map_catalog_columns(columns):
    """Map raw spreadsheet headers onto the canonical catalog column names.

    Headers are compared case-insensitively, with spaces and hyphens treated
    as underscores. Non-string headers (e.g. numeric Excel headers) are
    stringified before matching instead of raising. A canonical name is
    assigned to at most one header: headers that already carry a canonical
    name keep it, and otherwise the first matching header wins, so the
    renamed frame never ends up with duplicate column labels.

    Args:
        columns: Iterable of the DataFrame's column labels.

    Returns:
        dict mapping each recognised original label to its canonical name.
    """
    canonical = {'Assessment Name', 'Assessment URL', 'Description',
                 'Category', 'Test Type'}
    # Canonical names that are already present verbatim must not be reused.
    claimed = {col for col in columns if col in canonical}

    mapping = {}
    for col in columns:
        if col in canonical:
            continue
        key = str(col).lower().replace(' ', '_').replace('-', '_')
        if 'assessment' in key and 'name' in key:
            target = 'Assessment Name'
        elif key in ('assessment_name', 'name', 'assessment'):
            target = 'Assessment Name'
        elif 'assessment' in key and 'url' in key:
            target = 'Assessment URL'
        elif key in ('assessment_url', 'url', 'link'):
            target = 'Assessment URL'
        elif 'description' in key or key in ('desc', 'details'):
            target = 'Description'
        elif 'category' in key or key in ('cat', 'type', 'group'):
            target = 'Category'
        elif ('test' in key and 'type' in key) or key in (
                'test_type', 'testtype', 'assessment_type'):
            target = 'Test Type'
        else:
            continue

        if target in claimed:
            # A previous header already provides this canonical column.
            continue
        claimed.add(target)
        mapping[col] = target
    return mapping


def _read_excel_catalog(excel_path):
    """Read the Excel dataset and coerce it into the catalog schema.

    Columns are first renamed by header matching. If any required column is
    still missing, the first five (or first three, with default 'Category'
    and 'Test Type' values) columns are taken by position instead.

    Args:
        excel_path: Path of the Excel workbook to read.

    Returns:
        DataFrame with missing values replaced by empty strings.

    Raises:
        ValueError: If fewer than three columns are available for the
            positional fallback, or the sheet has no rows. Errors raised by
            ``pd.read_excel`` propagate unchanged.
    """
    required_cols = ['Assessment Name', 'Assessment URL', 'Description', 'Category', 'Test Type']

    df = pd.read_excel(excel_path)
    logger.info(f"βœ“ Excel columns found: {list(df.columns)}")

    column_mapping = _map_catalog_columns(df.columns)
    if column_mapping:
        df = df.rename(columns=column_mapping)
        logger.info(f"βœ“ Mapped columns: {column_mapping}")

    available_cols = [col for col in required_cols if col in df.columns]
    missing_cols = [col for col in required_cols if col not in df.columns]
    logger.info(f"βœ“ Available columns: {available_cols}")

    if missing_cols:
        logger.warning(f"⚠ Excel missing columns: {missing_cols} β€” trying positional fallback")
        if len(df.columns) >= 5:
            old_cols = list(df.columns)[:5]
            df = df.iloc[:, :5].copy()
            df.columns = required_cols
            logger.info(f"βœ“ Mapped by position: {old_cols} -> {required_cols}")
        elif len(df.columns) >= 3:
            # Copy before adding columns so we never write into a slice view.
            df = df.iloc[:, :3].copy()
            df.columns = ['Assessment Name', 'Assessment URL', 'Description']
            df['Category'] = 'General'
            df['Test Type'] = 'K'
            logger.info("βœ“ Used first 3 columns with defaults")
        else:
            raise ValueError("Insufficient Excel columns after mapping")

    if len(df) == 0:
        raise ValueError("Excel file is empty")

    return df.fillna('')


def step1_generate_catalog():
    """Step 1: Generate/Load SHL catalog.

    Sources are tried in priority order:
      1. an existing ``data/shl_catalog.csv`` (only read and counted),
      2. the Excel dataset, converted and saved as the CSV,
      3. scraping via ``src.crawler.SHLCrawler`` as a last resort.

    Any failure while handling the Excel file falls through to scraping. An
    empty (or ``None``) scrape result is treated as a failure and is not
    written to disk, so a later run cannot mistake an empty CSV for a valid
    catalog.

    Returns:
        True if a catalog CSV is available afterwards, False otherwise.
    """
    logger.info("="*60)
    logger.info("STEP 1: Loading SHL Catalog")
    logger.info("="*60)

    csv_path = 'data/shl_catalog.csv'
    # NOTE(review): 'Data/' (capital D) is a different directory from the
    # 'data/' output directory on case-sensitive filesystems; confirm that
    # this is the intended location of the workbook.
    excel_path = 'Data/Gen_AI Dataset.xlsx'

    try:
        # Priority 1: Use existing CSV (uploaded with repo)
        if os.path.exists(csv_path):
            logger.info(f"βœ“ Found existing catalog: {csv_path}")
            df = pd.read_csv(csv_path)
            logger.info(f"βœ“ Loaded {len(df)} assessments from CSV")
            return True

        # Priority 2: Try to generate from Excel; on any failure fall back to scraping
        if os.path.exists(excel_path):
            logger.info(f"βœ“ Generating catalog from Excel: {excel_path}")
            try:
                df = _read_excel_catalog(excel_path)
                os.makedirs('data', exist_ok=True)
                df.to_csv(csv_path, index=False)
                logger.info(f"βœ“ Saved {len(df)} assessments to {csv_path}")
                logger.info(f"βœ“ Sample row: {df.iloc[0].to_dict()}")
                return True
            except Exception as e:
                logger.warning(f"Excel load/mapping failed ({e}); falling back to web scrape...")

        # Priority 3: Scrape from web (last resort)
        logger.warning("⚠ No local data found or Excel unusable, scraping SHL website...")
        from src.crawler import SHLCrawler

        os.makedirs('data', exist_ok=True)
        try:
            df = SHLCrawler().scrape_catalog()
            # Never persist an empty catalog: it would be picked up as a
            # valid "existing CSV" on the next run.
            if df is None or len(df) == 0:
                raise ValueError("scraper returned no assessments")
            df = df.fillna('')
            df.to_csv(csv_path, index=False)
            logger.info(f"βœ“ Scraped {len(df)} assessments; saved to {csv_path}")
            return True
        except Exception as e:
            logger.error(f"βœ— Scraping failed and no catalog available: {e}")
            return False

    except Exception as e:
        logger.error(f"βœ— Failed to load catalog: {e}")
        import traceback
        traceback.print_exc()
        return False


def step2_preprocess_data():
    """Step 2: Preprocess training data.

    Runs ``DataPreprocessor().preprocess()`` and logs how many train
    queries, test queries and train mappings the returned mapping holds.
    This step is best-effort: any exception is logged as a warning and the
    setup continues without training data.

    Returns:
        True in every case.
    """
    logger.info("\n" + "="*60)
    logger.info("STEP 2: Preprocessing Training Data")
    logger.info("="*60)

    try:
        from src.preprocess import DataPreprocessor

        data = DataPreprocessor().preprocess()

        train_queries = data.get('train_queries', [])
        logger.info(f"βœ“ Preprocessed {len(train_queries)} train queries")

        test_queries = data.get('test_queries', [])
        logger.info(f"βœ“ Preprocessed {len(test_queries)} test queries")

        train_mapping = data.get('train_mapping', {})
        logger.info(f"βœ“ Created {len(train_mapping)} train mappings")
    except Exception as e:
        # Training data is optional for the rest of the setup.
        logger.warning(f"⚠ Preprocessing skipped: {e}")
        logger.info("βœ“ Continuing without training data")

    return True


def step3_build_index():
    """Step 3: Generate embeddings and build FAISS index.

    Delegates to ``EmbeddingGenerator().build_index()`` and logs the size of
    the resulting index, the embeddings' shape and the mapping length.

    Returns:
        True on success; False if anything raised (the error is logged and
        the traceback printed).
    """
    separator = "="*60
    logger.info("\n" + separator)
    logger.info("STEP 3: Building Search Index")
    logger.info(separator)
    logger.info("Downloading models and creating embeddings...")

    try:
        from src.embedder import EmbeddingGenerator

        # Per the original author's note, build_index() runs the whole
        # pipeline (loads catalog, generates embeddings, saves artifacts).
        faiss_index, vectors, id_map = EmbeddingGenerator().build_index()

        logger.info(f"βœ“ Built FAISS index with {faiss_index.ntotal} vectors")
        logger.info(f"βœ“ Embeddings shape {vectors.shape}; Mappings {len(id_map)}")
        return True
    except Exception as e:
        logger.error(f"βœ— Failed to build index: {e}")
        import traceback
        traceback.print_exc()
        return False


def step4_run_evaluation():
    """Step 4: Run evaluation on training set.

    Re-runs preprocessing to obtain the train mapping, loads the
    recommender index and evaluates it with ``k=10``, then prints and saves
    the report. Evaluation is optional: with no training data, or when any
    exception is raised, the step is logged as skipped.

    Returns:
        False only when the recommender reports that its index could not be
        loaded; True otherwise (including when evaluation is skipped).
    """
    separator = "="*60
    logger.info("\n" + separator)
    logger.info("STEP 4: Running Evaluation")
    logger.info(separator)

    try:
        from src.evaluator import RecommenderEvaluator
        from src.recommender import AssessmentRecommender
        from src.preprocess import DataPreprocessor

        data = DataPreprocessor().preprocess()
        train_mapping = data.get('train_mapping', {})

        # Nothing to evaluate against: report readiness and stop here.
        if not train_mapping:
            logger.warning("⚠ No training data available, skipping evaluation")
            logger.info("βœ“ System ready (evaluation skipped)")
            return True

        recommender = AssessmentRecommender()
        index_loaded = recommender.load_index()
        if not index_loaded:
            logger.error("βœ— Failed to load recommender")
            return False

        evaluator = RecommenderEvaluator()
        results = evaluator.evaluate(recommender, train_mapping, k=10)

        evaluator.print_report()
        evaluator.save_results()

        logger.info("βœ“ Evaluation complete")
        logger.info(f"βœ“ Mean Recall@10: {results['mean_recall_at_10']:.2%}")
        return True
    except Exception as e:
        logger.warning(f"⚠ Evaluation skipped: {e}")
        logger.info("βœ“ System ready (evaluation skipped)")
        return True


def verify_setup():
    """Verify setup completion.

    First checks that the catalog CSV and the three model artifacts exist
    on disk (logging each file's size or that it is missing). If all are
    present, loads the recommender index and logs the number of assessments
    and index vectors, warning when fewer than 50 assessments are loaded.

    Returns:
        True if every file exists and the index loads; False if a file is
        missing, the index fails to load, or any exception is raised.
    """
    separator = "="*60
    logger.info("\n" + separator)
    logger.info("VERIFICATION")
    logger.info(separator)

    expected_artifacts = (
        'data/shl_catalog.csv',
        'models/faiss_index.faiss',
        'models/embeddings.npy',
        'models/mapping.pkl',
    )

    absent = []
    for artifact in expected_artifacts:
        if not os.path.exists(artifact):
            logger.error(f"βœ— {artifact} - MISSING!")
            absent.append(artifact)
            continue
        size = os.path.getsize(artifact)
        logger.info(f"βœ“ {artifact} ({size:,} bytes)")

    if absent:
        logger.error(f"Missing files: {absent}")
        return False

    try:
        from src.recommender import AssessmentRecommender

        recommender = AssessmentRecommender()
        if not recommender.load_index():
            logger.error("βœ— Recommender failed to load index during verification")
            return False

        num_assessments = len(recommender.assessment_mapping)
        # A recommender without an index object counts as zero vectors.
        num_vectors = 0 if recommender.faiss_index is None else recommender.faiss_index.ntotal

        logger.info(f"βœ“ Loaded {num_assessments} assessments")
        logger.info(f"βœ“ Index has {num_vectors} vectors")

        if num_assessments < 50:
            logger.warning(f"⚠ Only {num_assessments} assessments (expected 150+)")

        return True

    except Exception as e:
        logger.error(f"βœ— Verification failed: {e}")
        return False


def main():
    """Main setup process.

    Runs the dependency check, creates the ``data`` and ``models``
    directories, executes the four setup steps in order and finally
    verifies the produced artifacts. A failing "Load Catalog" or
    "Build Index" step aborts the run; failures of the other steps are
    ignored.

    Returns:
        Process exit code: 0 on success, 1 if a critical step or the final
        verification failed.
    """
    separator = "="*60
    logger.info("\n" + separator)
    logger.info("SHL ASSESSMENT RECOMMENDER - SETUP")
    logger.info(separator)

    # Advisory only: the result is not used to stop the run.
    check_dependencies()

    for directory in ('data', 'models'):
        os.makedirs(directory, exist_ok=True)
    logger.info("βœ“ Directories created")

    # Steps whose failure makes continuing pointless.
    critical_steps = ("Load Catalog", "Build Index")
    pipeline = (
        ("Load Catalog", step1_generate_catalog),
        ("Preprocess Data", step2_preprocess_data),
        ("Build Index", step3_build_index),
        ("Run Evaluation", step4_run_evaluation),
    )

    for step_name, step_func in pipeline:
        if step_func():
            continue
        if step_name in critical_steps:
            logger.error(f"βœ— Critical step failed: {step_name}")
            return 1

    verified = verify_setup()
    if not verified:
        logger.error("βœ— Verification failed")
        return 1

    logger.info("\n" + separator)
    logger.info("βœ… SETUP COMPLETE!")
    logger.info(separator)
    logger.info("\nπŸ“Š System Ready for Recommendations")

    return 0


# Script entry point: exit with main()'s return code; an interrupt or any
# unexpected exception is logged and mapped to exit status 1.
if __name__ == "__main__":
    try:
        exit_code = main()
    except KeyboardInterrupt:
        logger.info("\nSetup interrupted")
        sys.exit(1)
    except Exception as e:
        logger.error(f"\nUnexpected error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
    else:
        sys.exit(exit_code)