Harsh-1132 committed on
Commit
a79d041
·
1 Parent(s): f245081
Files changed (2) hide show
  1. requirements.txt +31 -31
  2. setup.py +58 -88
requirements.txt CHANGED
@@ -1,31 +1,31 @@
1
- # fastapi
2
- # uvicorn
3
- # pandas
4
- # numpy
5
- # scikit-learn
6
- # sentence-transformers
7
- # faiss-cpu
8
- # torch
9
- # transformers
10
- # openpyxl
11
- # beautifulsoup4
12
- # requests
13
- # pydantic
14
- # streamlit
15
- # lxml
16
- # python-multipart
17
- streamlit==1.31.0
18
- fastapi==0.109.0
19
- uvicorn==0.27.0
20
- pandas==2.1.4
21
- numpy==1.26.3
22
- scikit-learn==1.4.0
23
- sentence-transformers==2.3.1
24
- faiss-cpu==1.7.4
25
- torch==2.1.2
26
- transformers==4.37.2
27
- openpyxl==3.1.2
28
- beautifulsoup4==4.12.3
29
- requests==2.31.0
30
- pydantic==2.5.3
31
- python-multipart==0.0.6
 
1
+ fastapi
2
+ uvicorn
3
+ pandas
4
+ numpy
5
+ scikit-learn
6
+ sentence-transformers
7
+ faiss-cpu
8
+ torch
9
+ transformers
10
+ openpyxl
11
+ beautifulsoup4
12
+ requests
13
+ pydantic
14
+ streamlit
15
+ lxml
16
+ python-multipart
17
+ # streamlit==1.31.0
18
+ # fastapi==0.109.0
19
+ # uvicorn==0.27.0
20
+ # pandas==2.1.4
21
+ # numpy==1.26.3
22
+ # scikit-learn==1.4.0
23
+ # sentence-transformers==2.3.1
24
+ # faiss-cpu==1.7.4
25
+ # torch==2.1.2
26
+ # transformers==4.37.2
27
+ # openpyxl==3.1.2
28
+ # beautifulsoup4==4.12.3
29
+ # requests==2.31.0
30
+ # pydantic==2.5.3
31
+ # python-multipart==0.0.6
setup.py CHANGED
@@ -80,98 +80,68 @@ def step1_generate_catalog():
80
  logger.info(f"✓ Loaded {len(df)} assessments from CSV")
81
  return True
82
 
83
- # Priority 2: Generate from Excel
84
  if os.path.exists(excel_path):
85
  logger.info(f"✓ Generating catalog from Excel: {excel_path}")
86
- df = pd.read_excel(excel_path)
87
-
88
- logger.info(f"✓ Excel columns found: {list(df.columns)}")
89
-
90
- # COMPREHENSIVE column mapping - handles ALL variations
91
- column_mapping = {}
92
-
93
- # Find Assessment Name column
94
- for col in df.columns:
95
- col_lower = col.lower().replace(' ', '_').replace('-', '_')
96
 
97
- if 'assessment' in col_lower and 'name' in col_lower:
98
- column_mapping[col] = 'Assessment Name'
99
- elif col_lower in ['assessment_name', 'name', 'assessment']:
100
- column_mapping[col] = 'Assessment Name'
101
-
102
- elif 'assessment' in col_lower and 'url' in col_lower:
103
- column_mapping[col] = 'Assessment URL'
104
- elif col_lower in ['assessment_url', 'url', 'link']:
105
- column_mapping[col] = 'Assessment URL'
106
-
107
- elif 'description' in col_lower:
108
- column_mapping[col] = 'Description'
109
- elif col_lower in ['desc', 'description', 'details']:
110
- column_mapping[col] = 'Description'
111
-
112
- elif 'category' in col_lower:
113
- column_mapping[col] = 'Category'
114
- elif col_lower in ['category', 'cat', 'type', 'group']:
115
- column_mapping[col] = 'Category'
116
-
117
- elif 'test' in col_lower and 'type' in col_lower:
118
- column_mapping[col] = 'Test Type'
119
- elif col_lower in ['test_type', 'testtype', 'assessment_type']:
120
- column_mapping[col] = 'Test Type'
121
-
122
- # Apply mapping
123
- if column_mapping:
124
- df.rename(columns=column_mapping, inplace=True)
125
- logger.info(f"✓ Mapped columns: {column_mapping}")
126
-
127
- # Check what we have now
128
- required_cols = ['Assessment Name', 'Assessment URL', 'Description', 'Category', 'Test Type']
129
- available_cols = [col for col in required_cols if col in df.columns]
130
- missing_cols = [col for col in required_cols if col not in df.columns]
131
-
132
- logger.info(f"✓ Available columns: {available_cols}")
133
-
134
- if missing_cols:
135
- logger.error(f"ERROR - Missing columns: {missing_cols}")
136
- logger.error(f"ERROR - Available columns: {list(df.columns)}")
137
- logger.info("INFO - Attempting to use first 5 columns as fallback...")
138
 
139
- # FALLBACK: Use first 5 columns by position
140
- if len(df.columns) >= 5:
141
- old_cols = list(df.columns)[:5]
142
- df = df.iloc[:, :5]
143
- df.columns = required_cols
144
- logger.info(f"✓ Mapped by position: {old_cols} -> {required_cols}")
145
- elif len(df.columns) >= 3:
146
- # At minimum need: Name, URL, Description
147
- old_cols = list(df.columns)[:3]
148
- df = df.iloc[:, :3]
149
- df.columns = ['Assessment Name', 'Assessment URL', 'Description']
150
- df['Category'] = 'General'
151
- df['Test Type'] = 'K'
152
- logger.info(f"✓ Used first 3 columns with defaults")
153
- else:
154
- logger.error("ERROR - Not enough columns in Excel file, falling back to web scrape")
155
- # Fall through to scrape step below
156
- raise FileNotFoundError("Insufficient Excel columns; use scrape fallback")
157
-
158
- # Verify we have data
159
- if len(df) == 0:
160
- logger.error("ERROR - Excel file is empty, falling back to web scrape")
161
- raise FileNotFoundError("Empty Excel file; use scrape fallback")
162
-
163
- # Clean data
164
- df = df.fillna('')
165
-
166
- # Save to CSV
167
- os.makedirs('data', exist_ok=True)
168
- df.to_csv(csv_path, index=False)
169
- logger.info(f"✓ Saved {len(df)} assessments to {csv_path}")
170
-
171
- # Log sample
172
- logger.info(f"✓ Sample row: {df.iloc[0].to_dict()}")
173
-
174
- return True
175
 
176
  # Priority 3: Scrape from web (last resort)
177
  logger.warning("⚠ No local data found or Excel unusable, scraping SHL website...")
 
80
  logger.info(f"✓ Loaded {len(df)} assessments from CSV")
81
  return True
82
 
83
+ # Priority 2: Try to generate from Excel, and if anything fails, fall back to scraping
84
  if os.path.exists(excel_path):
85
  logger.info(f"✓ Generating catalog from Excel: {excel_path}")
86
+ try:
87
+ df = pd.read_excel(excel_path)
88
+ logger.info(f"✓ Excel columns found: {list(df.columns)}")
 
 
 
 
 
 
 
89
 
90
+ # COMPREHENSIVE column mapping - handles ALL variations
91
+ column_mapping = {}
92
+ for col in df.columns:
93
+ col_lower = col.lower().replace(' ', '_').replace('-', '_')
94
+ if 'assessment' in col_lower and 'name' in col_lower:
95
+ column_mapping[col] = 'Assessment Name'
96
+ elif col_lower in ['assessment_name', 'name', 'assessment']:
97
+ column_mapping[col] = 'Assessment Name'
98
+ elif 'assessment' in col_lower and 'url' in col_lower:
99
+ column_mapping[col] = 'Assessment URL'
100
+ elif col_lower in ['assessment_url', 'url', 'link']:
101
+ column_mapping[col] = 'Assessment URL'
102
+ elif 'description' in col_lower or col_lower in ['desc', 'details']:
103
+ column_mapping[col] = 'Description'
104
+ elif 'category' in col_lower or col_lower in ['cat', 'type', 'group']:
105
+ column_mapping[col] = 'Category'
106
+ elif 'test' in col_lower and 'type' in col_lower or col_lower in ['test_type', 'testtype', 'assessment_type']:
107
+ column_mapping[col] = 'Test Type'
108
+ if column_mapping:
109
+ df.rename(columns=column_mapping, inplace=True)
110
+ logger.info(f"✓ Mapped columns: {column_mapping}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
+ required_cols = ['Assessment Name', 'Assessment URL', 'Description', 'Category', 'Test Type']
113
+ available_cols = [col for col in required_cols if col in df.columns]
114
+ missing_cols = [col for col in required_cols if col not in df.columns]
115
+ logger.info(f"✓ Available columns: {available_cols}")
116
+
117
+ if missing_cols:
118
+ logger.warning(f"⚠ Excel missing columns: {missing_cols} — trying positional fallback")
119
+ if len(df.columns) >= 5:
120
+ old_cols = list(df.columns)[:5]
121
+ df = df.iloc[:, :5]
122
+ df.columns = required_cols
123
+ logger.info(f"✓ Mapped by position: {old_cols} -> {required_cols}")
124
+ elif len(df.columns) >= 3:
125
+ old_cols = list(df.columns)[:3]
126
+ df = df.iloc[:, :3]
127
+ df.columns = ['Assessment Name', 'Assessment URL', 'Description']
128
+ df['Category'] = 'General'
129
+ df['Test Type'] = 'K'
130
+ logger.info("✓ Used first 3 columns with defaults")
131
+ else:
132
+ raise ValueError("Insufficient Excel columns after mapping")
133
+
134
+ if len(df) == 0:
135
+ raise ValueError("Excel file is empty")
136
+
137
+ df = df.fillna('')
138
+ os.makedirs('data', exist_ok=True)
139
+ df.to_csv(csv_path, index=False)
140
+ logger.info(f"✓ Saved {len(df)} assessments to {csv_path}")
141
+ logger.info(f"✓ Sample row: {df.iloc[0].to_dict()}")
142
+ return True
143
+ except Exception as e:
144
+ logger.warning(f"Excel load/mapping failed ({e}); falling back to web scrape...")
 
 
 
145
 
146
  # Priority 3: Scrape from web (last resort)
147
  logger.warning("⚠ No local data found or Excel unusable, scraping SHL website...")