# src/preprocess.py
"""
Data Preprocessing Module
This module loads and preprocesses the Gen_AI Dataset.xlsx file,
cleaning queries and creating training mappings.
"""
import pandas as pd
import re
import logging
from typing import Dict, List, Tuple
import os
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class DataPreprocessor:
    """Preprocesses training and test data from the Gen_AI Dataset workbook.

    Loads the train/test sheets from an Excel file, normalizes query text,
    and builds a mapping from cleaned queries to their assessment URLs.
    """

    def __init__(self, excel_path: str = 'Data/Gen_AI Dataset.xlsx'):
        self.excel_path = excel_path
        self.train_df = None      # populated by load_data()
        self.test_df = None       # populated by load_data(); may remain None
        self.train_mapping = {}   # cleaned query -> list of assessment URLs

    def _load_sheet(self, xls: pd.ExcelFile, exact_name: str, keyword: str):
        """Load the sheet named *exact_name*, or the first sheet whose name
        contains *keyword* (case-insensitive). Returns None if no match."""
        if exact_name in xls.sheet_names:
            df = pd.read_excel(self.excel_path, sheet_name=exact_name)
            logger.info(f"Loaded {exact_name}: {df.shape}")
            return df
        for sheet in xls.sheet_names:
            if keyword in sheet.lower():
                df = pd.read_excel(self.excel_path, sheet_name=sheet)
                logger.info(f"Loaded {sheet}: {df.shape}")
                return df
        return None

    def load_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load train and test data from the Excel file.

        Prefers sheets named 'Train-Set' / 'Test-Set', falls back to any
        sheet whose name contains 'train' / 'test', and finally to the
        first sheet for training data.

        Returns:
            (train_df, test_df); test_df may be None if no test sheet exists.

        Raises:
            Exception: any pandas/IO error is logged and re-raised.
        """
        try:
            logger.info(f"Loading data from {self.excel_path}")
            xls = pd.ExcelFile(self.excel_path)
            logger.info(f"Available sheets: {xls.sheet_names}")
            self.train_df = self._load_sheet(xls, 'Train-Set', 'train')
            self.test_df = self._load_sheet(xls, 'Test-Set', 'test')
            if self.train_df is None:
                # Last resort: assume the first sheet holds the training data.
                logger.warning("No train sheet found, loading from first sheet")
                self.train_df = pd.read_excel(self.excel_path, sheet_name=0)
            return self.train_df, self.test_df
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise

    def clean_text(self, text: str) -> str:
        """Lowercase *text*, drop disallowed characters, collapse whitespace.

        Keeps word characters, whitespace, and basic punctuation (.,!?-).
        Returns "" for NaN or non-string input.
        """
        if pd.isna(text) or not isinstance(text, str):
            return ""
        text = text.lower()
        # Remove special characters but keep basic punctuation.
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        # Collapse whitespace AFTER character removal; the original collapsed
        # first, so removed characters left double spaces in the result.
        return ' '.join(text.split())

    def extract_urls_from_text(self, text: str) -> List[str]:
        """Return all http(s) URLs found in *text* ([] for NaN/non-string)."""
        if pd.isna(text) or not isinstance(text, str):
            return []
        # URLs run until whitespace or a comma (commas separate entries).
        url_pattern = r'https?://[^\s,]+'
        return re.findall(url_pattern, text)

    def parse_assessment_urls(self, url_column) -> List[str]:
        """Parse assessment URLs from a cell value in various formats.

        Accepts a string containing one or more URLs separated by commas,
        semicolons, newlines, or pipes. Returns a de-duplicated list of
        URLs in first-seen order ([] for NaN input).
        """
        if pd.isna(url_column):
            return []
        urls: List[str] = []
        if isinstance(url_column, str):
            # Split by common separators, then collect both the raw part and
            # any URLs embedded inside it.
            for part in re.split(r'[,;\n\|]', url_column):
                part = part.strip()
                if 'http' in part or 'shl.com' in part:
                    urls.append(part)
                urls.extend(self.extract_urls_from_text(part))
        # Order-preserving dedup; set() made the output order
        # non-deterministic across runs.
        return list(dict.fromkeys(url.strip() for url in urls if url))

    def _find_column(self, df: pd.DataFrame, candidates: List[str]):
        """Return the first column whose (stringified) lowercase name contains
        any candidate substring, or None."""
        for col in df.columns:
            # str() guards against non-string column labels (e.g. ints).
            col_lower = str(col).lower()
            if any(c in col_lower for c in candidates):
                return col
        return None

    def create_train_mapping(self) -> Dict[str, List[str]]:
        """Create the mapping from cleaned queries to assessment URLs.

        Processes every row of train_df; rows with missing queries or URLs
        are skipped, and URLs for repeated queries accumulate without
        duplicates.

        Returns:
            Dict mapping cleaned query text to a list of unique URLs
            (empty when data is not loaded or columns cannot be resolved).
        """
        if self.train_df is None:
            logger.error("Train data not loaded")
            return {}
        logger.info("Creating train mapping...")
        self.train_mapping = {}
        query_candidates = ['query', 'job_description', 'jd', 'description', 'text', 'job query']
        url_candidates = ['urls', 'assessment_urls', 'assessment_url', 'relevant_assessments', 'assessments', 'links', 'url']
        query_col = self._find_column(self.train_df, query_candidates)
        if query_col is not None:
            logger.info(f"Found query column: {query_col}")
        url_col = self._find_column(self.train_df, url_candidates)
        if url_col is not None:
            logger.info(f"Found URL column: {url_col}")
        # Fall back to positional columns when no name matched.
        if query_col is None and len(self.train_df.columns) > 0:
            query_col = self.train_df.columns[0]
            logger.warning(f"Query column not identified, using: {query_col}")
        if url_col is None and len(self.train_df.columns) > 1:
            url_col = self.train_df.columns[1]
            logger.warning(f"URL column not identified, using: {url_col}")
        # BUG FIX: the original proceeded with url_col=None on single-column
        # frames and crashed on row[None]; bail out explicitly instead.
        if query_col is None or url_col is None:
            logger.error("Could not resolve query/URL columns; no mapping created")
            return {}
        for _, row in self.train_df.iterrows():
            query = self.clean_text(str(row[query_col]))
            url_value = str(row[url_col])
            # Skip rows whose query or URL cell is empty / NaN-like.
            if not query or query in ('nan', 'none'):
                continue
            if not url_value or url_value.lower() in ('nan', 'none', ''):
                continue
            urls = self.parse_assessment_urls(url_value)
            # If parsing found nothing but the cell looks like a URL, keep it raw.
            if not urls and 'http' in url_value:
                urls = [url_value.strip()]
            if not urls:
                continue
            # Accumulate URLs for repeated queries without duplicates.
            bucket = self.train_mapping.setdefault(query, [])
            for url in urls:
                if url not in bucket:
                    bucket.append(url)
        logger.info(f"Created {len(self.train_mapping)} query-URL mappings")
        logger.info(f"Total URL entries: {sum(len(v) for v in self.train_mapping.values())}")
        return self.train_mapping

    def _extract_queries(self, df: pd.DataFrame) -> List[str]:
        """Return cleaned query strings from *df*'s best-guess query column
        (first column when no name matches)."""
        query_col = self._find_column(df, ['query', 'job', 'description', 'text'])
        if query_col is None:
            query_col = df.columns[0]
        return [
            self.clean_text(str(q))
            for q in df[query_col]
            if not pd.isna(q)
        ]

    def get_all_queries(self) -> Tuple[List[str], List[str]]:
        """Return (train_queries, test_queries) as cleaned strings.

        Either list is empty when the corresponding DataFrame is not loaded.
        """
        train_queries = self._extract_queries(self.train_df) if self.train_df is not None else []
        test_queries = self._extract_queries(self.test_df) if self.test_df is not None else []
        logger.info(f"Extracted {len(train_queries)} train queries and {len(test_queries)} test queries")
        return train_queries, test_queries

    def preprocess(self) -> Dict:
        """Run the full pipeline: load data, build mapping, extract queries.

        Returns:
            Dict with keys 'train_queries', 'test_queries', 'train_mapping',
            'train_df', 'test_df'.
        """
        self.load_data()
        self.create_train_mapping()
        train_queries, test_queries = self.get_all_queries()
        logger.info("Preprocessing complete:")
        logger.info(f" Train queries: {len(train_queries)}")
        logger.info(f" Test queries: {len(test_queries)}")
        logger.info(f" Train mappings: {len(self.train_mapping)}")
        return {
            'train_queries': train_queries,
            'test_queries': test_queries,
            'train_mapping': self.train_mapping,
            'train_df': self.train_df,
            'test_df': self.test_df
        }
def main():
    """Run the preprocessing pipeline and print a short summary."""
    result = DataPreprocessor().preprocess()

    print("\n=== Preprocessing Summary ===")
    print(f"Train queries: {len(result['train_queries'])}")
    print(f"Test queries: {len(result['test_queries'])}")
    print(f"Train mappings: {len(result['train_mapping'])}")

    # Show a sample query and a sample query->URLs mapping, when available.
    train_queries = result['train_queries']
    if train_queries:
        print(f"\nSample train query: {train_queries[0][:100]}...")
    mapping = result['train_mapping']
    if mapping:
        sample_key = next(iter(mapping))
        print(f"\nSample mapping:")
        print(f"  Query: {sample_key[:80]}...")
        print(f"  URLs: {mapping[sample_key][:2]}")
    return result


if __name__ == "__main__":
    main()