# src/preprocess.py
"""
Data Preprocessing Module
This module loads and preprocesses the Gen_AI Dataset.xlsx file,
cleaning queries and creating training mappings.
"""
import pandas as pd
import re
import logging
from typing import Dict, List, Tuple
import os
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class DataPreprocessor:
    """Preprocesses training and test data from the Gen_AI Dataset workbook.

    Loads the train/test sheets from an Excel file, normalizes query text,
    and builds a mapping from cleaned queries to their assessment URLs.
    """

    def __init__(self, excel_path: str = 'Data/Gen_AI Dataset.xlsx'):
        self.excel_path = excel_path
        self.train_df = None      # populated by load_data()
        self.test_df = None       # populated by load_data(); may remain None
        self.train_mapping = {}   # cleaned query -> list of assessment URLs

    def _load_sheet(self, xls: pd.ExcelFile, exact_name: str, keyword: str):
        """Load the sheet named *exact_name*, or the first sheet whose name
        contains *keyword* (case-insensitive). Returns None if no match."""
        if exact_name in xls.sheet_names:
            df = pd.read_excel(self.excel_path, sheet_name=exact_name)
            logger.info(f"Loaded {exact_name}: {df.shape}")
            return df
        for sheet in xls.sheet_names:
            if keyword in sheet.lower():
                df = pd.read_excel(self.excel_path, sheet_name=sheet)
                logger.info(f"Loaded {sheet}: {df.shape}")
                return df
        return None

    def load_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load train and test data from the Excel file.

        Prefers sheets named 'Train-Set' / 'Test-Set', falls back to any
        sheet whose name contains 'train' / 'test', and finally to the
        first sheet for training data.

        Returns:
            (train_df, test_df); test_df may be None if no test sheet exists.

        Raises:
            Exception: any pandas/IO error is logged and re-raised.
        """
        try:
            logger.info(f"Loading data from {self.excel_path}")
            xls = pd.ExcelFile(self.excel_path)
            logger.info(f"Available sheets: {xls.sheet_names}")
            self.train_df = self._load_sheet(xls, 'Train-Set', 'train')
            self.test_df = self._load_sheet(xls, 'Test-Set', 'test')
            if self.train_df is None:
                # Last resort: assume the first sheet holds the training data.
                logger.warning("No train sheet found, loading from first sheet")
                self.train_df = pd.read_excel(self.excel_path, sheet_name=0)
            return self.train_df, self.test_df
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise

    def clean_text(self, text: str) -> str:
        """Lowercase *text*, drop disallowed characters, collapse whitespace.

        Keeps word characters, whitespace, and basic punctuation (.,!?-).
        Returns "" for NaN or non-string input.
        """
        if pd.isna(text) or not isinstance(text, str):
            return ""
        text = text.lower()
        # Remove special characters but keep basic punctuation.
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        # Collapse whitespace AFTER character removal; the original collapsed
        # first, so removed characters left double spaces in the result.
        return ' '.join(text.split())

    def extract_urls_from_text(self, text: str) -> List[str]:
        """Return all http(s) URLs found in *text* ([] for NaN/non-string)."""
        if pd.isna(text) or not isinstance(text, str):
            return []
        # URLs run until whitespace or a comma (commas separate entries).
        url_pattern = r'https?://[^\s,]+'
        return re.findall(url_pattern, text)

    def parse_assessment_urls(self, url_column) -> List[str]:
        """Parse assessment URLs from a cell value in various formats.

        Accepts a string containing one or more URLs separated by commas,
        semicolons, newlines, or pipes. Returns a de-duplicated list of
        URLs in first-seen order ([] for NaN input).
        """
        if pd.isna(url_column):
            return []
        urls: List[str] = []
        if isinstance(url_column, str):
            # Split by common separators, then collect both the raw part and
            # any URLs embedded inside it.
            for part in re.split(r'[,;\n\|]', url_column):
                part = part.strip()
                if 'http' in part or 'shl.com' in part:
                    urls.append(part)
                urls.extend(self.extract_urls_from_text(part))
        # Order-preserving dedup; set() made the output order
        # non-deterministic across runs.
        return list(dict.fromkeys(url.strip() for url in urls if url))

    def _find_column(self, df: pd.DataFrame, candidates: List[str]):
        """Return the first column whose (stringified) lowercase name contains
        any candidate substring, or None."""
        for col in df.columns:
            # str() guards against non-string column labels (e.g. ints).
            col_lower = str(col).lower()
            if any(c in col_lower for c in candidates):
                return col
        return None

    def create_train_mapping(self) -> Dict[str, List[str]]:
        """Create the mapping from cleaned queries to assessment URLs.

        Processes every row of train_df; rows with missing queries or URLs
        are skipped, and URLs for repeated queries accumulate without
        duplicates.

        Returns:
            Dict mapping cleaned query text to a list of unique URLs
            (empty when data is not loaded or columns cannot be resolved).
        """
        if self.train_df is None:
            logger.error("Train data not loaded")
            return {}
        logger.info("Creating train mapping...")
        self.train_mapping = {}
        query_candidates = ['query', 'job_description', 'jd', 'description', 'text', 'job query']
        url_candidates = ['urls', 'assessment_urls', 'assessment_url', 'relevant_assessments', 'assessments', 'links', 'url']
        query_col = self._find_column(self.train_df, query_candidates)
        if query_col is not None:
            logger.info(f"Found query column: {query_col}")
        url_col = self._find_column(self.train_df, url_candidates)
        if url_col is not None:
            logger.info(f"Found URL column: {url_col}")
        # Fall back to positional columns when no name matched.
        if query_col is None and len(self.train_df.columns) > 0:
            query_col = self.train_df.columns[0]
            logger.warning(f"Query column not identified, using: {query_col}")
        if url_col is None and len(self.train_df.columns) > 1:
            url_col = self.train_df.columns[1]
            logger.warning(f"URL column not identified, using: {url_col}")
        # BUG FIX: the original proceeded with url_col=None on single-column
        # frames and crashed on row[None]; bail out explicitly instead.
        if query_col is None or url_col is None:
            logger.error("Could not resolve query/URL columns; no mapping created")
            return {}
        for _, row in self.train_df.iterrows():
            query = self.clean_text(str(row[query_col]))
            url_value = str(row[url_col])
            # Skip rows whose query or URL cell is empty / NaN-like.
            if not query or query in ('nan', 'none'):
                continue
            if not url_value or url_value.lower() in ('nan', 'none', ''):
                continue
            urls = self.parse_assessment_urls(url_value)
            # If parsing found nothing but the cell looks like a URL, keep it raw.
            if not urls and 'http' in url_value:
                urls = [url_value.strip()]
            if not urls:
                continue
            # Accumulate URLs for repeated queries without duplicates.
            bucket = self.train_mapping.setdefault(query, [])
            for url in urls:
                if url not in bucket:
                    bucket.append(url)
        logger.info(f"Created {len(self.train_mapping)} query-URL mappings")
        logger.info(f"Total URL entries: {sum(len(v) for v in self.train_mapping.values())}")
        return self.train_mapping

    def _extract_queries(self, df: pd.DataFrame) -> List[str]:
        """Return cleaned query strings from *df*'s best-guess query column
        (first column when no name matches)."""
        query_col = self._find_column(df, ['query', 'job', 'description', 'text'])
        if query_col is None:
            query_col = df.columns[0]
        return [
            self.clean_text(str(q))
            for q in df[query_col]
            if not pd.isna(q)
        ]

    def get_all_queries(self) -> Tuple[List[str], List[str]]:
        """Return (train_queries, test_queries) as cleaned strings.

        Either list is empty when the corresponding DataFrame is not loaded.
        """
        train_queries = self._extract_queries(self.train_df) if self.train_df is not None else []
        test_queries = self._extract_queries(self.test_df) if self.test_df is not None else []
        logger.info(f"Extracted {len(train_queries)} train queries and {len(test_queries)} test queries")
        return train_queries, test_queries

    def preprocess(self) -> Dict:
        """Run the full pipeline: load data, build mapping, extract queries.

        Returns:
            Dict with keys 'train_queries', 'test_queries', 'train_mapping',
            'train_df', 'test_df'.
        """
        self.load_data()
        self.create_train_mapping()
        train_queries, test_queries = self.get_all_queries()
        logger.info("Preprocessing complete:")
        logger.info(f" Train queries: {len(train_queries)}")
        logger.info(f" Test queries: {len(test_queries)}")
        logger.info(f" Train mappings: {len(self.train_mapping)}")
        return {
            'train_queries': train_queries,
            'test_queries': test_queries,
            'train_mapping': self.train_mapping,
            'train_df': self.train_df,
            'test_df': self.test_df
        }
def main():
    """Run the preprocessing pipeline and print a short summary."""
    result = DataPreprocessor().preprocess()

    print("\n=== Preprocessing Summary ===")
    print(f"Train queries: {len(result['train_queries'])}")
    print(f"Test queries: {len(result['test_queries'])}")
    print(f"Train mappings: {len(result['train_mapping'])}")

    # Show a sample query and a sample query->URLs mapping, when available.
    train_queries = result['train_queries']
    if train_queries:
        print(f"\nSample train query: {train_queries[0][:100]}...")
    mapping = result['train_mapping']
    if mapping:
        sample_key = next(iter(mapping))
        print(f"\nSample mapping:")
        print(f"  Query: {sample_key[:80]}...")
        print(f"  URLs: {mapping[sample_key][:2]}")
    return result


if __name__ == "__main__":
    main()