# NOTE(review): the three lines above this comment in the scrape ("Spaces:" /
# "Runtime error" x2) were a hosting-page status banner, not module source;
# they have been neutralized into this comment so the file parses.
"""
Data Preprocessing Module

This module loads and preprocesses the Gen_AI Dataset.xlsx file,
cleaning queries and creating training mappings.
"""
import pandas as pd
import re
import logging
from typing import Dict, List, Tuple
import os  # NOTE(review): appears unused in this chunk — confirm before removing

# Set up logging for the whole module; handlers/format are configured once here.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger shared by DataPreprocessor and main() below.
logger = logging.getLogger(__name__)
class DataPreprocessor:
    """Preprocesses training and test data from the Gen_AI Dataset workbook.

    Loads the Train-Set/Test-Set sheets from an Excel file, normalizes query
    text, and builds a mapping from each cleaned query to its list of
    assessment URLs.
    """

    def __init__(self, excel_path: str = 'Data/Gen_AI Dataset.xlsx'):
        """
        Args:
            excel_path: Path to the Excel workbook containing the dataset.
        """
        self.excel_path = excel_path
        self.train_df = None      # populated by load_data()
        self.test_df = None       # populated by load_data(); may remain None
        self.train_mapping = {}   # cleaned query -> list of assessment URLs

    @staticmethod
    def _find_column(df: pd.DataFrame, hints: List[str]):
        """Return the first column whose lowercased name contains any hint.

        Args:
            df: DataFrame whose columns are scanned in order.
            hints: Lowercase substrings to look for in the column names.

        Returns:
            The matching column label, or None if nothing matches.
        """
        for col in df.columns:
            col_lower = str(col).lower()
            if any(hint in col_lower for hint in hints):
                return col
        return None

    def _load_sheet(self, xls: pd.ExcelFile, preferred: str, keyword: str):
        """Load `preferred` sheet if present, else the first sheet whose name
        contains `keyword` (case-insensitive). Returns None when neither exists."""
        if preferred in xls.sheet_names:
            df = pd.read_excel(self.excel_path, sheet_name=preferred)
            logger.info(f"Loaded {preferred}: {df.shape}")
            return df
        for sheet in xls.sheet_names:
            if keyword in sheet.lower():
                df = pd.read_excel(self.excel_path, sheet_name=sheet)
                logger.info(f"Loaded {sheet}: {df.shape}")
                return df
        return None

    def load_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load train and test data from the Excel file.

        Returns:
            (train_df, test_df); test_df may be None if no test sheet exists.

        Raises:
            Exception: any error from opening/reading the workbook is
                logged and re-raised.
        """
        try:
            logger.info(f"Loading data from {self.excel_path}")
            xls = pd.ExcelFile(self.excel_path)
            logger.info(f"Available sheets: {xls.sheet_names}")
            self.train_df = self._load_sheet(xls, 'Train-Set', 'train')
            self.test_df = self._load_sheet(xls, 'Test-Set', 'test')
            # Last resort: treat the first sheet as training data.
            if self.train_df is None:
                logger.warning("No train sheet found, loading from first sheet")
                self.train_df = pd.read_excel(self.excel_path, sheet_name=0)
            return self.train_df, self.test_df
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise

    def clean_text(self, text: str) -> str:
        """Lowercase, collapse whitespace, and strip special characters.

        Non-string or NaN input yields an empty string.
        """
        if pd.isna(text) or not isinstance(text, str):
            return ""
        text = text.lower()
        # Collapse runs of whitespace into single spaces.
        text = ' '.join(text.split())
        # Remove special characters but keep word chars and basic punctuation.
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        return text.strip()

    def extract_urls_from_text(self, text: str) -> List[str]:
        """Return all http(s) URLs found in `text`, in order of appearance."""
        if pd.isna(text) or not isinstance(text, str):
            return []
        # A URL runs until whitespace or a comma (commas separate URL lists).
        url_pattern = r'https?://[^\s,]+'
        return re.findall(url_pattern, text)

    def parse_assessment_urls(self, url_column) -> List[str]:
        """Parse assessment URLs from a raw cell value.

        Handles strings containing one or more URLs separated by commas,
        semicolons, pipes, or newlines. NaN yields an empty list.

        Returns:
            De-duplicated URLs in first-seen order (deterministic — the
            original set()-based dedup made the order arbitrary).
        """
        urls = []
        if pd.isna(url_column):
            return urls
        if isinstance(url_column, str):
            # Split on the separators commonly used between URLs.
            parts = re.split(r'[,;\n\|]', url_column)
            for part in parts:
                part = part.strip()
                # Keep the raw fragment when it looks like a URL/SHL link...
                if 'http' in part or 'shl.com' in part:
                    urls.append(part)
                # ...and also anything the strict URL regex extracts from it.
                extracted = self.extract_urls_from_text(part)
                urls.extend(extracted)
        # Ordered de-duplication (dict preserves insertion order in Python 3.7+).
        return list(dict.fromkeys(url.strip() for url in urls if url))

    def create_train_mapping(self) -> Dict[str, List[str]]:
        """Create a mapping from cleaned queries to assessment URL lists.

        Processes every row of the training frame, accumulating URLs for
        repeated queries. Returns {} (instead of raising) when the data is
        missing or the query/URL columns cannot be identified.
        """
        if self.train_df is None:
            logger.error("Train data not loaded")
            return {}
        logger.info("Creating train mapping...")
        self.train_mapping = {}

        # Heuristic column-name fragments used to locate the two columns.
        query_hints = ['query', 'job_description', 'jd', 'description', 'text', 'job query']
        url_hints = ['urls', 'assessment_urls', 'assessment_url', 'relevant_assessments',
                     'assessments', 'links', 'url']

        query_col = self._find_column(self.train_df, query_hints)
        if query_col is not None:
            logger.info(f"Found query column: {query_col}")
        url_col = self._find_column(self.train_df, url_hints)
        if url_col is not None:
            logger.info(f"Found URL column: {url_col}")

        # Fall back to positional columns when the names give no hint.
        if query_col is None and len(self.train_df.columns) > 0:
            query_col = self.train_df.columns[0]
            logger.warning(f"Query column not identified, using: {query_col}")
        if url_col is None and len(self.train_df.columns) > 1:
            url_col = self.train_df.columns[1]
            logger.warning(f"URL column not identified, using: {url_col}")

        # Guard: without both columns the row loop below would raise
        # KeyError(None); fail soft with an empty mapping instead.
        if query_col is None or url_col is None:
            logger.error("Could not identify query and URL columns")
            return {}

        # Process ALL rows to create mappings.
        for idx, row in self.train_df.iterrows():
            query = self.clean_text(str(row[query_col]))
            url_value = str(row[url_col])
            # Skip rows with an empty/NaN query or URL cell.
            if not query or query in ['nan', 'none', '']:
                continue
            if not url_value or url_value.lower() in ['nan', 'none', '']:
                continue
            # Parse URLs (handles multiple URLs separated by commas, etc.).
            urls = self.parse_assessment_urls(url_value)
            # If parsing found nothing but the raw cell looks like a URL, keep it.
            if not urls and 'http' in url_value:
                urls = [url_value.strip()]
            # Accumulate URLs for repeated queries, avoiding duplicates.
            if urls:
                bucket = self.train_mapping.setdefault(query, [])
                for url in urls:
                    if url not in bucket:
                        bucket.append(url)

        logger.info(f"Created {len(self.train_mapping)} query-URL mappings")
        logger.info(f"Total URL entries: {sum(len(v) for v in self.train_mapping.values())}")
        return self.train_mapping

    def get_all_queries(self) -> Tuple[List[str], List[str]]:
        """Return (train_queries, test_queries) as cleaned strings.

        NaN cells are dropped; an empty list is returned for a frame that
        has not been loaded.
        """
        hints = ['query', 'job', 'description', 'text']

        def _queries(df) -> List[str]:
            # Pick the best-guess query column, defaulting to the first one.
            if df is None:
                return []
            col = self._find_column(df, hints)
            if col is None:
                col = df.columns[0]
            return [self.clean_text(str(q)) for q in df[col] if not pd.isna(q)]

        train_queries = _queries(self.train_df)
        test_queries = _queries(self.test_df)
        logger.info(f"Extracted {len(train_queries)} train queries and {len(test_queries)} test queries")
        return train_queries, test_queries

    def preprocess(self) -> Dict:
        """Main preprocessing pipeline: load, map, and collect queries.

        Returns:
            Dict with keys 'train_queries', 'test_queries', 'train_mapping',
            'train_df', 'test_df'.
        """
        self.load_data()
        self.create_train_mapping()
        train_queries, test_queries = self.get_all_queries()
        # Summary
        logger.info("Preprocessing complete:")
        logger.info(f"  Train queries: {len(train_queries)}")
        logger.info(f"  Test queries: {len(test_queries)}")
        logger.info(f"  Train mappings: {len(self.train_mapping)}")
        return {
            'train_queries': train_queries,
            'test_queries': test_queries,
            'train_mapping': self.train_mapping,
            'train_df': self.train_df,
            'test_df': self.test_df
        }
def main():
    """Entry point: run the full preprocessing pipeline and print a summary."""
    result = DataPreprocessor().preprocess()

    # Headline counts.
    print("\n=== Preprocessing Summary ===")
    print(f"Train queries: {len(result['train_queries'])}")
    print(f"Test queries: {len(result['test_queries'])}")
    print(f"Train mappings: {len(result['train_mapping'])}")

    # Peek at one example of each artifact, when available.
    queries = result['train_queries']
    if queries:
        print(f"\nSample train query: {queries[0][:100]}...")

    mapping = result['train_mapping']
    if mapping:
        sample_key = next(iter(mapping))
        print(f"\nSample mapping:")
        print(f"  Query: {sample_key[:80]}...")
        print(f"  URLs: {mapping[sample_key][:2]}")

    return result
# Run the preprocessing pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()