File size: 10,628 Bytes
d18c374
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
"""
Data Preprocessing Module

This module loads and preprocesses the Gen_AI Dataset.xlsx file,
cleaning queries and creating training mappings.
"""

import pandas as pd
import re
import logging
from typing import Dict, List, Tuple
import os

# Configure root logging once at import time; every message carries a
# timestamp and level. Module-level logger used throughout this file.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


class DataPreprocessor:
    """Preprocesses training and test data from the Gen_AI Dataset workbook.

    Loads the Train-Set / Test-Set sheets from an Excel file, normalizes
    query text, and builds a cleaned-query -> assessment-URL mapping that
    downstream training code consumes.
    """

    def __init__(self, excel_path: str = 'Data/Gen_AI Dataset.xlsx'):
        # Path to the source workbook; sheets are read lazily in load_data().
        self.excel_path = excel_path
        self.train_df = None        # populated by load_data()
        self.test_df = None         # may remain None if no test sheet exists
        self.train_mapping = {}     # cleaned query -> list of assessment URLs

    def _load_sheet(self, xls: pd.ExcelFile, preferred: str, keyword: str):
        """Load a sheet by exact name, falling back to a case-insensitive
        substring match on *keyword*. Returns the DataFrame, or None if no
        sheet matches."""
        if preferred in xls.sheet_names:
            df = pd.read_excel(self.excel_path, sheet_name=preferred)
            logger.info("Loaded %s: %s", preferred, df.shape)
            return df
        for sheet in xls.sheet_names:
            if keyword in sheet.lower():
                df = pd.read_excel(self.excel_path, sheet_name=sheet)
                logger.info("Loaded %s: %s", sheet, df.shape)
                return df
        return None

    def load_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Load train and test data from the Excel file.

        Returns:
            (train_df, test_df); test_df may be None when no test sheet
            can be found.

        Raises:
            Whatever pandas raises for a missing/unreadable workbook
            (logged, then re-raised).
        """
        try:
            logger.info("Loading data from %s", self.excel_path)

            xls = pd.ExcelFile(self.excel_path)
            logger.info("Available sheets: %s", xls.sheet_names)

            self.train_df = self._load_sheet(xls, 'Train-Set', 'train')
            self.test_df = self._load_sheet(xls, 'Test-Set', 'test')

            # Last resort: treat the first sheet as training data.
            if self.train_df is None:
                logger.warning("No train sheet found, loading from first sheet")
                self.train_df = pd.read_excel(self.excel_path, sheet_name=0)

            return self.train_df, self.test_df

        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise

    def clean_text(self, text: str) -> str:
        """Lowercase, collapse whitespace, and strip special characters.

        NaN / non-string input yields "". Basic punctuation (. , ! ? -)
        is preserved; everything else non-alphanumeric is removed.
        """
        if pd.isna(text) or not isinstance(text, str):
            return ""

        # Lowercase and collapse all runs of whitespace to single spaces.
        text = ' '.join(text.lower().split())

        # Keep word characters, whitespace and basic punctuation only.
        text = re.sub(r'[^\w\s.,!?-]', '', text)

        return text.strip()

    def extract_urls_from_text(self, text: str) -> List[str]:
        """Return all http(s) URLs embedded in *text* (empty list for
        NaN / non-string input). URLs are delimited by whitespace or commas."""
        if pd.isna(text) or not isinstance(text, str):
            return []

        url_pattern = r'https?://[^\s,]+'
        return re.findall(url_pattern, text)

    def parse_assessment_urls(self, url_column) -> List[str]:
        """Parse assessment URLs from a cell that may hold one URL, several
        URLs separated by , ; | or newlines, or free text containing URLs.

        Returns a de-duplicated list. FIX: the original de-duplicated via
        list(set(...)), which made the URL order nondeterministic between
        runs; first-seen order is now preserved.
        """
        urls: List[str] = []

        if pd.isna(url_column):
            return urls

        if isinstance(url_column, str):
            # Split on the common separators, then harvest both whole parts
            # that look like links and any URLs embedded in the text.
            for part in re.split(r'[,;\n\|]', url_column):
                part = part.strip()
                if 'http' in part or 'shl.com' in part:
                    urls.append(part)
                urls.extend(self.extract_urls_from_text(part))

        # De-duplicate while preserving first-seen order.
        return list(dict.fromkeys(url.strip() for url in urls if url))

    def _find_column(self, df: pd.DataFrame, candidates: List[str]):
        """Return the first column whose lowercased name contains any of the
        *candidates* substrings, or None when nothing matches."""
        for col in df.columns:
            col_lower = str(col).lower()
            if any(c in col_lower for c in candidates):
                return col
        return None

    def create_train_mapping(self) -> Dict[str, List[str]]:
        """Create the mapping from cleaned queries to assessment URLs.

        Processes every training row; URLs for duplicate queries accumulate
        under one key. Returns {} (with an error logged) when the data is
        not loaded or the query/URL columns cannot be resolved.
        """
        if self.train_df is None:
            logger.error("Train data not loaded")
            return {}

        logger.info("Creating train mapping...")
        self.train_mapping = {}

        # Resolve the query and URL columns by header-name heuristics.
        query_col = self._find_column(
            self.train_df,
            ['query', 'job_description', 'jd', 'description', 'text', 'job query'])
        if query_col is not None:
            logger.info("Found query column: %s", query_col)

        url_col = self._find_column(
            self.train_df,
            ['urls', 'assessment_urls', 'assessment_url', 'relevant_assessments',
             'assessments', 'links', 'url'])
        if url_col is not None:
            logger.info("Found URL column: %s", url_col)

        # Positional fallbacks when the headers are unrecognized.
        if query_col is None and len(self.train_df.columns) > 0:
            query_col = self.train_df.columns[0]
            logger.warning("Query column not identified, using: %s", query_col)

        if url_col is None and len(self.train_df.columns) > 1:
            url_col = self.train_df.columns[1]
            logger.warning("URL column not identified, using: %s", url_col)

        # FIX: the original indexed row[None] (KeyError on the first row)
        # when a column could not be resolved at all, e.g. a single-column
        # frame. Bail out cleanly instead.
        if query_col is None or url_col is None:
            logger.error("Could not identify query/URL columns")
            return {}

        # Process ALL rows to create mappings.
        for _, row in self.train_df.iterrows():
            query = self.clean_text(str(row[query_col]))
            url_value = str(row[url_col])

            # Skip rows with unusable queries or URL cells.
            if not query or query in ('nan', 'none'):
                continue
            if not url_value or url_value.lower() in ('nan', 'none'):
                continue

            urls = self.parse_assessment_urls(url_value)

            # If no URLs parsed but the raw cell looks like a link, keep it.
            if not urls and 'http' in url_value:
                urls = [url_value.strip()]

            # Accumulate URLs for repeated queries, skipping duplicates.
            if urls:
                bucket = self.train_mapping.setdefault(query, [])
                for url in urls:
                    if url not in bucket:
                        bucket.append(url)

        logger.info("Created %d query-URL mappings", len(self.train_mapping))
        logger.info("Total URL entries: %d",
                    sum(len(v) for v in self.train_mapping.values()))

        return self.train_mapping

    def get_all_queries(self) -> Tuple[List[str], List[str]]:
        """Return (train_queries, test_queries) as cleaned strings, skipping
        NaN cells. Falls back to the first column when no query-like header
        is found."""
        def queries_from(df) -> List[str]:
            # Shared extraction for both frames; None frame -> no queries.
            if df is None:
                return []
            col = self._find_column(df, ['query', 'job', 'description', 'text'])
            if col is None:
                col = df.columns[0]
            return [self.clean_text(str(q)) for q in df[col] if not pd.isna(q)]

        train_queries = queries_from(self.train_df)
        test_queries = queries_from(self.test_df)

        logger.info("Extracted %d train queries and %d test queries",
                    len(train_queries), len(test_queries))
        return train_queries, test_queries

    def preprocess(self) -> Dict:
        """Main preprocessing pipeline: load the workbook, build the
        query->URL mapping, and extract all queries.

        Returns a dict with keys: train_queries, test_queries,
        train_mapping, train_df, test_df.
        """
        self.load_data()
        self.create_train_mapping()
        train_queries, test_queries = self.get_all_queries()

        logger.info("Preprocessing complete:")
        logger.info("  Train queries: %d", len(train_queries))
        logger.info("  Test queries: %d", len(test_queries))
        logger.info("  Train mappings: %d", len(self.train_mapping))

        return {
            'train_queries': train_queries,
            'test_queries': test_queries,
            'train_mapping': self.train_mapping,
            'train_df': self.train_df,
            'test_df': self.test_df
        }


def main():
    """Run the preprocessing pipeline and print a short summary."""
    result = DataPreprocessor().preprocess()

    print("\n=== Preprocessing Summary ===")
    print(f"Train queries: {len(result['train_queries'])}")
    print(f"Test queries: {len(result['test_queries'])}")
    print(f"Train mappings: {len(result['train_mapping'])}")

    # Show one sample query and one sample mapping, when available.
    queries = result['train_queries']
    if queries:
        print(f"\nSample train query: {queries[0][:100]}...")

    mapping = result['train_mapping']
    if mapping:
        sample_key = next(iter(mapping))
        print(f"\nSample mapping:")
        print(f"  Query: {sample_key[:80]}...")
        print(f"  URLs: {mapping[sample_key][:2]}")

    return result


if __name__ == "__main__":
    main()