Spaces:
Sleeping
Sleeping
| from datasets import load_dataset | |
| import pandas as pd | |
| import os | |
| import joblib | |
| import logging | |
| from config import DATASETS | |
| from utils.text_processing import clean_text | |
| # Configure the root logger at INFO level at import time (basicConfig does nothing if the root logger already has handlers) | |
| # and create the module-level logger used by load_all_datasets. | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
# Column-standardization rules, checked in order: the first rule whose source
# columns are ALL present in the frame wins. Each rule is
# (rename mapping, value written to the 'type' column).
_HF_COLUMN_RULES = (
    ({'resume': 'text', 'job_description': 'jd_text'}, 'resume_jd_pair'),
    ({'resume': 'text'}, 'resume'),
    ({'job_description': 'text'}, 'job_description'),
)
_KAGGLE_COLUMN_RULES = (
    ({'Resume': 'text'}, 'resume'),
    ({'Description': 'text'}, 'job_description'),
    ({'job_desc': 'text'}, 'job_description'),
)


def _standardize_columns(df, rules):
    """Apply the first matching rename rule to *df* and tag it with a 'type' column.

    Frames that match no rule are returned unchanged (no 'type' column added).
    """
    for renames, record_type in rules:
        if all(column in df.columns for column in renames):
            df = df.rename(columns=renames)
            df['type'] = record_type
            return df
    return df


def _hf_dataset_to_frame(dataset):
    """Convert the object returned by ``load_dataset`` into a DataFrame of records.

    The 'train' split is used when present. Otherwise the first available split
    is used; building a DataFrame from the whole split mapping would produce
    one column per split name rather than a table of records.
    """
    if 'train' in dataset:
        return pd.DataFrame(dataset['train'])

    # NOTE(review): assumes load_dataset returned a split-name -> dataset
    # mapping (its documented return when no split is requested) — confirm.
    split_names = list(dataset)
    if not split_names:
        return pd.DataFrame()
    fallback = split_names[0]
    logger.warning("No 'train' split found; using split '%s' instead", fallback)
    return pd.DataFrame(dataset[fallback])


def load_all_datasets():
    """Load every configured dataset into one DataFrame with a shared schema.

    Two sources are read:

    * Hugging Face: each entry of ``DATASETS`` that contains a "/" is loaded
      with ``load_dataset``; entries without a "/" are skipped.
    * Kaggle: every ``*.csv`` file directly inside ``data/kaggle`` (if that
      directory exists), in sorted filename order.

    Recognized resume / job-description columns are renamed to ``text``
    (and ``jd_text`` for paired data) and a ``type`` column records what the
    row holds. A source that fails to load is logged with its traceback and
    skipped, so one bad dataset does not abort the rest.

    Returns:
        pd.DataFrame: all loaded records concatenated with a fresh index.
        When the result is non-empty and has a ``text`` column, a
        ``cleaned_text`` column holding ``clean_text(text)`` is added.
        An empty DataFrame is returned when nothing could be loaded.
    """
    # Collect frames and concatenate once at the end; concatenating inside the
    # loop re-copies all previously loaded rows on every iteration.
    frames = []

    # Load Hugging Face datasets
    for ds_name in DATASETS:
        if "/" not in ds_name:  # not a HF dataset id
            continue
        try:
            logger.info("Loading Hugging Face dataset: %s", ds_name)
            df = _standardize_columns(
                _hf_dataset_to_frame(load_dataset(ds_name)),
                _HF_COLUMN_RULES,
            )
        except Exception as e:  # best-effort boundary: skip this source, keep going
            logger.exception("Error loading %s: %s", ds_name, e)
            continue
        frames.append(df)
        logger.info("Loaded %d records from %s", len(df), ds_name)

    # Load Kaggle datasets
    kaggle_path = "data/kaggle"
    # isdir (not exists) so a stray regular file at this path cannot make
    # os.listdir raise outside the per-file error handling.
    if os.path.isdir(kaggle_path):
        # Sorted so the row order of the result is reproducible across runs.
        for file_name in sorted(os.listdir(kaggle_path)):
            if not file_name.endswith('.csv'):
                continue
            try:
                file_path = os.path.join(kaggle_path, file_name)
                logger.info("Loading Kaggle dataset: %s", file_name)
                df = _standardize_columns(pd.read_csv(file_path), _KAGGLE_COLUMN_RULES)
            except Exception as e:  # best-effort boundary: skip this file, keep going
                logger.exception("Error loading Kaggle dataset %s: %s", file_name, e)
                continue
            frames.append(df)
            logger.info("Loaded %d records from %s", len(df), file_name)

    all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Clean text
    # NOTE(review): rows from sources without a recognized text column hold NaN
    # in 'text' after concatenation — confirm clean_text tolerates non-string input.
    if not all_data.empty and 'text' in all_data.columns:
        all_data['cleaned_text'] = all_data['text'].apply(clean_text)

    return all_data