Spaces:
Sleeping
Sleeping
"""Preprocessing utilities for job_dataset_merged.csv.

This script provides a function `process_file` and a CLI entry point to:

- read a CSV
- clean the `tech_stack` column by replacing semicolons with commas,
  collapsing whitespace, and prefixing each value with
  "Skills required for this job: "
- write the result to a new CSV (or overwrite the input in place)

Usage examples:

    python preprocess.py --input backend/Dataset/job_dataset_merged.csv
    python preprocess.py --input backend/Dataset/job_dataset_merged.csv --inplace

The script uses pandas. If pandas is not installed, install it with:

    pip install pandas
"""
| from __future__ import annotations | |
| import argparse | |
| import re | |
| from pathlib import Path | |
| import sys | |
| import pandas as pd | |
def clean_tech_stack(value: object) -> object:
    """Clean a single ``tech_stack`` value.

    Steps:
    - If ``value`` is missing (``pd.isna``), return it unchanged.
    - Convert to string and replace every semicolon with a comma.
    - Collapse runs of whitespace into single spaces and strip the ends.
    - Drop a leading "Skills required for this job:" if one is already
      present, so cleaning an already-cleaned value does not stack prefixes.
    - Prefix the result with "Skills required for this job: ".

    Args:
        value: Raw cell value from the ``tech_stack`` column.

    Returns:
        The missing value unchanged, or the prefixed, normalized string.
        An empty or whitespace-only input yields just the prefix.
    """
    if pd.isna(value):
        return value

    prefix = "Skills required for this job: "

    # Semicolons become commas; any whitespace run becomes a single space.
    text = re.sub(r"\s+", ' ', str(value).replace(';', ',')).strip()

    # Compare against the prefix without its trailing space: the strip()
    # above removes that space when the value is nothing but the prefix.
    marker = prefix.rstrip()
    if text.startswith(marker):
        text = text[len(marker):].lstrip()

    return prefix + text
def process_file(input_path: str | Path, output_path: str | Path | None = None, inplace: bool = False) -> str:
    """Read a CSV, clean its ``tech_stack`` column, and write the result.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Destination CSV path. Ignored when ``inplace`` is true.
            When omitted (and not ``inplace``), the file is written next to
            the input as ``<stem>_preprocessed<suffix>``.
        inplace: If true, overwrite ``input_path`` with the cleaned data.

    Returns:
        The path that was written, as a string.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        KeyError: If the CSV has no ``tech_stack`` column.
    """
    input_path = Path(input_path)
    if not input_path.exists():
        raise FileNotFoundError(f"Input file not found: {input_path}")

    df = pd.read_csv(input_path)
    if 'tech_stack' not in df.columns:
        raise KeyError("Column 'tech_stack' not found in input CSV")

    df['tech_stack'] = df['tech_stack'].apply(clean_tech_stack)

    # Resolve the destination once; ``inplace`` takes precedence over an
    # explicit ``output_path``.
    if inplace:
        target = input_path
    elif output_path is None:
        target = input_path.with_name(input_path.stem + '_preprocessed' + input_path.suffix)
    else:
        target = output_path

    df.to_csv(target, index=False)
    return str(target)
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments for the preprocessing CLI.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Namespace with ``input`` (required), ``output`` (``None`` when not
        given) and ``inplace`` (``False`` unless the flag is passed).
    """
    parser = argparse.ArgumentParser(description="Preprocess job dataset 'tech_stack' column")
    parser.add_argument('--input', '-i', required=True, help='Path to input CSV')
    parser.add_argument('--output', '-o', required=False, help='Path to output CSV (default: input_preprocessed.csv)')
    parser.add_argument('--inplace', action='store_true', help='Overwrite the input file')
    return parser.parse_args(argv)
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: parse arguments, process the file, report the result.

    Args:
        argv: Argument list forwarded to ``_parse_args``; ``None`` means the
            process command line.

    Raises:
        Exception: Any error from ``process_file`` is summarized on stderr
            and then re-raised unchanged.
    """
    args = _parse_args(argv)
    try:
        out = process_file(args.input, args.output, args.inplace)
    except Exception as e:
        # Emit a short one-line summary, then let the original exception
        # propagate to the caller.
        print(f"Error: {e}", file=sys.stderr)
        raise
    else:
        print(f"Wrote: {out}")
# Run the CLI only when executed as a script, not when imported.
if __name__ == '__main__':
    main()