File size: 3,082 Bytes
6518a94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""Preprocessing utilities for job_dataset_merged.csv



This script provides a function `process_file` and a CLI entrypoint to:

- read a CSV

- clean the `tech_stack` column by replacing semicolons with commas, collapsing whitespace

  and prefixing each value with "Skills required for this job: "

- write the result to a new CSV (or inplace)



Usage examples:

  python preprocess.py --input backend/Dataset/job_dataset_merged.csv

  python preprocess.py --input backend/Dataset/job_dataset_merged.csv --inplace



The script uses pandas. If pandas is not installed, install it with:

  pip install pandas

"""

from __future__ import annotations
import argparse
import re
from pathlib import Path
import sys
import pandas as pd


def clean_tech_stack(value: object) -> object:
    """Clean a single `tech_stack` cell value.

    Steps:
    - If value is NaN, return it unchanged.
    - Convert to string and replace semicolons with commas.
    - Collapse runs of whitespace into single spaces and strip the ends.
    - Prefix the result with "Skills required for this job: ".

    Parameters
    ----------
    value : object
        Raw cell value from the `tech_stack` column; may be NaN.

    Returns
    -------
    object
        The cleaned, prefixed string, or the original value if it was NaN.
    """
    if pd.isna(value):
        return value
    s = str(value)
    # Semicolon-delimited skill lists become comma-delimited.
    s = s.replace(';', ',')
    # Collapse multiple whitespace characters into one space.
    s = re.sub(r"\s+", ' ', s).strip()
    prefix = "Skills required for this job: "
    # An empty cleaned string still yields the bare prefix (trailing space kept).
    return prefix + s if s else prefix


def process_file(input_path: str | Path, output_path: str | Path | None = None, inplace: bool = False) -> str:
    """Clean the `tech_stack` column of the CSV at *input_path* and write it out.

    When *inplace* is true the input file is overwritten and *output_path*
    is ignored. Otherwise the result is written to *output_path*, which
    defaults to the input name with a ``_preprocessed`` suffix before the
    extension.

    Returns the path that was written, as a string.

    Raises FileNotFoundError if *input_path* does not exist, and KeyError
    if the CSV lacks a `tech_stack` column.
    """
    src = Path(input_path)
    if not src.exists():
        raise FileNotFoundError(f"Input file not found: {src}")

    frame = pd.read_csv(src)
    if 'tech_stack' not in frame.columns:
        raise KeyError("Column 'tech_stack' not found in input CSV")

    frame['tech_stack'] = frame['tech_stack'].apply(clean_tech_stack)

    if inplace:
        frame.to_csv(src, index=False)
        return str(src)

    dest = output_path
    if dest is None:
        dest = src.with_name(src.stem + '_preprocessed' + src.suffix)

    frame.to_csv(dest, index=False)
    return str(dest)


def _parse_args(argv=None):
    p = argparse.ArgumentParser(description="Preprocess job dataset 'tech_stack' column")
    p.add_argument('--input', '-i', required=True, help='Path to input CSV')
    p.add_argument('--output', '-o', required=False, help='Path to output CSV (default: input_preprocessed.csv)')
    p.add_argument('--inplace', action='store_true', help='Overwrite the input file')
    return p.parse_args(argv)


def main(argv=None):
    """CLI entrypoint: parse options, preprocess the CSV, report the result."""
    options = _parse_args(argv)
    try:
        written = process_file(options.input, options.output, options.inplace)
        print(f"Wrote: {written}")
    except Exception as exc:
        # Report on stderr, then re-raise so the process exits nonzero.
        print(f"Error: {exc}", file=sys.stderr)
        raise


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()