"""Preprocessing utilities for job_dataset_merged.csv
This script provides a function `process_file` and a CLI entrypoint to:
- read a CSV
- clean the `tech_stack` column by replacing semicolons with commas, collapsing
  whitespace, and prefixing each value with "Skills required for this job: "
- write the result to a new CSV (or inplace)
Usage examples:
python preprocess.py --input backend/Dataset/job_dataset_merged.csv
python preprocess.py --input backend/Dataset/job_dataset_merged.csv --inplace
The script uses pandas. If pandas is not installed, install it with:
pip install pandas
"""
from __future__ import annotations
import argparse
import re
from pathlib import Path
import sys
import pandas as pd
def clean_tech_stack(value: object) -> object:
    """Clean a single tech_stack value.

    Steps:
    - If value is NaN, return it unchanged.
    - Convert to string, replace semicolons with commas.
    - Collapse multiple whitespace into single spaces and strip ends.
    - Prefix the resulting string with the required sentence.

    Returns the cleaned string, or the original value if it was NaN.
    """
    if pd.isna(value):
        return value
    s = str(value)
    # normalise separators: the raw data delimits skills with ';'
    s = s.replace(';', ',')
    # collapse any run of whitespace into a single space
    s = re.sub(r"\s+", ' ', s).strip()
    prefix = "Skills required for this job: "
    # an empty/whitespace-only value still gets the bare prefix
    return prefix + s if s else prefix
def process_file(input_path: str | Path, output_path: str | Path | None = None, inplace: bool = False) -> str:
    """Clean the `tech_stack` column of the CSV at *input_path* and write it out.

    When *inplace* is true the input file is overwritten; otherwise the result
    is written to *output_path*, defaulting to `<stem>_preprocessed<suffix>`
    next to the input.

    Returns the path that was written, as a string.
    Raises FileNotFoundError if the input is missing and KeyError if the
    `tech_stack` column is absent.
    """
    src = Path(input_path)
    if not src.exists():
        raise FileNotFoundError(f"Input file not found: {src}")
    frame = pd.read_csv(src)
    if 'tech_stack' not in frame.columns:
        raise KeyError("Column 'tech_stack' not found in input CSV")
    frame['tech_stack'] = frame['tech_stack'].apply(clean_tech_stack)
    # choose the destination: inplace wins, then an explicit output path,
    # then the derived "<stem>_preprocessed<suffix>" sibling of the input
    if inplace:
        destination = src
    elif output_path is not None:
        destination = output_path
    else:
        destination = src.with_name(src.stem + '_preprocessed' + src.suffix)
    frame.to_csv(destination, index=False)
    return str(destination)
def _parse_args(argv=None):
p = argparse.ArgumentParser(description="Preprocess job dataset 'tech_stack' column")
p.add_argument('--input', '-i', required=True, help='Path to input CSV')
p.add_argument('--output', '-o', required=False, help='Path to output CSV (default: input_preprocessed.csv)')
p.add_argument('--inplace', action='store_true', help='Overwrite the input file')
return p.parse_args(argv)
def main(argv=None):
    """CLI entrypoint: parse arguments, run process_file, report the result."""
    args = _parse_args(argv)
    try:
        written = process_file(args.input, args.output, args.inplace)
    except Exception as exc:
        # surface the failure on stderr, then let the exception propagate
        print(f"Error: {exc}", file=sys.stderr)
        raise
    else:
        print(f"Wrote: {written}")
# Run the CLI only when executed as a script (stray " |" extraction residue removed).
if __name__ == '__main__':
    main()