| | import argparse
|
| | import yaml
|
| | import subprocess
|
| | import time
|
| | from pathlib import Path
|
"""
total.py - runs the whole pipeline in one go
(text cleaning -> sentence splitting -> sentiment classification
 -> keyword classification -> save final result).

- Role: text cleaning -> sentence splitting -> sentiment classification
  -> keyword classification
- Usage:
    python total.py --config C:/Users/parkm/NLP/config/default.yaml
- The config holds: input/output paths, model path, batch size
- Intermediate results are saved in the configured intermediate_dir
  as step1~step3 CSV files.
"""
|
| |
|
| |
|
| |
|
def run_step(script_path, args):
    """Run one pipeline script as a subprocess and return its duration.

    The script's stdout/stderr are captured and echoed after it finishes.

    Args:
        script_path: Path (or path string) of the Python script to execute.
        args: Iterable of command-line arguments passed to the script.

    Returns:
        Wall-clock time in seconds (float) the subprocess took.

    Raises:
        SystemExit: With code 1 when the script exits with a non-zero
            return code (after printing its captured output).
    """
    import sys  # local import: the file's import header does not provide sys

    script_path = Path(script_path)
    # Use the interpreter that is running this pipeline so every step sees the
    # same environment; a bare 'python' resolves to whatever is first on PATH.
    cmd = [sys.executable or 'python', str(script_path), *(str(a) for a in args)]
    print(f"\n--- Running: {' '.join(cmd)}")

    start = time.perf_counter()
    # errors='replace' keeps a child that prints in a different encoding than
    # the locale default from crashing the runner with UnicodeDecodeError.
    result = subprocess.run(cmd, capture_output=True, text=True, errors='replace')
    elapsed = time.perf_counter() - start

    if result.returncode != 0:
        # Show whatever the step printed before failing; it is often the only
        # hint about how far the step got.
        if result.stdout:
            print(result.stdout)
        print(f"[Error] {script_path.name} 실행 중 오류 발생:\n{result.stderr}")
        # sys.exit instead of the site-provided exit() helper, which is meant
        # for the interactive shell and may be absent (e.g. python -S).
        sys.exit(1)

    print(result.stdout)
    print(f"[Info] {script_path.name} 완료: {elapsed:.2f}s 소요")
    return elapsed
|
| |
|
def main(config_path: Path):
    """Run the four-step pipeline described by a YAML config file.

    Each step is executed as a separate script through ``run_step``:
      1) text_cleaner.py        -> <intermediate_dir>/step1_clean.csv
      2) sentence_splitter.py   -> <intermediate_dir>/step2_split.csv
      3) sentiment.py           -> <intermediate_dir>/step3_sentiment.csv
      4) keyword_classifier.py  -> output_csv

    After all steps finish, the per-step and total durations are printed.

    Args:
        config_path: Path to the YAML config. Keys read here:
            data.input_csv, data.intermediate_dir, data.output_csv,
            data.id_col (optional, default 'ID'),
            data.time_col (optional, default ''),
            paths.scripts_dir, paths.model_dir,
            sentiment.max_length, sentiment.batch_size.

    Raises:
        KeyError: If a required config key is missing.
    """
    cfg = yaml.safe_load(Path(config_path).read_text(encoding='utf-8'))
    data_cfg = cfg['data']
    paths_cfg = cfg['paths']
    senti_cfg = cfg['sentiment']

    input_csv = Path(data_cfg['input_csv'])
    intermediate = Path(data_cfg['intermediate_dir'])
    output_csv = Path(data_cfg['output_csv'])
    scripts_dir = Path(paths_cfg['scripts_dir'])
    model_dir = Path(paths_cfg['model_dir'])

    # A blank `time_col:` in YAML parses to None and a numeric `id_col` to an
    # int; command-line arguments must be strings, so normalise both here.
    id_col = str(data_cfg.get('id_col') or 'ID')
    time_col = str(data_cfg.get('time_col') or '')

    intermediate.mkdir(parents=True, exist_ok=True)
    # Make sure the final step can write its result even into a new folder.
    output_csv.parent.mkdir(parents=True, exist_ok=True)

    total_start = time.perf_counter()
    durations = {}

    # Step 1: text cleaning
    clean_out = intermediate / 'step1_clean.csv'
    durations['clean'] = run_step(
        scripts_dir / 'text_cleaner.py',
        ['--input', str(input_csv), '--output', str(clean_out)],
    )

    # Step 2: sentence splitting
    split_out = intermediate / 'step2_split.csv'
    durations['split'] = run_step(
        scripts_dir / 'sentence_splitter.py',
        [
            '--input', str(clean_out),
            '--output', str(split_out),
            '--id-col', id_col,
            '--time-col', time_col,
            '--text-col', 'cleaned',
            '--output-col', 'divided_comment',
        ],
    )

    # Step 3: sentiment classification
    senti_out = intermediate / 'step3_sentiment.csv'
    durations['sentiment'] = run_step(
        scripts_dir / 'sentiment.py',
        [
            '--input', str(split_out),
            '--output', str(senti_out),
            '--model-dir', str(model_dir),
            '--text-col', 'divided_comment',
            '--output-col', 'sentiment',
            '--max-length', str(senti_cfg['max_length']),
            '--batch-size', str(senti_cfg['batch_size']),
        ],
    )

    # Step 4: keyword classification (writes the final output file)
    durations['keyword'] = run_step(
        scripts_dir / 'keyword_classifier.py',
        [
            '--input', str(senti_out),
            '--output', str(output_csv),
            '--text-col', 'divided_comment',
        ],
    )
    total_elapsed = time.perf_counter() - total_start

    print("\n=== TIME REPORT ===")
    print(f"1) 텍스트 클리닝 : {durations['clean']:.2f}s")
    print(f"2) 문장 분리 : {durations['split']:.2f}s")
    print(f"3) 감성 분류 : {durations['sentiment']:.2f}s")
    print(f"4) 키워드 분류 : {durations['keyword']:.2f}s")
    print("-----------------------------")
    print(f"총 소요 시간 : {total_elapsed:.2f}s")

    print(f"\n 텍스트 클리닝 → 문장 분리 → 감성 분류 → 키워드 분류 끝... Final output : {output_csv}")
|
| |
|
if __name__ == '__main__':
    # CLI entry point: parse --config/-c and run the whole pipeline with it.
    parser = argparse.ArgumentParser(description='한 번에 전체 파이프라인 실행하는 스크립트')
    parser.add_argument(
        '--config', '-c',
        default='C:/Users/parkm/NLP/config/default.yaml',
        # %(default)s keeps the help text in sync with the actual default.
        help='config YAML 파일 경로 (기본: %(default)s)',
    )
    args = parser.parse_args()

    config_file = Path(args.config)
    # Fail with a usage error instead of a raw traceback when the config
    # (notably the machine-specific default path) does not exist.
    if not config_file.is_file():
        parser.error(f"config 파일을 찾을 수 없습니다: {config_file}")
    main(config_file)
|
| |
|