# alsxxxz's picture
# Upload 4 files
# af6ff2d verified
import argparse
import yaml
import subprocess
import time
from pathlib import Path
"""
total.py — runs the whole pipeline in one go (text cleaning → sentence splitting → sentiment classification → keyword classification → save the final result)
• Role: text cleaning → sentence splitting → sentiment classification → keyword classification
• Usage:
python total.py --config C:/Users/parkm/NLP/config/default.yaml
• The config holds: input/output paths, model path, batch size
• Intermediate results are saved in the configured intermediate_dir as step1~step3.csv.
"""
def run_step(script_path, args):
    """
    Run one pipeline script as a child process and return how long it took.

    The child is started with the interpreter that is running this file
    (``sys.executable``), so it uses the same environment and installed
    packages instead of whatever ``python`` happens to be first on PATH.
    The child's stdout/stderr are captured and printed after it finishes.

    Args:
        script_path: ``pathlib.Path`` of the script to execute; its ``.name``
            is used in the log lines.
        args: sequence of command-line argument strings passed to the script.

    Returns:
        Elapsed wall-clock seconds (float), measured with ``time.perf_counter``.

    Raises:
        SystemExit: with status 1 when the child returns a non-zero exit code.
    """
    # Local import keeps this function self-contained; the module header
    # does not import ``sys``.
    import sys

    # Fall back to plain 'python' only if the interpreter path is unavailable
    # (sys.executable can be empty or None in embedded setups).
    interpreter = sys.executable or 'python'
    cmd = [interpreter, str(script_path)] + list(args)
    print(f"\n--- Running: {' '.join(cmd)}")

    start = time.perf_counter()
    result = subprocess.run(cmd, capture_output=True, text=True)
    elapsed = time.perf_counter() - start

    if result.returncode != 0:
        # Show whatever the child printed before failing; scripts often
        # report the real cause on stdout rather than stderr.
        if result.stdout:
            print(result.stdout)
        print(f"[Error] {script_path.name} μ‹€ν–‰ 쀑 였λ₯˜ λ°œμƒ:\n{result.stderr}")
        # sys.exit instead of the site-provided exit(), which is not
        # guaranteed to exist (e.g. when run with -S or frozen).
        sys.exit(1)

    print(result.stdout)
    print(f"[Info] {script_path.name} μ™„λ£Œ: {elapsed:.2f}s μ†Œμš”")
    return elapsed
def main(config_path: Path):
    """
    Run the four-stage pipeline described by a YAML config file.

    Steps:
        1) Load the YAML config and read every setting the pipeline needs.
        2) Create the intermediate directory (and the output file's parent).
        3) Run the four scripts in order through ``run_step``:
           text_cleaner.py -> sentence_splitter.py -> sentiment.py
           -> keyword_classifier.py
        4) Print the per-step and total elapsed time plus a completion message.

    Config keys read:
        data.input_csv, data.intermediate_dir, data.output_csv,
        data.id_col (optional, default 'ID'),
        data.time_col (optional, default ''),
        paths.scripts_dir, paths.model_dir,
        sentiment.max_length, sentiment.batch_size

    Args:
        config_path: path of the YAML config file (read as UTF-8).

    Raises:
        KeyError: when a required config key is missing. All keys are read
            before any step starts, so this happens before any script runs.
    """
    # 1) Load settings.
    cfg = yaml.safe_load(config_path.read_text(encoding='utf-8'))
    data_cfg = cfg['data']
    paths_cfg = cfg['paths']
    senti_cfg = cfg['sentiment']

    # 2) Pull every value out of the config up front so that a missing key
    #    fails immediately instead of after the earlier steps have run.
    input_csv = Path(data_cfg['input_csv'])             # original input CSV
    intermediate = Path(data_cfg['intermediate_dir'])   # folder for intermediate results
    output_csv = Path(data_cfg['output_csv'])           # final output CSV
    scripts_dir = Path(paths_cfg['scripts_dir'])        # location of the step scripts (.py)
    model_dir = Path(paths_cfg['model_dir'])            # model folder (KcELECTRA per the original note)
    id_col = data_cfg.get('id_col', 'ID')
    time_col = data_cfg.get('time_col', '')
    max_length = str(senti_cfg['max_length'])
    batch_size = str(senti_cfg['batch_size'])

    # Create the folders that will receive output if they do not exist yet.
    intermediate.mkdir(parents=True, exist_ok=True)
    output_csv.parent.mkdir(parents=True, exist_ok=True)

    total_start = time.perf_counter()
    durations = {}

    # -- 1) Text cleaning ---------------------------------------------------
    clean_out = intermediate / 'step1_clean.csv'
    durations['clean'] = run_step(
        scripts_dir / 'text_cleaner.py',
        ['--input', str(input_csv), '--output', str(clean_out)],
    )

    # -- 2) Sentence splitting ----------------------------------------------
    # Passes the ID column, the (optional) time column and the 'cleaned'
    # text column; the split text goes to the 'divided_comment' column.
    split_out = intermediate / 'step2_split.csv'
    durations['split'] = run_step(
        scripts_dir / 'sentence_splitter.py',
        [
            '--input', str(clean_out),
            '--output', str(split_out),
            '--id-col', id_col,
            '--time-col', time_col,
            '--text-col', 'cleaned',
            '--output-col', 'divided_comment',
        ],
    )

    # -- 3) Sentiment classification ----------------------------------------
    senti_out = intermediate / 'step3_sentiment.csv'
    durations['sentiment'] = run_step(
        scripts_dir / 'sentiment.py',
        [
            '--input', str(split_out),
            '--output', str(senti_out),
            '--model-dir', str(model_dir),
            '--text-col', 'divided_comment',
            '--output-col', 'sentiment',
            '--max-length', max_length,
            '--batch-size', batch_size,
        ],
    )

    # -- 4) Keyword classification (writes the final CSV) -------------------
    durations['keyword'] = run_step(
        scripts_dir / 'keyword_classifier.py',
        [
            '--input', str(senti_out),
            '--output', str(output_csv),
            '--text-col', 'divided_comment',
        ],
    )

    total_elapsed = time.perf_counter() - total_start

    print("\n=== TIME REPORT ===")
    print(f"1) ν…μŠ€νŠΈ 클리닝 : {durations['clean']:.2f}s")
    print(f"2) λ¬Έμž₯ 뢄리 : {durations['split']:.2f}s")
    print(f"3) 감성 λΆ„λ₯˜ : {durations['sentiment']:.2f}s")
    print(f"4) ν‚€μ›Œλ“œ λΆ„λ₯˜ : {durations['keyword']:.2f}s")
    print("-----------------------------")
    print(f"총 μ†Œμš” μ‹œκ°„ : {total_elapsed:.2f}s")
    print(f"\n ν…μŠ€νŠΈ 클리닝 β†’ λ¬Έμž₯ 뢄리 β†’ 감성 λΆ„λ₯˜ β†’ ν‚€μ›Œλ“œ λΆ„λ₯˜ 끝... Final output : {output_csv}")
if __name__ == '__main__':
    # Command-line entry point: the only option is the path of the YAML
    # config file, which falls back to the default location below.
    default_config = 'C:/Users/parkm/NLP/config/default.yaml'

    cli = argparse.ArgumentParser(
        description='ν•œ λ²ˆμ— 전체 νŒŒμ΄ν”„λΌμΈ μ‹€ν–‰ν•˜λŠ” 슀크립트'
    )
    cli.add_argument(
        '--config',
        '-c',
        default=default_config,
        help='config YAML 파일 경둜 (기본: C:/Users/parkm/NLP/config/default.yaml)',
    )

    # Parse the command line and hand the chosen config file to the pipeline.
    chosen_config = cli.parse_args().config
    main(Path(chosen_config))