# kcelectra-base-DC / source / text_cleaner.py
# Uploaded by alsxxxz ("Upload 4 files", commit af6ff2d, verified)
"""
text_cleaner.py
๋ชจ๋“ˆํ™”๋œ ํ…์ŠคํŠธ ์ •์ œ๊ธฐ
ํ•จ์ˆ˜:
- clean_text: ๋‹จ์ผ ๋ฌธ์ž์—ด ์ •์ œ -> str ๋˜๋Š” None ๋ฐ˜ํ™˜
- clean_dataframe: DataFrame์— clean_text ์ ์šฉ, ์ •์ œ๋œ DataFrame ๋ฐ˜ํ™˜ ๋ฐ ์‹คํ–‰ ์‹œ๊ฐ„/์‚ญ์ œ ํ–‰ ์ˆ˜ ์ถœ๋ ฅ
"""
import re
import emoji
import time
import pandas as pd
from typing import Optional
from soynlp.normalizer import repeat_normalize
# Allowed-character patterns.
def _compile_emoji_pattern():
    """Return a compiled regex that matches every emoji the ``emoji`` package knows.

    ``emoji.get_emoji_regexp()`` was removed in emoji 2.0, so the pattern is
    built from ``emoji.EMOJI_DATA`` when that table exists; the legacy helper
    is only used on older releases that do not ship the table.
    """
    emoji_table = getattr(emoji, "EMOJI_DATA", None)
    if emoji_table is None:
        return emoji.get_emoji_regexp()
    # Longest sequences first so multi-code-point emoji (ZWJ sequences, flags)
    # match as a whole instead of as their individual components.
    sequences = sorted(emoji_table, key=len, reverse=True)
    return re.compile("|".join(re.escape(seq) for seq in sequences))


# Matches any single emoji (possibly a multi-code-point sequence).
_EMOJI_PATTERN = _compile_emoji_pattern()

# Every non-ASCII code point that appears in the emoji pattern. A compiled
# pattern cannot be interpolated into a character class (only its repr would
# be inserted), so the class is fed the raw characters instead. ASCII is
# skipped: it is already allowed via \x00-\x7F, and skipping it also drops
# the regex metacharacters and escapes contained in the pattern source.
_EMOJI_CHARS = "".join(
    sorted({ch for ch in _EMOJI_PATTERN.pattern if ord(ch) > 0x7F})
)

# Matches runs of characters that are NOT allowed: anything other than basic
# punctuation, ASCII, Hangul jamo (ㄱ-ㅣ), Hangul syllables (가-힣) and emoji.
_CLEAN_PATTERN = re.compile(
    rf"[^ .,?!/@\$%~％·∼()\x00-\x7Fㄱ-ㅣ가-힣{_EMOJI_CHARS}]+"
)

# Matches an http:// or https:// URL up to the next whitespace.
_URL_PATTERN = re.compile(r"https?://\S+")
def clean_text(
    text: str,
    min_length: int = 3,
    num_repeats: int = 2
) -> Optional[str]:
    """Clean one comment string, or return None when it should be discarded.

    A comment is discarded (None is returned) when it:
    - is not a string (e.g. NaN coming from an empty CSV cell),
    - contains the keyword '레시피' or '만들기',
    - contains an http(s) URL,
    - contains no Hangul syllable, or
    - is shorter than ``min_length`` characters after cleaning.

    Otherwise the text is cleaned in this order: disallowed characters are
    replaced by a space, emoji are removed, surrounding whitespace is
    stripped, and repeated characters are normalised with soynlp's
    ``repeat_normalize``.

    Args:
        text: Raw comment text.
        min_length: Minimum length of the cleaned text; shorter results are
            discarded.
        num_repeats: Passed through to ``repeat_normalize`` as ``num_repeats``.

    Returns:
        The cleaned string, or None if the comment was filtered out.
    """
    # pandas represents missing cells as NaN (a float); without this guard
    # the substring checks below raise TypeError on such values.
    if not isinstance(text, str):
        return None

    # Rejection rules.
    if any(keyword in text for keyword in ("레시피", "만들기")):
        return None
    if _URL_PATTERN.search(text):
        return None
    if not re.search(r"[가-힣]", text):
        return None

    # Cleaning steps.
    cleaned = _CLEAN_PATTERN.sub(" ", text)
    cleaned = _EMOJI_PATTERN.sub("", cleaned)
    cleaned = cleaned.strip()
    cleaned = repeat_normalize(cleaned, num_repeats=num_repeats)

    if len(cleaned) < min_length:
        return None
    return cleaned
def clean_dataframe(
    df: pd.DataFrame,
    text_col: str = "comment",
    min_length: int = 3,
    num_repeats: int = 2
) -> pd.DataFrame:
    """Apply ``clean_text`` to one column and drop the rows it rejects.

    The cleaned text is stored in a 'cleaned' column of the returned frame.
    Rows for which ``clean_text`` returns None are removed and the index is
    reset. The total row count, the number of dropped rows and the elapsed
    time are printed.

    The input DataFrame is left untouched; the result is a new frame.

    Args:
        df: Input data containing the column named by ``text_col``.
        text_col: Name of the column holding the raw text.
        min_length: Forwarded to ``clean_text``.
        num_repeats: Forwarded to ``clean_text``.

    Returns:
        A new DataFrame with the extra 'cleaned' column and only the rows
        that passed cleaning.

    Raises:
        KeyError: If ``text_col`` is not a column of ``df``.
    """
    start = time.perf_counter()
    total = len(df)

    cleaned = df[text_col].apply(
        lambda value: clean_text(value, min_length, num_repeats)
    )
    # assign() builds a new frame, so the caller's DataFrame does not gain a
    # 'cleaned' column as a side effect (and slices do not trigger
    # SettingWithCopyWarning).
    df_clean = (
        df.assign(cleaned=cleaned)
        .dropna(subset=["cleaned"])
        .reset_index(drop=True)
    )

    dropped = total - len(df_clean)
    elapsed = time.perf_counter() - start
    print(f"[CLEAN] 처리 시간: {elapsed:.2f}s | 총 행: {total} | 삭제된 행: {dropped}")
    return df_clean
#CLI ์ง€์›
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
description="ํ…์ŠคํŠธ ์ •์ œ ๋ชจ๋“ˆ"
)
parser.add_argument(
"--input", "-i",
required=True,
help="์ž…๋ ฅ CSV ํŒŒ์ผ ๊ฒฝ๋กœ (comment ์ปฌ๋Ÿผ ํฌํ•จ)"
)
parser.add_argument(
"--output", "-o",
required=True,
help="์ถœ๋ ฅ CSV ํŒŒ์ผ ๊ฒฝ๋กœ"
)
parser.add_argument(
"--min_length", type=int, default=3,
help="์ตœ์†Œ ๊ธธ์ด (default=3)"
)
parser.add_argument(
"--num_repeats", type=int, default=2,
help="๋ฐ˜๋ณต๋ฌธ์ž ์ •๊ทœํ™” ํ—ˆ์šฉ ํšŸ์ˆ˜ (default=2)"
)
args = parser.parse_args()
df = pd.read_csv(args.input, encoding="utf-8-sig")
df_clean = clean_dataframe(
df,
text_col="comment",
min_length=args.min_length,
num_repeats=args.num_repeats
)
df_clean.to_csv(args.output, index=False, encoding="utf-8-sig")
print(f"์ •์ œ๋œ ๋ฐ์ดํ„ฐ ์ €์žฅ: {args.output}")