"""
text_cleaner.py

Modular text cleaner.

Functions:
- clean_text: clean a single string -> returns str or None
- clean_dataframe: apply clean_text to a DataFrame, return the cleaned
  DataFrame, and print the elapsed time / number of dropped rows
"""
|
| |
|
| | import re
|
| | import emoji
|
| | import time
|
| | import pandas as pd
|
| | from typing import Optional
|
| | from soynlp.normalizer import repeat_normalize
|
| |
|
| |
|
def _compile_emoji_pattern() -> "re.Pattern[str]":
    """Return a compiled regex that matches any single emoji sequence.

    ``emoji.get_emoji_regexp()`` is only available in ``emoji`` < 2.0. On
    newer releases the same kind of alternation is rebuilt from the keys of
    ``emoji.EMOJI_DATA``.
    """
    legacy_factory = getattr(emoji, "get_emoji_regexp", None)
    if legacy_factory is not None:
        return legacy_factory()
    # Longest sequences first, so multi-code-point emoji (ZWJ sequences,
    # flags, skin tones) are matched before their shorter prefixes.
    sequences = sorted(emoji.EMOJI_DATA, key=len, reverse=True)
    return re.compile("|".join(re.escape(seq) for seq in sequences))


# Matches one emoji sequence; used to strip emoji from text.
_EMOJI_PATTERN = _compile_emoji_pattern()

# Every non-ASCII code point that appears in the emoji alternation. These are
# spliced into the character class below as individual characters. (Formatting
# the compiled pattern object itself into the class would insert its repr,
# not the emoji.) ASCII is already covered by \x00-\x7F, and non-ASCII
# characters need no escaping inside a character class.
_EMOJI_CHARS = "".join(
    sorted({ch for ch in _EMOJI_PATTERN.pattern if ord(ch) > 0x7F})
)

# Matches runs of characters that are NOT allowed in cleaned text.
# Allowed: all ASCII, plus
#   \uFF05        fullwidth percent sign
#   \u00B7        middle dot
#   \u223C        tilde operator
#   \u3131-\u3163 Hangul compatibility jamo
#   \uAC00-\uD7A3 precomposed Hangul syllables
# and every emoji code point (emoji are removed separately, without leaving
# a space behind). Escapes keep this literal immune to source-encoding damage.
_CLEAN_PATTERN = re.compile(
    r"[^ .,?!/@\$%~\uFF05\u00B7\u223C()\x00-\x7F\u3131-\u3163\uAC00-\uD7A3"
    + _EMOJI_CHARS
    + r"]+"
)

# Matches an http/https URL up to the next whitespace character.
_URL_PATTERN = re.compile(r"https?://\S+")
|
| |
|
| |
|
# Comments containing any of these keywords are rejected outright.
_BLOCKED_KEYWORDS = ("레시피", "만들기")  # "recipe", "making"

# At least one precomposed Hangul syllable (U+AC00-U+D7A3) must be present.
_HANGUL_PATTERN = re.compile(r"[\uAC00-\uD7A3]")


def clean_text(
    text: str,
    min_length: int = 3,
    num_repeats: int = 2
) -> Optional[str]:
    """Clean a single comment string, or reject it by returning None.

    The text is rejected (None) when it:
    - is not a ``str`` (e.g. the NaN pandas uses for missing cells),
    - contains one of the blocked keywords,
    - contains an http/https URL,
    - contains no Hangul syllable.

    Otherwise the text is cleaned in this order:
    1. every run of disallowed characters is replaced with one space,
    2. emoji are removed,
    3. surrounding whitespace is stripped,
    4. ``repeat_normalize`` is applied with ``num_repeats``.

    Args:
        text: Raw comment text.
        min_length: Minimum length of the cleaned text; results strictly
            shorter than this are rejected.
        num_repeats: Forwarded to ``soynlp``'s ``repeat_normalize``
            (presumably the maximum allowed run of a repeated character).

    Returns:
        The cleaned string, or None if the text was rejected.
    """
    # Missing values arrive as float NaN / None when applied over a DataFrame
    # column; the substring checks below would raise TypeError on them.
    if not isinstance(text, str):
        return None
    if any(keyword in text for keyword in _BLOCKED_KEYWORDS):
        return None
    if _URL_PATTERN.search(text):
        return None
    if not _HANGUL_PATTERN.search(text):
        return None

    cleaned = _CLEAN_PATTERN.sub(" ", text)
    cleaned = _EMOJI_PATTERN.sub("", cleaned)
    cleaned = cleaned.strip()
    cleaned = repeat_normalize(cleaned, num_repeats=num_repeats)
    if len(cleaned) < min_length:
        return None
    return cleaned
|
| |
|
| |
|
def clean_dataframe(
    df: pd.DataFrame,
    text_col: str = "comment",
    min_length: int = 3,
    num_repeats: int = 2
) -> pd.DataFrame:
    """Apply ``clean_text`` to one column and drop the rejected rows.

    The cleaned text is stored in a ``cleaned`` column of the returned frame.
    Rows for which ``clean_text`` returned None are removed and the index is
    reset. The input DataFrame itself is not modified. A one-line summary
    (elapsed time, total rows, dropped rows) is printed to stdout.

    Args:
        df: Input data containing the column named by ``text_col``.
        text_col: Name of the column holding the raw text.
        min_length: Forwarded to ``clean_text``.
        num_repeats: Forwarded to ``clean_text``.

    Returns:
        A new DataFrame with the ``cleaned`` column added and rejected rows
        removed.

    Raises:
        KeyError: If ``text_col`` is not a column of ``df``.
    """
    start = time.perf_counter()
    total = len(df)
    cleaned = df[text_col].apply(
        lambda value: clean_text(value, min_length, num_repeats)
    )
    # assign() builds a new frame, so the caller's DataFrame is left untouched
    # (and no SettingWithCopyWarning is raised when df is a slice).
    df_clean = (
        df.assign(cleaned=cleaned)
        .dropna(subset=["cleaned"])
        .reset_index(drop=True)
    )
    dropped = total - len(df_clean)
    elapsed = time.perf_counter() - start
    print(f"[CLEAN] 처리 시간: {elapsed:.2f}s | 총 행: {total} | 삭제된 행: {dropped}")
    return df_clean
|
| |
|
| |
|
def _main() -> None:
    """Command-line entry point: clean the ``comment`` column of a CSV file.

    Reads the input CSV (UTF-8 with optional BOM), runs ``clean_dataframe``
    on its ``comment`` column, and writes the result to the output CSV.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="텍스트 정제 모듈"
    )
    parser.add_argument(
        "--input", "-i",
        required=True,
        help="입력 CSV 파일 경로 (comment 컬럼 포함)"
    )
    parser.add_argument(
        "--output", "-o",
        required=True,
        help="출력 CSV 파일 경로"
    )
    parser.add_argument(
        "--min_length", type=int, default=3,
        help="최소 길이 (default=3)"
    )
    parser.add_argument(
        "--num_repeats", type=int, default=2,
        help="반복문자 정규화 허용 횟수 (default=2)"
    )
    args = parser.parse_args()

    df = pd.read_csv(args.input, encoding="utf-8-sig")
    df_clean = clean_dataframe(
        df,
        text_col="comment",
        min_length=args.min_length,
        num_repeats=args.num_repeats
    )
    df_clean.to_csv(args.output, index=False, encoding="utf-8-sig")
    print(f"정제된 데이터 저장: {args.output}")


if __name__ == "__main__":
    _main()
|
| |
|