672gjsds / text_utils.py
ssboost's picture
Upload 11 files
99f1333 verified
"""
ํ…์ŠคํŠธ ์ฒ˜๋ฆฌ ๊ด€๋ จ ์œ ํ‹ธ๋ฆฌํ‹ฐ ํ•จ์ˆ˜ ๋ชจ์Œ
- ํ…์ŠคํŠธ ๋ถ„๋ฆฌ ๋ฐ ์ •์ œ
- ํ‚ค์›Œ๋“œ ์ถ”์ถœ
- Gemini API ํ‚ค ํ†ตํ•ฉ ๊ด€๋ฆฌ ์ ์šฉ
"""
import re
import google.generativeai as genai
import os
import logging
import api_utils # API ํ‚ค ํ†ตํ•ฉ ๊ด€๋ฆฌ๋ฅผ ์œ„ํ•œ ์ž„ํฌํŠธ
# Logging setup
# Module-level logger for this file, emitting INFO and above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Attach a stream handler whose records carry timestamp, logger name and level.
handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
# ===== Gemini model management functions =====
def get_gemini_model():
    """Fetch the Gemini model through ``api_utils``.

    Delegates to ``api_utils.get_gemini_model()`` and logs the outcome.

    Returns:
        The object returned by ``api_utils.get_gemini_model()``, or ``None``
        when that call returns a falsy value or raises any ``Exception``.
    """
    try:
        model = api_utils.get_gemini_model()
    except Exception as e:
        # Best-effort boundary: a failed lookup degrades to "no model".
        logger.error(f"Gemini 모델 로드 실패: {e}")
        return None

    if not model:
        logger.warning("사용 가능한 Gemini API 키가 없습니다.")
        return None

    logger.info("Gemini 모델 로드 성공 (api_utils 통합 관리)")
    return model
# Text splitting and cleaning
def clean_and_split(text, only_korean=False):
    """Split raw text into cleaned word tokens.

    Parentheses, square brackets, hyphens and slashes are replaced by spaces,
    then the text is split on commas and whitespace.

    Args:
        text: Input string. ``None`` or an empty string yields ``[]``.
        only_korean: When true, every character outside the Hangul syllable
            range (U+AC00-U+D7A3) is removed from each token, and tokens that
            end up empty are dropped.

    Returns:
        List of non-empty tokens in their original order.
    """
    if not text:
        return []

    # Joining punctuation acts as a word separator.
    text = re.sub(r"[()\[\]/-]", " ", text)
    tokens = re.split(r"[,\s]+", text)

    if only_korean:
        # Escaped code points keep the range valid regardless of how the
        # source file is encoded or re-encoded.
        tokens = [re.sub(r"[^\uAC00-\uD7A3]", "", token) for token in tokens]

    return [token for token in tokens if token]
def filter_keywords_with_gemini(pairs, gemini_model=None):
"""Gemini AI๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ํ‚ค์›Œ๋“œ ์กฐํ•ฉ ํ•„ํ„ฐ๋ง (๊ฐœ์„ ๋ฒ„์ „) - API ํ‚ค ํ†ตํ•ฉ ๊ด€๋ฆฌ"""
if gemini_model is None:
# api_utils์—์„œ Gemini ๋ชจ๋ธ ๊ฐ€์ ธ์˜ค๊ธฐ
gemini_model = get_gemini_model()
if gemini_model is None:
logger.error("Gemini ๋ชจ๋ธ์„ ๊ฐ€์ ธ์˜ฌ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ๋ชจ๋“  ํ‚ค์›Œ๋“œ๋ฅผ ์œ ์ง€ํ•ฉ๋‹ˆ๋‹ค.")
# ์•ˆ์ „ํ•˜๊ฒŒ ์ฒ˜๋ฆฌ: ๋ชจ๋“  ํ‚ค์›Œ๋“œ๋ฅผ ์œ ์ง€
all_keywords = set()
for pair in pairs:
for keyword in pair:
all_keywords.add(keyword)
return list(all_keywords)
# ๋ชจ๋“  ํ‚ค์›Œ๋“œ๋ฅผ ๋ชฉ๋ก์œผ๋กœ ์ถ”์ถœ (์ œ๊ฑฐ๋œ ํ‚ค์›Œ๋“œ ํ™•์ธ์šฉ)
all_keywords = set()
for pair in pairs:
for keyword in pair:
all_keywords.add(keyword)
# ๋„ˆ๋ฌด ๋งŽ์€ ์Œ์ด ์žˆ์œผ๋ฉด ์ œํ•œ
max_pairs = 50 # ์ตœ๋Œ€ 50๊ฐœ ์Œ๋งŒ ์ฒ˜๋ฆฌ
pairs_to_process = list(pairs)[:max_pairs] if len(pairs) > max_pairs else pairs
logger.info(f"ํ•„ํ„ฐ๋งํ•  ํ‚ค์›Œ๋“œ ์Œ: ์ด {len(pairs)}๊ฐœ ์ค‘ {len(pairs_to_process)}๊ฐœ ์ฒ˜๋ฆฌ")
# ๋ณด์ˆ˜์ ์ธ ํ”„๋กฌํ”„ํŠธ ์‚ฌ์šฉ - ํ‚ค์›Œ๋“œ ์ œ๊ฑฐ ์ตœ์†Œํ™”
prompt = (
"๋‹ค์Œ์€ ์†Œ๋น„์ž๊ฐ€ ๊ฒ€์ƒ‰ํ•  ๊ฐ€๋Šฅ์„ฑ์ด ์žˆ๋Š” ํ‚ค์›Œ๋“œ ์Œ ๋ชฉ๋ก์ž…๋‹ˆ๋‹ค.\n"
"๊ฐ ์Œ์€ ๊ฐ™์€ ๋‹จ์–ด ์กฐํ•ฉ์ด์ง€๋งŒ ์ˆœ์„œ๋งŒ ๋‹ค๋ฅธ ๊ฒฝ์šฐ์ž…๋‹ˆ๋‹ค (์˜ˆ: ์†์งˆ์˜ค์ง•์–ด vs ์˜ค์ง•์–ด์†์งˆ).\n\n"
"์•„๋ž˜์˜ ๊ธฐ์ค€์— ๋”ฐ๋ผ ๊ฐ ์Œ์—์„œ ๋” ์ž์—ฐ์Šค๋Ÿฌ์šด ํ‚ค์›Œ๋“œ๋ฅผ ์„ ํƒํ•ด์ฃผ์„ธ์š”:\n"
"1. ์†Œ๋น„์ž๊ฐ€ ์ผ์ƒ์ ์œผ๋กœ ์‚ฌ์šฉํ•˜๋Š” ์ž์—ฐ์Šค๋Ÿฌ์šด ํ‘œํ˜„์„ ์šฐ์„  ์„ ํƒํ•˜์„ธ์š”.\n"
"2. ๋‘ ํ‚ค์›Œ๋“œ๊ฐ€ ๋ชจ๋‘ ์ž์—ฐ์Šค๋Ÿฝ๊ฑฐ๋‚˜ ์˜๋ฏธ๊ฐ€ ์•ฝ๊ฐ„ ๋‹ค๋ฅด๋‹ค๋ฉด, ๋ฐ˜๋“œ์‹œ ๋‘˜ ๋‹ค ์œ ์ง€ํ•˜์„ธ์š”.\n"
"3. ํ™•์‹คํžˆ ๋น„์ž์—ฐ์Šค๋Ÿฝ๊ฑฐ๋‚˜ ์–ด์ƒ‰ํ•œ ๊ฒฝ์šฐ์—๋งŒ ์ œ๊ฑฐํ•˜์„ธ์š”.\n"
"4. ๋ถˆํ™•์‹คํ•œ ๊ฒฝ์šฐ์—๋Š” ๋ฐ˜๋“œ์‹œ ํ‚ค์›Œ๋“œ๋ฅผ ์œ ์ง€ํ•˜์„ธ์š”.\n"
"5. ์ˆซ์ž๋‚˜ ์˜์–ด๊ฐ€ ํฌํ•จ๋œ ํ‚ค์›Œ๋“œ๋Š” ํ•œ๊ธ€ ๋ฉ”์ธ ํ‚ค์›Œ๋“œ๊ฐ€ ์•ž์ชฝ์— ์˜ค๋Š” ํ˜•ํƒœ๋ฅผ ์„ ํƒํ•˜์„ธ์š”. (์˜ˆ: '10kg ์˜ค์ง•์–ด' ๋ณด๋‹ค '์˜ค์ง•์–ด 10kg' ์„ ํƒ)\n"
"6. ๊ฒ€์ƒ‰๋Ÿ‰์ด 0์ธ ํ‚ค์›Œ๋“œ๋ผ๋„ ์ผ์ƒ์ ์ธ ํ‘œํ˜„์ด๋ผ๋ฉด ๊ฐ€๋Šฅํ•œ ์œ ์ง€ํ•˜์„ธ์š”. ๋ช…๋ฐฑํ•˜๊ฒŒ ๋น„์ •์ƒ์ ์ธ ํ‘œํ˜„๋งŒ ์ œ๊ฑฐํ•˜์„ธ์š”.\n\n"
"์ฃผ์˜: ๊ธฐ๋ณธ์ ์œผ๋กœ ๋Œ€๋ถ€๋ถ„์˜ ํ‚ค์›Œ๋“œ๋ฅผ ์œ ์ง€ํ•˜๊ณ , ๋งค์šฐ ๋ช…ํ™•ํ•˜๊ฒŒ ๋น„์ž์—ฐ์Šค๋Ÿฌ์šด ๊ฒƒ๋งŒ ์ œ๊ฑฐํ•˜์„ธ์š”.\n\n"
"๊ฒฐ๊ณผ๋Š” ๋‹ค์Œ ํ˜•์‹์œผ๋กœ ์ œ๊ณตํ•ด์ฃผ์„ธ์š”:\n"
"- ์„ ํƒ๋œ ํ‚ค์›Œ๋“œ (์ด์œ : ์ž์—ฐ์Šค๋Ÿฌ์šด ํ‘œํ˜„์ด๊ธฐ ๋•Œ๋ฌธ)\n"
"- ์„ ํƒ๋œ ํ‚ค์›Œ๋“œ1, ์„ ํƒ๋œ ํ‚ค์›Œ๋“œ2 (์ด์œ : ๋‘˜ ๋‹ค ์ž์—ฐ์Šค๋Ÿฝ๊ณ  ์˜๋ฏธ๊ฐ€ ์กฐ๊ธˆ ๋‹ค๋ฆ„)\n\n"
)
# ํ‚ค์›Œ๋“œ ์Œ ๋ชฉ๋ก
formatted = "\n".join([f"- {a}, {b}" for a, b in pairs_to_process])
full_prompt = prompt + formatted
try:
# ํƒ€์ž„์•„์›ƒ ์ถ”๊ฐ€
logger.info(f"Gemini API ํ˜ธ์ถœ ์‹œ์ž‘ - {len(pairs_to_process)}๊ฐœ ํ‚ค์›Œ๋“œ ์Œ ์ฒ˜๋ฆฌ ์ค‘...")
# ์‘๋‹ต ๋ฐ›๊ธฐ (ํƒ€์ž„์•„์›ƒ ๊ธฐ๋Šฅ์ด ์žˆ์œผ๋ฉด ์ถ”๊ฐ€)
response = gemini_model.generate_content(full_prompt)
logger.info("Gemini API ์‘๋‹ต ์„ฑ๊ณต")
lines = response.text.strip().split("\n")
# ์„ ํƒ๋œ ํ‚ค์›Œ๋“œ ์ถ”์ถœ (์‰ผํ‘œ๋กœ ๊ตฌ๋ถ„๋œ ๊ฒฝ์šฐ ๋ชจ๋‘ ํฌํ•จ)
final_keywords = []
for line in lines:
if line.startswith("-"):
# ์ด์œ  ๋ถ€๋ถ„ ์ œ๊ฑฐ
keywords_part = line.strip("- ").split("(์ด์œ :")[0].strip()
# ์‰ผํ‘œ๋กœ ๊ตฌ๋ถ„๋œ ํ‚ค์›Œ๋“œ ๋ชจ๋‘ ์ถ”๊ฐ€
for kw in keywords_part.split(","):
kw = kw.strip()
if kw:
final_keywords.append(kw)
# ์ฒ˜๋ฆฌ๋˜์ง€ ์•Š์€ ์Œ์˜ ์ฒซ ๋ฒˆ์งธ ํ‚ค์›Œ๋“œ๋„ ์ถ”๊ฐ€ (LLM์ด ์ฒ˜๋ฆฌํ•˜์ง€ ์•Š์€ ํ‚ค์›Œ๋“œ)
if len(pairs) > max_pairs:
logger.info(f"์ถ”๊ฐ€ ํ‚ค์›Œ๋“œ ์ฒ˜๋ฆฌ: ๋‚จ์€ {len(pairs) - max_pairs}๊ฐœ ์Œ์˜ ์ฒซ ๋ฒˆ์งธ ํ‚ค์›Œ๋“œ ์ถ”๊ฐ€")
for pair in list(pairs)[max_pairs:]:
# ๊ฐ ์Œ์˜ ์ฒซ ๋ฒˆ์งธ ํ‚ค์›Œ๋“œ๋งŒ ์‚ฌ์šฉ
final_keywords.append(pair[0])
# ์„ ํƒ๋œ ํ‚ค์›Œ๋“œ๊ฐ€ ์—†์œผ๋ฉด ๊ธฐ์กด ํ‚ค์›Œ๋“œ ๋ชจ๋‘ ๋ฐ˜ํ™˜
if not final_keywords:
logger.warning("๊ฒฝ๊ณ : ์„ ํƒ๋œ ํ‚ค์›Œ๋“œ๊ฐ€ ์—†์–ด ๋ชจ๋“  ํ‚ค์›Œ๋“œ๋ฅผ ์œ ์ง€ํ•ฉ๋‹ˆ๋‹ค.")
final_keywords = list(all_keywords)
# ์ˆœ์„œ ๊ฐ•์ œ ์ˆ˜์ •
corrected_keywords = []
# ๋‹จ์œ„์™€ ์ˆซ์ž ๊ด€๋ จ ์ •๊ทœ์‹ ํŒจํ„ด
unit_pattern = re.compile(r'(?i)(kg|g|mm|cm|ml|l|๋ฆฌํ„ฐ|๊ฐœ|ํŒฉ|๋ฐ•์Šค|์„ธํŠธ|2l|l2)')
number_pattern = re.compile(r'\d+')
for kw in final_keywords:
# ๊ณต๋ฐฑ์œผ๋กœ ๋ถ„๋ฆฌ
if ' ' in kw:
parts = kw.split()
first_part = parts[0]
# ์ฒซ ๋ถ€๋ถ„์ด ๋‹จ์œ„๋‚˜ ์ˆซ์ž๋ฅผ ํฌํ•จํ•˜๋Š”์ง€ ํ™•์ธ
if (unit_pattern.search(first_part) or number_pattern.search(first_part)) and len(parts) > 1:
# ์ˆœ์„œ ๋ฐ”๊พธ๊ธฐ: ๋‹จ์œ„/์ˆซ์ž ๋ถ€๋ถ„์„ ๋’ค๋กœ ์ด๋™
corrected_kw = " ".join(parts[1:] + [first_part])
logger.info(f"ํ‚ค์›Œ๋“œ ์ˆœ์„œ ๊ฐ•์ œ ์ˆ˜์ •: '{kw}' -> '{corrected_kw}'")
corrected_keywords.append(corrected_kw)
else:
corrected_keywords.append(kw)
else:
corrected_keywords.append(kw)
# ํŠน๋ณ„ ์ฒ˜๋ฆฌ: "L ์˜ค์ง•์–ด", "2L ์˜ค์ง•์–ด" ๊ฐ™์€ ๊ฒฝ์šฐ๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ํ™•์ธํ•˜๊ณ  ์ˆ˜์ •
specific_fixes = []
for kw in corrected_keywords:
# ํŠน์ • ํŒจํ„ด ์ฒดํฌ
l_pattern = re.compile(r'^([0-9]*L) (.+)$', re.IGNORECASE)
match = l_pattern.match(kw)
if match:
# L ๋‹จ์œ„๋ฅผ ๋’ค๋กœ ์ด๋™
l_part = match.group(1)
main_part = match.group(2)
fixed_kw = f"{main_part} {l_part}"
logger.info(f"ํŠน์ˆ˜ ํŒจํ„ด ์ˆ˜์ •: '{kw}' -> '{fixed_kw}'")
specific_fixes.append(fixed_kw)
else:
specific_fixes.append(kw)
# ์ œ๊ฑฐ๋œ ํ‚ค์›Œ๋“œ ๋ชฉ๋ก ํ™•์ธ
selected_set = set(specific_fixes)
removed_keywords = all_keywords - selected_set
# ์ œ๊ฑฐ๋œ ํ‚ค์›Œ๋“œ ์ถœ๋ ฅ
logger.info("\n=== LLM์— ์˜ํ•ด ์ œ๊ฑฐ๋œ ํ‚ค์›Œ๋“œ ๋ชฉ๋ก ===")
for kw in removed_keywords:
logger.info(f" - {kw}")
logger.info(f"์ด {len(all_keywords)}๊ฐœ ์ค‘ {len(removed_keywords)}๊ฐœ ์ œ๊ฑฐ๋จ ({len(selected_set)}๊ฐœ ์œ ์ง€)\n")
return specific_fixes
except Exception as e:
logger.error(f"Gemini ์˜ค๋ฅ˜: {e}")
logger.error("์˜ค๋ฅ˜ ๋ฐœ์ƒ์œผ๋กœ ์ธํ•ด ๋ชจ๋“  ํ‚ค์›Œ๋“œ๋ฅผ ์œ ์ง€ํ•ฉ๋‹ˆ๋‹ค.")
logger.error(f"์˜ค๋ฅ˜ ์œ ํ˜•: {type(e).__name__}")
import traceback
traceback.print_exc()
# ์•ˆ์ „ํ•˜๊ฒŒ ์ฒ˜๋ฆฌ: ๋ชจ๋“  ํ‚ค์›Œ๋“œ๋ฅผ ์œ ์ง€
logger.info(f"์•ˆ์ „ ๋ชจ๋“œ: {len(all_keywords)}๊ฐœ ํ‚ค์›Œ๋“œ ๋ชจ๋‘ ์œ ์ง€")
return list(all_keywords)
def get_search_volume_range(total_volume):
    """Map a total search volume to its labelled bucket.

    Buckets are half-open so that each label is literally true: an "N미만"
    ("under N") label covers values strictly below N, and "10000이상"
    ("10000 or more") covers 10000 and above.

    Args:
        total_volume: Numeric search volume.

    Returns:
        One of "100미만", "1000미만", "2000미만", "5000미만", "10000미만"
        or "10000이상".
    """
    buckets = (
        (100, "100미만"),
        (1000, "1000미만"),
        (2000, "2000미만"),
        (5000, "5000미만"),
        (10000, "10000미만"),
    )
    for upper_bound, label in buckets:
        if total_volume < upper_bound:
            return label
    return "10000이상"