|
|
""" |
|
|
ํ
์คํธ ์ฒ๋ฆฌ ๊ด๋ จ ์ ํธ๋ฆฌํฐ ํจ์ ๋ชจ์ |
|
|
- ํ
์คํธ ๋ถ๋ฆฌ ๋ฐ ์ ์ |
|
|
- ํค์๋ ์ถ์ถ |
|
|
- Gemini API ํค ํตํฉ ๊ด๋ฆฌ ์ ์ฉ |
|
|
""" |
|
|
|
|
|
import re |
|
|
import google.generativeai as genai |
|
|
import os |
|
|
import logging |
|
|
import api_utils |
|
|
|
|
|
|
|
|
# Module-level logger: INFO and above, written to a stream handler with a
# timestamped "<time> - <logger name> - <level> - <message>" layout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)

# logging.getLogger() hands back the same logger object every time this module
# is executed (reload, re-run inside a long-lived process). Attaching a new
# handler on each execution would emit every record several times, so the
# handler is only added when the logger has none yet.
if not logger.handlers:
    logger.addHandler(handler)
|
|
|
|
|
|
|
|
def get_gemini_model():
    """Return the Gemini model supplied by ``api_utils``.

    This is a thin wrapper around ``api_utils.get_gemini_model()`` that logs
    the outcome and never raises.

    Returns:
        The object returned by ``api_utils.get_gemini_model()``, or ``None``
        when that call returns a falsy value or raises an exception.
    """
    try:
        model = api_utils.get_gemini_model()
    except Exception as e:
        # Best-effort boundary: callers check for None instead of catching.
        logger.error(f"Gemini 모델 로드 실패: {e}")
        return None

    if not model:
        logger.warning("사용 가능한 Gemini API 키가 없습니다.")
        return None

    logger.info("Gemini 모델 로드 성공 (api_utils 통합 관리)")
    return model
|
|
|
|
|
|
|
|
def clean_and_split(text, only_korean=False):
    """Split raw text into a list of cleaned, non-empty tokens.

    Parentheses, square brackets, hyphens and slashes are replaced by spaces
    before splitting, so they act as separators.

    Args:
        text: The raw string to tokenize.
        only_korean: When true, the text is split on single spaces and commas
            and every character outside the Hangul syllable block
            (U+AC00-U+D7A3) is removed from each token. When false, the text
            is split on runs of commas/whitespace and tokens are kept as-is.

    Returns:
        A list of non-empty token strings, in their original order.
    """
    text = re.sub(r"[()\[\]-]", " ", text)
    text = text.replace("/", " ")

    if only_korean:
        # The Hangul range is spelled with \u escapes so the pattern cannot be
        # corrupted by a wrong file encoding. Removing every non-Hangul
        # character also removes surrounding whitespace, so no strip() needed.
        tokens = (
            re.sub(r"[^\uAC00-\uD7A3]", "", piece)
            for piece in re.split(r"[ ,]", text)
        )
    else:
        tokens = (piece.strip() for piece in re.split(r"[,\s]+", text))

    return [token for token in tokens if token]
|
|
|
|
|
def filter_keywords_with_gemini(pairs, gemini_model=None): |
|
|
"""Gemini AI๋ฅผ ์ฌ์ฉํ์ฌ ํค์๋ ์กฐํฉ ํํฐ๋ง (๊ฐ์ ๋ฒ์ ) - API ํค ํตํฉ ๊ด๋ฆฌ""" |
|
|
if gemini_model is None: |
|
|
|
|
|
gemini_model = get_gemini_model() |
|
|
|
|
|
if gemini_model is None: |
|
|
logger.error("Gemini ๋ชจ๋ธ์ ๊ฐ์ ธ์ฌ ์ ์์ต๋๋ค. ๋ชจ๋ ํค์๋๋ฅผ ์ ์งํฉ๋๋ค.") |
|
|
|
|
|
all_keywords = set() |
|
|
for pair in pairs: |
|
|
for keyword in pair: |
|
|
all_keywords.add(keyword) |
|
|
return list(all_keywords) |
|
|
|
|
|
|
|
|
all_keywords = set() |
|
|
for pair in pairs: |
|
|
for keyword in pair: |
|
|
all_keywords.add(keyword) |
|
|
|
|
|
|
|
|
max_pairs = 50 |
|
|
pairs_to_process = list(pairs)[:max_pairs] if len(pairs) > max_pairs else pairs |
|
|
|
|
|
logger.info(f"ํํฐ๋งํ ํค์๋ ์: ์ด {len(pairs)}๊ฐ ์ค {len(pairs_to_process)}๊ฐ ์ฒ๋ฆฌ") |
|
|
|
|
|
|
|
|
prompt = ( |
|
|
"๋ค์์ ์๋น์๊ฐ ๊ฒ์ํ ๊ฐ๋ฅ์ฑ์ด ์๋ ํค์๋ ์ ๋ชฉ๋ก์
๋๋ค.\n" |
|
|
"๊ฐ ์์ ๊ฐ์ ๋จ์ด ์กฐํฉ์ด์ง๋ง ์์๋ง ๋ค๋ฅธ ๊ฒฝ์ฐ์
๋๋ค (์: ์์ง์ค์ง์ด vs ์ค์ง์ด์์ง).\n\n" |
|
|
"์๋์ ๊ธฐ์ค์ ๋ฐ๋ผ ๊ฐ ์์์ ๋ ์์ฐ์ค๋ฌ์ด ํค์๋๋ฅผ ์ ํํด์ฃผ์ธ์:\n" |
|
|
"1. ์๋น์๊ฐ ์ผ์์ ์ผ๋ก ์ฌ์ฉํ๋ ์์ฐ์ค๋ฌ์ด ํํ์ ์ฐ์ ์ ํํ์ธ์.\n" |
|
|
"2. ๋ ํค์๋๊ฐ ๋ชจ๋ ์์ฐ์ค๋ฝ๊ฑฐ๋ ์๋ฏธ๊ฐ ์ฝ๊ฐ ๋ค๋ฅด๋ค๋ฉด, ๋ฐ๋์ ๋ ๋ค ์ ์งํ์ธ์.\n" |
|
|
"3. ํ์คํ ๋น์์ฐ์ค๋ฝ๊ฑฐ๋ ์ด์ํ ๊ฒฝ์ฐ์๋ง ์ ๊ฑฐํ์ธ์.\n" |
|
|
"4. ๋ถํ์คํ ๊ฒฝ์ฐ์๋ ๋ฐ๋์ ํค์๋๋ฅผ ์ ์งํ์ธ์.\n" |
|
|
"5. ์ซ์๋ ์์ด๊ฐ ํฌํจ๋ ํค์๋๋ ํ๊ธ ๋ฉ์ธ ํค์๋๊ฐ ์์ชฝ์ ์ค๋ ํํ๋ฅผ ์ ํํ์ธ์. (์: '10kg ์ค์ง์ด' ๋ณด๋ค '์ค์ง์ด 10kg' ์ ํ)\n" |
|
|
"6. ๊ฒ์๋์ด 0์ธ ํค์๋๋ผ๋ ์ผ์์ ์ธ ํํ์ด๋ผ๋ฉด ๊ฐ๋ฅํ ์ ์งํ์ธ์. ๋ช
๋ฐฑํ๊ฒ ๋น์ ์์ ์ธ ํํ๋ง ์ ๊ฑฐํ์ธ์.\n\n" |
|
|
"์ฃผ์: ๊ธฐ๋ณธ์ ์ผ๋ก ๋๋ถ๋ถ์ ํค์๋๋ฅผ ์ ์งํ๊ณ , ๋งค์ฐ ๋ช
ํํ๊ฒ ๋น์์ฐ์ค๋ฌ์ด ๊ฒ๋ง ์ ๊ฑฐํ์ธ์.\n\n" |
|
|
"๊ฒฐ๊ณผ๋ ๋ค์ ํ์์ผ๋ก ์ ๊ณตํด์ฃผ์ธ์:\n" |
|
|
"- ์ ํ๋ ํค์๋ (์ด์ : ์์ฐ์ค๋ฌ์ด ํํ์ด๊ธฐ ๋๋ฌธ)\n" |
|
|
"- ์ ํ๋ ํค์๋1, ์ ํ๋ ํค์๋2 (์ด์ : ๋ ๋ค ์์ฐ์ค๋ฝ๊ณ ์๋ฏธ๊ฐ ์กฐ๊ธ ๋ค๋ฆ)\n\n" |
|
|
) |
|
|
|
|
|
|
|
|
formatted = "\n".join([f"- {a}, {b}" for a, b in pairs_to_process]) |
|
|
full_prompt = prompt + formatted |
|
|
|
|
|
try: |
|
|
|
|
|
logger.info(f"Gemini API ํธ์ถ ์์ - {len(pairs_to_process)}๊ฐ ํค์๋ ์ ์ฒ๋ฆฌ ์ค...") |
|
|
|
|
|
|
|
|
response = gemini_model.generate_content(full_prompt) |
|
|
|
|
|
logger.info("Gemini API ์๋ต ์ฑ๊ณต") |
|
|
lines = response.text.strip().split("\n") |
|
|
|
|
|
|
|
|
final_keywords = [] |
|
|
for line in lines: |
|
|
if line.startswith("-"): |
|
|
|
|
|
keywords_part = line.strip("- ").split("(์ด์ :")[0].strip() |
|
|
|
|
|
for kw in keywords_part.split(","): |
|
|
kw = kw.strip() |
|
|
if kw: |
|
|
final_keywords.append(kw) |
|
|
|
|
|
|
|
|
if len(pairs) > max_pairs: |
|
|
logger.info(f"์ถ๊ฐ ํค์๋ ์ฒ๋ฆฌ: ๋จ์ {len(pairs) - max_pairs}๊ฐ ์์ ์ฒซ ๋ฒ์งธ ํค์๋ ์ถ๊ฐ") |
|
|
for pair in list(pairs)[max_pairs:]: |
|
|
|
|
|
final_keywords.append(pair[0]) |
|
|
|
|
|
|
|
|
if not final_keywords: |
|
|
logger.warning("๊ฒฝ๊ณ : ์ ํ๋ ํค์๋๊ฐ ์์ด ๋ชจ๋ ํค์๋๋ฅผ ์ ์งํฉ๋๋ค.") |
|
|
final_keywords = list(all_keywords) |
|
|
|
|
|
|
|
|
corrected_keywords = [] |
|
|
|
|
|
|
|
|
unit_pattern = re.compile(r'(?i)(kg|g|mm|cm|ml|l|๋ฆฌํฐ|๊ฐ|ํฉ|๋ฐ์ค|์ธํธ|2l|l2)') |
|
|
number_pattern = re.compile(r'\d+') |
|
|
|
|
|
for kw in final_keywords: |
|
|
|
|
|
if ' ' in kw: |
|
|
parts = kw.split() |
|
|
first_part = parts[0] |
|
|
|
|
|
|
|
|
if (unit_pattern.search(first_part) or number_pattern.search(first_part)) and len(parts) > 1: |
|
|
|
|
|
corrected_kw = " ".join(parts[1:] + [first_part]) |
|
|
logger.info(f"ํค์๋ ์์ ๊ฐ์ ์์ : '{kw}' -> '{corrected_kw}'") |
|
|
corrected_keywords.append(corrected_kw) |
|
|
else: |
|
|
corrected_keywords.append(kw) |
|
|
else: |
|
|
corrected_keywords.append(kw) |
|
|
|
|
|
|
|
|
specific_fixes = [] |
|
|
for kw in corrected_keywords: |
|
|
|
|
|
l_pattern = re.compile(r'^([0-9]*L) (.+)$', re.IGNORECASE) |
|
|
match = l_pattern.match(kw) |
|
|
|
|
|
if match: |
|
|
|
|
|
l_part = match.group(1) |
|
|
main_part = match.group(2) |
|
|
fixed_kw = f"{main_part} {l_part}" |
|
|
logger.info(f"ํน์ ํจํด ์์ : '{kw}' -> '{fixed_kw}'") |
|
|
specific_fixes.append(fixed_kw) |
|
|
else: |
|
|
specific_fixes.append(kw) |
|
|
|
|
|
|
|
|
selected_set = set(specific_fixes) |
|
|
removed_keywords = all_keywords - selected_set |
|
|
|
|
|
|
|
|
logger.info("\n=== LLM์ ์ํด ์ ๊ฑฐ๋ ํค์๋ ๋ชฉ๋ก ===") |
|
|
for kw in removed_keywords: |
|
|
logger.info(f" - {kw}") |
|
|
logger.info(f"์ด {len(all_keywords)}๊ฐ ์ค {len(removed_keywords)}๊ฐ ์ ๊ฑฐ๋จ ({len(selected_set)}๊ฐ ์ ์ง)\n") |
|
|
|
|
|
return specific_fixes |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"Gemini ์ค๋ฅ: {e}") |
|
|
logger.error("์ค๋ฅ ๋ฐ์์ผ๋ก ์ธํด ๋ชจ๋ ํค์๋๋ฅผ ์ ์งํฉ๋๋ค.") |
|
|
logger.error(f"์ค๋ฅ ์ ํ: {type(e).__name__}") |
|
|
import traceback |
|
|
traceback.print_exc() |
|
|
|
|
|
|
|
|
logger.info(f"์์ ๋ชจ๋: {len(all_keywords)}๊ฐ ํค์๋ ๋ชจ๋ ์ ์ง") |
|
|
return list(all_keywords) |
|
|
|
|
|
def get_search_volume_range(total_volume):
    """Return the search-volume bucket label for a total search volume.

    Args:
        total_volume: Total search volume as a number.

    Returns:
        One of "100미만", "1000미만", "2000미만", "5000미만", "10000미만"
        or "10000이상".
    """
    # Upper bounds are inclusive (exactly 100 maps to "100미만", and so on),
    # matching the `<=` comparisons this function has always used. Zero and
    # negative values fall into the first bucket.
    buckets = (
        (100, "100미만"),
        (1000, "1000미만"),
        (2000, "2000미만"),
        (5000, "5000미만"),
        (10000, "10000미만"),
    )
    for upper_bound, label in buckets:
        if total_volume <= upper_bound:
            return label
    return "10000이상"