Spaces:

ssboost
/

3gghdf5

Running

File size: 17,692 Bytes

106555b

"""
키워드 처리 관련 기능 - 앞뒤 조합 중 높은 검색량만 선택, 카테고리 항목 제거
- 키워드 추출 및 조합
- 검색 결과 처리
"""

import pandas as pd
import re
from collections import defaultdict, Counter
import text_utils
import keyword_search
import product_search
import logging

# Logging setup.
# This module owns a dedicated logger that emits INFO-and-above records
# through a stream handler using a timestamped format.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)
# logging.getLogger() hands back the same logger object for the same name, so
# when this module is executed again (importlib.reload, a hot-reloading dev
# server, ...) an unconditional addHandler() would stack another handler and
# every record would be printed once per reload. Attach the handler only when
# the logger does not have one yet.
if not logger.handlers:
    logger.addHandler(handler)

def _to_ui_keyword(api_keyword, current_keyword):
    """Return the display form of *api_keyword*, splitting off *current_keyword* with spaces.

    The keyword is returned unchanged when there is no main keyword, when the
    main keyword does not occur in it, when it equals the main keyword, or
    when it already contains a space. Otherwise a space is inserted around the
    first occurrence of the main keyword (prefix, suffix or middle position).
    """
    if not current_keyword or current_keyword not in api_keyword:
        return api_keyword
    if api_keyword == current_keyword or " " in api_keyword:
        return api_keyword

    if api_keyword.startswith(current_keyword):
        # e.g. "<main><rest>" -> "<main> <rest>"
        suffix = api_keyword[len(current_keyword):]
        return f"{current_keyword} {suffix}" if suffix else api_keyword

    if api_keyword.endswith(current_keyword):
        # e.g. "<rest><main>" -> "<rest> <main>"
        prefix = api_keyword[:-len(current_keyword)]
        return f"{prefix} {current_keyword}" if prefix else api_keyword

    # Main keyword sits in the middle: "<prefix><main><suffix>".
    idx = api_keyword.find(current_keyword)
    if idx <= 0:
        return api_keyword
    prefix = api_keyword[:idx]
    suffix = api_keyword[idx + len(current_keyword):]
    ui_keyword = f"{prefix} {current_keyword}"
    if suffix:
        ui_keyword += f" {suffix}"
    return ui_keyword


def process_search_results(search_results, current_keyword="", exclude_zero_volume=True):
    """
    Turn raw search results into a product table and a keyword table.

    For every front/back combination listed in ``keyword_pairs`` only the
    variant with the higher total search volume is kept; the category column
    is not included in the keyword table.

    Args:
        search_results (dict): Search result data. Reads the keys
            ``"product_list"``, ``"combo_candidates"`` (mapping keyed by the
            space-free API keyword), ``"category_counter"`` (mapping category
            -> count), ``"keyword_indices"`` (mapping word -> iterable of
            0-based positions) and, optionally, ``"keyword_pairs"`` (mapping
            base word -> dict with ``"front"`` and ``"back"`` keywords).
        current_keyword (str): The keyword currently being searched.
        exclude_zero_volume (bool): Drop keywords whose total search volume
            is 0.

    Returns:
        dict: ``"products_df"`` (DataFrame built from the product list, or
        None), ``"keywords_df"`` (DataFrame sorted by total search volume in
        descending order, or None), ``"categories"`` (list of
        ``"name (count)"`` labels preceded by the "show all" entry) and
        ``"message"`` (status text). When there are no products both frames
        are None.

    Raises:
        KeyError: If a non-empty ``search_results`` lacks one of the required
            keys listed above.
    """
    logger.info("\n===== 검색 결과 처리 시작 =====")
    logger.info(f"현재 키워드: '{current_keyword}'")
    logger.info(f"검색량 0 키워드 제외: {exclude_zero_volume}")

    if not search_results or not search_results.get("product_list"):
        logger.warning("검색 결과가 없습니다.")
        return {
            "products_df": None,
            "keywords_df": None,
            "categories": ["전체 보기"],
            "message": "검색 결과가 없습니다."
        }

    product_list = search_results["product_list"]
    combo_candidates = search_results["combo_candidates"]
    category_counter = search_results["category_counter"]
    keyword_indices = search_results["keyword_indices"]
    keyword_pairs = search_results.get("keyword_pairs", {})  # front/back combination info

    logger.info(f"검색 결과 - 상품 수: {len(product_list)}개")
    logger.info(f"검색 결과 - 조합 후보 수: {len(combo_candidates)}개")
    logger.info(f"검색 결과 - 카테고리 수: {len(category_counter)}개")

    # Product table.
    df_products = pd.DataFrame(product_list)

    # Map every API keyword (no spaces) to its display (UI) form.
    api_to_ui_keywords = {
        api_keyword: _to_ui_keyword(api_keyword, current_keyword)
        for api_keyword in combo_candidates
    }

    # Look up search volumes for all candidates before any filtering.
    logger.info(f"\n검색량 조회 대상 키워드 수: {len(combo_candidates)}개")
    search_volumes = keyword_search.fetch_all_search_volumes(list(combo_candidates.keys()))
    logger.info(f"검색량 조회 완료: {len(search_volumes)}개 결과")

    # Keep only the higher-volume variant of every front/back combination.
    if keyword_pairs and current_keyword:
        logger.info("\n=== 앞뒤 조합 중 높은 검색량 선택 ===")
        filtered_candidates = {}

        # The main keyword itself is always kept.
        main_api = current_keyword.replace(" ", "")
        if main_api in combo_candidates:
            filtered_candidates[main_api] = combo_candidates[main_api]
            logger.info(f"메인 키워드 유지: '{current_keyword}'")

        # Every API keyword that takes part in a front/back pair. Built once
        # as a set so the membership test below is O(1) per candidate instead
        # of rebuilding two lists for each candidate.
        pair_members = set()
        for pair_info in keyword_pairs.values():
            pair_members.add(pair_info["front"].replace(" ", ""))
            pair_members.add(pair_info["back"].replace(" ", ""))

        # Compounds that contain the main keyword but are not part of a pair
        # are kept as they are.
        for api_kw, categories in combo_candidates.items():
            ui_kw = api_to_ui_keywords[api_kw]
            if current_keyword in ui_kw and api_kw != main_api and api_kw not in pair_members:
                filtered_candidates[api_kw] = categories
                logger.info(f"메인 키워드 포함 복합어 유지: '{ui_kw}'")

        # Compare the two orderings of each pair.
        for base_word, pair_info in keyword_pairs.items():
            front_kw = pair_info["front"]  # "<word> <main keyword>"
            back_kw = pair_info["back"]    # "<main keyword> <word>"

            front_api = front_kw.replace(" ", "")
            back_api = back_kw.replace(" ", "")

            front_vol = search_volumes.get(front_api, {}).get("총검색량", 0)
            back_vol = search_volumes.get(back_api, {}).get("총검색량", 0)

            if front_vol > back_vol:
                selected_api, selected_kw, selected_vol = front_api, front_kw, front_vol
                removed_kw, removed_vol = back_kw, back_vol
            elif back_vol > front_vol or (front_vol == back_vol and front_vol > 0):
                # On a tie with a positive volume the "back" variant wins.
                selected_api, selected_kw, selected_vol = back_api, back_kw, back_vol
                removed_kw, removed_vol = front_kw, front_vol
            else:
                # Neither variant has any search volume.
                logger.info(f"  '{base_word}' 조합: 둘 다 검색량 0으로 제외")
                continue

            if not (selected_vol > 0 or not exclude_zero_volume):
                logger.info(f"  '{base_word}' 조합: 검색량 0으로 제외")
                continue

            # A pair keyword that is not among the candidates cannot be
            # carried over (it has no categories and no UI mapping); skip it
            # instead of failing with KeyError.
            if selected_api not in combo_candidates:
                logger.warning(f"  '{base_word}' 조합: 선택된 키워드 '{selected_kw}'가 조합 후보에 없어 제외")
                continue

            filtered_candidates[selected_api] = combo_candidates[selected_api]
            logger.info(f"  '{base_word}' 조합 선택: '{selected_kw}' ({selected_vol:,}) > '{removed_kw}' ({removed_vol:,})")

        # Continue with the filtered candidates only.
        combo_candidates = filtered_candidates
        logger.info(f"앞뒤 조합 필터링 완료: {len(combo_candidates)}개 키워드 선택")

    # Statistics on zero-volume keywords (over everything that was looked up).
    zero_volume_count = sum(1 for vol in search_volumes.values() if vol.get("총검색량", 0) == 0)
    logger.info(f"검색량 0인 키워드 수: {zero_volume_count}개 ({zero_volume_count/max(1, len(search_volumes))*100:.1f}%)")

    # De-duplicate keywords that consist of the same words in a different
    # order: normalized key -> (api keyword, ui keyword, total volume).
    normalized_keywords = {}

    for api_keyword in combo_candidates:
        ui_keyword = api_to_ui_keywords[api_keyword]
        total_count = search_volumes.get(api_keyword, {}).get("총검색량", 0)

        if exclude_zero_volume and total_count == 0:
            logger.debug(f"  - '{ui_keyword}' (API: '{api_keyword}') - 검색량 0으로 제외됨")
            continue

        # Sorting the words makes the key independent of word order.
        normalized = "".join(sorted(ui_keyword.split()))

        if normalized not in normalized_keywords:
            normalized_keywords[normalized] = (api_keyword, ui_keyword, total_count)
            logger.debug(f"  - 키워드 추가: '{ui_keyword}' (검색량: {total_count})")
            continue

        # Same word set seen before: keep whichever has the higher volume.
        _, existing_ui_keyword, existing_total = normalized_keywords[normalized]
        if total_count > existing_total:
            logger.debug(f"  - 중복 키워드 대체: '{existing_ui_keyword}' ({existing_total}) -> '{ui_keyword}' ({total_count})")
            normalized_keywords[normalized] = (api_keyword, ui_keyword, total_count)
        else:
            logger.debug(f"  - 중복 키워드 제외: '{ui_keyword}' ({total_count}) < '{existing_ui_keyword}' ({existing_total})")

    logger.info(f"\n중복 제거 후 키워드 수: {len(normalized_keywords)}개")

    # Build one table row per surviving keyword.
    final_combos = []
    for api_keyword, ui_keyword, total_count in normalized_keywords.values():
        # Improve readability of the keyword (token order).
        readable = fix_keyword_order(ui_keyword, current_keyword)

        volume = search_volumes.get(api_keyword, {})
        pc_count = volume.get("PC검색량", 0)
        mobile_count = volume.get("모바일검색량", 0)

        search_volume_range = text_utils.get_search_volume_range(total_count)

        # Positions (1-based) at which the word occurs, looked up first by the
        # keyword with the main keyword stripped, then by the API keyword.
        base_word = readable.replace(current_keyword, "").strip() if current_keyword else readable
        ranks = []
        if base_word in keyword_indices:
            ranks = [idx + 1 for idx in keyword_indices[base_word]]
        elif api_keyword in keyword_indices:
            ranks = [idx + 1 for idx in keyword_indices[api_keyword]]

        ranks_str = ", ".join(map(str, ranks)) if ranks else "-"

        # Category information is used internally only; the table has no
        # category column.
        final_combos.append({
            "조합 키워드": readable.strip(),
            "PC검색량": pc_count,
            "모바일검색량": mobile_count,
            "총검색량": total_count,
            "검색량구간": search_volume_range,
            "키워드 사용자순위": ranks_str,
            "키워드 사용횟수": len(ranks)
        })

    df_keywords = pd.DataFrame(final_combos)

    # Highest total volume first; reset the index so row numbers are sequential.
    if not df_keywords.empty:
        df_keywords = df_keywords.sort_values(by="총검색량", ascending=False)
        df_keywords = df_keywords.reset_index(drop=True)

    logger.info(f"\n생성된 키워드 데이터프레임 행 수: {len(df_keywords)}")
    if not df_keywords.empty:
        logger.debug(f"데이터프레임 열: {df_keywords.columns.tolist()}")
        logger.info(f"총 {len(df_keywords)}개 키워드 생성 완료")

    # Category labels "name (count)", alphabetically, after the "show all" entry.
    category_with_counts = [f"{cat} ({category_counter[cat]})" for cat in sorted(category_counter.keys())]
    category_with_counts.insert(0, "전체 보기")

    logger.info(f"카테고리 수: {len(category_counter)}개")
    logger.info("===== 검색 결과 처리 완료 =====\n")

    return {
        "products_df": df_products,
        "keywords_df": df_keywords,
        "categories": category_with_counts,
        "message": "✅ 검색이 완료되었습니다. 아래에서 키워드를 확인하세요."
    }

def filter_and_sort_table(df, selected_cat, keyword_sort, total_volume_sort, usage_count_sort, selected_volume_range, exclude_zero_volume=False):
    """
    Filter and sort the keyword table, then render it through the export helper.

    Args:
        df (pandas.DataFrame | None): Keyword table with the columns
            "조합 키워드", "총검색량", "검색량구간" and "키워드 사용횟수".
        selected_cat (str): Accepted for interface compatibility only; the
            category column was removed from the table, so no category filter
            is applied.
        keyword_sort (str): "오름차순" for ascending, "정렬 없음" for no sort,
            any other value sorts descending. Applies to the keyword column.
        total_volume_sort (str): Same choices, for the total search volume.
        usage_count_sort (str): Same choices, for the usage count.
        selected_volume_range (str): Volume-range label to keep; empty or
            "전체" keeps every range.
        exclude_zero_volume (bool): Drop rows whose total volume is not
            positive.

    Returns:
        Whatever ``export_utils.create_table_without_checkboxes`` returns for
        the filtered frame (presumably an HTML string - confirm against that
        helper), or ``""`` when *df* is None or empty.
    """
    if df is None or df.empty:
        return ""

    filtered = df.copy()

    # Volume-range filter.
    if selected_volume_range and selected_volume_range != "전체":
        filtered = filtered[filtered["검색량구간"] == selected_volume_range]

    # Zero-volume filter.
    if exclude_zero_volume:
        filtered = filtered[filtered["총검색량"] > 0]
        logger.info(f"검색량 0 제외 필터 적용 - 남은 키워드 수: {len(filtered)}")

    # Sorts are applied one after another, so the LAST active sort is the
    # primary key. A stable algorithm is required for the earlier sorts to
    # survive as tie-breakers; pandas' default ("quicksort") is not stable, so
    # request mergesort explicitly.
    sort_requests = (
        (keyword_sort, "조합 키워드"),
        (total_volume_sort, "총검색량"),
        (usage_count_sort, "키워드 사용횟수"),
    )
    for choice, column in sort_requests:
        if choice == "정렬 없음":
            continue
        filtered = filtered.sort_values(
            by=column,
            ascending=(choice == "오름차순"),
            kind="mergesort",
        )

    logger.info(f"필터 적용 후 - 필터링된 DataFrame 행 수: {len(filtered)}")

    # Reset the index so row numbers run sequentially from the top.
    filtered = filtered.reset_index(drop=True)

    # Imported lazily, only when there is something to render.
    from export_utils import create_table_without_checkboxes

    return create_table_without_checkboxes(filtered)

def fix_keyword_order(keyword, main_keyword):
    """
    Reorder a keyword so Hangul tokens come first and other tokens last.

    Two shapes are handled:
    * a leading digit(+letter) run glued directly to Hangul text, which is
      moved behind the remainder of the keyword;
    * a whitespace-separated keyword, whose tokens containing Hangul are
      placed before the tokens without Hangul (relative order kept).

    Args:
        keyword (str): Keyword to reorder.
        main_keyword (str): Main keyword; not used by the current logic.

    Returns:
        str: The reordered keyword, or the input unchanged when neither
        shape applies.
    """
    # Digits(+letters) immediately followed by Hangul, with no space between.
    glued = re.match(r'^([0-9]+[a-zA-Z]*)([가-힣]+.*)$', keyword)
    if glued is not None:
        leading_unit, hangul_tail = glued.groups()
        reordered = f"{hangul_tail} {leading_unit}"
        logger.debug(f"붙어있는 패턴 수정: '{keyword}' -> '{reordered}'")
        return reordered

    # Nothing to reorder in a single token.
    if ' ' not in keyword:
        return keyword

    tokens = keyword.split()
    hangul_tokens = [tok for tok in tokens if re.search(r'[가-힣]', tok)]
    other_tokens = [tok for tok in tokens if not re.search(r'[가-힣]', tok)]

    # Only a mix of both kinds can be out of order.
    if not (hangul_tokens and other_tokens):
        return keyword

    reordered = " ".join(hangul_tokens + other_tokens)
    if reordered != keyword:
        logger.debug(f"키워드 순서 수정: '{keyword}' -> '{reordered}'")
    return reordered