Spaces:

seoulalpha
/

seoulalpha_space

Running

App Files Community

Syngyeon committed on Sep 26, 2025

Commit

369686e

verified ·

1 Parent(s): ce4e0cd

Upload 12 files

Browse files

Files changed (12) hide show

README.md +17 -16
app.py +127 -70
cluster_predictor.py +188 -0
models/feature_importance_ranking.csv +57 -0
models/imputation_base_data.csv +0 -0
models/kmeans.joblib +3 -0
models/pca.joblib +3 -0
models/preprocessor.joblib +3 -0
models/variable_weights.json +18 -0
rag_retriever.py +146 -0
region_extractor.py +97 -0
requirements.txt +0 -0

README.md CHANGED Viewed

@@ -1,16 +1,17 @@
----
-title: Seoulalpha Space
-emoji: 💬
-colorFrom: yellow
-colorTo: purple
-sdk: gradio
-sdk_version: 5.42.0
-app_file: app.py
-pinned: false
-hf_oauth: true
-hf_oauth_scopes:
-- inference-api
-license: mit
----
-An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).

+---
+title: Seoulalpha
+emoji: 💬
+colorFrom: yellow
+colorTo: purple
+sdk: gradio
+sdk_version: 5.42.0
+app_file: app.py
+pinned: false
+hf_oauth: true
+hf_oauth_scopes:
+- inference-api
+license: apache-2.0
+short_description: Travel Spots in Korea chatbot recommender
+---
+An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).

app.py CHANGED Viewed

@@ -1,70 +1,127 @@
-import gradio as gr
-from huggingface_hub import InferenceClient
-def respond(
-    message,
-    history: list[dict[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    hf_token: gr.OAuthToken,
-):
-    """
-    For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-    """
-    client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
-    messages = [{"role": "system", "content": system_message}]
-    messages.extend(history)
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        choices = message.choices
-        token = ""
-        if len(choices) and choices[0].delta.content:
-            token = choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-chatbot = gr.ChatInterface(
-    respond,
-    type="messages",
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.LoginButton()
-    chatbot.render()
-if __name__ == "__main__":
-    demo.launch()

+# app.py
+import gradio as gr
+from langdetect import detect
+from deep_translator import GoogleTranslator
+# 모듈 import
+from cluster_predictor import get_user_cluster
+from region_extractor import extract_region_from_query
+from rag_retriever import get_rag_recommendation
+# 언어 코드 매핑 (deep_translator 호환)
+LANG_CODE_MAP = {
+    "zh-cn": "zh-CN",
+    "zh-tw": "zh-TW",
+    "iw": "he",
+}
+def normalize_lang_code(code: str) -> str:
+    """Map a detected language code onto the spelling listed in LANG_CODE_MAP.
+
+    The lookup is case-insensitive; a code with no entry in the map is
+    returned exactly as it was passed in (its case is not changed).
+    """
+    lowered = code.lower()
+    if lowered in LANG_CODE_MAP:
+        return LANG_CODE_MAP[lowered]
+    return code
+# --- Gradio용 대화 함수 ---
+def _translate_from_korean(text, target_lang, fallback=None):
+    """Translate Korean ``text`` into ``target_lang`` on a best-effort basis.
+
+    Args:
+        text: Korean message to show to the user.
+        target_lang: Language code of the user's input; "ko" means no
+            translation is needed.
+        fallback: Value returned when translation fails. Defaults to the
+            untranslated ``text``.
+
+    Returns:
+        The translated message, or the fallback if the translator raised.
+    """
+    if target_lang == "ko":
+        return text
+    try:
+        return GoogleTranslator(source="ko", target=target_lang).translate(text)
+    except Exception:
+        # Deliberate best-effort: showing Korean is better than failing the turn.
+        return text if fallback is None else fallback
+def chatbot_interface(user_input, history, state):
+    """Handle one chat turn: detect language, classify the user, recommend.
+
+    Args:
+        user_input: Raw text typed by the user.
+        history: Chat history as a list of ``[user, bot]`` pairs.
+        state: Per-session dict holding "conversation_context" (variables
+            extracted so far) and "full_conversation" (Korean queries so far).
+
+    Returns:
+        A ``(new_history, state)`` tuple. ``new_history`` is ``history`` plus
+        one ``[user_input, reply]`` pair.
+    """
+    if user_input.strip().lower() in ("종료", "exit", "quit"):
+        return history + [[user_input, "프로그램을 종료합니다."]], state
+    conversation_context = state.get("conversation_context", {})
+    full_conversation = state.get("full_conversation", [])
+    # --- Step 1: detect the input language and translate it to Korean ---
+    try:
+        input_lang = normalize_lang_code(detect(user_input))  # 'en', 'ja', 'zh-cn', ...
+    except Exception as e:
+        return history + [[user_input, f"❌ 언어 감지 오류: {e}"]], state
+    if input_lang != "ko":
+        try:
+            current_query = GoogleTranslator(source=input_lang, target="ko").translate(user_input)
+        except Exception as e:
+            return history + [[user_input, f"❌ 번역 오류: {e}"]], state
+    else:
+        current_query = user_input
+    # --- Step 2: try to assign a cluster (one attempt per chat turn) ---
+    # Every status ends the turn, so no retry loop is needed; follow-up
+    # answers arrive as new turns carrying the saved context.
+    full_conversation.append(current_query)
+    status, data = get_user_cluster(current_query, conversation_context)
+    if status == "RETRY_WITH_QUESTION":
+        question_ko, conversation_context = data
+        state["conversation_context"] = conversation_context
+        state["full_conversation"] = full_conversation
+        # Ask the clarifying question in the user's own language.
+        question_to_user = _translate_from_korean(question_ko, input_lang)
+        return history + [[user_input, question_to_user]], state
+    if status == "FAIL":
+        fail_msg = _translate_from_korean("최종 클러스터 분석에 실패했습니다.", input_lang)
+        return history + [[user_input, fail_msg]], state
+    if status != "SUCCESS" or not data:
+        # Unknown status or missing cluster data: report instead of retrying.
+        fail_msg = _translate_from_korean("추천을 생성할 수 없습니다.", input_lang)
+        return history + [[user_input, fail_msg]], state
+    # --- Step 3: run RAG with the cluster profile and the whole conversation ---
+    _cluster_id, cluster_profile = data
+    final_query_for_rag = " ".join(full_conversation)
+    region_keywords = extract_region_from_query(final_query_for_rag)
+    rag_query = f"{cluster_profile} 특징을 가진 여행객이 '{final_query_for_rag}'와 같은 여행을 할 때 가기 좋은 곳"
+    final_answer_ko = get_rag_recommendation(rag_query, region_keywords)
+    # Translate the final answer back; on failure show the Korean text with a marker.
+    final_answer = _translate_from_korean(
+        final_answer_ko,
+        input_lang,
+        fallback=f"❌ 결과 번역 오류: {final_answer_ko}",
+    )
+    state["conversation_context"] = conversation_context
+    state["full_conversation"] = full_conversation
+    return history + [[user_input, final_answer]], state
+# --- Gradio UI definition ---
+# Builds a Blocks app with a chat window, a text input and per-session state.
+with gr.Blocks() as demo:
+    gr.Markdown("## ✈️ 여행 추천 챗봇")
+    chatbot = gr.Chatbot(height=500)
+    msg = gr.Textbox(label="사용자 입력")
+    # Initial state: no extracted variables and no conversation yet.
+    state = gr.State({"conversation_context": {}, "full_conversation": []})
+    def respond(message, chat_history, state):
+        # Delegate to chatbot_interface; the leading "" is routed to `msg`,
+        # which clears the textbox after each submit.
+        response, new_state = chatbot_interface(message, chat_history, state)
+        return "", response, new_state
+    # Inputs and outputs are both (textbox, chat window, state).
+    msg.submit(respond, [msg, chatbot, state], [msg, chatbot, state])
+if __name__ == "__main__":
+    demo.launch(show_api=False, debug=True)

cluster_predictor.py ADDED Viewed

	@@ -0,0 +1,188 @@

+# cluster_predictor.py
+import joblib
+import pandas as pd
+from openai import OpenAI
+import os
+import json
+from huggingface_hub import hf_hub_download
+# Load the prompt files from the Hugging Face dataset repo.
+# These downloads run at import time; the results are used as file paths
+# by query_llm_for_variables().
+PROMPT_PATH = hf_hub_download(
+    repo_id="Syngyeon/seoulalpha-data",
+    repo_type="dataset",   # must be set to "dataset"
+    filename="data/prompt/custom_prompt_eng.txt"
+)
+FEWSHOT_PATH = hf_hub_download(
+    repo_id="Syngyeon/seoulalpha-data",
+    repo_type="dataset",   # must be set to "dataset"
+    filename="data/prompt/custom_few_shot_learning_multi_language.txt"
+)
+# --- Initial setup ---
+# The OpenAI key is read from the API_KEY environment variable.
+client = OpenAI(api_key=os.getenv("API_KEY"))
+CLUSTER_PROFILES = {
+    0: "문화, 역사, 자연 탐방을 주목적으로 가을에 한국을 재방문하는 여행객. 긴 체류 기간 동안 서울과 여러 지방(경기, 강원, 경상)을 함께 방문하며, 매우 알뜰하게 소비하는 경향이 있음.",
+    1: "한국을 처음 방문한 여행객. 짧은 기간 동안 서울에만 머무르며 음식과 미식 탐방에 가장 큰 관심을 두고 여행함. 숙박비에 비교적 높은 예산을 사용함.",
+    2: "한국을 처음 방문한 여행객. 짧은 기간 서울에 머물며 음식, 쇼핑 등 모든 분야에서 압도적인 소비력을 보여주는 럭셔리 여행을 즐김.",
+    3: "쇼핑과 맛집 탐방을 목적으로 서울을 자주 재방문하는 여행객. 매우 짧은 기간 머물며 여행 목적을 집중적으로 달성하고, 식비에 지출 비중이 매우 높음. 문화나 자연보다 쇼핑과 미식에 관심이 집중됨.",
+    4: "한국 여행 경험이 풍부한 재방문객. 서울뿐만 아니라 전국을 여행하며, 특히 다양한 지역의 음식을 즐기는 미식 활동에 관심이 매우 높음.",
+    5: "한국을 처음 방문하는 여행객. 긴 기간 동안 머무르며 서울을 넘어 지방, 특히 경상도 지역의 자연 경관과 문화 유산을 깊이 있게 탐험하는 것에 관심이 압도적으로 높음. 예산은 비교적 적게 사용함.",
+    6: "한국을 처음 방문하는 여행객. 긴 기간 동안 지방, 특히 경상도를 여행하며 한국의 자연 경관과 문화 유산에 매우 높은 만족도와 깊은 감명을 느낌. 재방문 의향도 높은 이상적인 탐방형 여행객."
+}
+# --- Load models and data ---
+# All artifacts are loaded together. If any file is missing, every handle is
+# reset to None so get_user_cluster() can detect the unusable state.
+try:
+    preprocessor = joblib.load('./models/preprocessor.joblib')
+    pca = joblib.load('./models/pca.joblib')
+    kmeans = joblib.load('./models/kmeans.joblib')
+    imputation_base_data = pd.read_csv('./models/imputation_base_data.csv', encoding='utf-8-sig')
+    with open('./models/variable_weights.json', 'r', encoding='utf-8') as f:
+        VARIABLE_WEIGHTS = json.load(f)
+except FileNotFoundError:
+    print("모델 파일이 없습니다. train_model.py를 먼저 실행해주세요.")
+    preprocessor, pca, kmeans, imputation_base_data = None, None, None, None
+    # Bind VARIABLE_WEIGHTS as well: the helpers below test
+    # `if not VARIABLE_WEIGHTS`, which would raise NameError if it were unset.
+    VARIABLE_WEIGHTS = {}
+# 변수 정의
+categorical_cols = ['country', 'gender', 'age', 'revisit_indicator', 'visit_local_indicator', 'planned_activity']
+numerical_cols = ['stay_duration', 'accommodation_percent', 'food_percent', 'shopping_percent', 'food', 'landscape', 'heritage', 'language', 'safety', 'budget', 'accommodation', 'transport', 'navigation']
+used_variables = categorical_cols + numerical_cols
+def query_llm_for_variables(user_query, use_prompt=True, use_fewshot=True):
+    """Ask the LLM to extract traveller variables from ``user_query`` as JSON.
+
+    Args:
+        user_query: The user's (Korean) query text.
+        use_prompt: Include the instruction prompt file in the system message.
+        use_fewshot: Include the few-shot examples file in the system message.
+
+    Returns:
+        The parsed JSON response, or an empty dict if the API call or the
+        JSON parsing fails.
+    """
+    prompt_files = []
+    if use_prompt:
+        prompt_files.append(PROMPT_PATH)
+    if use_fewshot:
+        prompt_files.append(FEWSHOT_PATH)
+    sections = []
+    for path in prompt_files:
+        with open(path, "r", encoding="utf-8") as handle:
+            sections.append(handle.read())
+    system_prompt = "\n\n".join(sections)
+    try:
+        completion = client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_query},
+            ],
+            response_format={"type": "json_object"},  # force a JSON object reply
+        )
+        raw_reply = completion.choices[0].message.content.strip()
+        return json.loads(raw_reply)
+    except Exception as e:
+        print("[파싱 실패]", e)
+        return {}
+def _summarize_column(frame, var):
+    """Return the mean (numerical) or mode (categorical) of ``var``, or None."""
+    if frame is None or frame.empty or var not in frame.columns:
+        return None
+    if var in numerical_cols:
+        mean_value = frame[var].mean()
+        return None if pd.isna(mean_value) else mean_value
+    if var in categorical_cols:
+        modes = frame[var].mode()
+        # mode() is empty when the column holds only NaN values.
+        return None if modes.empty else modes.iloc[0]
+    return None
+def impute_with_user_subgroup(user_input_dict, df_base=imputation_base_data):
+    """Fill in missing variables from respondents who match the known ones.
+
+    The base data is narrowed to rows matching every value the user supplied.
+    Each missing variable is then filled with that subgroup's mean (numerical)
+    or mode (categorical), falling back to the whole base data when the
+    subgroup is empty or has no usable value.
+
+    Args:
+        user_input_dict: Mapping of variable name to value; None means unknown.
+        df_base: DataFrame used as the imputation reference.
+
+    Returns:
+        A dict with one entry per name in ``used_variables``.
+    """
+    known_info = {k: v for k, v in user_input_dict.items() if v is not None}
+    filtered_df = df_base.copy()
+    for key, val in known_info.items():
+        if key not in filtered_df.columns:
+            continue
+        column = filtered_df[key]
+        is_number = isinstance(val, (int, float)) and not isinstance(val, bool)
+        if is_number and pd.api.types.is_numeric_dtype(column):
+            # Compare numerically so 2 matches a float column storing 2.0;
+            # string comparison ("2" vs "2.0") would empty the subgroup.
+            mask = column == val
+        else:
+            mask = column.astype(str) == str(val)
+        filtered_df = filtered_df[mask]
+    imputed = {}
+    for var in used_variables:
+        if user_input_dict.get(var) is not None:
+            imputed[var] = user_input_dict[var]
+            continue
+        value = _summarize_column(filtered_df, var)
+        if value is None:
+            value = _summarize_column(df_base, var)
+        if value is None and var in numerical_cols:
+            # Keep the key present, as before, when no statistic is available.
+            value = float("nan")
+        imputed[var] = value
+    return imputed
+def predict_cluster_from_query(variable_dict: dict):
+    """Predict the cluster label for already-extracted variables.
+
+    This function does not call the LLM. It imputes missing values, casts
+    the columns, and runs preprocessor -> PCA -> KMeans.
+
+    Returns:
+        The first predicted label, or None when ``variable_dict`` is empty
+        or the transform/predict step raises.
+    """
+    if not variable_dict:
+        return None
+    row = pd.DataFrame([impute_with_user_subgroup(variable_dict)])
+    # Categorical columns are cast to str, numerical ones coerced to numbers.
+    for name in (c for c in categorical_cols if c in row.columns):
+        row[name] = row[name].astype(str)
+    for name in (c for c in numerical_cols if c in row.columns):
+        row[name] = pd.to_numeric(row[name], errors='coerce')
+    try:
+        reduced = pca.transform(preprocessor.transform(row))
+        labels = kmeans.predict(reduced)
+        return labels[0]
+    except Exception as e:
+        print(f"[클러스터 예측 실패] {e}")
+        return None
+# ==================== 신규 추가: 헬퍼 함수 ====================
+def _calculate_info_score(extracted_vars):
+    """Score how much is known by summing the weights of non-None variables."""
+    if not VARIABLE_WEIGHTS:
+        return 0.0
+    current_score = 0
+    for name, value in extracted_vars.items():
+        if value is None:
+            continue
+        # Variables without a configured weight contribute nothing.
+        current_score += VARIABLE_WEIGHTS.get(name, 0)
+    print(f"정보 충분도 점수: {current_score:.4f}")
+    return current_score
+def _generate_clarifying_question(user_query, context):
+    """Build a follow-up question about the most important missing variables.
+
+    Missing variables are ranked by their weight in VARIABLE_WEIGHTS; only
+    those with a description in ``variable_map`` are considered. The LLM is
+    asked to phrase a question about the top two. If that call fails, a
+    fixed template about the top one is returned instead.
+    """
+    variable_map = {
+        'revisit_indicator': '이번이 한국 첫 방문인지, 혹은 이전에 한국을 방문한 적이 있는지',
+        'visit_local_indicator': '수도권(서울/경기/인천) 외 다른 지역을 방문할 계획이 있는지',
+        'stay_duration': '한국 여행 기간',
+        'planned_activity': '한국 여행을 하기위해 계획한 활동'
+    }
+    if VARIABLE_WEIGHTS:
+        ranked = sorted(VARIABLE_WEIGHTS, key=VARIABLE_WEIGHTS.get, reverse=True)
+    else:
+        ranked = []
+    missing_vars = [
+        variable_map[name]
+        for name in ranked
+        if context.get(name) is None and name in variable_map
+    ]
+    if not missing_vars:
+        return "여행에 대해 조금만 더 자세히 말씀해주시겠어요?"
+    question_prompt = f"""당신은 친절한 여행 플래너입니다.
+        사용자가 아래와 같이 질문했습니다.
+        사용자 질문: "{user_query}"
+        사용자 맞춤 추천을 위해 '{', '.join(missing_vars[:2])}' 정보가 필요합니다.
+        사용자의 질문 맥락에 맞춰 자연스럽게 질문을 한 문장으로 만들어주세요."""
+    try:
+        completion = client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[{"role": "system", "content": question_prompt}],
+        )
+        return completion.choices[0].message.content
+    except Exception:
+        return f"혹시 계획 중인 {missing_vars[0]}에 대해 조금 더 알려주실 수 있나요?"
+# --- 대표 실행 함수 (재설계) ---
+def get_user_cluster(user_query: str, previous_context: dict = None):
+    """Extract variables from the query and try to assign a traveller cluster.
+
+    Args:
+        user_query: The user's query text for this turn.
+        previous_context: Variables collected in earlier turns, if any.
+
+    Returns:
+        A ``(status, data)`` tuple, one of:
+        - ``("SUCCESS", (cluster_label, profile))``
+        - ``("RETRY_WITH_QUESTION", (question, updated_context))`` when the
+          information score is not above 0.50
+        - ``("FAIL", message)`` when models are unavailable or prediction fails
+    """
+    required = (preprocessor, pca, kmeans, imputation_base_data)
+    if any(item is None for item in required) or imputation_base_data.empty:
+        # Report through the status protocol; callers dispatch on the status
+        # string and cannot handle a (None, None) result.
+        return "FAIL", "필수 모델/데이터 파일이 로드되지 않았습니다."
+    newly_extracted_vars = query_llm_for_variables(user_query)
+    current_context = previous_context.copy() if previous_context else {}
+    # Merge only values the LLM actually found, so earlier answers are kept.
+    current_context.update({k: v for k, v in newly_extracted_vars.items() if v is not None})
+    score = _calculate_info_score(current_context)
+    if score > 0.50:
+        # Enough information: run the clustering pipeline.
+        cluster_label = predict_cluster_from_query(current_context)
+        if cluster_label is None:
+            return "FAIL", "클러스터 예측에 실패했습니다."
+        profile = CLUSTER_PROFILES.get(cluster_label, "정의되지 않은 클러스터입니다.")
+        return "SUCCESS", (cluster_label, profile)
+    # Not enough information: ask the user a follow-up question.
+    question = _generate_clarifying_question(user_query, current_context)
+    return "RETRY_WITH_QUESTION", (question, current_context)

models/feature_importance_ranking.csv ADDED Viewed

	@@ -0,0 +1,57 @@

+feature,importance
+cat__gender_2,0.16612719377732216
+cat__revisit_indicator_0,0.16455068671975487
+cat__gender_1,0.1482217051794164
+cat__revisit_indicator_1,0.13597727601444679
+cat__visit_local_indicator_0,0.13319687700894334
+cat__visit_local_indicator_1,0.10697746487224133
+cat__visit_local_indicator_2,0.020241612346529863
+num__stay_duration,0.014849368057557598
+num__food_percent,0.014630130868015519
+num__shopping_percent,0.009280205750283798
+num__accommodation_percent,0.009180690732971732
+cat__planned_activity_2.0,0.008696580924116888
+cat__country_1,0.0070618551302561675
+cat__planned_activity_3.0,0.005779261578544953
+cat__age_2,0.004328397535810549
+cat__planned_activity_4.0,0.003544786166398055
+cat__planned_activity_99.0,0.003167551351550833
+cat__country_2,0.0029464161975619727
+num__language,0.002927291909464459
+num__budget,0.0027502516728713754
+num__safety,0.0025716066643480325
+num__navigation,0.0025153003716470498
+num__transport,0.002371096987708634
+num__food,0.00230589888998026
+num__accommodation,0.0022564692709160917
+num__landscape,0.0021469948117270295
+num__heritage,0.001992909406583649
+cat__country_5,0.0018994328238773261
+cat__age_3,0.0014320177855646557
+cat__country_10,0.0014213292787885526
+cat__age_5,0.0012658681149728401
+cat__age_4,0.0011676351945606124
+cat__country_8,0.0009806737230181408
+cat__planned_activity_6.0,0.0009291243639821009
+cat__planned_activity_1.0,0.0009189380963146095
+cat__country_4,0.0009131123524501497
+cat__country_3,0.0008415104761365984
+cat__country_9,0.0006391833106764304
+cat__country_6,0.0005992862929722725
+cat__country_15,0.0005961254268885925
+cat__planned_activity_7.0,0.0005203510135881252
+cat__age_1,0.0004786493334730666
+cat__country_7,0.00045682833180356916
+cat__age_6,0.0004286939505569954
+cat__country_11,0.0004238714395302936
+cat__country_13,0.0004145802913805203
+cat__country_14,0.0003922145285860367
+cat__country_99,0.0003900235538848013
+cat__country_19,0.00036882244028804445
+cat__country_12,0.00036315525951813906
+cat__country_18,0.00036298882952069363
+cat__planned_activity_8.0,0.0003433572001072147
+cat__country_16,0.0003025359192961062
+cat__planned_activity_5.0,0.00025347994754935764
+cat__country_17,0.0002398183809611425
+cat__country_20,6.0512142783746834e-05

models/imputation_base_data.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

models/kmeans.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71afae6ee2376a18af9bda0f6b7a3e8263458cf7e8747ed1a49b89f5e3834ff9
+size 78363

models/pca.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:78eae182e98318f55b029c52086e88bfce0dad6fa28a6cd426d8b140925a4a39
+size 2815

models/preprocessor.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:29424baa251ab960783524bdd697cbf145a5db0b1a6eb3313944a5acf99fd766
+size 7330

models/variable_weights.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+    "revisit_indicator":0.4589085007,
+    "visit_local_indicator":0.3976571565,
+    "planned_activity":0.0368824736,
+    "stay_duration":0.0226750987,
+    "food_percent":0.0223403219,
+    "shopping_percent":0.0141709453,
+    "accommodation_percent":0.0140189851,
+    "language":0.0044699972,
+    "budget":0.0041996554,
+    "safety":0.0039268631,
+    "navigation":0.0038408829,
+    "transport":0.0036206833,
+    "food":0.0035211253,
+    "accommodation":0.0034456459,
+    "landscape":0.0032784775,
+    "heritage":0.0030431879
+}

rag_retriever.py ADDED Viewed

	@@ -0,0 +1,146 @@

+# rag_retriever.py
+import json
+import os
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from openai import OpenAI
+from huggingface_hub import hf_hub_download
+# --- 설정 ---
+MODEL_NAME = 'jhgan/ko-sbert-nli'
+LLM_MODEL_NAME = 'gpt-3.5-turbo'
+DATA_REPO = "Syngyeon/seoulalpha-data"
+TOP_K = 10
+# OpenAI 클라이언트 초기화
+client = OpenAI(api_key=os.getenv("API_KEY"))
+# --- 리소스 로딩 ---
+def _load_resources():
+    """Download and load the embedding model, FAISS index and metadata map.
+
+    Returns:
+        ``(model, index, metadata_map)``, where ``metadata_map`` is keyed by
+        each record's ``vector_id``. Returns ``(None, None, None)`` if any
+        step raises.
+    """
+    try:
+        print("1. Hugging Face Hub에서 RAG 리소스를 다운로드합니다...")
+        # Fetch the index and its metadata from the dataset repo.
+        index_path = hf_hub_download(
+            repo_id=DATA_REPO,
+            repo_type="dataset",
+            filename="data/faiss/faiss_merged_output/merged.index",
+        )
+        metadata_path = hf_hub_download(
+            repo_id=DATA_REPO,
+            repo_type="dataset",
+            filename="data/faiss/faiss_merged_output/merged_metadata.jsonl",
+        )
+        model = SentenceTransformer(MODEL_NAME)
+        index = faiss.read_index(index_path)
+        # One JSON record per line; later duplicates of a vector_id win.
+        with open(metadata_path, 'r', encoding='utf-8') as f:
+            records = (json.loads(line) for line in f)
+            metadata_map = {record['vector_id']: record for record in records}
+        print("RAG 리소스 로딩 완료!")
+        return model, index, metadata_map
+    except Exception as e:
+        print(f"RAG 리소스 로딩에 실패했습니다: {e}")
+        return None, None, None
+# 모듈이 임포트될 때 리소스를 한 번만 로드합니다.
+embedding_model, faiss_index, meta_map = _load_resources()
+def _retrieve_places(query, k):
+    """Return metadata for the ``k`` nearest index entries to ``query``.
+
+    Ids returned by the index that have no entry in ``meta_map`` are skipped.
+    """
+    encoded = embedding_model.encode([query])
+    _, neighbor_ids = faiss_index.search(encoded.astype('float32'), k)
+    return [meta_map[vid] for vid in neighbor_ids[0] if vid in meta_map]
+def _generate_answer_with_llm(query, retrieved_places):
+    """Ask the LLM for a recommendation grounded in the retrieved places.
+
+    Only the first five places are included in the prompt. Returns the LLM
+    reply text, or an error message string if the API call raises.
+    """
+    sections = []
+    for number, place in enumerate(retrieved_places[:5], start=1):  # top 5 only
+        lines = [
+            f"--- 장소 정보 {number} ---",
+            f"이름: {place.get('name', '정보 없음')}",
+            f"주소: {place.get('address', '정보 없음')}",
+            f"AI 요약: {place.get('ai_summary', '정보 없음')}",
+            "주요 특징 및 후기:",
+        ]
+        lines.extend(f"- {sentence}" for sentence in place.get('processed_sentences', []))
+        # Each section ends with a newline followed by a blank separator line.
+        sections.append("\n".join(lines) + "\n\n")
+    context = "".join(sections)
+    system_prompt = "당신은 사용자의 질문에 가장 적합한 장소를 추천해주는 유용한 어시스턴트입니다."
+    user_prompt = f"""
+    아래 '장소 정보'만을 바탕으로 사용자의 질문에 대한 답변을 생성해 주세요.
+    [지시사항]
+    1. 검색된 장소 중에서 질문과 가장 관련성이 높은 2~3곳을 추천해 주세요.
+    2. 각 장소를 추천할 때, 반드시 '이름'과 '주소'를 명확하게 함께 표시해주세요.
+    3. 각 장소를 추천하는 이유를 'AI 요약'과 '주요 특징 및 후기'를 근거로 구체적으로 설명해 주세요.
+    4. 'processed_sentences'에 있는 실제 후기를 인용하여 답변하면 신뢰도를 높일 수 있습니다.
+    5. 친절하고 자연스러운 말투로 답변해 주세요.
+    --- 장소 정보 ---
+    {context}
+    --- 사용자의 질문 ---
+    {query}
+    """
+    chat_messages = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": user_prompt},
+    ]
+    try:
+        completion = client.chat.completions.create(
+            model=LLM_MODEL_NAME,
+            messages=chat_messages,
+            temperature=0.7,
+        )
+        return completion.choices[0].message.content
+    except Exception as e:
+        return f"LLM 답변 생성 중 오류가 발생했습니다: {e}"
+# --- 대표 실행 함수 ---
+def get_rag_recommendation(search_query, region_keywords):
+    """Return a final recommendation for the query via retrieval plus LLM.
+
+    Args:
+        search_query: Text used both for the vector search and the LLM prompt.
+        region_keywords: Address substrings used to filter results; empty or
+            None disables filtering.
+
+    Returns:
+        The LLM-generated answer, or a fixed Korean message when the RAG
+        resources are not loaded or no place matches.
+    """
+    # Compare against None explicitly rather than relying on the truthiness
+    # of model/index objects; an empty metadata map still counts as not ready.
+    if embedding_model is None or faiss_index is None or not meta_map:
+        return "RAG 시스템이 준비되지 않아 추천을 생성할 수 없습니다."
+    # 1. Semantic search over a wide candidate pool.
+    print("\n[RAG] 의미적으로 유사한 장소를 검색합니다...")
+    top_places = _retrieve_places(search_query, k=100)
+    if not top_places:
+        return "관련된 장소를 찾지 못했습니다."
+    # 2. Region filter. Blank or non-string keywords are dropped because an
+    #    empty string is a substring of every address.
+    keywords = [kw for kw in (region_keywords or []) if isinstance(kw, str) and kw.strip()]
+    if keywords:
+        print(f"[RAG] 주소 필터링 (키워드: {keywords})...")
+        filtered_places = []
+        for place in top_places:
+            address = place.get('address') or ''  # tolerate a null address
+            if any(keyword in address for keyword in keywords):
+                filtered_places.append(place)
+                if len(filtered_places) >= TOP_K:
+                    break
+        print(f"[RAG] 필터링 후 남은 장소: {[p.get('name') for p in filtered_places]}")
+    else:
+        print("[RAG] 지역 키워드가 없어 필터링을 건너뜁니다.")
+        filtered_places = top_places
+    if not filtered_places:
+        return "요청하신 지역에 맞는 장소를 찾지 못했습니다."
+    # 3. Generate the answer from the filtered places.
+    print("[RAG] 필터링된 정보를 바탕으로 최종 답변을 생성합니다...")
+    return _generate_answer_with_llm(search_query, filtered_places)

region_extractor.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import os
+import json
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from huggingface_hub import hf_hub_download
+from openai import OpenAI   # 🔹 추가
+DATA_REPO = "Syngyeon/seoulalpha-data"
+MODEL_NAME = "jhgan/ko-sbert-nli"
+# OpenAI 클라이언트 초기화
+client = OpenAI(api_key=os.getenv("API_KEY"))   # 🔹 추가
+# 로드
+def _load_region_index():
+    """Download and load the region FAISS index, embedding model and metadata.
+
+    Returns:
+        ``(model, index, metadata_map)`` keyed by ``vector_id``, or
+        ``(None, None, None)`` if any step raises.
+    """
+    try:
+        index_path = hf_hub_download(
+            repo_id=DATA_REPO,
+            repo_type="dataset",
+            filename="data/faiss/region_db/faiss_region_semantic.index",
+        )
+        metadata_path = hf_hub_download(
+            repo_id=DATA_REPO,
+            repo_type="dataset",
+            filename="data/faiss/region_db/metadata_region_semantic.jsonl",
+        )
+        index = faiss.read_index(index_path)
+        model = SentenceTransformer(MODEL_NAME)
+        # One JSON record per line, indexed by its vector_id.
+        with open(metadata_path, "r", encoding="utf-8") as f:
+            records = (json.loads(line) for line in f)
+            metadata_map = {record["vector_id"]: record for record in records}
+        print("[RegionDB] 로딩 완료")
+        return model, index, metadata_map
+    except Exception as e:
+        print("[RegionDB] 로딩 실패:", e)
+        return None, None, None
+region_model, region_index, region_meta = _load_region_index()
+def extract_region_semantic(user_query, top_k=5):
+    """Return region names of the ``top_k`` nearest entries in the region index.
+
+    Returns an empty list when the region resources are not loaded. Ids with
+    no entry in the metadata map are skipped.
+    """
+    if not (region_model and region_index and region_meta):
+        return []
+    embedded = region_model.encode([user_query]).astype("float32")
+    _, hit_ids = region_index.search(embedded, top_k)
+    return [region_meta[vid]["region_name"] for vid in hit_ids[0] if vid in region_meta]
+def extract_region_from_query(user_query):
+    """Use the LLM to extract region keywords from the user's query.
+
+    Returns:
+        The list under the "regions" key of the LLM's JSON reply, or an empty
+        list when that key is missing, is not a list, or the call raises.
+    """
+    print("[LLM] 사용자 쿼리에서 지역명 키워드를 추출합니다...")
+    system_prompt = """
+    당신은 사용자의 여행 관련 질문에서 '대한민국 행정구역' 키워드를 추출하는 AI 어시스턴트입니다.
+    사용자의 질문을 분석하여, 주소 필터링에 사용할 수 있는 키워드 목록을 JSON 형식으로 반환해 주세요.
+    결과는 반드시 {"regions": ["키워드1", "키워드2", ...]} 형태여야 합니다.
+    - "전라도"는 "전북", "전남", "광주"로 해석합니다.
+    - "경상도"는 "경북", "경남", "부산", "대구", "울산"으로 해석합니다.
+    - "충청도"는 "충북", "충남", "대전", "세종"으로 해석합니다.
+    - "서울 근교"는 "경기", "인천"으로 해석합니다.
+    - 언급된 지역이 없으면 빈 리스트 []를 반환합니다.
+    """
+    try:
+        completion = client.chat.completions.create(
+            model="gpt-3.5-turbo",
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_query},
+            ],
+            response_format={"type": "json_object"},
+        )
+        parsed = json.loads(completion.choices[0].message.content)
+        regions = parsed['regions'] if 'regions' in parsed else None
+        return regions if isinstance(regions, list) else []
+    except Exception as e:
+        print(f"[LLM] 지역명 추출 중 오류 발생: {e}")
+        return []

requirements.txt ADDED Viewed

Binary file (322 Bytes). View file