SyngyeonTak committed
Commit · 1a554ac
Parent(s): 500a117
changes to adapt to hugging face

Browse files:
- app.py +1 -6
- cluster_predictor.py +7 -5
- llm_cluster_predictor.py +0 -198
- rag_retriever.py +27 -35
- region_extractor.py +39 -58
- requirements.txt +0 -0
app.py
CHANGED
@@ -1,7 +1,4 @@
 # app.py
-
-import os
-from dotenv import load_dotenv
 import gradio as gr
 from langdetect import detect
 from deep_translator import GoogleTranslator
@@ -11,8 +8,6 @@ from cluster_predictor import get_user_cluster
 from region_extractor import extract_region_from_query
 from rag_retriever import get_rag_recommendation
 
-# --- Initial setup ---
-load_dotenv()
 
 # Language code mapping (compatible with deep_translator)
 LANG_CODE_MAP = {
@@ -129,4 +124,4 @@ with gr.Blocks() as demo:
     msg.submit(respond, [msg, chatbot, state], [msg, chatbot, state])
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch(show_api=False, debug=True)
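With the move to Spaces, app.py drops python-dotenv: the OpenAI key is read straight from the environment (Spaces injects repository secrets as environment variables), and the app is launched with show_api=False, debug=True. A minimal sketch of that pattern, with a placeholder respond handler standing in for the real RAG pipeline:

import os
import gradio as gr
from openai import OpenAI

# On Spaces, secrets are exposed as environment variables, so no load_dotenv() is needed.
client = OpenAI(api_key=os.getenv("API_KEY"))

def respond(message, history):
    # Placeholder handler; the real app routes the message through the RAG pipeline.
    history = history + [[message, f"echo: {message}"]]
    return "", history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    msg.submit(respond, [msg, chatbot], [msg, chatbot])

if __name__ == "__main__":
    # show_api=False hides the auto-generated API page; debug=True surfaces tracebacks in the Space logs.
    demo.launch(show_api=False, debug=True)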
cluster_predictor.py
CHANGED
@@ -5,11 +5,13 @@ import pandas as pd
 from openai import OpenAI
 import os
 import json
-from dotenv import load_dotenv
-
+from huggingface_hub import hf_hub_download
+
+# Load the prompt files from the Hugging Face dataset repo
+PROMPT_PATH = hf_hub_download("Syngyeon/seoulalpha-data", "prompt/custom_prompt_eng.txt")
+FEWSHOT_PATH = hf_hub_download("Syngyeon/seoulalpha-data", "prompt/custom_few_shot_learning_multi_language.txt")
 
 # --- Initial setup ---
-load_dotenv()
 client = OpenAI(api_key=os.getenv("API_KEY"))
 
 CLUSTER_PROFILES = {
@@ -43,12 +45,12 @@ def query_llm_for_variables(user_query, use_prompt=True, use_fewshot=True):
     prompt_parts = []
 
     if use_prompt:
-        with open("
+        with open("PROMPT_PATH", "r", encoding="utf-8") as f:
             custom_prompt = f.read()
         prompt_parts.append(custom_prompt)
 
     if use_fewshot:
-        with open("
+        with open("FEWSHOT_PATH", "r", encoding="utf-8") as f:
             few_shot_examples = f.read()
         prompt_parts.append(few_shot_examples)
 
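hf_hub_download fetches one file from a Hub repository into the local cache and returns the cached file's absolute path; that path is what should be opened. The committed hunk appears to open the literal strings "PROMPT_PATH" and "FEWSHOT_PATH" rather than the variables. A minimal sketch of the presumably intended pattern; repo_type="dataset" is an assumption for a dataset-hosted file and is not part of the commit:

from huggingface_hub import hf_hub_download

# Download once into the local HF cache and get back the cached path.
# repo_type="dataset" is assumed here; omit it if the repo is a model repo.
PROMPT_PATH = hf_hub_download(
    repo_id="Syngyeon/seoulalpha-data",
    filename="prompt/custom_prompt_eng.txt",
    repo_type="dataset",
)

# Pass the variable (a real filesystem path), not the string "PROMPT_PATH".
with open(PROMPT_PATH, "r", encoding="utf-8") as f:
    custom_prompt = f.read()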
llm_cluster_predictor.py
DELETED
@@ -1,198 +0,0 @@
-import subprocess
-import sys
-import os
-import json
-from dotenv import load_dotenv
-import os
-
-load_dotenv()
-
-# Use the environment variable
-USER_KEY = os.getenv("API_KEY")
-
-# Auto-install required libraries when they are missing
-def install_required_packages(package, import_name=None):
-    try:
-        if import_name:
-            __import__(import_name)
-        else:
-            __import__(package)
-    except ImportError:
-        print(f"Installing {package}...")
-        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
-        print(f"{package} installed!")
-
-required_packages = [
-    ("scikit-learn", "sklearn"),
-    ("tqdm", "tqdm"),
-    ("openai", "openai"),
-    ("pandas", "pandas"),
-    ("numpy", "numpy")
-]
-
-for pkg, imp in required_packages:
-    install_required_packages(pkg, imp)
-
-from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
-from sklearn.compose import ColumnTransformer
-from sklearn.pipeline import Pipeline
-from sklearn.cluster import KMeans
-from sklearn.metrics import silhouette_score
-from sklearn.decomposition import PCA
-from sklearn.impute import SimpleImputer
-from tqdm import tqdm
-from openai import OpenAI
-import pandas as pd
-import numpy as np
-
-# OpenAI API key setup
-#USER_KEY = ""
-client = OpenAI(api_key=USER_KEY)
-
-# Prepare the data
-preprocessed_data = pd.read_csv('./๊ด๊ด๋ฐ์ดํฐ.csv', encoding='cp949')
-
-# Variable groups
-categorical_cols = ['country', 'gender', 'age',
-                    'revisit_indicator', 'visit_local_indicator', 'planned_activity']
-
-numerical_cols = [
-    'stay_duration', 'accommodation_percent', 'food_percent', 'shopping_percent', 'food',
-    'landscape', 'heritage', 'language', 'safety', 'budget',
-    'accommodation', 'transport', 'navigation'
-]
-used_variables = categorical_cols + numerical_cols
-
-for col in categorical_cols:
-    preprocessed_data[col] = preprocessed_data[col].astype(str)
-preprocessed_data_clean = preprocessed_data.dropna(subset=used_variables).copy()
-
-# Define the preprocessing pipelines
-# Numeric pipeline: mean imputation + scaling
-numeric_pipeline = Pipeline([
-    ('imputer', SimpleImputer(strategy='mean')),
-    ('scaler', MinMaxScaler())
-])
-
-# Categorical pipeline: mode imputation + one-hot encoding
-categorical_pipeline = Pipeline([
-    ('imputer', SimpleImputer(strategy='most_frequent')),
-    ('encoder', OneHotEncoder(handle_unknown='ignore'))
-])
-
-preprocessor = ColumnTransformer(transformers=[
-    ('cat', categorical_pipeline, categorical_cols),
-    ('num', numeric_pipeline, numerical_cols)
-])
-
-# Fit: preprocessing + PCA + clustering
-X_preprocessed = preprocessor.fit_transform(preprocessed_data_clean)
-pca = PCA(n_components=3)
-X_reduced = pca.fit_transform(X_preprocessed)
-
-kmeans = KMeans(n_clusters=7, random_state=42)
-preprocessed_data_clean['cluster'] = kmeans.fit_predict(X_reduced)
-
-print("explained_variance_ratio:", pca.explained_variance_ratio_.sum())
-print(f"Silhouette Score: {silhouette_score(X_reduced, preprocessed_data_clean['cluster']):.4f}")
-
-# LLM query -> variable mapping functions
-def load_text_file(filepath):
-    try:
-        with open(filepath, "r", encoding="utf-8") as f:
-            return f.read()
-    except Exception as e:
-        print(f"[file loading failed] {filepath} - {e}")
-        return ""
-
-def query_llm_for_variables(user_query, use_prompt=True, use_fewshot=True):
-    prompt_parts = []
-
-    if use_prompt:
-        with open("custom_prompt.txt", "r", encoding="utf-8") as f:
-            custom_prompt = f.read()
-        prompt_parts.append(custom_prompt)
-
-    if use_fewshot:
-        with open("custom_few_shot_learning.txt", "r", encoding="utf-8") as f:
-            few_shot_examples = f.read()
-        prompt_parts.append(few_shot_examples)
-
-    full_prompt = "\n\n".join(prompt_parts)
-
-    messages = [
-        {"role": "system", "content": full_prompt},
-        {"role": "user", "content": user_query}
-    ]
-
-    try:
-        response = client.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=messages
-        )
-        content = response.choices[0].message.content.strip()
-        return json.loads(content)
-    except Exception as e:
-        print("[parsing failed]", e)
-        return {}
-
-
-def impute_with_user_subgroup(user_input_dict, df_base):
-    known_info = {k: v for k, v in user_input_dict.items() if v is not None}
-    filtered_df = df_base.copy()
-    for key, val in known_info.items():
-        if key in filtered_df.columns:
-            filtered_df = filtered_df[filtered_df[key].astype(str) == str(val)]
-    imputed = {}
-    for var in used_variables:
-        if user_input_dict.get(var) is not None:
-            imputed[var] = user_input_dict[var]
-        else:
-            if var in numerical_cols:
-                imputed[var] = filtered_df[var].mean() if not filtered_df.empty else df_base[var].mean()
-            elif var in categorical_cols:
-                mode_series = filtered_df[var].mode() if not filtered_df.empty else df_base[var].mode()
-                imputed[var] = mode_series.iloc[0] if not mode_series.empty else None
-    return imputed
-
-# Query -> prediction function
-def predict_cluster_from_query(user_query):
-    variable_dict = query_llm_for_variables(user_query, use_prompt=True, use_fewshot=True)
-
-    # Keep only the non-null values for display
-    filtered_dict = {k: v for k, v in variable_dict.items() if v is not None}
-    print("LLM extraction result:", filtered_dict)
-
-    # Impute missing values
-    completed_input = impute_with_user_subgroup(variable_dict, preprocessed_data_clean)
-    df = pd.DataFrame([completed_input])
-
-    for col in categorical_cols:
-        df[col] = df[col].astype(str)
-    for col in numerical_cols:
-        df[col] = pd.to_numeric(df[col], errors='coerce')
-
-    try:
-        X_processed = preprocessor.transform(df)
-        X_pca = pca.transform(X_processed)
-        cluster_label = kmeans.predict(X_pca)[0]
-        return cluster_label
-    except Exception as e:
-        print("[prediction failed]", e)
-        return None
-
-# main block
-if __name__ == "__main__":
-    test_inputs = [
-        "I am a man in my 50s; I like nature and took a 4-day trip",
-        "I am a woman in my 20s who loves shopping; I stayed for 3 days in Seoul",
-        "I am a man in my 30s who likes experiencing traditional Korean culture; I am staying 5 days in total",
-        "I am from the United States and this is my first visit; I am very interested in Korean food and stayed for 6 days",
-        "I am a Japanese woman on my second visit; I stayed for 7 days to enjoy the natural scenery"
-    ]
-
-    for i, user_input in enumerate(test_inputs, 1):
-        cluster = predict_cluster_from_query(user_input)
-        print(f"# Example run {i}")
-        print(f"Input sentence: {user_input}")
-        print(f"Predicted cluster: {cluster}\n")
rag_retriever.py
CHANGED
@@ -6,22 +6,13 @@ import faiss
 import numpy as np
 from sentence_transformers import SentenceTransformer
 from openai import OpenAI
-from dotenv import load_dotenv
+from huggingface_hub import hf_hub_download
 
 # --- Configuration ---
-# Load the .env file
-load_dotenv()
-
-# os.environ['HF_HOME'] = 'D:/huggingface_cache'
-
-# Model and file path definitions
 MODEL_NAME = 'jhgan/ko-sbert-nli'
-#LLM_MODEL_NAME = 'gpt-4o-mini'
 LLM_MODEL_NAME = 'gpt-3.5-turbo'
-
-
-METADATA_FILE = f'{OUTPUT_DIR}/merged_metadata.jsonl'
-TOP_K = 10  # number of results to retrieve
+DATA_REPO = "Syngyeon/seoulalpha-data"
+TOP_K = 10
 
 # Initialize the OpenAI client
 client = OpenAI(api_key=os.getenv("API_KEY"))
@@ -30,14 +21,24 @@ client = OpenAI(api_key=os.getenv("API_KEY"))
 def _load_resources():
     """Pre-loads the resources needed for retrieval when the module is loaded."""
     try:
-        print("1. Loading the RAG resources...")
+        print("1. Downloading the RAG resources from the Hugging Face Hub...")
+
+        # Download the files from the HF repo
+        index_path = hf_hub_download(repo_id=DATA_REPO, filename="data/faiss/faiss_merged_output/merged.index")
+        metadata_path = hf_hub_download(repo_id=DATA_REPO, filename="data/faiss/faiss_merged_output/merged_metadata.jsonl")
+        # Load the embedding model
         model = SentenceTransformer(MODEL_NAME)
-
+
+        # Load the FAISS index
+        index = faiss.read_index(index_path)
+
+        # Load the metadata
         metadata_map = {}
-        with open(METADATA_FILE, 'r', encoding='utf-8') as f:
+        with open(metadata_path, 'r', encoding='utf-8') as f:
             for line in f:
                 meta = json.loads(line)
                 metadata_map[meta['vector_id']] = meta
+
         print("RAG resources loaded!")
         return model, index, metadata_map
     except Exception as e:
@@ -52,7 +53,7 @@ def _retrieve_places(query, k):
     """Internal helper: retrieves places similar to the query."""
     query_vector = embedding_model.encode([query])
    distances, ids = faiss_index.search(query_vector.astype('float32'), k)
-
+
     results = []
     for vector_id in ids[0]:
         if vector_id in meta_map:
@@ -60,14 +61,13 @@ def _retrieve_places(query, k):
     return results
 
 
-
 def _generate_answer_with_llm(query, retrieved_places):
     """Internal helper: generates the LLM answer from the retrieved place information."""
     context = ""
-    for i, place in enumerate(retrieved_places[:5]):
+    for i, place in enumerate(retrieved_places[:5]):  # use only the top 5 places
         context += f"--- Place {i+1} ---\n"
         context += f"Name: {place.get('name', 'N/A')}\n"
-        context += f"Address: {place.get('address', 'N/A')}\n"
+        context += f"Address: {place.get('address', 'N/A')}\n"
         context += f"AI summary: {place.get('ai_summary', 'N/A')}\n"
         processed_sentences = place.get('processed_sentences', [])
         context += "Key features and reviews:\n"
@@ -76,7 +76,6 @@ def _generate_answer_with_llm(query, retrieved_places):
         context += "\n"
 
     system_prompt = "You are a helpful assistant that recommends the places best suited to the user's question."
-    # <--- 2. revised instructions
     user_prompt = f"""
 Based only on the 'place information' below, write an answer to the user's question.
 
@@ -106,27 +105,22 @@ def _generate_answer_with_llm(query, retrieved_places):
         return f"An error occurred while generating the LLM answer: {e}"
 
 
-
-
-
 # --- Conversation entry point ---
-# Changed to take region_keywords as an argument in addition to search_query
 def get_rag_recommendation(search_query, region_keywords):
     """
     Takes a search query and region keywords and returns the final recommendation through the RAG system.
     """
     if not all([embedding_model, faiss_index, meta_map]):
         return "The RAG system is not ready, so no recommendation can be generated."
-
+
     # 1. Retrieve places
     print("\n[RAG] Searching for semantically similar places...")
     top_places = _retrieve_places(search_query, k=100)
-
+
     if not top_places:
         return "No relevant places were found."
 
-    # 2. Region filtering
-    # Filter only when the region_keywords list is not empty
+    # 2. Region filtering
     if region_keywords:
         print(f"[RAG] Filtering by address (keywords: {region_keywords})...")
         filtered_places = []
@@ -134,20 +128,18 @@ def get_rag_recommendation(search_query, region_keywords):
             address = place.get('address', '')
             if any(keyword in address for keyword in region_keywords):
                 filtered_places.append(place)
-
-
-
+                if len(filtered_places) >= 10:
+                    break
         print(f"[RAG] Places remaining after filtering: {[p.get('name') for p in filtered_places]}")
     else:
-        # No region keywords, so use the results without filtering
         print("[RAG] No region keywords given; skipping the filtering step.")
         filtered_places = top_places
-
+
     if not filtered_places:
         return "No places matching the requested region were found."
 
     # 3. Generate the answer with the LLM
     print("[RAG] Generating the final answer from the filtered information...")
     final_answer = _generate_answer_with_llm(search_query, filtered_places)
-
-    return final_answer
+
+    return final_answer
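The reworked _load_resources() pulls merged.index and merged_metadata.jsonl from the Syngyeon/seoulalpha-data repo with hf_hub_download and opens the index with faiss.read_index. A minimal, self-contained sketch of the same encode-index-search round trip, using a small in-memory index instead of the downloaded one (the sample documents and query are placeholders, not project data):

import faiss
from sentence_transformers import SentenceTransformer

# Same embedding model name as in the diff; any sentence-transformers model works for the sketch.
model = SentenceTransformer("jhgan/ko-sbert-nli")

docs = [
    "A quiet hanok cafe serving traditional tea",
    "A night market famous for street food",
    "A riverside park that is good for cycling",
]
vectors = model.encode(docs).astype("float32")

# Build a flat L2 index in memory; an index loaded via
# faiss.read_index(hf_hub_download(...)) exposes the same search() API.
index = faiss.IndexFlatL2(vectors.shape[1])
index.add(vectors)

query = model.encode(["Where can I try local street food?"]).astype("float32")
distances, ids = index.search(query, 2)
print([docs[i] for i in ids[0]])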
region_extractor.py
CHANGED
@@ -1,66 +1,47 @@
-from dotenv import load_dotenv
-import random
-from openai import OpenAI
 import os
 import json
+import faiss
+import numpy as np
+from sentence_transformers import SentenceTransformer
+from huggingface_hub import hf_hub_download
 
-
-
-client = OpenAI(api_key=os.getenv("API_KEY"))
+DATA_REPO = "Syngyeon/seoulalpha-data"
+MODEL_NAME = "jhgan/ko-sbert-nli"
 
+# Load the region index
+def _load_region_index():
+    try:
+        index_path = hf_hub_download(repo_id=DATA_REPO, filename="data/faiss/region_db/faiss_region_semantic.index")
+        metadata_path = hf_hub_download(repo_id=DATA_REPO, filename="data/faiss/region_db/metadata_region_semantic.jsonl")
 
-def extract_region_from_query(user_query):
-    """
-    Extracts a list of region-name keywords from the user's question using an LLM.
-    """
-    print("[LLM] Extracting region-name keywords from the user query...")
-
-    # Give the LLM a role and use few-shot examples to make the expected output format explicit.
-    system_prompt = """
-    You are an AI assistant that extracts 'South Korean administrative district' keywords from the user's travel-related question.
-    Analyze the user's question and return, in JSON format, a list of keywords that can be used for address filtering.
-    The result must have the form {"regions": ["keyword1", "keyword2", ...]}.
-
-    - Interpret "전라도" as "전북", "전남", "광주".
-    - Interpret "경상도" as "경북", "경남", "부산", "대구", "울산".
-    - Interpret "충청도" as "충북", "충남", "대전", "세종".
-    - Interpret "서울 근교" as "경기", "인천".
-    - If no region is mentioned, return an empty list [].
-
-    # Example 1
-    User: "I want to go on a restaurant tour around 전라도"
-    AI: {"regions": ["전북", "전남", "광주"]}
+        index = faiss.read_index(index_path)
+        model = SentenceTransformer(MODEL_NAME)
 
-
-
-
-
-
-
-
-
-
-    messages = [
-        {"role": "system", "content": system_prompt},
-        {"role": "user", "content": user_query}
-    ]
-
-    try:
-        response = client.chat.completions.create(
-            model="gpt-3.5-turbo",
-            messages=messages,
-            response_format={"type": "json_object"}
-        )
-        result = json.loads(response.choices[0].message.content)
-
-        # Check that the 'regions' key exists and that its value is a list
-        if 'regions' in result and isinstance(result['regions'], list):
-            #print(f"[LLM] Extracted region keywords: {result['regions']}")
-            return result['regions']
-        else:
-            #print("[LLM] Could not find the 'regions' key, or its value is not a list.")
-            return []
-
+        metadata_map = {}
+        with open(metadata_path, "r", encoding="utf-8") as f:
+            for line in f:
+                meta = json.loads(line)
+                metadata_map[meta["vector_id"]] = meta
+
+        print("[RegionDB] loading complete")
+        return model, index, metadata_map
     except Exception as e:
-
+        print("[RegionDB] loading failed:", e)
+        return None, None, None
+
+region_model, region_index, region_meta = _load_region_index()
+
+
+def extract_region_semantic(user_query, top_k=5):
+    """FAISS-based extraction of candidate regions"""
+    if not all([region_model, region_index, region_meta]):
         return []
+
+    query_vec = region_model.encode([user_query]).astype("float32")
+    distances, ids = region_index.search(query_vec, top_k)
+
+    results = []
+    for i, vid in enumerate(ids[0]):
+        if vid in region_meta:
+            results.append(region_meta[vid]["region_name"])
+    return results
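The rewritten module no longer calls the OpenAI API: it downloads a prebuilt region index (faiss_region_semantic.index) plus a vector_id-keyed metadata file and answers region queries by nearest-neighbour search. The build step for that index is not part of this commit; the sketch below shows one plausible way such an index/metadata pair could be produced (the region entries and descriptions are placeholders):

import json
import faiss
from sentence_transformers import SentenceTransformer

# Hypothetical build script for the region DB consumed by _load_region_index().
model = SentenceTransformer("jhgan/ko-sbert-nli")
regions = [
    {"region_name": "Gangnam-gu", "description": "shopping streets, nightlife, COEX"},
    {"region_name": "Jongno-gu", "description": "palaces, hanok village, traditional tea houses"},
]

vectors = model.encode([r["description"] for r in regions]).astype("float32")
index = faiss.IndexFlatL2(vectors.shape[1])
index.add(vectors)  # row i gets FAISS id i, stored below as vector_id

faiss.write_index(index, "faiss_region_semantic.index")
with open("metadata_region_semantic.jsonl", "w", encoding="utf-8") as f:
    for vector_id, region in enumerate(regions):
        f.write(json.dumps({"vector_id": vector_id, **region}, ensure_ascii=False) + "\n")

With files like these uploaded to the data repo, extract_region_semantic("places to see palaces") would encode the query, search the index, and return the matching region_name values.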
requirements.txt
CHANGED
Binary files a/requirements.txt and b/requirements.txt differ