SyngyeonTak committed on
Commit
1a554ac
·
1 Parent(s): 500a117

changes to adapt to hugging face

Browse files
app.py CHANGED
@@ -1,7 +1,4 @@
1
  # app.py
2
-
3
- import os
4
- from dotenv import load_dotenv
5
  import gradio as gr
6
  from langdetect import detect
7
  from deep_translator import GoogleTranslator
@@ -11,8 +8,6 @@ from cluster_predictor import get_user_cluster
11
  from region_extractor import extract_region_from_query
12
  from rag_retriever import get_rag_recommendation
13
 
14
- # --- ์ดˆ๊ธฐ ์„ค์ • ---
15
- load_dotenv()
16
 
17
  # ์–ธ์–ด ์ฝ”๋“œ ๋งคํ•‘ (deep_translator ํ˜ธํ™˜)
18
  LANG_CODE_MAP = {
@@ -129,4 +124,4 @@ with gr.Blocks() as demo:
129
  msg.submit(respond, [msg, chatbot, state], [msg, chatbot, state])
130
 
131
  if __name__ == "__main__":
132
- demo.launch()
 
1
  # app.py
 
 
 
2
  import gradio as gr
3
  from langdetect import detect
4
  from deep_translator import GoogleTranslator
 
8
  from region_extractor import extract_region_from_query
9
  from rag_retriever import get_rag_recommendation
10
 
 
 
11
 
12
  # 언어 코드 매핑 (deep_translator 호환)
13
  LANG_CODE_MAP = {
 
124
  msg.submit(respond, [msg, chatbot, state], [msg, chatbot, state])
125
 
126
  if __name__ == "__main__":
127
+ demo.launch(show_api=False, debug=True)
cluster_predictor.py CHANGED
@@ -5,11 +5,13 @@ import pandas as pd
5
  from openai import OpenAI
6
  import os
7
  import json
8
- from dotenv import load_dotenv
9
- import random
 
 
 
10
 
11
  # --- ์ดˆ๊ธฐ ์„ค์ • ---
12
- load_dotenv()
13
  client = OpenAI(api_key=os.getenv("API_KEY"))
14
 
15
  CLUSTER_PROFILES = {
@@ -43,12 +45,12 @@ def query_llm_for_variables(user_query, use_prompt=True, use_fewshot=True):
43
  prompt_parts = []
44
 
45
  if use_prompt:
46
- with open("data/prompt/custom_prompt_eng.txt", "r", encoding="utf-8") as f:
47
  custom_prompt = f.read()
48
  prompt_parts.append(custom_prompt)
49
 
50
  if use_fewshot:
51
- with open("data/prompt/custom_few_shot_learning_multi_language.txt", "r", encoding="utf-8") as f:
52
  few_shot_examples = f.read()
53
  prompt_parts.append(few_shot_examples)
54
 
 
5
  from openai import OpenAI
6
  import os
7
  import json
8
+ from huggingface_hub import hf_hub_download
9
+
10
+ # Hugging Face dataset repo์—์„œ prompt ํŒŒ์ผ ๋กœ๋“œ
11
+ PROMPT_PATH = hf_hub_download("Syngyeon/seoulalpha-data", "prompt/custom_prompt_eng.txt")
12
+ FEWSHOT_PATH = hf_hub_download("Syngyeon/seoulalpha-data", "prompt/custom_few_shot_learning_multi_language.txt")
13
 
14
  # --- 초기 설정 ---
 
15
  client = OpenAI(api_key=os.getenv("API_KEY"))
16
 
17
  CLUSTER_PROFILES = {
 
45
  prompt_parts = []
46
 
47
  if use_prompt:
48
+ with open("PROMPT_PATH", "r", encoding="utf-8") as f:
49
  custom_prompt = f.read()
50
  prompt_parts.append(custom_prompt)
51
 
52
  if use_fewshot:
53
+ with open("FEWSHOT_PATH", "r", encoding="utf-8") as f:
54
  few_shot_examples = f.read()
55
  prompt_parts.append(few_shot_examples)
56
 
llm_cluster_predictor.py DELETED
@@ -1,198 +0,0 @@
1
- import subprocess
2
- import sys
3
- import os
4
- import json
5
- from dotenv import load_dotenv
6
- import os
7
-
8
- load_dotenv()
9
-
10
- # ํ™˜๊ฒฝ ๋ณ€์ˆ˜ ์‚ฌ์šฉ
11
- USER_KEY = os.getenv("API_KEY")
12
-
13
- # ์„ค์น˜๊ฐ€ ํ•„์š”ํ•œ ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ ์—†๋Š” ๊ฒฝ์šฐ ์ž๋™ ์„ค์น˜
14
- def install_required_packages(package, import_name=None):
15
- try:
16
- if import_name:
17
- __import__(import_name)
18
- else:
19
- __import__(package)
20
- except ImportError:
21
- print(f"๐Ÿ“ฆ {package} ์„ค์น˜ ์ค‘...")
22
- subprocess.check_call([sys.executable, "-m", "pip", "install", package])
23
- print(f"โœ… {package} ์„ค์น˜ ์™„๋ฃŒ!")
24
-
25
- required_packages = [
26
- ("scikit-learn", "sklearn"),
27
- ("tqdm", "tqdm"),
28
- ("openai", "openai"),
29
- ("pandas", "pandas"),
30
- ("numpy", "numpy")
31
- ]
32
-
33
- for pkg, imp in required_packages:
34
- install_required_packages(pkg, imp)
35
-
36
- from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
37
- from sklearn.compose import ColumnTransformer
38
- from sklearn.pipeline import Pipeline
39
- from sklearn.cluster import KMeans
40
- from sklearn.metrics import silhouette_score
41
- from sklearn.decomposition import PCA
42
- from sklearn.impute import SimpleImputer
43
- from tqdm import tqdm
44
- from openai import OpenAI
45
- import pandas as pd
46
- import numpy as np
47
-
48
- # OpenAI API ํ‚ค ์„ค์ •
49
- #USER_KEY = ""
50
- client = OpenAI(api_key=USER_KEY)
51
-
52
- # ๋ฐ์ดํ„ฐ ์ค€๋น„
53
- preprocessed_data = pd.read_csv('./๊ด€๊ด‘๋ฐ์ดํ„ฐ.csv', encoding='cp949')
54
-
55
- # ๋ณ€์ˆ˜ ๊ตฌ๋ถ„
56
- categorical_cols = ['country', 'gender', 'age',
57
- 'revisit_indicator', 'visit_local_indicator', 'planned_activity']
58
-
59
- numerical_cols = [
60
- 'stay_duration', 'accommodation_percent', 'food_percent', 'shopping_percent', 'food',
61
- 'landscape', 'heritage', 'language', 'safety', 'budget',
62
- 'accommodation', 'transport', 'navigation'
63
- ]
64
- used_variables = categorical_cols + numerical_cols
65
-
66
- for col in categorical_cols:
67
- preprocessed_data[col] = preprocessed_data[col].astype(str)
68
- preprocessed_data_clean = preprocessed_data.dropna(subset=used_variables).copy()
69
-
70
- # ์ „์ฒ˜๋ฆฌ ํŒŒ์ดํ”„๋ผ์ธ ์ •์˜
71
- # ์ˆ˜์น˜ํ˜• ํŒŒ์ดํ”„๋ผ์ธ: ํ‰๊ท  ๋Œ€์ฒด + ์ •๊ทœํ™”
72
- numeric_pipeline = Pipeline([
73
- ('imputer', SimpleImputer(strategy='mean')),
74
- ('scaler', MinMaxScaler())
75
- ])
76
-
77
- # ๋ฒ”์ฃผํ˜• ํŒŒ์ดํ”„๋ผ์ธ: ์ตœ๋นˆ๊ฐ’ ๋Œ€์ฒด + ์›ํ•ซ์ธ์ฝ”๋”ฉ
78
- categorical_pipeline = Pipeline([
79
- ('imputer', SimpleImputer(strategy='most_frequent')),
80
- ('encoder', OneHotEncoder(handle_unknown='ignore'))
81
- ])
82
-
83
- preprocessor = ColumnTransformer(transformers=[
84
- ('cat', categorical_pipeline, categorical_cols),
85
- ('num', numeric_pipeline, numerical_cols)
86
- ])
87
-
88
- # ํ•™์Šต: ์ „์ฒ˜๋ฆฌ + PCA + ํด๋Ÿฌ์Šคํ„ฐ๋ง
89
- X_preprocessed = preprocessor.fit_transform(preprocessed_data_clean)
90
- pca = PCA(n_components=3)
91
- X_reduced = pca.fit_transform(X_preprocessed)
92
-
93
- kmeans = KMeans(n_clusters=7, random_state=42)
94
- preprocessed_data_clean['cluster'] = kmeans.fit_predict(X_reduced)
95
-
96
- print("explained_variance_ratio:", pca.explained_variance_ratio_.sum())
97
- print(f"Silhouette Score: {silhouette_score(X_reduced, preprocessed_data_clean['cluster']):.4f}")
98
-
99
- # LLM ์งˆ์˜ = ๋ณ€์ˆ˜ ๋งคํ•‘ ํ•จ์ˆ˜
100
- def load_text_file(filepath):
101
- try:
102
- with open(filepath, "r", encoding="utf-8") as f:
103
- return f.read()
104
- except Exception as e:
105
- print(f"[ํŒŒ์ผ ๋กœ๋”ฉ ์‹คํŒจ] {filepath} - {e}")
106
- return ""
107
-
108
- def query_llm_for_variables(user_query, use_prompt=True, use_fewshot=True):
109
- prompt_parts = []
110
-
111
- if use_prompt:
112
- with open("custom_prompt.txt", "r", encoding="utf-8") as f:
113
- custom_prompt = f.read()
114
- prompt_parts.append(custom_prompt)
115
-
116
- if use_fewshot:
117
- with open("custom_few_shot_learning.txt", "r", encoding="utf-8") as f:
118
- few_shot_examples = f.read()
119
- prompt_parts.append(few_shot_examples)
120
-
121
- full_prompt = "\n\n".join(prompt_parts)
122
-
123
- messages = [
124
- {"role": "system", "content": full_prompt},
125
- {"role": "user", "content": user_query}
126
- ]
127
-
128
- try:
129
- response = client.chat.completions.create(
130
- model="gpt-3.5-turbo",
131
- messages=messages
132
- )
133
- content = response.choices[0].message.content.strip()
134
- return json.loads(content)
135
- except Exception as e:
136
- print("[ํŒŒ์‹ฑ ์‹คํŒจ]", e)
137
- return {}
138
-
139
-
140
- def impute_with_user_subgroup(user_input_dict, df_base):
141
- known_info = {k: v for k, v in user_input_dict.items() if v is not None}
142
- filtered_df = df_base.copy()
143
- for key, val in known_info.items():
144
- if key in filtered_df.columns:
145
- filtered_df = filtered_df[filtered_df[key].astype(str) == str(val)]
146
- imputed = {}
147
- for var in used_variables:
148
- if user_input_dict.get(var) is not None:
149
- imputed[var] = user_input_dict[var]
150
- else:
151
- if var in numerical_cols:
152
- imputed[var] = filtered_df[var].mean() if not filtered_df.empty else df_base[var].mean()
153
- elif var in categorical_cols:
154
- mode_series = filtered_df[var].mode() if not filtered_df.empty else df_base[var].mode()
155
- imputed[var] = mode_series.iloc[0] if not mode_series.empty else None
156
- return imputed
157
-
158
- # ์งˆ์˜ = ์˜ˆ์ธก ํ•จ์ˆ˜
159
- def predict_cluster_from_query(user_query):
160
- variable_dict = query_llm_for_variables(user_query, use_prompt=True, use_fewshot=True)
161
-
162
- # null์ด ์•„๋‹Œ ๊ฐ’๋งŒ ํ•„ํ„ฐ๋งํ•˜์—ฌ ์ถœ๋ ฅ
163
- filtered_dict = {k: v for k, v in variable_dict.items() if v is not None}
164
- print("โฎ• LLM ์ถ”์ถœ ๊ฒฐ๊ณผ:", filtered_dict)
165
-
166
- # ๊ฒฐ์ธก ๋ณด์™„
167
- completed_input = impute_with_user_subgroup(variable_dict, preprocessed_data_clean)
168
- df = pd.DataFrame([completed_input])
169
-
170
- for col in categorical_cols:
171
- df[col] = df[col].astype(str)
172
- for col in numerical_cols:
173
- df[col] = pd.to_numeric(df[col], errors='coerce')
174
-
175
- try:
176
- X_processed = preprocessor.transform(df)
177
- X_pca = pca.transform(X_processed)
178
- cluster_label = kmeans.predict(X_pca)[0]
179
- return cluster_label
180
- except Exception as e:
181
- print("[์˜ˆ์ธก ์‹คํŒจ]", e)
182
- return None
183
-
184
- # main block
185
- if __name__ == "__main__":
186
- test_inputs = [
187
- "๋‚˜๋Š” 50๋Œ€ ๋‚จ์„ฑ์ด๊ณ , ์ž์—ฐ ํ’๊ฒฝ์„ ์ข‹์•„ํ•ด์„œ ์ œ์ฃผ๋„์— 4์ผ ์—ฌํ–‰ํ–ˆ์–ด์š”",
188
- "์ €๋Š” 20๋Œ€ ์—ฌ์„ฑ์ด๋ฉฐ ์‡ผํ•‘์„ ์ข‹์•„ํ•ด์š”. ์„œ์šธ์—์„œ 3์ผ๊ฐ„ ๋จธ๋ฌผ๋ €์–ด์š”",
189
- "๋‚˜๋Š” 30๋Œ€ ๋‚จ์ž๊ณ  ํ•œ๊ตญ ์ „ํ†ต๋ฌธํ™” ์ฒดํ—˜์ด ์ข‹์•„์„œ ์ „์ฃผ์— ๊ฐ”์–ด์š”. ์ด 5์ผ ์žˆ์—ˆ์–ด์š”",
190
- "์ €๋Š” ๋ฏธ๊ตญ์—์„œ ์™”๊ณ , ์ฒ˜์Œ ๋ฐฉ๋ฌธํ–ˆ์–ด์š”. ํ•œ๊ตญ ์Œ์‹์— ๊ด€์‹ฌ์ด ๋งŽ์•„ 6์ผ๊ฐ„ ๋จธ๋ฌผ๋ €์–ด์š”",
191
- "์ €๋Š” ์ผ๋ณธ ์—ฌ์„ฑ์ด๊ณ , ๋‘ ๋ฒˆ์งธ ๋ฐฉ๋ฌธ์ž…๋‹ˆ๋‹ค. ์ž์—ฐ ํ’๊ฒฝ๊ณผ ์œ ์ ์ง€๋ฅผ ๋ณด๊ธฐ ์œ„ํ•ด ๊ฐ•์›๋„์— 7์ผ ๋จธ๋ฌผ๋ €์–ด์š”"
192
- ]
193
-
194
- for i, user_input in enumerate(test_inputs, 1):
195
- cluster = predict_cluster_from_query(user_input)
196
- print(f"# ์‹คํ–‰ ์˜ˆ์‹œ {i}")
197
- print(f"์ž…๋ ฅ ๋ฌธ์žฅ: {user_input}")
198
- print(f"์˜ˆ์ธก๋œ ํด๋Ÿฌ์Šคํ„ฐ: {cluster}\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
rag_retriever.py CHANGED
@@ -6,22 +6,13 @@ import faiss
6
  import numpy as np
7
  from sentence_transformers import SentenceTransformer
8
  from openai import OpenAI
9
- from dotenv import load_dotenv
10
 
11
  # --- ์„ค์ • ---
12
- # .env ํŒŒ์ผ ๋กœ๋“œ
13
- load_dotenv()
14
-
15
- # os.environ['HF_HOME'] = 'D:/huggingface_cache'
16
-
17
- # ๋ชจ๋ธ ๋ฐ ํŒŒ์ผ ๊ฒฝ๋กœ ์ •์˜
18
  MODEL_NAME = 'jhgan/ko-sbert-nli'
19
- #LLM_MODEL_NAME = 'gpt-4o-mini'
20
  LLM_MODEL_NAME = 'gpt-3.5-turbo'
21
- OUTPUT_DIR = 'data/faiss/faiss_merged_output'
22
- INDEX_FILE = f'{OUTPUT_DIR}/merged.index'
23
- METADATA_FILE = f'{OUTPUT_DIR}/merged_metadata.jsonl'
24
- TOP_K = 10 # ๊ฒ€์ƒ‰ํ•  ๊ฒฐ๊ณผ์˜ ์ˆ˜
25
 
26
  # OpenAI ํด๋ผ์ด์–ธํŠธ ์ดˆ๊ธฐํ™”
27
  client = OpenAI(api_key=os.getenv("API_KEY"))
@@ -30,14 +21,24 @@ client = OpenAI(api_key=os.getenv("API_KEY"))
30
  def _load_resources():
31
  """๋ชจ๋“ˆ ๋กœ๋”ฉ ์‹œ ๊ฒ€์ƒ‰์— ํ•„์š”ํ•œ ๋ฆฌ์†Œ์Šค๋ฅผ ๋ฏธ๋ฆฌ ๋ถˆ๋Ÿฌ์˜ต๋‹ˆ๋‹ค."""
32
  try:
33
- print("1. RAG ๋ฆฌ์†Œ์Šค๋ฅผ ๋กœ๋”ฉํ•ฉ๋‹ˆ๋‹ค...")
 
 
 
 
 
34
  model = SentenceTransformer(MODEL_NAME)
35
- index = faiss.read_index(INDEX_FILE)
 
 
 
 
36
  metadata_map = {}
37
- with open(METADATA_FILE, 'r', encoding='utf-8') as f:
38
  for line in f:
39
  meta = json.loads(line)
40
  metadata_map[meta['vector_id']] = meta
 
41
  print("RAG ๋ฆฌ์†Œ์Šค ๋กœ๋”ฉ ์™„๋ฃŒ!")
42
  return model, index, metadata_map
43
  except Exception as e:
@@ -52,7 +53,7 @@ def _retrieve_places(query, k):
52
  """๋‚ด๋ถ€ ํ•จ์ˆ˜: ์ฟผ๋ฆฌ๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ์œ ์‚ฌํ•œ ์žฅ์†Œ๋ฅผ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค."""
53
  query_vector = embedding_model.encode([query])
54
  distances, ids = faiss_index.search(query_vector.astype('float32'), k)
55
-
56
  results = []
57
  for vector_id in ids[0]:
58
  if vector_id in meta_map:
@@ -60,14 +61,13 @@ def _retrieve_places(query, k):
60
  return results
61
 
62
 
63
-
64
  def _generate_answer_with_llm(query, retrieved_places):
65
  """๋‚ด๋ถ€ ํ•จ์ˆ˜: ๊ฒ€์ƒ‰๋œ ์ •๋ณด๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ LLM ๋‹ต๋ณ€์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค."""
66
  context = ""
67
- for i, place in enumerate(retrieved_places[:5]): # ์ƒ์œ„ 5๊ฐœ ์ •๋ณด๋งŒ ์‚ฌ์šฉ
68
  context += f"--- ์žฅ์†Œ ์ •๋ณด {i+1} ---\n"
69
  context += f"์ด๋ฆ„: {place.get('name', '์ •๋ณด ์—†์Œ')}\n"
70
- context += f"์ฃผ์†Œ: {place.get('address', '์ •๋ณด ์—†์Œ')}\n" # <--- 1. '์ฃผ์†Œ' ์ •๋ณด ์ถ”๊ฐ€
71
  context += f"AI ์š”์•ฝ: {place.get('ai_summary', '์ •๋ณด ์—†์Œ')}\n"
72
  processed_sentences = place.get('processed_sentences', [])
73
  context += "์ฃผ์š” ํŠน์ง• ๋ฐ ํ›„๊ธฐ:\n"
@@ -76,7 +76,6 @@ def _generate_answer_with_llm(query, retrieved_places):
76
  context += "\n"
77
 
78
  system_prompt = "๋‹น์‹ ์€ ์‚ฌ์šฉ์ž์˜ ์งˆ๋ฌธ์— ๊ฐ€์žฅ ์ ํ•ฉํ•œ ์žฅ์†Œ๋ฅผ ์ถ”์ฒœํ•ด์ฃผ๋Š” ์œ ์šฉํ•œ ์–ด์‹œ์Šคํ„ดํŠธ์ž…๋‹ˆ๋‹ค."
79
- # <--- 2. ์ง€์‹œ์‚ฌํ•ญ ์ˆ˜์ •
80
  user_prompt = f"""
81
  ์•„๋ž˜ '์žฅ์†Œ ์ •๋ณด'๋งŒ์„ ๋ฐ”ํƒ•์œผ๋กœ ์‚ฌ์šฉ์ž์˜ ์งˆ๋ฌธ์— ๋Œ€ํ•œ ๋‹ต๋ณ€์„ ์ƒ์„ฑํ•ด ์ฃผ์„ธ์š”.
82
 
@@ -106,27 +105,22 @@ def _generate_answer_with_llm(query, retrieved_places):
106
  return f"LLM ๋‹ต๋ณ€ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {e}"
107
 
108
 
109
-
110
-
111
-
112
  # --- ๋Œ€ํ‘œ ์‹คํ–‰ ํ•จ์ˆ˜ ---
113
- # search_query ์™ธ์— region_keywords๋ฅผ ์ธ์ž๋กœ ๋ฐ›๋„๋ก ๋ณ€๊ฒฝ
114
  def get_rag_recommendation(search_query, region_keywords):
115
  """
116
  ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ์™€ ์ง€์—ญ ํ‚ค์›Œ๋“œ๋ฅผ ๋ฐ›์•„ RAG ์‹œ์Šคํ…œ์„ ํ†ตํ•ด ์ตœ์ข… ์ถ”์ฒœ ๋‹ต๋ณ€์„ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
117
  """
118
  if not all([embedding_model, faiss_index, meta_map]):
119
  return "RAG ์‹œ์Šคํ…œ์ด ์ค€๋น„๋˜์ง€ ์•Š์•„ ์ถ”์ฒœ์„ ์ƒ์„ฑํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
120
-
121
  # 1. ์žฅ์†Œ ๊ฒ€์ƒ‰
122
  print("\n[RAG] ์˜๋ฏธ์ ์œผ๋กœ ์œ ์‚ฌํ•œ ์žฅ์†Œ๋ฅผ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค...")
123
  top_places = _retrieve_places(search_query, k=100)
124
-
125
  if not top_places:
126
  return "๊ด€๋ จ๋œ ์žฅ์†Œ๋ฅผ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค."
127
 
128
- # 2. ์ง€์—ญ ํ•„ํ„ฐ๋ง (์ „๋‹ฌ๋ฐ›์€ ํ‚ค์›Œ๋“œ ์‚ฌ์šฉ)
129
- # region_keywords ๋ฆฌ์ŠคํŠธ๊ฐ€ ๋น„์–ด์žˆ์ง€ ์•Š์€ ๊ฒฝ์šฐ์—๋งŒ ํ•„ํ„ฐ๋ง ์ˆ˜ํ–‰
130
  if region_keywords:
131
  print(f"[RAG] ์ฃผ์†Œ ํ•„ํ„ฐ๋ง (ํ‚ค์›Œ๋“œ: {region_keywords})...")
132
  filtered_places = []
@@ -134,20 +128,18 @@ def get_rag_recommendation(search_query, region_keywords):
134
  address = place.get('address', '')
135
  if any(keyword in address for keyword in region_keywords):
136
  filtered_places.append(place)
137
-
138
- if len(filtered_places) >= 10: break
139
-
140
  print(f"[RAG] ํ•„ํ„ฐ๋ง ํ›„ ๋‚จ์€ ์žฅ์†Œ: {[p.get('name') for p in filtered_places]}")
141
  else:
142
- # ์ง€์—ญ ํ‚ค์›Œ๋“œ๊ฐ€ ์—†์œผ๋ฉด ํ•„ํ„ฐ๋ง ์—†์ด ๊ทธ๋Œ€๋กœ ์‚ฌ์šฉ
143
  print("[RAG] ์ง€์—ญ ํ‚ค์›Œ๋“œ๊ฐ€ ์—†์–ด ํ•„ํ„ฐ๋ง์„ ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค.")
144
  filtered_places = top_places
145
-
146
  if not filtered_places:
147
  return "์š”์ฒญํ•˜์‹  ์ง€์—ญ์— ๋งž๋Š” ์žฅ์†Œ๋ฅผ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค."
148
 
149
  # 3. LLM์œผ๋กœ ๋‹ต๋ณ€ ์ƒ์„ฑ
150
  print("[RAG] ํ•„ํ„ฐ๋ง๋œ ์ •๋ณด๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ ์ตœ์ข… ๋‹ต๋ณ€์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค...")
151
  final_answer = _generate_answer_with_llm(search_query, filtered_places)
152
-
153
- return final_answer
 
6
  import numpy as np
7
  from sentence_transformers import SentenceTransformer
8
  from openai import OpenAI
9
+ from huggingface_hub import hf_hub_download
10
 
11
  # --- 설정 ---
 
 
 
 
 
 
12
  MODEL_NAME = 'jhgan/ko-sbert-nli'
 
13
  LLM_MODEL_NAME = 'gpt-3.5-turbo'
14
+ DATA_REPO = "Syngyeon/seoulalpha-data"
15
+ TOP_K = 10
 
 
16
 
17
  # OpenAI 클라이언트 초기화
18
  client = OpenAI(api_key=os.getenv("API_KEY"))
 
21
  def _load_resources():
22
  """๋ชจ๋“ˆ ๋กœ๋”ฉ ์‹œ ๊ฒ€์ƒ‰์— ํ•„์š”ํ•œ ๋ฆฌ์†Œ์Šค๋ฅผ ๋ฏธ๋ฆฌ ๋ถˆ๋Ÿฌ์˜ต๋‹ˆ๋‹ค."""
23
  try:
24
+ print("1. Hugging Face Hub์—์„œ RAG ๋ฆฌ์†Œ์Šค๋ฅผ ๋‹ค์šด๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค...")
25
+
26
+ # HF repo์—์„œ ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ
27
+ index_path = hf_hub_download(repo_id=DATA_REPO, filename="data/faiss/faiss_merged_output/merged.index")
28
+ metadata_path = hf_hub_download(repo_id=DATA_REPO, filename="data/faiss/faiss_merged_output/merged_metadata.jsonl")
29
+ # ์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ๋กœ๋“œ
30
  model = SentenceTransformer(MODEL_NAME)
31
+
32
+ # FAISS index ๋กœ๋“œ
33
+ index = faiss.read_index(index_path)
34
+
35
+ # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๋กœ๋“œ
36
  metadata_map = {}
37
+ with open(metadata_path, 'r', encoding='utf-8') as f:
38
  for line in f:
39
  meta = json.loads(line)
40
  metadata_map[meta['vector_id']] = meta
41
+
42
  print("RAG ๋ฆฌ์†Œ์Šค ๋กœ๋”ฉ ์™„๋ฃŒ!")
43
  return model, index, metadata_map
44
  except Exception as e:
 
53
  """๋‚ด๋ถ€ ํ•จ์ˆ˜: ์ฟผ๋ฆฌ๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ์œ ์‚ฌํ•œ ์žฅ์†Œ๋ฅผ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค."""
54
  query_vector = embedding_model.encode([query])
55
  distances, ids = faiss_index.search(query_vector.astype('float32'), k)
56
+
57
  results = []
58
  for vector_id in ids[0]:
59
  if vector_id in meta_map:
 
61
  return results
62
 
63
 
 
64
  def _generate_answer_with_llm(query, retrieved_places):
65
  """๋‚ด๋ถ€ ํ•จ์ˆ˜: ๊ฒ€์ƒ‰๋œ ์ •๋ณด๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ LLM ๋‹ต๋ณ€์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค."""
66
  context = ""
67
+ for i, place in enumerate(retrieved_places[:5]): # ์ƒ์œ„ 5๊ฐœ ์ •๋ณด๋งŒ ์‚ฌ์šฉ
68
  context += f"--- ์žฅ์†Œ ์ •๋ณด {i+1} ---\n"
69
  context += f"์ด๋ฆ„: {place.get('name', '์ •๋ณด ์—†์Œ')}\n"
70
+ context += f"์ฃผ์†Œ: {place.get('address', '์ •๋ณด ์—†์Œ')}\n"
71
  context += f"AI ์š”์•ฝ: {place.get('ai_summary', '์ •๋ณด ์—†์Œ')}\n"
72
  processed_sentences = place.get('processed_sentences', [])
73
  context += "์ฃผ์š” ํŠน์ง• ๋ฐ ํ›„๊ธฐ:\n"
 
76
  context += "\n"
77
 
78
  system_prompt = "๋‹น์‹ ์€ ์‚ฌ์šฉ์ž์˜ ์งˆ๋ฌธ์— ๊ฐ€์žฅ ์ ํ•ฉํ•œ ์žฅ์†Œ๋ฅผ ์ถ”์ฒœํ•ด์ฃผ๋Š” ์œ ์šฉํ•œ ์–ด์‹œ์Šคํ„ดํŠธ์ž…๋‹ˆ๋‹ค."
 
79
  user_prompt = f"""
80
  ์•„๋ž˜ '์žฅ์†Œ ์ •๋ณด'๋งŒ์„ ๋ฐ”ํƒ•์œผ๋กœ ์‚ฌ์šฉ์ž์˜ ์งˆ๋ฌธ์— ๋Œ€ํ•œ ๋‹ต๋ณ€์„ ์ƒ์„ฑํ•ด ์ฃผ์„ธ์š”.
81
 
 
105
  return f"LLM ๋‹ต๋ณ€ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {e}"
106
 
107
 
 
 
 
108
  # --- ๋Œ€ํ‘œ ์‹คํ–‰ ํ•จ์ˆ˜ ---
 
109
  def get_rag_recommendation(search_query, region_keywords):
110
  """
111
  ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ์™€ ์ง€์—ญ ํ‚ค์›Œ๋“œ๋ฅผ ๋ฐ›์•„ RAG ์‹œ์Šคํ…œ์„ ํ†ตํ•ด ์ตœ์ข… ์ถ”์ฒœ ๋‹ต๋ณ€์„ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
112
  """
113
  if not all([embedding_model, faiss_index, meta_map]):
114
  return "RAG ์‹œ์Šคํ…œ์ด ์ค€๋น„๋˜์ง€ ์•Š์•„ ์ถ”์ฒœ์„ ์ƒ์„ฑํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
115
+
116
  # 1. ์žฅ์†Œ ๊ฒ€์ƒ‰
117
  print("\n[RAG] ์˜๋ฏธ์ ์œผ๋กœ ์œ ์‚ฌํ•œ ์žฅ์†Œ๋ฅผ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค...")
118
  top_places = _retrieve_places(search_query, k=100)
119
+
120
  if not top_places:
121
  return "๊ด€๋ จ๋œ ์žฅ์†Œ๋ฅผ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค."
122
 
123
+ # 2. ์ง€์—ญ ํ•„ํ„ฐ๋ง
 
124
  if region_keywords:
125
  print(f"[RAG] ์ฃผ์†Œ ํ•„ํ„ฐ๋ง (ํ‚ค์›Œ๋“œ: {region_keywords})...")
126
  filtered_places = []
 
128
  address = place.get('address', '')
129
  if any(keyword in address for keyword in region_keywords):
130
  filtered_places.append(place)
131
+ if len(filtered_places) >= 10:
132
+ break
 
133
  print(f"[RAG] ํ•„ํ„ฐ๋ง ํ›„ ๋‚จ์€ ์žฅ์†Œ: {[p.get('name') for p in filtered_places]}")
134
  else:
 
135
  print("[RAG] ์ง€์—ญ ํ‚ค์›Œ๋“œ๊ฐ€ ์—†์–ด ํ•„ํ„ฐ๋ง์„ ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค.")
136
  filtered_places = top_places
137
+
138
  if not filtered_places:
139
  return "์š”์ฒญํ•˜์‹  ์ง€์—ญ์— ๋งž๋Š” ์žฅ์†Œ๋ฅผ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค."
140
 
141
  # 3. LLM์œผ๋กœ ๋‹ต๋ณ€ ์ƒ์„ฑ
142
  print("[RAG] ํ•„ํ„ฐ๋ง๋œ ์ •๋ณด๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ ์ตœ์ข… ๋‹ต๋ณ€์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค...")
143
  final_answer = _generate_answer_with_llm(search_query, filtered_places)
144
+
145
+ return final_answer
region_extractor.py CHANGED
@@ -1,66 +1,47 @@
1
- from dotenv import load_dotenv
2
- import random
3
- from openai import OpenAI
4
  import os
5
  import json
 
 
 
 
6
 
7
- # --- ์ดˆ๊ธฐ ์„ค์ • ---
8
- load_dotenv()
9
- client = OpenAI(api_key=os.getenv("API_KEY"))
10
 
 
 
 
 
 
11
 
12
- def extract_region_from_query(user_query):
13
- """
14
- ์‚ฌ์šฉ์ž ์งˆ๋ฌธ์—์„œ LLM์„ ์‚ฌ์šฉํ•ด ์ง€์—ญ๋ช… ํ‚ค์›Œ๋“œ ๋ฆฌ์ŠคํŠธ๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.
15
- """
16
- print("[LLM] ์‚ฌ์šฉ์ž ์ฟผ๋ฆฌ์—์„œ ์ง€์—ญ๋ช… ํ‚ค์›Œ๋“œ๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค...")
17
-
18
- # LLM์—๊ฒŒ ์—ญํ• ์„ ๋ถ€์—ฌํ•˜๊ณ , ์˜ˆ์‹œ(Few-shot)๋ฅผ ํ†ตํ•ด ์›ํ•˜๋Š” ๊ฒฐ๊ณผ ํ˜•์‹์„ ๋ช…ํ™•ํžˆ ์•Œ๋ ค์ค๋‹ˆ๋‹ค.
19
- system_prompt = """
20
- ๋‹น์‹ ์€ ์‚ฌ์šฉ์ž์˜ ์—ฌํ–‰ ๊ด€๋ จ ์งˆ๋ฌธ์—์„œ '๋Œ€ํ•œ๋ฏผ๊ตญ ํ–‰์ •๊ตฌ์—ญ' ํ‚ค์›Œ๋“œ๋ฅผ ์ถ”์ถœํ•˜๋Š” AI ์–ด์‹œ์Šคํ„ดํŠธ์ž…๋‹ˆ๋‹ค.
21
- ์‚ฌ์šฉ์ž์˜ ์งˆ๋ฌธ์„ ๋ถ„์„ํ•˜์—ฌ, ์ฃผ์†Œ ํ•„ํ„ฐ๋ง์— ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ๋Š” ํ‚ค์›Œ๋“œ ๋ชฉ๋ก์„ JSON ํ˜•์‹์œผ๋กœ ๋ฐ˜ํ™˜ํ•ด ์ฃผ์„ธ์š”.
22
- ๊ฒฐ๊ณผ๋Š” ๋ฐ˜๋“œ์‹œ {"regions": ["ํ‚ค์›Œ๋“œ1", "ํ‚ค์›Œ๋“œ2", ...]} ํ˜•ํƒœ์—ฌ์•ผ ํ•ฉ๋‹ˆ๋‹ค.
23
-
24
- - "์ „๋ผ๋„"๋Š” "์ „๋ถ", "์ „๋‚จ", "๊ด‘์ฃผ"๋กœ ํ•ด์„ํ•ฉ๋‹ˆ๋‹ค.
25
- - "๊ฒฝ์ƒ๋„"๋Š” "๊ฒฝ๋ถ", "๊ฒฝ๋‚จ", "๋ถ€์‚ฐ", "๋Œ€๊ตฌ", "์šธ์‚ฐ"์œผ๋กœ ํ•ด์„ํ•ฉ๋‹ˆ๋‹ค.
26
- - "์ถฉ์ฒญ๋„"๋Š” "์ถฉ๋ถ", "์ถฉ๋‚จ", "๋Œ€์ „", "์„ธ์ข…"์œผ๋กœ ํ•ด์„ํ•ฉ๋‹ˆ๋‹ค.
27
- - "์„œ์šธ ๊ทผ๊ต"๋Š” "๊ฒฝ๊ธฐ", "์ธ์ฒœ"์œผ๋กœ ํ•ด์„ํ•ฉ๋‹ˆ๋‹ค.
28
- - ์–ธ๊ธ‰๋œ ์ง€์—ญ์ด ์—†์œผ๋ฉด ๋นˆ ๋ฆฌ์ŠคํŠธ []๋ฅผ ๋ฐ˜ํ™˜ํ•ฉ๋‹ˆ๋‹ค.
29
-
30
- # ์˜ˆ์‹œ1
31
- ์‚ฌ์šฉ์ž: "์ „๋ผ๋„ ์ชฝ์œผ๋กœ ๋ง›์ง‘ ํˆฌ์–ด ๊ฐ€๊ณ  ์‹ถ์–ด"
32
- AI: {"regions": ["์ „๋ถ", "์ „๋‚จ", "๊ด‘์ฃผ"]}
33
 
34
- # ์˜ˆ์‹œ2
35
- ์‚ฌ์šฉ์ž: "๊ฐ•์›๋„๋‚˜ ๊ฒฝ์ฃผ ์ชฝ ๋ฐ”๋‹ค ๋ณด๊ณ  ์‹ถ๋‹ค"
36
- AI: {"regions": ["๊ฐ•์›", "๊ฒฝ์ฃผ"]}
37
-
38
- # ์˜ˆ์‹œ3
39
- ์‚ฌ์šฉ์ž: "๊ทธ๋ƒฅ ์กฐ์šฉํ•œ ๊ณณ์ด๋ฉด ๋ผ"
40
- AI: {"regions": []}
41
- """
42
-
43
- messages = [
44
- {"role": "system", "content": system_prompt},
45
- {"role": "user", "content": user_query}
46
- ]
47
-
48
- try:
49
- response = client.chat.completions.create(
50
- model="gpt-3.5-turbo",
51
- messages=messages,
52
- response_format={"type": "json_object"}
53
- )
54
- result = json.loads(response.choices[0].message.content)
55
-
56
- # 'regions' ํ‚ค๊ฐ€ ์žˆ๊ณ , ๊ทธ ๊ฐ’์ด ๋ฆฌ์ŠคํŠธ์ธ์ง€ ํ™•์ธ
57
- if 'regions' in result and isinstance(result['regions'], list):
58
- #print(f"[LLM] ์ถ”์ถœ๋œ ์ง€์—ญ ํ‚ค์›Œ๋“œ: {result['regions']}")
59
- return result['regions']
60
- else:
61
- #print("[LLM] 'regions' ํ‚ค๋ฅผ ์ฐพ์ง€ ๋ชปํ–ˆ๊ฑฐ๋‚˜ ํ˜•์‹์ด ๋ฆฌ์ŠคํŠธ๊ฐ€ ์•„๋‹™๋‹ˆ๋‹ค.")
62
- return []
63
-
64
  except Exception as e:
65
- #print(f"[LLM] ์ง€์—ญ๋ช… ์ถ”์ถœ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
 
 
 
 
 
 
 
 
66
  return []
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import json
3
+ import faiss
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+ from huggingface_hub import hf_hub_download
7
 
8
+ DATA_REPO = "Syngyeon/seoulalpha-data"
9
+ MODEL_NAME = "jhgan/ko-sbert-nli"
 
10
 
11
+ # ๋กœ๋“œ
12
def _load_region_index():
    """Download the region FAISS index and metadata, and load the encoder.

    Both data files are fetched from ``DATA_REPO`` with ``hf_hub_download``.
    The index is read with ``faiss.read_index``, the sentence encoder is
    built from ``MODEL_NAME``, and the JSONL metadata is turned into a dict
    keyed by each record's ``"vector_id"``.

    Returns:
        ``(model, index, metadata_map)`` on success. If any step raises, the
        error is printed and ``(None, None, None)`` is returned instead of
        propagating the exception.
    """
    try:
        # NOTE(review): hf_hub_download is called without repo_type, so it
        # uses the library default. If DATA_REPO is a dataset repo this
        # likely needs repo_type="dataset" — confirm against the Hub.
        index_file = hf_hub_download(
            repo_id=DATA_REPO,
            filename="data/faiss/region_db/faiss_region_semantic.index",
        )
        meta_file = hf_hub_download(
            repo_id=DATA_REPO,
            filename="data/faiss/region_db/metadata_region_semantic.jsonl",
        )

        region_idx = faiss.read_index(index_file)
        encoder = SentenceTransformer(MODEL_NAME)

        # One JSON object per line; later records overwrite earlier ones
        # that share the same vector_id.
        with open(meta_file, "r", encoding="utf-8") as fh:
            records = (json.loads(raw) for raw in fh)
            lookup = {rec["vector_id"]: rec for rec in records}

        print("[RegionDB] ๋กœ๋”ฉ ์™„๋ฃŒ")
        return encoder, region_idx, lookup
    except Exception as err:
        # Best-effort load: report the failure and let callers see Nones.
        print("[RegionDB] ๋กœ๋”ฉ ์‹คํŒจ:", err)
        return None, None, None
31
+
32
+ region_model, region_index, region_meta = _load_region_index()
33
+
34
+
35
def extract_region_semantic(user_query, top_k=5):
    """Return region-name candidates for a query using the region FAISS index.

    Args:
        user_query: Text to embed with ``region_model``.
        top_k: Number of neighbours requested from ``region_index``.

    Returns:
        A list of ``"region_name"`` values taken from ``region_meta`` for
        every returned id that has a metadata entry, in the order the index
        returned them. An empty list is returned when any of the region
        resources is falsy (e.g. loading failed and they are ``None``).
    """
    if not all([region_model, region_index, region_meta]):
        return []

    query_vec = region_model.encode([user_query]).astype("float32")
    _distances, ids = region_index.search(query_vec, top_k)

    # A single query was sent, so only the first row of ids is relevant.
    # Ids without a metadata entry are skipped.
    return [
        region_meta[vid]["region_name"]
        for vid in ids[0]
        if vid in region_meta
    ]


# Backward-compatible alias: app.py still runs
# ``from region_extractor import extract_region_from_query``. Without this
# name the import fails at startup now that the LLM-based extractor is gone.
extract_region_from_query = extract_region_semantic
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ