Jake-seong committed on
Commit
9578ec7
·
verified ·
1 Parent(s): 75d2a36
Files changed (1) hide show
  1. app.py +198 -247
app.py CHANGED
@@ -3,16 +3,13 @@ import psycopg2
3
  from openai import OpenAI
4
  import json
5
  import os
6
- from typing import List, Dict
7
  from pgvector.psycopg2 import register_vector
8
  import numpy as np
9
-
10
- # 가중치 및 임계값 설정
11
- DEFAULT_FULL_WEIGHT = 0.2
12
- DEFAULT_TOPIC_WEIGHT = 0.5
13
- DEFAULT_CUSTOMER_WEIGHT = 0.2
14
- DEFAULT_AGENT_WEIGHT = 0.1
15
- DEFAULT_SIMILARITY_THRESHOLD = 0.5
16
 
17
  # DB 연결 설정
18
  def get_db_conn():
@@ -24,294 +21,248 @@ def get_db_conn():
24
  password=os.environ["VECTOR_SECRET"]
25
  )
26
 
27
- # OpenAI 클라이언트 초기화
28
  client = OpenAI()
29
 
30
def get_embedding(text: str) -> List[float]:
    """Embed ``text`` with OpenAI's ``text-embedding-ada-002`` model.

    The raw embedding is passed through a float32 NumPy array before being
    returned as a plain list, so every value carries float32 precision (the
    original comment states this is for compatibility with Java ``float[]``).

    Args:
        text: The text to embed.

    Returns:
        The embedding vector as a list of floats.

    Raises:
        Exception: Any failure of the embeddings call is logged and re-raised
            unchanged.
    """
    try:
        response = client.embeddings.create(
            input=text,
            model="text-embedding-ada-002",
            encoding_format="float",
        )
        return np.array(response.data[0].embedding, dtype=np.float32).tolist()
    except Exception as e:
        # Log text restored from mis-encoded bytes
        # ("error occurred while generating the embedding").
        print(f"임베딩 생성 중 오류 발생: {e}")
        raise
52
 
53
def format_vector_for_pg(vector: List[float]) -> str:
    """Render an embedding as a bracketed, comma-separated text literal.

    The input is first coerced to a float32 NumPy array (lists and arrays of
    any other dtype are converted; float32 arrays are used as they are), then
    each component is rendered with its default string formatting.

    Args:
        vector: The embedding, as a list or NumPy array.

    Returns:
        A string of the form ``[v1,v2,...]`` with no spaces.
    """
    as_float32 = np.asarray(vector, dtype=np.float32)
    # format(x) with no spec is exactly what an f-string "{x}" produces.
    rendered = ",".join(format(component) for component in as_float32)
    return "[" + rendered + "]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
def get_text_value(node: Dict, field_name: str) -> str:
    """Safely read ``field_name`` from the mapping ``node``.

    Args:
        node: The mapping to read from; may be ``None`` or empty.
        field_name: The key to look up.

    Returns:
        The stored value, or ``None`` when ``node`` is falsy, the key is
        absent, or the stored value is itself ``None``.
    """
    if not node or field_name not in node:
        return None
    # A stored None falls through naturally as the "missing" result.
    return node[field_name]
75
 
76
def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
    """Search chat records by a weighted sum of four embedding similarities.

    The query is embedded once and compared (cosine distance, ``<=>``) against
    the full/topic/customer/agent embedding columns of
    ``vector_store_multi_embeddings``. Each similarity is scaled by its
    ``DEFAULT_*_WEIGHT`` and the sum is used for ranking. Rows scoring below
    ``DEFAULT_SIMILARITY_THRESHOLD`` are dropped *after* the SQL ``LIMIT``, so
    fewer than ``max_results`` rows may be returned.

    Args:
        query (str): Query text to search for.
        max_results (int): Maximum number of rows fetched from the database;
            ``None`` falls back to 100.

    Returns:
        List[Dict]: Result dicts with ``id``, ``similarityScore``, ``content``,
        ``chatId``, ``topic`` and, when present in the metadata,
        ``startTime`` / ``endTime``. Any error is logged and yields ``[]``.
    """
    threshold = DEFAULT_SIMILARITY_THRESHOLD
    conn = None  # lets ``finally`` know whether a connection was ever opened

    try:
        params = {
            "vec": format_vector_for_pg(get_embedding(query)),
            "full_w": DEFAULT_FULL_WEIGHT,
            "topic_w": DEFAULT_TOPIC_WEIGHT,
            "customer_w": DEFAULT_CUSTOMER_WEIGHT,
            "agent_w": DEFAULT_AGENT_WEIGHT,
            # Gradio "number" inputs arrive as floats; LIMIT wants an integer.
            "limit": int(max_results) if max_results is not None else 100,
        }

        conn = get_db_conn()
        register_vector(conn)

        # All values are bound by the driver instead of being spliced into the
        # SQL text with ``%`` string formatting.
        sql = """
            WITH embeddings AS (
                SELECT
                    id,
                    metadata,
                    content,
                    CASE WHEN full_embedding IS NOT NULL THEN 1 - (full_embedding <=> %(vec)s::vector) ELSE 0 END * %(full_w)s AS full_sim,
                    CASE WHEN topic_embedding IS NOT NULL THEN 1 - (topic_embedding <=> %(vec)s::vector) ELSE 0 END * %(topic_w)s AS topic_sim,
                    CASE WHEN customer_embedding IS NOT NULL THEN 1 - (customer_embedding <=> %(vec)s::vector) ELSE 0 END * %(customer_w)s AS customer_sim,
                    CASE WHEN agent_embedding IS NOT NULL THEN 1 - (agent_embedding <=> %(vec)s::vector) ELSE 0 END * %(agent_w)s AS agent_sim
                FROM vector_store_multi_embeddings
                WHERE full_embedding IS NOT NULL
                   OR topic_embedding IS NOT NULL
                   OR customer_embedding IS NOT NULL
                   OR agent_embedding IS NOT NULL
            )
            SELECT
                id,
                metadata,
                content,
                (full_sim + topic_sim + customer_sim + agent_sim) AS combined_similarity
            FROM embeddings
            ORDER BY combined_similarity DESC
            LIMIT %(limit)s
        """

        with conn.cursor() as cur:
            cur.execute(sql, params)
            rows = cur.fetchall()

        results = []
        for id_val, metadata_json, content, raw_score in rows:
            similarity_score = float(raw_score)
            try:
                # The driver may hand back metadata as a JSON string or as an
                # already-decoded object.
                metadata = (
                    json.loads(metadata_json)
                    if isinstance(metadata_json, str)
                    else metadata_json
                )
                result = {
                    "id": id_val,
                    "similarityScore": similarity_score,
                    "content": content,
                    "chatId": get_text_value(metadata, "chatId"),
                    "topic": get_text_value(metadata, "topic"),
                }
                # Time fields are copied through without conversion.
                for time_key in ("startTime", "endTime"):
                    if time_key in metadata and metadata[time_key] is not None:
                        result[time_key] = metadata[time_key]
                results.append(result)
            except Exception as e:
                # A row with unusable metadata is skipped, not fatal.
                # (Log text restored: "metadata parsing error".)
                print(f"메타데이터 파싱 오류: {e}")
                continue

        return [r for r in results if r["similarityScore"] >= threshold]

    except Exception as e:
        # Best-effort API: log and return no results.
        # (Log text restored: "error during multi-embedding search".)
        print(f"다중 임베딩 검색 중 오류 발생: {e}")
        return []

    finally:
        if conn is not None:
            conn.close()
181
 
182
def search_similar_chat_by_date(
    query: str,
    start_date: str = None,
    end_date: str = None,
    max_results: int = 100
) -> List[Dict]:
    """Search chat records within a date range by weighted embedding similarity.

    Same ranking as the un-dated search: four cosine similarities, each scaled
    by its ``DEFAULT_*_WEIGHT`` and summed. The optional dates restrict rows by
    a text comparison on ``metadata->>'startTime'`` against
    ``<date>T00:00:00`` / ``<date>T23:59:59``. Rows scoring below
    ``DEFAULT_SIMILARITY_THRESHOLD`` are dropped after the SQL ``LIMIT``.

    Args:
        query (str): Query text to search for.
        start_date (str): Inclusive start date (YYYY-MM-DD); blank means no
            lower bound.
        end_date (str): Inclusive end date (YYYY-MM-DD); blank means no upper
            bound.
        max_results (int): Maximum number of rows fetched from the database;
            ``None`` falls back to 100.

    Returns:
        List[Dict]: Result dicts with ``id``, ``similarityScore``, ``content``,
        ``chatId``, ``topic`` and, when present in the metadata,
        ``startTime`` / ``endTime``. Any error is logged and yields ``[]``.
    """
    threshold = DEFAULT_SIMILARITY_THRESHOLD
    conn = None  # lets ``finally`` know whether a connection was ever opened

    try:
        params = {
            "vec": format_vector_for_pg(get_embedding(query)),
            "full_w": DEFAULT_FULL_WEIGHT,
            "topic_w": DEFAULT_TOPIC_WEIGHT,
            "customer_w": DEFAULT_CUSTOMER_WEIGHT,
            "agent_w": DEFAULT_AGENT_WEIGHT,
            # Gradio "number" inputs arrive as floats; LIMIT wants an integer.
            "limit": int(max_results) if max_results is not None else 100,
        }

        # Date bounds are user input: bind them as parameters, never splice
        # them into the SQL text.
        date_filter = ""
        if start_date and start_date.strip():
            date_filter += " AND metadata->>'startTime' >= %(start_ts)s"
            params["start_ts"] = start_date.strip() + "T00:00:00"
        if end_date and end_date.strip():
            date_filter += " AND metadata->>'startTime' <= %(end_ts)s"
            params["end_ts"] = end_date.strip() + "T23:59:59"

        conn = get_db_conn()
        register_vector(conn)

        # The OR group is parenthesized: AND binds tighter than OR, so without
        # the parentheses the date filter would only constrain the last term.
        sql = (
            """
            WITH embeddings AS (
                SELECT
                    id,
                    metadata,
                    content,
                    CASE WHEN full_embedding IS NOT NULL THEN 1 - (full_embedding <=> %(vec)s::vector) ELSE 0 END * %(full_w)s AS full_sim,
                    CASE WHEN topic_embedding IS NOT NULL THEN 1 - (topic_embedding <=> %(vec)s::vector) ELSE 0 END * %(topic_w)s AS topic_sim,
                    CASE WHEN customer_embedding IS NOT NULL THEN 1 - (customer_embedding <=> %(vec)s::vector) ELSE 0 END * %(customer_w)s AS customer_sim,
                    CASE WHEN agent_embedding IS NOT NULL THEN 1 - (agent_embedding <=> %(vec)s::vector) ELSE 0 END * %(agent_w)s AS agent_sim
                FROM vector_store_multi_embeddings
                WHERE (full_embedding IS NOT NULL
                    OR topic_embedding IS NOT NULL
                    OR customer_embedding IS NOT NULL
                    OR agent_embedding IS NOT NULL)
            """
            + date_filter
            + """
            )
            SELECT
                id,
                metadata,
                content,
                (full_sim + topic_sim + customer_sim + agent_sim) AS combined_similarity
            FROM embeddings
            ORDER BY combined_similarity DESC
            LIMIT %(limit)s
            """
        )

        with conn.cursor() as cur:
            cur.execute(sql, params)
            rows = cur.fetchall()

        results = []
        for id_val, metadata_json, content, raw_score in rows:
            similarity_score = float(raw_score)
            try:
                # The driver may hand back metadata as a JSON string or as an
                # already-decoded object.
                metadata = (
                    json.loads(metadata_json)
                    if isinstance(metadata_json, str)
                    else metadata_json
                )
                result = {
                    "id": id_val,
                    "similarityScore": similarity_score,
                    "content": content,
                    "chatId": get_text_value(metadata, "chatId"),
                    "topic": get_text_value(metadata, "topic"),
                }
                # Time fields are copied through without conversion (the
                # original comment notes they are already stored in KST).
                for time_key in ("startTime", "endTime"):
                    if time_key in metadata and metadata[time_key] is not None:
                        result[time_key] = metadata[time_key]
                results.append(result)
            except Exception as e:
                # A row with unusable metadata is skipped, not fatal.
                # (Log text restored: "metadata parsing error".)
                print(f"메타데이터 파싱 오류: {e}")
                continue

        return [r for r in results if r["similarityScore"] >= threshold]

    except Exception as e:
        # Best-effort API: log and return no results.
        # (Log text restored: "error during multi-embedding date search".)
        print(f"다중 임베딩 날짜 검색 중 오류 발생: {e}")
        return []

    finally:
        if conn is not None:
            conn.close()
309
 
310
# Gradio web interface setup: each search function is registered as its own
# Interface with JSON output and an explicit api_name.
with gr.Blocks() as demo:
    gr.Markdown("# Chat Analysis Search")
    # Inputs: query text, max result count.
    gr.Interface(fn=search_similar_chat, inputs=["text", "number"], outputs="json", api_name="search_similar_chat")
    # Inputs: query text, start date, end date, max result count.
    gr.Interface(fn=search_similar_chat_by_date, inputs=["text", "text", "text", "number"], outputs="json", api_name="search_similar_chat_by_date")

if __name__ == "__main__":
    # NOTE(review): mcp_server=True presumably also serves the app as an MCP
    # server — confirm against the installed Gradio version.
    demo.launch(mcp_server=True)
 
3
  from openai import OpenAI
4
  import json
5
  import os
6
+ from typing import List, Dict, Tuple, Any
7
  from pgvector.psycopg2 import register_vector
8
  import numpy as np
9
+ from datetime import datetime
10
+ import re
11
+ from sklearn.feature_extraction.text import TfidfVectorizer
12
+ from sklearn.metrics.pairwise import cosine_similarity
 
 
 
13
 
14
  # DB 연결 설정
15
  def get_db_conn():
 
21
  password=os.environ["VECTOR_SECRET"]
22
  )
23
 
 
24
  client = OpenAI()
25
 
26
def get_embedding(text: str) -> List[float]:
    """Convert text into an embedding vector.

    Sends ``text`` to the OpenAI embeddings endpoint using the
    ``text-embedding-3-small`` model and returns the embedding of the first
    (only) item in the response. Errors from the client propagate unchanged.
    """
    reply = client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    first_item = reply.data[0]
    return first_item.embedding
33
+
34
def expand_query(query: str) -> str:
    """Expand a user query with model-generated related keywords/questions.

    Asks ``gpt-3.5-turbo`` to expand the query and returns the original query
    followed by a space and the model's text. Expansion is best-effort: if the
    API call fails, or the model returns no usable text, the original query is
    returned unchanged.

    Args:
        query: The user's search query.

    Returns:
        ``"<query> <expansion>"`` on success, otherwise ``query``.
    """
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                # Prompts restored from mis-encoded bytes. System: "You are a
                # search-query expansion expert. Analyse the user's query and
                # expand it into related keywords and question forms."
                {"role": "system", "content": "당신은 검색 쿼리 확장 전문가입니다. 사용자의 쿼리를 분석하고, 이와 관련된 키워드와 질문 형태로 확장하세요."},
                # User: "Please expand the following search term: '<query>'"
                {"role": "user", "content": f"다음 검색어를 확장해주세요: '{query}'"}
            ],
            temperature=0.3,
            max_tokens=150
        )
        expansion = response.choices[0].message.content
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        return query

    # The message content can be None or empty; fall back explicitly instead
    # of relying on a TypeError from string concatenation.
    if not isinstance(expansion, str) or not expansion.strip():
        return query
    return query + " " + expansion
54
 
55
def extract_keywords(text: str) -> List[str]:
    """Extract the distinct keywords from ``text``.

    The text is lower-cased and split into ``\\w+`` tokens; tokens of length 1
    and tokens in the Korean stop-word set are discarded. This is a simple
    heuristic (the original comment notes it could be replaced by a proper
    NLP library).

    Args:
        text: The text to extract keywords from.

    Returns:
        The unique keywords in order of first appearance (deterministic,
        unlike a plain ``set`` round-trip).
    """
    # Stop words restored from mis-encoded bytes so they can actually match.
    stop_words = {'있는', '하는', '그리고', '입니다', '그것은', '있습니다', '합니다', '그런', '이런', '저런', '그냥'}
    words = re.findall(r'\w+', text.lower())
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(
        w for w in words if len(w) > 1 and w not in stop_words
    ))
65
+
66
def perform_hybrid_search(
    query: str,
    vector_results: List[Dict],
    keyword_weight: float = 0.3,
    similarity_threshold: float = 0.4
) -> List[Dict]:
    """Re-rank vector-search hits by blending in a keyword-match score.

    Filtering happens in up to three stages: keep hits whose ``similarity``
    meets ``similarity_threshold``; if none do, retry at 70% of the threshold;
    if still none, return the first five input hits unchanged (no keyword
    scoring is applied in that fallback).

    For the hits that survive, the keyword score is the fraction of the
    query's keywords found (case-insensitively) in the hit's ``content``, and
    the final score is
    ``(1 - keyword_weight) * similarity + keyword_weight * keyword_score``.

    Args:
        query: The user's query; its keywords drive the keyword score.
        vector_results: Hits, each a dict with at least ``similarity`` and
            optionally ``content``.
        keyword_weight: Weight of the keyword score in the final score.
        similarity_threshold: Minimum vector similarity for the first pass.

    Returns:
        New dicts (the inputs are not modified) carrying
        ``original_similarity``, ``keyword_score`` and the blended
        ``similarity``, sorted by ``similarity`` descending.
    """
    filtered_results = [
        r for r in vector_results if r["similarity"] >= similarity_threshold
    ]

    if not filtered_results:
        # Nothing cleared the bar: retry with a relaxed threshold.
        relaxed_threshold = similarity_threshold * 0.7
        filtered_results = [
            r for r in vector_results if r["similarity"] >= relaxed_threshold
        ]

    if not filtered_results:
        # Still nothing: hand back the first five hits as they are.
        return vector_results[:5]

    keywords = extract_keywords(query)
    keyword_total = max(len(keywords), 1)  # avoid division by zero

    rescored = []
    for result in filtered_results:
        # ``content`` may be missing or None; lower-case it once per hit.
        haystack = (result.get("content") or "").lower()
        keyword_matches = sum(1 for kw in keywords if kw.lower() in haystack)
        keyword_score = keyword_matches / keyword_total
        vector_similarity = result["similarity"]

        # Copy instead of mutating the caller's dict, so scoring the same
        # hits twice does not compound the blend.
        rescored.append({
            **result,
            "original_similarity": vector_similarity,
            "keyword_score": keyword_score,
            "similarity": (1 - keyword_weight) * vector_similarity
            + keyword_weight * keyword_score,
        })

    return sorted(rescored, key=lambda x: x["similarity"], reverse=True)
100
 
101
def preprocess_query(query: str) -> str:
    """Wrap the raw query in a retrieval-oriented Korean prompt.

    Args:
        query: The user's search query.

    Returns:
        The query prefixed with an instruction meaning "Please find
        conversations related to the following question or topic:".
    """
    # Prefix restored from mis-encoded bytes to the intended Korean text.
    return f"다음 질문이나 주제와 관련된 대화를 찾아주세요: {query}"
 
107
 
108
def search_similar_chats(query: str, maxResults: int = 200) -> List[Dict]:
    """Search ``vector_store`` for chat documents similar to ``query``.

    The query is wrapped by ``preprocess_query``, expanded by ``expand_query``
    (best-effort), embedded, and compared against the ``embedding`` column
    using cosine distance. The top ``maxResults`` rows are then re-ranked with
    ``perform_hybrid_search`` using the *original* query's keywords.

    Args:
        query (str): Query text to search for.
        maxResults (int): Maximum number of rows fetched from the database;
            ``None`` falls back to 200.

    Returns:
        List[Dict]: Hits with ``id``, ``metadata``, ``content``,
        ``similarity`` plus whatever ``perform_hybrid_search`` adds.

    Raises:
        RuntimeError: If querying or post-processing fails; the original
            exception is attached as ``__cause__``.
    """
    processed_query = preprocess_query(query)
    try:
        expanded_query = expand_query(processed_query)
    except Exception:
        # Expansion is optional; never let it block the search. (Exception,
        # not a bare except, so Ctrl-C still works.)
        expanded_query = processed_query

    embedding = np.array(get_embedding(expanded_query))
    conn = get_db_conn()

    try:
        # Inside the try/finally so the connection is closed even if
        # registering the vector type fails.
        register_vector(conn)

        try:
            # Gradio "number" inputs arrive as floats (or None when empty);
            # a None limit would otherwise become ``LIMIT NULL`` (no limit).
            limit = int(maxResults) if maxResults is not None else 200

            with conn.cursor() as cur:
                # Cosine similarity = 1 - cosine distance.
                cur.execute("""
                    SELECT id, metadata, content,
                           1 - (embedding <=> %s) AS similarity
                    FROM vector_store
                    ORDER BY similarity DESC
                    LIMIT %s
                """, (embedding, limit))
                rows = cur.fetchall()

            results = [{
                "id": row[0],
                "metadata": row[1],
                "content": row[2],
                "similarity": float(row[3])
            } for row in rows]

            return perform_hybrid_search(
                query,
                results,
                keyword_weight=0.3,
                similarity_threshold=0.4
            )
        except Exception as e:
            # Message restored from mis-encoded bytes ("DB search error").
            raise RuntimeError(f"DB 검색 오류: {str(e)}") from e
    finally:
        conn.close()
 
161
 
162
def search_similar_chats_by_date(
    query: str,
    startDate: str = None,
    endDate: str = None,
    maxResults: int = 200
) -> List[Dict]:
    """Search ``vector_store`` for similar chat documents within a date range.

    Works like the un-dated search (preprocess, best-effort expansion, embed,
    cosine similarity, hybrid re-ranking) with two additions: rows are
    restricted by ``(metadata->>'startTime')::date``, and hits whose metadata
    ``topic`` contains query keywords get a boost of 0.1 per matching keyword,
    capped at 0.3 (so ``similarity`` may exceed 1.0).

    Args:
        query (str): Search query.
        startDate (str): Inclusive start date (YYYY-MM-DD); blank or ``None``
            means no lower bound.
        endDate (str): Inclusive end date (YYYY-MM-DD); blank or ``None``
            means no upper bound.
        maxResults (int): Maximum number of rows fetched from the database;
            ``None`` falls back to 200.

    Returns:
        List[Dict]: Hits sorted by final ``similarity`` descending; boosted
        hits also carry ``topic_boost``.

    Raises:
        ValueError: If a date is not in YYYY-MM-DD format.
        RuntimeError: If querying or post-processing fails; the original
            exception is attached as ``__cause__``.
    """
    # Treat whitespace-only input (e.g. from a text box) as "not provided".
    start_text = (startDate or "").strip()
    end_text = (endDate or "").strip()
    try:
        start_day = datetime.strptime(start_text, "%Y-%m-%d").date() if start_text else None
        end_day = datetime.strptime(end_text, "%Y-%m-%d").date() if end_text else None
    except ValueError as e:
        # Message restored from mis-encoded bytes ("date format error").
        raise ValueError(f"날짜 형식 오류: {e}") from e

    processed_query = preprocess_query(query)
    try:
        expanded_query = expand_query(processed_query)
    except Exception:
        # Expansion is optional; never let it block the search. (Exception,
        # not a bare except, so Ctrl-C still works.)
        expanded_query = processed_query

    embedding = np.array(get_embedding(expanded_query))
    conn = get_db_conn()

    try:
        # Inside the try/finally so the connection is closed even if
        # registering the vector type fails.
        register_vector(conn)

        try:
            base_query = """
                SELECT id, metadata, content,
                       1 - (embedding <=> %s) AS similarity
                FROM vector_store
                WHERE 1=1
            """
            params = [embedding]

            # Bind the validated date objects rather than the raw strings.
            if start_day is not None:
                base_query += " AND (metadata->>'startTime')::date >= %s"
                params.append(start_day)
            if end_day is not None:
                base_query += " AND (metadata->>'startTime')::date <= %s"
                params.append(end_day)

            # Gradio "number" inputs arrive as floats (or None when empty);
            # a None limit would otherwise become ``LIMIT NULL`` (no limit).
            base_query += " ORDER BY similarity DESC LIMIT %s"
            params.append(int(maxResults) if maxResults is not None else 200)

            with conn.cursor() as cur:
                cur.execute(base_query, tuple(params))
                rows = cur.fetchall()

            results = [{
                "id": row[0],
                "metadata": row[1],
                "content": row[2],
                "similarity": float(row[3])
            } for row in rows]

            results = perform_hybrid_search(
                query,
                results,
                keyword_weight=0.3,
                similarity_threshold=0.4
            )

            # Metadata-based boost: reward hits whose topic mentions keywords.
            keywords = extract_keywords(query)
            for result in results:
                metadata = result.get("metadata")
                # Only decoded mapping metadata can be inspected; empty,
                # string or otherwise non-dict metadata is left unboosted.
                if not metadata or not isinstance(metadata, dict):
                    continue

                # ``topic`` may be present but None; normalise before lower().
                topic = str(metadata.get("topic") or "").lower()
                topic_matches = sum(1 for kw in keywords if kw.lower() in topic)

                if topic_matches > 0:
                    topic_boost = 0.1 * min(topic_matches, 3)  # capped at 0.3
                    result["similarity"] += topic_boost
                    result["topic_boost"] = topic_boost

            # Re-sort because the boost may have changed the ranking.
            return sorted(results, key=lambda x: x["similarity"], reverse=True)
        except Exception as e:
            # Message restored from mis-encoded bytes ("DB search error").
            raise RuntimeError(f"DB 검색 오류: {str(e)}") from e
    finally:
        conn.close()
 
260
 
261
# Register the search functions on a Gradio Blocks app: each one gets its own
# Interface with JSON output and an explicit api_name.
with gr.Blocks() as demo:
    gr.Markdown("# Chat Analysis Search")
    # Inputs: query text, max result count.
    gr.Interface(fn=search_similar_chats, inputs=["text", "number"], outputs="json", api_name="search_similar_chats")
    # Inputs: query text, start date, end date, max result count.
    gr.Interface(fn=search_similar_chats_by_date, inputs=["text", "text", "text", "number"], outputs="json", api_name="search_similar_chats_by_date")

if __name__ == "__main__":
    # NOTE(review): mcp_server=True presumably also serves the app as an MCP
    # server — confirm against the installed Gradio version.
    demo.launch(mcp_server=True)