Jake-seong committed on
Commit
facba8d
·
verified ·
1 Parent(s): 6e0fcb3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +261 -80
app.py CHANGED
@@ -6,7 +6,13 @@ import os
6
  from typing import List, Dict
7
  from pgvector.psycopg2 import register_vector
8
  import numpy as np
9
- from datetime import datetime
 
 
 
 
 
 
10
 
11
  # DB connection settings
12
  def get_db_conn():
@@ -18,119 +24,294 @@ def get_db_conn():
18
  password=os.environ["VECTOR_SECRET"]
19
  )
20
 
 
21
  client = OpenAI()
22
 
23
  def get_embedding(text: str) -> List[float]:
24
- """ν…μŠ€νŠΈλ₯Ό μž„λ² λ”© λ²‘ν„°λ‘œ λ³€ν™˜ν•©λ‹ˆλ‹€."""
25
- response = client.embeddings.create(
26
- input=text,
27
- model="text-embedding-3-small"
28
- )
29
- return response.data[0].embedding
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
def search_similar_chats(query: str, maxResults: int = 200) -> List[Dict]:
    """Search ``vector_store`` for chat documents similar to ``query``.

    The query is embedded with ``get_embedding`` and rows are ranked by
    ``1 - (embedding <=> query_embedding)``, highest first.

    Args:
        query (str): Query text to search for.
        maxResults (int): Maximum number of rows to return. ``None`` falls
            back to 200 and other values are coerced with ``int()``
            (NOTE(review): UI number inputs may deliver floats - confirm).

    Returns:
        List[Dict]: One dict per row with the keys ``id``, ``metadata``,
        ``content`` and ``similarity``.

    Raises:
        RuntimeError: If registering the vector type or running the query
            fails; the original exception is attached as ``__cause__``.
    """
    limit = 200 if maxResults is None else int(maxResults)
    embedding = np.array(get_embedding(query))
    conn = get_db_conn()

    try:
        # Registered inside the try block so the connection is still closed
        # if the registration itself fails.
        register_vector(conn)
        with conn.cursor() as cur:
            # <=> is used as the distance operator; similarity = 1 - distance.
            cur.execute("""
                SELECT id, metadata, content,
                       1 - (embedding <=> %s) AS similarity
                FROM vector_store
                ORDER BY similarity DESC
                LIMIT %s
            """, (embedding, limit))

            rows = cur.fetchall()
            return [{
                "id": row[0],
                "metadata": row[1],
                "content": row[2],
                "similarity": float(row[3])
            } for row in rows]
    except Exception as e:
        raise RuntimeError(f"DB 검색 였λ₯˜: {str(e)}") from e
    finally:
        conn.close()
 
66
 
67
def search_similar_chats_by_date(
    query: str,
    startDate: str = None,
    endDate: str = None,
    maxResults: int = 200
) -> List[Dict]:
    """Search ``vector_store`` for similar chats within a date range.

    Rows are ranked by ``1 - (embedding <=> query_embedding)``. When a bound
    is given, ``(metadata->>'startTime')::date`` must fall on or after
    ``startDate`` and on or before ``endDate``.

    Args:
        query (str): Query text to search for.
        startDate (str): Inclusive start date, ``YYYY-MM-DD``. Empty or
            ``None`` disables the lower bound.
        endDate (str): Inclusive end date, ``YYYY-MM-DD``. Empty or ``None``
            disables the upper bound.
        maxResults (int): Maximum number of rows to return. ``None`` falls
            back to 200 and other values are coerced with ``int()``.

    Returns:
        List[Dict]: One dict per row with the keys ``id``, ``metadata``,
        ``content`` and ``similarity``.

    Raises:
        ValueError: If a supplied date does not match ``YYYY-MM-DD``.
        RuntimeError: If registering the vector type or running the query
            fails; the original exception is attached as ``__cause__``.
    """
    # Imported locally so this function does not depend on a module-level
    # datetime import.
    from datetime import datetime

    # Validate the dates before any embedding or database work is done.
    try:
        start_day = datetime.strptime(startDate, "%Y-%m-%d").date() if startDate else None
        end_day = datetime.strptime(endDate, "%Y-%m-%d").date() if endDate else None
    except ValueError as e:
        raise ValueError(f"λ‚ μ§œ ν˜•μ‹ 였λ₯˜: {e}") from e

    limit = 200 if maxResults is None else int(maxResults)
    embedding = np.array(get_embedding(query))
    conn = get_db_conn()

    try:
        # Registered inside the try block so the connection is still closed
        # if the registration itself fails.
        register_vector(conn)
        with conn.cursor() as cur:
            base_query = """
                SELECT id, metadata, content,
                       1 - (embedding <=> %s) AS similarity
                FROM vector_store
                WHERE 1=1
            """
            params = [embedding]

            # Build the optional date filters; the parsed dates (not the raw
            # strings) are bound as parameters.
            if start_day is not None:
                base_query += " AND (metadata->>'startTime')::date >= %s"
                params.append(start_day)
            if end_day is not None:
                base_query += " AND (metadata->>'startTime')::date <= %s"
                params.append(end_day)

            base_query += " ORDER BY similarity DESC LIMIT %s"
            params.append(limit)

            cur.execute(base_query, tuple(params))
            rows = cur.fetchall()

            return [{
                "id": row[0],
                "metadata": row[1],
                "content": row[2],
                "similarity": float(row[3])
            } for row in rows]
    except Exception as e:
        raise RuntimeError(f"DB 검색 였λ₯˜: {str(e)}") from e
    finally:
        conn.close()
 
128
 
129
# Register the search functions on a Gradio Blocks app.
with gr.Blocks() as demo:
    gr.Markdown("# Chat Analysis Search")
    # Plain similarity search: query text plus a result limit.
    gr.Interface(
        fn=search_similar_chats,
        inputs=["text", "number"],
        outputs="json",
        api_name="search_similar_chats",
    )
    # Date-bounded search: query text, start date, end date, result limit.
    gr.Interface(
        fn=search_similar_chats_by_date,
        inputs=["text", "text", "text", "number"],
        outputs="json",
        api_name="search_similar_chats_by_date",
    )

if __name__ == "__main__":
    # Launched with the mcp_server option enabled.
    demo.launch(mcp_server=True)
 
6
  from typing import List, Dict
7
  from pgvector.psycopg2 import register_vector
8
  import numpy as np
9
+
10
+ # Weight and similarity-threshold settings
11
+ DEFAULT_FULL_WEIGHT = 0.2
12
+ DEFAULT_TOPIC_WEIGHT = 0.5
13
+ DEFAULT_CUSTOMER_WEIGHT = 0.2
14
+ DEFAULT_AGENT_WEIGHT = 0.1
15
+ DEFAULT_SIMILARITY_THRESHOLD = 0.5
16
 
17
  # DB connection settings
18
  def get_db_conn():
 
24
  password=os.environ["VECTOR_SECRET"]
25
  )
26
 
27
+ # Initialize the OpenAI client
28
  client = OpenAI()
29
 
30
  def get_embedding(text: str) -> List[float]:
31
+ """
32
+ ν…μŠ€νŠΈλ₯Ό OpenAI의 text-embedding-ada-002 λͺ¨λΈμ„ μ‚¬μš©ν•˜μ—¬ μž„λ² λ”© λ²‘ν„°λ‘œ λ³€ν™˜ν•©λ‹ˆλ‹€.
33
+ Java의 float[](float32)와 ν˜Έν™˜λ˜λ„λ‘ λͺ…μ‹œμ μœΌλ‘œ float32둜 λ³€ν™˜ν•©λ‹ˆλ‹€.
34
+
35
+ Args:
36
+ text (str): μž„λ² λ”©ν•  ν…μŠ€νŠΈ
37
+
38
+ Returns:
39
+ List[float]: μž„λ² λ”© 벑터 (float32)
40
+ """
41
+ try:
42
+ response = client.embeddings.create(
43
+ input=text,
44
+ model="text-embedding-ada-002",
45
+ encoding_format="float"
46
+ )
47
+ # λͺ…μ‹œμ μœΌλ‘œ float32둜 λ³€ν™˜ν•˜μ—¬ Java의 float[]와 ν˜Έν™˜λ˜κ²Œ 함
48
+ return np.array(response.data[0].embedding, dtype=np.float32).tolist()
49
+ except Exception as e:
50
+ print(f"μž„λ² λ”© 생성 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
51
+ raise
52
+
53
def format_vector_for_pg(vector: List[float]) -> str:
    """Render an embedding as a bracketed, comma-separated text literal.

    The input is first normalised to a float32 NumPy array: non-array inputs
    are converted, and arrays of another dtype are cast. Each element is then
    formatted with its default string formatting.

    Args:
        vector (List[float]): Embedding values; a NumPy array is also
            accepted.

    Returns:
        str: Text of the form ``[v1,v2,...]`` with no spaces.
    """
    if isinstance(vector, np.ndarray):
        # Only cast when the array is not already float32.
        as_float32 = vector if vector.dtype == np.float32 else vector.astype(np.float32)
    else:
        as_float32 = np.array(vector, dtype=np.float32)

    components = [format(component) for component in as_float32]
    return "[" + ",".join(components) + "]"
66
 
67
def get_text_value(node: Dict, field_name: str) -> str:
    """Safely read a value from a dictionary.

    The original note describes this as the counterpart of a Java
    ``getTextValue()`` method.

    Args:
        node (Dict): Dictionary to read from; may be ``None`` or empty.
        field_name (str): Key to look up.

    Returns:
        str: The stored value, or ``None`` when ``node`` is empty/``None``,
        the key is absent, or the stored value is ``None``.
    """
    if not node:
        return None
    if field_name not in node:
        return None
    # A stored None is returned as-is, which matches the "missing" result.
    return node[field_name]
75
+
76
def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
    """Search chat data for content similar to ``query``.

    The query embedding is compared against the four embedding columns of
    ``vector_store_multi_embeddings`` (full, topic, customer, agent). Each
    ``1 - (column <=> query)`` similarity is multiplied by its
    ``DEFAULT_*_WEIGHT`` constant (a NULL column contributes 0) and the four
    terms are summed into a combined score. Rows are ordered by that score,
    limited to ``max_results``, and then rows scoring below
    ``DEFAULT_SIMILARITY_THRESHOLD`` are dropped, so fewer than
    ``max_results`` items may be returned.

    Args:
        query (str): Query text to search for.
        max_results (int): Maximum number of rows fetched from the database.
            ``None`` falls back to 100; other values are coerced with
            ``int()``.

    Returns:
        List[Dict]: Result dicts with ``id``, ``similarityScore``,
        ``content``, ``chatId`` and ``topic``, plus ``startTime`` /
        ``endTime`` when present in the row metadata. An empty list is
        returned if any error occurs (the error is printed).
    """
    threshold = DEFAULT_SIMILARITY_THRESHOLD

    # Initialised up front so the finally block can tell whether a
    # connection was actually opened.
    conn = None
    try:
        limit = int(max_results) if max_results is not None else 100

        # Embed the query and render it in the text form accepted by ::vector.
        query_vector = format_vector_for_pg(get_embedding(query))

        conn = get_db_conn()
        register_vector(conn)

        # All values are bound as parameters instead of being formatted into
        # the SQL text.
        sql = """
            WITH embeddings AS (
                SELECT
                    id,
                    metadata,
                    content,
                    CASE WHEN full_embedding IS NOT NULL THEN 1 - (full_embedding <=> %(vec)s::vector) ELSE 0 END * %(full_w)s AS full_sim,
                    CASE WHEN topic_embedding IS NOT NULL THEN 1 - (topic_embedding <=> %(vec)s::vector) ELSE 0 END * %(topic_w)s AS topic_sim,
                    CASE WHEN customer_embedding IS NOT NULL THEN 1 - (customer_embedding <=> %(vec)s::vector) ELSE 0 END * %(customer_w)s AS customer_sim,
                    CASE WHEN agent_embedding IS NOT NULL THEN 1 - (agent_embedding <=> %(vec)s::vector) ELSE 0 END * %(agent_w)s AS agent_sim
                FROM vector_store_multi_embeddings
                WHERE full_embedding IS NOT NULL
                   OR topic_embedding IS NOT NULL
                   OR customer_embedding IS NOT NULL
                   OR agent_embedding IS NOT NULL
            )
            SELECT
                id,
                metadata,
                content,
                (full_sim + topic_sim + customer_sim + agent_sim) AS combined_similarity
            FROM embeddings
            ORDER BY combined_similarity DESC
            LIMIT %(limit)s
        """
        params = {
            "vec": query_vector,
            "full_w": DEFAULT_FULL_WEIGHT,
            "topic_w": DEFAULT_TOPIC_WEIGHT,
            "customer_w": DEFAULT_CUSTOMER_WEIGHT,
            "agent_w": DEFAULT_AGENT_WEIGHT,
            "limit": limit,
        }

        with conn.cursor() as cur:
            cur.execute(sql, params)
            rows = cur.fetchall()

        results = []
        for row in rows:
            id_val, metadata_json, content = row[0], row[1], row[2]
            similarity_score = float(row[3])

            # Metadata may arrive as a JSON string or as an already-decoded
            # object; a row whose metadata cannot be handled is skipped.
            try:
                metadata = json.loads(metadata_json) if isinstance(metadata_json, str) else metadata_json

                result = {
                    "id": id_val,
                    "similarityScore": similarity_score,
                    "content": content,
                    "chatId": get_text_value(metadata, "chatId"),
                    "topic": get_text_value(metadata, "topic")
                }

                # Time fields are copied through unchanged.
                if "startTime" in metadata and metadata["startTime"] is not None:
                    result["startTime"] = metadata["startTime"]

                if "endTime" in metadata and metadata["endTime"] is not None:
                    result["endTime"] = metadata["endTime"]

                results.append(result)
            except Exception as e:
                print(f"메타데이터 νŒŒμ‹± 였λ₯˜: {e}")
                continue

        # Threshold filtering is applied after the LIMIT.
        return [r for r in results if r["similarityScore"] >= threshold]

    except Exception as e:
        # Best-effort API: report the problem and return no results.
        print(f"닀쀑 μž„λ² λ”© 검색 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
        return []

    finally:
        if conn is not None:
            conn.close()
181
 
182
def search_similar_chat_by_date(
    query: str,
    start_date: str = None,
    end_date: str = None,
    max_results: int = 100
) -> List[Dict]:
    """Search chat data similar to ``query`` within a date range.

    Scoring follows the weighted multi-embedding scheme: for each of the
    four embedding columns of ``vector_store_multi_embeddings``,
    ``1 - (column <=> query)`` is multiplied by its ``DEFAULT_*_WEIGHT``
    constant (NULL columns contribute 0) and the terms are summed. When date
    bounds are given, ``metadata->>'startTime'`` is compared as text against
    ``<start_date>T00:00:00`` and ``<end_date>T23:59:59``. Rows are ordered
    by score, limited to ``max_results``, and rows scoring below
    ``DEFAULT_SIMILARITY_THRESHOLD`` are then dropped.

    Args:
        query (str): Query text to search for.
        start_date (str): Inclusive start date, ``YYYY-MM-DD``. Empty,
            blank or ``None`` disables the lower bound.
        end_date (str): Inclusive end date, ``YYYY-MM-DD``. Empty, blank or
            ``None`` disables the upper bound.
        max_results (int): Maximum number of rows fetched from the database.
            ``None`` falls back to 100; other values are coerced with
            ``int()``.

    Returns:
        List[Dict]: Result dicts with ``id``, ``similarityScore``,
        ``content``, ``chatId`` and ``topic``, plus ``startTime`` /
        ``endTime`` when present in the row metadata. An empty list is
        returned if any error occurs (the error is printed).
    """
    threshold = DEFAULT_SIMILARITY_THRESHOLD

    # Initialised up front so the finally block can tell whether a
    # connection was actually opened.
    conn = None
    try:
        limit = int(max_results) if max_results is not None else 100

        # Embed the query and render it in the text form accepted by ::vector.
        query_vector = format_vector_for_pg(get_embedding(query))

        conn = get_db_conn()
        register_vector(conn)

        params = {
            "vec": query_vector,
            "full_w": DEFAULT_FULL_WEIGHT,
            "topic_w": DEFAULT_TOPIC_WEIGHT,
            "customer_w": DEFAULT_CUSTOMER_WEIGHT,
            "agent_w": DEFAULT_AGENT_WEIGHT,
            "limit": limit,
        }

        # The OR chain is parenthesised so the date filters appended below
        # apply to every row, not only to the last OR operand (AND binds
        # tighter than OR in SQL).
        sql = """
            WITH embeddings AS (
                SELECT
                    id,
                    metadata,
                    content,
                    CASE WHEN full_embedding IS NOT NULL THEN 1 - (full_embedding <=> %(vec)s::vector) ELSE 0 END * %(full_w)s AS full_sim,
                    CASE WHEN topic_embedding IS NOT NULL THEN 1 - (topic_embedding <=> %(vec)s::vector) ELSE 0 END * %(topic_w)s AS topic_sim,
                    CASE WHEN customer_embedding IS NOT NULL THEN 1 - (customer_embedding <=> %(vec)s::vector) ELSE 0 END * %(customer_w)s AS customer_sim,
                    CASE WHEN agent_embedding IS NOT NULL THEN 1 - (agent_embedding <=> %(vec)s::vector) ELSE 0 END * %(agent_w)s AS agent_sim
                FROM vector_store_multi_embeddings
                WHERE (full_embedding IS NOT NULL
                   OR topic_embedding IS NOT NULL
                   OR customer_embedding IS NOT NULL
                   OR agent_embedding IS NOT NULL)
        """

        # Date filters: only fixed SQL fragments are concatenated; the
        # caller-supplied dates are bound as parameters, never spliced into
        # the SQL text.
        if start_date and start_date.strip():
            sql += " AND metadata->>'startTime' >= %(start_ts)s"
            params["start_ts"] = start_date.strip() + "T00:00:00"

        if end_date and end_date.strip():
            sql += " AND metadata->>'startTime' <= %(end_ts)s"
            params["end_ts"] = end_date.strip() + "T23:59:59"

        sql += """
            )
            SELECT
                id,
                metadata,
                content,
                (full_sim + topic_sim + customer_sim + agent_sim) AS combined_similarity
            FROM embeddings
            ORDER BY combined_similarity DESC
            LIMIT %(limit)s
        """

        with conn.cursor() as cur:
            cur.execute(sql, params)
            rows = cur.fetchall()

        results = []
        for row in rows:
            id_val, metadata_json, content = row[0], row[1], row[2]
            similarity_score = float(row[3])

            # Metadata may arrive as a JSON string or as an already-decoded
            # object; a row whose metadata cannot be handled is skipped.
            try:
                metadata = json.loads(metadata_json) if isinstance(metadata_json, str) else metadata_json

                result = {
                    "id": id_val,
                    "similarityScore": similarity_score,
                    "content": content,
                    "chatId": get_text_value(metadata, "chatId"),
                    "topic": get_text_value(metadata, "topic")
                }

                # Time fields are copied through unchanged (the original note
                # says they are already stored in KST).
                if "startTime" in metadata and metadata["startTime"] is not None:
                    result["startTime"] = metadata["startTime"]

                if "endTime" in metadata and metadata["endTime"] is not None:
                    result["endTime"] = metadata["endTime"]

                results.append(result)
            except Exception as e:
                print(f"메타데이터 νŒŒμ‹± 였λ₯˜: {e}")
                continue

        # Threshold filtering is applied after the LIMIT.
        return [r for r in results if r["similarityScore"] >= threshold]

    except Exception as e:
        # Best-effort API: report the problem and return no results.
        print(f"닀쀑 μž„λ² λ”© λ‚ μ§œ 검색 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
        return []

    finally:
        if conn is not None:
            conn.close()
309
 
310
# Gradio web interface setup
with gr.Blocks() as demo:
    gr.Markdown("# Chat Analysis Search")
    # Weighted similarity search: query text plus a result limit.
    gr.Interface(
        fn=search_similar_chat,
        inputs=["text", "number"],
        outputs="json",
        api_name="search_similar_chat",
    )
    # Date-bounded search: query text, start date, end date, result limit.
    gr.Interface(
        fn=search_similar_chat_by_date,
        inputs=["text", "text", "text", "number"],
        outputs="json",
        api_name="search_similar_chat_by_date",
    )

if __name__ == "__main__":
    # Launched with the mcp_server option enabled.
    demo.launch(mcp_server=True)