Jake-seong committed on
Commit
e63d055
·
verified ·
1 Parent(s): dc0abc3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +274 -195
app.py CHANGED
@@ -7,9 +7,14 @@ from typing import List, Dict, Tuple, Any
7
  from pgvector.psycopg2 import register_vector
8
  import numpy as np
9
  from datetime import datetime
10
- import re
11
- from sklearn.feature_extraction.text import TfidfVectorizer
12
- from sklearn.metrics.pairwise import cosine_similarity
 
 
 
 
 
13
 
14
  # DB ์—ฐ๊ฒฐ ์„ค์ •
15
  def get_db_conn():
@@ -31,238 +36,312 @@ def get_embedding(text: str) -> List[float]:
31
  )
32
  return response.data[0].embedding
33
 
34
- def expand_query(query: str) -> str:
35
- """
36
- ์‚ฌ์šฉ์ž ์ฟผ๋ฆฌ๋ฅผ ํ™•์žฅํ•˜์—ฌ ๊ฒ€์ƒ‰ ํ’ˆ์งˆ์„ ๊ฐœ์„ ํ•ฉ๋‹ˆ๋‹ค.
37
- """
38
- # GPT๋ฅผ ํ™œ์šฉํ•œ ์ฟผ๋ฆฌ ํ™•์žฅ
39
- try:
40
- response = client.chat.completions.create(
41
- model="gpt-3.5-turbo",
42
- messages=[
43
- {"role": "system", "content": "๋‹น์‹ ์€ ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ ํ™•์žฅ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค. ์‚ฌ์šฉ์ž์˜ ์ฟผ๋ฆฌ๋ฅผ ๋ถ„์„ํ•˜๊ณ , ์ด์™€ ๊ด€๋ จ๋œ ํ‚ค์›Œ๋“œ์™€ ์งˆ๋ฌธ ํ˜•ํƒœ๋กœ ํ™•์žฅํ•˜์„ธ์š”."},
44
- {"role": "user", "content": f"๋‹ค์Œ ๊ฒ€์ƒ‰์–ด๋ฅผ ํ™•์žฅํ•ด์ฃผ์„ธ์š”: '{query}'"}
45
- ],
46
- temperature=0.3,
47
- max_tokens=150
48
- )
49
- expanded = query + " " + response.choices[0].message.content
50
- return expanded
51
- except:
52
- # ์˜ค๋ฅ˜ ๋ฐœ์ƒ ์‹œ ์›๋ณธ ์ฟผ๋ฆฌ ๋ฐ˜ํ™˜
53
- return query
54
 
55
- def extract_keywords(text: str) -> List[str]:
56
  """
57
- ํ…์ŠคํŠธ์—์„œ ์ค‘์š” ํ‚ค์›Œ๋“œ๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.
58
- """
59
- # ๋‹จ์ˆœํ•œ ํ‚ค์›Œ๋“œ ์ถ”์ถœ (๊ณ ๊ธ‰ NLP ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋กœ ๋Œ€์ฒด ๊ฐ€๋Šฅ)
60
- # ๋ถˆ์šฉ์–ด ์ œ๊ฑฐ ๋ฐ ์ •๊ทœํ‘œํ˜„์‹์œผ๋กœ ํ‚ค์›Œ๋“œ ์ถ”์ถœ
61
- stop_words = {'์žˆ๋Š”', 'ํ•˜๋Š”', '๊ทธ๋ฆฌ๊ณ ', '์ž…๋‹ˆ๋‹ค', '๊ทธ๊ฒƒ์€', '์žˆ์Šต๋‹ˆ๋‹ค', 'ํ•ฉ๋‹ˆ๋‹ค', '๊ทธ๋Ÿฐ', '์ด๋Ÿฐ', '์ €๋Ÿฐ', '๊ทธ๋ƒฅ'}
62
- words = re.findall(r'\w+', text.lower())
63
- keywords = [w for w in words if len(w) > 1 and w not in stop_words]
64
- return list(set(keywords))
65
-
66
- def perform_hybrid_search(
67
- query: str,
68
- vector_results: List[Dict],
69
- keyword_weight: float = 0.3,
70
- similarity_threshold: float = 0.4
71
- ) -> List[Dict]:
72
- """
73
- ๋ฒกํ„ฐ ๊ฒ€์ƒ‰๊ณผ ํ‚ค์›Œ๋“œ ๊ฒ€์ƒ‰์„ ๊ฒฐํ•ฉํ•œ ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰์„ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค.
74
- """
75
- # ์ž„๊ณ„๊ฐ’ ๋ฏธ๋งŒ์˜ ๊ฒฐ๊ณผ ํ•„ํ„ฐ๋ง
76
- filtered_results = [r for r in vector_results if r["similarity"] >= similarity_threshold]
77
-
78
- if not filtered_results:
79
- # ๊ฒฐ๊ณผ๊ฐ€ ์—†์œผ๋ฉด ์ž„๊ณ„๊ฐ’์„ ๋‚ฎ์ถฐ์„œ ์žฌ์‹œ๋„
80
- filtered_results = [r for r in vector_results if r["similarity"] >= similarity_threshold * 0.7]
81
-
82
- if not filtered_results:
83
- return vector_results[:5] # ์—ฌ์ „ํžˆ ์—†์œผ๋ฉด ์ƒ์œ„ 5๊ฐœ ๋ฐ˜ํ™˜
84
-
85
- # ํ‚ค์›Œ๋“œ ๊ฒ€์ƒ‰ ๊ฐ€์ค‘์น˜ ์ ์šฉ
86
- keywords = extract_keywords(query)
87
-
88
- for result in filtered_results:
89
- content = result.get("content", "")
90
- keyword_matches = sum(1 for kw in keywords if kw.lower() in content.lower())
91
- keyword_score = keyword_matches / max(len(keywords), 1)
92
-
93
- # ์ตœ์ข… ์ ์ˆ˜ ๊ณ„์‚ฐ (๋ฒกํ„ฐ ์œ ์‚ฌ๋„ + ํ‚ค์›Œ๋“œ ๊ฐ€์ค‘์น˜)
94
- result["original_similarity"] = result["similarity"]
95
- result["keyword_score"] = keyword_score
96
- result["similarity"] = (1 - keyword_weight) * result["similarity"] + keyword_weight * keyword_score
97
 
98
- # ์ตœ์ข… ์ ์ˆ˜๋กœ ์žฌ์ •๋ ฌ
99
- return sorted(filtered_results, key=lambda x: x["similarity"], reverse=True)
100
-
101
- def preprocess_query(query: str) -> str:
102
- """
103
- ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ๋ฅผ ์ „์ฒ˜๋ฆฌํ•˜์—ฌ ๊ฒ€์ƒ‰ ํ’ˆ์งˆ์„ ๊ฐœ์„ ํ•ฉ๋‹ˆ๋‹ค.
104
- """
105
- # ๊ฒ€์ƒ‰์— ๋งž๊ฒŒ ํ”„๋กฌํ”„ํŠธ ์žฌ๊ตฌ์„ฑ
106
- return f"๋‹ค์Œ ์งˆ๋ฌธ์ด๋‚˜ ์ฃผ์ œ์™€ ๊ด€๋ จ๋œ ๋Œ€ํ™”๋ฅผ ์ฐพ์•„์ฃผ์„ธ์š”: {query}"
107
-
108
- def search_similar_chats(query: str, maxResults: int = 200) -> List[Dict]:
109
- """
110
- ์œ ์‚ฌํ•œ ์ฑ„ํŒ… ๋ฌธ์„œ๋ฅผ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค.
111
  Args:
112
  query (str): ๊ฒ€์ƒ‰ํ•  ์ฟผ๋ฆฌ ํ…์ŠคํŠธ
113
- maxResults (int): ๋ฐ˜ํ™˜ํ•  ์ตœ๋Œ€ ๊ฒฐ๊ณผ ์ˆ˜
 
114
  Returns:
115
  List[Dict]: ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ๋ชฉ๋ก
116
  """
117
- # ์ฟผ๋ฆฌ ์ „์ฒ˜๋ฆฌ ๋ฐ ํ™•์žฅ
118
- processed_query = preprocess_query(query)
119
- try:
120
- expanded_query = expand_query(processed_query)
121
- except:
122
- expanded_query = processed_query
 
 
123
 
124
- embedding = np.array(get_embedding(expanded_query))
125
- conn = get_db_conn()
126
- register_vector(conn)
127
 
128
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  with conn.cursor() as cur:
130
- # ์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ
131
- cur.execute("""
132
- SELECT id, metadata, content,
133
- 1 - (embedding <=> %s) AS similarity
134
- FROM vector_store
135
- ORDER BY similarity DESC
136
- LIMIT %s
137
- """, (embedding, maxResults))
138
-
 
139
  rows = cur.fetchall()
140
 
141
- results = [{
142
- "id": row[0],
143
- "metadata": row[1],
144
- "content": row[2],
145
- "similarity": float(row[3])
146
- } for row in rows]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
- # ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰ ์ ์šฉ
149
- results = perform_hybrid_search(
150
- query,
151
- results,
152
- keyword_weight=0.3,
153
- similarity_threshold=0.4
154
- )
155
 
156
- return results
 
157
  except Exception as e:
158
- raise RuntimeError(f"DB ๊ฒ€์ƒ‰ ์˜ค๋ฅ˜: {str(e)}")
 
 
159
  finally:
160
- conn.close()
 
161
 
162
- def search_similar_chats_by_date(
163
- query: str,
164
- startDate: str = None,
165
- endDate: str = None,
166
- maxResults: int = 200
167
  ) -> List[Dict]:
168
  """
169
- ์ง€์ •๋œ ๋‚ ์งœ ๋ฒ”์œ„์— ํ•ด๋‹นํ•˜๋Š” ์œ ์‚ฌํ•œ ์ฑ„ํŒ… ๋ฌธ์„œ๋ฅผ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค.
170
 
171
  Args:
172
- query (str): ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ
173
- startDate (str): ๊ฒ€์ƒ‰ ์‹œ์ž‘ ๋‚ ์งœ (YYYY-MM-DD)
174
- endDate (str): ๊ฒ€์ƒ‰ ์ข…๋ฃŒ ๋‚ ์งœ (YYYY-MM-DD)
175
- maxResults (int): ๋ฐ˜ํ™˜ํ•  ์ตœ๋Œ€ ๊ฒฐ๊ณผ ์ˆ˜
 
176
  Returns:
177
  List[Dict]: ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ๋ชฉ๋ก
178
  """
179
- try:
180
- start_dt = datetime.strptime(startDate, "%Y-%m-%d") if startDate else None
181
- end_dt = datetime.strptime(endDate, "%Y-%m-%d") if endDate else None
182
- except ValueError as e:
183
- raise ValueError(f"๋‚ ์งœ ํ˜•์‹ ์˜ค๋ฅ˜: {e}")
184
-
185
- # ์ฟผ๋ฆฌ ์ „์ฒ˜๋ฆฌ ๋ฐ ํ™•์žฅ
186
- processed_query = preprocess_query(query)
187
- try:
188
- expanded_query = expand_query(processed_query)
189
- except:
190
- expanded_query = processed_query
191
-
192
- embedding = np.array(get_embedding(expanded_query))
193
- conn = get_db_conn()
194
- register_vector(conn)
195
 
196
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  with conn.cursor() as cur:
198
- base_query = """
199
- SELECT id, metadata, content,
200
- 1 - (embedding <=> %s) AS similarity
201
- FROM vector_store
202
- WHERE 1=1
203
- """
204
- params = [embedding]
205
-
206
- # ๋™์  ์ฟผ๋ฆฌ ๊ตฌ์„ฑ
207
- if startDate:
208
- base_query += " AND (metadata->>'startTime')::date >= %s"
209
- params.append(startDate)
210
- if endDate:
211
- base_query += " AND (metadata->>'startTime')::date <= %s"
212
- params.append(endDate)
213
-
214
- base_query += " ORDER BY similarity DESC LIMIT %s"
215
- params.append(maxResults)
216
-
217
- cur.execute(base_query, tuple(params))
218
  rows = cur.fetchall()
219
 
220
- results = [{
221
- "id": row[0],
222
- "metadata": row[1],
223
- "content": row[2],
224
- "similarity": float(row[3])
225
- } for row in rows]
226
-
227
- # ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰ ์ ์šฉ
228
- results = perform_hybrid_search(
229
- query,
230
- results,
231
- keyword_weight=0.3,
232
- similarity_threshold=0.4
233
- )
234
 
235
- # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๊ธฐ๋ฐ˜ ๊ฐ€์ค‘์น˜ ์ ์šฉ
236
- keywords = extract_keywords(query)
237
- for result in results:
238
- metadata = result.get("metadata", {})
239
- if not metadata or isinstance(metadata, str):
240
- continue
241
 
242
- # ์ฃผ์ œ(topic) ํ•„๋“œ์— ํ‚ค์›Œ๋“œ๊ฐ€ ์žˆ๋Š”์ง€ ํ™•์ธ
243
- topic = metadata.get("topic", "")
244
- topic_matches = sum(1 for kw in keywords if kw.lower() in topic.lower())
245
-
246
- # ์ฃผ์ œ ์ผ์น˜ ๊ฐ€์ค‘์น˜ ์ ์šฉ
247
- if topic_matches > 0:
248
- topic_boost = 0.1 * min(topic_matches, 3) # ์ตœ๋Œ€ 0.3 ๊ฐ€์ค‘์น˜
249
- result["similarity"] += topic_boost
250
- result["topic_boost"] = topic_boost
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
- # ๊ฒฐ๊ณผ ์žฌ์ •๋ ฌ
253
- results = sorted(results, key=lambda x: x["similarity"], reverse=True)
 
254
 
255
- return results
 
 
 
 
 
256
  except Exception as e:
257
- raise RuntimeError(f"DB ๊ฒ€์ƒ‰ ์˜ค๋ฅ˜: {str(e)}")
 
 
258
  finally:
259
- conn.close()
 
260
 
261
  # Gradio Blocks์— ํ•จ์ˆ˜ ๋“ฑ๋ก
262
  with gr.Blocks() as demo:
263
  gr.Markdown("# Chat Analysis Search")
264
- gr.Interface(fn=search_similar_chats, inputs=["text", "number"], outputs="json", api_name="search_similar_chats")
265
- gr.Interface(fn=search_similar_chats_by_date, inputs=["text", "text", "text", "number"], outputs="json", api_name="search_similar_chats_by_date")
266
 
267
  if __name__ == "__main__":
268
- demo.launch(mcp_server=True)
 
7
  from pgvector.psycopg2 import register_vector
8
  import numpy as np
9
  from datetime import datetime
10
+ from sklearn.preprocessing import normalize
11
+
12
+ # ๊ฐ€์ค‘์น˜ ๋ฐ ์ž„๊ณ„๊ฐ’ ์„ค์ •
13
+ DEFAULT_FULL_WEIGHT = 0.2
14
+ DEFAULT_TOPIC_WEIGHT = 0.5
15
+ DEFAULT_CUSTOMER_WEIGHT = 0.2
16
+ DEFAULT_AGENT_WEIGHT = 0.1
17
+ DEFAULT_SIMILARITY_THRESHOLD = 0.7
18
 
19
  # DB ์—ฐ๊ฒฐ ์„ค์ •
20
  def get_db_conn():
 
36
  )
37
  return response.data[0].embedding
38
 
39
def get_text_value(node, field_name):
    """Safely extract a field value from a JSON-like dict node.

    Returns None when the node is empty/None or the field is absent;
    otherwise returns the stored value (which may itself be None).
    """
    if not node:
        return None
    if field_name not in node:
        return None
    return node[field_name]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
    """
    Search multi-embedding chat data for content similar to the query.

    Four per-field cosine similarities (full conversation, topic, customer,
    agent) are combined in SQL as a weighted sum; the top rows are fetched
    and those below DEFAULT_SIMILARITY_THRESHOLD are dropped.

    Args:
        query (str): query text to search for
        max_results (int): maximum number of rows fetched from the DB
            (threshold filtering may return fewer)

    Returns:
        List[Dict]: results ordered by combined similarity (descending);
            empty list on any error
    """
    limit = max_results if max_results is not None else 100

    # Weight / threshold configuration
    full_w = DEFAULT_FULL_WEIGHT
    topic_w = DEFAULT_TOPIC_WEIGHT
    customer_w = DEFAULT_CUSTOMER_WEIGHT
    agent_w = DEFAULT_AGENT_WEIGHT
    threshold = DEFAULT_SIMILARITY_THRESHOLD

    print(f"๋‹ค์ค‘ ์ž„๋ฒ ๋”ฉ ๊ฒ€์ƒ‰ ์‹œ์ž‘: ์ฟผ๋ฆฌ='{query}', ๊ฐ€์ค‘์น˜=(full={full_w}, topic={topic_w}, customer={customer_w}, agent={agent_w}), ์ตœ๋Œ€ ๊ฒฐ๊ณผ={limit}")

    conn = None  # fix: bind before try so `finally` never checks locals() / hits an unbound name
    try:
        # Create the query embedding and L2-normalize it
        raw_embedding = np.array(get_embedding(query))
        query_embedding = normalize(raw_embedding.reshape(1, -1), norm='l2')[0]
        print(f"์ž„๋ฒ ๋”ฉ ์ •๊ทœํ™” ์ „/ํ›„ ์ฒซ 5๊ฐœ ์š”์†Œ: {raw_embedding[:5]} -> {query_embedding[:5]}")

        # DB connection
        conn = get_db_conn()
        register_vector(conn)

        # Weighted multi-field similarity search — parameterized query.
        # `<=>` is pgvector's cosine-distance operator, so 1 - distance
        # is cosine similarity; NULL embeddings contribute 0.
        sql = """
            WITH embeddings AS (
                SELECT
                    id,
                    metadata,
                    content,
                    CASE WHEN full_embedding IS NOT NULL THEN 1 - (full_embedding <=> %s::vector) ELSE 0 END * %s as full_sim,
                    CASE WHEN topic_embedding IS NOT NULL THEN 1 - (topic_embedding <=> %s::vector) ELSE 0 END * %s as topic_sim,
                    CASE WHEN customer_embedding IS NOT NULL THEN 1 - (customer_embedding <=> %s::vector) ELSE 0 END * %s as customer_sim,
                    CASE WHEN agent_embedding IS NOT NULL THEN 1 - (agent_embedding <=> %s::vector) ELSE 0 END * %s as agent_sim
                FROM vector_store_multi_embeddings
                WHERE full_embedding IS NOT NULL
                   OR topic_embedding IS NOT NULL
                   OR customer_embedding IS NOT NULL
                   OR agent_embedding IS NOT NULL
            )
            SELECT
                id,
                metadata,
                content,
                (full_sim + topic_sim + customer_sim + agent_sim) as combined_similarity
            FROM embeddings
            ORDER BY combined_similarity DESC
            LIMIT %s
        """

        with conn.cursor() as cur:
            # Execute the parameterized query
            params = (
                query_embedding, full_w,
                query_embedding, topic_w,
                query_embedding, customer_w,
                query_embedding, agent_w,
                limit
            )
            print(f"์ฟผ๋ฆฌ ์‹คํ–‰ - ํŒŒ๋ผ๋ฏธํ„ฐ: ๊ฐ€์ค‘์น˜ ์„ค์ •={full_w}, {topic_w}, {customer_w}, {agent_w}, ๊ฒฐ๊ณผ ์ œํ•œ={limit}")
            cur.execute(sql, params)
            rows = cur.fetchall()

        print(f"๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ: ์ด {len(rows)}๊ฐœ ๋ฐ์ดํ„ฐ ์กฐํšŒ๋จ")
        if len(rows) > 0:
            print(f"์ฒซ ๋ฒˆ์งธ ๊ฒฐ๊ณผ ID: {rows[0][0]}, ์œ ์‚ฌ๋„: {float(rows[0][3])}")

        results = []
        for row in rows:
            id_val = row[0]
            metadata_json = row[1]
            content = row[2]
            similarity_score = float(row[3])

            # Parse metadata (a JSONB column may arrive as dict or JSON string)
            # NOTE(review): relies on a top-of-file `import json` not visible
            # in this chunk — confirm it exists.
            try:
                metadata = json.loads(metadata_json) if isinstance(metadata_json, str) else metadata_json
                if metadata is None:
                    # fix: rows with NULL metadata previously raised TypeError
                    # on the `in` checks below and were silently dropped
                    metadata = {}

                result = {
                    "id": id_val,
                    "similarityScore": similarity_score,
                    "content": content,
                    "chatId": get_text_value(metadata, "chatId"),
                    "topic": get_text_value(metadata, "topic")
                }

                # Pass time fields through without conversion
                if "startTime" in metadata and metadata["startTime"] is not None:
                    result["startTime"] = metadata["startTime"]

                if "endTime" in metadata and metadata["endTime"] is not None:
                    result["endTime"] = metadata["endTime"]

                results.append(result)
            except Exception as e:
                print(f"๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ํŒŒ์‹ฑ ์˜ค๋ฅ˜: {e}")
                # fix: str() first — slicing a non-str metadata value here raised
                # a second exception that aborted the whole search via the outer except
                print(f"๋ฌธ์ œ๊ฐ€ ๋ฐœ์ƒํ•œ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ: {str(metadata_json)[:200]}...")
                continue

        # Threshold filtering (rows are already ordered by similarity DESC)
        filtered_results = [r for r in results if r["similarityScore"] >= threshold]
        print(f"์ž„๊ณ„๊ฐ’({threshold}) ์ด์ƒ ๊ฒฐ๊ณผ: {len(filtered_results)}๊ฐœ / ์ „์ฒด {len(results)}๊ฐœ")

        if len(filtered_results) > 0:
            print(f"๊ฐ€์žฅ ๋†’์€ ์œ ์‚ฌ๋„ ์ ์ˆ˜: {filtered_results[0]['similarityScore']}")
            # fix: topic can be None (key present with None value, so .get's
            # default never applied) — `or ''` prevents the TypeError on the
            # slice that previously discarded ALL results via the outer except
            print(f"์ƒ์œ„ ๊ฒฐ๊ณผ ์ฑ—ID: {filtered_results[0].get('chatId')}, ์ฃผ์ œ: {(filtered_results[0].get('topic') or '')[:50]}...")

        return filtered_results

    except Exception as e:
        print(f"๋‹ค์ค‘ ์ž„๋ฒ ๋”ฉ ๊ฒ€์ƒ‰ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
        return []

    finally:
        if conn is not None:
            conn.close()
171
 
172
def search_similar_chat_by_date(
    query: str,
    start_date: str = None,
    end_date: str = None,
    max_results: int = 100
) -> List[Dict]:
    """
    Search multi-embedding chat data within the given date range.

    Same weighted multi-field similarity as search_similar_chat, with an
    optional filter on metadata startTime (stored as epoch milliseconds —
    presumably; inferred from the *1000 conversion, confirm against data).

    Args:
        query (str): query text to search for
        start_date (str): inclusive range start, 'YYYY-MM-DD' (optional)
        end_date (str): inclusive range end, 'YYYY-MM-DD' (optional)
        max_results (int): maximum number of rows fetched from the DB

    Returns:
        List[Dict]: results ordered by combined similarity (descending);
            empty list on a bad date format or any error
    """
    limit = max_results if max_results is not None else 100

    # Weight / threshold configuration
    full_w = DEFAULT_FULL_WEIGHT
    topic_w = DEFAULT_TOPIC_WEIGHT
    customer_w = DEFAULT_CUSTOMER_WEIGHT
    agent_w = DEFAULT_AGENT_WEIGHT
    threshold = DEFAULT_SIMILARITY_THRESHOLD

    print(f"๋‹ค์ค‘ ์ž„๋ฒ ๋”ฉ ๋‚ ์งœ ๊ฒ€์ƒ‰ ์‹œ์ž‘: ์ฟผ๋ฆฌ='{query}', ์‹œ์ž‘์ผ={start_date}, ์ข…๋ฃŒ์ผ={end_date}, ์ตœ๋Œ€ ๊ฒฐ๊ณผ={limit}")

    conn = None  # fix: bind before try so `finally` never checks locals() / hits an unbound name
    try:
        # Build date-filter parameters
        start_timestamp = None
        end_timestamp = None

        if start_date and start_date.strip():
            try:
                start_datetime = datetime.strptime(start_date, '%Y-%m-%d')
                # NOTE(review): timestamp() interprets the date in the server's
                # local timezone — confirm stored startTime uses the same zone
                start_timestamp = int(start_datetime.timestamp() * 1000)  # to milliseconds
            except ValueError as e:
                print(f"์‹œ์ž‘ ๋‚ ์งœ ํ˜•์‹ ์˜ค๋ฅ˜: {str(e)}")
                return []

        if end_date and end_date.strip():
            try:
                # Use 23:59:59 of the end day so the range is inclusive
                end_datetime = datetime.strptime(end_date + ' 23:59:59', '%Y-%m-%d %H:%M:%S')
                end_timestamp = int(end_datetime.timestamp() * 1000)  # to milliseconds
            except ValueError as e:
                print(f"์ข…๋ฃŒ ๋‚ ์งœ ํ˜•์‹ ์˜ค๋ฅ˜: {str(e)}")
                return []

        # Create the query embedding and L2-normalize it
        raw_embedding = np.array(get_embedding(query))
        query_embedding = normalize(raw_embedding.reshape(1, -1), norm='l2')[0]
        print(f"๋‚ ์งœ ๊ฒ€์ƒ‰ - ์ž„๋ฒ ๋”ฉ ์ •๊ทœํ™” ์ „/ํ›„ ์ฒซ 5๊ฐœ ์š”์†Œ: {raw_embedding[:5]} -> {query_embedding[:5]}")

        # DB connection
        conn = get_db_conn()
        register_vector(conn)

        # Weighted multi-field similarity search — parameterized query.
        # fix: the OR chain MUST be parenthesized. The date filters are
        # appended below as "AND ..."; with SQL's AND-over-OR precedence the
        # unparenthesized original applied the date range only to rows whose
        # sole embedding was agent_embedding, silently ignoring it otherwise.
        sql = """
            WITH embeddings AS (
                SELECT
                    id,
                    metadata,
                    content,
                    CASE WHEN full_embedding IS NOT NULL THEN 1 - (full_embedding <=> %s::vector) ELSE 0 END * %s as full_sim,
                    CASE WHEN topic_embedding IS NOT NULL THEN 1 - (topic_embedding <=> %s::vector) ELSE 0 END * %s as topic_sim,
                    CASE WHEN customer_embedding IS NOT NULL THEN 1 - (customer_embedding <=> %s::vector) ELSE 0 END * %s as customer_sim,
                    CASE WHEN agent_embedding IS NOT NULL THEN 1 - (agent_embedding <=> %s::vector) ELSE 0 END * %s as agent_sim
                FROM vector_store_multi_embeddings
                WHERE (full_embedding IS NOT NULL
                    OR topic_embedding IS NOT NULL
                    OR customer_embedding IS NOT NULL
                    OR agent_embedding IS NOT NULL)
        """

        params = [
            query_embedding, full_w,
            query_embedding, topic_w,
            query_embedding, customer_w,
            query_embedding, agent_w
        ]

        # Append date filters dynamically (values stay parameterized)
        if start_timestamp is not None:
            sql += " AND (metadata->>'startTime')::bigint >= %s"
            params.append(start_timestamp)

        if end_timestamp is not None:
            sql += " AND (metadata->>'startTime')::bigint <= %s"
            params.append(end_timestamp)

        sql += """
            )
            SELECT
                id,
                metadata,
                content,
                (full_sim + topic_sim + customer_sim + agent_sim) as combined_similarity
            FROM embeddings
            ORDER BY combined_similarity DESC
            LIMIT %s
        """

        params.append(limit)

        with conn.cursor() as cur:
            print(f"๋‚ ์งœ ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ ์‹คํ–‰: ์‹œ์ž‘์ผ={start_date}({start_timestamp}), ์ข…๋ฃŒ์ผ={end_date}({end_timestamp})")
            cur.execute(sql, tuple(params))
            rows = cur.fetchall()

        print(f"๋‚ ์งœ ํ•„ํ„ฐ๋ง ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ: ์ด {len(rows)}๊ฐœ ๋ฐ์ดํ„ฐ ์กฐํšŒ๋จ")
        if len(rows) > 0:
            print(f"์ฒซ ๋ฒˆ์งธ ๊ฒฐ๊ณผ ID: {rows[0][0]}, ์œ ์‚ฌ๋„: {float(rows[0][3])}")

        results = []
        for row in rows:
            id_val = row[0]
            metadata_json = row[1]
            content = row[2]
            similarity_score = float(row[3])

            # Parse metadata (a JSONB column may arrive as dict or JSON string)
            try:
                metadata = json.loads(metadata_json) if isinstance(metadata_json, str) else metadata_json
                if metadata is None:
                    # fix: rows with NULL metadata previously raised TypeError
                    # on the `in` checks below and were silently dropped
                    metadata = {}

                result = {
                    "id": id_val,
                    "similarityScore": similarity_score,
                    "content": content,
                    "chatId": get_text_value(metadata, "chatId"),
                    "topic": get_text_value(metadata, "topic")
                }

                # Pass time fields through without conversion
                if "startTime" in metadata and metadata["startTime"] is not None:
                    result["startTime"] = metadata["startTime"]

                if "endTime" in metadata and metadata["endTime"] is not None:
                    result["endTime"] = metadata["endTime"]

                results.append(result)
            except Exception as e:
                print(f"๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ํŒŒ์‹ฑ ์˜ค๋ฅ˜: {e}")
                # fix: str() first — slicing a non-str metadata value here raised
                # a second exception that aborted the whole search via the outer except
                print(f"๋ฌธ์ œ๊ฐ€ ๋ฐœ์ƒํ•œ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ: {str(metadata_json)[:200]}...")
                continue

        # Threshold filtering (rows are already ordered by similarity DESC)
        filtered_results = [r for r in results if r["similarityScore"] >= threshold]
        print(f"๋‚ ์งœ ๊ฒ€์ƒ‰ - ์ž„๊ณ„๊ฐ’({threshold}) ์ด์ƒ ๊ฒฐ๊ณผ: {len(filtered_results)}๊ฐœ / ์ „์ฒด {len(results)}๊ฐœ")

        if len(filtered_results) > 0:
            print(f"๋‚ ์งœ ๊ฒ€์ƒ‰ - ๊ฐ€์žฅ ๋†’์€ ์œ ์‚ฌ๋„ ์ ์ˆ˜: {filtered_results[0]['similarityScore']}")
            print(f"๋‚ ์งœ ๊ฒ€์ƒ‰ - ์ƒ์œ„ ๊ฒฐ๊ณผ ์ฑ—ID: {filtered_results[0].get('chatId')}, ์‹œ์ž‘์‹œ๊ฐ„: {filtered_results[0].get('startTime')}")

        return filtered_results

    except Exception as e:
        print(f"๋‹ค์ค‘ ์ž„๋ฒ ๋”ฉ ๋‚ ์งœ ๊ฒ€์ƒ‰ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
        return []

    finally:
        if conn is not None:
            conn.close()
339
 
340
  # Gradio Blocks์— ํ•จ์ˆ˜ ๋“ฑ๋ก
341
  with gr.Blocks() as demo:
342
  gr.Markdown("# Chat Analysis Search")
343
+ gr.Interface(fn=search_similar_chat, inputs=["text", "number"], outputs="json", api_name="search_similar_chat")
344
+ gr.Interface(fn=search_similar_chat_by_date, inputs=["text", "text", "text", "number"], outputs="json", api_name="search_similar_chat_by_date")
345
 
346
  if __name__ == "__main__":
347
+ demo.launch(mcp_server=True)