Jake-seong committed on
Commit
004f156
·
verified ·
1 Parent(s): 8866252

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +238 -195
app.py CHANGED
@@ -7,9 +7,13 @@ from typing import List, Dict, Tuple, Any
7
  from pgvector.psycopg2 import register_vector
8
  import numpy as np
9
  from datetime import datetime
10
- import re
11
- from sklearn.feature_extraction.text import TfidfVectorizer
12
- from sklearn.metrics.pairwise import cosine_similarity
 
 
 
 
13
 
14
  # DB ์—ฐ๊ฒฐ ์„ค์ •
15
  def get_db_conn():
@@ -31,238 +35,277 @@ def get_embedding(text: str) -> List[float]:
31
  )
32
  return response.data[0].embedding
33
 
34
- def expand_query(query: str) -> str:
35
- """
36
- ์‚ฌ์šฉ์ž ์ฟผ๋ฆฌ๋ฅผ ํ™•์žฅํ•˜์—ฌ ๊ฒ€์ƒ‰ ํ’ˆ์งˆ์„ ๊ฐœ์„ ํ•ฉ๋‹ˆ๋‹ค.
37
- """
38
- # GPT๋ฅผ ํ™œ์šฉํ•œ ์ฟผ๋ฆฌ ํ™•์žฅ
39
- try:
40
- response = client.chat.completions.create(
41
- model="gpt-3.5-turbo",
42
- messages=[
43
- {"role": "system", "content": "๋‹น์‹ ์€ ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ ํ™•์žฅ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค. ์‚ฌ์šฉ์ž์˜ ์ฟผ๋ฆฌ๋ฅผ ๋ถ„์„ํ•˜๊ณ , ์ด์™€ ๊ด€๋ จ๋œ ํ‚ค์›Œ๋“œ์™€ ์งˆ๋ฌธ ํ˜•ํƒœ๋กœ ํ™•์žฅํ•˜์„ธ์š”."},
44
- {"role": "user", "content": f"๋‹ค์Œ ๊ฒ€์ƒ‰์–ด๋ฅผ ํ™•์žฅํ•ด์ฃผ์„ธ์š”: '{query}'"}
45
- ],
46
- temperature=0.3,
47
- max_tokens=150
48
- )
49
- expanded = query + " " + response.choices[0].message.content
50
- return expanded
51
- except:
52
- # ์˜ค๋ฅ˜ ๋ฐœ์ƒ ์‹œ ์›๋ณธ ์ฟผ๋ฆฌ ๋ฐ˜ํ™˜
53
- return query
54
 
55
- def extract_keywords(text: str) -> List[str]:
56
- """
57
- ํ…์ŠคํŠธ์—์„œ ์ค‘์š” ํ‚ค์›Œ๋“œ๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.
58
- """
59
- # ๋‹จ์ˆœํ•œ ํ‚ค์›Œ๋“œ ์ถ”์ถœ (๊ณ ๊ธ‰ NLP ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋กœ ๋Œ€์ฒด ๊ฐ€๋Šฅ)
60
- # ๋ถˆ์šฉ์–ด ์ œ๊ฑฐ ๋ฐ ์ •๊ทœํ‘œํ˜„์‹์œผ๋กœ ํ‚ค์›Œ๋“œ ์ถ”์ถœ
61
- stop_words = {'์žˆ๋Š”', 'ํ•˜๋Š”', '๊ทธ๋ฆฌ๊ณ ', '์ž…๋‹ˆ๋‹ค', '๊ทธ๊ฒƒ์€', '์žˆ์Šต๋‹ˆ๋‹ค', 'ํ•ฉ๋‹ˆ๋‹ค', '๊ทธ๋Ÿฐ', '์ด๋Ÿฐ', '์ €๋Ÿฐ', '๊ทธ๋ƒฅ'}
62
- words = re.findall(r'\w+', text.lower())
63
- keywords = [w for w in words if len(w) > 1 and w not in stop_words]
64
- return list(set(keywords))
65
 
66
- def perform_hybrid_search(
67
- query: str,
68
- vector_results: List[Dict],
69
- keyword_weight: float = 0.3,
70
- similarity_threshold: float = 0.4
71
- ) -> List[Dict]:
72
  """
73
- ๋ฒกํ„ฐ ๊ฒ€์ƒ‰๊ณผ ํ‚ค์›Œ๋“œ ๊ฒ€์ƒ‰์„ ๊ฒฐํ•ฉํ•œ ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰์„ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค.
74
- """
75
- # ์ž„๊ณ„๊ฐ’ ๋ฏธ๋งŒ์˜ ๊ฒฐ๊ณผ ํ•„ํ„ฐ๋ง
76
- filtered_results = [r for r in vector_results if r["similarity"] >= similarity_threshold]
77
-
78
- if not filtered_results:
79
- # ๊ฒฐ๊ณผ๊ฐ€ ์—†์œผ๋ฉด ์ž„๊ณ„๊ฐ’์„ ๋‚ฎ์ถฐ์„œ ์žฌ์‹œ๋„
80
- filtered_results = [r for r in vector_results if r["similarity"] >= similarity_threshold * 0.7]
81
-
82
- if not filtered_results:
83
- return vector_results[:5] # ์—ฌ์ „ํžˆ ์—†์œผ๋ฉด ์ƒ์œ„ 5๊ฐœ ๋ฐ˜ํ™˜
84
-
85
- # ํ‚ค์›Œ๋“œ ๊ฒ€์ƒ‰ ๊ฐ€์ค‘์น˜ ์ ์šฉ
86
- keywords = extract_keywords(query)
87
 
88
- for result in filtered_results:
89
- content = result.get("content", "")
90
- keyword_matches = sum(1 for kw in keywords if kw.lower() in content.lower())
91
- keyword_score = keyword_matches / max(len(keywords), 1)
92
-
93
- # ์ตœ์ข… ์ ์ˆ˜ ๊ณ„์‚ฐ (๋ฒกํ„ฐ ์œ ์‚ฌ๋„ + ํ‚ค์›Œ๋“œ ๊ฐ€์ค‘์น˜)
94
- result["original_similarity"] = result["similarity"]
95
- result["keyword_score"] = keyword_score
96
- result["similarity"] = (1 - keyword_weight) * result["similarity"] + keyword_weight * keyword_score
97
-
98
- # ์ตœ์ข… ์ ์ˆ˜๋กœ ์žฌ์ •๋ ฌ
99
- return sorted(filtered_results, key=lambda x: x["similarity"], reverse=True)
100
-
101
- def preprocess_query(query: str) -> str:
102
- """
103
- ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ๋ฅผ ์ „์ฒ˜๋ฆฌํ•˜์—ฌ ๊ฒ€์ƒ‰ ํ’ˆ์งˆ์„ ๊ฐœ์„ ํ•ฉ๋‹ˆ๋‹ค.
104
- """
105
- # ๊ฒ€์ƒ‰์— ๋งž๊ฒŒ ํ”„๋กฌํ”„ํŠธ ์žฌ๊ตฌ์„ฑ
106
- return f"๋‹ค์Œ ์งˆ๋ฌธ์ด๋‚˜ ์ฃผ์ œ์™€ ๊ด€๋ จ๋œ ๋Œ€ํ™”๋ฅผ ์ฐพ์•„์ฃผ์„ธ์š”: {query}"
107
-
108
- def search_similar_chats(query: str, maxResults: int = 200) -> List[Dict]:
109
- """
110
- ์œ ์‚ฌํ•œ ์ฑ„ํŒ… ๋ฌธ์„œ๋ฅผ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค.
111
  Args:
112
  query (str): ๊ฒ€์ƒ‰ํ•  ์ฟผ๋ฆฌ ํ…์ŠคํŠธ
113
- maxResults (int): ๋ฐ˜ํ™˜ํ•  ์ตœ๋Œ€ ๊ฒฐ๊ณผ ์ˆ˜
 
114
  Returns:
115
  List[Dict]: ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ๋ชฉ๋ก
116
  """
117
- # ์ฟผ๋ฆฌ ์ „์ฒ˜๋ฆฌ ๋ฐ ํ™•์žฅ
118
- processed_query = preprocess_query(query)
119
- try:
120
- expanded_query = expand_query(processed_query)
121
- except:
122
- expanded_query = processed_query
 
 
123
 
124
- embedding = np.array(get_embedding(expanded_query))
125
- conn = get_db_conn()
126
- register_vector(conn)
127
 
128
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  with conn.cursor() as cur:
130
- # ์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ
131
- cur.execute("""
132
- SELECT id, metadata, content,
133
- 1 - (embedding <=> %s) AS similarity
134
- FROM vector_store
135
- ORDER BY similarity DESC
136
- LIMIT %s
137
- """, (embedding, maxResults))
138
-
139
  rows = cur.fetchall()
140
 
141
- results = [{
142
- "id": row[0],
143
- "metadata": row[1],
144
- "content": row[2],
145
- "similarity": float(row[3])
146
- } for row in rows]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
- # ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰ ์ ์šฉ
149
- results = perform_hybrid_search(
150
- query,
151
- results,
152
- keyword_weight=0.3,
153
- similarity_threshold=0.4
154
- )
155
 
156
- return results
 
157
  except Exception as e:
158
- raise RuntimeError(f"DB ๊ฒ€์ƒ‰ ์˜ค๋ฅ˜: {str(e)}")
 
 
159
  finally:
160
- conn.close()
 
161
 
162
- def search_similar_chats_by_date(
163
- query: str,
164
- startDate: str = None,
165
- endDate: str = None,
166
- maxResults: int = 200
167
  ) -> List[Dict]:
168
  """
169
- ์ง€์ •๋œ ๋‚ ์งœ ๋ฒ”์œ„์— ํ•ด๋‹นํ•˜๋Š” ์œ ์‚ฌํ•œ ์ฑ„ํŒ… ๋ฌธ์„œ๋ฅผ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค.
170
 
171
  Args:
172
- query (str): ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ
173
- startDate (str): ๊ฒ€์ƒ‰ ์‹œ์ž‘ ๋‚ ์งœ (YYYY-MM-DD)
174
- endDate (str): ๊ฒ€์ƒ‰ ์ข…๋ฃŒ ๋‚ ์งœ (YYYY-MM-DD)
175
- maxResults (int): ๋ฐ˜ํ™˜ํ•  ์ตœ๋Œ€ ๊ฒฐ๊ณผ ์ˆ˜
 
176
  Returns:
177
  List[Dict]: ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ๋ชฉ๋ก
178
  """
179
- try:
180
- start_dt = datetime.strptime(startDate, "%Y-%m-%d") if startDate else None
181
- end_dt = datetime.strptime(endDate, "%Y-%m-%d") if endDate else None
182
- except ValueError as e:
183
- raise ValueError(f"๋‚ ์งœ ํ˜•์‹ ์˜ค๋ฅ˜: {e}")
184
-
185
- # ์ฟผ๋ฆฌ ์ „์ฒ˜๋ฆฌ ๋ฐ ํ™•์žฅ
186
- processed_query = preprocess_query(query)
187
- try:
188
- expanded_query = expand_query(processed_query)
189
- except:
190
- expanded_query = processed_query
191
-
192
- embedding = np.array(get_embedding(expanded_query))
193
- conn = get_db_conn()
194
- register_vector(conn)
195
 
196
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  with conn.cursor() as cur:
198
- base_query = """
199
- SELECT id, metadata, content,
200
- 1 - (embedding <=> %s) AS similarity
201
- FROM vector_store
202
- WHERE 1=1
203
- """
204
- params = [embedding]
205
-
206
- # ๋™์  ์ฟผ๋ฆฌ ๊ตฌ์„ฑ
207
- if startDate:
208
- base_query += " AND (metadata->>'startTime')::date >= %s"
209
- params.append(startDate)
210
- if endDate:
211
- base_query += " AND (metadata->>'startTime')::date <= %s"
212
- params.append(endDate)
213
-
214
- base_query += " ORDER BY similarity DESC LIMIT %s"
215
- params.append(maxResults)
216
-
217
- cur.execute(base_query, tuple(params))
218
  rows = cur.fetchall()
219
 
220
- results = [{
221
- "id": row[0],
222
- "metadata": row[1],
223
- "content": row[2],
224
- "similarity": float(row[3])
225
- } for row in rows]
226
-
227
- # ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰ ์ ์šฉ
228
- results = perform_hybrid_search(
229
- query,
230
- results,
231
- keyword_weight=0.3,
232
- similarity_threshold=0.4
233
- )
234
-
235
- # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๊ธฐ๋ฐ˜ ๊ฐ€์ค‘์น˜ ์ ์šฉ
236
- keywords = extract_keywords(query)
237
- for result in results:
238
- metadata = result.get("metadata", {})
239
- if not metadata or isinstance(metadata, str):
240
- continue
241
 
242
- # ์ฃผ์ œ(topic) ํ•„๋“œ์— ํ‚ค์›Œ๋“œ๊ฐ€ ์žˆ๋Š”์ง€ ํ™•์ธ
243
- topic = metadata.get("topic", "")
244
- topic_matches = sum(1 for kw in keywords if kw.lower() in topic.lower())
245
-
246
- # ์ฃผ์ œ ์ผ์น˜ ๊ฐ€์ค‘์น˜ ์ ์šฉ
247
- if topic_matches > 0:
248
- topic_boost = 0.1 * min(topic_matches, 3) # ์ตœ๋Œ€ 0.3 ๊ฐ€์ค‘์น˜
249
- result["similarity"] += topic_boost
250
- result["topic_boost"] = topic_boost
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
- # ๊ฒฐ๊ณผ ์žฌ์ •๋ ฌ
253
- results = sorted(results, key=lambda x: x["similarity"], reverse=True)
254
 
255
- return results
 
256
  except Exception as e:
257
- raise RuntimeError(f"DB ๊ฒ€์ƒ‰ ์˜ค๋ฅ˜: {str(e)}")
 
 
258
  finally:
259
- conn.close()
 
260
 
261
  # Gradio Blocks์— ํ•จ์ˆ˜ ๋“ฑ๋ก
262
  with gr.Blocks() as demo:
263
  gr.Markdown("# Chat Analysis Search")
264
- gr.Interface(fn=search_similar_chats, inputs=["text", "number"], outputs="json", api_name="search_similar_chats")
265
- gr.Interface(fn=search_similar_chats_by_date, inputs=["text", "text", "text", "number"], outputs="json", api_name="search_similar_chats_by_date")
266
 
267
  if __name__ == "__main__":
268
- demo.launch(mcp_server=True)
 
7
  from pgvector.psycopg2 import register_vector
8
  import numpy as np
9
  from datetime import datetime
10
+
11
+ # ๊ฐ€์ค‘์น˜ ๋ฐ ์ž„๊ณ„๊ฐ’ ์„ค์ •
12
+ DEFAULT_FULL_WEIGHT = 0.2
13
+ DEFAULT_TOPIC_WEIGHT = 0.5
14
+ DEFAULT_CUSTOMER_WEIGHT = 0.2
15
+ DEFAULT_AGENT_WEIGHT = 0.1
16
+ DEFAULT_SIMILARITY_THRESHOLD = 0.7
17
 
18
  # DB ์—ฐ๊ฒฐ ์„ค์ •
19
  def get_db_conn():
 
35
  )
36
  return response.data[0].embedding
37
 
38
+ def format_vector_for_pg(vector: List[float]) -> str:
39
+ """๋ฒกํ„ฐ๋ฅผ PostgreSQL ํฌ๋งท์œผ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค."""
40
+ return f"[{','.join(str(x) for x in vector)}]"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
+ def get_text_value(node, field_name):
43
+ """JSON ๋…ธ๋“œ์—์„œ ํ…์ŠคํŠธ ๊ฐ’์„ ์•ˆ์ „ํ•˜๊ฒŒ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค."""
44
+ if node and field_name in node and node[field_name] is not None:
45
+ return node[field_name]
46
+ return None
 
 
 
 
 
47
 
48
+ def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
 
 
 
 
 
49
  """
50
+ ๋‹ค์ค‘ ์ž„๋ฒ ๋”ฉ๋œ ์ฑ„ํŒ… ๋ฐ์ดํ„ฐ์—์„œ ์œ ์‚ฌํ•œ ์ฝ˜ํ…์ธ ๋ฅผ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค.
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  Args:
53
  query (str): ๊ฒ€์ƒ‰ํ•  ์ฟผ๋ฆฌ ํ…์ŠคํŠธ
54
+ max_results (int): ๋ฐ˜ํ™˜ํ•  ์ตœ๋Œ€ ๊ฒฐ๊ณผ ์ˆ˜
55
+
56
  Returns:
57
  List[Dict]: ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ๋ชฉ๋ก
58
  """
59
+ limit = max_results if max_results is not None else 100
60
+
61
+ # ๊ฐ€์ค‘์น˜ ์„ค์ •
62
+ full_w = DEFAULT_FULL_WEIGHT
63
+ topic_w = DEFAULT_TOPIC_WEIGHT
64
+ customer_w = DEFAULT_CUSTOMER_WEIGHT
65
+ agent_w = DEFAULT_AGENT_WEIGHT
66
+ threshold = DEFAULT_SIMILARITY_THRESHOLD
67
 
68
+ print(f"๋‹ค์ค‘ ์ž„๋ฒ ๋”ฉ ๊ฒ€์ƒ‰ ์‹œ์ž‘: ์ฟผ๋ฆฌ='{query}', ๊ฐ€์ค‘์น˜=(full={full_w}, topic={topic_w}, customer={customer_w}, agent={agent_w}), ์ตœ๋Œ€ ๊ฒฐ๊ณผ={limit}")
 
 
69
 
70
  try:
71
+ # ์ฟผ๋ฆฌ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ
72
+ query_embedding = np.array(get_embedding(query))
73
+ query_vector = format_vector_for_pg(query_embedding)
74
+
75
+ # DB ์—ฐ๊ฒฐ
76
+ conn = get_db_conn()
77
+ register_vector(conn)
78
+
79
+ # ์—ฌ๋Ÿฌ ํ•„๋“œ๋ฅผ ๊ฐ€์ค‘์น˜๋กœ ์กฐํ•ฉํ•œ ์œ ์‚ฌ๋„ ๊ฒ€์ƒ‰ SQL
80
+ sql = f"""
81
+ WITH embeddings AS (
82
+ SELECT
83
+ id,
84
+ metadata,
85
+ content,
86
+ CASE WHEN full_embedding IS NOT NULL THEN 1 - (full_embedding <=> '{query_vector}'::vector) ELSE 0 END * {full_w} as full_sim,
87
+ CASE WHEN topic_embedding IS NOT NULL THEN 1 - (topic_embedding <=> '{query_vector}'::vector) ELSE 0 END * {topic_w} as topic_sim,
88
+ CASE WHEN customer_embedding IS NOT NULL THEN 1 - (customer_embedding <=> '{query_vector}'::vector) ELSE 0 END * {customer_w} as customer_sim,
89
+ CASE WHEN agent_embedding IS NOT NULL THEN 1 - (agent_embedding <=> '{query_vector}'::vector) ELSE 0 END * {agent_w} as agent_sim
90
+ FROM vector_store_multi_embeddings
91
+ WHERE full_embedding IS NOT NULL
92
+ OR topic_embedding IS NOT NULL
93
+ OR customer_embedding IS NOT NULL
94
+ OR agent_embedding IS NOT NULL
95
+ )
96
+ SELECT
97
+ id,
98
+ metadata,
99
+ content,
100
+ (full_sim + topic_sim + customer_sim + agent_sim) as combined_similarity
101
+ FROM embeddings
102
+ ORDER BY combined_similarity DESC
103
+ LIMIT %s
104
+ """
105
+
106
  with conn.cursor() as cur:
107
+ cur.execute(sql, (limit,))
 
 
 
 
 
 
 
 
108
  rows = cur.fetchall()
109
 
110
+ results = []
111
+ for row in rows:
112
+ id_val = row[0]
113
+ metadata_json = row[1]
114
+ content = row[2]
115
+ similarity_score = float(row[3])
116
+
117
+ # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ํŒŒ์‹ฑ
118
+ try:
119
+ metadata = json.loads(metadata_json) if isinstance(metadata_json, str) else metadata_json
120
+
121
+ result = {
122
+ "id": id_val,
123
+ "similarityScore": similarity_score,
124
+ "content": content,
125
+ "chatId": get_text_value(metadata, "chatId"),
126
+ "topic": get_text_value(metadata, "topic")
127
+ }
128
+
129
+ # ์‹œ๊ฐ„ ํ•„๋“œ ๋ณ€ํ™˜ ์—†์ด ๊ทธ๋Œ€๋กœ ์‚ฌ์šฉ
130
+ if "startTime" in metadata and metadata["startTime"] is not None:
131
+ result["startTime"] = metadata["startTime"]
132
+
133
+ if "endTime" in metadata and metadata["endTime"] is not None:
134
+ result["endTime"] = metadata["endTime"]
135
+
136
+ results.append(result)
137
+ except Exception as e:
138
+ print(f"๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ํŒŒ์‹ฑ ์˜ค๋ฅ˜: {e}")
139
+ continue
140
 
141
+ # ์ž„๊ณ„๊ฐ’ ํ•„ํ„ฐ๋ง
142
+ filtered_results = [r for r in results if r["similarityScore"] >= threshold]
 
 
 
 
 
143
 
144
+ return filtered_results
145
+
146
  except Exception as e:
147
+ print(f"๋‹ค์ค‘ ์ž„๋ฒ ๋”ฉ ๊ฒ€์ƒ‰ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
148
+ return []
149
+
150
  finally:
151
+ if 'conn' in locals():
152
+ conn.close()
153
 
154
+ def search_similar_chat_by_date(
155
+ query: str,
156
+ start_date: str = None,
157
+ end_date: str = None,
158
+ max_results: int = 100
159
  ) -> List[Dict]:
160
  """
161
+ ์ง€์ •๋œ ๋‚ ์งœ ๋ฒ”์œ„ ๋‚ด์˜ ๋‹ค์ค‘ ์ž„๋ฒ ๋”ฉ ์ฑ„ํŒ… ๋ฐ์ดํ„ฐ๋ฅผ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค.
162
 
163
  Args:
164
+ query (str): ๊ฒ€์ƒ‰ํ•  ์ฟผ๋ฆฌ ํ…์ŠคํŠธ
165
+ start_date (str): ๊ฒ€์ƒ‰ ์‹œ์ž‘ ๋‚ ์งœ (YYYY-MM-DD ํ˜•์‹)
166
+ end_date (str): ๊ฒ€์ƒ‰ ์ข…๋ฃŒ ๋‚ ์งœ (YYYY-MM-DD ํ˜•์‹)
167
+ max_results (int): ๋ฐ˜ํ™˜ํ•  ์ตœ๋Œ€ ๊ฒฐ๊ณผ ์ˆ˜
168
+
169
  Returns:
170
  List[Dict]: ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ๋ชฉ๋ก
171
  """
172
+ limit = max_results if max_results is not None else 100
173
+
174
+ # ๊ฐ€์ค‘์น˜ ์„ค์ •
175
+ full_w = DEFAULT_FULL_WEIGHT
176
+ topic_w = DEFAULT_TOPIC_WEIGHT
177
+ customer_w = DEFAULT_CUSTOMER_WEIGHT
178
+ agent_w = DEFAULT_AGENT_WEIGHT
179
+ threshold = DEFAULT_SIMILARITY_THRESHOLD
180
+
181
+ print(f"๋‹ค์ค‘ ์ž„๋ฒ ๋”ฉ ๋‚ ์งœ ๊ฒ€์ƒ‰ ์‹œ์ž‘: ์ฟผ๋ฆฌ='{query}', ์‹œ์ž‘์ผ={start_date}, ์ข…๋ฃŒ์ผ={end_date}, ์ตœ๋Œ€ ๊ฒฐ๊ณผ={limit}")
 
 
 
 
 
 
182
 
183
  try:
184
+ # ๋‚ ์งœ ํ•„ํ„ฐ ํŒŒ๋ผ๋ฏธํ„ฐ ์ƒ์„ฑ
185
+ start_timestamp = None
186
+ end_timestamp = None
187
+
188
+ if start_date and start_date.strip():
189
+ try:
190
+ start_datetime = datetime.strptime(start_date, '%Y-%m-%d')
191
+ start_timestamp = int(start_datetime.timestamp() * 1000) # ๋ฐ€๋ฆฌ์ดˆ ๋‹จ์œ„๋กœ ๋ณ€ํ™˜
192
+ except ValueError as e:
193
+ print(f"์‹œ์ž‘ ๋‚ ์งœ ํ˜•์‹ ์˜ค๋ฅ˜: {str(e)}")
194
+ return []
195
+
196
+ if end_date and end_date.strip():
197
+ try:
198
+ # ์ข…๋ฃŒ์ผ์˜ 23:59:59๋กœ ์„ค์ •
199
+ end_datetime = datetime.strptime(end_date + ' 23:59:59', '%Y-%m-%d %H:%M:%S')
200
+ end_timestamp = int(end_datetime.timestamp() * 1000) # ๋ฐ€๋ฆฌ์ดˆ ๋‹จ์œ„๋กœ ๋ณ€ํ™˜
201
+ except ValueError as e:
202
+ print(f"์ข…๋ฃŒ ๋‚ ์งœ ํ˜•์‹ ์˜ค๋ฅ˜: {str(e)}")
203
+ return []
204
+
205
+ # ์ฟผ๋ฆฌ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ
206
+ query_embedding = np.array(get_embedding(query))
207
+ query_vector = format_vector_for_pg(query_embedding)
208
+
209
+ # DB ์—ฐ๊ฒฐ
210
+ conn = get_db_conn()
211
+ register_vector(conn)
212
+
213
+ # ์—ฌ๋Ÿฌ ํ•„๋“œ๋ฅผ ๊ฐ€์ค‘์น˜๋กœ ์กฐํ•ฉํ•œ ์œ ์‚ฌ๋„ ๊ฒ€์ƒ‰ SQL (๋‚ ์งœ ํ•„ํ„ฐ ์ถ”๊ฐ€)
214
+ sql = f"""
215
+ WITH embeddings AS (
216
+ SELECT
217
+ id,
218
+ metadata,
219
+ content,
220
+ CASE WHEN full_embedding IS NOT NULL THEN 1 - (full_embedding <=> '{query_vector}'::vector) ELSE 0 END * {full_w} as full_sim,
221
+ CASE WHEN topic_embedding IS NOT NULL THEN 1 - (topic_embedding <=> '{query_vector}'::vector) ELSE 0 END * {topic_w} as topic_sim,
222
+ CASE WHEN customer_embedding IS NOT NULL THEN 1 - (customer_embedding <=> '{query_vector}'::vector) ELSE 0 END * {customer_w} as customer_sim,
223
+ CASE WHEN agent_embedding IS NOT NULL THEN 1 - (agent_embedding <=> '{query_vector}'::vector) ELSE 0 END * {agent_w} as agent_sim
224
+ FROM vector_store_multi_embeddings
225
+ WHERE full_embedding IS NOT NULL
226
+ OR topic_embedding IS NOT NULL
227
+ OR customer_embedding IS NOT NULL
228
+ OR agent_embedding IS NOT NULL
229
+ """
230
+
231
+ params = []
232
+
233
+ # ๋‚ ์งœ ํ•„ํ„ฐ ์ถ”๊ฐ€
234
+ if start_timestamp is not None:
235
+ sql += f" AND (metadata->>'startTime')::bigint >= %s"
236
+ params.append(start_timestamp)
237
+
238
+ if end_timestamp is not None:
239
+ sql += f" AND (metadata->>'startTime')::bigint <= %s"
240
+ params.append(end_timestamp)
241
+
242
+ sql += """
243
+ )
244
+ SELECT
245
+ id,
246
+ metadata,
247
+ content,
248
+ (full_sim + topic_sim + customer_sim + agent_sim) as combined_similarity
249
+ FROM embeddings
250
+ ORDER BY combined_similarity DESC
251
+ LIMIT %s
252
+ """
253
+
254
+ params.append(limit)
255
+
256
  with conn.cursor() as cur:
257
+ cur.execute(sql, tuple(params))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  rows = cur.fetchall()
259
 
260
+ results = []
261
+ for row in rows:
262
+ id_val = row[0]
263
+ metadata_json = row[1]
264
+ content = row[2]
265
+ similarity_score = float(row[3])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
266
 
267
+ # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ํŒŒ์‹ฑ
268
+ try:
269
+ metadata = json.loads(metadata_json) if isinstance(metadata_json, str) else metadata_json
270
+
271
+ result = {
272
+ "id": id_val,
273
+ "similarityScore": similarity_score,
274
+ "content": content,
275
+ "chatId": get_text_value(metadata, "chatId"),
276
+ "topic": get_text_value(metadata, "topic")
277
+ }
278
+
279
+ # ์‹œ๊ฐ„ ํ•„๋“œ ๋ณ€ํ™˜ ์—†์ด ๊ทธ๋Œ€๋กœ ์‚ฌ์šฉ
280
+ if "startTime" in metadata and metadata["startTime"] is not None:
281
+ result["startTime"] = metadata["startTime"]
282
+
283
+ if "endTime" in metadata and metadata["endTime"] is not None:
284
+ result["endTime"] = metadata["endTime"]
285
+
286
+ results.append(result)
287
+ except Exception as e:
288
+ print(f"๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ํŒŒ์‹ฑ ์˜ค๋ฅ˜: {e}")
289
+ continue
290
 
291
+ # ์ž„๊ณ„๊ฐ’ ํ•„ํ„ฐ๋ง
292
+ filtered_results = [r for r in results if r["similarityScore"] >= threshold]
293
 
294
+ return filtered_results
295
+
296
  except Exception as e:
297
+ print(f"๋‹ค์ค‘ ์ž„๋ฒ ๋”ฉ ๋‚ ์งœ ๊ฒ€์ƒ‰ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
298
+ return []
299
+
300
  finally:
301
+ if 'conn' in locals():
302
+ conn.close()
303
 
304
  # Gradio Blocks์— ํ•จ์ˆ˜ ๋“ฑ๋ก
305
  with gr.Blocks() as demo:
306
  gr.Markdown("# Chat Analysis Search")
307
+ gr.Interface(fn=search_similar_chat, inputs=["text", "number"], outputs="json", api_name="search_similar_chat")
308
+ gr.Interface(fn=search_similar_chat_by_date, inputs=["text", "text", "text", "number"], outputs="json", api_name="search_similar_chat_by_date")
309
 
310
  if __name__ == "__main__":
311
+ demo.launch(mcp_server=True)