Jake-seong committed on
Commit
21f7ac0
·
verified ·
1 Parent(s): a9e3ab0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -82
app.py CHANGED
@@ -31,7 +31,8 @@ def get_embedding(text: str) -> List[float]:
31
  """ํ…์ŠคํŠธ๋ฅผ ์ž„๋ฒ ๋”ฉ ๋ฒกํ„ฐ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค."""
32
  response = client.embeddings.create(
33
  input=text,
34
- model="text-embedding-3-small"
 
35
  )
36
  return response.data[0].embedding
37
 
@@ -43,7 +44,9 @@ def get_text_value(node, field_name):
43
 
44
  def format_vector_for_pg(vector: List[float]) -> str:
45
  """๋ฒกํ„ฐ๋ฅผ PostgreSQL ํฌ๋งท์œผ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค."""
46
- return f"[{','.join(str(x) for x in vector)}]"
 
 
47
 
48
  def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
49
  """
@@ -68,106 +71,70 @@ def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
68
  print(f"๋‹ค์ค‘ ์ž„๋ฒ ๋”ฉ ๊ฒ€์ƒ‰ ์‹œ์ž‘: ์ฟผ๋ฆฌ='{query}', ๊ฐ€์ค‘์น˜=(full={full_w}, topic={topic_w}, customer={customer_w}, agent={agent_w}), ์ตœ๋Œ€ ๊ฒฐ๊ณผ={limit}")
69
 
70
  try:
71
- # ์ฟผ๋ฆฌ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ - ์ •๊ทœํ™” ์ œ๊ฑฐ
72
  query_embedding = get_embedding(query)
73
- print(f"์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ ์™„๋ฃŒ: ์ฒซ 5๊ฐœ ์š”์†Œ: {query_embedding[:5]}")
74
 
75
- # Java ๋ฐฉ์‹: ๋ฒกํ„ฐ๋ฅผ ๋ฌธ์ž์—ด๋กœ ๋ณ€ํ™˜
76
  query_vector = format_vector_for_pg(query_embedding)
77
 
78
  # DB ์—ฐ๊ฒฐ
79
  conn = get_db_conn()
80
  register_vector(conn)
81
 
82
- # ๊ฐœ์„ ๋œ SQL ์ฟผ๋ฆฌ - ๊ฐ€์ค‘ ํ‰๊ท  ๋ฐฉ์‹์œผ๋กœ ๊ณ„์‚ฐํ•˜๊ณ  ๊ฐ ์ž„๋ฒ ๋”ฉ ์œ ํ˜•๋ณ„ ์ ์ˆ˜ ํฌํ•จ
83
  sql = f"""
84
- WITH embeddings AS (
85
  SELECT
86
  id,
87
  metadata,
88
  content,
89
- -- ๊ฐ ์ž„๋ฒ ๋”ฉ ์œ ํ˜•๋ณ„ ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ (NULL์ด ์•„๋‹Œ ๊ฒฝ์šฐ๋งŒ)
90
- CASE WHEN full_embedding IS NOT NULL
91
- THEN 1 - (full_embedding <=> '{query_vector}'::vector)
92
- ELSE NULL END as full_sim,
93
-
94
- CASE WHEN topic_embedding IS NOT NULL
95
- THEN 1 - (topic_embedding <=> '{query_vector}'::vector)
96
- ELSE NULL END as topic_sim,
97
-
98
- CASE WHEN customer_embedding IS NOT NULL
99
- THEN 1 - (customer_embedding <=> '{query_vector}'::vector)
100
- ELSE NULL END as customer_sim,
101
-
102
- CASE WHEN agent_embedding IS NOT NULL
103
- THEN 1 - (agent_embedding <=> '{query_vector}'::vector)
104
- ELSE NULL END as agent_sim,
105
-
106
- -- ์œ ํšจํ•œ ์ž„๋ฒ ๋”ฉ ์นด์šดํŠธ (0์œผ๋กœ ๋‚˜๋ˆ„๊ธฐ ๋ฐฉ์ง€)
107
- (CASE WHEN full_embedding IS NOT NULL THEN 1 ELSE 0 END +
108
- CASE WHEN topic_embedding IS NOT NULL THEN 1 ELSE 0 END +
109
- CASE WHEN customer_embedding IS NOT NULL THEN 1 ELSE 0 END +
110
- CASE WHEN agent_embedding IS NOT NULL THEN 1 ELSE 0 END) as valid_count
111
  FROM vector_store_multi_embeddings
112
- WHERE (full_embedding IS NOT NULL
113
- OR topic_embedding IS NOT NULL
114
- OR customer_embedding IS NOT NULL
115
- OR agent_embedding IS NOT NULL)
116
- ),
117
- weighted_scores AS (
118
- SELECT
119
- id,
120
- metadata,
121
- content,
122
- full_sim,
123
- topic_sim,
124
- customer_sim,
125
- agent_sim,
126
- valid_count,
127
-
128
- -- ๊ฐ€์ค‘์น˜๋ฅผ ์ ์šฉํ•œ ์ด ์œ ์‚ฌ๋„ ์ ์ˆ˜ ๊ณ„์‚ฐ
129
- (COALESCE(full_sim, 0) * {full_w} +
130
- COALESCE(topic_sim, 0) * {topic_w} +
131
- COALESCE(customer_sim, 0) * {customer_w} +
132
- COALESCE(agent_sim, 0) * {agent_w}) as weighted_sum
133
- FROM embeddings
134
  )
135
  SELECT
136
  id,
137
  metadata,
138
  content,
139
- full_sim,
140
- topic_sim,
141
- customer_sim,
142
- agent_sim,
143
- weighted_sum as combined_similarity
144
- FROM weighted_scores
145
- WHERE weighted_sum >= {threshold}
146
  ORDER BY combined_similarity DESC
147
  LIMIT {limit}
148
  """
149
 
150
  with conn.cursor() as cur:
151
- print(f"์ฟผ๋ฆฌ ์‹คํ–‰ - ๊ฐœ์„ ๋œ ๋ฐฉ์‹, ๊ฐ€์ค‘์น˜ ์„ค์ •={full_w}, {topic_w}, {customer_w}, {agent_w}, ๊ฒฐ๊ณผ ์ œํ•œ={limit}")
152
- # Java ๋ฐฉ์‹: ๋งค๊ฐœ๋ณ€์ˆ˜ ์—†์ด ์ง์ ‘ ์ฟผ๋ฆฌ ์‹คํ–‰
153
  cur.execute(sql)
154
  rows = cur.fetchall()
155
 
156
  print(f"๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ: ์ด {len(rows)}๊ฐœ ๋ฐ์ดํ„ฐ ์กฐํšŒ๋จ")
157
  if len(rows) > 0:
158
- print(f"์ฒซ ๋ฒˆ์งธ ๊ฒฐ๊ณผ ID: {rows[0][0]}, ์œ ์‚ฌ๋„: {float(rows[0][7])}")
159
- print(f"์ฒซ ๋ฒˆ์งธ ๊ฒฐ๊ณผ ์„ธ๋ถ€ ์œ ์‚ฌ๋„ - full: {rows[0][3]}, topic: {rows[0][4]}, customer: {rows[0][5]}, agent: {rows[0][6]}")
160
 
161
  results = []
162
  for row in rows:
163
  id_val = row[0]
164
  metadata_json = row[1]
165
  content = row[2]
166
- full_similarity = None if row[3] is None else float(row[3])
167
- topic_similarity = None if row[4] is None else float(row[4])
168
- customer_similarity = None if row[5] is None else float(row[5])
169
- agent_similarity = None if row[6] is None else float(row[6])
170
- similarity_score = float(row[7])
 
 
171
 
172
  # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ํŒŒ์‹ฑ
173
  try:
@@ -179,12 +146,7 @@ def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
179
  "content": content,
180
  "chatId": get_text_value(metadata, "chatId"),
181
  "topic": get_text_value(metadata, "topic"),
182
- "similarityDetails": {
183
- "full": full_similarity,
184
- "topic": topic_similarity,
185
- "customer": customer_similarity,
186
- "agent": agent_similarity
187
- }
188
  }
189
 
190
  # ์‹œ๊ฐ„ ํ•„๋“œ ๋ณ€ํ™˜ ์—†์ด ๊ทธ๋Œ€๋กœ ์‚ฌ์šฉ
@@ -199,15 +161,13 @@ def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
199
  print(f"๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ํŒŒ์‹ฑ ์˜ค๋ฅ˜: {e}")
200
  print(f"๋ฌธ์ œ๊ฐ€ ๋ฐœ์ƒํ•œ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ: {metadata_json[:200]}...")
201
  continue
202
-
203
- print(f"์ž„๊ณ„๊ฐ’({threshold}) ์ด์ƒ ๊ฒฐ๊ณผ: {len(results)}๊ฐœ")
204
-
205
- if len(results) > 0:
206
- print(f"๊ฐ€์žฅ ๋†’์€ ์œ ์‚ฌ๋„ ์ ์ˆ˜: {results[0]['similarityScore']}")
207
- print(f"์ƒ์œ„ ๊ฒฐ๊ณผ ์ฑ—ID: {results[0].get('chatId')}, ์ฃผ์ œ: {results[0].get('topic', '')[:50]}...")
208
- print(f"์ƒ์œ„ ๊ฒฐ๊ณผ ์„ธ๋ถ€ ์œ ์‚ฌ๋„: {results[0]['similarityDetails']}")
209
-
210
- return results
211
 
212
  except Exception as e:
213
  print(f"๋‹ค์ค‘ ์ž„๋ฒ ๋”ฉ ๊ฒ€์ƒ‰ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
 
31
  """ํ…์ŠคํŠธ๋ฅผ ์ž„๋ฒ ๋”ฉ ๋ฒกํ„ฐ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค."""
32
  response = client.embeddings.create(
33
  input=text,
34
+ model="text-embedding-3-small",
35
+ encoding_format="float" # ๋ช…์‹œ์ ์œผ๋กœ float ํ˜•์‹ ์ง€์ •
36
  )
37
  return response.data[0].embedding
38
 
 
44
 
45
  def format_vector_for_pg(vector: List[float]) -> str:
46
  """๋ฒกํ„ฐ๋ฅผ PostgreSQL ํฌ๋งท์œผ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค."""
47
+ # ์ •๋ฐ€๋„ ์œ ์ง€๋ฅผ ์œ„ํ•ด str() ํ•จ์ˆ˜ ๋Œ€์‹  ์†Œ์ˆ˜์  ์ œํ•œ ์—†์ด ๋ฐ”๋กœ join
48
+ vector_str = ','.join([f"{x}" for x in vector])
49
+ return f"[{vector_str}]"
50
 
51
  def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
52
  """
 
71
  print(f"๋‹ค์ค‘ ์ž„๋ฒ ๋”ฉ ๊ฒ€์ƒ‰ ์‹œ์ž‘: ์ฟผ๋ฆฌ='{query}', ๊ฐ€์ค‘์น˜=(full={full_w}, topic={topic_w}, customer={customer_w}, agent={agent_w}), ์ตœ๋Œ€ ๊ฒฐ๊ณผ={limit}")
72
 
73
  try:
74
+ # ์ฟผ๋ฆฌ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ - ์ธ์ฝ”๋”ฉ ํฌ๋งท ๋ช…์‹œ
75
  query_embedding = get_embedding(query)
76
+ print(f"์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ ์™„๋ฃŒ: ๋ฒกํ„ฐ ๊ธธ์ด={len(query_embedding)}")
77
 
78
+ # ๋ฒกํ„ฐ ํฌ๋งท ๋ณ€ํ™˜
79
  query_vector = format_vector_for_pg(query_embedding)
80
 
81
  # DB ์—ฐ๊ฒฐ
82
  conn = get_db_conn()
83
  register_vector(conn)
84
 
85
+ # ์ž๋ฐ” ๊ตฌํ˜„๊ณผ ์ผ์น˜ํ•˜๋„๋ก SQL ์ฟผ๋ฆฌ ์ˆ˜์ •
86
  sql = f"""
87
+ WITH similarities AS (
88
  SELECT
89
  id,
90
  metadata,
91
  content,
92
+ CASE WHEN full_embedding IS NOT NULL THEN 1 - (full_embedding <=> '{query_vector}'::vector) ELSE 0 END * {full_w} as full_sim,
93
+ CASE WHEN topic_embedding IS NOT NULL THEN 1 - (topic_embedding <=> '{query_vector}'::vector) ELSE 0 END * {topic_w} as topic_sim,
94
+ CASE WHEN customer_embedding IS NOT NULL THEN 1 - (customer_embedding <=> '{query_vector}'::vector) ELSE 0 END * {customer_w} as customer_sim,
95
+ CASE WHEN agent_embedding IS NOT NULL THEN 1 - (agent_embedding <=> '{query_vector}'::vector) ELSE 0 END * {agent_w} as agent_sim
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  FROM vector_store_multi_embeddings
97
+ WHERE full_embedding IS NOT NULL
98
+ OR topic_embedding IS NOT NULL
99
+ OR customer_embedding IS NOT NULL
100
+ OR agent_embedding IS NOT NULL
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  )
102
  SELECT
103
  id,
104
  metadata,
105
  content,
106
+ (full_sim + topic_sim + customer_sim + agent_sim) as combined_similarity,
107
+ full_sim / {full_w} as full_raw_sim,
108
+ topic_sim / {topic_w} as topic_raw_sim,
109
+ customer_sim / {customer_w} as customer_raw_sim,
110
+ agent_sim / {agent_w} as agent_raw_sim
111
+ FROM similarities
 
112
  ORDER BY combined_similarity DESC
113
  LIMIT {limit}
114
  """
115
 
116
  with conn.cursor() as cur:
117
+ print(f"์ฟผ๋ฆฌ ์‹คํ–‰: ์ž๋ฐ” ๊ตฌํ˜„๊ณผ ์ผ์น˜ํ•˜๋„๋ก ์ˆ˜์ •")
 
118
  cur.execute(sql)
119
  rows = cur.fetchall()
120
 
121
  print(f"๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ: ์ด {len(rows)}๊ฐœ ๋ฐ์ดํ„ฐ ์กฐํšŒ๋จ")
122
  if len(rows) > 0:
123
+ print(f"์ฒซ ๋ฒˆ์งธ ๊ฒฐ๊ณผ ID: {rows[0][0]}, ์œ ์‚ฌ๋„: {float(rows[0][3])}")
124
+ print(f"์ฒซ ๋ฒˆ์งธ ๊ฒฐ๊ณผ ์›์‹œ ์œ ์‚ฌ๋„ - full: {rows[0][4]}, topic: {rows[0][5]}, customer: {rows[0][6]}, agent: {rows[0][7]}")
125
 
126
  results = []
127
  for row in rows:
128
  id_val = row[0]
129
  metadata_json = row[1]
130
  content = row[2]
131
+ similarity_score = float(row[3])
132
+ raw_sims = {
133
+ "full": None if row[4] is None else float(row[4]),
134
+ "topic": None if row[5] is None else float(row[5]),
135
+ "customer": None if row[6] is None else float(row[6]),
136
+ "agent": None if row[7] is None else float(row[7])
137
+ }
138
 
139
  # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ํŒŒ์‹ฑ
140
  try:
 
146
  "content": content,
147
  "chatId": get_text_value(metadata, "chatId"),
148
  "topic": get_text_value(metadata, "topic"),
149
+ "rawSimilarities": raw_sims
 
 
 
 
 
150
  }
151
 
152
  # ์‹œ๊ฐ„ ํ•„๋“œ ๋ณ€ํ™˜ ์—†์ด ๊ทธ๋Œ€๋กœ ์‚ฌ์šฉ
 
161
  print(f"๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ํŒŒ์‹ฑ ์˜ค๋ฅ˜: {e}")
162
  print(f"๋ฌธ์ œ๊ฐ€ ๋ฐœ์ƒํ•œ ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ: {metadata_json[:200]}...")
163
  continue
164
+
165
+ if len(results) > 0:
166
+ print(f"๊ฐ€์žฅ ๋†’์€ ์œ ์‚ฌ๋„ ์ ์ˆ˜: {results[0]['similarityScore']}")
167
+ print(f"์ƒ์œ„ ๊ฒฐ๊ณผ ์ฑ—ID: {results[0].get('chatId')}, ์ฃผ์ œ: {results[0].get('topic', '')[:50]}...")
168
+ print(f"์ƒ์œ„ ๊ฒฐ๊ณผ ์›์‹œ ์œ ์‚ฌ๋„: {results[0]['rawSimilarities']}")
169
+
170
+ return results
 
 
171
 
172
  except Exception as e:
173
  print(f"๋‹ค์ค‘ ์ž„๋ฒ ๋”ฉ ๊ฒ€์ƒ‰ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")