Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -42,6 +42,10 @@ def get_text_value(node, field_name):
|
|
| 42 |
return node[field_name]
|
| 43 |
return None
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
|
| 46 |
"""
|
| 47 |
다중 임베딩된 채팅 데이터에서 유사한 콘텐츠를 검색합니다.
|
|
@@ -71,21 +75,24 @@ def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
|
|
| 71 |
query_embedding = normalize(raw_embedding.reshape(1, -1), norm='l2')[0]
|
| 72 |
print(f"임베딩 정규화 전/후 첫 5개 요소: {raw_embedding[:5]} -> {query_embedding[:5]}")
|
| 73 |
|
|
|
|
|
|
|
|
|
|
| 74 |
# DB 연결
|
| 75 |
conn = get_db_conn()
|
| 76 |
register_vector(conn)
|
| 77 |
|
| 78 |
-
#
|
| 79 |
-
sql = """
|
| 80 |
WITH embeddings AS (
|
| 81 |
SELECT
|
| 82 |
id,
|
| 83 |
metadata,
|
| 84 |
content,
|
| 85 |
-
CASE WHEN full_embedding IS NOT NULL THEN 1 - (full_embedding <=>
|
| 86 |
-
CASE WHEN topic_embedding IS NOT NULL THEN 1 - (topic_embedding <=>
|
| 87 |
-
CASE WHEN customer_embedding IS NOT NULL THEN 1 - (customer_embedding <=>
|
| 88 |
-
CASE WHEN agent_embedding IS NOT NULL THEN 1 - (agent_embedding <=>
|
| 89 |
FROM vector_store_multi_embeddings
|
| 90 |
WHERE full_embedding IS NOT NULL
|
| 91 |
OR topic_embedding IS NOT NULL
|
|
@@ -99,20 +106,13 @@ def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
|
|
| 99 |
(full_sim + topic_sim + customer_sim + agent_sim) as combined_similarity
|
| 100 |
FROM embeddings
|
| 101 |
ORDER BY combined_similarity DESC
|
| 102 |
-
LIMIT
|
| 103 |
"""
|
| 104 |
|
| 105 |
with conn.cursor() as cur:
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
query_embedding, topic_w,
|
| 110 |
-
query_embedding, customer_w,
|
| 111 |
-
query_embedding, agent_w,
|
| 112 |
-
limit
|
| 113 |
-
)
|
| 114 |
-
print(f"쿼리 실행 - 파라미터: 가중치 설정={full_w}, {topic_w}, {customer_w}, {agent_w}, 결과 제한={limit}")
|
| 115 |
-
cur.execute(sql, params)
|
| 116 |
rows = cur.fetchall()
|
| 117 |
|
| 118 |
print(f"검색 결과: 총 {len(rows)}개 데이터 조회됨")
|
|
@@ -226,21 +226,24 @@ def search_similar_chat_by_date(
|
|
| 226 |
query_embedding = normalize(raw_embedding.reshape(1, -1), norm='l2')[0]
|
| 227 |
print(f"날짜 검색 - 임베딩 정규화 전/후 첫 5개 요소: {raw_embedding[:5]} -> {query_embedding[:5]}")
|
| 228 |
|
|
|
|
|
|
|
|
|
|
| 229 |
# DB 연결
|
| 230 |
conn = get_db_conn()
|
| 231 |
register_vector(conn)
|
| 232 |
|
| 233 |
-
#
|
| 234 |
-
sql = """
|
| 235 |
WITH embeddings AS (
|
| 236 |
SELECT
|
| 237 |
id,
|
| 238 |
metadata,
|
| 239 |
content,
|
| 240 |
-
CASE WHEN full_embedding IS NOT NULL THEN 1 - (full_embedding <=>
|
| 241 |
-
CASE WHEN topic_embedding IS NOT NULL THEN 1 - (topic_embedding <=>
|
| 242 |
-
CASE WHEN customer_embedding IS NOT NULL THEN 1 - (customer_embedding <=>
|
| 243 |
-
CASE WHEN agent_embedding IS NOT NULL THEN 1 - (agent_embedding <=>
|
| 244 |
FROM vector_store_multi_embeddings
|
| 245 |
WHERE full_embedding IS NOT NULL
|
| 246 |
OR topic_embedding IS NOT NULL
|
|
@@ -248,21 +251,12 @@ def search_similar_chat_by_date(
|
|
| 248 |
OR agent_embedding IS NOT NULL
|
| 249 |
"""
|
| 250 |
|
| 251 |
-
params = [
|
| 252 |
-
query_embedding, full_w,
|
| 253 |
-
query_embedding, topic_w,
|
| 254 |
-
query_embedding, customer_w,
|
| 255 |
-
query_embedding, agent_w
|
| 256 |
-
]
|
| 257 |
-
|
| 258 |
# 날짜 필터 추가
|
| 259 |
if start_timestamp is not None:
|
| 260 |
-
sql += " AND (metadata->>'startTime')::bigint >=
|
| 261 |
-
params.append(start_timestamp)
|
| 262 |
|
| 263 |
if end_timestamp is not None:
|
| 264 |
-
sql += " AND (metadata->>'startTime')::bigint <=
|
| 265 |
-
params.append(end_timestamp)
|
| 266 |
|
| 267 |
sql += """
|
| 268 |
)
|
|
@@ -276,11 +270,10 @@ def search_similar_chat_by_date(
|
|
| 276 |
LIMIT %s
|
| 277 |
"""
|
| 278 |
|
| 279 |
-
params.append(limit)
|
| 280 |
-
|
| 281 |
with conn.cursor() as cur:
|
| 282 |
print(f"날짜 검색 쿼리 실행: 시작일={start_date}({start_timestamp}), 종료일={end_date}({end_timestamp})")
|
| 283 |
-
|
|
|
|
| 284 |
rows = cur.fetchall()
|
| 285 |
|
| 286 |
print(f"날짜 필터링 검색 결과: 총 {len(rows)}개 데이터 조회됨")
|
|
|
|
| 42 |
return node[field_name]
|
| 43 |
return None
|
| 44 |
|
| 45 |
+
def format_vector_for_pg(vector: List[float]) -> str:
    """Serialize a numeric sequence as a PostgreSQL vector text literal.

    The result has the form ``[0.1,0.2,0.3]``: square brackets around
    comma-separated values with no whitespace.

    Every element is coerced with ``float()`` before it is formatted.
    The search functions in this file splice the returned string into
    SQL text as ``'...'::vector``, so the output must never contain
    anything other than numeric text; a non-numeric element raises here
    instead of being passed through verbatim into the query.

    Args:
        vector: Iterable of numbers (plain floats, ints, or array
            scalars such as the elements of a normalized embedding).

    Returns:
        The bracketed, comma-separated literal. An empty input yields
        ``"[]"``.

    Raises:
        TypeError: If an element is not convertible to ``float``.
        ValueError: If an element is a string that is not a number.
    """
    # repr(float) is the shortest round-trippable decimal form and only
    # ever contains digits, sign, '.', 'e', or the words inf/nan.
    return "[" + ",".join(repr(float(x)) for x in vector) + "]"
|
| 48 |
+
|
| 49 |
def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
|
| 50 |
"""
|
| 51 |
다중 임베딩된 채팅 데이터에서 유사한 콘텐츠를 검색합니다.
|
|
|
|
| 75 |
query_embedding = normalize(raw_embedding.reshape(1, -1), norm='l2')[0]
|
| 76 |
print(f"임베딩 정규화 전/후 첫 5개 요소: {raw_embedding[:5]} -> {query_embedding[:5]}")
|
| 77 |
|
| 78 |
+
# Java 방식: 벡터를 문자열로 변환
|
| 79 |
+
query_vector = format_vector_for_pg(query_embedding)
|
| 80 |
+
|
| 81 |
# DB 연결
|
| 82 |
conn = get_db_conn()
|
| 83 |
register_vector(conn)
|
| 84 |
|
| 85 |
+
# Java 방식: 문자열 포맷팅 사용한 SQL 쿼리
|
| 86 |
+
sql = f"""
|
| 87 |
WITH embeddings AS (
|
| 88 |
SELECT
|
| 89 |
id,
|
| 90 |
metadata,
|
| 91 |
content,
|
| 92 |
+
CASE WHEN full_embedding IS NOT NULL THEN 1 - (full_embedding <=> '{query_vector}'::vector) ELSE 0 END * {full_w} as full_sim,
|
| 93 |
+
CASE WHEN topic_embedding IS NOT NULL THEN 1 - (topic_embedding <=> '{query_vector}'::vector) ELSE 0 END * {topic_w} as topic_sim,
|
| 94 |
+
CASE WHEN customer_embedding IS NOT NULL THEN 1 - (customer_embedding <=> '{query_vector}'::vector) ELSE 0 END * {customer_w} as customer_sim,
|
| 95 |
+
CASE WHEN agent_embedding IS NOT NULL THEN 1 - (agent_embedding <=> '{query_vector}'::vector) ELSE 0 END * {agent_w} as agent_sim
|
| 96 |
FROM vector_store_multi_embeddings
|
| 97 |
WHERE full_embedding IS NOT NULL
|
| 98 |
OR topic_embedding IS NOT NULL
|
|
|
|
| 106 |
(full_sim + topic_sim + customer_sim + agent_sim) as combined_similarity
|
| 107 |
FROM embeddings
|
| 108 |
ORDER BY combined_similarity DESC
|
| 109 |
+
LIMIT {limit}
|
| 110 |
"""
|
| 111 |
|
| 112 |
with conn.cursor() as cur:
|
| 113 |
+
print(f"쿼리 실행 - Java 방식 포맷팅, 가중치 설정={full_w}, {topic_w}, {customer_w}, {agent_w}, 결과 제한={limit}")
|
| 114 |
+
# Java 방식: 매개변수 없이 직접 쿼리 실행
|
| 115 |
+
cur.execute(sql)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
rows = cur.fetchall()
|
| 117 |
|
| 118 |
print(f"검색 결과: 총 {len(rows)}개 데이터 조회됨")
|
|
|
|
| 226 |
query_embedding = normalize(raw_embedding.reshape(1, -1), norm='l2')[0]
|
| 227 |
print(f"날짜 검색 - 임베딩 정규화 전/후 첫 5개 요소: {raw_embedding[:5]} -> {query_embedding[:5]}")
|
| 228 |
|
| 229 |
+
# Java 방식: 벡터를 문자열로 변환
|
| 230 |
+
query_vector = format_vector_for_pg(query_embedding)
|
| 231 |
+
|
| 232 |
# DB 연결
|
| 233 |
conn = get_db_conn()
|
| 234 |
register_vector(conn)
|
| 235 |
|
| 236 |
+
# Java 방식: 문자열 포맷팅 사용한 SQL 쿼리 시작
|
| 237 |
+
sql = f"""
|
| 238 |
WITH embeddings AS (
|
| 239 |
SELECT
|
| 240 |
id,
|
| 241 |
metadata,
|
| 242 |
content,
|
| 243 |
+
CASE WHEN full_embedding IS NOT NULL THEN 1 - (full_embedding <=> '{query_vector}'::vector) ELSE 0 END * {full_w} as full_sim,
|
| 244 |
+
CASE WHEN topic_embedding IS NOT NULL THEN 1 - (topic_embedding <=> '{query_vector}'::vector) ELSE 0 END * {topic_w} as topic_sim,
|
| 245 |
+
CASE WHEN customer_embedding IS NOT NULL THEN 1 - (customer_embedding <=> '{query_vector}'::vector) ELSE 0 END * {customer_w} as customer_sim,
|
| 246 |
+
CASE WHEN agent_embedding IS NOT NULL THEN 1 - (agent_embedding <=> '{query_vector}'::vector) ELSE 0 END * {agent_w} as agent_sim
|
| 247 |
FROM vector_store_multi_embeddings
|
| 248 |
WHERE full_embedding IS NOT NULL
|
| 249 |
OR topic_embedding IS NOT NULL
|
|
|
|
| 251 |
OR agent_embedding IS NOT NULL
|
| 252 |
"""
|
| 253 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
# 날짜 필터 추가
|
| 255 |
if start_timestamp is not None:
|
| 256 |
+
sql += f" AND (metadata->>'startTime')::bigint >= {start_timestamp}"
|
|
|
|
| 257 |
|
| 258 |
if end_timestamp is not None:
|
| 259 |
+
sql += f" AND (metadata->>'startTime')::bigint <= {end_timestamp}"
|
|
|
|
| 260 |
|
| 261 |
sql += """
|
| 262 |
)
|
|
|
|
| 270 |
LIMIT %s
|
| 271 |
"""
|
| 272 |
|
|
|
|
|
|
|
| 273 |
with conn.cursor() as cur:
|
| 274 |
print(f"날짜 검색 쿼리 실행: 시작일={start_date}({start_timestamp}), 종료일={end_date}({end_timestamp})")
|
| 275 |
+
# 여기서는 limit만 매개변수로 전달
|
| 276 |
+
cur.execute(sql, (limit,))
|
| 277 |
rows = cur.fetchall()
|
| 278 |
|
| 279 |
print(f"날짜 필터링 검색 결과: 총 {len(rows)}개 데이터 조회됨")
|