Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -31,7 +31,8 @@ def get_embedding(text: str) -> List[float]:
|
|
| 31 |
"""ํ
์คํธ๋ฅผ ์๋ฒ ๋ฉ ๋ฒกํฐ๋ก ๋ณํํฉ๋๋ค."""
|
| 32 |
response = client.embeddings.create(
|
| 33 |
input=text,
|
| 34 |
-
model="text-embedding-3-small"
|
|
|
|
| 35 |
)
|
| 36 |
return response.data[0].embedding
|
| 37 |
|
|
@@ -43,7 +44,9 @@ def get_text_value(node, field_name):
|
|
| 43 |
|
| 44 |
def format_vector_for_pg(vector: List[float]) -> str:
|
| 45 |
"""๋ฒกํฐ๋ฅผ PostgreSQL ํฌ๋งท์ผ๋ก ๋ณํํฉ๋๋ค."""
|
| 46 |
-
|
|
|
|
|
|
|
| 47 |
|
| 48 |
def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
|
| 49 |
"""
|
|
@@ -68,106 +71,70 @@ def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
|
|
| 68 |
print(f"๋ค์ค ์๋ฒ ๋ฉ ๊ฒ์ ์์: ์ฟผ๋ฆฌ='{query}', ๊ฐ์ค์น=(full={full_w}, topic={topic_w}, customer={customer_w}, agent={agent_w}), ์ต๋ ๊ฒฐ๊ณผ={limit}")
|
| 69 |
|
| 70 |
try:
|
| 71 |
-
# ์ฟผ๋ฆฌ ์๋ฒ ๋ฉ ์์ฑ -
|
| 72 |
query_embedding = get_embedding(query)
|
| 73 |
-
print(f"์๋ฒ ๋ฉ ์์ฑ ์๋ฃ:
|
| 74 |
|
| 75 |
-
#
|
| 76 |
query_vector = format_vector_for_pg(query_embedding)
|
| 77 |
|
| 78 |
# DB ์ฐ๊ฒฐ
|
| 79 |
conn = get_db_conn()
|
| 80 |
register_vector(conn)
|
| 81 |
|
| 82 |
-
#
|
| 83 |
sql = f"""
|
| 84 |
-
WITH
|
| 85 |
SELECT
|
| 86 |
id,
|
| 87 |
metadata,
|
| 88 |
content,
|
| 89 |
-
|
| 90 |
-
CASE WHEN
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
CASE WHEN topic_embedding IS NOT NULL
|
| 95 |
-
THEN 1 - (topic_embedding <=> '{query_vector}'::vector)
|
| 96 |
-
ELSE NULL END as topic_sim,
|
| 97 |
-
|
| 98 |
-
CASE WHEN customer_embedding IS NOT NULL
|
| 99 |
-
THEN 1 - (customer_embedding <=> '{query_vector}'::vector)
|
| 100 |
-
ELSE NULL END as customer_sim,
|
| 101 |
-
|
| 102 |
-
CASE WHEN agent_embedding IS NOT NULL
|
| 103 |
-
THEN 1 - (agent_embedding <=> '{query_vector}'::vector)
|
| 104 |
-
ELSE NULL END as agent_sim,
|
| 105 |
-
|
| 106 |
-
-- ์ ํจํ ์๋ฒ ๋ฉ ์นด์ดํธ (0์ผ๋ก ๋๋๊ธฐ ๋ฐฉ์ง)
|
| 107 |
-
(CASE WHEN full_embedding IS NOT NULL THEN 1 ELSE 0 END +
|
| 108 |
-
CASE WHEN topic_embedding IS NOT NULL THEN 1 ELSE 0 END +
|
| 109 |
-
CASE WHEN customer_embedding IS NOT NULL THEN 1 ELSE 0 END +
|
| 110 |
-
CASE WHEN agent_embedding IS NOT NULL THEN 1 ELSE 0 END) as valid_count
|
| 111 |
FROM vector_store_multi_embeddings
|
| 112 |
-
WHERE
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
),
|
| 117 |
-
weighted_scores AS (
|
| 118 |
-
SELECT
|
| 119 |
-
id,
|
| 120 |
-
metadata,
|
| 121 |
-
content,
|
| 122 |
-
full_sim,
|
| 123 |
-
topic_sim,
|
| 124 |
-
customer_sim,
|
| 125 |
-
agent_sim,
|
| 126 |
-
valid_count,
|
| 127 |
-
|
| 128 |
-
-- ๊ฐ์ค์น๋ฅผ ์ ์ฉํ ์ด ์ ์ฌ๋ ์ ์ ๊ณ์ฐ
|
| 129 |
-
(COALESCE(full_sim, 0) * {full_w} +
|
| 130 |
-
COALESCE(topic_sim, 0) * {topic_w} +
|
| 131 |
-
COALESCE(customer_sim, 0) * {customer_w} +
|
| 132 |
-
COALESCE(agent_sim, 0) * {agent_w}) as weighted_sum
|
| 133 |
-
FROM embeddings
|
| 134 |
)
|
| 135 |
SELECT
|
| 136 |
id,
|
| 137 |
metadata,
|
| 138 |
content,
|
| 139 |
-
full_sim,
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
FROM
|
| 145 |
-
WHERE weighted_sum >= {threshold}
|
| 146 |
ORDER BY combined_similarity DESC
|
| 147 |
LIMIT {limit}
|
| 148 |
"""
|
| 149 |
|
| 150 |
with conn.cursor() as cur:
|
| 151 |
-
print(f"์ฟผ๋ฆฌ
|
| 152 |
-
# Java ๋ฐฉ์: ๋งค๊ฐ๋ณ์ ์์ด ์ง์ ์ฟผ๋ฆฌ ์คํ
|
| 153 |
cur.execute(sql)
|
| 154 |
rows = cur.fetchall()
|
| 155 |
|
| 156 |
print(f"๊ฒ์ ๊ฒฐ๊ณผ: ์ด {len(rows)}๊ฐ ๋ฐ์ดํฐ ์กฐํ๋จ")
|
| 157 |
if len(rows) > 0:
|
| 158 |
-
print(f"์ฒซ ๋ฒ์งธ ๊ฒฐ๊ณผ ID: {rows[0][0]}, ์ ์ฌ๋: {float(rows[0][
|
| 159 |
-
print(f"์ฒซ ๋ฒ์งธ ๊ฒฐ๊ณผ
|
| 160 |
|
| 161 |
results = []
|
| 162 |
for row in rows:
|
| 163 |
id_val = row[0]
|
| 164 |
metadata_json = row[1]
|
| 165 |
content = row[2]
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
| 171 |
|
| 172 |
# ๋ฉํ๋ฐ์ดํฐ ํ์ฑ
|
| 173 |
try:
|
|
@@ -179,12 +146,7 @@ def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
|
|
| 179 |
"content": content,
|
| 180 |
"chatId": get_text_value(metadata, "chatId"),
|
| 181 |
"topic": get_text_value(metadata, "topic"),
|
| 182 |
-
"
|
| 183 |
-
"full": full_similarity,
|
| 184 |
-
"topic": topic_similarity,
|
| 185 |
-
"customer": customer_similarity,
|
| 186 |
-
"agent": agent_similarity
|
| 187 |
-
}
|
| 188 |
}
|
| 189 |
|
| 190 |
# ์๊ฐ ํ๋ ๋ณํ ์์ด ๊ทธ๋๋ก ์ฌ์ฉ
|
|
@@ -199,15 +161,13 @@ def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
|
|
| 199 |
print(f"๋ฉํ๋ฐ์ดํฐ ํ์ฑ ์ค๋ฅ: {e}")
|
| 200 |
print(f"๋ฌธ์ ๊ฐ ๋ฐ์ํ ๋ฉํ๋ฐ์ดํฐ: {metadata_json[:200]}...")
|
| 201 |
continue
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
return results
|
| 211 |
|
| 212 |
except Exception as e:
|
| 213 |
print(f"๋ค์ค ์๋ฒ ๋ฉ ๊ฒ์ ์ค ์ค๋ฅ ๋ฐ์: {str(e)}")
|
|
|
|
| 31 |
"""ํ
์คํธ๋ฅผ ์๋ฒ ๋ฉ ๋ฒกํฐ๋ก ๋ณํํฉ๋๋ค."""
|
| 32 |
response = client.embeddings.create(
|
| 33 |
input=text,
|
| 34 |
+
model="text-embedding-3-small",
|
| 35 |
+
encoding_format="float" # ๋ช
์์ ์ผ๋ก float ํ์ ์ง์
|
| 36 |
)
|
| 37 |
return response.data[0].embedding
|
| 38 |
|
|
|
|
| 44 |
|
| 45 |
def format_vector_for_pg(vector: List[float]) -> str:
    """Serialize an embedding as a pgvector text literal such as ``[0.1,0.2]``.

    Args:
        vector: Embedding components. Each one must be convertible with
            ``float()``.

    Returns:
        The components joined by commas (no spaces) and wrapped in square
        brackets. An empty input yields ``"[]"``.

    Raises:
        TypeError: If a component is not a number or numeric string.
        ValueError: If a component is a string that does not parse as a float.
    """
    # The search query in this file interpolates the returned text directly
    # into SQL ('...'::vector), so every component is forced through float():
    # only numeric literals can ever reach the statement.
    # repr() of a float is the shortest string that round-trips exactly, so
    # no precision is lost and plain Python floats format as before.
    components = ",".join(repr(float(value)) for value in vector)
    return f"[{components}]"
|
| 50 |
|
| 51 |
def search_similar_chat(query: str, max_results: int = 100) -> List[Dict]:
|
| 52 |
"""
|
|
|
|
| 71 |
print(f"๋ค์ค ์๋ฒ ๋ฉ ๊ฒ์ ์์: ์ฟผ๋ฆฌ='{query}', ๊ฐ์ค์น=(full={full_w}, topic={topic_w}, customer={customer_w}, agent={agent_w}), ์ต๋ ๊ฒฐ๊ณผ={limit}")
|
| 72 |
|
| 73 |
try:
|
| 74 |
+
# ์ฟผ๋ฆฌ ์๋ฒ ๋ฉ ์์ฑ - ์ธ์ฝ๋ฉ ํฌ๋งท ๋ช
์
|
| 75 |
query_embedding = get_embedding(query)
|
| 76 |
+
print(f"์๋ฒ ๋ฉ ์์ฑ ์๋ฃ: ๋ฒกํฐ ๊ธธ์ด={len(query_embedding)}")
|
| 77 |
|
| 78 |
+
# ๋ฒกํฐ ํฌ๋งท ๋ณํ
|
| 79 |
query_vector = format_vector_for_pg(query_embedding)
|
| 80 |
|
| 81 |
# DB ์ฐ๊ฒฐ
|
| 82 |
conn = get_db_conn()
|
| 83 |
register_vector(conn)
|
| 84 |
|
| 85 |
+
# ์๋ฐ ๊ตฌํ๊ณผ ์ผ์นํ๋๋ก SQL ์ฟผ๋ฆฌ ์์
|
| 86 |
sql = f"""
|
| 87 |
+
WITH similarities AS (
|
| 88 |
SELECT
|
| 89 |
id,
|
| 90 |
metadata,
|
| 91 |
content,
|
| 92 |
+
CASE WHEN full_embedding IS NOT NULL THEN 1 - (full_embedding <=> '{query_vector}'::vector) ELSE 0 END * {full_w} as full_sim,
|
| 93 |
+
CASE WHEN topic_embedding IS NOT NULL THEN 1 - (topic_embedding <=> '{query_vector}'::vector) ELSE 0 END * {topic_w} as topic_sim,
|
| 94 |
+
CASE WHEN customer_embedding IS NOT NULL THEN 1 - (customer_embedding <=> '{query_vector}'::vector) ELSE 0 END * {customer_w} as customer_sim,
|
| 95 |
+
CASE WHEN agent_embedding IS NOT NULL THEN 1 - (agent_embedding <=> '{query_vector}'::vector) ELSE 0 END * {agent_w} as agent_sim
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
FROM vector_store_multi_embeddings
|
| 97 |
+
WHERE full_embedding IS NOT NULL
|
| 98 |
+
OR topic_embedding IS NOT NULL
|
| 99 |
+
OR customer_embedding IS NOT NULL
|
| 100 |
+
OR agent_embedding IS NOT NULL
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
)
|
| 102 |
SELECT
|
| 103 |
id,
|
| 104 |
metadata,
|
| 105 |
content,
|
| 106 |
+
(full_sim + topic_sim + customer_sim + agent_sim) as combined_similarity,
|
| 107 |
+
full_sim / {full_w} as full_raw_sim,
|
| 108 |
+
topic_sim / {topic_w} as topic_raw_sim,
|
| 109 |
+
customer_sim / {customer_w} as customer_raw_sim,
|
| 110 |
+
agent_sim / {agent_w} as agent_raw_sim
|
| 111 |
+
FROM similarities
|
|
|
|
| 112 |
ORDER BY combined_similarity DESC
|
| 113 |
LIMIT {limit}
|
| 114 |
"""
|
| 115 |
|
| 116 |
with conn.cursor() as cur:
|
| 117 |
+
print(f"์ฟผ๋ฆฌ ์คํ: ์๋ฐ ๊ตฌํ๊ณผ ์ผ์นํ๋๋ก ์์ ")
|
|
|
|
| 118 |
cur.execute(sql)
|
| 119 |
rows = cur.fetchall()
|
| 120 |
|
| 121 |
print(f"๊ฒ์ ๊ฒฐ๊ณผ: ์ด {len(rows)}๊ฐ ๋ฐ์ดํฐ ์กฐํ๋จ")
|
| 122 |
if len(rows) > 0:
|
| 123 |
+
print(f"์ฒซ ๋ฒ์งธ ๊ฒฐ๊ณผ ID: {rows[0][0]}, ์ ์ฌ๋: {float(rows[0][3])}")
|
| 124 |
+
print(f"์ฒซ ๋ฒ์งธ ๊ฒฐ๊ณผ ์์ ์ ์ฌ๋ - full: {rows[0][4]}, topic: {rows[0][5]}, customer: {rows[0][6]}, agent: {rows[0][7]}")
|
| 125 |
|
| 126 |
results = []
|
| 127 |
for row in rows:
|
| 128 |
id_val = row[0]
|
| 129 |
metadata_json = row[1]
|
| 130 |
content = row[2]
|
| 131 |
+
similarity_score = float(row[3])
|
| 132 |
+
raw_sims = {
|
| 133 |
+
"full": None if row[4] is None else float(row[4]),
|
| 134 |
+
"topic": None if row[5] is None else float(row[5]),
|
| 135 |
+
"customer": None if row[6] is None else float(row[6]),
|
| 136 |
+
"agent": None if row[7] is None else float(row[7])
|
| 137 |
+
}
|
| 138 |
|
| 139 |
# ๋ฉํ๋ฐ์ดํฐ ํ์ฑ
|
| 140 |
try:
|
|
|
|
| 146 |
"content": content,
|
| 147 |
"chatId": get_text_value(metadata, "chatId"),
|
| 148 |
"topic": get_text_value(metadata, "topic"),
|
| 149 |
+
"rawSimilarities": raw_sims
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
}
|
| 151 |
|
| 152 |
# ์๊ฐ ํ๋ ๋ณํ ์์ด ๊ทธ๋๋ก ์ฌ์ฉ
|
|
|
|
| 161 |
print(f"๋ฉํ๋ฐ์ดํฐ ํ์ฑ ์ค๋ฅ: {e}")
|
| 162 |
print(f"๋ฌธ์ ๊ฐ ๋ฐ์ํ ๋ฉํ๋ฐ์ดํฐ: {metadata_json[:200]}...")
|
| 163 |
continue
|
| 164 |
+
|
| 165 |
+
if len(results) > 0:
|
| 166 |
+
print(f"๊ฐ์ฅ ๋์ ์ ์ฌ๋ ์ ์: {results[0]['similarityScore']}")
|
| 167 |
+
print(f"์์ ๊ฒฐ๊ณผ ์ฑID: {results[0].get('chatId')}, ์ฃผ์ : {results[0].get('topic', '')[:50]}...")
|
| 168 |
+
print(f"์์ ๊ฒฐ๊ณผ ์์ ์ ์ฌ๋: {results[0]['rawSimilarities']}")
|
| 169 |
+
|
| 170 |
+
return results
|
|
|
|
|
|
|
| 171 |
|
| 172 |
except Exception as e:
|
| 173 |
print(f"๋ค์ค ์๋ฒ ๋ฉ ๊ฒ์ ์ค ์ค๋ฅ ๋ฐ์: {str(e)}")
|