Jake-seong committed on
Commit
6e0fcb3
·
verified ·
1 Parent(s): 9578ec7
Files changed (1) hide show
  1. app.py +7 -139
app.py CHANGED
@@ -3,13 +3,10 @@ import psycopg2
3
  from openai import OpenAI
4
  import json
5
  import os
6
- from typing import List, Dict, Tuple, Any
7
  from pgvector.psycopg2 import register_vector
8
  import numpy as np
9
  from datetime import datetime
10
- import re
11
- from sklearn.feature_extraction.text import TfidfVectorizer
12
- from sklearn.metrics.pairwise import cosine_similarity
13
 
14
  # DB 연결 설정
15
  def get_db_conn():
@@ -31,80 +28,6 @@ def get_embedding(text: str) -> List[float]:
31
  )
32
  return response.data[0].embedding
33
 
34
- def expand_query(query: str) -> str:
35
- """
36
- ์‚ฌ์šฉ์ž ์ฟผ๋ฆฌ๋ฅผ ํ™•์žฅํ•˜์—ฌ ๊ฒ€์ƒ‰ ํ’ˆ์งˆ์„ ๊ฐœ์„ ํ•ฉ๋‹ˆ๋‹ค.
37
- """
38
- # GPT๋ฅผ ํ™œ์šฉํ•œ ์ฟผ๋ฆฌ ํ™•์žฅ
39
- try:
40
- response = client.chat.completions.create(
41
- model="gpt-3.5-turbo",
42
- messages=[
43
- {"role": "system", "content": "๋‹น์‹ ์€ ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ ํ™•์žฅ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค. ์‚ฌ์šฉ์ž์˜ ์ฟผ๋ฆฌ๋ฅผ ๋ถ„์„ํ•˜๊ณ , ์ด์™€ ๊ด€๋ จ๋œ ํ‚ค์›Œ๋“œ์™€ ์งˆ๋ฌธ ํ˜•ํƒœ๋กœ ํ™•์žฅํ•˜์„ธ์š”."},
44
- {"role": "user", "content": f"๋‹ค์Œ ๊ฒ€์ƒ‰์–ด๋ฅผ ํ™•์žฅํ•ด์ฃผ์„ธ์š”: '{query}'"}
45
- ],
46
- temperature=0.3,
47
- max_tokens=150
48
- )
49
- expanded = query + " " + response.choices[0].message.content
50
- return expanded
51
- except:
52
- # ์˜ค๋ฅ˜ ๋ฐœ์ƒ ์‹œ ์›๋ณธ ์ฟผ๋ฆฌ ๋ฐ˜ํ™˜
53
- return query
54
-
55
- def extract_keywords(text: str) -> List[str]:
56
- """
57
- ํ…์ŠคํŠธ์—์„œ ์ค‘์š” ํ‚ค์›Œ๋“œ๋ฅผ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.
58
- """
59
- # ๋‹จ์ˆœํ•œ ํ‚ค์›Œ๋“œ ์ถ”์ถœ (๊ณ ๊ธ‰ NLP ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๋กœ ๋Œ€์ฒด ๊ฐ€๋Šฅ)
60
- # ๋ถˆ์šฉ์–ด ์ œ๊ฑฐ ๋ฐ ์ •๊ทœํ‘œํ˜„์‹์œผ๋กœ ํ‚ค์›Œ๋“œ ์ถ”์ถœ
61
- stop_words = {'์žˆ๋Š”', 'ํ•˜๋Š”', '๊ทธ๋ฆฌ๊ณ ', '์ž…๋‹ˆ๋‹ค', '๊ทธ๊ฒƒ์€', '์žˆ์Šต๋‹ˆ๋‹ค', 'ํ•ฉ๋‹ˆ๋‹ค', '๊ทธ๋Ÿฐ', '์ด๋Ÿฐ', '์ €๋Ÿฐ', '๊ทธ๋ƒฅ'}
62
- words = re.findall(r'\w+', text.lower())
63
- keywords = [w for w in words if len(w) > 1 and w not in stop_words]
64
- return list(set(keywords))
65
-
66
- def perform_hybrid_search(
67
- query: str,
68
- vector_results: List[Dict],
69
- keyword_weight: float = 0.3,
70
- similarity_threshold: float = 0.4
71
- ) -> List[Dict]:
72
- """
73
- ๋ฒกํ„ฐ ๊ฒ€์ƒ‰๊ณผ ํ‚ค์›Œ๋“œ ๊ฒ€์ƒ‰์„ ๊ฒฐํ•ฉํ•œ ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰์„ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค.
74
- """
75
- # ์ž„๊ณ„๊ฐ’ ๋ฏธ๋งŒ์˜ ๊ฒฐ๊ณผ ํ•„ํ„ฐ๋ง
76
- filtered_results = [r for r in vector_results if r["similarity"] >= similarity_threshold]
77
-
78
- if not filtered_results:
79
- # ๊ฒฐ๊ณผ๊ฐ€ ์—†์œผ๋ฉด ์ž„๊ณ„๊ฐ’์„ ๋‚ฎ์ถฐ์„œ ์žฌ์‹œ๋„
80
- filtered_results = [r for r in vector_results if r["similarity"] >= similarity_threshold * 0.7]
81
-
82
- if not filtered_results:
83
- return vector_results[:5] # ์—ฌ์ „ํžˆ ์—†์œผ๋ฉด ์ƒ์œ„ 5๊ฐœ ๋ฐ˜ํ™˜
84
-
85
- # ํ‚ค์›Œ๋“œ ๊ฒ€์ƒ‰ ๊ฐ€์ค‘์น˜ ์ ์šฉ
86
- keywords = extract_keywords(query)
87
-
88
- for result in filtered_results:
89
- content = result.get("content", "")
90
- keyword_matches = sum(1 for kw in keywords if kw.lower() in content.lower())
91
- keyword_score = keyword_matches / max(len(keywords), 1)
92
-
93
- # ์ตœ์ข… ์ ์ˆ˜ ๊ณ„์‚ฐ (๋ฒกํ„ฐ ์œ ์‚ฌ๋„ + ํ‚ค์›Œ๋“œ ๊ฐ€์ค‘์น˜)
94
- result["original_similarity"] = result["similarity"]
95
- result["keyword_score"] = keyword_score
96
- result["similarity"] = (1 - keyword_weight) * result["similarity"] + keyword_weight * keyword_score
97
-
98
- # ์ตœ์ข… ์ ์ˆ˜๋กœ ์žฌ์ •๋ ฌ
99
- return sorted(filtered_results, key=lambda x: x["similarity"], reverse=True)
100
-
101
- def preprocess_query(query: str) -> str:
102
- """
103
- ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ๋ฅผ ์ „์ฒ˜๋ฆฌํ•˜์—ฌ ๊ฒ€์ƒ‰ ํ’ˆ์งˆ์„ ๊ฐœ์„ ํ•ฉ๋‹ˆ๋‹ค.
104
- """
105
- # ๊ฒ€์ƒ‰์— ๋งž๊ฒŒ ํ”„๋กฌํ”„ํŠธ ์žฌ๊ตฌ์„ฑ
106
- return f"๋‹ค์Œ ์งˆ๋ฌธ์ด๋‚˜ ์ฃผ์ œ์™€ ๊ด€๋ จ๋œ ๋Œ€ํ™”๋ฅผ ์ฐพ์•„์ฃผ์„ธ์š”: {query}"
107
-
108
  def search_similar_chats(query: str, maxResults: int = 200) -> List[Dict]:
109
  """
110
  유사한 채팅 문서를 검색합니다.
@@ -114,20 +37,13 @@ def search_similar_chats(query: str, maxResults: int = 200) -> List[Dict]:
114
  Returns:
115
  List[Dict]: 검색 결과 목록
116
  """
117
- # ์ฟผ๋ฆฌ ์ „์ฒ˜๋ฆฌ ๋ฐ ํ™•์žฅ
118
- processed_query = preprocess_query(query)
119
- try:
120
- expanded_query = expand_query(processed_query)
121
- except:
122
- expanded_query = processed_query
123
-
124
- embedding = np.array(get_embedding(expanded_query))
125
  conn = get_db_conn()
126
  register_vector(conn)
127
 
128
  try:
129
  with conn.cursor() as cur:
130
- # 코사인 유사도 계산
131
  cur.execute("""
132
  SELECT id, metadata, content,
133
  1 - (embedding <=> %s) AS similarity
@@ -137,23 +53,12 @@ def search_similar_chats(query: str, maxResults: int = 200) -> List[Dict]:
137
  """, (embedding, maxResults))
138
 
139
  rows = cur.fetchall()
140
-
141
- results = [{
142
  "id": row[0],
143
  "metadata": row[1],
144
  "content": row[2],
145
  "similarity": float(row[3])
146
  } for row in rows]
147
-
148
- # 하이브리드 검색 적용
149
- results = perform_hybrid_search(
150
- query,
151
- results,
152
- keyword_weight=0.3,
153
- similarity_threshold=0.4
154
- )
155
-
156
- return results
157
  except Exception as e:
158
  raise RuntimeError(f"DB ๊ฒ€์ƒ‰ ์˜ค๋ฅ˜: {str(e)}")
159
  finally:
@@ -182,14 +87,7 @@ def search_similar_chats_by_date(
182
  except ValueError as e:
183
  raise ValueError(f"๋‚ ์งœ ํ˜•์‹ ์˜ค๋ฅ˜: {e}")
184
 
185
- # ์ฟผ๋ฆฌ ์ „์ฒ˜๋ฆฌ ๋ฐ ํ™•์žฅ
186
- processed_query = preprocess_query(query)
187
- try:
188
- expanded_query = expand_query(processed_query)
189
- except:
190
- expanded_query = processed_query
191
-
192
- embedding = np.array(get_embedding(expanded_query))
193
  conn = get_db_conn()
194
  register_vector(conn)
195
 
@@ -217,42 +115,12 @@ def search_similar_chats_by_date(
217
  cur.execute(base_query, tuple(params))
218
  rows = cur.fetchall()
219
 
220
- results = [{
221
  "id": row[0],
222
  "metadata": row[1],
223
  "content": row[2],
224
  "similarity": float(row[3])
225
  } for row in rows]
226
-
227
- # 하이브리드 검색 적용
228
- results = perform_hybrid_search(
229
- query,
230
- results,
231
- keyword_weight=0.3,
232
- similarity_threshold=0.4
233
- )
234
-
235
- # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๊ธฐ๋ฐ˜ ๊ฐ€์ค‘์น˜ ์ ์šฉ
236
- keywords = extract_keywords(query)
237
- for result in results:
238
- metadata = result.get("metadata", {})
239
- if not metadata or isinstance(metadata, str):
240
- continue
241
-
242
- # ์ฃผ์ œ(topic) ํ•„๋“œ์— ํ‚ค์›Œ๋“œ๊ฐ€ ์žˆ๋Š”์ง€ ํ™•์ธ
243
- topic = metadata.get("topic", "")
244
- topic_matches = sum(1 for kw in keywords if kw.lower() in topic.lower())
245
-
246
- # ์ฃผ์ œ ์ผ์น˜ ๊ฐ€์ค‘์น˜ ์ ์šฉ
247
- if topic_matches > 0:
248
- topic_boost = 0.1 * min(topic_matches, 3) # ์ตœ๋Œ€ 0.3 ๊ฐ€์ค‘์น˜
249
- result["similarity"] += topic_boost
250
- result["topic_boost"] = topic_boost
251
-
252
- # ๊ฒฐ๊ณผ ์žฌ์ •๋ ฌ
253
- results = sorted(results, key=lambda x: x["similarity"], reverse=True)
254
-
255
- return results
256
  except Exception as e:
257
  raise RuntimeError(f"DB ๊ฒ€์ƒ‰ ์˜ค๋ฅ˜: {str(e)}")
258
  finally:
@@ -265,4 +133,4 @@ with gr.Blocks() as demo:
265
  gr.Interface(fn=search_similar_chats_by_date, inputs=["text", "text", "text", "number"], outputs="json", api_name="search_similar_chats_by_date")
266
 
267
  if __name__ == "__main__":
268
- demo.launch(mcp_server=True)
 
3
  from openai import OpenAI
4
  import json
5
  import os
6
+ from typing import List, Dict
7
  from pgvector.psycopg2 import register_vector
8
  import numpy as np
9
  from datetime import datetime
 
 
 
10
 
11
  # DB 연결 설정
12
  def get_db_conn():
 
28
  )
29
  return response.data[0].embedding
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def search_similar_chats(query: str, maxResults: int = 200) -> List[Dict]:
32
  """
33
  유사한 채팅 문서를 검색합니다.
 
37
  Returns:
38
  List[Dict]: 검색 결과 목록
39
  """
40
+ embedding = np.array(get_embedding(query))
 
 
 
 
 
 
 
41
  conn = get_db_conn()
42
  register_vector(conn)
43
 
44
  try:
45
  with conn.cursor() as cur:
46
+ # 코사인 유사도 연산자 변경 (<=> 사용)
47
  cur.execute("""
48
  SELECT id, metadata, content,
49
  1 - (embedding <=> %s) AS similarity
 
53
  """, (embedding, maxResults))
54
 
55
  rows = cur.fetchall()
56
+ return [{
 
57
  "id": row[0],
58
  "metadata": row[1],
59
  "content": row[2],
60
  "similarity": float(row[3])
61
  } for row in rows]
 
 
 
 
 
 
 
 
 
 
62
  except Exception as e:
63
  raise RuntimeError(f"DB ๊ฒ€์ƒ‰ ์˜ค๋ฅ˜: {str(e)}")
64
  finally:
 
87
  except ValueError as e:
88
  raise ValueError(f"๋‚ ์งœ ํ˜•์‹ ์˜ค๋ฅ˜: {e}")
89
 
90
+ embedding = np.array(get_embedding(query))
 
 
 
 
 
 
 
91
  conn = get_db_conn()
92
  register_vector(conn)
93
 
 
115
  cur.execute(base_query, tuple(params))
116
  rows = cur.fetchall()
117
 
118
+ return [{
119
  "id": row[0],
120
  "metadata": row[1],
121
  "content": row[2],
122
  "similarity": float(row[3])
123
  } for row in rows]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  except Exception as e:
125
  raise RuntimeError(f"DB ๊ฒ€์ƒ‰ ์˜ค๋ฅ˜: {str(e)}")
126
  finally:
 
133
  gr.Interface(fn=search_similar_chats_by_date, inputs=["text", "text", "text", "number"], outputs="json", api_name="search_similar_chats_by_date")
134
 
135
  if __name__ == "__main__":
136
+ demo.launch(mcp_server=True)