SOY NV AI committed on
Commit
fa87e9c
·
1 Parent(s): d54e6a9

feat: Add Re-ranking system and improve AI response prompts

Browse files

- Add vector DB (Chroma) integration

- Implement vector search and Cross-Encoder re-ranking

- Add prompts for citing sources in AI responses

- Add prompt to say 'Content not found' when no evidence exists

- Update requirements.txt with sentence-transformers and chromadb

Files changed (3) hide show
  1. app/routes.py +148 -28
  2. app/vector_db.py +281 -0
  3. requirements.txt +3 -0
app/routes.py CHANGED
@@ -2,6 +2,7 @@ from flask import Blueprint, render_template, request, jsonify, send_from_direct
2
  from flask_login import login_user, logout_user, login_required, current_user
3
  from werkzeug.utils import secure_filename
4
  from app.database import db, UploadedFile, User, ChatSession, ChatMessage, DocumentChunk, ParentChunk
 
5
  import requests
6
  import os
7
  from datetime import datetime
@@ -177,15 +178,21 @@ def split_text_into_chunks(text, min_chunk_size=200, max_chunk_size=1000, overla
177
  return final_chunks if final_chunks else [text] if text.strip() else []
178
 
179
  def create_chunks_for_file(file_id, content):
180
- """ํŒŒ์ผ ๋‚ด์šฉ์„ ์˜๋ฏธ ๊ธฐ๋ฐ˜ ์ฒญํฌ๋กœ ๋ถ„ํ• ํ•˜์—ฌ ์ €์žฅ"""
181
  try:
182
  print(f"[์ฒญํฌ ์ƒ์„ฑ] ํŒŒ์ผ ID {file_id}์— ๋Œ€ํ•œ ์ฒญํฌ ์ƒ์„ฑ ์‹œ์ž‘")
183
  print(f"[์ฒญํฌ ์ƒ์„ฑ] ์›๋ณธ ํ…์ŠคํŠธ ๊ธธ์ด: {len(content)}์ž")
184
 
185
- # ๊ธฐ์กด ์ฒญํฌ ์‚ญ์ œ
186
- existing_chunks = DocumentChunk.query.filter_by(file_id=file_id).count()
187
- if existing_chunks > 0:
188
- print(f"[์ฒญํฌ ์ƒ์„ฑ] ๊ธฐ์กด ์ฒญํฌ {existing_chunks}๊ฐœ ์‚ญ์ œ")
 
 
 
 
 
 
189
  DocumentChunk.query.filter_by(file_id=file_id).delete()
190
  db.session.commit()
191
 
@@ -198,27 +205,40 @@ def create_chunks_for_file(file_id, content):
198
  print(f"[์ฒญํฌ ์ƒ์„ฑ] ๊ฒฝ๊ณ : ์ฒญํฌ๊ฐ€ ์ƒ์„ฑ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. ํ…์ŠคํŠธ๊ฐ€ ๋„ˆ๋ฌด ์งง๊ฑฐ๋‚˜ ๋น„์–ด์žˆ์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")
199
  return 0
200
 
201
- # ๊ฐ ์ฒญํฌ๋ฅผ ๋ฐ์ดํ„ฐ๋ฒ ์ด์Šค์— ์ €์žฅ
202
  saved_count = 0
 
203
  for idx, chunk_content in enumerate(chunks):
204
  try:
 
205
  chunk = DocumentChunk(
206
  file_id=file_id,
207
  chunk_index=idx,
208
  content=chunk_content
209
  )
210
  db.session.add(chunk)
 
 
 
 
 
 
 
 
 
 
 
211
  saved_count += 1
212
 
213
  # ์ง„ํ–‰ ์ƒํ™ฉ ์ถœ๋ ฅ (10๊ฐœ๋งˆ๋‹ค)
214
  if (idx + 1) % 10 == 0:
215
- print(f"[์ฒญํฌ ์ƒ์„ฑ] ์ง„ํ–‰ ์ค‘: {idx + 1}/{len(chunks)}๊ฐœ ์ฒญํฌ ์ €์žฅ ์ค‘...")
216
  except Exception as e:
217
  print(f"[์ฒญํฌ ์ƒ์„ฑ] ๊ฒฝ๊ณ : ์ฒญํฌ {idx} ์ €์žฅ ์ค‘ ์˜ค๋ฅ˜: {str(e)}")
218
  continue
219
 
220
  db.session.commit()
221
- print(f"[์ฒญํฌ ์ƒ์„ฑ] ์™„๋ฃŒ: {saved_count}๊ฐœ ์ฒญํฌ๊ฐ€ ๋ฐ์ดํ„ฐ๋ฒ ์ด์Šค์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
222
 
223
  # ์ €์žฅ ํ™•์ธ
224
  verified_count = DocumentChunk.query.filter_by(file_id=file_id).count()
@@ -440,8 +460,86 @@ def get_parent_chunks_for_files(file_ids):
440
  print(f"[Parent Chunk ์กฐํšŒ] ์˜ค๋ฅ˜: {str(e)}")
441
  return []
442
 
443
- def search_relevant_chunks(query, file_ids=None, model_name=None, top_k=25, min_score=1):
444
- """์งˆ๋ฌธ๊ณผ ๊ด€๋ จ๋œ ์ฒญํฌ ๊ฒ€์ƒ‰ (๊ฐœ์„ ๋œ ํ‚ค์›Œ๋“œ ๊ธฐ๋ฐ˜ ๊ฒ€์ƒ‰) - Child Chunk ์ •๋ฐ€ ๊ฒ€์ƒ‰"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445
  try:
446
  # ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ ์ค€๋น„ - ํ•œ๊ธ€๊ณผ ์˜๋ฌธ ๋‹จ์–ด ๋ชจ๋‘ ์ถ”์ถœ
447
  query_words = set(re.findall(r'[๊ฐ€-ํžฃ]+|\w+', query.lower()))
@@ -507,21 +605,12 @@ def search_relevant_chunks(query, file_ids=None, model_name=None, top_k=25, min_
507
  # ์ ์ˆ˜ ์ˆœ์œผ๋กœ ์ •๋ ฌํ•˜๊ณ  ์ƒ์œ„ k๊ฐœ ์„ ํƒ
508
  scored_chunks.sort(key=lambda x: x[0], reverse=True)
509
 
510
- # top_k๊ฐœ ์„ ํƒํ•˜๋˜, ์ ์ˆ˜๊ฐ€ ๋น„์Šทํ•œ ์ฒญํฌ๋„ ํฌํ•จ (์ ์ˆ˜ ์ฐจ์ด๊ฐ€ 30% ์ด๋‚ด๋ฉด ํฌํ•จ)
511
- top_chunks = []
512
- if scored_chunks:
513
- max_score = scored_chunks[0][0]
514
- threshold = max_score * 0.7 # ์ตœ๊ณ  ์ ์ˆ˜์˜ 70% ์ด์ƒ์ธ ์ฒญํฌ๋„ ํฌํ•จ
515
-
516
- for score, chunk in scored_chunks:
517
- if len(top_chunks) < top_k or score >= threshold:
518
- top_chunks.append(chunk)
519
- else:
520
- break
521
 
522
  return top_chunks
523
  except Exception as e:
524
- print(f"์ฒญํฌ ๊ฒ€์ƒ‰ ์˜ค๋ฅ˜: {str(e)}")
525
  import traceback
526
  traceback.print_exc()
527
  return []
@@ -813,16 +902,16 @@ def chat():
813
  parent_chunks = get_parent_chunks_for_files(file_ids)
814
  print(f"[RAG ๊ฒ€์ƒ‰ 1๋‹จ๊ณ„] Parent Chunk ์กฐํšŒ ์™„๋ฃŒ: {len(parent_chunks)}๊ฐœ ํŒŒ์ผ")
815
 
816
- # 2๋‹จ๊ณ„: Child Chunk๋กœ ์ •๋ฐ€ ๊ฒ€์ƒ‰
817
- print(f"[RAG ๊ฒ€์ƒ‰ 2๋‹จ๊ณ„] Child Chunk ์ •๋ฐ€ ๊ฒ€์ƒ‰ ์‹œ์ž‘...")
818
  relevant_chunks = search_relevant_chunks(
819
  query=message,
820
  file_ids=file_ids if file_ids else None,
821
  model_name=model,
822
- top_k=25, # 25๊ฐœ ์ฒญํฌ ๊ฒ€์ƒ‰
823
  min_score=0.5 # ์ตœ์†Œ ์ ์ˆ˜ ์ž„๊ณ„๊ฐ’
824
  )
825
- print(f"[RAG ๊ฒ€์ƒ‰ 2๋‹จ๊ณ„] Child Chunk ๊ฒ€์ƒ‰ ์™„๋ฃŒ: {len(relevant_chunks)}๊ฐœ ์ฒญํฌ")
826
 
827
  # ์ปจํ…์ŠคํŠธ ๊ตฌ์„ฑ
828
  context_parts = []
@@ -902,6 +991,10 @@ def chat():
902
  - ๊ทธ ๋‹ค์Œ ๊ตฌ์ฒด์ ์ธ ๋‚ด์šฉ(Child Chunk)์„ ํ†ตํ•ด ์งˆ๋ฌธ์— ๋Œ€ํ•œ ์ •ํ™•ํ•œ ๋‹ต๋ณ€์„ ์ œ๊ณตํ•˜์„ธ์š”.
903
  - ์›น์†Œ์„ค์˜ ๋งฅ๋ฝ๊ณผ ์Šคํ† ๋ฆฌ๋ฅผ ๊ณ ๋ คํ•˜์—ฌ ์ผ๊ด€์„ฑ ์žˆ๋Š” ๋‹ต๋ณ€์„ ์ž‘์„ฑํ•˜์„ธ์š”.
904
 
 
 
 
 
905
  ์งˆ๋ฌธ:
906
  """
907
  elif parent_chunks:
@@ -912,6 +1005,10 @@ def chat():
912
 
913
  ์œ„ ์ •๋ณด๋ฅผ ์ฐธ๊ณ ํ•˜์—ฌ ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•ด์ฃผ์„ธ์š”. ์›น์†Œ์„ค์˜ ๋ฐฐ๊ฒฝ๊ณผ ์„ค์ •์„ ๊ณ ๋ คํ•˜์—ฌ ๋‹ต๋ณ€ํ•˜์„ธ์š”.
914
 
 
 
 
 
915
  ์งˆ๋ฌธ:
916
  """
917
  else:
@@ -920,7 +1017,11 @@ def chat():
920
 
921
  {full_context}
922
 
923
- ์œ„ ๋‚ด์šฉ์„ ์ถฉ๋ถ„ํžˆ ์ฐธ๊ณ ํ•˜์—ฌ ๋‹ค์Œ ์งˆ๋ฌธ์— ์ •ํ™•ํ•˜๊ณ  ์ƒ์„ธํ•˜๊ฒŒ ๋‹ต๋ณ€ํ•ด์ฃผ์„ธ์š”. ์›น์†Œ์„ค์˜ ๋งฅ๋ฝ๊ณผ ์Šคํ† ๋ฆฌ๋ฅผ ๊ณ ๋ คํ•˜์—ฌ ๋‹ต๋ณ€ํ•ด์ฃผ์„ธ์š”:
 
 
 
 
924
 
925
  ์งˆ๋ฌธ:
926
  """
@@ -981,7 +1082,18 @@ def chat():
981
 
982
  if context_parts:
983
  context = "\n\n".join(context_parts)
984
- context = f"๋‹ค์Œ์€ ํ•™์Šต๋œ ์›น์†Œ์„ค ๋‚ด์šฉ์ž…๋‹ˆ๋‹ค:\n\n{context}\n\n์œ„ ๋‚ด์šฉ์„ ์ฐธ๊ณ ํ•˜์—ฌ ๋‹ค์Œ ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•ด์ฃผ์„ธ์š”:\n\n"
 
 
 
 
 
 
 
 
 
 
 
985
 
986
  # ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ
987
  full_prompt = context + message if context else message
@@ -1567,6 +1679,14 @@ def delete_file(file_id):
1567
  DocumentChunk.query.filter_by(file_id=file_to_delete.id).delete()
1568
  print(f"[ํŒŒ์ผ ์‚ญ์ œ] Child Chunk {child_chunk_count}๊ฐœ ์‚ญ์ œ ์™„๋ฃŒ")
1569
 
 
 
 
 
 
 
 
 
1570
  # ๊ด€๋ จ Parent Chunk ์‚ญ์ œ
1571
  parent_chunk = ParentChunk.query.filter_by(file_id=file_to_delete.id).first()
1572
  if parent_chunk:
 
2
  from flask_login import login_user, logout_user, login_required, current_user
3
  from werkzeug.utils import secure_filename
4
  from app.database import db, UploadedFile, User, ChatSession, ChatMessage, DocumentChunk, ParentChunk
5
+ from app.vector_db import get_vector_db
6
  import requests
7
  import os
8
  from datetime import datetime
 
178
  return final_chunks if final_chunks else [text] if text.strip() else []
179
 
180
  def create_chunks_for_file(file_id, content):
181
+ """ํŒŒ์ผ ๋‚ด์šฉ์„ ์˜๋ฏธ ๊ธฐ๋ฐ˜ ์ฒญํฌ๋กœ ๋ถ„ํ• ํ•˜์—ฌ ์ €์žฅ (๋ฒกํ„ฐ DB ํฌํ•จ)"""
182
  try:
183
  print(f"[์ฒญํฌ ์ƒ์„ฑ] ํŒŒ์ผ ID {file_id}์— ๋Œ€ํ•œ ์ฒญํฌ ์ƒ์„ฑ ์‹œ์ž‘")
184
  print(f"[์ฒญํฌ ์ƒ์„ฑ] ์›๋ณธ ํ…์ŠคํŠธ ๊ธธ์ด: {len(content)}์ž")
185
 
186
+ # ๋ฒกํ„ฐ DB ๋งค๋‹ˆ์ € ๊ฐ€์ ธ์˜ค๊ธฐ
187
+ vector_db = get_vector_db()
188
+
189
+ # ๊ธฐ์กด ์ฒญํฌ ์‚ญ์ œ (DB + ๋ฒกํ„ฐ DB)
190
+ existing_chunks = DocumentChunk.query.filter_by(file_id=file_id).all()
191
+ if existing_chunks:
192
+ print(f"[์ฒญํฌ ์ƒ์„ฑ] ๊ธฐ์กด ์ฒญํฌ {len(existing_chunks)}๊ฐœ ์‚ญ์ œ ์ค‘...")
193
+ # ๋ฒกํ„ฐ DB์—์„œ ์‚ญ์ œ
194
+ vector_db.delete_chunks_by_file_id(file_id)
195
+ # DB์—์„œ ์‚ญ์ œ
196
  DocumentChunk.query.filter_by(file_id=file_id).delete()
197
  db.session.commit()
198
 
 
205
  print(f"[์ฒญํฌ ์ƒ์„ฑ] ๊ฒฝ๊ณ : ์ฒญํฌ๊ฐ€ ์ƒ์„ฑ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. ํ…์ŠคํŠธ๊ฐ€ ๋„ˆ๋ฌด ์งง๊ฑฐ๋‚˜ ๋น„์–ด์žˆ์„ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.")
206
  return 0
207
 
208
+ # ๊ฐ ์ฒญํฌ๋ฅผ ๋ฐ์ดํ„ฐ๋ฒ ์ด์Šค์™€ ๋ฒกํ„ฐ DB์— ์ €์žฅ
209
  saved_count = 0
210
+ vector_saved_count = 0
211
  for idx, chunk_content in enumerate(chunks):
212
  try:
213
+ # DB์— ์ฒญํฌ ์ €์žฅ
214
  chunk = DocumentChunk(
215
  file_id=file_id,
216
  chunk_index=idx,
217
  content=chunk_content
218
  )
219
  db.session.add(chunk)
220
+ db.session.flush() # ID ์ƒ์„ฑ
221
+
222
+ # ๋ฒกํ„ฐ DB์— ์ฒญํฌ ์ถ”๊ฐ€
223
+ if vector_db.add_chunk(
224
+ chunk_id=chunk.id,
225
+ chunk_content=chunk_content,
226
+ file_id=file_id,
227
+ chunk_index=idx
228
+ ):
229
+ vector_saved_count += 1
230
+
231
  saved_count += 1
232
 
233
  # ์ง„ํ–‰ ์ƒํ™ฉ ์ถœ๋ ฅ (10๊ฐœ๋งˆ๋‹ค)
234
  if (idx + 1) % 10 == 0:
235
+ print(f"[์ฒญํฌ ์ƒ์„ฑ] ์ง„ํ–‰ ์ค‘: {idx + 1}/{len(chunks)}๊ฐœ ์ฒญํฌ ์ €์žฅ ์ค‘... (DB: {saved_count}, ๋ฒกํ„ฐ DB: {vector_saved_count})")
236
  except Exception as e:
237
  print(f"[์ฒญํฌ ์ƒ์„ฑ] ๊ฒฝ๊ณ : ์ฒญํฌ {idx} ์ €์žฅ ์ค‘ ์˜ค๋ฅ˜: {str(e)}")
238
  continue
239
 
240
  db.session.commit()
241
+ print(f"[์ฒญํฌ ์ƒ์„ฑ] ์™„๋ฃŒ: {saved_count}๊ฐœ ์ฒญํฌ๊ฐ€ ๋ฐ์ดํ„ฐ๋ฒ ์ด์Šค์— ์ €์žฅ๋˜์—ˆ์Šต๋‹ˆ๋‹ค. (๋ฒกํ„ฐ DB: {vector_saved_count}๊ฐœ)")
242
 
243
  # ์ €์žฅ ํ™•์ธ
244
  verified_count = DocumentChunk.query.filter_by(file_id=file_id).count()
 
460
  print(f"[Parent Chunk ์กฐํšŒ] ์˜ค๋ฅ˜: {str(e)}")
461
  return []
462
 
463
+ def search_relevant_chunks(query, file_ids=None, model_name=None, top_k=5, min_score=1):
464
+ """
465
+ ์งˆ๋ฌธ๊ณผ ๊ด€๋ จ๋œ ์ฒญํฌ ๊ฒ€์ƒ‰ (๋ฒกํ„ฐ ๊ฒ€์ƒ‰ + Re-ranking)
466
+ 1. ๋ฒกํ„ฐ ๊ฒ€์ƒ‰์œผ๋กœ ์ดˆ๊ธฐ 30๊ฐœ ๋ฌธ์„œ ๊ฒ€์ƒ‰
467
+ 2. Cross-Encoder๋กœ ๋ฆฌ๋žญํ‚น
468
+ 3. ์ƒ์œ„ top_k๊ฐœ ๋ฐ˜ํ™˜ (๊ธฐ๋ณธ 5๊ฐœ)
469
+ """
470
+ try:
471
+ # ๋ฒกํ„ฐ DB ๋งค๋‹ˆ์ € ๊ฐ€์ ธ์˜ค๊ธฐ
472
+ vector_db = get_vector_db()
473
+
474
+ # ํŒŒ์ผ ID ํ™•์žฅ (์ด์–ด์„œ ์—…๋กœ๋“œ๋œ ํŒŒ์ผ ํฌํ•จ)
475
+ expanded_file_ids = None
476
+ if file_ids:
477
+ expanded_file_ids = list(file_ids)
478
+ for file_id in file_ids:
479
+ # ์›๋ณธ ํŒŒ์ผ์ธ ๊ฒฝ์šฐ ์ด์–ด์„œ ์—…๋กœ๋“œ๋œ ํŒŒ์ผ๋“ค๋„ ํฌํ•จ
480
+ child_files = UploadedFile.query.filter_by(parent_file_id=file_id).all()
481
+ expanded_file_ids.extend([child.id for child in child_files])
482
+
483
+ # ์›๋ณธ ํŒŒ์ผ์ด ์„ ํƒ๋œ ๊ฒฝ์šฐ, ์ด์–ด์„œ ์—…๋กœ๋“œ๋œ ํŒŒ์ผ๋“ค๋„ ํฌํ•จ
484
+ parent_files = UploadedFile.query.filter(UploadedFile.id.in_(file_ids), UploadedFile.parent_file_id.is_(None)).all()
485
+ for parent_file in parent_files:
486
+ child_files = UploadedFile.query.filter_by(parent_file_id=parent_file.id).all()
487
+ expanded_file_ids.extend([child.id for child in child_files])
488
+
489
+ # ๋ชจ๋ธ ํ•„ํ„ฐ๋ง์ด ํ•„์š”ํ•œ ๊ฒฝ์šฐ ํŒŒ์ผ ID ํ•„ํ„ฐ๋ง
490
+ if model_name and expanded_file_ids:
491
+ filtered_files = UploadedFile.query.filter(
492
+ UploadedFile.id.in_(expanded_file_ids),
493
+ UploadedFile.model_name == model_name
494
+ ).all()
495
+ expanded_file_ids = [f.id for f in filtered_files]
496
+ elif model_name and not expanded_file_ids:
497
+ # ํŒŒ์ผ ID๊ฐ€ ์—†์œผ๋ฉด ๋ชจ๋ธ ์ด๋ฆ„์œผ๋กœ๋งŒ ํ•„ํ„ฐ๋ง
498
+ filtered_files = UploadedFile.query.filter_by(model_name=model_name).all()
499
+ expanded_file_ids = [f.id for f in filtered_files]
500
+
501
+ # 1๋‹จ๊ณ„: ๋ฒกํ„ฐ ๊ฒ€์ƒ‰์œผ๋กœ ์ดˆ๊ธฐ 30๊ฐœ ๋ฌธ์„œ ๊ฒ€์ƒ‰
502
+ print(f"[๋ฒกํ„ฐ ๊ฒ€์ƒ‰] ์ฟผ๋ฆฌ: {query[:50]}..., ํŒŒ์ผ ID: {expanded_file_ids if expanded_file_ids else '๋ชจ๋“  ํŒŒ์ผ'}")
503
+ vector_results = vector_db.search_chunks(
504
+ query=query,
505
+ file_ids=expanded_file_ids,
506
+ top_k=30
507
+ )
508
+
509
+ if not vector_results:
510
+ print(f"[๋ฒกํ„ฐ ๊ฒ€์ƒ‰] ๊ฒฐ๊ณผ ์—†์Œ, ํ‚ค์›Œ๋“œ ๊ธฐ๋ฐ˜ ๊ฒ€์ƒ‰์œผ๋กœ ๋Œ€์ฒด")
511
+ # ๋ฒกํ„ฐ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๊ฐ€ ์—†์œผ๋ฉด ๊ธฐ์กด ํ‚ค์›Œ๋“œ ๊ธฐ๋ฐ˜ ๊ฒ€์ƒ‰์œผ๋กœ ๋Œ€์ฒด
512
+ return search_relevant_chunks_fallback(query, file_ids, model_name, top_k, min_score)
513
+
514
+ # 2๋‹จ๊ณ„: Cross-Encoder๋กœ ๋ฆฌ๋žญํ‚น
515
+ print(f"[๋ฆฌ๋žญํ‚น] {len(vector_results)}๊ฐœ ์ฒญํฌ์— ๋Œ€ํ•œ ๋ฆฌ๋žญํ‚น ์‹œ์ž‘...")
516
+ reranked_chunks = vector_db.rerank_chunks(
517
+ query=query,
518
+ chunks=vector_results,
519
+ top_k=top_k
520
+ )
521
+
522
+ # 3๋‹จ๊ณ„: DB์—์„œ ์ฒญํฌ ๊ฐ์ฒด ๊ฐ€์ ธ์˜ค๊ธฐ
523
+ final_chunks = []
524
+ for reranked in reranked_chunks:
525
+ chunk_id = reranked['chunk_id']
526
+ chunk = DocumentChunk.query.get(chunk_id)
527
+ if chunk:
528
+ final_chunks.append(chunk)
529
+
530
+ print(f"[๋ฒกํ„ฐ ๊ฒ€์ƒ‰ + ๋ฆฌ๋žญํ‚น] ์ตœ์ข… {len(final_chunks)}๊ฐœ ์ฒญํฌ ๋ฐ˜ํ™˜")
531
+ return final_chunks
532
+
533
+ except Exception as e:
534
+ print(f"[๋ฒกํ„ฐ ๊ฒ€์ƒ‰] ์˜ค๋ฅ˜: {str(e)}")
535
+ import traceback
536
+ traceback.print_exc()
537
+ # ์˜ค๋ฅ˜ ์‹œ ๊ธฐ์กด ํ‚ค์›Œ๋“œ ๊ธฐ๋ฐ˜ ๊ฒ€์ƒ‰์œผ๋กœ ๋Œ€์ฒด
538
+ print(f"[๋ฒกํ„ฐ ๊ฒ€์ƒ‰] ํ‚ค์›Œ๋“œ ๊ธฐ๋ฐ˜ ๊ฒ€์ƒ‰์œผ๋กœ ๋Œ€์ฒด")
539
+ return search_relevant_chunks_fallback(query, file_ids, model_name, top_k, min_score)
540
+
541
+ def search_relevant_chunks_fallback(query, file_ids=None, model_name=None, top_k=25, min_score=1):
542
+ """๊ธฐ์กด ํ‚ค์›Œ๋“œ ๊ธฐ๋ฐ˜ ๊ฒ€์ƒ‰ (Fallback)"""
543
  try:
544
  # ๊ฒ€์ƒ‰ ์ฟผ๋ฆฌ ์ค€๋น„ - ํ•œ๊ธ€๊ณผ ์˜๋ฌธ ๋‹จ์–ด ๋ชจ๋‘ ์ถ”์ถœ
545
  query_words = set(re.findall(r'[๊ฐ€-ํžฃ]+|\w+', query.lower()))
 
605
  # ์ ์ˆ˜ ์ˆœ์œผ๋กœ ์ •๋ ฌํ•˜๊ณ  ์ƒ์œ„ k๊ฐœ ์„ ํƒ
606
  scored_chunks.sort(key=lambda x: x[0], reverse=True)
607
 
608
+ # top_k๊ฐœ ์„ ํƒ
609
+ top_chunks = [chunk for score, chunk in scored_chunks[:top_k]]
 
 
 
 
 
 
 
 
 
610
 
611
  return top_chunks
612
  except Exception as e:
613
+ print(f"[ํ‚ค์›Œ๋“œ ๊ฒ€์ƒ‰] ์˜ค๋ฅ˜: {str(e)}")
614
  import traceback
615
  traceback.print_exc()
616
  return []
 
902
  parent_chunks = get_parent_chunks_for_files(file_ids)
903
  print(f"[RAG ๊ฒ€์ƒ‰ 1๋‹จ๊ณ„] Parent Chunk ์กฐํšŒ ์™„๋ฃŒ: {len(parent_chunks)}๊ฐœ ํŒŒ์ผ")
904
 
905
+ # 2๋‹จ๊ณ„: ๋ฒกํ„ฐ ๊ฒ€์ƒ‰ + ๋ฆฌ๋žญํ‚น์œผ๋กœ Child Chunk ์ •๋ฐ€ ๊ฒ€์ƒ‰
906
+ print(f"[RAG ๊ฒ€์ƒ‰ 2๋‹จ๊ณ„] ๋ฒกํ„ฐ ๊ฒ€์ƒ‰ + ๋ฆฌ๋žญํ‚น ์‹œ์ž‘...")
907
  relevant_chunks = search_relevant_chunks(
908
  query=message,
909
  file_ids=file_ids if file_ids else None,
910
  model_name=model,
911
+ top_k=5, # ๋ฆฌ๋žญํ‚น ํ›„ ์ƒ์œ„ 5๊ฐœ๋งŒ ์„ ํƒ
912
  min_score=0.5 # ์ตœ์†Œ ์ ์ˆ˜ ์ž„๊ณ„๊ฐ’
913
  )
914
+ print(f"[RAG ๊ฒ€์ƒ‰ 2๋‹จ๊ณ„] ๋ฒกํ„ฐ ๊ฒ€์ƒ‰ + ๋ฆฌ๋žญํ‚น ์™„๋ฃŒ: {len(relevant_chunks)}๊ฐœ ์ฒญํฌ (์ƒ์œ„ 5๊ฐœ)")
915
 
916
  # ์ปจํ…์ŠคํŠธ ๊ตฌ์„ฑ
917
  context_parts = []
 
991
  - ๊ทธ ๋‹ค์Œ ๊ตฌ์ฒด์ ์ธ ๋‚ด์šฉ(Child Chunk)์„ ํ†ตํ•ด ์งˆ๋ฌธ์— ๋Œ€ํ•œ ์ •ํ™•ํ•œ ๋‹ต๋ณ€์„ ์ œ๊ณตํ•˜์„ธ์š”.
992
  - ์›น์†Œ์„ค์˜ ๋งฅ๋ฝ๊ณผ ์Šคํ† ๋ฆฌ๋ฅผ ๊ณ ๋ คํ•˜์—ฌ ์ผ๊ด€์„ฑ ์žˆ๋Š” ๋‹ต๋ณ€์„ ์ž‘์„ฑํ•˜์„ธ์š”.
993
 
994
+ ์ค‘์š”: ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•  ๋•Œ๋Š” ๋ฐ˜๋“œ์‹œ ์ œ๊ณต๋œ [์†Œ์„ค ๋ณธ๋ฌธ] ๋‚ด์˜ ๋‚ด์šฉ์„ ๊ทผ๊ฑฐ๋กœ ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
995
+ ๋‹ต๋ณ€์˜ ๊ฐ ๋ฌธ์žฅ ๋์—๋Š” ์ฐธ๊ณ ํ•œ ๋ณธ๋ฌธ์˜ ๋ฌธ์žฅ์„ [๊ทผ๊ฑฐ: "๋ฌธ์žฅ ๋‚ด์šฉ..."] ํ˜•์‹์œผ๋กœ ๋ฐ˜๋“œ์‹œ ๋ถ™์ด์„ธ์š”.
996
+ ๊ทผ๊ฑฐ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†๋‹ค๋ฉด "๋‚ด์šฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"๋ผ๊ณ  ๋‹ตํ•˜๊ณ  ์ง€์–ด๋‚ด์ง€ ๋งˆ์„ธ์š”.
997
+
998
  ์งˆ๋ฌธ:
999
  """
1000
  elif parent_chunks:
 
1005
 
1006
  ์œ„ ์ •๋ณด๋ฅผ ์ฐธ๊ณ ํ•˜์—ฌ ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•ด์ฃผ์„ธ์š”. ์›น์†Œ์„ค์˜ ๋ฐฐ๊ฒฝ๊ณผ ์„ค์ •์„ ๊ณ ๋ คํ•˜์—ฌ ๋‹ต๋ณ€ํ•˜์„ธ์š”.
1007
 
1008
+ ์ค‘์š”: ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•  ๋•Œ๋Š” ๋ฐ˜๋“œ์‹œ ์ œ๊ณต๋œ [์†Œ์„ค ๋ณธ๋ฌธ] ๋‚ด์˜ ๋‚ด์šฉ์„ ๊ทผ๊ฑฐ๋กœ ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
1009
+ ๋‹ต๋ณ€์˜ ๊ฐ ๋ฌธ์žฅ ๋์—๋Š” ์ฐธ๊ณ ํ•œ ๋ณธ๋ฌธ์˜ ๋ฌธ์žฅ์„ [๊ทผ๊ฑฐ: "๋ฌธ์žฅ ๋‚ด์šฉ..."] ํ˜•์‹์œผ๋กœ ๋ฐ˜๋“œ์‹œ ๋ถ™์ด์„ธ์š”.
1010
+ ๊ทผ๊ฑฐ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†๋‹ค๋ฉด "๋‚ด์šฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"๋ผ๊ณ  ๋‹ตํ•˜๊ณ  ์ง€์–ด๋‚ด์ง€ ๋งˆ์„ธ์š”.
1011
+
1012
  ์งˆ๋ฌธ:
1013
  """
1014
  else:
 
1017
 
1018
  {full_context}
1019
 
1020
+ ์œ„ ๋‚ด์šฉ์„ ์ถฉ๋ถ„ํžˆ ์ฐธ๊ณ ํ•˜์—ฌ ๋‹ค์Œ ์งˆ๋ฌธ์— ์ •ํ™•ํ•˜๊ณ  ์ƒ์„ธํ•˜๊ฒŒ ๋‹ต๋ณ€ํ•ด์ฃผ์„ธ์š”. ์›น์†Œ์„ค์˜ ๋งฅ๋ฝ๊ณผ ์Šคํ† ๋ฆฌ๋ฅผ ๊ณ ๋ คํ•˜์—ฌ ๋‹ต๋ณ€ํ•ด์ฃผ์„ธ์š”.
1021
+
1022
+ ์ค‘์š”: ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•  ๋•Œ๋Š” ๋ฐ˜๋“œ์‹œ ์ œ๊ณต๋œ [์†Œ์„ค ๋ณธ๋ฌธ] ๋‚ด์˜ ๋‚ด์šฉ์„ ๊ทผ๊ฑฐ๋กœ ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
1023
+ ๋‹ต๋ณ€์˜ ๊ฐ ๋ฌธ์žฅ ๋์—๋Š” ์ฐธ๊ณ ํ•œ ๋ณธ๋ฌธ์˜ ๋ฌธ์žฅ์„ [๊ทผ๊ฑฐ: "๋ฌธ์žฅ ๋‚ด์šฉ..."] ํ˜•์‹์œผ๋กœ ๋ฐ˜๋“œ์‹œ ๋ถ™์ด์„ธ์š”.
1024
+ ๊ทผ๊ฑฐ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†๋‹ค๋ฉด "๋‚ด์šฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"๋ผ๊ณ  ๋‹ตํ•˜๊ณ  ์ง€์–ด๋‚ด์ง€ ๋งˆ์„ธ์š”.
1025
 
1026
  ์งˆ๋ฌธ:
1027
  """
 
1082
 
1083
  if context_parts:
1084
  context = "\n\n".join(context_parts)
1085
+ context = f"""๋‹ค์Œ์€ ํ•™์Šต๋œ ์›น์†Œ์„ค ๋‚ด์šฉ์ž…๋‹ˆ๋‹ค:
1086
+
1087
+ {context}
1088
+
1089
+ ์œ„ ๋‚ด์šฉ์„ ์ฐธ๊ณ ํ•˜์—ฌ ๋‹ค์Œ ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•ด์ฃผ์„ธ์š”.
1090
+
1091
+ ์ค‘์š”: ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•  ๋•Œ๋Š” ๋ฐ˜๋“œ์‹œ ์ œ๊ณต๋œ [์†Œ์„ค ๋ณธ๋ฌธ] ๋‚ด์˜ ๋‚ด์šฉ์„ ๊ทผ๊ฑฐ๋กœ ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
1092
+ ๋‹ต๋ณ€์˜ ๊ฐ ๋ฌธ์žฅ ๋์—๋Š” ์ฐธ๊ณ ํ•œ ๋ณธ๋ฌธ์˜ ๋ฌธ์žฅ์„ [๊ทผ๊ฑฐ: "๋ฌธ์žฅ ๋‚ด์šฉ..."] ํ˜•์‹์œผ๋กœ ๋ฐ˜๋“œ์‹œ ๋ถ™์ด์„ธ์š”.
1093
+ ๊ทผ๊ฑฐ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†๋‹ค๋ฉด "๋‚ด์šฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"๋ผ๊ณ  ๋‹ตํ•˜๊ณ  ์ง€์–ด๋‚ด์ง€ ๋งˆ์„ธ์š”.
1094
+
1095
+ ์งˆ๋ฌธ:
1096
+ """
1097
 
1098
  # ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ
1099
  full_prompt = context + message if context else message
 
1679
  DocumentChunk.query.filter_by(file_id=file_to_delete.id).delete()
1680
  print(f"[ํŒŒ์ผ ์‚ญ์ œ] Child Chunk {child_chunk_count}๊ฐœ ์‚ญ์ œ ์™„๋ฃŒ")
1681
 
1682
+ # ๋ฒกํ„ฐ DB์—์„œ๋„ ํ•ด๋‹น ํŒŒ์ผ์˜ ์ฒญํฌ ์‚ญ์ œ
1683
+ try:
1684
+ vector_db = get_vector_db()
1685
+ vector_db.delete_chunks_by_file_id(file_to_delete.id)
1686
+ print(f"[ํŒŒ์ผ ์‚ญ์ œ] ๋ฒกํ„ฐ DB์—์„œ ์ฒญํฌ ์‚ญ์ œ ์™„๋ฃŒ")
1687
+ except Exception as vector_e:
1688
+ print(f"[ํŒŒ์ผ ์‚ญ์ œ] ๋ฒกํ„ฐ DB ์‚ญ์ œ ์˜ค๋ฅ˜ (๋ฌด์‹œ): {str(vector_e)}")
1689
+
1690
  # ๊ด€๋ จ Parent Chunk ์‚ญ์ œ
1691
  parent_chunk = ParentChunk.query.filter_by(file_id=file_to_delete.id).first()
1692
  if parent_chunk:
app/vector_db.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ๋ฒกํ„ฐ DB ๋ฐ ์ž„๋ฒ ๋”ฉ ๊ด€๋ จ ๊ธฐ๋Šฅ
3
+ Chroma DB๋ฅผ ์‚ฌ์šฉํ•œ ๋ฒกํ„ฐ ๊ฒ€์ƒ‰ ๋ฐ Re-ranking ์‹œ์Šคํ…œ
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import chromadb
9
+ from chromadb.config import Settings
10
+ from sentence_transformers import SentenceTransformer, CrossEncoder
11
+ from pathlib import Path
12
+ import numpy as np
13
+
14
+ # ๋ฒกํ„ฐ DB ๊ฒฝ๋กœ
15
+ VECTOR_DB_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'vector_db')
16
+
17
+ # ์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ (ํ•œ๊ตญ์–ด ์ง€์›)
18
+ EMBEDDING_MODEL_NAME = "jhgan/ko-sroberta-multitask" # ํ•œ๊ตญ์–ด ์ง€์› ๋ชจ๋ธ
19
+ # ๋˜๋Š” ์˜์–ด ์ค‘์‹ฌ: "all-MiniLM-L6-v2" (๋” ๋น ๋ฅด์ง€๋งŒ ํ•œ๊ตญ์–ด ์„ฑ๋Šฅ ๋‚ฎ์Œ)
20
+
21
+ # Cross-Encoder ๋ชจ๋ธ (๋ฆฌ๋žญํ‚น์šฉ)
22
+ # ํ•œ๊ตญ์–ด ๋ฆฌ๋žญ์ปค๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์œผ๋ฉด ์˜์–ด ๋ชจ๋ธ ์‚ฌ์šฉ
23
+ RERANKER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2" # ๋ฒ”์šฉ ๋ฆฌ๋žญ์ปค (ํ•œ๊ตญ์–ด๋„ ์–ด๋А ์ •๋„ ์ง€์›)
24
+ # ๋˜๋Š”: "BAAI/bge-reranker-base" (๋” ๋‚˜์€ ์„ฑ๋Šฅ)
25
+
26
+ class VectorDBManager:
27
+ """๋ฒกํ„ฐ DB ๊ด€๋ฆฌ ํด๋ž˜์Šค"""
28
+
29
+ def __init__(self):
30
+ """๋ฒกํ„ฐ DB ์ดˆ๊ธฐํ™”"""
31
+ self.embedding_model = None
32
+ self.reranker_model = None
33
+ self.client = None
34
+ self.collection = None
35
+
36
+ # ๋ฒกํ„ฐ DB ํด๋” ์ƒ์„ฑ
37
+ os.makedirs(VECTOR_DB_PATH, exist_ok=True)
38
+
39
+ # Chroma DB ํด๋ผ์ด์–ธํŠธ ์ดˆ๊ธฐํ™”
40
+ self.client = chromadb.PersistentClient(
41
+ path=VECTOR_DB_PATH,
42
+ settings=Settings(
43
+ anonymized_telemetry=False,
44
+ allow_reset=True
45
+ )
46
+ )
47
+
48
+ # ์ปฌ๋ ‰์…˜ ์ƒ์„ฑ (์—†์œผ๋ฉด ์ƒ์„ฑ, ์žˆ์œผ๋ฉด ๊ฐ€์ ธ์˜ค๊ธฐ)
49
+ try:
50
+ self.collection = self.client.get_or_create_collection(
51
+ name="document_chunks",
52
+ metadata={"hnsw:space": "cosine"} # ์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„ ์‚ฌ์šฉ
53
+ )
54
+ print(f"[๋ฒกํ„ฐ DB] ์ปฌ๋ ‰์…˜ ๋กœ๋“œ/์ƒ์„ฑ ์™„๋ฃŒ: {len(self.collection.get()['ids'])}๊ฐœ ๋ฌธ์„œ")
55
+ except Exception as e:
56
+ print(f"[๋ฒกํ„ฐ DB] ์ปฌ๋ ‰์…˜ ์ƒ์„ฑ ์˜ค๋ฅ˜: {e}")
57
+ raise
58
+
59
+ def get_embedding_model(self):
60
+ """์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ ๋กœ๋“œ (์ง€์—ฐ ๋กœ๋”ฉ)"""
61
+ if self.embedding_model is None:
62
+ print(f"[์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ] ๋กœ๋”ฉ ์ค‘: {EMBEDDING_MODEL_NAME}")
63
+ try:
64
+ self.embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
65
+ print(f"[์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ] ๋กœ๋”ฉ ์™„๋ฃŒ")
66
+ except Exception as e:
67
+ print(f"[์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ] ๋กœ๋”ฉ ์˜ค๋ฅ˜: {e}")
68
+ # ๋Œ€์ฒด ๋ชจ๋ธ ์‹œ๋„
69
+ try:
70
+ self.embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
71
+ print(f"[์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ] ๋Œ€์ฒด ๋ชจ๋ธ ๋กœ๋”ฉ ์™„๋ฃŒ: all-MiniLM-L6-v2")
72
+ except Exception as e2:
73
+ print(f"[์ž„๋ฒ ๋”ฉ ๋ชจ๋ธ] ๋Œ€์ฒด ๋ชจ๋ธ๋„ ๋กœ๋”ฉ ์‹คํŒจ: {e2}")
74
+ raise
75
+ return self.embedding_model
76
+
77
+ def get_reranker_model(self):
78
+ """Cross-Encoder ๋ฆฌ๋žญ์ปค ๋ชจ๋ธ ๋กœ๋“œ (์ง€์—ฐ ๋กœ๋”ฉ)"""
79
+ if self.reranker_model is None:
80
+ print(f"[๋ฆฌ๋žญ์ปค ๋ชจ๋ธ] ๋กœ๋”ฉ ์ค‘: {RERANKER_MODEL_NAME}")
81
+ try:
82
+ self.reranker_model = CrossEncoder(RERANKER_MODEL_NAME)
83
+ print(f"[๋ฆฌ๋žญ์ปค ๋ชจ๋ธ] ๋กœ๋”ฉ ์™„๋ฃŒ")
84
+ except Exception as e:
85
+ print(f"[๋ฆฌ๋žญ์ปค ๋ชจ๋ธ] ๋กœ๋”ฉ ์˜ค๋ฅ˜: {e}")
86
+ # ๋Œ€์ฒด ๋ชจ๋ธ ์‹œ๋„ (๋” ๊ฐ€๋ฒผ์šด ๋ชจ๋ธ)
87
+ try:
88
+ print(f"[๋ฆฌ๋žญ์ปค ๋ชจ๋ธ] ๋Œ€์ฒด ๋ชจ๋ธ ์‹œ๋„: BAAI/bge-reranker-base")
89
+ self.reranker_model = CrossEncoder("BAAI/bge-reranker-base", max_length=512)
90
+ print(f"[๋ฆฌ๋žญ์ปค ๋ชจ๋ธ] ๋Œ€์ฒด ๋ชจ๋ธ ๋กœ๋”ฉ ์™„๋ฃŒ")
91
+ except Exception as e2:
92
+ print(f"[๋ฆฌ๋žญ์ปค ๋ชจ๋ธ] ๋Œ€์ฒด ๋ชจ๋ธ๋„ ๋กœ๋”ฉ ์‹คํŒจ: {e2}")
93
+ # ๋ฆฌ๋žญํ‚น ์—†์ด ์ง„ํ–‰ (๊ฒฝ๊ณ ๋งŒ ์ถœ๋ ฅ)
94
+ print(f"[๋ฆฌ๋žญ์ปค ๋ชจ๋ธ] โš ๏ธ ๊ฒฝ๊ณ : ๋ฆฌ๋žญ์ปค ๋ชจ๋ธ์„ ๋กœ๋“œํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ๋ฆฌ๋žญํ‚น ์—†์ด ์ง„ํ–‰ํ•ฉ๋‹ˆ๋‹ค.")
95
+ self.reranker_model = None
96
+ return self.reranker_model
97
+
98
+ def generate_embedding(self, text):
99
+ """ํ…์ŠคํŠธ์— ๋Œ€ํ•œ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ"""
100
+ try:
101
+ model = self.get_embedding_model()
102
+ embedding = model.encode(text, convert_to_numpy=True).tolist()
103
+ return embedding
104
+ except Exception as e:
105
+ print(f"[์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ] ์˜ค๋ฅ˜: {e}")
106
+ return None
107
+
108
+ def add_chunk(self, chunk_id, chunk_content, file_id, chunk_index, metadata=None):
109
+ """์ฒญํฌ๋ฅผ ๋ฒกํ„ฐ DB์— ์ถ”๊ฐ€"""
110
+ try:
111
+ # ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ
112
+ embedding = self.generate_embedding(chunk_content)
113
+ if embedding is None:
114
+ return False
115
+
116
+ # ๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ค€๋น„
117
+ chunk_metadata = {
118
+ 'file_id': str(file_id),
119
+ 'chunk_index': str(chunk_index),
120
+ 'content_length': str(len(chunk_content))
121
+ }
122
+ if metadata:
123
+ chunk_metadata.update(metadata)
124
+
125
+ # ๋ฒกํ„ฐ DB์— ์ถ”๊ฐ€
126
+ self.collection.add(
127
+ ids=[str(chunk_id)],
128
+ embeddings=[embedding],
129
+ documents=[chunk_content],
130
+ metadatas=[chunk_metadata]
131
+ )
132
+
133
+ return True
134
+ except Exception as e:
135
+ print(f"[๋ฒกํ„ฐ DB ์ถ”๊ฐ€] ์˜ค๋ฅ˜: {e}")
136
+ import traceback
137
+ traceback.print_exc()
138
+ return False
139
+
140
+ def search_chunks(self, query, file_ids=None, top_k=30):
141
+ """๋ฒกํ„ฐ ๊ฒ€์ƒ‰์œผ๋กœ ๊ด€๋ จ ์ฒญํฌ ๊ฒ€์ƒ‰ (์ดˆ๊ธฐ ๊ฒ€์ƒ‰, top_k=30)"""
142
+ try:
143
+ # ์ฟผ๋ฆฌ ์ž„๋ฒ ๋”ฉ ์ƒ์„ฑ
144
+ query_embedding = self.generate_embedding(query)
145
+ if query_embedding is None:
146
+ return []
147
+
148
+ # ํ•„ํ„ฐ ์กฐ๊ฑด ์„ค์ •
149
+ where_clause = None
150
+ if file_ids:
151
+ where_clause = {"file_id": {"$in": [str(fid) for fid in file_ids]}}
152
+
153
+ # ๋ฒกํ„ฐ ๊ฒ€์ƒ‰
154
+ results = self.collection.query(
155
+ query_embeddings=[query_embedding],
156
+ n_results=min(top_k, 30), # ์ตœ๋Œ€ 30๊ฐœ
157
+ where=where_clause
158
+ )
159
+
160
+ # ๊ฒฐ๊ณผ ํŒŒ์‹ฑ
161
+ chunks = []
162
+ if results and 'ids' in results and len(results['ids'][0]) > 0:
163
+ for i, chunk_id in enumerate(results['ids'][0]):
164
+ chunks.append({
165
+ 'chunk_id': int(chunk_id),
166
+ 'content': results['documents'][0][i] if 'documents' in results else '',
167
+ 'metadata': results['metadatas'][0][i] if 'metadatas' in results else {},
168
+ 'distance': results['distances'][0][i] if 'distances' in results else 1.0
169
+ })
170
+
171
+ print(f"[๋ฒกํ„ฐ ๊ฒ€์ƒ‰] {len(chunks)}๊ฐœ ์ฒญํฌ ๊ฒ€์ƒ‰ ์™„๋ฃŒ")
172
+ return chunks
173
+ except Exception as e:
174
+ print(f"[๋ฒกํ„ฐ ๊ฒ€์ƒ‰] ์˜ค๋ฅ˜: {e}")
175
+ import traceback
176
+ traceback.print_exc()
177
+ return []
178
+
179
+ def rerank_chunks(self, query, chunks, top_k=5):
180
+ """Cross-Encoder๋ฅผ ์‚ฌ์šฉํ•˜์—ฌ ์ฒญํฌ ๋ฆฌ๋žญํ‚น (์ƒ์œ„ top_k๊ฐœ ๋ฐ˜ํ™˜)"""
181
+ try:
182
+ if not chunks or len(chunks) == 0:
183
+ return []
184
+
185
+ # ๋ฆฌ๋žญ์ปค ๋ชจ๋ธ ๋กœ๋“œ
186
+ reranker = self.get_reranker_model()
187
+
188
+ # ๋ฆฌ๋žญ์ปค ๋ชจ๋ธ์ด ์—†์œผ๋ฉด ๊ฑฐ๋ฆฌ ๊ธฐ๋ฐ˜ ์ •๋ ฌ๋งŒ ์ˆ˜ํ–‰
189
+ if reranker is None:
190
+ print(f"[๋ฆฌ๋žญํ‚น] โš ๏ธ ๋ฆฌ๋žญ์ปค ๋ชจ๋ธ ์—†์Œ, ๊ฑฐ๋ฆฌ ๊ธฐ๋ฐ˜ ์ •๋ ฌ๋งŒ ์ˆ˜ํ–‰")
191
+ scored_chunks = []
192
+ for chunk in chunks:
193
+ scored_chunks.append({
194
+ 'chunk_id': chunk['chunk_id'],
195
+ 'content': chunk['content'],
196
+ 'metadata': chunk['metadata'],
197
+ 'rerank_score': 1.0 - chunk.get('distance', 1.0), # ๊ฑฐ๋ฆฌ๋ฅผ ์ ์ˆ˜๋กœ ๋ณ€ํ™˜
198
+ 'original_distance': chunk.get('distance', 1.0)
199
+ })
200
+ scored_chunks.sort(key=lambda x: x['rerank_score'], reverse=True)
201
+ return scored_chunks[:top_k]
202
+
203
+ # ์ฟผ๋ฆฌ-๋ฌธ์„œ ์Œ ์ค€๋น„ (์ตœ๋Œ€ ๊ธธ์ด ์ œํ•œ)
204
+ pairs = []
205
+ max_content_length = 500 # ์ฒญํฌ ๋‚ด์šฉ์ด ๋„ˆ๋ฌด ๊ธธ๋ฉด ์ž˜๋ผ๋ƒ„
206
+ for chunk in chunks:
207
+ content = chunk['content']
208
+ if len(content) > max_content_length:
209
+ content = content[:max_content_length]
210
+ pairs.append([query, content])
211
+
212
+ # ๋ฆฌ๋žญํ‚น ์ ์ˆ˜ ๊ณ„์‚ฐ
213
+ print(f"[๋ฆฌ๋žญํ‚น] {len(pairs)}๊ฐœ ์ฒญํฌ์— ๋Œ€ํ•œ ๋ฆฌ๋žญํ‚น ์‹œ์ž‘...")
214
+ scores = reranker.predict(pairs)
215
+
216
+ # ์ ์ˆ˜์™€ ์ฒญํฌ ๊ฒฐํ•ฉ
217
+ scored_chunks = []
218
+ for i, chunk in enumerate(chunks):
219
+ scored_chunks.append({
220
+ 'chunk_id': chunk['chunk_id'],
221
+ 'content': chunk['content'],
222
+ 'metadata': chunk['metadata'],
223
+ 'rerank_score': float(scores[i]),
224
+ 'original_distance': chunk.get('distance', 1.0)
225
+ })
226
+
227
+ # ์ ์ˆ˜ ์ˆœ์œผ๋กœ ์ •๋ ฌ (๋†’์€ ์ ์ˆ˜ = ๋” ๊ด€๋ จ์„ฑ ๋†’์Œ)
228
+ scored_chunks.sort(key=lambda x: x['rerank_score'], reverse=True)
229
+
230
+ # ์ƒ์œ„ top_k๊ฐœ๋งŒ ์„ ํƒ
231
+ top_chunks = scored_chunks[:top_k]
232
+
233
+ print(f"[๋ฆฌ๋žญํ‚น] ์™„๋ฃŒ: ์ƒ์œ„ {len(top_chunks)}๊ฐœ ์ฒญํฌ ์„ ํƒ")
234
+ for i, chunk in enumerate(top_chunks):
235
+ print(f" {i+1}. ์ ์ˆ˜: {chunk['rerank_score']:.4f}, ์ฒญํฌ ID: {chunk['chunk_id']}")
236
+
237
+ return top_chunks
238
+ except Exception as e:
239
+ print(f"[๋ฆฌ๋žญํ‚น] ์˜ค๋ฅ˜: {e}")
240
+ import traceback
241
+ traceback.print_exc()
242
+ # ์˜ค๋ฅ˜ ์‹œ ์›๋ณธ ์ฒญํฌ ์ƒ์œ„ top_k๊ฐœ ๋ฐ˜ํ™˜ (๊ฑฐ๋ฆฌ ๊ธฐ์ค€)
243
+ chunks_sorted = sorted(chunks, key=lambda x: x.get('distance', 1.0))
244
+ return chunks_sorted[:top_k]
245
+
246
+ def delete_chunks_by_file_id(self, file_id):
247
+ """ํŒŒ์ผ ID๋กœ ํ•ด๋‹น ํŒŒ์ผ์˜ ๋ชจ๋“  ์ฒญํฌ ์‚ญ์ œ"""
248
+ try:
249
+ # ํ•ด๋‹น ํŒŒ์ผ์˜ ๋ชจ๋“  ์ฒญํฌ ์ฐพ๊ธฐ
250
+ results = self.collection.get(
251
+ where={"file_id": str(file_id)}
252
+ )
253
+
254
+ if results and 'ids' in results and len(results['ids']) > 0:
255
+ # ์ฒญํฌ ์‚ญ์ œ
256
+ self.collection.delete(ids=results['ids'])
257
+ print(f"[๋ฒกํ„ฐ DB ์‚ญ์ œ] ํŒŒ์ผ ID {file_id}์˜ {len(results['ids'])}๊ฐœ ์ฒญํฌ ์‚ญ์ œ ์™„๋ฃŒ")
258
+ return True
259
+ return False
260
+ except Exception as e:
261
+ print(f"[๋ฒกํ„ฐ DB ์‚ญ์ œ] ์˜ค๋ฅ˜: {e}")
262
+ return False
263
+
264
+ def get_chunk_count(self):
265
+ """๋ฒกํ„ฐ DB์— ์ €์žฅ๋œ ์ฒญํฌ ๊ฐœ์ˆ˜ ๋ฐ˜ํ™˜"""
266
+ try:
267
+ return self.collection.count()
268
+ except Exception as e:
269
+ print(f"[๋ฒกํ„ฐ DB] ์ฒญํฌ ๊ฐœ์ˆ˜ ์กฐํšŒ ์˜ค๋ฅ˜: {e}")
270
+ return 0
271
+
272
+ # ์ „์—ญ ๋ฒกํ„ฐ DB ๋งค๋‹ˆ์ € ์ธ์Šคํ„ด์Šค
273
+ _vector_db_manager = None
274
+
275
+ def get_vector_db():
276
+ """๋ฒกํ„ฐ DB ๋งค๋‹ˆ์ € ์‹ฑ๊ธ€ํ†ค ์ธ์Šคํ„ด์Šค ๋ฐ˜ํ™˜"""
277
+ global _vector_db_manager
278
+ if _vector_db_manager is None:
279
+ _vector_db_manager = VectorDBManager()
280
+ return _vector_db_manager
281
+
requirements.txt CHANGED
@@ -4,5 +4,8 @@ flask-login==0.6.3
4
  python-dotenv==1.0.0
5
  requests==2.31.0
6
  werkzeug==3.0.1
 
 
 
7
 
8
 
 
4
  python-dotenv==1.0.0
5
  requests==2.31.0
6
  werkzeug==3.0.1
7
+ chromadb==0.4.22
8
+ sentence-transformers==2.3.1
9
+ numpy==1.24.3
10
 
11