nneans committed on
Commit
f438fbf
·
verified ·
1 Parent(s): 650a480

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +191 -176
app.py CHANGED
@@ -1,298 +1,313 @@
1
  # =========================================================
2
- # KB 금육 RAG 챗봇 (Local Self-Contained Version)
3
- # =========================================================
4
- # 이 μ½”λ“œλŠ” μ„œλ²„λ‚˜ ν΄λΌμš°λ“œ DB 없이, μ‚¬μš©μžκ°€ 직접 PDFλ₯Ό μ—…λ‘œλ“œν•˜μ—¬
5
- # λ‘œμ»¬μ—μ„œ 지식 베이슀λ₯Ό κ΅¬μΆ•ν•˜κ³  μ§ˆλ¬Έν•  수 μžˆλŠ” κ΅¬μ‘°μž…λ‹ˆλ‹€.
6
- # Groq(LLM), Google(Voice/Translate) APIλ₯Ό μ‚¬μš©ν•˜μ—¬ 무료둜 λ™μž‘ν•©λ‹ˆλ‹€.
7
  # =========================================================
8
 
9
  import os
10
  import sys
11
  import numpy as np
12
  import traceback
13
- import fitz # PyMuPDF (PDF 처리)
14
  from typing import List
15
 
16
  # --- 라이브러리 μž„ν¬νŠΈ ---
17
  import gradio as gr
18
  import speech_recognition as sr
 
 
 
 
 
19
  from deep_translator import GoogleTranslator
20
  from sentence_transformers import SentenceTransformer
21
  from groq import Groq
22
  from qdrant_client import QdrantClient
23
  from qdrant_client.models import Distance, VectorParams, PointStruct
 
24
  try:
25
  from langchain.text_splitter import RecursiveCharacterTextSplitter
26
  except ImportError:
27
- # langchain 0.2.0 μ΄μƒμ—μ„œ ꡬ쑰가 λ³€κ²½λœ 경우
28
  from langchain_text_splitters import RecursiveCharacterTextSplitter
29
 
30
  # =========================================================
31
  # 1. μ„€μ • 및 μ΄ˆκΈ°ν™”
32
  # =========================================================
33
 
34
- # Groq API ν‚€ (ν•„μˆ˜)
35
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your_groq_api_key_here")
36
- if not GROQ_API_KEY or GROQ_API_KEY == "your_groq_api_key_here":
37
- print("⚠️ GROQ_API_KEYκ°€ μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€. RAG κΈ°λŠ₯ μ‚¬μš© μ‹œ 였λ₯˜κ°€ λ°œμƒν•  수 μžˆμŠ΅λ‹ˆλ‹€.")
38
-
39
- # λͺ¨λΈ μ„€μ •
40
  EMBEDDING_MODEL_NAME = "jhgan/ko-sroberta-multitask"
41
  GROQ_MODEL_NAME = "llama-3.3-70b-versatile"
42
  COLLECTION_NAME = "local_kb"
43
 
44
- print("πŸ› οΈ λͺ¨λΈ 및 ν΄λΌμ΄μ–ΈνŠΈ μ΄ˆκΈ°ν™” 쀑...")
45
 
46
- # 1. μž„λ² λ”© λͺ¨λΈ λ‘œλ“œ (둜컬 μ‹€ν–‰)
47
  embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
48
  embedding_model.max_seq_length = 512
49
 
50
- # 2. Qdrant ν΄λΌμ΄μ–ΈνŠΈ (둜컬 λ©”λͺ¨λ¦¬ DB - ν”„λ‘œκ·Έλž¨ μ’…λ£Œ μ‹œ 데이터 μ‚­μ œλ¨)
51
- # 영ꡬ μ €μž₯을 μ›ν•˜λ©΄ path="./local_qdrant_db" 둜 λ³€κ²½ν•˜μ„Έμš”.
52
- # μ—¬κΈ°μ„œλŠ” 포트폴리였용 데λͺ¨λ₯Ό μœ„ν•΄ 맀번 κΉ¨λ—ν•œ μƒνƒœμΈ ':memory:'λ₯Ό 기본으둜 ν•©λ‹ˆλ‹€.
53
  qdrant_client = QdrantClient(":memory:")
54
-
55
- # μ»¬λ ‰μ…˜ 생성 (이미 μ‘΄μž¬ν•˜λ©΄ μ‚­μ œ ν›„ μž¬μƒμ„±)
56
  try:
57
  qdrant_client.recreate_collection(
58
  collection_name=COLLECTION_NAME,
59
  vectors_config=VectorParams(size=768, distance=Distance.COSINE),
60
  )
61
- print(f"βœ… 둜컬 Qdrant μ»¬λ ‰μ…˜ '{COLLECTION_NAME}' 생성 μ™„λ£Œ.")
62
  except Exception as e:
63
- print(f"❌ Qdrant μ»¬λ ‰μ…˜ 생성 μ‹€νŒ¨: {e}")
64
 
65
- # 3. Groq ν΄λΌμ΄μ–ΈνŠΈ
66
- try:
67
- groq_client = Groq(api_key=GROQ_API_KEY)
68
- except Exception as e:
69
- groq_client = None
70
- print(f"❌ Groq ν΄λΌμ΄μ–ΈνŠΈ μ΄ˆκΈ°ν™” μ‹€νŒ¨: {e}")
 
 
 
71
 
72
- #μ „μ—­ λ³€μˆ˜: λ¬Έμ„œ ID μΉ΄μš΄ν„°
73
  doc_id_counter = 0
74
 
75
- print("βœ… λͺ¨λ“  μ‹œμŠ€ν…œ μ€€λΉ„ μ™„λ£Œ!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
 
 
 
 
 
77
 
78
  # =========================================================
79
- # 2. λ¬Έμ„œ 처리 및 RAG 핡심 둜직
80
  # =========================================================
81
 
82
  def process_uploaded_files(files):
83
- """PDF νŒŒμΌμ„ 읽어 ν…μŠ€νŠΈλ₯Ό μΆ”μΆœν•˜κ³  벑터 DB에 μ €μž₯"""
84
  global doc_id_counter
85
-
86
- if not files:
87
- return "파일이 μ—…λ‘œλ“œλ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€."
88
 
89
  total_chunks = 0
90
  status_msg = ""
91
-
92
- # ν…μŠ€νŠΈ 뢄리기 μ„€μ •
93
- text_splitter = RecursiveCharacterTextSplitter(
94
- chunk_size=500,
95
- chunk_overlap=50,
96
- length_function=len,
97
- )
98
 
99
  for file in files:
100
  try:
101
- # Gradio 버전/섀정에 따라 file이 λ¬Έμžμ—΄(경둜)일 μˆ˜λ„ 있고 객체일 μˆ˜λ„ 있음
102
  file_path = file.name if hasattr(file, 'name') else file
103
-
104
- # 1. PDF ν…μŠ€νŠΈ μΆ”μΆœ
105
  doc = fitz.open(file_path)
106
  file_text = ""
107
- for page in doc:
108
- file_text += page.get_text()
109
 
110
  if not file_text.strip():
111
- status_msg += f"⚠️ {os.path.basename(file_path)}: ν…μŠ€νŠΈ μΆ”μΆœ μ‹€νŒ¨ (이미지 PDF일 수 있음)\n"
112
  continue
113
 
114
- # 2. ν…μŠ€νŠΈ λΆ„ν•  (Chunking)
115
  chunks = text_splitter.split_text(file_text)
116
-
117
- # 3. μž„λ² λ”© 및 μ €μž₯
118
  points = []
119
  for i, chunk in enumerate(chunks):
120
  vector = embedding_model.encode(chunk).tolist()
121
-
122
- payload = {
123
- "filename": os.path.basename(file_path),
124
- "text": chunk,
125
- "chunk_id": i
126
- }
127
-
128
  points.append(PointStruct(id=doc_id_counter, vector=vector, payload=payload))
129
  doc_id_counter += 1
130
 
131
- # Qdrant에 μ €μž₯
132
  if points:
133
- qdrant_client.upsert(
134
- collection_name=COLLECTION_NAME,
135
- points=points
136
- )
137
  total_chunks += len(points)
138
- status_msg += f"βœ… {os.path.basename(file_path)}: {len(points)}개 지식 μ €μž₯ μ™„λ£Œ.\n"
139
 
140
  except Exception as e:
141
- traceback.print_exc()
142
- file_name_debug = getattr(file, 'name', str(file))
143
- status_msg += f"❌ {os.path.basename(file_name_debug)} 처리 쀑 였λ₯˜: {str(e)}\n"
144
 
145
- print(f"DEBUG: 총 μ €μž₯된 청크 수: {total_chunks}")
146
- if total_chunks == 0:
147
- return status_msg + "\n(μ €μž₯된 데이터가 μ—†μŠ΅λ‹ˆλ‹€. PDFκ°€ λΉ„μ–΄μžˆκ±°λ‚˜ 이미지일 수 μžˆμŠ΅λ‹ˆλ‹€.)"
148
-
149
- return f"처리 μ™„λ£Œ! 총 {total_chunks}개의 지식 쑰각이 μ €μž₯λ˜μ—ˆμŠ΅λ‹ˆλ‹€.\n\n{status_msg}"
150
 
151
  def search_knowledge_base(query, top_k=5):
152
- """둜컬 Qdrantμ—μ„œ κ΄€λ ¨ λ¬Έμ„œ 검색"""
153
  try:
154
  query_vector = embedding_model.encode(query).tolist()
155
- # qdrant-client 버전에 따라 .search()κ°€ μ—†κ±°λ‚˜ λ‹€λ₯΄κ²Œ λ™μž‘ν•  수 μžˆμ–΄ .query_points() μ‚¬μš©
156
- search_result = qdrant_client.query_points(
157
- collection_name=COLLECTION_NAME,
158
- query=query_vector,
159
- limit=top_k,
160
- with_payload=True
161
  )
162
- return search_result.points
163
- except Exception as e:
164
- print(f"검색 였λ₯˜: {e}")
165
  return []
166
 
167
  def generate_answer_groq(query, context_text):
168
- """Groq APIλ₯Ό μ‚¬μš©ν•˜μ—¬ λ‹΅λ³€ 생성"""
169
- if not groq_client:
170
- return "Groq API μ„€μ • 였λ₯˜"
171
-
172
  system_prompt = """
173
- 당신은 μΉœμ ˆν•˜κ³  전문적인 금육 AI μ–΄μ‹œμŠ€ν„΄νŠΈμž…λ‹ˆλ‹€.
174
- λ°˜λ“œμ‹œ μ•„λž˜ 제곡된 [참고자료]λ§Œμ„ λ°”νƒ•μœΌλ‘œ μ§ˆλ¬Έμ— λ‹΅λ³€ν•˜μ„Έμš”.
175
- μ°Έκ³ μžλ£Œμ— λ‚΄μš©μ΄ μ—†λ‹€λ©΄ μ†”μ§ν•˜κ²Œ λͺ¨λ₯Έλ‹€κ³  λŒ€λ‹΅ν•˜μ„Έμš”.
176
- 좜처(νŒŒμΌμ΄λ¦„)λ₯Ό λ‹΅λ³€ 끝에 λͺ…μ‹œν•΄μ£Όμ„Έμš”.
177
  """
178
-
179
- user_prompt = f"질문: {query}\n\n[참고자료]\n{context_text}"
180
-
181
  try:
182
  response = groq_client.chat.completions.create(
183
- messages=[
184
- {"role": "system", "content": system_prompt},
185
- {"role": "user", "content": user_prompt},
186
- ],
187
- model=GROQ_MODEL_NAME,
188
- temperature=0.1,
189
  )
190
  return response.choices[0].message.content
191
  except Exception as e:
192
- return f"Groq 생성 였λ₯˜: {e}"
193
 
194
- # RAG νŒŒμ΄ν”„λΌμΈ (톡합)
195
- def run_rag_pipeline(text_input, detected_lang='ko'):
196
- if not text_input:
197
- return "", "", "", ""
198
-
199
- # 1. 질문 λ²ˆμ—­ (ν•„μš”μ‹œ)
200
- korean_query = text_input
201
- if detected_lang != 'ko':
202
- try:
203
- korean_query = GoogleTranslator(source='auto', target='ko').translate(text_input)
204
- except: pass
205
-
206
- # 2. λ¬Έμ„œ 검색
207
- hits = search_knowledge_base(korean_query)
208
 
209
- if not hits:
210
- return korean_query, "μ €μž₯된 지식이 λΆ€μ‘±ν•˜μ—¬ λ‹΅λ³€ν•  수 μ—†μŠ΅λ‹ˆλ‹€. PDFλ₯Ό λ¨Όμ € μ—…λ‘œλ“œν•΄μ£Όμ„Έμš”.", "", "μ°Έκ³  λ¬Έμ„œ μ—†μŒ"
211
-
212
- # 3. μ»¨ν…μŠ€νŠΈ ꡬ성
213
- context_text = ""
214
- references = []
215
- for hit in hits:
216
- context_text += f"{hit.payload['text']}\n\n"
217
- references.append(f"- {hit.payload['filename']} (μœ μ‚¬λ„: {hit.score:.2f})")
218
 
219
- ref_str = "\n".join(references)
 
 
 
220
 
221
- # 4. λ‹΅λ³€ 생성
222
- korean_answer = generate_answer_groq(korean_query, context_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
- # 5. λ‹΅λ³€ λ²ˆμ—­ (ν•„μš”μ‹œ)
225
- final_answer = korean_answer
226
- if detected_lang != 'ko':
227
- try:
228
- final_answer = GoogleTranslator(source='ko', target=detected_lang).translate(korean_answer)
229
- except: pass
230
-
231
- return korean_query, korean_answer, final_answer, ref_str
232
-
233
-
234
- # =========================================================
235
- # 3. μŒμ„± 및 UI 헬퍼 ν•¨μˆ˜
236
- # =========================================================
237
 
238
- def voice_to_text(audio_input):
239
- """μŒμ„± 인식 (Google API)"""
240
- if audio_input is None: return "μŒμ„± μž…λ ₯ μ—†μŒ", None
 
241
 
242
  try:
243
- sample_rate, audio_numpy = audio_input
244
  if audio_numpy.dtype == np.float32:
245
  audio_numpy = (audio_numpy * 32767).astype(np.int16)
246
  if len(audio_numpy.shape) > 1:
247
  audio_numpy = audio_numpy.mean(axis=1).astype(np.int16)
248
-
249
  audio_data = sr.AudioData(audio_numpy.tobytes(), sample_rate, 2)
250
  r = sr.Recognizer()
251
- text = r.recognize_google(audio_data, language='ko-KR')
252
- return text, 'ko'
 
 
 
 
 
253
  except sr.UnknownValueError:
254
- return "인식 μ‹€νŒ¨ (λ‹€μ‹œ λ§ν•΄μ£Όμ„Έμš”)", None
255
- except Exception as e:
256
- return f"였λ₯˜: {e}", None
257
 
258
  # =========================================================
259
- # 4. Gradio UI ꡬ성
260
  # =========================================================
261
 
262
- with gr.Blocks(theme=gr.themes.Soft(), title="KB AI Challenge") as demo:
263
- gr.Markdown("# KB AI Challenge")
264
- gr.Markdown("μ„œλ²„ 없이 λ‘œμ»¬μ—μ„œ λ™μž‘ν•˜λŠ” **개인용 RAG μ‹œμŠ€ν…œ**μž…λ‹ˆλ‹€. PDFλ₯Ό μ—…λ‘œλ“œν•˜λ©΄ μ¦‰μ‹œ ν•™μŠ΅ν•˜μ—¬ λ‹΅λ³€ν•©λ‹ˆλ‹€.")
265
-
266
- with gr.Accordion("πŸ“‚ 1. 지식 베이슀 ꡬ좕 (파일 μ—…λ‘œλ“œ)", open=True):
267
- with gr.Row():
268
- file_input = gr.File(label="PDF μ—…λ‘œλ“œ (μ—¬λŸ¬ 개 κ°€λŠ₯)", file_count="multiple", file_types=[".pdf"])
269
- upload_btn = gr.Button("μ €μž₯ν•˜κΈ°", variant="primary")
270
- upload_status = gr.Textbox(label="처리 μƒνƒœ", interactive=False)
271
-
272
- gr.Markdown("---")
273
- gr.Markdown("### 🎀 2. AI와 λŒ€ν™”ν•˜κΈ°")
274
 
 
 
 
 
 
 
 
275
  with gr.Row():
276
- with gr.Column(scale=1):
277
- audio_in = gr.Audio(sources=["microphone", "upload"], type="numpy", label="μŒμ„± 질문")
278
- asr_btn = gr.Button("μŒμ„± 인식 μ‹œμž‘", variant="secondary")
279
- text_in = gr.Textbox(label="μΈμ‹λœ ν…μŠ€νŠΈ (직접 μž…λ ₯ κ°€λŠ₯)", lines=3)
280
- chat_btn = gr.Button("μ§ˆλ¬Έν•˜κΈ°", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
 
282
- with gr.Column(scale=2):
283
- answer_box = gr.Textbox(label="AI λ‹΅λ³€ (ν•œκ΅­μ–΄)", lines=6, interactive=False)
284
- ref_box = gr.Textbox(label="μ°Έκ³  λ¬Έν—Œ", lines=4, interactive=False)
285
 
286
- # 이벀트 μ—°κ²°
 
 
 
 
 
 
 
 
 
 
287
  upload_btn.click(process_uploaded_files, inputs=[file_input], outputs=[upload_status])
288
 
289
- asr_btn.click(voice_to_text, inputs=[audio_in], outputs=[text_in, gr.State()])
 
290
 
291
- chat_btn.click(
292
- run_rag_pipeline,
293
- inputs=[text_in, gr.State('ko')], # μ–Έμ–΄λŠ” κΈ°λ³Έ ν•œκ΅­μ–΄λ‘œ κ³ μ • (λ‹¨μˆœν™”)
294
- outputs=[gr.State(), answer_box, gr.State(), ref_box]
295
- )
296
 
297
  if __name__ == "__main__":
298
  demo.launch(share=True)
 
1
  # =========================================================
2
+ # KB AI Challenge - Professional RAG System (Multilingual)
 
 
 
 
3
  # =========================================================
4
 
5
  import os
6
  import sys
7
  import numpy as np
8
  import traceback
9
+ import fitz # PyMuPDF
10
  from typing import List
11
 
12
  # --- 라이브러리 임포트 ---
13
  import gradio as gr
14
  import speech_recognition as sr
15
+ from dotenv import load_dotenv
16
+
17
+ # .env 로드
18
+ load_dotenv()
19
+
20
  from deep_translator import GoogleTranslator
21
  from sentence_transformers import SentenceTransformer
22
  from groq import Groq
23
  from qdrant_client import QdrantClient
24
  from qdrant_client.models import Distance, VectorParams, PointStruct
25
+
26
  try:
27
  from langchain.text_splitter import RecursiveCharacterTextSplitter
28
  except ImportError:
 
29
  from langchain_text_splitters import RecursiveCharacterTextSplitter
30
 
31
  # =========================================================
32
  # 1. 설정 및 초기화
33
  # =========================================================
34
 
 
35
# --- Configuration ---------------------------------------------------------
# The placeholder default lets the app start without a key; the Groq client
# is simply left as None in that case (see below).
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "your_groq_api_key_here")
EMBEDDING_MODEL_NAME = "jhgan/ko-sroberta-multitask"
GROQ_MODEL_NAME = "llama-3.3-70b-versatile"
COLLECTION_NAME = "local_kb"

print("πŸ› οΈ μ‹œμŠ€ν…œ μ΄ˆκΈ°ν™” 쀑... (System Init)")

# --- Embedding model -------------------------------------------------------
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
embedding_model.max_seq_length = 512

# Ask the model for its output width instead of hard-coding it, so swapping
# EMBEDDING_MODEL_NAME cannot silently create a collection of the wrong size.
# 768 is kept as the fallback because it is the value the code used before.
EMBEDDING_DIM = embedding_model.get_sentence_embedding_dimension() or 768

# --- Qdrant (in-memory: contents are lost when the process exits) -----------
qdrant_client = QdrantClient(":memory:")
try:
    # recreate_collection() is deprecated in qdrant-client; the explicit
    # exists/delete/create sequence is the documented replacement.
    if qdrant_client.collection_exists(COLLECTION_NAME):
        qdrant_client.delete_collection(COLLECTION_NAME)
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=EMBEDDING_DIM, distance=Distance.COSINE),
    )
    print("βœ… Qdrant Collection Ready.")
except Exception as e:
    # Best effort: the UI still starts, uploads/searches will report errors.
    print(f"❌ Qdrant Error: {e}")

# --- Groq client -----------------------------------------------------------
# Stays None when no real key is configured; generate_answer_groq() checks
# for that and returns a user-facing message instead of raising.
groq_client = None
if GROQ_API_KEY and GROQ_API_KEY != "your_groq_api_key_here":
    try:
        groq_client = Groq(api_key=GROQ_API_KEY)
    except Exception as e:
        print(f"❌ Groq Error: {e}")
else:
    print("⚠️ Groq API Key Missing.")

# Monotonic point-id counter shared by all uploads in this process.
doc_id_counter = 0

print("βœ… System Ready.")
70
+
71
+
72
+ # =========================================================
73
+ # 2. 다국어 지원 로직 (Translation & STT)
74
+ # =========================================================
75
+
76
# UI label -> translator language code ("code") and speech-recognition
# locale ("stt"). The keys are the exact strings shown in the language
# dropdown, so they must not be altered independently of the UI.
LANG_MAP = {
    "ν•œκ΅­μ–΄ (Korean)": {"code": "ko", "stt": "ko-KR"},
    "English (μ˜μ–΄)": {"code": "en", "stt": "en-US"},
    "ζ—₯本θͺž (Japanese)": {"code": "ja", "stt": "ja-JP"},
    "δΈ­ζ–‡ (Chinese)": {"code": "zh-CN", "stt": "zh-CN"}
}


def translate_text(text, target_lang_code):
    """Translate ``text`` into ``target_lang_code``, best effort.

    Args:
        text: Text to translate (the pipeline passes Korean answers here).
        target_lang_code: Translator language code such as ``"en"``.

    Returns:
        The translated text, or ``text`` unchanged when the target is
        Korean, the input is blank, or the translation fails.
    """
    # Korean is the pipeline's working language: nothing to do.
    if target_lang_code == "ko":
        return text
    # Skip the network round trip for empty / whitespace-only input.
    if not text or (isinstance(text, str) and not text.strip()):
        return text
    try:
        translated = GoogleTranslator(source='auto', target=target_lang_code).translate(text)
    except Exception:
        # Deliberate best effort, but leave a trace instead of hiding the
        # failure (and do not swallow KeyboardInterrupt/SystemExit).
        traceback.print_exc()
        return text
    # The translator may yield None/"" for some inputs; fall back to the source.
    return translated or text
89
 
90
def translate_to_korean(text):
    """Translate ``text`` (source language auto-detected) into Korean.

    Args:
        text: User input in any supported language.

    Returns:
        The Korean translation, or ``text`` unchanged when the input is
        blank or the translation fails.
    """
    # Skip the network round trip for empty / whitespace-only input.
    if not text or (isinstance(text, str) and not text.strip()):
        return text
    try:
        translated = GoogleTranslator(source='auto', target='ko').translate(text)
    except Exception:
        # Best effort: keep the chat usable, but record why translation failed
        # (and do not swallow KeyboardInterrupt/SystemExit).
        traceback.print_exc()
        return text
    # The translator may yield None/"" for some inputs; fall back to the source.
    return translated or text
95
 
96
  # =========================================================
97
+ # 3. 핵심 로직 (RAG Pipeline)
98
  # =========================================================
99
 
100
def process_uploaded_files(files):
    """Extract text from uploaded PDFs, embed it and store it in Qdrant.

    Args:
        files: Uploaded items from the file component; each is either an
            object with a ``name`` attribute holding the path, or the path
            itself.

    Returns:
        A status string: a summary line followed by one line per file
        (stored, no text found, or error).
    """
    global doc_id_counter
    if not files:
        return "파일이 μ„ νƒλ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€."

    total_chunks = 0
    status_lines = []
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50, length_function=len)

    for file in files:
        file_path = file.name if hasattr(file, 'name') else file
        # Computed up front (and via str()) so the error branch below can
        # always name the file without raising a second exception.
        display_name = os.path.basename(str(file_path))
        try:
            # Context manager closes the PDF handle even if extraction fails.
            with fitz.open(file_path) as doc:
                file_text = "".join(page.get_text() for page in doc)

            if not file_text.strip():
                status_lines.append(f"⚠️ {display_name}: ν…μŠ€νŠΈ μ—†μŒ.\n")
                continue

            chunks = text_splitter.split_text(file_text)
            if not chunks:
                continue

            # One batched encode call instead of one model call per chunk.
            vectors = embedding_model.encode(chunks)
            points = [
                PointStruct(
                    id=doc_id_counter + offset,
                    vector=vector.tolist(),
                    payload={"filename": display_name, "text": chunk},
                )
                for offset, (chunk, vector) in enumerate(zip(chunks, vectors))
            ]

            qdrant_client.upsert(collection_name=COLLECTION_NAME, points=points)
            # Advance the id counter only once the points are actually stored.
            doc_id_counter += len(points)
            total_chunks += len(points)
            status_lines.append(f"βœ… {display_name} ({len(points)} 개 μ €μž₯됨)\n")

        except Exception as e:
            # One bad file must not abort the rest of the batch.
            status_lines.append(f"❌ 였λ₯˜: {display_name} - {str(e)}\n")

    status_msg = "".join(status_lines)
    return f"총 {total_chunks}개 데이터 처리 μ™„λ£Œ.\n\n{status_msg}"
 
 
 
 
137
 
138
def search_knowledge_base(query, top_k=5):
    """Return the ``top_k`` stored chunks most similar to ``query``.

    Args:
        query: Search text (the pipeline passes the Korean query).
        top_k: Maximum number of hits to return.

    Returns:
        The list of scored points from Qdrant (payload included), or an
        empty list when the query is blank or the search fails.
    """
    # A blank query carries no signal; skip the embedding and the lookup.
    if not query or not query.strip():
        return []
    try:
        query_vector = embedding_model.encode(query).tolist()
        res = qdrant_client.query_points(
            collection_name=COLLECTION_NAME, query=query_vector, limit=top_k, with_payload=True
        )
    except Exception:
        # Callers treat [] as "nothing found"; log the real cause so a broken
        # collection or model is not mistaken for an empty knowledge base.
        traceback.print_exc()
        return []
    return res.points
147
 
148
def generate_answer_groq(query, context_text):
    """Generate an answer to ``query`` from ``context_text`` via Groq.

    Args:
        query: The user's question.
        context_text: Retrieved passages the model is told to rely on.

    Returns:
        The model's reply text, a fixed notice when no Groq client is
        configured, or an error message string if the request fails.
    """
    # No client means no API key was configured at start-up.
    if not groq_client:
        return "API ν‚€κ°€ ν•„μš”ν•©λ‹ˆλ‹€."

    # NOTE(review): the leading whitespace inside this prompt could not be
    # recovered from the diff view — confirm against the real file.
    system_prompt = """
    당신은 KB 금육그룹의 μ „λ¬Έ AI μ–΄μ‹œμŠ€ν„΄νŠΈμž…λ‹ˆλ‹€.
    제곡된 [λ¬Έλ§₯]에 κΈ°λ°˜ν•˜μ—¬ μ§ˆλ¬Έμ— λŒ€ν•΄ μ •ν™•ν•˜κ³  전문적인 닡변을 μž‘μ„±ν•˜μ„Έμš”.
    λͺ¨λ₯΄λŠ” λ‚΄μš©μ€ λͺ¨λ₯Έλ‹€κ³  λ‹΅ν•˜κ³ , μΆ”μΈ‘ν•˜μ§€ λ§ˆμ„Έμš”.
    닡변은 ν•œκ΅­μ–΄λ‘œ μž‘μ„±ν•˜μ„Έμš”.
    """
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"질문: {query}\n\n[λ¬Έλ§₯]\n{context_text}"},
    ]

    try:
        completion = groq_client.chat.completions.create(
            messages=chat_messages,
            model=GROQ_MODEL_NAME,
            temperature=0.1,
        )
        # Kept inside the try so a malformed response is reported the same
        # way as a failed request.
        return completion.choices[0].message.content
    except Exception as err:
        return f"응닡 생성 였λ₯˜: {err}"
166
 
167
def run_rag_chat(message, history, lang_selection):
    """Handle one chat turn: translate, retrieve, answer, translate back.

    Args:
        message: The user's text in the selected language.
        history: Existing chat history as a list of role/content dicts
            (``None`` is treated as empty).
        lang_selection: A key of ``LANG_MAP``; unknown values fall back to
            Korean, which is also the UI default.

    Returns:
        A ``(textbox_value, history, reference_text)`` tuple. The textbox
        value is always ``""`` so the input box is cleared.
    """
    # Work on a copy so the caller's list is never mutated, and tolerate a
    # chatbot component that has not produced a value yet.
    history = list(history) if history else []

    # Ignore empty and whitespace-only submissions.
    if not message or not message.strip():
        return "", history, ""

    lang_entry = LANG_MAP.get(lang_selection)
    target_lang = lang_entry["code"] if lang_entry else "ko"

    # 1. Input translation (selected language -> Korean).
    korean_query = message
    if target_lang != "ko":
        korean_query = translate_to_korean(message)

    # 2. Retrieval and answer generation (in Korean).
    hits = search_knowledge_base(korean_query)
    if not hits:
        bot_response_ko = "μ£„μ†‘ν•©λ‹ˆλ‹€. κ΄€λ ¨ 정보λ₯Ό 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
        reference_text = "μ°Έκ³  λ¬Έμ„œ μ—†μŒ"
    else:
        context_text = "\n\n".join(h.payload['text'] for h in hits)

        # Group hit scores per source file so each file is listed once.
        scores_by_file = {}
        for h in hits:
            scores_by_file.setdefault(h.payload['filename'], []).append(h.score)

        reference_text = "\n".join(
            f"- {fname} (κ΄€λ ¨ λ‚΄μš© {len(scores)}건, 졜고 μœ μ‚¬λ„: {max(scores):.2f})"
            for fname, scores in scores_by_file.items()
        )
        bot_response_ko = generate_answer_groq(korean_query, context_text)

    # 3. Answer translation (Korean -> selected language); the Korean
    # original is appended below the translation.
    final_response = bot_response_ko
    if target_lang != "ko":
        translated_response = translate_text(bot_response_ko, target_lang)
        final_response = f"{translated_response}\n\n---\n[ν•œκ΅­μ–΄ 원문]\n{bot_response_ko}"

    # Append the turn in role/content ("messages") format.
    history.extend([
        {"role": "user", "content": message},
        {"role": "assistant", "content": final_response},
    ])
    return "", history, reference_text
 
210
 
211
def voice_to_text_chat(audio, history, lang_selection):
    """Transcribe a recording and feed the text through ``run_rag_chat``.

    Args:
        audio: ``(sample_rate, samples)`` from the audio component, or
            ``None`` when nothing was recorded.
        history: Current chat history, passed through to ``run_rag_chat``.
        lang_selection: A key of ``LANG_MAP``; selects the recognition
            locale and the chat language.

    Returns:
        The ``run_rag_chat`` result on success; otherwise
        ``("", history, message)`` where ``message`` explains what went
        wrong.
    """
    if audio is None:
        return "", history, "μŒμ„± μž…λ ₯ μ—†μŒ"

    try:
        # Looked up inside the try so an unknown selection is reported in the
        # UI instead of crashing the event handler.
        stt_lang = LANG_MAP[lang_selection]["stt"]

        sample_rate, audio_numpy = audio
        samples = np.asarray(audio_numpy)
        source_dtype = samples.dtype

        # Down-mix multi-channel audio (channels on axis 1) to mono.
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        if np.issubdtype(source_dtype, np.floating):
            # Float PCM is nominally in [-1, 1]; clip so overshoot cannot wrap
            # around when cast to int16.
            samples = np.clip(samples, -1.0, 1.0) * 32767
        elif source_dtype != np.int16:
            # Other integer widths: rescale to the int16 range rather than
            # reinterpreting their bytes as 16-bit samples.
            samples = samples.astype(np.float64) / np.iinfo(source_dtype).max * 32767
        samples = samples.astype(np.int16)

        # 2 = sample width in bytes (16-bit PCM).
        audio_data = sr.AudioData(samples.tobytes(), sample_rate, 2)
        r = sr.Recognizer()

        # Recognise speech in the selected language.
        text = r.recognize_google(audio_data, language=stt_lang)

        # Hand the transcript to the regular chat pipeline.
        return run_rag_chat(text, history, lang_selection)

    except sr.UnknownValueError:
        return "", history, "μŒμ„±μ„ 이해할 수 μ—†μŠ΅λ‹ˆλ‹€."
    except Exception as e:
        return "", history, f"였λ₯˜: {e}"
235
 
236
  # =========================================================
237
+ # 4. UI Layout (Clean Professional Korean)
238
  # =========================================================
239
 
240
# Gradio UI: theme, layout and event wiring for the chat application.
#
# NOTE(review): this block was reconstructed from a diff view in which
# indentation was lost. The code tokens are unchanged, but which components
# sit inside gr.Group()/gr.Row() and the whitespace inside the triple-quoted
# strings are best guesses — confirm against the real file.

theme = gr.themes.Soft(
    primary_hue="amber",
    neutral_hue="slate",
    font=[gr.themes.GoogleFont("Noto Sans KR"), "sans-serif"]
)

# Hides the page footer and removes the container's minimum height.
css = """
footer {visibility: hidden !important;}
.gradio-container {min-height: 0px !important;}
"""

with gr.Blocks(theme=theme, title="KB AI Challenge", css=css) as demo:

    with gr.Row():
        # --- LEFT SIDEBAR ---
        with gr.Column(scale=1, min_width=300, variant="panel"):
            gr.Markdown("## KB AI Challenge")
            gr.Markdown("**λ‹€κ΅­μ–΄ 금육 AI μ–΄μ‹œμŠ€ν„΄νŠΈ**")

            with gr.Group():
                # Choices are the LANG_MAP keys; the handlers look the
                # selected label up in LANG_MAP.
                lang_dropdown = gr.Dropdown(
                    choices=list(LANG_MAP.keys()),
                    value="ν•œκ΅­μ–΄ (Korean)",
                    label="μ–Έμ–΄ μ„€μ •",
                    interactive=True
                )

            file_input = gr.File(label="지식 베이슀 (PDF)", file_count="multiple", file_types=[".pdf"])
            with gr.Row():
                upload_btn = gr.Button("μ—…λ‘œλ“œ 및 뢄석", variant="primary", size="sm")
            upload_status = gr.Textbox(show_label=False, placeholder="μƒνƒœ λŒ€κΈ° 쀑...", interactive=False, lines=1, max_lines=1)

            gr.Markdown("### μŒμ„± λŒ€ν™”")
            # type="numpy" delivers (sample_rate, ndarray), the shape
            # voice_to_text_chat unpacks.
            audio_input = gr.Audio(sources=["microphone"], type="numpy", label="μŒμ„± μž…λ ₯", show_label=False)

            with gr.Accordion("μ‹œμŠ€ν…œ μ•„ν‚€ν…μ²˜", open=False):
                gr.Markdown(
                    """
                    **μ΅œμ ν™” λ‚΄μ—­**
                    1. **STT**: Google Speech API
                    2. **λ²ˆμ—­**: Google Translate API
                    3. **LLM**: Groq LPU (Llama 3)
                    """
                )

        # --- RIGHT MAIN ---
        with gr.Column(scale=3):
            # chatbot (Messages format): handlers return role/content dicts.
            chatbot = gr.Chatbot(label="λŒ€ν™”", height=500, show_label=False)

            # References
            gr.Markdown("**μ°Έκ³  λ¬Έμ„œ**")
            ref_output = gr.Textbox(show_label=False, interactive=False, lines=3, max_lines=5, placeholder="κ΄€λ ¨ λ¬Έμ„œκ°€ ν‘œμ‹œλ©λ‹ˆλ‹€.")

            # Input Area
            with gr.Row():
                msg = gr.Textbox(
                    scale=6,
                    show_label=False,
                    placeholder="μ§ˆλ¬Έμ„ μž…λ ₯ν•˜μ„Έμš”...",
                    container=False
                )
                submit_btn = gr.Button("전솑", scale=1, variant="primary")

    # --- Event Handlers ---
    upload_btn.click(process_uploaded_files, inputs=[file_input], outputs=[upload_status])

    # Enter key and the send button trigger the same handler with the same
    # inputs/outputs.
    msg.submit(run_rag_chat, [msg, chatbot, lang_dropdown], [msg, chatbot, ref_output])
    submit_btn.click(run_rag_chat, [msg, chatbot, lang_dropdown], [msg, chatbot, ref_output])

    # Runs when a microphone recording stops.
    audio_input.stop_recording(voice_to_text_chat, [audio_input, chatbot, lang_dropdown], [msg, chatbot, ref_output])

if __name__ == "__main__":
    demo.launch(share=True)