dohyune committed on
Commit
bcef05a
·
verified ·
1 Parent(s): e77a206

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +213 -269
app.py CHANGED
@@ -1,6 +1,5 @@
1
  """
2
- PLOBIN - 문서 속 답을 찾아주는 AI 비서
3
-
4
  """
5
  import streamlit as st
6
  import streamlit.components.v1 as components
@@ -17,65 +16,42 @@ from typing import List, Dict, Tuple
17
  import base64
18
  from dotenv import load_dotenv
19
  import json
20
-
21
- import base64
22
 
23
  def get_svg_content(svg_path):
24
  with open(svg_path, "r", encoding="utf-8") as f:
25
  return f.read()
26
 
27
- # 파일 상단에서 한 번만 로드
28
  plobin_logo_svg = get_svg_content("img/plobin.svg")
29
 
30
- # 환경 변수 로드
31
  load_dotenv()
32
 
33
- # Configuration
34
  GROK_API_KEY = os.getenv("GROK_API_KEY")
35
  GROK_API_BASE = "https://api.x.ai/v1"
36
  CHROMA_DIR = "./chroma_db"
37
  EMBEDDING_MODEL = 'jhgan/ko-sroberta-multitask'
38
 
39
- # ==================== 하이라이트 설정 클래스 ====================
40
  class HighlightConfig:
41
- """하이라이트 설정 - 노란색 고정"""
42
-
43
  def __init__(self):
44
- # 하이라이트 색상 - 노란색 고정
45
- self.color = [1.0, 1.0, 0.0] # ๋…ธ๋ž€์ƒ‰ (RGB 0-1 ๋ฒ”์œ„)
46
 
47
- # Page config
48
  st.set_page_config(
49
  page_title="PLOBIN",
50
- page_icon="๐Ÿ”ฎ",
51
  layout="wide",
52
  initial_sidebar_state="expanded"
53
  )
54
 
55
- # 사이드바 기존
56
- #667eea 0%,
57
- #764ba2 100%);
58
-
59
- # 사이드바 1안
60
- #5ECFFF 0%,
61
- #B8FF6E 100%);
62
-
63
- # 사이드바 2안
64
- #258CFF 0%,
65
- #0A1E6A 100%);
66
-
67
- # Custom CSS
68
  st.markdown("""
69
  <style>
70
  [data-testid="stSidebar"] {
71
  background: linear-gradient(180deg,
72
- #90B9E8 0%,
73
- #B3BEC9 100%);
74
  box-shadow: 4px 0 30px rgba(0,0,0,0.2);
75
  width: 290px !important;
76
  }
77
 
78
- /* 사이드바 타이틀 빛나는 효과 */
79
  [data-testid="stSidebar"] h1 {
80
  color: white !important;
81
  font-weight: 900 !important;
@@ -87,7 +63,6 @@ st.markdown("""
87
  letter-spacing: 2px;
88
  }
89
 
90
- /* 사이드바 타이틀 애니메이션 */
91
  @keyframes sidebarTitlePulse {
92
  0%, 100% {
93
  transform: scale(1);
@@ -106,7 +81,6 @@ st.markdown("""
106
  }
107
  }
108
 
109
- /* 파일 업로더 배경 투명하게 */
110
  [data-testid="stSidebar"] [data-testid="stFileUploader"] {
111
  background: rgba(255,255,255,0.15);
112
  border-radius: 15px;
@@ -116,37 +90,34 @@ st.markdown("""
116
  backdrop-filter: blur(10px);
117
  }
118
 
119
- /* 파일 업로더 내부 섹션도 투명하게 */
120
  [data-testid="stFileUploader"] > section {
121
  background: transparent !important;
122
  }
123
 
124
- /* 파일 업로더 드래그 영역 */
125
  [data-testid="stFileUploader"] > section > div {
126
  background: transparent !important;
127
  }
128
 
129
- /* 업로드된 파일 표시 영역 */
130
  [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] {
131
  color: #fafafa;
132
  }
133
- /* 하얀 박스(드롭존) 스타일 */
134
  [data-testid="stSidebar"] [data-testid="stFileUploader"] > section,
135
  [data-testid="stSidebar"] [data-testid="stFileUploader"] section > div {
136
  background: transparent !important;
137
  border: none !important;
138
  }
139
- /* 드롭존 내부 텍스트 색상 */
140
  [data-testid="stSidebar"] [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] p {
141
  color: rgba(255,255,255,0.9) !important;
142
  }
143
- /* "파일 찾기" 버튼 */
144
  [data-testid="stSidebar"] [data-testid="stFileUploader"] button[kind="secondary"] {
145
  background: rgba(255,255,255,0.2) !important;
146
  color: white !important;
147
  border: 1px solid rgba(255,255,255,0.3) !important;
148
  }
149
- /* 사이드바 버튼 스타일 업데이트 */
150
  [data-testid="stSidebar"] .stButton button {
151
  background: rgba(255,255,255,0.15) !important;
152
  color: white !important;
@@ -158,48 +129,53 @@ st.markdown("""
158
  transition: all 0.3s ease !important;
159
  box-shadow: 0 4px 15px rgba(0,0,0,0.1) !important;
160
  }
 
161
  [data-testid="stSidebar"] .stButton button:hover {
162
  background: rgba(255,255,255,0.25) !important;
163
  border-color: rgba(255,255,255,0.6) !important;
164
  transform: translateY(-2px) scale(1.02) !important;
165
  box-shadow: 0 6px 20px rgba(0,0,0,0.2) !important;
166
  }
 
167
  [data-testid="stSidebar"] .stButton button:active {
168
  transform: translateY(0px) scale(0.98) !important;
169
  }
170
- /* Primary 버튼 (문서 처리 시작) 특별 스타일 */
171
  [data-testid="stSidebar"] .stButton button[kind="primary"] {
172
  background: rgba(255,255,255,0.25) !important;
173
  border: 2px solid rgba(255,255,255,0.5) !important;
174
  font-size: 1.05rem !important;
175
  }
 
176
  [data-testid="stSidebar"] .stButton button[kind="primary"]:hover {
177
  background: rgba(255,255,255,0.35) !important;
178
  border-color: rgba(255,255,255,0.7) !important;
179
  }
 
180
  [data-testid="stSidebar"] [data-testid="stAlert"] {
181
  background-color: rgba(255, 255, 255, 0.001) !important;
182
  border-radius: 0.5rem !important;
183
  }
 
184
  [data-testid="stAlert"] p {
185
- color: rgb(250, 250, 250); /* ํฐ์ƒ‰ */
186
  }
187
- /* 메인 컨텐츠 전체 너비 사용 */
188
  .main .block-container {
189
  max-width: 100%;
190
  padding-left: 2rem;
191
  padding-right: 2rem;
192
  }
193
 
194
- /* 헤더 스타일 - 박스 제거, 텍스트 그림자만 */
195
  .plobin-header {
196
  padding: 1.5rem 2rem;
197
  margin-bottom: 2rem;
198
  }
 
199
  .plobin-logo {
200
  display: block;
201
  margin: 0 auto;
202
- height: 60px; /* ์›ํ•˜๋Š” ํฌ๊ธฐ๋กœ ์กฐ์ • */
203
  }
204
 
205
  .plobin-title {
@@ -211,6 +187,7 @@ st.markdown("""
211
  text-shadow: 2px 2px 8px rgba(0, 0, 0, 0.4),
212
  0 0 20px rgba(102, 126, 234, 0.4);
213
  }
 
214
  .plobin-subtitle {
215
  font-size: 1rem;
216
  color: rgba(255, 255, 255, 0.9);
@@ -219,7 +196,6 @@ st.markdown("""
219
  text-shadow: 1px 1px 6px rgba(0, 0, 0, 0.4);
220
  }
221
 
222
- /* 파일 업로더 커스터마이징 */
223
  [data-testid="stFileUploader"] {
224
  background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
225
  border: 3px dashed #667eea;
@@ -237,17 +213,15 @@ st.markdown("""
237
  font-weight: 600 !important;
238
  }
239
 
240
- /* PDF 컨테이너 */
241
  .pdf-container {
242
  border: 2px solid #E2E8F0;
243
  border-radius: 0.5rem;
244
  padding: 0.5rem;
245
- height: 705px;
246
  overflow-y: auto;
247
  background: white;
248
  }
249
 
250
- /* 채팅 컨테이너 - 스크롤 추가 */
251
  .chat-container {
252
  border: 2px solid #E2E8F0;
253
  border-radius: 0.5rem;
@@ -258,13 +232,11 @@ st.markdown("""
258
  margin-bottom: 0.5rem;
259
  }
260
 
261
- /* 채팅 입력창과 컨테이너 간격 최소화 */
262
  [data-testid="stChatInput"] {
263
  margin-top: 0 !important;
264
  padding-top: 0 !important;
265
  }
266
 
267
- /* 채팅 스타일 */
268
  .source-box {
269
  background: #F1F5F9;
270
  padding: 1rem;
@@ -299,7 +271,6 @@ st.markdown("""
299
  border-left: 4px solid #EAB308;
300
  }
301
 
302
- /* 사용 안내 스타일 */
303
  .usage-guide {
304
  background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
305
  padding: 2rem;
@@ -332,7 +303,6 @@ st.markdown("""
332
  flex-shrink: 0;
333
  }
334
 
335
- /* 뷰어 헤더 스타일 */
336
  .viewer-header {
337
  display: flex;
338
  justify-content: space-between;
@@ -340,7 +310,6 @@ st.markdown("""
340
  margin-bottom: 1rem;
341
  }
342
 
343
- /* 페이지 선택 애니메이션 */
344
  @keyframes pulse {
345
  0%, 100% {
346
  box-shadow: 0 0 0 0 rgba(16, 185, 129, 0.7);
@@ -350,7 +319,6 @@ st.markdown("""
350
  }
351
  }
352
 
353
- /* 채팅 타이틀 스타일 (애니메이션 제거) */
354
  .chat-title {
355
  color: black !important;
356
  font-weight: 900 !important;
@@ -363,7 +331,6 @@ st.markdown("""
363
  letter-spacing: 2px;
364
  }
365
 
366
- /* 채팅 내 페이지 번호 버튼 스타일 - 클릭 가능한 박스처럼 */
367
  [data-testid="column"] button[kind="secondary"] {
368
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
369
  color: white !important;
@@ -408,7 +375,6 @@ st.markdown("""
408
 
409
 
410
  def init_session():
411
- """세션 상태 초기화"""
412
  if 'processed' not in st.session_state:
413
  st.session_state.processed = False
414
  if 'vector_db' not in st.session_state:
@@ -436,13 +402,6 @@ def init_session():
436
 
437
 
438
  def extract_text_from_pdf(pdf_file) -> Tuple[List[str], List[Dict], bytes, Dict]:
439
- """
440
- PDF์—์„œ ํ…์ŠคํŠธ ์ถ”์ถœ
441
-
442
- ์ˆ˜์ • ์‚ฌํ•ญ:
443
- - CHUNK_SIZE: 300 โ†’ 800
444
- - OVERLAP_SIZE: 60 โ†’ 150
445
- """
446
  pdf_bytes = pdf_file.read()
447
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
448
 
@@ -450,10 +409,8 @@ def extract_text_from_pdf(pdf_file) -> Tuple[List[str], List[Dict], bytes, Dict]
450
  metadata_list = []
451
  pages_text = {}
452
 
453
- # ==================== ์ˆ˜์ •๋œ ์ฒญํฌ ์„ค์ • ====================
454
- CHUNK_SIZE = 800 # 300์—์„œ 800์œผ๋กœ ์ฆ๊ฐ€
455
- OVERLAP_SIZE = 150 # 60์—์„œ 150์œผ๋กœ ์ฆ๊ฐ€
456
- # ========================================================
457
 
458
  for page_num in range(len(doc)):
459
  page = doc[page_num]
@@ -504,15 +461,12 @@ def extract_text_from_pdf(pdf_file) -> Tuple[List[str], List[Dict], bytes, Dict]
504
 
505
  @st.cache_resource
506
  def load_embedding_model():
507
- """임베딩 모델 로드"""
508
  return SentenceTransformer(EMBEDDING_MODEL)
509
 
510
 
511
  def create_vector_db(chunks: List[str], metadata_list: List[Dict]):
512
- """벡터 데이터베이스 생성 - 메모리 기반으로 안정성 향상"""
513
  embedder = load_embedding_model()
514
 
515
- # 메모리 기반 ChromaDB 사용 (파일 시스템 문제 회피)
516
  client = chromadb.EphemeralClient(
517
  settings=chromadb.Settings(
518
  anonymized_telemetry=False,
@@ -520,7 +474,6 @@ def create_vector_db(chunks: List[str], metadata_list: List[Dict]):
520
  )
521
  )
522
 
523
- # 컬렉션 생성
524
  try:
525
  client.delete_collection("rfx_docs")
526
  except Exception:
@@ -531,7 +484,6 @@ def create_vector_db(chunks: List[str], metadata_list: List[Dict]):
531
  metadata={"hnsw:space": "cosine"}
532
  )
533
 
534
- # 배치 임베딩
535
  batch_size = 32
536
  all_embeddings = []
537
 
@@ -551,57 +503,32 @@ def create_vector_db(chunks: List[str], metadata_list: List[Dict]):
551
  return collection, embedder
552
 
553
 
554
- # ==================== ์˜๋ฏธ ๊ธฐ๋ฐ˜ ํ‚ค์›Œ๋“œ ์ถ”์ถœ (ํ•˜๋“œ์ฝ”๋”ฉ ์ œ๊ฑฐ) ====================
555
  def extract_keywords_semantic(text: str, embedder, top_n: int = 5) -> List[str]:
556
- """
557
- ์˜๋ฏธ ๊ธฐ๋ฐ˜ ํ‚ค์›Œ๋“œ ์ถ”์ถœ - ๋ถˆ์šฉ์–ด ๋ฆฌ์ŠคํŠธ ๋ถˆํ•„์š”
558
-
559
- ์›๋ฆฌ:
560
- 1. ์ˆซ์ž ํฌํ•จ ๋‹จ์–ด๋Š” ๋ฌด์กฐ๊ฑด ์ค‘์š”ํ•˜๊ฒŒ ์ทจ๊ธ‰
561
- 2. ์›๋ณธ ํ…์ŠคํŠธ์˜ ์˜๋ฏธ์™€ ๊ฐ ๋‹จ์–ด์˜ ์˜๋ฏธ ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ
562
- 3. ์œ ์‚ฌ๋„ ร— ๋นˆ๋„์ˆ˜๋กœ ์ ์ˆ˜ ์‚ฐ์ •
563
- 4. ์ ์ˆ˜์ˆœ์œผ๋กœ ์ƒ์œ„ ํ‚ค์›Œ๋“œ ์ถ”์ถœ
564
- """
565
- # 1. ์ˆซ์ž ํฌํ•จ ๋‹จ์–ด๋Š” ๋ฌด์กฐ๊ฑด ํฌํ•จ (๊ธˆ์•ก, ๋‚ ์งœ, ์ˆ˜๋Ÿ‰ ๋“ฑ)
566
  words_with_numbers = re.findall(r'[๊ฐ€-ํžฃ]*\d+[๊ฐ€-ํžฃ]*', text)
567
-
568
- # 2. ๋ช…์‚ฌ๊ตฌ ์ถ”์ถœ (2๊ธ€์ž ์ด์ƒ)
569
  candidate_words = re.findall(r'[๊ฐ€-ํžฃ]{2,}', text)
570
 
571
  if not candidate_words:
572
  return words_with_numbers[:top_n]
573
 
574
  word_freq = Counter(candidate_words)
575
-
576
- # 3. ์›๋ณธ ํ…์ŠคํŠธ์™€ ๊ฐ ๋‹จ์–ด์˜ ์˜๋ฏธ ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ
577
  text_embedding = embedder.encode([text], convert_to_numpy=True)[0]
578
  word_embeddings = embedder.encode(list(word_freq.keys()), convert_to_numpy=True)
579
-
580
- # ์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ
581
  similarities = util.cos_sim(text_embedding, word_embeddings)[0].numpy()
582
 
583
- # 4. ์ ์ˆ˜ = ์˜๋ฏธ์œ ์‚ฌ๋„ ร— ๋นˆ๋„์ˆ˜ (๋นˆ๋„๋Š” ๋กœ๊ทธ ์Šค์ผ€์ผ)
584
  scored_words = []
585
  for idx, (word, freq) in enumerate(word_freq.items()):
586
- # ์˜๋ฏธ ์œ ์‚ฌ๋„ 70% + ๋นˆ๋„ 30%
587
  semantic_score = similarities[idx]
588
- frequency_score = np.log1p(freq) / 10.0 # ๋นˆ๋„์— ๋กœ๊ทธ ์ ์šฉ ํ›„ ์ •๊ทœํ™”
589
-
590
  combined_score = 0.7 * semantic_score + 0.3 * frequency_score
591
  scored_words.append((word, combined_score))
592
 
593
- # 5. ์ ์ˆ˜์ˆœ ์ •๋ ฌ
594
  scored_words.sort(key=lambda x: x[1], reverse=True)
595
 
596
- # 6. ๊ฒฐ๊ณผ ์กฐํ•ฉ: ์ˆซ์ž ํฌํ•จ ๋‹จ์–ด ์šฐ์„  + ์˜๋ฏธ ์ ์ˆ˜ ๋†’์€ ๋‹จ์–ด
597
  result = []
598
-
599
- # ์ˆซ์ž ํฌํ•จ ๋‹จ์–ด ๋จผ์ € ์ถ”๊ฐ€ (์ตœ๋Œ€ 3๊ฐœ)
600
  for word in words_with_numbers[:3]:
601
  if word and word not in result:
602
  result.append(word)
603
 
604
- # ๋‚˜๋จธ์ง€๋ฅผ ์˜๋ฏธ ์ ์ˆ˜๋กœ ์ฑ„์›€
605
  for word, score in scored_words:
606
  if word not in result:
607
  result.append(word)
@@ -611,28 +538,22 @@ def extract_keywords_semantic(text: str, embedder, top_n: int = 5) -> List[str]:
611
  return result[:top_n]
612
 
613
 
614
- # ==================== ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰ ํ•จ์ˆ˜ (์˜๋ฏธ ๊ธฐ๋ฐ˜ ํ‚ค์›Œ๋“œ ์‚ฌ์šฉ) ====================
615
  def hybrid_search(query: str, collection, embedder, top_k: int = 3) -> Dict:
616
- """ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰: ๋ฒกํ„ฐ ์œ ์‚ฌ๋„ + ์˜๋ฏธ ๊ธฐ๋ฐ˜ ํ‚ค์›Œ๋“œ ๋งค์นญ"""
617
- # 1. ๋ฒกํ„ฐ ๊ฒ€์ƒ‰
618
  query_embedding = embedder.encode([query], convert_to_numpy=True)[0]
619
  vector_results = collection.query(
620
  query_embeddings=[query_embedding.tolist()],
621
- n_results=20, # ๋งŽ์ด ๊ฐ€์ ธ์™€์„œ ํ‚ค์›Œ๋“œ๋กœ ํ•„ํ„ฐ๋ง
622
  include=["documents", "metadatas", "distances"]
623
  )
624
 
625
- # 2. ์˜๋ฏธ ๊ธฐ๋ฐ˜ ํ‚ค์›Œ๋“œ ์ถ”์ถœ (ํ•˜๋“œ์ฝ”๋”ฉ ์ œ๊ฑฐ)
626
  keywords = extract_keywords_semantic(query, embedder, top_n=5)
627
 
628
- # 3. ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ์ ์ˆ˜ ๊ณ„์‚ฐ
629
  hybrid_results = []
630
  for i, doc_id in enumerate(vector_results['ids'][0]):
631
  doc = vector_results['documents'][0][i]
632
  metadata = vector_results['metadatas'][0][i]
633
- vector_score = 1 - vector_results['distances'][0][i] # ๊ฑฐ๋ฆฌ๋ฅผ ์œ ์‚ฌ๋„๋กœ ๋ณ€ํ™˜
634
 
635
- # ํ‚ค์›Œ๋“œ ๋งค์นญ ์ ์ˆ˜
636
  keyword_score = 0
637
  doc_lower = doc.lower()
638
  for keyword in keywords:
@@ -640,7 +561,6 @@ def hybrid_search(query: str, collection, embedder, top_k: int = 3) -> Dict:
640
  keyword_score += 1
641
  keyword_score = keyword_score / len(keywords) if keywords else 0
642
 
643
- # ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ์ ์ˆ˜ (๋ฒกํ„ฐ 70% + ํ‚ค์›Œ๋“œ 30%)
644
  hybrid_score = 0.7 * vector_score + 0.3 * keyword_score
645
 
646
  hybrid_results.append({
@@ -652,7 +572,6 @@ def hybrid_search(query: str, collection, embedder, top_k: int = 3) -> Dict:
652
  'keyword_score': keyword_score
653
  })
654
 
655
- # 4. ์ ์ˆ˜์ˆœ ์ •๋ ฌ ํ›„ ์ƒ์œ„ k๊ฐœ
656
  hybrid_results.sort(key=lambda x: x['hybrid_score'], reverse=True)
657
  top_results = hybrid_results[:top_k]
658
 
@@ -664,13 +583,10 @@ def hybrid_search(query: str, collection, embedder, top_k: int = 3) -> Dict:
664
  }
665
 
666
 
667
- # ==================== Grok API ์ ๊ฒ€ ํ•จ์ˆ˜ ====================
668
  def grok_verify_and_extract(query: str, search_results: Dict, api_key: str) -> Dict:
669
- """Grok API๋กœ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ ์ ๊ฒ€ ๋ฐ ์ตœ์ข… 1๊ฐœ๋งŒ ์„ ํƒ"""
670
  docs = search_results['documents'][0]
671
  metas = search_results['metadatas'][0]
672
 
673
- # ๋ฌธ์„œ๋“ค์„ ๋ฒˆํ˜ธ์™€ ํ•จ๊ป˜ ํฌ๋งทํŒ…
674
  formatted_docs = []
675
  for i, (doc, meta) in enumerate(zip(docs, metas), 1):
676
  formatted_docs.append(f"[๋ฌธ์„œ {i}] (ํŽ˜์ด์ง€ {meta['page']})\n{doc}")
@@ -734,9 +650,6 @@ def grok_verify_and_extract(query: str, search_results: Dict, api_key: str) -> D
734
 
735
  result = response.json()
736
  content = result["choices"][0]["message"]["content"]
737
-
738
- # JSON ํŒŒ์‹ฑ
739
- # markdown ์ฝ”๋“œ ๋ธ”๋ก ์ œ๊ฑฐ
740
  content = content.replace("```json", "").replace("```", "").strip()
741
  extracted_data = json.loads(content)
742
 
@@ -747,7 +660,6 @@ def grok_verify_and_extract(query: str, search_results: Dict, api_key: str) -> D
747
 
748
 
749
  def build_context(search_results: Dict, max_length: int = 3000) -> str:
750
- """컨텍스트 구성"""
751
  context_parts = []
752
  current_length = 0
753
 
@@ -772,7 +684,6 @@ def build_context(search_results: Dict, max_length: int = 3000) -> str:
772
 
773
 
774
  def generate_answer(query: str, search_results: Dict, api_key: str) -> str:
775
- """답변 생성"""
776
  context = build_context(search_results, max_length=4000)
777
 
778
  system_prompt = """๋‹น์‹ ์€ ์ž๋™์ฐจ ์ œ์กฐ์—… RFx ๋ฌธ์„œ ์ „๋ฌธ ๋ถ„์„๊ฐ€์ž…๋‹ˆ๋‹ค.
@@ -781,49 +692,51 @@ def generate_answer(query: str, search_results: Dict, api_key: str) -> str:
781
  2. **์–ธ์–ด ํ˜ผ์šฉ ๋ฐ ๋น„๋ฌธ ๋Œ€์‘**: ์‚ฌ์šฉ์ž์˜ ๋ฌธ์žฅ์€ ํ•œ๊ตญ์–ด์™€ ์˜์–ด๊ฐ€ ์„ž์ด๊ฑฐ๋‚˜ ๋ฌธ๋ฒ• ์˜ค๋ฅ˜๊ฐ€ ์žˆ์„ ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ ์˜๋„๋ฅผ ์ถ”๋ก ํ•˜์—ฌ ์ •ํ™•ํžˆ ์ดํ•ดํ•˜๋ผ.
782
  3. **๋ชจํ˜ธํ•œ ์งˆ๋ฌธ ์ž๋™ ๋ณด์ •**: ์‚ฌ์šฉ์ž์˜ ์งˆ๋ฌธ์ด ๋ถˆ์™„์ „ํ•˜๊ฑฐ๋‚˜ ๋ชจํ˜ธํ•ด๋„ ์งˆ๋ฌธ ์˜๋„๋ฅผ ์ถ”๋ก ํ•˜์—ฌ ์ ์ ˆํ•˜๊ฒŒ ์žฌ๊ตฌ์„ฑํ•˜๋ผ.
783
  **๋ฌธ์„œ ๊ธฐ๋ฐ˜ ์‘๋‹ต ์›์น™ (์ ˆ๋Œ€ ์ถ”์ธก ๊ธˆ์ง€):**
784
- 1. ์ œ๊ณต๋œ ๋ฌธ์„œ๋ฅผ **๋งค์šฐ ๏ฟฝ๊ผผํžˆ** ์ฝ๊ณ  ์ •ํ™•ํ•œ ์ •๋ณด๋ฅผ ์ฐพ์œผ์„ธ์š”
785
  2. **๋ฐ˜๋“œ์‹œ ๋ฌธ์„œ์—์„œ ๊ทผ๊ฑฐ๋ฅผ ์ฐพ์•„ ๋‹ต๋ณ€**ํ•˜๊ณ , ๋ฌธ์„œ์— ์—†๋Š” ๋‚ด์šฉ์€ ์ž„์˜๋กœ ์ถ”์ธกํ•˜์ง€ ๋ง๊ณ  **"๋ฌธ์„œ์—์„œ ๊ด€๋ จ ์ •๋ณด๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"**๋ผ๊ณ  ๋ช…์‹œํ•˜๋ผ
786
  3. **๋ฌธ์„œ์™€ ์ „ํ˜€ ๋ฌด๊ด€ํ•œ ์งˆ๋ฌธ**(์˜ˆ: ์ ์‹ฌ ์ถ”์ฒœ, ๋‚ ์”จ, ์ผ์ƒ ๋Œ€ํ™” ๋“ฑ)์€ **"์ฃ„์†กํ•˜์ง€๋งŒ, ์ œ๊ณต๋œ ๋ฌธ์„œ์—๋Š” ํ•ด๋‹น ์งˆ๋ฌธ๊ณผ ๊ด€๋ จ๋œ ์ •๋ณด๊ฐ€ ํฌํ•จ๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค."**๋ผ๊ณ ๋งŒ ๋‹ต๋ณ€ํ•˜๊ณ  ์ถ”๊ฐ€ ์„ค๋ช… ์—†์ด ์ข…๋ฃŒํ•˜๋ผ
787
  4. ๋ฌธ์„œ์— ์ •๋ณด๊ฐ€ ์žˆ๋Š”๋ฐ๋„ "์—†๋‹ค"๊ณ  ํ•˜์ง€ ๋งˆ์„ธ์š”
788
  **ํ•ต์‹ฌ ์ •๋ณด ์šฐ์„  ์ถ”์ถœ:**
789
  - ๊ธˆ์•ก, ์ˆ˜๋Ÿ‰, ๊ทœ๊ฒฉ, ์ผ์ •, ์š”๊ตฌ์กฐ๊ฑด ๋“ฑ **์ˆ˜์น˜ ๊ธฐ๋ฐ˜ ์ •๋ณด๋ฅผ ์ตœ์šฐ์„ **์œผ๋กœ ์‹๋ณ„ํ•˜๊ณ  ์ •ํ™•ํ•˜๊ฒŒ ๋ฐ˜ํ™˜ํ•˜๋ผ
790
  - ์ˆซ์ž, ๊ธˆ์•ก, ๋‚ ์งœ ๋“ฑ ๊ตฌ์ฒด์ ์ธ ์ •๋ณด๋ฅผ ์šฐ์„ ์ ์œผ๋กœ ์ฐพ์œผ์„ธ์š”
791
- - ์• ๋งคํ•œ ํ‘œํ˜„ ๋Œ€์‹  ๊ตฌ์ฒด์ ์ธ ์ˆ˜์น˜๋ฅผ ์ œ๊ณตํ•˜์„ธ์š”
792
- **๋ฐฉ๋Œ€ํ•œ ๋ฌธ์„œ ์ฒ˜๋ฆฌ (500ํŽ˜์ด์ง€ ๊ฐ€๋Šฅ):**
793
- - ๋ฌธ์„œ๊ฐ€ ๋งค์šฐ ๊ธธ ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ ์งˆ๋ฌธ๊ณผ ์ง์ ‘ ๊ด€๋ จ๋œ ๋ถ€๋ถ„๋งŒ ์„ ๋ณ„ํ•ด ์š”์•ฝํ•˜๊ณ  ํ•ต์‹ฌ ์ •๋ณด๋งŒ ์‚ฌ์šฉํ•˜๋ผ
794
- **์‹ค๋ฌด ๋งฅ๋ฝ ๊ณ ๋ ค (RFx ํ”„๋กœ์„ธ์Šค ํŠนํ™”):**
795
- - ๋‹ต๋ณ€ํ•  ๋•Œ ์‹ค์ œ ์ž๋™์ฐจ RFx ์‹ค๋ฌด์ž๊ฐ€ ์˜์‚ฌ๊ฒฐ์ •์— ์‚ฌ์šฉํ•˜๋Š” ์ •๋ณด๋ผ๋Š” ์ ์„ ๊ณ ๋ คํ•˜์—ฌ ์‹ค๋ฌด ์ค‘์‹ฌ์œผ๋กœ ๋ช…ํ™•ํ•˜๊ฒŒ ์„ค๋ช…ํ•˜๋ผ
796
- - ํŠนํžˆ ๋‹ค์Œ ํ•ญ๋ชฉ๋“ค์„ ์šฐ์„ ์ ์œผ๋กœ ํŒŒ์•…ํ•˜๋ผ:
797
- 1. ์‚ฌ์—… ์ฐธ์—ฌ ์ž๊ฒฉ ๋ฐ ์š”๊ตฌ ์ธ์ฆ
798
- 2. ์‚ฌ์—… ๊ธฐ๊ฐ„ ๋ฐ ์ผ์ •
799
- 3. ์˜ˆ์‚ฐ (ํ˜„๊ธˆ/ํ˜„๋ฌผ ๋น„์ค‘, ์ˆœ์ˆ˜ ํšŒ์‚ฌ ์ˆ˜์ต ๊ฐ€๋Šฅ์„ฑ)
800
- 4. ์ œ์•ˆ์š”์ฒญ์„œ ์‚ฌ์–‘์„œ โ€“ ํ•„์š”ํ•œ ๊ธฐ์ˆ ์  ์š”๊ตฌ์‚ฌํ•ญ(์„œ๋ฒ„/์†Œํ”„ํŠธ์›จ์–ด ๋“ฑ)
801
- 5. ํŒ๋งค ๋Œ€์ƒ ๋ฐ ์‚ฌ์—… ๋ฒ”์œ„
802
  **๋‹ต๋ณ€ ํ˜•์‹:**
803
- - ๋‹ต๋ณ€ ์‹œ ๋ฐ˜๋“œ์‹œ **[ํŽ˜์ด์ง€ X]** ํ˜•ํƒœ๋กœ ์ถœ์ฒ˜๋ฅผ ๋ช…์‹œํ•˜์„ธ์š” (์˜ˆ: [ํŽ˜์ด์ง€ 3], [ํŽ˜์ด์ง€ 5, 12])
804
- - ๊ด€๋ จ ๋ฌธ๋งฅ์„ ์œ ์ง€ํ•˜๋ฉฐ, ๋‹ต๋ณ€์—๋Š” ๋ฌธ์„œ์˜ ํŽ˜์ด์ง€ ๋ฒˆํ˜ธ์™€ ์›๋ฌธ ์ผ๋ถ€๋ฅผ ์ •ํ™•ํžˆ ์ธ์šฉํ•˜๋ผ
805
  - ํ•ต์‹ฌ ๋‹ต๋ณ€์„ ๋จผ์ € ๋ช…ํ™•ํ•˜๊ฒŒ ์ œ์‹œ
806
- - ํ•„์š”์‹œ ์ถ”๊ฐ€ ๊ด€๋ จ ์ •๋ณด ์ œ๊ณต
807
- - ๋ฆฌ์ŠคํŠธ๋Š” - ๋˜๋Š” ๋ฒˆํ˜ธ๋ฅผ ์‚ฌ์šฉ
808
- - ๊ฐ•์กฐ๋Š” **๊ตต๊ฒŒ** ๋˜๋Š” *๊ธฐ์šธ์ž„* ์‚ฌ์šฉ
809
- - **๋‹ต๋ณ€์€ ๋ฐ˜๋“œ์‹œ ๋งˆํฌ๋‹ค์šด๋งŒ ์‚ฌ์šฉํ•ด์•ผ ํ•˜๋ฉฐ, HTML ํƒœ๊ทธ(<div>, <span>, <details>, <summary> ๋“ฑ)๋Š” ์ ˆ๋Œ€ ์‚ฌ์šฉํ•˜์ง€ ๋งˆ์‹ญ์‹œ์˜ค**"""
810
-
811
- user_prompt = f"""๋‹ค์Œ ๋ฌธ์„œ๋“ค์„ **๋งค์šฐ ๊ผผ๊ผผํžˆ** ์ฝ๊ณ  ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•˜์„ธ์š”.
 
 
 
 
 
 
812
  <๋ฌธ์„œ>
813
  {context}
814
  </๋ฌธ์„œ>
 
815
  <์งˆ๋ฌธ>
816
  {query}
817
  </์งˆ๋ฌธ>
818
- **์ค‘์š”**:
819
- - ์งˆ๋ฌธ์ด ๋ฌธ์„œ์™€ ์ „ํ˜€ ๋ฌด๊ด€ํ•œ ๊ฒฝ์šฐ(์˜ˆ: ์ ์‹ฌ ์ถ”์ฒœ, ๋‚ ์”จ, ์ผ์ƒ ๋Œ€ํ™” ๋“ฑ) "์ฃ„์†กํ•˜์ง€๋งŒ, ์ œ๊ณต๋œ ๋ฌธ์„œ์—๋Š” ํ•ด๋‹น ์งˆ๋ฌธ๊ณผ ๊ด€๋ จ๋œ ์ •๋ณด๊ฐ€ ํฌํ•จ๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค."๋ผ๊ณ ๋งŒ ๋‹ต๋ณ€ํ•˜์„ธ์š”
820
- - ๋ฌธ์„œ๋ฅผ ์ฒ˜์Œ๋ถ€ํ„ฐ ๋๊นŒ์ง€ ์ฃผ์˜ ๊นŠ๊ฒŒ ์ฝ์œผ์„ธ์š”
821
- - ์ˆซ์ž, ๊ธˆ์•ก ๋“ฑ ๊ตฌ์ฒด์ ์ธ ์ •๋ณด๋ฅผ ์ฐพ์œผ์„ธ์š”
822
- - ์ฐพ์€ ์ •๋ณด๋Š” ์ •ํ™•ํžˆ ์ธ์šฉํ•˜์„ธ์š”
823
- - ์ถœ์ฒ˜๋Š” ๋ฐ˜๋“œ์‹œ [ํŽ˜์ด์ง€ X] ํ˜•ํƒœ๋กœ ํ‘œ์‹œํ•˜์„ธ์š” (์˜ˆ: [ํŽ˜์ด์ง€ 3])
824
- - ์—ฌ๋Ÿฌ ํŽ˜์ด์ง€์—์„œ ์ •๋ณด๋ฅผ ์ฐพ์€ ๊ฒฝ์šฐ [ํŽ˜์ด์ง€ 3, 5, 12] ํ˜•ํƒœ๋กœ ํ‘œ์‹œํ•˜์„ธ์š”
825
- - ์ •๋ง๋กœ ๋ฌธ์„œ์— ์—†๋Š” ๊ฒฝ์šฐ์—๋งŒ "๋ฌธ์„œ์—์„œ ๊ด€๋ จ ์ •๋ณด๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"๋ผ๊ณ  ํ•˜์„ธ์š”
826
- - ๋งˆํฌ๋‹ค์šด ํ˜•์‹์œผ๋กœ๋งŒ ๋‹ต๋ณ€ํ•˜๊ณ , HTML ํƒœ๊ทธ๋Š” ์ ˆ๋Œ€ ์‚ฌ์šฉํ•˜์ง€ ๋งˆ์„ธ์š”"""
 
 
 
 
 
 
827
 
828
  headers = {
829
  "Content-Type": "application/json",
@@ -857,7 +770,7 @@ def generate_answer(query: str, search_results: Dict, api_key: str) -> str:
857
  except Exception:
858
  error_detail = response.text
859
 
860
- return f"โŒ API ์˜ค๋ฅ˜ (์ฝ”๋“œ: {response.status_code})\n\n{error_detail}"
861
 
862
  result = response.json()
863
  return result["choices"][0]["message"]["content"]
@@ -867,50 +780,145 @@ def generate_answer(query: str, search_results: Dict, api_key: str) -> str:
867
 
868
 
869
  def highlight_text_in_pdf(pdf_bytes: bytes, highlight_info: List[Dict]) -> bytes:
870
- """PDF์— ํ…์ŠคํŠธ ํ•˜์ด๋ผ์ดํŠธ ์ถ”๊ฐ€ - ๋…ธ๋ž€์ƒ‰ ๊ณ ์ •"""
 
 
871
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
872
-
873
- # ๋…ธ๋ž€์ƒ‰ ๊ณ ์ •
874
  yellow_color = [1.0, 1.0, 0.0]
875
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
876
  for item in highlight_info:
877
  page_num = item['page'] - 1
878
- search_text = item['text']
879
 
880
  if page_num >= len(doc):
881
  continue
882
 
883
  page = doc[page_num]
884
 
885
- text_variations = [
886
- search_text,
887
- search_text.replace(' ', ''),
888
- search_text.replace(',', ''),
889
- ]
 
 
 
 
 
 
 
 
 
890
 
891
- for text_var in text_variations:
892
- text_instances = page.search_for(text_var)
 
 
 
893
 
894
- for inst in text_instances:
895
- highlight = page.add_highlight_annot(inst)
 
 
 
 
 
 
 
 
 
 
 
896
  highlight.set_colors(stroke=yellow_color)
897
  highlight.update()
 
 
 
 
 
 
 
 
 
 
 
898
 
899
  output_bytes = doc.tobytes()
900
  doc.close()
901
-
902
  return output_bytes
903
 
904
 
905
- # ==================== Grok ์ถ”์ถœ ๊ฒฐ๊ณผ ๊ธฐ๋ฐ˜ ํ•˜์ด๋ผ์ดํŠธ ====================
906
  def extract_highlights_from_grok(grok_result: Dict) -> List[Dict]:
907
- """Grok API๊ฐ€ ์„ ํƒํ•œ ์ตœ์ข… 1๊ฐœ๋ฅผ ํ•˜์ด๋ผ์ดํŠธ ํ˜•์‹์œผ๋กœ ๋ณ€ํ™˜"""
908
  if "error" in grok_result:
909
  return []
910
 
911
  highlights = []
912
-
913
- # ์ตœ์ข… ์„ ํƒ๋œ 1๊ฐœ๋งŒ ์ฒ˜๋ฆฌ
914
  selected_text = grok_result.get("selected_text", "")
915
  page = grok_result.get("page", 1)
916
 
@@ -923,73 +931,70 @@ def extract_highlights_from_grok(grok_result: Dict) -> List[Dict]:
923
  return highlights
924
 
925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
926
  def render_pdf_with_highlights(pdf_bytes: bytes, highlight_info: List[Dict], zoom_level: float = 2.0):
927
- """ํ•˜์ด๋ผ์ดํŠธ๋œ PDF ๋ Œ๋”๋ง"""
928
  highlighted_pdf = highlight_text_in_pdf(pdf_bytes, highlight_info)
929
-
930
  doc = fitz.open(stream=highlighted_pdf, filetype="pdf")
931
-
932
  highlighted_pages = set(h['page'] for h in highlight_info)
933
 
934
  pdf_html = '<div class="pdf-container" id="pdf-viewer-container">'
935
 
936
  for page_num in range(len(doc)):
937
  page = doc[page_num]
938
-
939
- # zoom_level์„ ์‚ฌ์šฉํ•˜์—ฌ ๋ Œ๋”๋ง
940
  pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
941
  img_data = pix.tobytes("png")
942
  img_base64 = base64.b64encode(img_data).decode()
943
 
944
- # ์‹ค์ œ ์ด๋ฏธ์ง€ ํฌ๊ธฐ ๊ณ„์‚ฐ (zoom_level์— ๋”ฐ๋ผ)
945
- zoom_percentage = int(zoom_level * 50) # 2.0 = 100%, 1.0 = 50%
946
-
947
- # ๊ฐ ํŽ˜์ด์ง€์— ๊ณ ์œ  ID ๋ถ€์—ฌ
948
  page_id = f'page-{page_num + 1}'
949
  pdf_html += f'<div id="{page_id}" style="margin-bottom: 2rem; position: relative;">'
950
 
951
- # ํ•˜์ด๋ผ์ดํŠธ ์—ฌ๋ถ€์— ๋”ฐ๋ผ ํŽ˜์ด์ง€ ํ—ค๋” ์Šคํƒ€์ผ ๋ณ€๊ฒฝ
952
  if (page_num + 1) in highlighted_pages:
953
- # ํ•˜์ด๋ผ์ดํŠธ๊ฐ€ ์žˆ๋Š” ํŽ˜์ด์ง€ - ๋…ธ๋ž€ ๋ฐฐ๊ฒฝ
954
  pdf_html += f'<div style="background: #FEF08A; color: #854D0E; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold; border-left: 4px solid #EAB308;">โญ ํŽ˜์ด์ง€ {page_num + 1}</div>'
955
  else:
956
- # ์ผ๋ฐ˜ ํŽ˜์ด์ง€ - ํŒŒ๋ž€ ๋ฐฐ๊ฒฝ
957
  pdf_html += f'<div style="background: #667eea; color: white; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold;">๐Ÿ“„ ํŽ˜์ด์ง€ {page_num + 1}</div>'
958
 
959
- # width๋ฅผ zoom_percentage๋กœ ๋ณ€๊ฒฝํ•˜์—ฌ ์‹ค์ œ ํ™•๋Œ€/์ถ•์†Œ ์ ์šฉ
960
  pdf_html += f'<img src="data:image/png;base64,{img_base64}" style="width: {zoom_percentage}%; border: 1px solid #E2E8F0; border-radius: 0.3rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); display: block; margin: 0 auto;" />'
961
  pdf_html += '</div>'
962
 
963
  pdf_html += '</div>'
964
-
965
  doc.close()
966
-
967
  return pdf_html
968
 
969
 
970
  def main():
971
  init_session()
972
-
973
 
974
- # Header ๋ฌธ์„œ ์ฒ˜๋ฆฌ ์ „์—๋งŒ ๋ณด์ž„
975
  if not st.session_state.processed:
976
- # ๋กœ๊ณ  ๊ฐ€์šด๋ฐ ์ •๋ ฌ
977
  col1, col2, col3 = st.columns([1, 1, 1])
978
  with col2:
979
  st.image("img/plobin.svg", use_container_width=True)
980
  st.text(' ')
981
-
982
- # ์„œ๋ธŒํƒ€์ดํ‹€
983
- # st.markdown("""
984
- # <div style="text-align: center; margin-top: 10px;">
985
- # ๋ฌธ์„œ ์† ๋‹ต์„ ์ฐพ์•„์ฃผ๋Š” AI ๋น„์„œ
986
- # </div>
987
- # """, unsafe_allow_html=True)
988
-
989
- # ========== ์‚ฌ์ด๋“œ๋ฐ” ==========
990
  with st.sidebar:
991
- st.image("img/plobin-left-only.png", width=30) # ํ”ฝ์…€ ๊ฐ’์œผ๋กœ ์ง์ ‘ ์ง€์ •
992
- # st.title("๐Ÿ”ฎ PLOBIN")
993
 
994
  uploaded_file = st.file_uploader(
995
  "๋“œ๋ž˜๊ทธํ•˜์—ฌ ํŒŒ์ผ์„ ์—…๋กœ๋“œ ๋˜๋Š” ํด๋ฆญํ•˜์—ฌ ์„ ํƒํ•˜์„ธ์š”.",
@@ -999,22 +1004,21 @@ def main():
999
  )
1000
 
1001
  if uploaded_file:
1002
- if st.button("๐Ÿ“„ ๋ฌธ์„œ ์ฒ˜๋ฆฌ ์‹œ์ž‘", type="primary", use_container_width=True):
1003
  if not GROK_API_KEY:
1004
  st.error("โš ๏ธ GROK_API_KEY๊ฐ€ .env ํŒŒ์ผ์— ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค!")
1005
  st.stop()
1006
 
1007
- # ๊ธฐ์กด ์„ธ์…˜ ์ดˆ๊ธฐํ™”
1008
  st.session_state.vector_db = None
1009
  st.session_state.embedder = None
1010
  st.session_state.chat_history = []
1011
  st.session_state.current_highlights = []
1012
 
1013
- with st.spinner("๐Ÿ“„ ๋ฌธ์„œ ์ฒ˜๋ฆฌ ์ค‘..."):
1014
  try:
1015
  chunks, metadata_list, pdf_bytes, pages_text = extract_text_from_pdf(uploaded_file)
1016
 
1017
- with st.spinner("๐Ÿค– ๋ฌธ์„œ๋ฅผ AI๊ฐ€ ์ดํ•ดํ•  ์ˆ˜ ์žˆ๊ฒŒ ์ฒ˜๋ฆฌ ์ค‘.."):
1018
  collection, embedder = create_vector_db(chunks, metadata_list)
1019
 
1020
  st.session_state.vector_db = collection
@@ -1028,26 +1032,24 @@ def main():
1028
  "pages": len(set(m['page'] for m in metadata_list))
1029
  }
1030
 
1031
- st.success("โœ… ๋ฌธ์„œ ์ฒ˜๋ฆฌ ์™„๋ฃŒ!")
1032
  st.rerun()
1033
 
1034
  except Exception as e:
1035
  st.error(f"์˜ค๋ฅ˜: {str(e)}")
1036
 
1037
- # ๋ฌธ์„œ ์ •๋ณด ํ‘œ์‹œ (์ฒญํฌ ์ •๋ณด ์ œ์™ธ)
1038
  if st.session_state.processed:
1039
- st.markdown("#### ๐Ÿ“Š ๋ฌธ์„œ ์ •๋ณด")
1040
- st.info(f"๐Ÿ“„ **{st.session_state.doc_metadata['filename']}**")
1041
- st.info(f"๐Ÿ“‘ ํŽ˜์ด์ง€: {st.session_state.doc_metadata['pages']}")
1042
 
1043
- # ===== ์•„์ง ๋ฌธ์„œ๊ฐ€ ์ฒ˜๋ฆฌ๋˜์ง€ ์•Š์€ ๊ฒฝ์šฐ
1044
  if not st.session_state.processed:
1045
  st.markdown("""
1046
  <div class="usage-guide">
1047
- <h2 style="text-align: center; color: #2D3748; margin-bottom: 1.5rem;">๐Ÿ“– ์‚ฌ์šฉ ๋ฐฉ๋ฒ•</h2>
1048
  <div class="guide-step">
1049
  <div class="step-number">1</div>
1050
- <div>๐Ÿ“ค PDF ํŒŒ์ผ์„ ์˜ฌ๋ ค์ฃผ์„ธ์š”</div>
1051
  </div>
1052
  <div class="guide-step">
1053
  <div class="step-number">2</div>
@@ -1064,28 +1066,13 @@ def main():
1064
  </div>
1065
  """, unsafe_allow_html=True)
1066
 
1067
- # ๋ฌธ์„œ๊ฐ€ ์ฒ˜๋ฆฌ๋œ ๊ฒฝ์šฐ: ๋ถ„์„ ํ™”๋ฉด
1068
  else:
1069
- # 2๋‹จ ๋ ˆ์ด์•„์›ƒ
1070
  col1, col2 = st.columns([1, 1])
1071
 
1072
  with col1:
1073
- # ํ—ค๋”์™€ ์คŒ ์ปจํŠธ๋กค์„ ๊ฐ€๋กœ๋กœ ๋‚˜๋ž€ํžˆ
1074
  header_cols = st.columns([7, 1, 1.5, 1])
1075
  with header_cols[0]:
1076
- st.markdown("### ๐Ÿ“„ ๋ฌธ์„œ ๋ทฐ์–ด")
1077
- with header_cols[1]:
1078
- if st.button("โž–", key="zoom_out", help="์ถ•์†Œ", use_container_width=True):
1079
- if st.session_state.zoom_level > 0.5:
1080
- st.session_state.zoom_level -= 0.25
1081
- st.rerun()
1082
- with header_cols[2]:
1083
- st.markdown(f"<div style='text-align: center; padding-top: 0.5rem; font-weight: bold;'>{int(st.session_state.zoom_level * 50)}%</div>", unsafe_allow_html=True)
1084
- with header_cols[3]:
1085
- if st.button("โž•", key="zoom_in", help="ํ™•๋Œ€", use_container_width=True):
1086
- if st.session_state.zoom_level < 4.0:
1087
- st.session_state.zoom_level += 0.25
1088
- st.rerun()
1089
 
1090
  if st.session_state.pdf_bytes:
1091
  pdf_html = render_pdf_with_highlights(
@@ -1095,22 +1082,18 @@ def main():
1095
  )
1096
  st.markdown(pdf_html, unsafe_allow_html=True)
1097
 
1098
- # ์Šคํฌ๋กค ๊ธฐ๋Šฅ - JavaScript๋กœ ๊ตฌํ˜„
1099
  if st.session_state.scroll_to_page:
1100
  scroll_js = f"""
1101
  <script>
1102
- // PDF ์ปจํ…Œ์ด๋„ˆ ์ฐพ๊ธฐ
1103
  const container = parent.document.querySelector('.pdf-container');
1104
  const targetPage = parent.document.getElementById('page-{st.session_state.scroll_to_page}');
1105
 
1106
  if (container && targetPage) {{
1107
- // ์ปจํ…Œ์ด๋„ˆ ๋‚ด์—์„œ ํƒ€๊ฒŸ ํŽ˜์ด์ง€์˜ ์œ„์น˜ ๊ณ„์‚ฐ
1108
  const containerRect = container.getBoundingClientRect();
1109
  const targetRect = targetPage.getBoundingClientRect();
1110
  const scrollTop = container.scrollTop;
1111
  const offset = targetRect.top - containerRect.top + scrollTop;
1112
 
1113
- // ๋ถ€๋“œ๋Ÿฝ๊ฒŒ ์Šคํฌ๋กค
1114
  container.scrollTo({{
1115
  top: offset - 20,
1116
  behavior: 'smooth'
@@ -1122,9 +1105,8 @@ def main():
1122
  st.session_state.scroll_to_page = None
1123
 
1124
  with col2:
1125
- st.markdown('<h3 class="chat-title">๐Ÿ”ฎ PLOBIN CHAT</h3>', unsafe_allow_html=True)
1126
 
1127
- # ์ฑ„ํŒ… ํžˆ์Šคํ† ๋ฆฌ๋ฅผ ๋‹ด์„ ์ปจํ…Œ์ด๋„ˆ
1128
  chat_container = st.container(height=650)
1129
 
1130
  with chat_container:
@@ -1135,12 +1117,10 @@ def main():
1135
  if msg["role"] == "assistant" and "sources" in msg:
1136
  with st.expander("๐Ÿ“š ์ฐธ์กฐ ๋ฌธ์„œ"):
1137
  for idx, (doc, meta) in enumerate(zip(msg["sources"]["docs"], msg["sources"]["metas"])):
1138
- # ํ…์ŠคํŠธ๋ฅผ 150์ž๋กœ ์ œํ•œํ•˜๊ณ  ๊ฐ„๊ฒฐํ•˜๊ฒŒ ํ‘œ์‹œ
1139
  clean_text = doc[:150] + ('...' if len(doc) > 150 else '')
1140
 
1141
- # ํŽ˜์ด์ง€ ๋ฒˆํ˜ธ ๋ฒ„ํŠผ (๋ฐ•์Šค์ฒ˜๋Ÿผ ๋ณด์ด๊ฒŒ) - msg_idx ์ถ”๊ฐ€๋กœ ๊ณ ์œ  ํ‚ค ์ƒ์„ฑ
1142
  if st.button(
1143
- f"๐Ÿ“„ ํŽ˜์ด์ง€ {meta['page']}",
1144
  key=f"goto_source_msg{msg_idx}_{meta['page']}_{idx}",
1145
  use_container_width=True,
1146
  type="secondary"
@@ -1148,7 +1128,6 @@ def main():
1148
  st.session_state.scroll_to_page = meta['page']
1149
  st.rerun()
1150
 
1151
- # ๋ฌธ์„œ ๋‚ด์šฉ ํ‘œ์‹œ
1152
  st.markdown(f"""
1153
  <div style="background: #F1F5F9; padding: 0.8rem; border-radius: 0.5rem; margin-bottom: 1rem; border-left: 3px solid #667eea;">
1154
  <div style="font-size: 0.9rem; color: #475569;">
@@ -1156,50 +1135,20 @@ def main():
1156
  </div>
1157
  </div>
1158
  """, unsafe_allow_html=True)
1159
-
1160
- # Grok ๊ฒ€์ฆ ๊ฒฐ๊ณผ ํ‘œ์‹œ (์ตœ์ข… 1๊ฐœ)
1161
- if "grok_verified" in msg["sources"]:
1162
- with st.expander("๐Ÿ” Grok AI ์ตœ์ข… ์„ ํƒ"):
1163
- grok_data = msg["sources"]["grok_verified"]
1164
- if isinstance(grok_data, dict) and "selected_text" in grok_data:
1165
- selected_text = grok_data.get('selected_text', '์„ ํƒ๋œ ์ •๋ณด ์—†์Œ')
1166
- # ํ…์ŠคํŠธ๋ฅผ 150์ž๋กœ ์ œํ•œ
1167
- display_text = selected_text[:150] + ('...' if len(selected_text) > 150 else '')
1168
-
1169
- # ํŽ˜์ด์ง€ ๋ฒˆํ˜ธ ๋ฒ„ํŠผ (ํ•˜์ด๋ผ์ดํŠธ ์Šคํƒ€์ผ) - msg_idx ์ถ”๊ฐ€๋กœ ๊ณ ์œ  ํ‚ค ์ƒ์„ฑ
1170
- if st.button(
1171
- f"โญ ํŽ˜์ด์ง€ {grok_data.get('page', '?')}",
1172
- key=f"goto_grok_msg{msg_idx}_{grok_data.get('page', 0)}",
1173
- use_container_width=True,
1174
- type="primary"
1175
- ):
1176
- st.session_state.scroll_to_page = grok_data.get('page', 1)
1177
- st.rerun()
1178
-
1179
- # ์„ ํƒ๋œ ํ…์ŠคํŠธ ํ‘œ์‹œ
1180
- st.markdown(f"""
1181
- <div style="background: #FEF08A; color: #854D0E; padding: 0.8rem; border-radius: 0.5rem; margin-top: 0.5rem; border-left: 4px solid #EAB308;">
1182
- <div style="font-size: 0.9rem;">{display_text}</div>
1183
- </div>
1184
- """, unsafe_allow_html=True)
1185
 
1186
- # ์ฑ„ํŒ… ์ž…๋ ฅ - ์ปจํ…Œ์ด๋„ˆ ๋ฐ”๋กœ ์•„๋ž˜์— ๋ฐฐ์น˜
1187
  prompt = st.chat_input("๐Ÿ’ฌ ์งˆ๋ฌธ์„ ์ž…๋ ฅํ•˜์„ธ์š”...", key="chat_input")
1188
 
1189
- # 1๋‹จ๊ณ„: ์งˆ๋ฌธ์„ ๋ฐ›์œผ๋ฉด ์ฆ‰์‹œ ํžˆ์Šคํ† ๋ฆฌ์— ์ถ”๊ฐ€ํ•˜๊ณ  rerun (์งˆ๋ฌธ์ด ์ฑ„ํŒ… ๋ฐ•์Šค ์•ˆ์— ๋‚˜ํƒ€๋‚จ)
1190
  if prompt:
1191
  st.session_state.chat_history.append({"role": "user", "content": prompt})
1192
  st.session_state.processing_query = prompt
1193
  st.rerun()
1194
 
1195
- # 2๋‹จ๊ณ„: processing_query๊ฐ€ ์žˆ์œผ๋ฉด AI ๋‹ต๋ณ€ ์ƒ์„ฑ
1196
  if st.session_state.processing_query:
1197
  query = st.session_state.processing_query
1198
- st.session_state.processing_query = None # ํ”Œ๋ž˜๊ทธ ๋ฆฌ์…‹
1199
 
1200
- with st.spinner("๐Ÿ”ฎ PLOBIN์ด ๊ฒ€์ƒ‰์ค‘์ž…๋‹ˆ๋‹ค..."):
1201
  try:
1202
- # 1. ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰ (๋ฒกํ„ฐ + ์˜๋ฏธ ๊ธฐ๋ฐ˜ ํ‚ค์›Œ๋“œ) - ์ƒ์œ„ 3๊ฐœ
1203
  search_results = hybrid_search(
1204
  query,
1205
  st.session_state.vector_db,
@@ -1207,29 +1156,24 @@ def main():
1207
  top_k=3
1208
  )
1209
 
1210
- # 2. Grok API๋กœ ๊ฒ€์ฆ ๋ฐ ์ถ”์ถœ
1211
  grok_result = grok_verify_and_extract(
1212
  query,
1213
  search_results,
1214
  GROK_API_KEY
1215
  )
1216
 
1217
- # 3. ๋‹ต๋ณ€ ์ƒ์„ฑ
1218
  answer = generate_answer(
1219
  query,
1220
  search_results,
1221
  GROK_API_KEY
1222
  )
1223
 
1224
- # 4. Grok ์ถ”์ถœ ๊ฒฐ๊ณผ๋ฅผ ํ•˜์ด๋ผ์ดํŠธ๋กœ ๋ณ€ํ™˜
1225
- highlights = extract_highlights_from_grok(grok_result)
1226
  st.session_state.current_highlights = highlights
1227
 
1228
- # 5. Grok์ด ์„ ํƒํ•œ ํŽ˜์ด์ง€๋กœ ์ž๋™ ์Šคํฌ๋กค ์„ค์ •
1229
  if grok_result and "page" in grok_result and "error" not in grok_result:
1230
  st.session_state.scroll_to_page = grok_result["page"]
1231
 
1232
- # 6. ์ฑ„ํŒ… ํžˆ์Šคํ† ๋ฆฌ์— ๋‹ต๋ณ€ ์ €์žฅ
1233
  chat_data = {
1234
  "role": "assistant",
1235
  "content": answer,
 
1
  """
2
+ PLOBIN
 
3
  """
4
  import streamlit as st
5
  import streamlit.components.v1 as components
 
16
  import base64
17
  from dotenv import load_dotenv
18
  import json
19
+ from difflib import SequenceMatcher
 
20
 
21
def get_svg_content(svg_path):
    """Return the entire contents of the file at ``svg_path`` as text.

    The file is opened read-only and decoded as UTF-8; any error raised by
    ``open`` (for example a missing file) propagates to the caller.
    """
    with open(svg_path, "r", encoding="utf-8") as f:
        return f.read()
24
 
 
25
# Read the logo markup once, when the module is first executed.
plobin_logo_svg = get_svg_content("img/plobin.svg")

# Load variables from a .env file before the environment is queried below.
load_dotenv()

# os.getenv returns None when GROK_API_KEY is not set.
GROK_API_KEY = os.getenv("GROK_API_KEY")
GROK_API_BASE = "https://api.x.ai/v1"
CHROMA_DIR = "./chroma_db"
# Model identifier handed to SentenceTransformer in load_embedding_model().
EMBEDDING_MODEL = 'jhgan/ko-sroberta-multitask'
33
 
 
34
class HighlightConfig:
    """Highlight settings; the colour is fixed to yellow."""

    def __init__(self):
        # Three colour components in the 0-1 range (red, green, blue):
        # full red plus full green with no blue is yellow.
        self.color = [1.0, 1.0, 0.0]
 
37
 
 
38
# Page-level Streamlit settings: title, icon image, wide layout and a sidebar
# that starts expanded.
st.set_page_config(
    page_title="PLOBIN",
    page_icon="img/plobin-left-only.png",
    layout="wide",
    initial_sidebar_state="expanded"
)
  )
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  st.markdown("""
46
  <style>
47
  [data-testid="stSidebar"] {
48
  background: linear-gradient(180deg,
49
+ #618FC2 0%,
50
+ #8E969E 100%);
51
  box-shadow: 4px 0 30px rgba(0,0,0,0.2);
52
  width: 290px !important;
53
  }
54
 
 
55
  [data-testid="stSidebar"] h1 {
56
  color: white !important;
57
  font-weight: 900 !important;
 
63
  letter-spacing: 2px;
64
  }
65
 
 
66
  @keyframes sidebarTitlePulse {
67
  0%, 100% {
68
  transform: scale(1);
 
81
  }
82
  }
83
 
 
84
  [data-testid="stSidebar"] [data-testid="stFileUploader"] {
85
  background: rgba(255,255,255,0.15);
86
  border-radius: 15px;
 
90
  backdrop-filter: blur(10px);
91
  }
92
 
 
93
  [data-testid="stFileUploader"] > section {
94
  background: transparent !important;
95
  }
96
 
 
97
  [data-testid="stFileUploader"] > section > div {
98
  background: transparent !important;
99
  }
100
 
 
101
  [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] {
102
  color: #fafafa;
103
  }
104
+
105
  [data-testid="stSidebar"] [data-testid="stFileUploader"] > section,
106
  [data-testid="stSidebar"] [data-testid="stFileUploader"] section > div {
107
  background: transparent !important;
108
  border: none !important;
109
  }
110
+
111
  [data-testid="stSidebar"] [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] p {
112
  color: rgba(255,255,255,0.9) !important;
113
  }
114
+
115
  [data-testid="stSidebar"] [data-testid="stFileUploader"] button[kind="secondary"] {
116
  background: rgba(255,255,255,0.2) !important;
117
  color: white !important;
118
  border: 1px solid rgba(255,255,255,0.3) !important;
119
  }
120
+
121
  [data-testid="stSidebar"] .stButton button {
122
  background: rgba(255,255,255,0.15) !important;
123
  color: white !important;
 
129
  transition: all 0.3s ease !important;
130
  box-shadow: 0 4px 15px rgba(0,0,0,0.1) !important;
131
  }
132
+
133
  [data-testid="stSidebar"] .stButton button:hover {
134
  background: rgba(255,255,255,0.25) !important;
135
  border-color: rgba(255,255,255,0.6) !important;
136
  transform: translateY(-2px) scale(1.02) !important;
137
  box-shadow: 0 6px 20px rgba(0,0,0,0.2) !important;
138
  }
139
+
140
  [data-testid="stSidebar"] .stButton button:active {
141
  transform: translateY(0px) scale(0.98) !important;
142
  }
143
+
144
  [data-testid="stSidebar"] .stButton button[kind="primary"] {
145
  background: rgba(255,255,255,0.25) !important;
146
  border: 2px solid rgba(255,255,255,0.5) !important;
147
  font-size: 1.05rem !important;
148
  }
149
+
150
  [data-testid="stSidebar"] .stButton button[kind="primary"]:hover {
151
  background: rgba(255,255,255,0.35) !important;
152
  border-color: rgba(255,255,255,0.7) !important;
153
  }
154
+
155
  [data-testid="stSidebar"] [data-testid="stAlert"] {
156
  background-color: rgba(255, 255, 255, 0.001) !important;
157
  border-radius: 0.5rem !important;
158
  }
159
+
160
  [data-testid="stAlert"] p {
161
+ color: rgb(250, 250, 250);
162
  }
163
+
164
  .main .block-container {
165
  max-width: 100%;
166
  padding-left: 2rem;
167
  padding-right: 2rem;
168
  }
169
 
 
170
  .plobin-header {
171
  padding: 1.5rem 2rem;
172
  margin-bottom: 2rem;
173
  }
174
+
175
  .plobin-logo {
176
  display: block;
177
  margin: 0 auto;
178
+ height: 60px;
179
  }
180
 
181
  .plobin-title {
 
187
  text-shadow: 2px 2px 8px rgba(0, 0, 0, 0.4),
188
  0 0 20px rgba(102, 126, 234, 0.4);
189
  }
190
+
191
  .plobin-subtitle {
192
  font-size: 1rem;
193
  color: rgba(255, 255, 255, 0.9);
 
196
  text-shadow: 1px 1px 6px rgba(0, 0, 0, 0.4);
197
  }
198
 
 
199
  [data-testid="stFileUploader"] {
200
  background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
201
  border: 3px dashed #667eea;
 
213
  font-weight: 600 !important;
214
  }
215
 
 
216
  .pdf-container {
217
  border: 2px solid #E2E8F0;
218
  border-radius: 0.5rem;
219
  padding: 0.5rem;
220
+ height: 706px;
221
  overflow-y: auto;
222
  background: white;
223
  }
224
 
 
225
  .chat-container {
226
  border: 2px solid #E2E8F0;
227
  border-radius: 0.5rem;
 
232
  margin-bottom: 0.5rem;
233
  }
234
 
 
235
  [data-testid="stChatInput"] {
236
  margin-top: 0 !important;
237
  padding-top: 0 !important;
238
  }
239
 
 
240
  .source-box {
241
  background: #F1F5F9;
242
  padding: 1rem;
 
271
  border-left: 4px solid #EAB308;
272
  }
273
 
 
274
  .usage-guide {
275
  background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
276
  padding: 2rem;
 
303
  flex-shrink: 0;
304
  }
305
 
 
306
  .viewer-header {
307
  display: flex;
308
  justify-content: space-between;
 
310
  margin-bottom: 1rem;
311
  }
312
 
 
313
  @keyframes pulse {
314
  0%, 100% {
315
  box-shadow: 0 0 0 0 rgba(16, 185, 129, 0.7);
 
319
  }
320
  }
321
 
 
322
  .chat-title {
323
  color: black !important;
324
  font-weight: 900 !important;
 
331
  letter-spacing: 2px;
332
  }
333
 
 
334
  [data-testid="column"] button[kind="secondary"] {
335
  background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
336
  color: white !important;
 
375
 
376
 
377
  def init_session():
 
378
  if 'processed' not in st.session_state:
379
  st.session_state.processed = False
380
  if 'vector_db' not in st.session_state:
 
402
 
403
 
404
  def extract_text_from_pdf(pdf_file) -> Tuple[List[str], List[Dict], bytes, Dict]:
 
 
 
 
 
 
 
405
  pdf_bytes = pdf_file.read()
406
  doc = fitz.open(stream=pdf_bytes, filetype="pdf")
407
 
 
409
  metadata_list = []
410
  pages_text = {}
411
 
412
+ CHUNK_SIZE = 800
413
+ OVERLAP_SIZE = 150
 
 
414
 
415
  for page_num in range(len(doc)):
416
  page = doc[page_num]
 
461
 
462
@st.cache_resource
def load_embedding_model():
    """Build the SentenceTransformer named by ``EMBEDDING_MODEL``.

    The function is wrapped in ``st.cache_resource``, so Streamlit is expected
    to reuse the returned model object instead of constructing it on every
    call (see the Streamlit caching documentation for the exact semantics).
    """
    return SentenceTransformer(EMBEDDING_MODEL)
465
 
466
 
467
  def create_vector_db(chunks: List[str], metadata_list: List[Dict]):
 
468
  embedder = load_embedding_model()
469
 
 
470
  client = chromadb.EphemeralClient(
471
  settings=chromadb.Settings(
472
  anonymized_telemetry=False,
 
474
  )
475
  )
476
 
 
477
  try:
478
  client.delete_collection("rfx_docs")
479
  except Exception:
 
484
  metadata={"hnsw:space": "cosine"}
485
  )
486
 
 
487
  batch_size = 32
488
  all_embeddings = []
489
 
 
503
  return collection, embedder
504
 
505
 
 
506
  def extract_keywords_semantic(text: str, embedder, top_n: int = 5) -> List[str]:
 
 
 
 
 
 
 
 
 
 
507
  words_with_numbers = re.findall(r'[๊ฐ€-ํžฃ]*\d+[๊ฐ€-ํžฃ]*', text)
 
 
508
  candidate_words = re.findall(r'[๊ฐ€-ํžฃ]{2,}', text)
509
 
510
  if not candidate_words:
511
  return words_with_numbers[:top_n]
512
 
513
  word_freq = Counter(candidate_words)
 
 
514
  text_embedding = embedder.encode([text], convert_to_numpy=True)[0]
515
  word_embeddings = embedder.encode(list(word_freq.keys()), convert_to_numpy=True)
 
 
516
  similarities = util.cos_sim(text_embedding, word_embeddings)[0].numpy()
517
 
 
518
  scored_words = []
519
  for idx, (word, freq) in enumerate(word_freq.items()):
 
520
  semantic_score = similarities[idx]
521
+ frequency_score = np.log1p(freq) / 10.0
 
522
  combined_score = 0.7 * semantic_score + 0.3 * frequency_score
523
  scored_words.append((word, combined_score))
524
 
 
525
  scored_words.sort(key=lambda x: x[1], reverse=True)
526
 
 
527
  result = []
 
 
528
  for word in words_with_numbers[:3]:
529
  if word and word not in result:
530
  result.append(word)
531
 
 
532
  for word, score in scored_words:
533
  if word not in result:
534
  result.append(word)
 
538
  return result[:top_n]
539
 
540
 
 
541
  def hybrid_search(query: str, collection, embedder, top_k: int = 3) -> Dict:
 
 
542
  query_embedding = embedder.encode([query], convert_to_numpy=True)[0]
543
  vector_results = collection.query(
544
  query_embeddings=[query_embedding.tolist()],
545
+ n_results=20,
546
  include=["documents", "metadatas", "distances"]
547
  )
548
 
 
549
  keywords = extract_keywords_semantic(query, embedder, top_n=5)
550
 
 
551
  hybrid_results = []
552
  for i, doc_id in enumerate(vector_results['ids'][0]):
553
  doc = vector_results['documents'][0][i]
554
  metadata = vector_results['metadatas'][0][i]
555
+ vector_score = 1 - vector_results['distances'][0][i]
556
 
 
557
  keyword_score = 0
558
  doc_lower = doc.lower()
559
  for keyword in keywords:
 
561
  keyword_score += 1
562
  keyword_score = keyword_score / len(keywords) if keywords else 0
563
 
 
564
  hybrid_score = 0.7 * vector_score + 0.3 * keyword_score
565
 
566
  hybrid_results.append({
 
572
  'keyword_score': keyword_score
573
  })
574
 
 
575
  hybrid_results.sort(key=lambda x: x['hybrid_score'], reverse=True)
576
  top_results = hybrid_results[:top_k]
577
 
 
583
  }
584
 
585
 
 
586
  def grok_verify_and_extract(query: str, search_results: Dict, api_key: str) -> Dict:
 
587
  docs = search_results['documents'][0]
588
  metas = search_results['metadatas'][0]
589
 
 
590
  formatted_docs = []
591
  for i, (doc, meta) in enumerate(zip(docs, metas), 1):
592
  formatted_docs.append(f"[๋ฌธ์„œ {i}] (ํŽ˜์ด์ง€ {meta['page']})\n{doc}")
 
650
 
651
  result = response.json()
652
  content = result["choices"][0]["message"]["content"]
 
 
 
653
  content = content.replace("```json", "").replace("```", "").strip()
654
  extracted_data = json.loads(content)
655
 
 
660
 
661
 
662
  def build_context(search_results: Dict, max_length: int = 3000) -> str:
 
663
  context_parts = []
664
  current_length = 0
665
 
 
684
 
685
 
686
  def generate_answer(query: str, search_results: Dict, api_key: str) -> str:
 
687
  context = build_context(search_results, max_length=4000)
688
 
689
  system_prompt = """๋‹น์‹ ์€ ์ž๋™์ฐจ ์ œ์กฐ์—… RFx ๋ฌธ์„œ ์ „๋ฌธ ๋ถ„์„๊ฐ€์ž…๋‹ˆ๋‹ค.
 
692
  2. **์–ธ์–ด ํ˜ผ์šฉ ๋ฐ ๋น„๋ฌธ ๋Œ€์‘**: ์‚ฌ์šฉ์ž์˜ ๋ฌธ์žฅ์€ ํ•œ๊ตญ์–ด์™€ ์˜์–ด๊ฐ€ ์„ž์ด๊ฑฐ๋‚˜ ๋ฌธ๋ฒ• ์˜ค๋ฅ˜๊ฐ€ ์žˆ์„ ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ ์˜๋„๋ฅผ ์ถ”๋ก ํ•˜์—ฌ ์ •ํ™•ํžˆ ์ดํ•ดํ•˜๋ผ.
693
  3. **๋ชจํ˜ธํ•œ ์งˆ๋ฌธ ์ž๋™ ๋ณด์ •**: ์‚ฌ์šฉ์ž์˜ ์งˆ๋ฌธ์ด ๋ถˆ์™„์ „ํ•˜๊ฑฐ๋‚˜ ๋ชจํ˜ธํ•ด๋„ ์งˆ๋ฌธ ์˜๋„๋ฅผ ์ถ”๋ก ํ•˜์—ฌ ์ ์ ˆํ•˜๊ฒŒ ์žฌ๊ตฌ์„ฑํ•˜๋ผ.
694
  **๋ฌธ์„œ ๊ธฐ๋ฐ˜ ์‘๋‹ต ์›์น™ (์ ˆ๋Œ€ ์ถ”์ธก ๊ธˆ์ง€):**
695
+ 1. ์ œ๊ณต๋œ ๋ฌธ์„œ๋ฅผ **๋งค์šฐ ๊ผผ๊ผผํžˆ** ์ฝ๊ณ  ์ •ํ™•ํ•œ ์ •๋ณด๋ฅผ ์ฐพ์œผ์„ธ์š”
696
  2. **๋ฐ˜๋“œ์‹œ ๋ฌธ์„œ์—์„œ ๊ทผ๊ฑฐ๋ฅผ ์ฐพ์•„ ๋‹ต๋ณ€**ํ•˜๊ณ , ๋ฌธ์„œ์— ์—†๋Š” ๋‚ด์šฉ์€ ์ž„์˜๋กœ ์ถ”์ธกํ•˜์ง€ ๋ง๊ณ  **"๋ฌธ์„œ์—์„œ ๊ด€๋ จ ์ •๋ณด๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"**๋ผ๊ณ  ๋ช…์‹œํ•˜๋ผ
697
  3. **๋ฌธ์„œ์™€ ์ „ํ˜€ ๋ฌด๊ด€ํ•œ ์งˆ๋ฌธ**(์˜ˆ: ์ ์‹ฌ ์ถ”์ฒœ, ๋‚ ์”จ, ์ผ์ƒ ๋Œ€ํ™” ๋“ฑ)์€ **"์ฃ„์†กํ•˜์ง€๋งŒ, ์ œ๊ณต๋œ ๋ฌธ์„œ์—๋Š” ํ•ด๋‹น ์งˆ๋ฌธ๊ณผ ๊ด€๋ จ๋œ ์ •๋ณด๊ฐ€ ํฌํ•จ๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค."**๋ผ๊ณ ๋งŒ ๋‹ต๋ณ€ํ•˜๊ณ  ์ถ”๊ฐ€ ์„ค๋ช… ์—†์ด ์ข…๋ฃŒํ•˜๋ผ
698
  4. ๋ฌธ์„œ์— ์ •๋ณด๊ฐ€ ์žˆ๋Š”๋ฐ๋„ "์—†๋‹ค"๊ณ  ํ•˜์ง€ ๋งˆ์„ธ์š”
699
  **ํ•ต์‹ฌ ์ •๋ณด ์šฐ์„  ์ถ”์ถœ:**
700
  - ๊ธˆ์•ก, ์ˆ˜๋Ÿ‰, ๊ทœ๊ฒฉ, ์ผ์ •, ์š”๊ตฌ์กฐ๊ฑด ๋“ฑ **์ˆ˜์น˜ ๊ธฐ๋ฐ˜ ์ •๋ณด๋ฅผ ์ตœ์šฐ์„ **์œผ๋กœ ์‹๋ณ„ํ•˜๊ณ  ์ •ํ™•ํ•˜๊ฒŒ ๋ฐ˜ํ™˜ํ•˜๋ผ
701
  - ์ˆซ์ž, ๊ธˆ์•ก, ๋‚ ์งœ ๋“ฑ ๊ตฌ์ฒด์ ์ธ ์ •๋ณด๋ฅผ ์šฐ์„ ์ ์œผ๋กœ ์ฐพ์œผ์„ธ์š”
 
 
 
 
 
 
 
 
 
 
 
702
  **๋‹ต๋ณ€ ํ˜•์‹:**
703
+ - ๋‹ต๋ณ€ ์‹œ ๋ฐ˜๋“œ์‹œ **[ํŽ˜์ด์ง€ X]** ํ˜•ํƒœ๋กœ ์ถœ์ฒ˜๋ฅผ ๋ช…์‹œํ•˜์„ธ์š”
704
+ - **์ ˆ๋Œ€ ์ค‘์š”**: "๋ฌธ์„œ 1", "๋ฌธ์„œ 2" ๊ฐ™์€ ํ‘œ๊ธฐ๋Š” ์ ˆ๋Œ€ ์‚ฌ์šฉํ•˜์ง€ ๋งˆ์„ธ์š”
705
  - ํ•ต์‹ฌ ๋‹ต๋ณ€์„ ๋จผ์ € ๋ช…ํ™•ํ•˜๊ฒŒ ์ œ์‹œ
706
+ - ๋งˆํฌ๋‹ค์šด ํ˜•์‹์œผ๋กœ๋งŒ ๋‹ต๋ณ€ํ•˜์„ธ์š”
707
+ - ์งˆ๋ฌธ์— ๋”ฐ๋ผ ๊ฐ€์žฅ ์ ์ ˆํ•œ ๊ตฌ์กฐ๋กœ ๋‹ต๋ณ€ํ•˜์„ธ์š” (๋‹จ๊ณ„๋ณ„, ์นดํ…Œ๊ณ ๋ฆฌ๋ณ„, ์‹œ๊ฐ„์ˆœ ๋“ฑ)
708
+
709
+ **์›๋ฌธ ์ธ์šฉ ๊ทœ์น™ (ํ•˜์ด๋ผ์ดํŠธ์šฉ):**
710
+ - ํ•ต์‹ฌ ๋‚ด์šฉ์„ ์„ค๋ช…ํ•  ๋•Œ๋Š” ํฐ๋”ฐ์˜ดํ‘œ("")๋กœ PDF ์›๋ฌธ์„ ๊ทธ๋Œ€๋กœ ์ธ์šฉํ•˜์„ธ์š”
711
+ - ํฐ๋”ฐ์˜ดํ‘œ ์•ˆ์˜ ๋‚ด์šฉ์€ PDF ์›๋ฌธ์„ **ํ•œ ๊ธ€์ž๋„ ๋ฐ”๊พธ์ง€ ๋ง๊ณ ** ๊ทธ๋Œ€๏ฟฝ๏ฟฝ ๋ณต์‚ฌ
712
+ - ๋ฌธ์žฅ ์ข…๊ฒฐ์–ด("~ํ•จ", "~์ž„", "~์š”์ฒญํ•จ" ๋“ฑ)๋„ ์›๋ฌธ ๊ทธ๋Œ€๋กœ ์œ ์ง€
713
+ - ์ธ์šฉ ์˜ˆ์‹œ: "๊ธฐ์ˆ ํ‰๊ฐ€ ์ ์ˆ˜๊ฐ€ ๋ฐฐ์ ํ•œ๋„(100์ )์˜ 85% ์ด์ƒ์ธ ์ž๋ฅผ ๊ธฐ์ˆ ํ‰๊ฐ€ ์ ๊ฒฉ์ž๋กœ ์„ ์ •" [ํŽ˜์ด์ง€ 9]
714
+ - ์›๋ฌธ ์ธ์šฉ ํ›„ ํ•„์š”ํ•˜๋ฉด ๋ถ€์—ฐ ์„ค๋ช… ์ถ”๊ฐ€ ๊ฐ€๋Šฅ"""
715
+
716
+ user_prompt = f"""๋‹ค์Œ ๋ฌธ์„œ๋“ค์„ ๋งค์šฐ ๊ผผ๊ผผํžˆ ์ฝ๊ณ  ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•˜์„ธ์š”.
717
+
718
  <๋ฌธ์„œ>
719
  {context}
720
  </๋ฌธ์„œ>
721
+
722
  <์งˆ๋ฌธ>
723
  {query}
724
  </์งˆ๋ฌธ>
725
+
726
+ **๋‹ต๋ณ€ ์ž‘์„ฑ ๊ฐ€์ด๋“œ:**
727
+
728
+ 1. **๊ตฌ์กฐํ™”**: ์งˆ๋ฌธ ์œ ํ˜•์— ๋งž๋Š” ๊ฐ€์žฅ ์ฝ๊ธฐ ์‰ฌ์šด ๊ตฌ์กฐ ์„ ํƒ
729
+ - ์ ˆ์ฐจ/ํ”„๋กœ์„ธ์Šค ์งˆ๋ฌธ โ†’ ๋‹จ๊ณ„๋ณ„ ๋ฒˆํ˜ธ (1, 2, 3...)
730
+ - ํ•ญ๋ชฉ ๋‚˜์—ด ์งˆ๋ฌธ โ†’ ๋ถˆ๋ฆฟ ํฌ์ธํŠธ (โ€ข ๋˜๋Š” *)
731
+ - ๋น„๊ต/์„ ํƒ ์งˆ๋ฌธ โ†’ ์นดํ…Œ๊ณ ๋ฆฌ๋ณ„ ๊ตฌ๋ถ„
732
+
733
+ 2. **์›๋ฌธ ์ธ์šฉ**: ํ•ต์‹ฌ ๋‚ด์šฉ์€ ํฐ๋”ฐ์˜ดํ‘œ๋กœ PDF ์›๋ฌธ ๊ทธ๋Œ€๋กœ ์ธ์šฉ
734
+ - ์˜ˆ: "๊ธฐ์ˆ ํ‰๊ฐ€ ์ ๊ฒฉ์ž๋ฅผ ๋Œ€์ƒ์œผ๋กœ ๊ฐ€๊ฒฉ ์ž…์ฐฐ์„ ์‹ค์‹œํ•˜์—ฌ, ํ•œ๊ตญ์ž๋™์ฐจ์—ฐ๊ตฌ์›์˜ ์˜ˆ์ •๊ฐ€๊ฒฉ์ดํ•˜ ์ตœ์ €๊ฐ€๊ฒฉ ํˆฌ์ฐฐ์ž๋ฅผ ๋‚™์ฐฐ์ž๋กœ ์„ ์ •" [ํŽ˜์ด์ง€ 9]
735
+ - ํฐ๋”ฐ์˜ดํ‘œ ์•ˆ = ์›๋ฌธ ๊ทธ๋Œ€๋กœ (์ ˆ๋Œ€ ์˜์—ญ ๊ธˆ์ง€)
736
+
737
+ 3. **์ถœ์ฒ˜ ํ‘œ๊ธฐ**: ๋ชจ๋“  ์ •๋ณด์— [ํŽ˜์ด์ง€ X] ํ‘œ๊ธฐ
738
+
739
+ 4. **ํ˜•์‹**: ๋งˆํฌ๋‹ค์šด๋งŒ ์‚ฌ์šฉ, "๋ฌธ์„œ 1" ๊ฐ™์€ ํ‘œ๊ธฐ ๊ธˆ์ง€"""
740
 
741
  headers = {
742
  "Content-Type": "application/json",
 
770
  except Exception:
771
  error_detail = response.text
772
 
773
+ return f"โŒ API ์˜ค๋ฅ˜ (์ฝ”๋“œ: {response.status_code})\n์ƒ์„ธ: {error_detail}"
774
 
775
  result = response.json()
776
  return result["choices"][0]["message"]["content"]
 
780
 
781
 
782
def highlight_text_in_pdf(pdf_bytes: bytes, highlight_info: List[Dict]) -> bytes:
    """Return the PDF as bytes with the requested passages highlighted in yellow.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        highlight_info: Dicts with a 1-based ``'page'`` number and the
            ``'text'`` to mark on that page. Entries whose text is blank or
            whose page lies outside the document are ignored.

    Returns:
        The serialised PDF including the added highlight annotations.

    For every entry the search falls back through three strategies and stops
    at the first one that places a highlight:

    1. each sentence of the text (8+ characters) on its own, threshold 0.60;
    2. the whole text, threshold 0.50;
    3. up to three runs of 10+ Hangul syllables, threshold 0.70.
    """
    yellow_color = [1.0, 1.0, 0.0]

    def normalize_text(text):
        # Collapse whitespace runs and lowercase so comparisons ignore layout.
        return re.sub(r'\s+', ' ', text.strip().lower())

    def find_text_fuzzy(page, search_text, threshold=0.65):
        """Return rectangles on ``page`` matching ``search_text`` ([] if none)."""
        search_norm = normalize_text(search_text)

        # 1. Literal search, also trying a few whitespace/comma variants.
        variations = [
            search_text,
            search_text.replace(' ', ''),
            search_text.replace('\n', ' '),
            search_text.replace(',', ''),
        ]
        for var in variations:
            instances = page.search_for(var)
            if instances:
                return instances

        # 2. Compare against whole text blocks; the first block that is
        #    similar enough is returned as a single rectangle.
        for block in page.get_text("blocks"):
            if len(block) < 5:
                continue
            block_norm = normalize_text(block[4])
            similarity = SequenceMatcher(None, search_norm, block_norm).ratio()
            if similarity >= threshold:
                return [fitz.Rect(block[0], block[1], block[2], block[3])]

        # 3. Slide word windows over the page and keep the most similar one.
        words = page.get_text("words")
        if not words:
            return []

        search_words = search_norm.split()
        # A window one third the length of the query is still accepted.
        min_words = max(2, len(search_words) // 3)

        best_match = None
        best_sim = 0.0
        for i in range(len(words)):
            for size in range(len(search_words), min_words - 1, -1):
                if i + size > len(words):
                    continue

                window = words[i:i + size]
                window_norm = normalize_text(" ".join(w[4] for w in window))

                sim = SequenceMatcher(None, search_norm, window_norm).ratio()
                if sim > best_sim and sim >= threshold:
                    best_sim = sim
                    # Union of the word boxes covers the whole window.
                    rect = fitz.Rect(window[0][:4])
                    for w in window[1:]:
                        rect = rect | fitz.Rect(w[:4])
                    best_match = rect

        return [best_match] if best_match else []

    def add_highlights(page, rects):
        """Annotate every rectangle; report whether anything was added."""
        for rect in rects:
            highlight = page.add_highlight_annot(rect)
            highlight.set_colors(stroke=yellow_color)
            highlight.update()
        return bool(rects)

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        for item in highlight_info:
            page_num = item['page'] - 1
            full_text = item['text'].strip()

            # A page of 0 or below would otherwise wrap around to the end of
            # the document, and empty text can fuzzy-match an empty block.
            if not full_text or not 0 <= page_num < len(doc):
                continue

            page = doc[page_num]

            # Strategy 1: sentence by sentence. A period only ends a sentence
            # when whitespace or the end of the text follows, so decimals such
            # as "85.5" stay in one piece. The period itself is dropped.
            found_any = False
            for sentence in re.split(r'[.。](?:\s+|$)', full_text):
                sentence = sentence.strip()
                if len(sentence) < 8:
                    continue
                if add_highlights(page, find_text_fuzzy(page, sentence, threshold=0.60)):
                    found_any = True

            # Strategy 2: the whole text with a lower threshold.
            if not found_any:
                found_any = add_highlights(
                    page, find_text_fuzzy(page, full_text, threshold=0.50)
                )

            # Strategy 3: long Hangul runs only, as a last resort.
            if not found_any:
                for kw in re.findall(r'[가-힣]{10,}', full_text)[:3]:
                    add_highlights(page, find_text_fuzzy(page, kw, threshold=0.70))

        return doc.tobytes()
    finally:
        # Release the document even when a search or annotation call fails.
        doc.close()
915
 
916
 
 
917
  def extract_highlights_from_grok(grok_result: Dict) -> List[Dict]:
 
918
  if "error" in grok_result:
919
  return []
920
 
921
  highlights = []
 
 
922
  selected_text = grok_result.get("selected_text", "")
923
  page = grok_result.get("page", 1)
924
 
 
931
  return highlights
932
 
933
 
934
def extract_highlights_from_answer(answer: str) -> List[Dict]:
    """Collect quoted passages and their page numbers from a generated answer.

    Looks for the pattern ``"quoted text" [페이지 N]`` (straight double quotes,
    then a bracketed page marker) and returns one ``{'text': ..., 'page': N}``
    dict per occurrence, in order of appearance. ``'text'`` is the quote with
    surrounding whitespace removed and ``'page'`` is an ``int``.

    Quotes that contain only whitespace are skipped, and an empty answer
    yields an empty list.
    """
    if not answer:
        return []

    # \s* (rather than \s+) also accepts a marker written as "[페이지9]".
    pattern = r'"([^"]+)"\s*\[페이지\s*(\d+)\]'

    highlights = []
    for quote, quote_page in re.findall(pattern, answer):
        text = quote.strip()
        if text:
            highlights.append({'text': text, 'page': int(quote_page)})

    return highlights
955
+
956
+
957
def render_pdf_with_highlights(pdf_bytes: bytes, highlight_info: List[Dict], zoom_level: float = 2.0):
    """Render the highlighted PDF as one HTML string of inline page images.

    Args:
        pdf_bytes: Raw bytes of the source PDF.
        highlight_info: Dicts with a 1-based ``'page'`` and a ``'text'``;
            passed to ``highlight_text_in_pdf`` and used to decide which
            pages get the starred banner.
        zoom_level: Scale factor for rasterising each page. The image's CSS
            width is ``int(zoom_level * 50)`` percent.

    Returns:
        A ``<div class="pdf-container">`` holding, for every page, a
        ``<div id="page-N">`` with a banner and a base64-encoded PNG.
    """
    highlighted_pdf = highlight_text_in_pdf(pdf_bytes, highlight_info)
    highlighted_pages = {h['page'] for h in highlight_info}

    # Depends only on zoom_level, so it is computed once for all pages.
    zoom_percentage = int(zoom_level * 50)

    # Fragments are joined once at the end; each page carries a large base64
    # payload, so repeated string concatenation would copy it again and again.
    html_parts = ['<div class="pdf-container" id="pdf-viewer-container">']

    doc = fitz.open(stream=highlighted_pdf, filetype="pdf")
    try:
        for page_num in range(len(doc)):
            page = doc[page_num]

            pix = page.get_pixmap(matrix=fitz.Matrix(zoom_level, zoom_level))
            img_base64 = base64.b64encode(pix.tobytes("png")).decode()

            # The id is the anchor the page-scroll script looks up.
            page_id = f'page-{page_num + 1}'
            html_parts.append(
                f'<div id="{page_id}" style="margin-bottom: 2rem; position: relative;">'
            )

            if (page_num + 1) in highlighted_pages:
                html_parts.append(
                    f'<div style="background: #FEF08A; color: #854D0E; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold; border-left: 4px solid #EAB308;">⭐ 페이지 {page_num + 1}</div>'
                )
            else:
                html_parts.append(
                    f'<div style="background: #667eea; color: white; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold;">📄 페이지 {page_num + 1}</div>'
                )

            html_parts.append(
                f'<img src="data:image/png;base64,{img_base64}" style="width: {zoom_percentage}%; border: 1px solid #E2E8F0; border-radius: 0.3rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); display: block; margin: 0 auto;" />'
            )
            html_parts.append('</div>')
    finally:
        # Release the document even if rendering a page fails.
        doc.close()

    html_parts.append('</div>')
    return ''.join(html_parts)
985
 
986
 
987
  def main():
988
  init_session()
 
989
 
 
990
  if not st.session_state.processed:
 
991
  col1, col2, col3 = st.columns([1, 1, 1])
992
  with col2:
993
  st.image("img/plobin.svg", use_container_width=True)
994
  st.text(' ')
995
+
 
 
 
 
 
 
 
 
996
  with st.sidebar:
997
+ st.image("img/plobin.svg", width=120)
 
998
 
999
  uploaded_file = st.file_uploader(
1000
  "๋“œ๋ž˜๊ทธํ•˜์—ฌ ํŒŒ์ผ์„ ์—…๋กœ๋“œ ๋˜๋Š” ํด๋ฆญํ•˜์—ฌ ์„ ํƒํ•˜์„ธ์š”.",
 
1004
  )
1005
 
1006
  if uploaded_file:
1007
+ if st.button("๋ฌธ์„œ ์ฒ˜๋ฆฌ ์‹œ์ž‘", type="primary", use_container_width=True):
1008
  if not GROK_API_KEY:
1009
  st.error("โš ๏ธ GROK_API_KEY๊ฐ€ .env ํŒŒ์ผ์— ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค!")
1010
  st.stop()
1011
 
 
1012
  st.session_state.vector_db = None
1013
  st.session_state.embedder = None
1014
  st.session_state.chat_history = []
1015
  st.session_state.current_highlights = []
1016
 
1017
+ with st.spinner("๋ฌธ์„œ ์ฒ˜๋ฆฌ ์ค‘..."):
1018
  try:
1019
  chunks, metadata_list, pdf_bytes, pages_text = extract_text_from_pdf(uploaded_file)
1020
 
1021
+ with st.spinner("๋ฌธ์„œ๋ฅผ AI๊ฐ€ ์ดํ•ดํ•  ์ˆ˜ ์žˆ๊ฒŒ ์ฒ˜๋ฆฌ ์ค‘.."):
1022
  collection, embedder = create_vector_db(chunks, metadata_list)
1023
 
1024
  st.session_state.vector_db = collection
 
1032
  "pages": len(set(m['page'] for m in metadata_list))
1033
  }
1034
 
1035
+ st.success("๋ฌธ์„œ ์ฒ˜๋ฆฌ ์™„๋ฃŒ!")
1036
  st.rerun()
1037
 
1038
  except Exception as e:
1039
  st.error(f"์˜ค๋ฅ˜: {str(e)}")
1040
 
 
1041
  if st.session_state.processed:
1042
+ st.markdown("#### ๋ฌธ์„œ ์ •๋ณด")
1043
+ st.info(f"**{st.session_state.doc_metadata['filename']}**")
1044
+ st.info(f"ํŽ˜์ด์ง€: {st.session_state.doc_metadata['pages']}")
1045
 
 
1046
  if not st.session_state.processed:
1047
  st.markdown("""
1048
  <div class="usage-guide">
1049
+ <h2 style="text-align: center; color: #2D3748; margin-bottom: 1.5rem;">์‚ฌ์šฉ ๋ฐฉ๋ฒ•</h2>
1050
  <div class="guide-step">
1051
  <div class="step-number">1</div>
1052
+ <div>PDF ํŒŒ์ผ์„ ์˜ฌ๋ ค์ฃผ์„ธ์š”</div>
1053
  </div>
1054
  <div class="guide-step">
1055
  <div class="step-number">2</div>
 
1066
  </div>
1067
  """, unsafe_allow_html=True)
1068
 
 
1069
  else:
 
1070
  col1, col2 = st.columns([1, 1])
1071
 
1072
  with col1:
 
1073
  header_cols = st.columns([7, 1, 1.5, 1])
1074
  with header_cols[0]:
1075
+ st.markdown("### ๋ฌธ์„œ ๋ทฐ์–ด")
 
 
 
 
 
 
 
 
 
 
 
 
1076
 
1077
  if st.session_state.pdf_bytes:
1078
  pdf_html = render_pdf_with_highlights(
 
1082
  )
1083
  st.markdown(pdf_html, unsafe_allow_html=True)
1084
 
 
1085
  if st.session_state.scroll_to_page:
1086
  scroll_js = f"""
1087
  <script>
 
1088
  const container = parent.document.querySelector('.pdf-container');
1089
  const targetPage = parent.document.getElementById('page-{st.session_state.scroll_to_page}');
1090
 
1091
  if (container && targetPage) {{
 
1092
  const containerRect = container.getBoundingClientRect();
1093
  const targetRect = targetPage.getBoundingClientRect();
1094
  const scrollTop = container.scrollTop;
1095
  const offset = targetRect.top - containerRect.top + scrollTop;
1096
 
 
1097
  container.scrollTo({{
1098
  top: offset - 20,
1099
  behavior: 'smooth'
 
1105
  st.session_state.scroll_to_page = None
1106
 
1107
  with col2:
1108
+ st.markdown('### PLOBIN CHAT', unsafe_allow_html=True)
1109
 
 
1110
  chat_container = st.container(height=650)
1111
 
1112
  with chat_container:
 
1117
  if msg["role"] == "assistant" and "sources" in msg:
1118
  with st.expander("๐Ÿ“š ์ฐธ์กฐ ๋ฌธ์„œ"):
1119
  for idx, (doc, meta) in enumerate(zip(msg["sources"]["docs"], msg["sources"]["metas"])):
 
1120
  clean_text = doc[:150] + ('...' if len(doc) > 150 else '')
1121
 
 
1122
  if st.button(
1123
+ f"ํŽ˜์ด์ง€ {meta['page']}",
1124
  key=f"goto_source_msg{msg_idx}_{meta['page']}_{idx}",
1125
  use_container_width=True,
1126
  type="secondary"
 
1128
  st.session_state.scroll_to_page = meta['page']
1129
  st.rerun()
1130
 
 
1131
  st.markdown(f"""
1132
  <div style="background: #F1F5F9; padding: 0.8rem; border-radius: 0.5rem; margin-bottom: 1rem; border-left: 3px solid #667eea;">
1133
  <div style="font-size: 0.9rem; color: #475569;">
 
1135
  </div>
1136
  </div>
1137
  """, unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1138
 
 
1139
  prompt = st.chat_input("๐Ÿ’ฌ ์งˆ๋ฌธ์„ ์ž…๋ ฅํ•˜์„ธ์š”...", key="chat_input")
1140
 
 
1141
  if prompt:
1142
  st.session_state.chat_history.append({"role": "user", "content": prompt})
1143
  st.session_state.processing_query = prompt
1144
  st.rerun()
1145
 
 
1146
  if st.session_state.processing_query:
1147
  query = st.session_state.processing_query
1148
+ st.session_state.processing_query = None
1149
 
1150
+ with st.spinner("PLOBIN์ด ๊ฒ€์ƒ‰์ค‘์ž…๋‹ˆ๋‹ค..."):
1151
  try:
 
1152
  search_results = hybrid_search(
1153
  query,
1154
  st.session_state.vector_db,
 
1156
  top_k=3
1157
  )
1158
 
 
1159
  grok_result = grok_verify_and_extract(
1160
  query,
1161
  search_results,
1162
  GROK_API_KEY
1163
  )
1164
 
 
1165
  answer = generate_answer(
1166
  query,
1167
  search_results,
1168
  GROK_API_KEY
1169
  )
1170
 
1171
+ highlights = extract_highlights_from_answer(answer)
 
1172
  st.session_state.current_highlights = highlights
1173
 
 
1174
  if grok_result and "page" in grok_result and "error" not in grok_result:
1175
  st.session_state.scroll_to_page = grok_result["page"]
1176
 
 
1177
  chat_data = {
1178
  "role": "assistant",
1179
  "content": answer,