dohyune commited on
Commit
adb068a
·
verified ·
1 Parent(s): 1e3e9f5

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -1076
app.py DELETED
@@ -1,1076 +0,0 @@
1
- """
2
- PROBIN - RFx 문서 분석 AI (하이브리드 검색 + Grok 점검 버전)
3
-
4
- """
5
- import streamlit as st
6
- import fitz # PyMuPDF
7
- import chromadb
8
- from sentence_transformers import SentenceTransformer, util
9
- import requests
10
- import os
11
- import re
12
- import shutil
13
- from collections import Counter
14
- import numpy as np
15
- from typing import List, Dict, Tuple
16
- import base64
17
- from dotenv import load_dotenv
18
- import json
19
-
20
# Load environment variables (python-dotenv) before any os.getenv lookups below.
load_dotenv()

# Configuration
# API key for the Grok endpoint; os.getenv returns None when the variable is unset.
GROK_API_KEY = os.getenv("GROK_API_KEY")
# Base URL that the chat-completions requests in this file are built from.
GROK_API_BASE = "https://api.x.ai/v1"
# NOTE(review): not referenced by the code visible in this chunk (the vector
# store below is created with an in-memory client) - confirm whether it is still needed.
CHROMA_DIR = "./chroma_db"
# Model name handed to SentenceTransformer in load_embedding_model().
EMBEDDING_MODEL = 'jhgan/ko-sroberta-multitask'
28
-
29
- # ==================== ํ•˜์ด๋ผ์ดํŠธ ์„ค์ • ํด๋ž˜์Šค ====================
30
class HighlightConfig:
    """Highlight settings holder; the colour is fixed to yellow."""

    def __init__(self):
        # RGB components in the 0-1 range: full red + full green, no blue.
        red, green, blue = 1.0, 1.0, 0.0
        self.color = [red, green, blue]
36
-
37
# Page config: title "PROBIN", wide layout, sidebar expanded initially.
# NOTE(review): the page_icon literal looks like a mis-encoded emoji in this
# copy of the file; it is reproduced unchanged - confirm against a clean source.
st.set_page_config(
    page_title="PROBIN",
    page_icon="๐Ÿ”ฎ",
    layout="wide",
    initial_sidebar_state="expanded"
)
44
-
45
- # Custom CSS
46
- st.markdown("""
47
- <style>
48
- [data-testid="stSidebar"] {
49
- background: linear-gradient(180deg,
50
- #667eea 0%,
51
- #764ba2 100%);
52
- box-shadow: 4px 0 30px rgba(0,0,0,0.2);
53
- width: 290px !important;
54
- }
55
-
56
- [data-testid="stSidebar"] h1 {
57
- color: white !important;
58
- text-shadow: 2px 2px 15px rgba(0,0,0,0.4);
59
- }
60
-
61
- /* ํŒŒ์ผ ์—…๋กœ๋” ๋ฐฐ๊ฒฝ ํˆฌ๋ช…ํ•˜๊ฒŒ */
62
- [data-testid="stSidebar"] [data-testid="stFileUploader"] {
63
- background: rgba(255,255,255,0.15);
64
- border-radius: 15px;
65
- padding: 1.5rem;
66
- border: 3px dashed rgba(255,255,255,0.4);
67
- transition: all 0.3s ease;
68
- backdrop-filter: blur(10px);
69
- }
70
-
71
- /* ํŒŒ์ผ ์—…๋กœ๋” ๋‚ด๋ถ€ ์„น์…˜๋„ ํˆฌ๋ช…ํ•˜๊ฒŒ */
72
- [data-testid="stFileUploader"] > section {
73
- background: transparent !important;
74
- }
75
-
76
- /* ํŒŒ์ผ ์—…๋กœ๋” ๋“œ๋ž˜๊ทธ ์˜์—ญ */
77
- [data-testid="stFileUploader"] > section > div {
78
- background: transparent !important;
79
- }
80
-
81
- /* ์—…๋กœ๋“œ๋œ ํŒŒ์ผ ํ‘œ์‹œ ์˜์—ญ */
82
- [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] {
83
- color: #fafafa;
84
- }
85
- /* ํ•˜์–€ ๋ฐ•์Šค(๋“œ๋กญ์กด) ์Šคํƒ€์ผ */
86
- [data-testid="stSidebar"] [data-testid="stFileUploader"] > section,
87
- [data-testid="stSidebar"] [data-testid="stFileUploader"] section > div {
88
- background: transparent !important;
89
- border: none !important;
90
- }
91
- /* ๋“œ๋กญ์กด ๋‚ด๋ถ€ ํ…์ŠคํŠธ ์ƒ‰์ƒ */
92
- [data-testid="stSidebar"] [data-testid="stFileUploader"] [data-testid="stMarkdownContainer"] p {
93
- color: rgba(255,255,255,0.9) !important;
94
- }
95
- /* "ํŒŒ์ผ ์ฐพ๊ธฐ" ๋ฒ„ํŠผ */
96
- [data-testid="stSidebar"] [data-testid="stFileUploader"] button[kind="secondary"] {
97
- background: rgba(255,255,255,0.2) !important;
98
- color: white !important;
99
- border: 1px solid rgba(255,255,255,0.3) !important;
100
- }
101
- /* ์‚ฌ์ด๋“œ๋ฐ” ๋ฒ„ํŠผ ์Šคํƒ€์ผ ์—…๋ฐ์ดํŠธ */
102
- [data-testid="stSidebar"] .stButton button {
103
- background: rgba(255,255,255,0.15) !important;
104
- color: white !important;
105
- border: 2px solid rgba(255,255,255,0.4) !important;
106
- border-radius: 12px !important;
107
- font-weight: 700 !important;
108
- padding: 0.75rem 1.5rem !important;
109
- backdrop-filter: blur(10px) !important;
110
- transition: all 0.3s ease !important;
111
- box-shadow: 0 4px 15px rgba(0,0,0,0.1) !important;
112
- }
113
- [data-testid="stSidebar"] .stButton button:hover {
114
- background: rgba(255,255,255,0.25) !important;
115
- border-color: rgba(255,255,255,0.6) !important;
116
- transform: translateY(-2px) scale(1.02) !important;
117
- box-shadow: 0 6px 20px rgba(0,0,0,0.2) !important;
118
- }
119
- [data-testid="stSidebar"] .stButton button:active {
120
- transform: translateY(0px) scale(0.98) !important;
121
- }
122
- /* Primary ๋ฒ„ํŠผ (๋ฌธ์„œ ์ฒ˜๋ฆฌ ์‹œ์ž‘) ํŠน๋ณ„ ์Šคํƒ€์ผ */
123
- [data-testid="stSidebar"] .stButton button[kind="primary"] {
124
- background: rgba(255,255,255,0.25) !important;
125
- border: 2px solid rgba(255,255,255,0.5) !important;
126
- font-size: 1.05rem !important;
127
- }
128
- [data-testid="stSidebar"] .stButton button[kind="primary"]:hover {
129
- background: rgba(255,255,255,0.35) !important;
130
- border-color: rgba(255,255,255,0.7) !important;
131
- }
132
- [data-testid="stSidebar"] [data-testid="stAlert"] {
133
- background-color: rgba(255, 255, 255, 0.001) !important;
134
- border-radius: 0.5rem !important;
135
- }
136
- [data-testid="stAlert"] p {
137
- color: rgb(250, 250, 250); /* ํฐ์ƒ‰ */
138
- }
139
- /* ๋ฉ”์ธ ์ปจํ…์ธ  ์ „์ฒด ๋„ˆ๋น„ ์‚ฌ์šฉ */
140
- .main .block-container {
141
- max-width: 100%;
142
- padding-left: 2rem;
143
- padding-right: 2rem;
144
- }
145
-
146
- /* ํ—ค๋” ์Šคํƒ€์ผ - ๋ฐ•์Šค ์ œ๊ฑฐ, ํ…์ŠคํŠธ ๊ทธ๋ฆผ์ž๋งŒ */
147
- .probin-header {
148
- padding: 1.5rem 2rem;
149
- margin-bottom: 2rem;
150
- }
151
- .probin-title {
152
- font-size: 2.5rem;
153
- font-weight: bold;
154
- color: white;
155
- margin: 0;
156
- text-align: center;
157
- text-shadow: 2px 2px 8px rgba(0, 0, 0, 0.4),
158
- 0 0 20px rgba(102, 126, 234, 0.4);
159
- }
160
- .probin-subtitle {
161
- font-size: 1rem;
162
- color: rgba(255, 255, 255, 0.9);
163
- text-align: center;
164
- margin-top: 0.5rem;
165
- text-shadow: 1px 1px 6px rgba(0, 0, 0, 0.4);
166
- }
167
-
168
- /* ํŒŒ์ผ ์—…๋กœ๋” ์ปค์Šคํ„ฐ๋งˆ์ด์ง• */
169
- [data-testid="stFileUploader"] {
170
- background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
171
- border: 3px dashed #667eea;
172
- border-radius: 1rem;
173
- padding: 3rem 2rem;
174
- }
175
-
176
- [data-testid="stFileUploader"] > div {
177
- text-align: center;
178
- }
179
-
180
- [data-testid="stFileUploader"] label {
181
- font-size: 1.2rem !important;
182
- color: #2D3748 !important;
183
- font-weight: 600 !important;
184
- }
185
-
186
- /* PDF ์ปจํ…Œ์ด๋„ˆ */
187
- .pdf-container {
188
- border: 2px solid #E2E8F0;
189
- border-radius: 0.5rem;
190
- padding: 0.5rem;
191
- height: 705px;
192
- overflow-y: auto;
193
- background: white;
194
- }
195
-
196
- /* ์ฑ„ํŒ… ์ปจํ…Œ์ด๋„ˆ - ์Šคํฌ๋กค ์ถ”๊ฐ€ */
197
- .chat-container {
198
- border: 2px solid #E2E8F0;
199
- border-radius: 0.5rem;
200
- padding: 1rem;
201
- height: 650px;
202
- overflow-y: auto;
203
- background: white;
204
- margin-bottom: 0.5rem;
205
- }
206
-
207
- /* ์ฑ„ํŒ… ์ž…๋ ฅ์ฐฝ๊ณผ ์ปจํ…Œ์ด๋„ˆ ๊ฐ„๊ฒฉ ์ตœ์†Œํ™” */
208
- [data-testid="stChatInput"] {
209
- margin-top: 0 !important;
210
- padding-top: 0 !important;
211
- }
212
-
213
- /* ์ฑ„ํŒ… ์Šคํƒ€์ผ */
214
- .source-box {
215
- background: #F1F5F9;
216
- padding: 1rem;
217
- border-radius: 0.5rem;
218
- margin: 0.5rem 0;
219
- border-left: 3px solid #667eea;
220
- }
221
-
222
- .source-title {
223
- font-weight: bold;
224
- color: #667eea;
225
- margin-bottom: 0.5rem;
226
- }
227
-
228
- .page-indicator {
229
- background: #667eea;
230
- color: white;
231
- padding: 0.3rem 0.8rem;
232
- border-radius: 1rem;
233
- font-size: 0.85rem;
234
- display: inline-block;
235
- margin: 0.2rem;
236
- }
237
-
238
- .highlight-indicator {
239
- background: #FEF08A;
240
- color: #854D0E;
241
- padding: 0.5rem 1rem;
242
- border-radius: 0.5rem;
243
- margin: 0.5rem 0;
244
- font-weight: bold;
245
- border-left: 4px solid #EAB308;
246
- }
247
-
248
- /* ์‚ฌ์šฉ ์•ˆ๋‚ด ์Šคํƒ€์ผ */
249
- .usage-guide {
250
- background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
251
- padding: 2rem;
252
- border-radius: 1rem;
253
- margin-bottom: 2rem;
254
- height: 100%;
255
- }
256
-
257
- .guide-step {
258
- display: flex;
259
- align-items: center;
260
- margin: 1.5rem 0;
261
- font-size: 1.1rem;
262
- color: #2D3748;
263
- }
264
-
265
- .step-number {
266
- background: #667eea;
267
- color: white;
268
- width: 2.5rem;
269
- height: 2.5rem;
270
- border-radius: 50%;
271
- display: flex;
272
- align-items: center;
273
- justify-content: center;
274
- font-weight: bold;
275
- font-size: 1.2rem;
276
- margin-right: 1rem;
277
- flex-shrink: 0;
278
- }
279
-
280
- /* ๋ทฐ์–ด ํ—ค๋” ์Šคํƒ€์ผ */
281
- .viewer-header {
282
- display: flex;
283
- justify-content: space-between;
284
- align-items: center;
285
- margin-bottom: 1rem;
286
- }
287
- </style>
288
- """, unsafe_allow_html=True)
289
-
290
-
291
def init_session():
    """Give every session-state key used by the app an initial value.

    A key that already exists in ``st.session_state`` is never overwritten;
    only missing keys receive their default.
    """
    state = st.session_state

    # key -> zero-argument factory, so mutable defaults (lists, dicts, the
    # HighlightConfig instance) are created fresh and only when needed.
    default_factories = {
        'processed': lambda: False,
        'vector_db': lambda: None,
        'embedder': lambda: None,
        'chat_history': list,
        'doc_metadata': dict,
        'pdf_bytes': lambda: None,
        'pdf_pages_text': dict,
        'current_highlights': list,
        'zoom_level': lambda: 2.0,
        'highlight_config': HighlightConfig,
        'processing_query': lambda: None,
    }

    for key, make_default in default_factories.items():
        if key not in state:
            state[key] = make_default()
315
-
316
-
317
def extract_text_from_pdf(pdf_file) -> Tuple[List[str], List[Dict], bytes, Dict]:
    """Read a PDF upload, collect each page's text and split it into chunks.

    Args:
        pdf_file: File-like object exposing ``read()`` (returning the PDF
            bytes) and a ``name`` attribute used as the ``source`` label.

    Returns:
        A 4-tuple ``(chunks, metadata_list, pdf_bytes, pages_text)``.
        ``chunks`` and ``metadata_list`` are parallel lists; each metadata
        dict has the keys ``page`` (1-based), ``source`` and ``chunk_type``.
        ``pdf_bytes`` is the raw content that was read, and ``pages_text``
        maps every 1-based page number to the text extracted for that page
        (pages without text are recorded but produce no chunks).
    """
    # NOTE(review): the previous comments announced 300 -> 800 and 60 -> 150,
    # but the code in this file uses 300 / 60. The effective values are kept;
    # confirm which pair is actually intended.
    CHUNK_SIZE = 300    # soft upper bound on a chunk, in characters
    OVERLAP_SIZE = 60   # trailing characters repeated at the start of the next chunk

    pdf_bytes = pdf_file.read()

    chunks = []
    metadata_list = []
    pages_text = {}

    def _store(chunk_text, page_number):
        """Append one finished chunk together with its metadata record."""
        chunks.append(chunk_text.strip())
        metadata_list.append({
            "page": page_number,
            "source": pdf_file.name,
            "chunk_type": "paragraph"
        })

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        for page_index in range(len(doc)):
            page_number = page_index + 1
            text = doc[page_index].get_text("text")
            pages_text[page_number] = text

            if not text.strip():
                continue

            # Drop blank lines and surrounding whitespace before splitting.
            lines = [line.strip() for line in text.split('\n') if line.strip()]
            cleaned_text = '\n'.join(lines)

            # The capturing group makes re.split return the terminators too,
            # so they are appended back and the text is not shortened.
            pieces = re.split(r'([.!?]\s+|\n{2,})', cleaned_text)
            pieces = [piece for piece in pieces if piece.strip()]

            current_chunk = ""
            for piece in pieces:
                if current_chunk and len(current_chunk) + len(piece) > CHUNK_SIZE:
                    _store(current_chunk, page_number)
                    # Seed the next chunk with the tail of the one just stored.
                    if len(current_chunk) > OVERLAP_SIZE:
                        current_chunk = current_chunk[-OVERLAP_SIZE:]
                    current_chunk = current_chunk + piece
                else:
                    current_chunk += piece

            # Chunks never span pages: flush whatever is left for this page.
            if current_chunk.strip():
                _store(current_chunk, page_number)
    finally:
        # Release the document even when extraction fails part-way.
        doc.close()

    return chunks, metadata_list, pdf_bytes, pages_text
382
-
383
-
384
@st.cache_resource
def load_embedding_model():
    """Load the embedding model.

    Builds a ``SentenceTransformer`` from the module-level ``EMBEDDING_MODEL``
    name; the function is wrapped in ``st.cache_resource``.
    """
    return SentenceTransformer(EMBEDDING_MODEL)
388
-
389
-
390
def create_vector_db(chunks: List[str], metadata_list: List[Dict]):
    """Create the vector database in memory and fill it with the chunks.

    Args:
        chunks: Text chunks to embed and store.
        metadata_list: One metadata dict per chunk, in the same order.

    Returns:
        ``(collection, embedder)`` - the populated ``rfx_docs`` collection and
        the embedding model used to fill it.
    """
    embedder = load_embedding_model()

    # In-memory ChromaDB client (the original comment gives avoiding
    # file-system problems as the reason).
    client_settings = chromadb.Settings(
        anonymized_telemetry=False,
        allow_reset=True
    )
    client = chromadb.EphemeralClient(settings=client_settings)

    # Best effort: remove a collection left over from an earlier run.
    try:
        client.delete_collection("rfx_docs")
    except Exception:
        pass

    collection = client.create_collection(
        name="rfx_docs",
        metadata={"hnsw:space": "cosine"}
    )

    # Embed in batches and convert each vector to a plain list as we go.
    batch_size = 32
    vectors = []
    for start in range(0, len(chunks), batch_size):
        encoded = embedder.encode(
            chunks[start:start + batch_size],
            show_progress_bar=False,
            convert_to_numpy=True
        )
        vectors.extend(vector.tolist() for vector in encoded)

    collection.add(
        embeddings=vectors,
        documents=chunks,
        metadatas=metadata_list,
        ids=[f"doc_{index}" for index, _ in enumerate(chunks)]
    )

    return collection, embedder
431
-
432
-
433
def extract_keywords(text: str, top_n: int = 5) -> List[str]:
    """Extract up to ``top_n`` search keywords from Korean text.

    Tokens that contain digits (with any Hangul attached, e.g. ``100억원``)
    come first, in order of appearance. They are followed by Hangul words of
    two or more syllables ranked by frequency, where stopwords are dropped
    and procurement-related terms get a +5 boost.

    The Hangul literals below were mis-encoded in the previous version of
    this block. The character class then contained a reversed range, which
    makes ``re`` raise "bad character range". They are restored to the
    ``가-힣`` syllable range and to readable Korean words.

    Args:
        text: Text to analyse (typically the user's question).
        top_n: Maximum number of keywords to return.

    Returns:
        A list of at most ``top_n`` keyword strings.
    """
    words_with_numbers = re.findall(r'[가-힣]*\d+[가-힣]*', text)
    words = re.findall(r'[가-힣]{2,}', text)

    # Function words and question phrasing that carry no search value.
    stopwords = {
        '것', '등', '및', '그', '이', '저', '수', '때', '중', '내', '년', '월', '일',
        '경우', '대한', '통해', '위해', '관련', '있는', '하는', '되는', '이런', '저런',
        '어떤', '무슨', '어느', '누구', '언제', '어디', '무엇', '어떻게', '왜',
        '알려', '설명', '말해', '대해', '관하여', '있나요', '인가요', '무엇인가요',
        '얼마', '입니까', '합니까'
    }

    # Domain terms that should outrank ordinary words.
    important_keywords = {
        '금액', '가격', '비용', '예산', '설계', '사업', '과업', '계약',
        '공사', '용역', '제안', '입찰', '낙찰', '견적', '단가'
    }

    # The regex already guarantees at least two syllables per word.
    word_freq = Counter(w for w in words if w not in stopwords)

    for word in word_freq:
        if word in important_keywords:
            word_freq[word] += 5

    result = [w for w in words_with_numbers if w]

    for word, _ in word_freq.most_common(top_n * 2):
        if word not in result:
            result.append(word)
        if len(result) >= top_n:
            break

    return result[:top_n]
468
-
469
-
470
- # ==================== ์ƒˆ๋กœ์šด ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰ ํ•จ์ˆ˜ ====================
471
def hybrid_search(query: str, collection, embedder, top_k: int = 3) -> Dict:
    """Hybrid search: vector similarity re-ranked with keyword matching.

    A pool of nearest neighbours is fetched from ``collection``, each
    candidate is scored as ``0.7 * vector_score + 0.3 * keyword_score`` and
    the ``top_k`` best are returned.

    Args:
        query: The user's question.
        collection: Object with a Chroma-style ``query`` method.
        embedder: Object whose ``encode`` returns array-like vectors.
        top_k: Number of results to return.

    Returns:
        A dict with ``documents`` and ``metadatas`` (each a list wrapping one
        list, mirroring the query result layout), ``scores`` (hybrid scores,
        best first) and ``keywords`` (the keywords extracted from ``query``).
    """
    # 1. Vector search. The pool is at least 20 candidates so the keyword
    #    score has room to re-rank, and never smaller than top_k (a fixed 20
    #    used to cap the result size for larger top_k values).
    candidate_count = max(20, top_k)
    query_embedding = embedder.encode([query], convert_to_numpy=True)[0]
    vector_results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=candidate_count,
        include=["documents", "metadatas", "distances"]
    )

    # 2. Keyword extraction (lower-cased once, outside the scoring loop).
    keywords = extract_keywords(query, top_n=5)
    lowered_keywords = [keyword.lower() for keyword in keywords]

    # 3. Hybrid score per candidate.
    hybrid_results = []
    for i, doc_id in enumerate(vector_results['ids'][0]):
        doc = vector_results['documents'][0][i]
        metadata = vector_results['metadatas'][0][i]
        # Turn the returned distance into a similarity.
        vector_score = 1 - vector_results['distances'][0][i]

        # Fraction of query keywords that occur in the document text.
        doc_lower = doc.lower()
        hits = sum(1 for keyword in lowered_keywords if keyword in doc_lower)
        keyword_score = hits / len(keywords) if keywords else 0

        # Weighted blend: 70% vector, 30% keyword.
        hybrid_score = 0.7 * vector_score + 0.3 * keyword_score

        hybrid_results.append({
            'id': doc_id,
            'document': doc,
            'metadata': metadata,
            'hybrid_score': hybrid_score,
            'vector_score': vector_score,
            'keyword_score': keyword_score
        })

    # 4. Best first, keep the top_k entries.
    hybrid_results.sort(key=lambda x: x['hybrid_score'], reverse=True)
    top_results = hybrid_results[:top_k]

    return {
        'documents': [[r['document'] for r in top_results]],
        'metadatas': [[r['metadata'] for r in top_results]],
        'scores': [r['hybrid_score'] for r in top_results],
        'keywords': keywords
    }
521
-
522
-
523
- # ==================== Grok API ์ ๊ฒ€ ํ•จ์ˆ˜ ====================
524
def grok_verify_and_extract(query: str, search_results: Dict, api_key: str) -> Dict:
    """Ask the Grok API to check the search results and pick a single passage.

    Args:
        query: The user's question.
        search_results: Dict whose ``documents`` and ``metadatas`` entries are
            lists wrapping one list each; every metadata dict needs a ``page`` key.
        api_key: Bearer token for the API.

    Returns:
        The JSON object parsed from the model reply (the prompt asks for the
        keys ``selected_text``, ``page`` and ``relevance_reason``), or a dict
        with a single ``error`` key when the request fails, the reply cannot
        be parsed, or the reply is not a JSON object.
    """
    docs = search_results['documents'][0]
    metas = search_results['metadatas'][0]

    # Number the candidate passages and label each with its page.
    # NOTE(review): the Korean text inside the string literals of this
    # function looks mis-encoded in this copy of the file; it is reproduced
    # unchanged here - restore it from a clean source.
    formatted_docs = []
    for i, (doc, meta) in enumerate(zip(docs, metas), 1):
        formatted_docs.append(f"[๋ฌธ์„œ {i}] (ํŽ˜์ด์ง€ {meta['page']})\n{doc}")

    context = "\n\n".join(formatted_docs)

    system_prompt = """๋‹น์‹ ์€ RFx ๋ฌธ์„œ ๋ถ„์„ ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค.
์ฃผ์–ด์ง„ 3๊ฐœ์˜ ๋ฌธ์„œ ์ค‘์—์„œ ์‚ฌ์šฉ์ž ์งˆ๋ฌธ๊ณผ **๊ฐ€์žฅ ๊ด€๋ จ ์žˆ๋Š” ๋‹จ 1๊ฐœ์˜ ํ•ต์‹ฌ ์ •๋ณด**๋งŒ ์„ ํƒํ•˜์„ธ์š”.

**์ค‘์š” ๊ทœ์น™:**
1. ๋ฐ˜๋“œ์‹œ **1๊ฐœ์˜ ํ…์ŠคํŠธ**๋งŒ ์ถ”์ถœ
2. ๊ฐ€์žฅ ์ง์ ‘์ ์œผ๋กœ ์งˆ๋ฌธ์— ๋‹ตํ•˜๋Š” ์ •๋ณด ์„ ํƒ
3. ๊ธˆ์•ก, ๋‚ ์งœ, ์ˆ˜๋Ÿ‰ ๋“ฑ ๊ตฌ์ฒด์ ์ธ ์ˆซ์ž ์ •๋ณด ์šฐ์„ 
4. ์ถ”์ถœ๋œ ํ…์ŠคํŠธ๋Š” ์›๋ฌธ ๊ทธ๋Œ€๋กœ ์œ ์ง€ (150์ž ์ด๋‚ด)
5. JSON ํ˜•์‹์œผ๋กœ๋งŒ ์‘๋‹ต

**์‘๋‹ต ํ˜•์‹:**
{
"selected_text": "์„ ํƒ๋œ ํ…์ŠคํŠธ (์›๋ฌธ ๊ทธ๋Œ€๋กœ)",
"page": ํŽ˜์ด์ง€๋ฒˆํ˜ธ,
"relevance_reason": "์ด ํ…์ŠคํŠธ๋ฅผ ์„ ํƒํ•œ ์ด์œ "
}"""

    user_prompt = f"""<์งˆ๋ฌธ>
{query}
</์งˆ๋ฌธ>

<๊ฒ€์ƒ‰๋œ ๋ฌธ์„œ๋“ค>
{context}
</๊ฒ€์ƒ‰๋œ ๋ฌธ์„œ๋“ค>

์œ„ 3๊ฐœ ๋ฌธ์„œ์—์„œ ์งˆ๋ฌธ์— ๊ฐ€์žฅ ์ •ํ™•ํ•˜๊ฒŒ ๋‹ตํ•˜๋Š” **๋‹จ 1๊ฐœ์˜ ํ•ต์‹ฌ ์ •๋ณด**๋ฅผ JSON ํ˜•์‹์œผ๋กœ ์„ ํƒํ•˜์„ธ์š”.
์„ ํƒํ•œ ํ…์ŠคํŠธ๋Š” 150์ž ์ด๋‚ด๋กœ ํ•˜์„ธ์š”."""

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "grok-3",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "temperature": 0.1,
        "max_tokens": 1000,
        "stream": False
    }

    try:
        response = requests.post(
            f"{GROK_API_BASE}/chat/completions",
            headers=headers,
            json=payload,
            timeout=30
        )

        if response.status_code != 200:
            return {"error": f"API ์˜ค๋ฅ˜: {response.status_code}"}

        result = response.json()
        content = result["choices"][0]["message"]["content"]

        # Strip a markdown code fence, if the model wrapped its JSON in one.
        content = content.replace("```json", "").replace("```", "").strip()
        extracted_data = json.loads(content)

        # Callers use `"error" in result` and `.get(...)`, so anything other
        # than a JSON object is reported through the normal error path.
        if not isinstance(extracted_data, dict):
            raise ValueError(
                f"expected a JSON object, got {type(extracted_data).__name__}"
            )

        return extracted_data

    except Exception as e:
        # Top-level boundary: every failure becomes an error dict for the UI.
        return {"error": f"์˜ค๋ฅ˜: {str(e)}"}
603
-
604
-
605
def build_context(search_results: Dict, max_length: int = 3000) -> str:
    """Assemble the retrieved passages into one context string.

    Each passage is rendered as a numbered header carrying its page, followed
    by the passage text. Passages are added while the running length stays
    within ``max_length``. The first passage that does not fit is added in
    truncated form (ending in ``...``) only if more than 200 characters of
    budget remain; nothing after it is added.

    Args:
        search_results: Dict whose ``documents`` and ``metadatas`` entries are
            lists wrapping one list each; metadata dicts need a ``page`` key.
        max_length: Character budget for the accumulated passages.

    Returns:
        The rendered passages joined with newlines.
    """
    docs = search_results['documents'][0]
    metas = search_results['metadatas'][0]

    pieces = []
    used = 0

    for number, (doc, meta) in enumerate(zip(docs, metas), start=1):
        header = f"[๋ฌธ์„œ {number}] (ํŽ˜์ด์ง€ {meta['page']})\n"
        entry = f"{header}{doc}\n"

        if used + len(entry) <= max_length:
            pieces.append(entry)
            used += len(entry)
            continue

        # Over budget: keep a shortened version if there is still real room.
        room = max_length - used
        if room > 200:
            pieces.append(f"{header}{doc[:room - 50]}...\n")
        break

    return "\n".join(pieces)
628
-
629
-
630
def generate_answer(query: str, search_results: Dict, api_key: str) -> str:
    """Generate the answer text for a question from the retrieved passages.

    Builds a context of up to 4000 characters with ``build_context``, sends
    it with the question to the chat-completions endpoint and returns the
    model's reply.

    Args:
        query: The user's question.
        search_results: Search result dict accepted by ``build_context``.
        api_key: Bearer token for the API.

    Returns:
        The reply text on success. On a non-200 status or any exception, a
        message string describing the failure is returned instead of raising.
    """
    context = build_context(search_results, max_length=4000)

    # NOTE(review): the Korean text inside the string literals of this
    # function looks mis-encoded in this copy of the file; it is reproduced
    # unchanged here - restore it from a clean source.
    system_prompt = """๋‹น์‹ ์€ ์ž๋™์ฐจ ์ œ์กฐ์—… RFx ๋ฌธ์„œ ์ „๋ฌธ ๋ถ„์„๊ฐ€์ž…๋‹ˆ๋‹ค.
**์‚ฐ์—… ํŠนํ™” ์ง€์นจ:**
1. **์ž๋™์ฐจ ์ œ์กฐ์—… ์€์–ดยท์•ฝ์–ด ํ•ด์„**: ์‚ฌ์šฉ์ž์˜ ์งˆ๋ฌธ์—๋Š” ์ž๋™์ฐจ ์ œ์กฐ์—… ํŠน์œ ์˜ ์€์–ดยท์•ฝ์–ดยท์ „๋ฌธ์šฉ์–ด๊ฐ€ ํฌํ•จ๋  ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ ์‚ฐ์—… ๋ฌธ๋งฅ์— ๋งž๊ฒŒ ์ •ํ™•ํžˆ ํ•ด์„ํ•˜๋ผ.
2. **์–ธ์–ด ํ˜ผ์šฉ ๋ฐ ๋น„๋ฌธ ๋Œ€์‘**: ์‚ฌ์šฉ์ž์˜ ๋ฌธ์žฅ์€ ํ•œ๊ตญ์–ด์™€ ์˜์–ด๊ฐ€ ์„ž์ด๊ฑฐ๋‚˜ ๋ฌธ๋ฒ• ์˜ค๋ฅ˜๊ฐ€ ์žˆ์„ ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ ์˜๋„๋ฅผ ์ถ”๋ก ํ•˜์—ฌ ์ •ํ™•ํžˆ ์ดํ•ดํ•˜๋ผ.
3. **๋ชจํ˜ธํ•œ ์งˆ๋ฌธ ์ž๋™ ๋ณด์ •**: ์‚ฌ์šฉ์ž์˜ ์งˆ๋ฌธ์ด ๋ถˆ์™„์ „ํ•˜๊ฑฐ๋‚˜ ๋ชจํ˜ธํ•ด๋„ ์งˆ๋ฌธ ์˜๋„๋ฅผ ์ถ”๋ก ํ•˜์—ฌ ์ ์ ˆํ•˜๊ฒŒ ์žฌ๊ตฌ์„ฑํ•˜๋ผ.
**๋ฌธ์„œ ๊ธฐ๋ฐ˜ ์‘๋‹ต ์›์น™ (์ ˆ๋Œ€ ์ถ”์ธก ๊ธˆ์ง€):**
1. ์ œ๊ณต๋œ ๋ฌธ์„œ๋ฅผ **๋งค์šฐ ๊ผผ๊ผผํžˆ** ์ฝ๊ณ  ์ •ํ™•ํ•œ ์ •๋ณด๋ฅผ ์ฐพ์œผ์„ธ์š”
2. **๋ฐ˜๋“œ์‹œ ๋ฌธ์„œ์—์„œ ๊ทผ๊ฑฐ๋ฅผ ์ฐพ์•„ ๋‹ต๋ณ€**ํ•˜๊ณ , ๋ฌธ์„œ์— ์—†๋Š” ๋‚ด์šฉ์€ ์ž„์˜๋กœ ์ถ”์ธกํ•˜์ง€ ๋ง๊ณ  **"๋ฌธ์„œ์—์„œ ๊ด€๋ จ ์ •๋ณด๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"**๋ผ๊ณ  ๋ช…์‹œํ•˜๋ผ
3. **๋ฌธ์„œ์™€ ์ „ํ˜€ ๋ฌด๊ด€ํ•œ ์งˆ๋ฌธ**(์˜ˆ: ์ ์‹ฌ ์ถ”์ฒœ, ๋‚ ์”จ, ์ผ์ƒ ๋Œ€ํ™” ๋“ฑ)์€ **"์ฃ„์†กํ•˜์ง€๋งŒ, ์ œ๊ณต๋œ ๋ฌธ์„œ์—๋Š” ํ•ด๋‹น ์งˆ๋ฌธ๊ณผ ๊ด€๋ จ๋œ ์ •๋ณด๊ฐ€ ํฌํ•จ๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค."**๋ผ๊ณ ๋งŒ ๋‹ต๋ณ€ํ•˜๊ณ  ์ถ”๊ฐ€ ์„ค๋ช… ์—†์ด ์ข…๋ฃŒํ•˜๋ผ
4. ๋ฌธ์„œ์— ์ •๋ณด๊ฐ€ ์žˆ๋Š”๋ฐ๋„ "์—†๋‹ค"๊ณ  ํ•˜์ง€ ๋งˆ์„ธ์š”
**ํ•ต์‹ฌ ์ •๋ณด ์šฐ์„  ์ถ”์ถœ:**
- ๊ธˆ์•ก, ์ˆ˜๋Ÿ‰, ๊ทœ๊ฒฉ, ์ผ์ •, ์š”๊ตฌ์กฐ๊ฑด ๋“ฑ **์ˆ˜์น˜ ๊ธฐ๋ฐ˜ ์ •๋ณด๋ฅผ ์ตœ์šฐ์„ **์œผ๋กœ ์‹๋ณ„ํ•˜๊ณ  ์ •ํ™•ํ•˜๊ฒŒ ๋ฐ˜ํ™˜ํ•˜๋ผ
- ์ˆซ์ž, ๊ธˆ์•ก, ๋‚ ์งœ ๋“ฑ ๊ตฌ์ฒด์ ์ธ ์ •๋ณด๋ฅผ ์šฐ์„ ์ ์œผ๋กœ ์ฐพ์œผ์„ธ์š”
- ์• ๋งคํ•œ ํ‘œํ˜„ ๋Œ€์‹  ๊ตฌ์ฒด์ ์ธ ์ˆ˜์น˜๋ฅผ ์ œ๊ณตํ•˜์„ธ์š”
**๋ฐฉ๋Œ€ํ•œ ๋ฌธ์„œ ์ฒ˜๋ฆฌ (500ํŽ˜์ด์ง€ ๊ฐ€๋Šฅ):**
- ๋ฌธ์„œ๊ฐ€ ๋งค์šฐ ๊ธธ ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ ์งˆ๋ฌธ๊ณผ ์ง์ ‘ ๊ด€๋ จ๋œ ๋ถ€๋ถ„๋งŒ ์„ ๋ณ„ํ•ด ์š”์•ฝํ•˜๊ณ  ํ•ต์‹ฌ ์ •๋ณด๋งŒ ์‚ฌ์šฉํ•˜๋ผ
**์‹ค๋ฌด ๋งฅ๋ฝ ๊ณ ๋ ค (RFx ํ”„๋กœ์„ธ์Šค ํŠนํ™”):**
- ๋‹ต๋ณ€ํ•  ๋•Œ ์‹ค์ œ ์ž๋™์ฐจ RFx ์‹ค๋ฌด์ž๊ฐ€ ์˜์‚ฌ๊ฒฐ์ •์— ์‚ฌ์šฉํ•˜๋Š” ์ •๋ณด๋ผ๋Š” ์ ์„ ๊ณ ๋ คํ•˜์—ฌ ์‹ค๋ฌด ์ค‘์‹ฌ์œผ๋กœ ๋ช…ํ™•ํ•˜๊ฒŒ ์„ค๋ช…ํ•˜๋ผ
- ํŠนํžˆ ๋‹ค์Œ ํ•ญ๋ชฉ๋“ค์„ ์šฐ์„ ์ ์œผ๋กœ ํŒŒ์•…ํ•˜๋ผ:
1. ์‚ฌ์—… ์ฐธ์—ฌ ์ž๊ฒฉ ๋ฐ ์š”๊ตฌ ์ธ์ฆ
2. ์‚ฌ์—… ๊ธฐ๊ฐ„ ๋ฐ ์ผ์ •
3. ์˜ˆ์‚ฐ (ํ˜„๊ธˆ/ํ˜„๋ฌผ ๋น„์ค‘, ์ˆœ์ˆ˜ ํšŒ์‚ฌ ์ˆ˜์ต ๊ฐ€๋Šฅ์„ฑ)
4. ์ œ์•ˆ์š”์ฒญ์„œ ์‚ฌ์–‘์„œ โ€“ ํ•„์š”ํ•œ ๊ธฐ์ˆ ์  ์š”๊ตฌ์‚ฌํ•ญ(์„œ๋ฒ„/์†Œํ”„ํŠธ์›จ์–ด ๋“ฑ)
5. ํŒ๋งค ๋Œ€์ƒ ๋ฐ ์‚ฌ์—… ๋ฒ”์œ„
**๋‹ต๋ณ€ ํ˜•์‹:**
- ๋‹ต๋ณ€ ์‹œ ๋ฐ˜๋“œ์‹œ **[ํŽ˜์ด์ง€ X]** ํ˜•ํƒœ๋กœ ์ถœ์ฒ˜๋ฅผ ๋ช…์‹œํ•˜์„ธ์š” (์˜ˆ: [ํŽ˜์ด์ง€ 3], [ํŽ˜์ด์ง€ 5, 12])
- ๊ด€๋ จ ๋ฌธ๋งฅ์„ ์œ ์ง€ํ•˜๋ฉฐ, ๋‹ต๋ณ€์—๋Š” ๋ฌธ์„œ์˜ ํŽ˜์ด์ง€ ๋ฒˆํ˜ธ์™€ ์›๋ฌธ ์ผ๋ถ€๋ฅผ ์ •ํ™•ํžˆ ์ธ์šฉํ•˜๋ผ
- ํ•ต์‹ฌ ๋‹ต๋ณ€์„ ๋จผ์ € ๋ช…ํ™•ํ•˜๊ฒŒ ์ œ์‹œ
- ํ•„์š”์‹œ ์ถ”๊ฐ€ ๊ด€๋ จ ์ •๋ณด ์ œ๊ณต
- ๋ฆฌ์ŠคํŠธ๋Š” - ๋˜๋Š” ๋ฒˆํ˜ธ๋ฅผ ์‚ฌ์šฉ
- ๊ฐ•์กฐ๋Š” **๊ตต๊ฒŒ** ๋˜๋Š” *๊ธฐ์šธ์ž„* ์‚ฌ์šฉ
- **๋‹ต๋ณ€์€ ๋ฐ˜๋“œ์‹œ ๋งˆํฌ๋‹ค์šด๋งŒ ์‚ฌ์šฉํ•ด์•ผ ํ•˜๋ฉฐ, HTML ํƒœ๊ทธ(<div>, <span>, <details>, <summary> ๋“ฑ)๋Š” ์ ˆ๋Œ€ ์‚ฌ์šฉํ•˜์ง€ ๋งˆ์‹ญ์‹œ์˜ค**"""

    user_prompt = f"""๋‹ค์Œ ๋ฌธ์„œ๋“ค์„ **๋งค์šฐ ๊ผผ๊ผผํžˆ** ์ฝ๊ณ  ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•˜์„ธ์š”.
<๋ฌธ์„œ>
{context}
</๋ฌธ์„œ>
<์งˆ๋ฌธ>
{query}
</์งˆ๋ฌธ>
**์ค‘์š”**:
- ์งˆ๋ฌธ์ด ๋ฌธ์„œ์™€ ์ „ํ˜€ ๋ฌด๊ด€ํ•œ ๊ฒฝ์šฐ(์˜ˆ: ์ ์‹ฌ ์ถ”์ฒœ, ๋‚ ์”จ, ์ผ์ƒ ๋Œ€ํ™” ๋“ฑ) "์ฃ„์†กํ•˜์ง€๋งŒ, ์ œ๊ณต๋œ ๋ฌธ์„œ์—๋Š” ํ•ด๋‹น ์งˆ๋ฌธ๊ณผ ๊ด€๋ จ๋œ ์ •๋ณด๊ฐ€ ํฌํ•จ๋˜์–ด ์žˆ์ง€ ์•Š์Šต๋‹ˆ๋‹ค."๋ผ๊ณ ๋งŒ ๋‹ต๋ณ€ํ•˜์„ธ์š”
- ๋ฌธ์„œ๋ฅผ ์ฒ˜์Œ๋ถ€ํ„ฐ ๋๊นŒ์ง€ ์ฃผ์˜ ๊นŠ๊ฒŒ ์ฝ์œผ์„ธ์š”
- ์ˆซ์ž, ๊ธˆ์•ก ๋“ฑ ๊ตฌ์ฒด์ ์ธ ์ •๋ณด๋ฅผ ์ฐพ์œผ์„ธ์š”
- ์ฐพ์€ ์ •๋ณด๋Š” ์ •ํ™•ํžˆ ์ธ์šฉํ•˜์„ธ์š”
- ์ถœ์ฒ˜๋Š” ๋ฐ˜๋“œ์‹œ [ํŽ˜์ด์ง€ X] ํ˜•ํƒœ๋กœ ํ‘œ์‹œํ•˜์„ธ์š” (์˜ˆ: [ํŽ˜์ด์ง€ 3])
- ์—ฌ๋Ÿฌ ํŽ˜์ด์ง€์—์„œ ์ •๋ณด๋ฅผ ์ฐพ์€ ๊ฒฝ์šฐ [ํŽ˜์ด์ง€ 3, 5, 12] ํ˜•ํƒœ๋กœ ํ‘œ์‹œํ•˜์„ธ์š”
- ์ •๋ง๋กœ ๋ฌธ์„œ์— ์—†๋Š” ๊ฒฝ์šฐ์—๋งŒ "๋ฌธ์„œ์—์„œ ๊ด€๋ จ ์ •๋ณด๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค"๋ผ๊ณ  ํ•˜์„ธ์š”
- ๋งˆํฌ๋‹ค์šด ํ˜•์‹์œผ๋กœ๋งŒ ๋‹ต๋ณ€ํ•˜๊ณ , HTML ํƒœ๊ทธ๋Š” ์ ˆ๋Œ€ ์‚ฌ์šฉํ•˜์ง€ ๋งˆ์„ธ์š”"""

    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    request_body = {
        "model": "grok-3",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "temperature": 0.1,
        "max_tokens": 2000,
        "stream": False
    }

    try:
        response = requests.post(
            f"{GROK_API_BASE}/chat/completions",
            headers=request_headers,
            json=request_body,
            timeout=30
        )

        # Happy path first: hand back the model's reply text.
        if response.status_code == 200:
            return response.json()["choices"][0]["message"]["content"]

        # Otherwise try the structured error message, falling back to the raw body.
        try:
            error_detail = response.json().get('error', {}).get('message', '')
        except Exception:
            error_detail = response.text

        return f"โŒ API ์˜ค๋ฅ˜ (์ฝ”๋“œ: {response.status_code})\n\n{error_detail}"

    except Exception as e:
        return f"โŒ ์˜ค๋ฅ˜: {str(e)}"
723
-
724
-
725
def highlight_text_in_pdf(pdf_bytes: bytes, highlight_info: List[Dict]) -> bytes:
    """Add yellow highlight annotations to a PDF and return the new bytes.

    Args:
        pdf_bytes: The source PDF content.
        highlight_info: Dicts with ``page`` (1-based page number) and ``text``
            (the string to search for on that page).

    Returns:
        The bytes of the PDF with the highlight annotations added. Entries
        whose page lies outside the document are ignored.
    """
    # Fixed yellow, RGB components in the 0-1 range.
    yellow_color = [1.0, 1.0, 0.0]

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        for item in highlight_info:
            page_index = item['page'] - 1
            search_text = item['text']

            # Reject pages before the first one as well as past the last:
            # a negative index would otherwise select a page counted from
            # the end of the document.
            if not 0 <= page_index < len(doc):
                continue

            page = doc[page_index]

            # Try the text as given, without spaces, and without commas.
            # dict.fromkeys drops identical variants (keeping their order) so
            # the same hit is not annotated several times; empty strings are
            # skipped because there is nothing to search for.
            text_variations = dict.fromkeys(
                variant
                for variant in (
                    search_text,
                    search_text.replace(' ', ''),
                    search_text.replace(',', ''),
                )
                if variant
            )

            for text_var in text_variations:
                for inst in page.search_for(text_var):
                    highlight = page.add_highlight_annot(inst)
                    highlight.set_colors(stroke=yellow_color)
                    highlight.update()

        return doc.tobytes()
    finally:
        # Release the document even if searching or annotating fails.
        doc.close()
759
-
760
-
761
- # ==================== Grok ์ถ”์ถœ ๊ฒฐ๊ณผ ๊ธฐ๋ฐ˜ ํ•˜์ด๋ผ์ดํŠธ ====================
762
def extract_highlights_from_grok(grok_result: Dict) -> List[Dict]:
    """Convert the single passage chosen by Grok into highlight entries.

    The input comes from model-generated JSON, so its value types are not
    guaranteed. ``page`` is coerced to ``int`` (``"3"`` becomes ``3``), and
    the highlight is dropped when the page cannot be interpreted or is
    smaller than 1, because the highlighter does arithmetic on it.

    Args:
        grok_result: Parsed reply with ``selected_text`` and ``page``, or a
            dict containing an ``error`` key.

    Returns:
        A list with one ``{'text': ..., 'page': ...}`` dict, or an empty list
        when there is an error, no text, text longer than 150 characters, or
        no usable page. A missing ``page`` defaults to 1.
    """
    if not isinstance(grok_result, dict) or "error" in grok_result:
        return []

    # Only the one finally selected passage is handled.
    selected_text = grok_result.get("selected_text", "")
    if not isinstance(selected_text, str) or not selected_text:
        return []
    if len(selected_text) > 150:
        return []

    try:
        page = int(grok_result.get("page", 1))
    except (TypeError, ValueError):
        return []
    if page < 1:
        return []

    return [{
        'text': selected_text,
        'page': page
    }]
780
-
781
-
782
def render_pdf_with_highlights(pdf_bytes: bytes, highlight_info: List[Dict], zoom_level: float = 2.0):
    """Render the highlighted PDF as one HTML string of page images.

    Every page is rasterised to a PNG at ``zoom_level`` and embedded as a
    base64 data URI. Each page gets a header bar; pages listed in
    ``highlight_info`` use the yellow variant of that bar.

    Args:
        pdf_bytes: The source PDF content.
        highlight_info: Dicts with ``page`` and ``text``, passed on to
            ``highlight_text_in_pdf``.
        zoom_level: Render scale; also sets the image width as
            ``int(zoom_level * 50)`` percent (2.0 -> 100%, 1.0 -> 50%).

    Returns:
        The HTML markup, wrapped in a ``pdf-container`` div.
    """
    highlighted_pdf = highlight_text_in_pdf(pdf_bytes, highlight_info)
    highlighted_pages = set(h['page'] for h in highlight_info)

    # Loop-invariant: the same scale matrix and CSS width apply to every page.
    render_matrix = fitz.Matrix(zoom_level, zoom_level)
    zoom_percentage = int(zoom_level * 50)  # 2.0 = 100%, 1.0 = 50%

    # NOTE(review): the non-ASCII text in the header literals below looks
    # mis-encoded in this copy of the file; it is reproduced unchanged.
    html_parts = ['<div class="pdf-container">']

    doc = fitz.open(stream=highlighted_pdf, filetype="pdf")
    try:
        for page_num in range(len(doc)):
            pix = doc[page_num].get_pixmap(matrix=render_matrix)
            img_base64 = base64.b64encode(pix.tobytes("png")).decode()

            html_parts.append('<div style="margin-bottom: 2rem; position: relative;">')

            # The header style depends on whether the page has a highlight.
            if (page_num + 1) in highlighted_pages:
                # Page with a highlight - yellow background.
                html_parts.append(f'<div style="background: #FEF08A; color: #854D0E; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold; border-left: 4px solid #EAB308;">โญ ํŽ˜์ด์ง€ {page_num + 1}</div>')
            else:
                # Ordinary page - blue background.
                html_parts.append(f'<div style="background: #667eea; color: white; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold;">๐Ÿ“„ ํŽ˜์ด์ง€ {page_num + 1}</div>')

            # The width percentage is what applies the zoom in the browser.
            html_parts.append(f'<img src="data:image/png;base64,{img_base64}" style="width: {zoom_percentage}%; border: 1px solid #E2E8F0; border-radius: 0.3rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1); display: block; margin: 0 auto;" />')
            html_parts.append('</div>')
    finally:
        # Release the document even if rendering a page fails.
        doc.close()

    html_parts.append('</div>')
    return ''.join(html_parts)
821
-
822
-
823
- def main():
824
- init_session()
825
-
826
-
827
- # Header ๋ฌธ์„œ ์ฒ˜๋ฆฌ ์ „์—๋งŒ ๋ณด์ž„
828
- if not st.session_state.processed:
829
- st.markdown("""
830
- <div class="probin-header">
831
- <div class="probin-title">๐Ÿ“„ PROBIN</div>
832
- <div class="probin-subtitle">RFx ๋ฌธ์„œ ๋ถ„์„ AI - ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰ + Grok ์ ๊ฒ€</div>
833
- </div>
834
- """, unsafe_allow_html=True)
835
-
836
- # ========== ์‚ฌ์ด๋“œ๋ฐ” ==========
837
- with st.sidebar:
838
- st.title("๐Ÿ”ฎ PROBIN")
839
-
840
- uploaded_file = st.file_uploader(
841
- "๋“œ๋ž˜๊ทธํ•˜์—ฌ ํŒŒ์ผ์„ ์—…๋กœ๋“œ ๋˜๋Š” ํด๋ฆญํ•˜์—ฌ ์„ ํƒํ•˜์„ธ์š”.",
842
- type=['pdf'],
843
- label_visibility="visible",
844
- help="PDF ํŒŒ์ผ๋งŒ ์—…๋กœ๋“œ ๊ฐ€๋Šฅํ•ฉ๋‹ˆ๋‹ค (์ตœ๋Œ€ 200MB)"
845
- )
846
-
847
- if uploaded_file:
848
- if st.button("๐Ÿ“„ ๋ฌธ์„œ ์ฒ˜๋ฆฌ ์‹œ์ž‘", type="primary", use_container_width=True):
849
- if not GROK_API_KEY:
850
- st.error("โš ๏ธ GROK_API_KEY๊ฐ€ .env ํŒŒ์ผ์— ์„ค์ •๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค!")
851
- st.stop()
852
-
853
- # ๊ธฐ์กด ์„ธ์…˜ ์ดˆ๊ธฐํ™”
854
- st.session_state.vector_db = None
855
- st.session_state.embedder = None
856
- st.session_state.chat_history = []
857
- st.session_state.current_highlights = []
858
-
859
- with st.spinner("๐Ÿ“„ ๋ฌธ์„œ ์ฒ˜๋ฆฌ ์ค‘..."):
860
- try:
861
- chunks, metadata_list, pdf_bytes, pages_text = extract_text_from_pdf(uploaded_file)
862
-
863
- with st.spinner("๐Ÿ”ง ๋ฒกํ„ฐ ๋ฐ์ดํ„ฐ๋ฒ ์ด์Šค ์ƒ์„ฑ ์ค‘..."):
864
- collection, embedder = create_vector_db(chunks, metadata_list)
865
-
866
- st.session_state.vector_db = collection
867
- st.session_state.embedder = embedder
868
- st.session_state.pdf_bytes = pdf_bytes
869
- st.session_state.pdf_pages_text = pages_text
870
- st.session_state.processed = True
871
- st.session_state.doc_metadata = {
872
- "filename": uploaded_file.name,
873
- "chunks": len(chunks),
874
- "pages": len(set(m['page'] for m in metadata_list))
875
- }
876
-
877
- st.success("โœ… ๋ฌธ์„œ ์ฒ˜๋ฆฌ ์™„๋ฃŒ!")
878
- st.rerun()
879
-
880
- except Exception as e:
881
- st.error(f"์˜ค๋ฅ˜: {str(e)}")
882
-
883
- # ==================== ์ˆ˜์ •: ์ฒญํฌ ํ‘œ์‹œ ์ œ๊ฑฐ ====================
884
- # ๋ฌธ์„œ ์ •๋ณด ํ‘œ์‹œ (์ฒญํฌ ์ •๋ณด ์ œ์™ธ)
885
- if st.session_state.processed:
886
- st.markdown("#### ๐Ÿ“Š ๋ฌธ์„œ ์ •๋ณด")
887
- st.info(f"๐Ÿ“„ **{st.session_state.doc_metadata['filename']}**")
888
- st.info(f"๐Ÿ“‘ ํŽ˜์ด์ง€: {st.session_state.doc_metadata['pages']}")
889
- # ์ฒญํฌ ํ‘œ์‹œ ์ œ๊ฑฐ๋จ
890
- # ============================================================
891
-
892
- st.divider()
893
-
894
- # ์ดˆ๊ธฐํ™” ๋ฒ„ํŠผ
895
- if st.button("๐Ÿ”„ ์ƒˆ ๋ฌธ์„œ ์—…๋กœ๋“œ", use_container_width=True):
896
- st.session_state.processed = False
897
- st.session_state.vector_db = None
898
- st.session_state.embedder = None
899
- st.session_state.chat_history = []
900
- st.session_state.current_highlights = []
901
- st.session_state.pdf_bytes = None
902
- st.session_state.pdf_pages_text = {}
903
- st.session_state.zoom_level = 2.0
904
- st.rerun()
905
-
906
- # ===== ์•„์ง ๋ฌธ์„œ๊ฐ€ ์ฒ˜๋ฆฌ๋˜์ง€ ์•Š์€ ๊ฒฝ์šฐ
907
- if not st.session_state.processed:
908
- st.markdown("""
909
- <div class="usage-guide">
910
- <h2 style="text-align: center; color: #2D3748; margin-bottom: 1.5rem;">๐Ÿ“– ์‚ฌ์šฉ ๋ฐฉ๋ฒ•</h2>
911
- <div class="guide-step">
912
- <div class="step-number">1</div>
913
- <div>์˜ค๋ฅธ์ชฝ์— PDF ๋ฌธ์„œ๋ฅผ ์—…๋กœ๋“œํ•˜์„ธ์š”.</div>
914
- </div>
915
- <div class="guide-step">
916
- <div class="step-number">2</div>
917
- <div>๋ฌธ์„œ ์ฒ˜๋ฆฌ๊ฐ€ ์™„๋ฃŒ๋  ๋•Œ๊นŒ์ง€ 30์ดˆ ์ •๋„ ๊ธฐ๋‹ค๋ฆฝ๋‹ˆ๋‹ค.</div>
918
- </div>
919
- <div class="guide-step">
920
- <div class="step-number">3</div>
921
- <div>์™ผ์ชฝ์—์„œ PDF๋ฅผ ํ™•์ธํ•˜๊ณ , ์˜ค๋ฅธ์ชฝ ์ฑ„ํŒ…์ฐฝ์—์„œ ์งˆ๋ฌธํ•˜์„ธ์š”.</div>
922
- </div>
923
- <div class="guide-step">
924
- <div class="step-number">4</div>
925
- <div>AI๊ฐ€ ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰์œผ๋กœ 3๊ฐœ ๊ฒฐ๊ณผ๋ฅผ ์ฐพ๊ณ , Grok์ด ์ตœ์ข… 1๊ฐœ๋งŒ ์„ ํƒํ•ด ํ•˜์ด๋ผ์ดํŠธํ•ฉ๋‹ˆ๋‹ค.</div>
926
- </div>
927
- </div>
928
- """, unsafe_allow_html=True)
929
-
930
- # ๋ฌธ์„œ๊ฐ€ ์ฒ˜๋ฆฌ๋œ ๊ฒฝ์šฐ: ๋ถ„์„ ํ™”๋ฉด
931
- else:
932
- # 2๋‹จ ๋ ˆ์ด์•„์›ƒ
933
- col1, col2 = st.columns([1, 1])
934
-
935
- with col1:
936
- # ํ—ค๋”์™€ ์คŒ ์ปจํŠธ๋กค์„ ๊ฐ€๋กœ๋กœ ๋‚˜๋ž€ํžˆ
937
- header_cols = st.columns([7, 1, 1.5, 1])
938
- with header_cols[0]:
939
- st.markdown("### ๐Ÿ“„ ๋ฌธ์„œ ๋ทฐ์–ด")
940
- with header_cols[1]:
941
- if st.button("โž–", key="zoom_out", help="์ถ•์†Œ", use_container_width=True):
942
- if st.session_state.zoom_level > 0.5:
943
- st.session_state.zoom_level -= 0.25
944
- st.rerun()
945
- with header_cols[2]:
946
- st.markdown(f"<div style='text-align: center; padding-top: 0.5rem; font-weight: bold;'>{int(st.session_state.zoom_level * 50)}%</div>", unsafe_allow_html=True)
947
- with header_cols[3]:
948
- if st.button("โž•", key="zoom_in", help="ํ™•๋Œ€", use_container_width=True):
949
- if st.session_state.zoom_level < 4.0:
950
- st.session_state.zoom_level += 0.25
951
- st.rerun()
952
-
953
- if st.session_state.pdf_bytes:
954
- pdf_html = render_pdf_with_highlights(
955
- st.session_state.pdf_bytes,
956
- st.session_state.current_highlights,
957
- st.session_state.zoom_level
958
- )
959
- st.markdown(pdf_html, unsafe_allow_html=True)
960
-
961
- with col2:
962
- st.markdown("### ๐Ÿ’ฌ AI ์ฑ—๋ด‡")
963
-
964
- # ์ฑ„ํŒ… ํžˆ์Šคํ† ๋ฆฌ๋ฅผ ๋‹ด์„ ์ปจํ…Œ์ด๋„ˆ
965
- chat_container = st.container(height=650)
966
-
967
- with chat_container:
968
- for msg in st.session_state.chat_history:
969
- with st.chat_message(msg["role"]):
970
- st.markdown(msg["content"])
971
-
972
- if msg["role"] == "assistant" and "sources" in msg:
973
- with st.expander("๐Ÿ“š ์ฐธ์กฐ ๋ฌธ์„œ"):
974
- for i, (doc, meta) in enumerate(zip(
975
- msg["sources"]["docs"],
976
- msg["sources"]["metas"]
977
- ), 1):
978
- # ํ…์ŠคํŠธ๋ฅผ 150์ž๋กœ ์ œํ•œํ•˜๊ณ  ๊ฐ„๊ฒฐํ•˜๊ฒŒ ํ‘œ์‹œ
979
- clean_text = doc[:150] + ('...' if len(doc) > 150 else '')
980
-
981
- st.markdown(f"""
982
- <div class="source-box">
983
- <div class="source-title">
984
- <span class="page-indicator">ํŽ˜์ด์ง€ {meta['page']}</span>
985
- </div>
986
- <div style="font-size: 0.9rem; color: #475569; margin-top: 0.3rem;">
987
- {clean_text}
988
- </div>
989
- </div>
990
- """, unsafe_allow_html=True)
991
-
992
- # Grok ๊ฒ€์ฆ ๊ฒฐ๊ณผ ํ‘œ์‹œ (์ตœ์ข… 1๊ฐœ)
993
- if "grok_verified" in msg["sources"]:
994
- with st.expander("๐Ÿ” Grok AI ์ตœ์ข… ์„ ํƒ"):
995
- grok_data = msg["sources"]["grok_verified"]
996
- if isinstance(grok_data, dict) and "selected_text" in grok_data:
997
- selected_text = grok_data.get('selected_text', '์„ ํƒ๋œ ์ •๋ณด ์—†์Œ')
998
- # ํ…์ŠคํŠธ๋ฅผ 150์ž๋กœ ์ œํ•œ
999
- display_text = selected_text[:150] + ('...' if len(selected_text) > 150 else '')
1000
-
1001
- st.markdown(f"""
1002
- <div class="highlight-indicator">
1003
- <strong>โœ… ํŽ˜์ด์ง€ {grok_data.get('page', '?')}</strong><br>
1004
- <div style="margin-top: 0.5rem;">{display_text}</div>
1005
- </div>
1006
- """, unsafe_allow_html=True)
1007
-
1008
- # ์ฑ„ํŒ… ์ž…๋ ฅ - ์ปจํ…Œ์ด๋„ˆ ๋ฐ”๋กœ ์•„๋ž˜์— ๋ฐฐ์น˜
1009
- prompt = st.chat_input("๐Ÿ’ฌ ์งˆ๋ฌธ์„ ์ž…๋ ฅํ•˜์„ธ์š”...", key="chat_input")
1010
-
1011
- # 1๋‹จ๊ณ„: ์งˆ๋ฌธ์„ ๋ฐ›์œผ๋ฉด ์ฆ‰์‹œ ํžˆ์Šคํ† ๋ฆฌ์— ์ถ”๊ฐ€ํ•˜๊ณ  rerun (์งˆ๋ฌธ์ด ์ฑ„ํŒ… ๋ฐ•์Šค ์•ˆ์— ๋‚˜ํƒ€๋‚จ)
1012
- if prompt:
1013
- st.session_state.chat_history.append({"role": "user", "content": prompt})
1014
- st.session_state.processing_query = prompt
1015
- st.rerun()
1016
-
1017
- # 2๋‹จ๊ณ„: processing_query๊ฐ€ ์žˆ์œผ๋ฉด AI ๋‹ต๋ณ€ ์ƒ์„ฑ
1018
- if st.session_state.processing_query:
1019
- query = st.session_state.processing_query
1020
- st.session_state.processing_query = None # ํ”Œ๋ž˜๊ทธ ๋ฆฌ์…‹
1021
-
1022
- with st.spinner("๐Ÿ” ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰ ์ค‘..."):
1023
- try:
1024
- # 1. ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ๊ฒ€์ƒ‰ (๋ฒกํ„ฐ + ํ‚ค์›Œ๋“œ) - ์ƒ์œ„ 3๊ฐœ
1025
- search_results = hybrid_search(
1026
- query,
1027
- st.session_state.vector_db,
1028
- st.session_state.embedder,
1029
- top_k=3
1030
- )
1031
-
1032
- # 2. Grok API๋กœ ๊ฒ€์ฆ ๋ฐ ์ถ”์ถœ
1033
- with st.spinner("๐Ÿค– Grok AI ๊ฒ€์ฆ ์ค‘..."):
1034
- grok_result = grok_verify_and_extract(
1035
- query,
1036
- search_results,
1037
- GROK_API_KEY
1038
- )
1039
-
1040
- # 3. ๋‹ต๋ณ€ ์ƒ์„ฑ
1041
- answer = generate_answer(
1042
- query,
1043
- search_results,
1044
- GROK_API_KEY
1045
- )
1046
-
1047
- # 4. Grok ์ถ”์ถœ ๊ฒฐ๊ณผ๋ฅผ ํ•˜์ด๋ผ์ดํŠธ๋กœ ๋ณ€ํ™˜
1048
- highlights = extract_highlights_from_grok(grok_result)
1049
- st.session_state.current_highlights = highlights
1050
-
1051
- # 5. ์ฑ„ํŒ… ํžˆ์Šคํ† ๋ฆฌ์— ๋‹ต๋ณ€ ์ €์žฅ
1052
- chat_data = {
1053
- "role": "assistant",
1054
- "content": answer,
1055
- "sources": {
1056
- "docs": search_results['documents'][0],
1057
- "metas": search_results['metadatas'][0],
1058
- "scores": search_results.get('scores', []),
1059
- "keywords": search_results.get('keywords', []),
1060
- "grok_verified": grok_result
1061
- }
1062
- }
1063
- st.session_state.chat_history.append(chat_data)
1064
- st.rerun()
1065
-
1066
- except Exception as e:
1067
- error_msg = f"โŒ ์˜ค๋ฅ˜: {str(e)}"
1068
- st.session_state.chat_history.append({
1069
- "role": "assistant",
1070
- "content": error_msg
1071
- })
1072
- st.rerun()
1073
-
1074
-
1075
- if __name__ == "__main__":  # Entry-point guard: true only when this file is executed as a script, not when imported.
1076
- main()  # Invoke main() to start the program.