dohyune committed (verified)
Commit e6afbcf · Parent(s): 3fbb91b

Upload app.py

Files changed (1): app.py (+857, -0)

app.py ADDED
@@ -0,0 +1,857 @@
"""
RFx document analysis AI agent (PDF text highlighting)

Answers questions about an uploaded RFx document and marks the supporting
passages directly on the PDF text.
"""
import streamlit as st
import fitz  # PyMuPDF
import chromadb
from sentence_transformers import SentenceTransformer, util
import requests
import os
import re
import shutil
from collections import Counter
import numpy as np
from typing import Any, Dict, List, Tuple
import base64

GROK_API_KEY = os.getenv("GROK_API_KEY")
GROK_API_BASE = "https://api.x.ai/v1"
CHROMA_DIR = "./chroma_db"
EMBEDDING_MODEL = 'jhgan/ko-sroberta-multitask'

st.set_page_config(
    page_title="RFx 문서 분석 AI",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="collapsed"
)

st.markdown("""
<style>
.main-title {
    font-size: 1.8rem;
    font-weight: bold;
    color: #1E3A8A;
    margin-bottom: 1rem;
    text-align: center;
}
.source-box {
    background: #F1F5F9;
    padding: 1rem;
    border-radius: 0.5rem;
    margin: 0.5rem 0;
    border-left: 3px solid #3B82F6;
}
.source-title {
    font-weight: bold;
    color: #1E40AF;
    margin-bottom: 0.5rem;
}
.keyword-badge {
    display: inline-block;
    background: #DBEAFE;
    color: #1E40AF;
    padding: 0.2rem 0.6rem;
    border-radius: 0.3rem;
    margin: 0.2rem;
    font-size: 0.85rem;
}
.pdf-container {
    border: 2px solid #E2E8F0;
    border-radius: 0.5rem;
    padding: 0.5rem;
    height: 800px;
    overflow-y: auto;
    background: white;
}
.page-indicator {
    background: #3B82F6;
    color: white;
    padding: 0.3rem 0.8rem;
    border-radius: 1rem;
    font-size: 0.85rem;
    display: inline-block;
    margin: 0.2rem;
}
.highlight-indicator {
    background: #FEF08A;
    color: #854D0E;
    padding: 0.5rem 1rem;
    border-radius: 0.5rem;
    margin: 0.5rem 0;
    font-weight: bold;
    border-left: 4px solid #EAB308;
}
</style>
""", unsafe_allow_html=True)

def init_session():
    if 'processed' not in st.session_state:
        st.session_state.processed = False
    if 'vector_db' not in st.session_state:
        st.session_state.vector_db = None
    if 'embedder' not in st.session_state:
        st.session_state.embedder = None
    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []
    if 'doc_metadata' not in st.session_state:
        st.session_state.doc_metadata = {}
    if 'pdf_bytes' not in st.session_state:
        st.session_state.pdf_bytes = None
    if 'pdf_pages_text' not in st.session_state:
        st.session_state.pdf_pages_text = {}
    if 'current_highlights' not in st.session_state:
        st.session_state.current_highlights = []

def extract_text_from_pdf(pdf_file) -> Tuple[List[str], List[Dict], bytes, Dict]:
    pdf_bytes = pdf_file.read()
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")

    chunks = []
    metadata_list = []
    pages_text = {}

    CHUNK_SIZE = 300
    OVERLAP_SIZE = 60

    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text("text")
        pages_text[page_num + 1] = text

        if not text.strip():
            continue

        lines = [line.strip() for line in text.split('\n') if line.strip()]
        cleaned_text = '\n'.join(lines)

        sentences = re.split(r'([.!?]\s+|\n{2,})', cleaned_text)
        sentences = [s for s in sentences if s.strip()]

        current_chunk = ""
        current_length = 0

        for sentence in sentences:
            sentence_length = len(sentence)

            if current_length + sentence_length > CHUNK_SIZE and current_chunk:
                chunks.append(current_chunk.strip())
                metadata_list.append({
                    "page": page_num + 1,
                    "source": pdf_file.name,
                    "chunk_type": "paragraph"
                })

                overlap_text = current_chunk[-OVERLAP_SIZE:] if len(current_chunk) > OVERLAP_SIZE else current_chunk
                current_chunk = overlap_text + sentence
                current_length = len(current_chunk)
            else:
                current_chunk += sentence
                current_length += sentence_length

        if current_chunk.strip():
            chunks.append(current_chunk.strip())
            metadata_list.append({
                "page": page_num + 1,
                "source": pdf_file.name,
                "chunk_type": "paragraph"
            })

    doc.close()
    return chunks, metadata_list, pdf_bytes, pages_text

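# Note: with CHUNK_SIZE=300 and OVERLAP_SIZE=60, a 700-character page splits
# (ignoring sentence boundaries) into chunks covering roughly characters
# 0-300, 240-540, and 480-700; the 60-character overlap keeps a sentence that
# straddles a chunk boundary retrievable from at least one chunk.
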
@st.cache_resource
def load_embedding_model():
    return SentenceTransformer(EMBEDDING_MODEL)

def create_vector_db(chunks: List[str], metadata_list: List[Dict]):
    embedder = load_embedding_model()

    if os.path.exists(CHROMA_DIR):
        try:
            shutil.rmtree(CHROMA_DIR)
        except Exception:
            pass

    client = chromadb.PersistentClient(
        path=CHROMA_DIR,
        settings=chromadb.Settings(
            anonymized_telemetry=False,
            allow_reset=True,
            is_persistent=True
        )
    )

    try:
        collection = client.get_or_create_collection(
            name="rfx_docs",
            metadata={"hnsw:space": "cosine"}
        )
    except Exception:
        try:
            client.delete_collection("rfx_docs")
        except Exception:
            pass
        collection = client.create_collection(
            name="rfx_docs",
            metadata={"hnsw:space": "cosine"}
        )

    batch_size = 32
    all_embeddings = []

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        embeddings = embedder.encode(batch, show_progress_bar=False, convert_to_numpy=True)
        all_embeddings.extend(embeddings)

    ids = [f"doc_{i}" for i in range(len(chunks))]
    collection.add(
        embeddings=[emb.tolist() for emb in all_embeddings],
        documents=chunks,
        metadatas=metadata_list,
        ids=ids
    )

    return collection, embedder

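# Note: with the collection created as {"hnsw:space": "cosine"}, Chroma reports
# cosine *distance* (1 - cosine similarity) in query results, so smaller values
# mean closer matches; the search code below relies on this when it sorts
# candidates by ascending 'distance'.
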
def extract_keywords(text: str, top_n: int = 5) -> List[str]:
    words_with_numbers = re.findall(r'[가-힣]*\d+[가-힣]*', text)
    words = re.findall(r'[가-힣]{2,}', text)

    stopwords = {
        '것', '등', '및', '그', '이', '저', '수', '때', '중', '내', '년', '월', '일',
        '경우', '대한', '통해', '위해', '관련', '있는', '하는', '되는', '이런', '저런',
        '어떤', '무슨', '어느', '누구', '언제', '어디', '무엇', '어떻게', '왜',
        '알려', '설명', '말해', '대해', '관하여', '있나요', '인가요', '무엇인가요',
        '얼마', '입니까', '합니까'
    }

    important_keywords = {
        '금액', '가격', '비용', '예산', '설계', '사업', '과업', '계약',
        '공사', '용역', '제안', '입찰', '낙찰', '견적', '단가'
    }

    filtered_words = [w for w in words if w not in stopwords and len(w) >= 2]
    word_freq = Counter(filtered_words)

    for word in word_freq:
        if word in important_keywords:
            word_freq[word] += 5

    result = []
    result.extend([w for w in words_with_numbers if w])

    for word, _ in word_freq.most_common(top_n * 2):
        if word not in result:
            result.append(word)
        if len(result) >= top_n:
            break

    return result[:top_n]

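# Note: extract_keywords() always keeps number-bearing tokens first (dates and
# amounts such as '2024년' or '3억'), then fills the remaining slots from the
# frequency count, in which procurement terms like '금액' or '입찰' receive a
# +5 boost so domain vocabulary outranks generic words of equal frequency.
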
def rewrite_query(query: str) -> Dict[str, Any]:
    original = query.strip()
    cleaned = re.sub(r'[?!,.~]', '', original)
    keywords = extract_keywords(cleaned, top_n=7)

    variations = []
    variations.append(original)

    if keywords:
        if len(keywords) >= 2:
            variations.append(' '.join(keywords[:2]))
        if len(keywords) >= 3:
            variations.append(' '.join(keywords[:3]))

        for kw in keywords[:3]:
            if kw not in variations:
                variations.append(kw)

    synonym_map = {
        '금액': ['가격', '비용', '예산'],
        '설계': ['디자인', '계획'],
        '사업': ['프로젝트', '과업'],
    }

    for keyword in keywords[:2]:
        if keyword in synonym_map:
            for syn in synonym_map[keyword]:
                combined = original.replace(keyword, syn)
                if combined not in variations:
                    variations.append(combined)
                    break

    seen = set()
    unique_variations = []
    for v in variations:
        if v not in seen and v.strip():
            seen.add(v)
            unique_variations.append(v)

    return {
        'original': original,
        'cleaned': cleaned,
        'keywords': keywords,
        'variations': unique_variations[:7]
    }

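# Note: rewrite_query() fans one question out into up to seven variants: the
# question itself, two- and three-keyword combinations, single keywords, and,
# when an extracted keyword exactly matches a synonym_map entry (e.g. '금액'),
# a copy of the question with that keyword swapped for a synonym. Running
# every variant against the index raises recall for terse RFx phrasing that
# the full question alone might miss.
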
def search_with_multiple_queries(queries: List[str], collection, embedder, top_k: int = 5) -> Dict:
    all_results = []
    seen_ids = set()

    for query in queries:
        query_embedding = embedder.encode([query], convert_to_numpy=True)[0]

        results = collection.query(
            query_embeddings=[query_embedding.tolist()],
            n_results=min(top_k * 5, 30),
            include=["documents", "metadatas", "distances"]
        )

        for i, doc_id in enumerate(results['ids'][0]):
            if doc_id not in seen_ids:
                seen_ids.add(doc_id)
                all_results.append({
                    'id': doc_id,
                    'document': results['documents'][0][i],
                    'metadata': results['metadatas'][0][i],
                    'distance': results['distances'][0][i],
                    'query': query
                })

    all_results.sort(key=lambda x: x['distance'])
    top_results = all_results[:top_k]

    return {
        'documents': [[r['document'] for r in top_results]],
        'metadatas': [[r['metadata'] for r in top_results]],
        'distances': [[r['distance'] for r in top_results]],
        'queries_used': queries,
        'total_found': len(all_results)
    }

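# Note: search_with_multiple_queries() pools the hits from every query variant,
# deduplicates them by chunk id, and keeps the global top_k by distance. The
# distances stay comparable across variants because every query is embedded
# with the same model into the same cosine space.
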
def rerank_results(query: str, search_results: Dict, embedder, keywords: List[str]) -> Dict:
    docs = search_results['documents'][0]
    metas = search_results['metadatas'][0]
    distances = search_results['distances'][0]

    if not docs:
        return {
            'documents': [[]],
            'metadatas': [[]],
            'distances': [[]],
            'scores': []
        }

    query_embedding = embedder.encode([query], convert_to_numpy=True)[0]
    doc_embeddings = embedder.encode(docs, convert_to_numpy=True)

    similarities = util.cos_sim(query_embedding, doc_embeddings)[0].cpu().numpy()

    keyword_scores = []
    for doc in docs:
        doc_lower = doc.lower()
        score = sum(1 for kw in keywords if kw.lower() in doc_lower)
        keyword_scores.append(score)

    if max(keyword_scores) > 0:
        keyword_scores = [s / max(keyword_scores) for s in keyword_scores]

    numeric_query_terms = ['금액', '예산', '가격', '비용', '단가']
    is_numeric_query = any(term in query for term in numeric_query_terms)

    if is_numeric_query:
        money_patterns = [
            r'\d{1,3}(?:,\d{3})+원',
            r'\d+만원',
            r'\d+억원',
            r'\(일금\s*[^)]+\)'
        ]
        numeric_scores = []
        for doc in docs:
            score = 0
            for pattern in money_patterns:
                if re.search(pattern, doc):
                    score = 1
                    break
            numeric_scores.append(score)
        if max(numeric_scores) > 0:
            numeric_scores = [s / max(numeric_scores) for s in numeric_scores]
        else:
            numeric_scores = [0.0 for _ in numeric_scores]

        final_scores = [
            0.6 * sim + 0.25 * kw + 0.15 * num
            for sim, kw, num in zip(similarities, keyword_scores, numeric_scores)
        ]
    else:
        final_scores = [0.7 * sim + 0.3 * kw for sim, kw in zip(similarities, keyword_scores)]

    ranked_indices = np.argsort(final_scores)[::-1]

    return {
        'documents': [[docs[i] for i in ranked_indices]],
        'metadatas': [[metas[i] for i in ranked_indices]],
        'distances': [[distances[i] for i in ranked_indices]],
        'scores': [final_scores[i] for i in ranked_indices]
    }

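# Note: a worked example of the blended score above, assuming a money-related
# question: a chunk with cosine similarity 0.8, normalized keyword score 0.5,
# and a money-pattern hit (1.0) scores 0.6*0.8 + 0.25*0.5 + 0.15*1.0 = 0.755,
# so lexical and numeric evidence can promote a chunk past one that is only
# semantically close.
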
def build_context(search_results: Dict, max_length: int = 3000) -> str:
    context_parts = []
    current_length = 0

    docs = search_results['documents'][0]
    metas = search_results['metadatas'][0]

    for i, (doc, meta) in enumerate(zip(docs, metas), 1):
        part = f"[문서 {i}] (페이지 {meta['page']})\n{doc}\n"
        part_length = len(part)

        if current_length + part_length > max_length:
            remaining = max_length - current_length
            if remaining > 200:
                part = f"[문서 {i}] (페이지 {meta['page']})\n{doc[:remaining-50]}...\n"
                context_parts.append(part)
            break

        context_parts.append(part)
        current_length += part_length

    return "\n".join(context_parts)

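# Note: build_context() stops adding chunks once the character budget is spent,
# truncating the final chunk only when at least 200 characters of budget are
# left. With the max_length=4000 that generate_answer() passes and chunks of
# roughly 300 characters plus headers, all seven reranked chunks normally fit
# intact.
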
def generate_answer(query: str, search_results: Dict, api_key: str) -> str:
    context = build_context(search_results, max_length=4000)

    system_prompt = """당신은 RFx 문서 전문 분석가입니다.

**중요 원칙:**
1. 제공된 문서를 **매우 꼼꼼히** 읽고 정확한 정보를 찾으세요
2. 숫자, 금액, 날짜 등 구체적인 정보를 우선적으로 찾으세요
3. 문서에 정보가 있는데도 "없다"고 하지 마세요
4. 답변 시 반드시 [문서 N, 페이지 X] 형태로 출처 명시
5. 애매한 표현 대신 구체적인 수치를 제공하세요

**답변 형식:**
- 핵심 답변을 먼저 명확하게 제시
- 출처 명시 (페이지 번호 포함)
- 필요시 추가 관련 정보 제공"""

    user_prompt = f"""다음 문서들을 **매우 꼼꼼히** 읽고 질문에 답변하세요.

<문서>
{context}
</문서>

<질문>
{query}
</질문>

**중요**:
- 문서를 처음부터 끝까지 주의 깊게 읽으세요
- 숫자, 금액 등 구체적인 정보를 찾으세요
- 찾은 정보는 정확히 인용하세요
- 정말로 문서에 없는 경우에만 "찾을 수 없습니다"라고 하세요"""

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": "grok-3",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        "temperature": 0.1,
        "max_tokens": 2000,
        "stream": False
    }

    try:
        response = requests.post(
            f"{GROK_API_BASE}/chat/completions",
            headers=headers,
            json=payload,
            timeout=30
        )

        if response.status_code != 200:
            error_detail = ""
            try:
                error_data = response.json()
                error_detail = error_data.get('error', {}).get('message', '')
            except Exception:
                error_detail = response.text

            return f"❌ API 오류 (코드: {response.status_code})\n\n{error_detail}"

        result = response.json()
        return result["choices"][0]["message"]["content"]

    except Exception as e:
        return f"❌ 오류: {str(e)}"

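# Note: the payload above follows the OpenAI-style chat completions schema that
# the x.ai endpoint accepts; temperature is pinned low (0.1) because the task
# is extraction from the supplied context rather than open-ended generation,
# and max_tokens=2000 bounds the answer length, not the context window.
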
def highlight_text_in_pdf(pdf_bytes: bytes, highlight_info: List[Dict]) -> bytes:
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")

    for item in highlight_info:
        page_num = item['page'] - 1
        search_text = item['text']

        if page_num >= len(doc):
            continue

        page = doc[page_num]

        text_variations = [
            search_text,
            search_text.replace(' ', ''),
            search_text.replace(',', ''),
        ]

        for text_var in text_variations:
            text_instances = page.search_for(text_var)

            for inst in text_instances:
                highlight = page.add_highlight_annot(inst)
                highlight.set_colors(stroke=[1, 1, 0])
                highlight.update()

    output_bytes = doc.tobytes()
    doc.close()

    return output_bytes

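# Note: page.search_for() returns one rectangle per visual occurrence of the
# string, and each rectangle gets its own highlight annotation; stroke=[1, 1, 0]
# is RGB yellow. Also searching the no-space and no-comma variants hedges
# against PDFs whose extracted text spaces or formats numbers differently from
# the chunk text.
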
def extract_highlight_texts(documents: List[str], keywords: List[str]) -> List[str]:
    highlight_texts = []

    for doc in documents:
        money_patterns = [
            r'\d{1,3}(?:,\d{3})+원',
            r'\d+만원',
            r'\d+억원',
            r'\(일금\s*[^)]+\)',
        ]

        for pattern in money_patterns:
            matches = re.findall(pattern, doc)
            highlight_texts.extend(matches)

        date_patterns = [
            r'\d{4}[년.]\d{1,2}[월.]\d{1,2}일?',
            r'\d{2}\.\d{2}\.\d{2}',
        ]

        for pattern in date_patterns:
            matches = re.findall(pattern, doc)
            highlight_texts.extend(matches)

        for keyword in keywords:
            if keyword in doc:
                sentences = re.split(r'[.!?]\s+', doc)
                for sent in sentences:
                    if keyword in sent and len(sent) < 100:
                        highlight_texts.append(sent.strip())

    unique_texts = list(set(highlight_texts))
    unique_texts.sort(key=len)

    return unique_texts[:10]

def render_pdf_with_highlights(pdf_bytes: bytes, highlight_info: List[Dict]):
    highlighted_pdf = highlight_text_in_pdf(pdf_bytes, highlight_info)

    doc = fitz.open(stream=highlighted_pdf, filetype="pdf")

    highlighted_pages = set(h['page'] for h in highlight_info)

    pdf_html = '<div class="pdf-container">'

    for page_num in range(len(doc)):
        page = doc[page_num]

        pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
        img_data = pix.tobytes("png")
        img_base64 = base64.b64encode(img_data).decode()

        pdf_html += '<div style="margin-bottom: 2rem; position: relative;">'
        pdf_html += f'<div style="background: #3B82F6; color: white; padding: 0.5rem; margin-bottom: 0.5rem; border-radius: 0.3rem; font-weight: bold;">📄 페이지 {page_num + 1}</div>'

        if (page_num + 1) in highlighted_pages:
            page_highlights = [h for h in highlight_info if h['page'] == page_num + 1]
            highlight_texts = ', '.join([f'"{h["text"][:30]}..."' for h in page_highlights[:3]])
            pdf_html += f'<div class="highlight-indicator">⭐ 하이라이트: {highlight_texts}</div>'

        pdf_html += f'<img src="data:image/png;base64,{img_base64}" style="width: 100%; border: 1px solid #E2E8F0; border-radius: 0.3rem; box-shadow: 0 1px 3px rgba(0,0,0,0.1);" />'
        pdf_html += '</div>'

    pdf_html += '</div>'
    doc.close()

    return pdf_html

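# Note: fitz.Matrix(2, 2) renders each page at 2x zoom (about 144 DPI), which
# keeps body text legible once the PNG is scaled to the column width; pages are
# inlined as base64 data URIs so Streamlit can display them without serving
# separate image files.
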
def main():
    init_session()

    st.markdown('<div class="main-title">📄 RFx 문서 분석 AI 에이전트</div>', unsafe_allow_html=True)

    with st.sidebar:
        st.header("⚙️ 설정")
        grok_key = st.text_input("Grok API Key", value=GROK_API_KEY or "", type="password")

        if grok_key:
            os.environ["GROK_API_KEY"] = grok_key
            st.session_state.grok_key = grok_key

        st.divider()

        if st.button("🔄 데이터베이스 초기화", help="ChromaDB 오류 발생 시 클릭"):
            if os.path.exists(CHROMA_DIR):
                try:
                    shutil.rmtree(CHROMA_DIR)
                    st.success("✅ 데이터베이스 초기화 완료!")
                    st.session_state.processed = False
                    st.session_state.vector_db = None
                    st.rerun()
                except Exception as e:
                    st.error(f"초기화 실패: {str(e)}")

        st.divider()

        st.subheader("📤 문서 업로드")
        uploaded_file = st.file_uploader("PDF 파일 선택", type=['pdf'])

        if uploaded_file:
            if st.button("📄 문서 처리", type="primary", disabled=st.session_state.get('processing', False)):
                if not grok_key:
                    st.error("⚠️ Grok API 키를 입력하세요!")
                    return

                st.session_state.processing = True

                with st.spinner("📄 문서 처리 중..."):
                    try:
                        chunks, metadata_list, pdf_bytes, pages_text = extract_text_from_pdf(uploaded_file)

                        st.info(f"📑 {len(chunks)}개 청크 추출 완료")

                        with st.expander("📝 추출된 텍스트 샘플", expanded=False):
                            if chunks:
                                st.text(f"첫 번째 청크 (총 {len(chunks[0])}자):")
                                st.code(chunks[0][:500] + "..." if len(chunks[0]) > 500 else chunks[0])

                        with st.spinner("🔧 벡터 데이터베이스 생성 중..."):
                            collection, embedder = create_vector_db(chunks, metadata_list)

                        st.session_state.vector_db = collection
                        st.session_state.embedder = embedder
                        st.session_state.pdf_bytes = pdf_bytes
                        st.session_state.pdf_pages_text = pages_text
                        st.session_state.processed = True
                        st.session_state.doc_metadata = {
                            "filename": uploaded_file.name,
                            "chunks": len(chunks),
                            "pages": len(set(m['page'] for m in metadata_list))
                        }

                        st.success("✅ 문서 처리 완료!")

                    except Exception as e:
                        st.error(f"오류: {str(e)}")
                    finally:
                        st.session_state.processing = False

        st.divider()

        if st.session_state.processed:
            st.subheader("📊 문서 정보")
            meta = st.session_state.doc_metadata
            st.write(f"**파일명:** {meta['filename']}")
            st.write(f"**페이지:** {meta['pages']}페이지")
            st.write(f"**청크:** {meta['chunks']}개")

            if st.button("🗑️ 채팅 초기화"):
                st.session_state.chat_history = []
                st.session_state.current_highlights = []
                st.rerun()

    if not st.session_state.processed:
        st.info("👈 왼쪽 사이드바에서 PDF 문서를 업로드하세요")

        col1, col2, col3 = st.columns(3)
        with col1:
            st.markdown("### 📄 PDF 뷰어\n원본 문서 확인")
        with col2:
            st.markdown("### 🎨 하이라이트\n핵심 내용 강조")
        with col3:
            st.markdown("### 💬 AI 챗봇\n정확한 답변")

    else:
        col1, col2 = st.columns([1, 1])

        with col1:
            st.markdown("### 📄 문서 뷰어")

            if st.session_state.pdf_bytes:
                pdf_html = render_pdf_with_highlights(
                    st.session_state.pdf_bytes,
                    st.session_state.current_highlights
                )
                st.markdown(pdf_html, unsafe_allow_html=True)

        with col2:
            st.markdown("### 💬 AI 챗봇")

            chat_container = st.container()
            with chat_container:
                for msg in st.session_state.chat_history:
                    with st.chat_message(msg["role"]):
                        st.markdown(msg["content"])

                        if msg["role"] == "assistant" and "sources" in msg:
                            with st.expander("📚 참조 문서"):
                                for i, (doc, meta) in enumerate(zip(
                                    msg["sources"]["docs"],
                                    msg["sources"]["metas"]
                                ), 1):
                                    # Guard against a missing or short scores list
                                    # instead of indexing into it blindly.
                                    scores = msg["sources"].get("scores") or []
                                    score = scores[i - 1] if len(scores) >= i else None
                                    score_text = f" (관련도: {score:.2%})" if score else ""

                                    st.markdown(f"""
                                    <div class="source-box">
                                        <div class="source-title">
                                            <span class="page-indicator">페이지 {meta['page']}</span>
                                            {score_text}
                                        </div>
                                        <div style="font-size: 0.9rem; color: #475569;">
                                            {doc[:300]}{'...' if len(doc) > 300 else ''}
                                        </div>
                                    </div>
                                    """, unsafe_allow_html=True)

            if prompt := st.chat_input("질문을 입력하세요...", disabled=st.session_state.get('processing', False)):

                if not st.session_state.get('grok_key'):
                    st.error("⚠️ Grok API 키를 입력해주세요!")
                    return

                with st.chat_message("user"):
                    st.markdown(prompt)
                st.session_state.chat_history.append({"role": "user", "content": prompt})

                with st.chat_message("assistant"):
                    with st.spinner("🔍 검색 및 분석 중..."):
                        try:
                            query_info = rewrite_query(prompt)

                            with st.expander("🔍 검색 디버그 정보", expanded=False):
                                st.write("**추출된 키워드:**", query_info['keywords'])
                                st.write("**검색 쿼리 변형:**", query_info['variations'])

                            search_results = search_with_multiple_queries(
                                query_info['variations'],
                                st.session_state.vector_db,
                                st.session_state.embedder,
                                top_k=7
                            )

                            with st.expander("📄 검색된 문서 내용", expanded=False):
                                st.write(f"**총 {search_results.get('total_found', 0)}개 문서 발견**")
                                for i, doc in enumerate(search_results['documents'][0][:3], 1):
                                    st.write(f"**문서 {i}:**")
                                    st.text(doc[:300] + "..." if len(doc) > 300 else doc)
                                    st.divider()

                            if 'total_found' in search_results:
                                st.success(f"✅ {search_results['total_found']}개 문서에서 상위 7개 선택")

                            reranked_results = rerank_results(
                                query_info['original'],
                                search_results,
                                st.session_state.embedder,
                                query_info['keywords']
                            )

                            answer = generate_answer(
                                query_info['original'],
                                reranked_results,
                                st.session_state.grok_key
                            )

                            st.markdown(answer)

                            highlight_texts = extract_highlight_texts(
                                reranked_results['documents'][0],
                                query_info['keywords']
                            )

                            highlights = []
                            for doc, meta in zip(reranked_results['documents'][0],
                                                 reranked_results['metadatas'][0]):
                                for text in highlight_texts:
                                    if text in doc:
                                        highlights.append({
                                            'page': meta['page'],
                                            'text': text
                                        })

                            st.session_state.current_highlights = highlights

                            st.session_state.chat_history.append({
                                "role": "assistant",
                                "content": answer,
                                "sources": {
                                    "docs": reranked_results['documents'][0],
                                    "metas": reranked_results['metadatas'][0],
                                    "scores": reranked_results.get('scores', []),
                                    "keywords": query_info['keywords']
                                }
                            })

                            with st.expander("📚 참조 문서", expanded=True):
                                for i, (doc, meta) in enumerate(zip(
                                    reranked_results['documents'][0],
                                    reranked_results['metadatas'][0]
                                ), 1):
                                    scores = reranked_results.get('scores') or []
                                    score = scores[i - 1] if len(scores) >= i else None
                                    score_text = f" (관련도: {score:.2%})" if score else ""

                                    st.markdown(f"""
                                    <div class="source-box">
                                        <div class="source-title">
                                            <span class="page-indicator">페이지 {meta['page']}</span>
                                            {score_text}
                                        </div>
                                        <div style="font-size: 0.9rem; color: #475569;">
                                            {doc[:300]}{'...' if len(doc) > 300 else ''}
                                        </div>
                                    </div>
                                    """, unsafe_allow_html=True)

                            st.rerun()

                        except Exception as e:
                            st.error(f"오류: {str(e)}")
                            import traceback
                            st.code(traceback.format_exc())


if __name__ == "__main__":
    main()