# app.py
"""PROBIN - Intelligent Document Analysis System"""

import os
import sys
import traceback
import uuid
from pathlib import Path

import streamlit as st
from streamlit_pdf_viewer import pdf_viewer

# Add the project root to the import path so the local packages below resolve.
sys.path.insert(0, str(Path(__file__).parent))

from core.pdf_loader import load_pdf
from core.chunker import chunk_text
from core.embedder import embed_chunks
from core.vectordb import VectorDB
from core.retriever import Retriever
from core.generator import Generator
from ui.styles import get_custom_css
from ui.components import render_sources_with_relevance
from utils.pdf_utils import get_text_coordinates
from config.settings import (
    CHUNK_SIZE,
    CHUNK_OVERLAP,
    TOP_K,
    APP_NAME,
    APP_SUBTITLE,
    APP_ICON,
    SHOW_STATS,
    PDF_HEIGHT,
)

# Bounds of the viewer width slider; the lower bound is also the initial width.
ZOOM_MIN = 500
ZOOM_MAX = 1200

# Directory where uploaded PDFs are written before being parsed.
UPLOAD_DIR = Path("./data/uploads")

# 1. Page configuration (must run before any other Streamlit command).
st.set_page_config(
    page_title=f"{APP_NAME} - {APP_SUBTITLE}",
    page_icon=APP_ICON,
    layout="wide",
    initial_sidebar_state="collapsed",
)


# 2. Session state initialization
def _init_session_state():
    """Fill in every session-state key this script reads, if it is missing.

    Existing values are never overwritten, so this is safe to call on every
    script run. The defaults dict is rebuilt per call so the list defaults
    are fresh objects for each session.
    """
    state = st.session_state

    if "session_id" not in state:
        state.session_id = str(uuid.uuid4())[:8]
        print(f"🆔 새 세션 ID 생성: {state.session_id}")

    # Built only when absent so an existing generator is reused across reruns.
    if "generator" not in state:
        state.generator = Generator()

    defaults = {
        "vectordb": None,
        "retriever": None,
        "pdf_processed": False,
        "messages": [],
        "current_page": 1,
        "pdf_path": None,
        "pdf_bytes": None,
        "annotations": [],
        "zoom_level": ZOOM_MIN,
    }
    for key, value in defaults.items():
        if key not in state:
            state[key] = value


_init_session_state()

# 3. Apply the custom CSS
st.markdown(get_custom_css(), unsafe_allow_html=True)


# --------------------------------------------------------------------------
# Function definitions
# --------------------------------------------------------------------------
def render_welcome_screen():
    """Show the welcome banner; rendered only before a PDF is processed."""
    if st.session_state.pdf_processed:
        return

    # NOTE(review): the text below is rendered with unsafe_allow_html=True but
    # contains no tags here — the surrounding HTML markup may have been lost;
    # confirm against the styled version.
    st.markdown(
        f"""

{APP_ICON} {APP_NAME}

Experience Intelligent Document Analysis with AI

""",
        unsafe_allow_html=True,
    )


def move_to_page(page_num, text_content):
    """Jump the viewer to ``page_num`` and highlight ``text_content`` there.

    Args:
        page_num: Page to display; stored as ``current_page``.
        text_content: Text passed to ``get_text_coordinates`` to obtain the
            annotations for that page.

    The highlight lookup is skipped when no PDF path is stored. The script is
    always rerun afterwards so the change is reflected immediately.
    """
    st.session_state.current_page = page_num

    if st.session_state.pdf_path:
        st.session_state.annotations = get_text_coordinates(
            str(st.session_state.pdf_path),
            page_num,
            text_content,
        )

    # Reflect the page change immediately.
    st.rerun()


def reset_app():
    """Reset the app completely: drop the collection and all session state."""
    print("\n🔄 앱 전체 초기화 시작...")

    # 1. Delete the current collection (best effort; failures are only logged).
    if st.session_state.vectordb is not None:
        try:
            print(f" 🗑️ 현재 컬렉션 삭제 (세션: {st.session_state.session_id})")
            st.session_state.vectordb.delete_collection()
            print(" ✅ 컬렉션 삭제 완료")
        except Exception as e:
            print(f" ⚠️ 컬렉션 삭제 오류: {e}")

    # 2. Generate a new session ID. The uploader widget key embeds the session
    #    ID, so a new ID also yields a fresh, empty uploader.
    old_session_id = st.session_state.session_id
    new_session_id = str(uuid.uuid4())[:8]
    print(f" 🆔 세션 ID 변경: {old_session_id} → {new_session_id}")

    # 3. Clear the session. Snapshot the keys first: deleting while iterating
    #    the live mapping is not safe.
    for key in list(st.session_state.keys()):
        del st.session_state[key]

    # Seed the new session; the remaining keys are recreated by the
    # initialization step on the next script run.
    st.session_state.session_id = new_session_id
    st.session_state.pdf_processed = False
    st.session_state.pdf_path = None
    st.session_state.pdf_bytes = None

    print(" ✅ 세션 초기화 완료")
    print(f"🎉 초기화 완료! 새 세션: {new_session_id}\n")

    # NOTE(review): these messages are emitted right before st.rerun(), so
    # they are probably not visible to the user for long — confirm intent.
    st.success("✅ 초기화 완료!")
    st.info("💡 **새 PDF를 업로드할 준비가 되었습니다!**")
    st.rerun()


def process_pdf(uploaded_file):
    """Run the PDF pipeline: save, load, chunk, embed, index.

    Args:
        uploaded_file: The object returned by ``st.file_uploader``; its
            ``name`` and ``getvalue()`` are used.

    On success the session state holds the PDF bytes/path, a populated
    ``VectorDB`` and a ``Retriever``, the chat state is cleared, and the script
    is rerun. Any ``Exception`` is shown with ``st.error`` and its traceback is
    printed; it is not re-raised.
    """
    try:
        # Save the file. Only the basename of the client-supplied name is
        # used, so it cannot point outside the upload directory, and the
        # session ID prefix keeps sessions uploading the same filename from
        # overwriting each other's file.
        UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
        safe_name = Path(uploaded_file.name).name
        pdf_path = UPLOAD_DIR / f"{st.session_state.session_id}_{safe_name}"

        # Read the upload once and reuse the bytes for disk and the viewer.
        pdf_bytes = uploaded_file.getvalue()
        pdf_path.write_bytes(pdf_bytes)

        st.session_state.pdf_path = pdf_path
        st.session_state.pdf_bytes = pdf_bytes

        with st.spinner("🔄 문서를 분석하고 있습니다..."):
            # 1. Load the PDF
            print(f"\n📄 PDF 로드 중: {uploaded_file.name}")
            pdf_data = load_pdf(str(pdf_path))
            st.session_state.total_pages = pdf_data["total_pages"]
            print(f" ✅ 총 {pdf_data['total_pages']} 페이지")

            # 2. Chunking
            print("\n✂️ 청킹 중...")
            chunks = chunk_text(pdf_data["pages"], CHUNK_SIZE, CHUNK_OVERLAP)
            st.session_state.total_chunks = len(chunks)
            print(f" ✅ 총 {len(chunks)}개 청크 생성")

            # 3. Embedding
            print("\n🔢 임베딩 생성 중...")
            embedded_chunks = embed_chunks(chunks)
            print(" ✅ 임베딩 완료")

            # 4. Create the VectorDB, dropping any previous collection first
            #    (best effort; a failed delete is only logged).
            print(f"\n💾 VectorDB 초기화 (세션: {st.session_state.session_id})...")
            if st.session_state.vectordb is not None:
                print(" 🗑️ 기존 컬렉션 삭제")
                try:
                    st.session_state.vectordb.delete_collection()
                except Exception as e:
                    print(f" ⚠️ 삭제 오류: {e}")

            print(" 🆕 새 VectorDB 생성")
            st.session_state.vectordb = VectorDB(
                session_id=st.session_state.session_id
            )
            initial_count = st.session_state.vectordb.count()
            print(f" 📊 초기 상태: {initial_count}개 청크")

            # 5. Store the chunks
            print(f"\n 💾 청크 저장: {len(embedded_chunks)}개")
            st.session_state.vectordb.add_chunks(embedded_chunks)

            # 6. Create the retriever
            st.session_state.retriever = Retriever(st.session_state.vectordb)

            # 7. Update the processing flag
            st.session_state.pdf_processed = True

            # 8. Final check
            final_count = st.session_state.vectordb.count()
            print(f"\n✅ 최종: {final_count}개 청크 저장 완료")

            # 9. Reset the chat/viewer state for the new document
            st.session_state.messages = []
            st.session_state.annotations = []
            st.session_state.current_page = 1

            print(f"\n🎉 PDF 처리 완료! (세션: {st.session_state.session_id})\n")

        st.success("✅ 문서 분석 완료!")
        st.rerun()

    except Exception as e:
        # UI boundary: report the failure instead of crashing the page.
        st.error(f"❌ 오류 발생: {str(e)}")
        print("\n❌ 오류:")
        print(traceback.format_exc())


def _handle_query(query):
    """Answer ``query`` from the indexed document and record the exchange.

    Appends the user message, runs retrieval and generation, appends the
    assistant message with its sources, and moves the viewer to the first
    source when there is one.

    Returns:
        True when an answer was stored (the caller should rerun to render it);
        False when retrieval or generation raised and an error was shown.
    """
    # Record the user message.
    st.session_state.messages.append({"role": "user", "content": query})

    # Retrieve and generate the answer.
    with st.spinner("🔍 PROBIN이 검색중입니다..."):
        print(f"\n🔍 질문: {query}")
        try:
            retrieved_chunks = st.session_state.retriever.retrieve(query, TOP_K)
            result = st.session_state.generator.generate_answer(
                query, retrieved_chunks
            )
        except Exception as e:
            # Same reporting style as process_pdf: show the error in the UI
            # and log the traceback instead of letting it escape to the page.
            st.error(f"❌ 오류 발생: {str(e)}")
            print("\n❌ 오류:")
            print(traceback.format_exc())
            return False

        # Record the assistant answer.
        st.session_state.messages.append(
            {
                "role": "assistant",
                "content": result["answer"],
                "sources": result["sources"],
            }
        )

        # Jump to the first source, if any.
        if result["sources"]:
            top_source = result["sources"][0]
            st.session_state.annotations = get_text_coordinates(
                str(st.session_state.pdf_path),
                top_source["page_num"],
                top_source["text"],
            )
            st.session_state.current_page = top_source["page_num"]

        print("✅ 답변 완료\n")

    return True


# --------------------------------------------------------------------------
# Main UI
# --------------------------------------------------------------------------
# Render the welcome screen.
render_welcome_screen()

# --------------------------------------------------------------------------
# Sidebar
# --------------------------------------------------------------------------
with st.sidebar:
    st.title(f"{APP_ICON} {APP_NAME}")

    # The widget key embeds the session ID, so a reset (new ID) produces a
    # fresh uploader.
    uploaded_file = st.file_uploader(
        "PDF 파일 업로드",
        type=["pdf"],
        key=f"pdf_uploader_{st.session_state.session_id}",
    )

    if uploaded_file and not st.session_state.pdf_processed:
        process_pdf(uploaded_file)

    st.divider()

    if st.button("🔄 초기화", use_container_width=True):
        reset_app()

# --------------------------------------------------------------------------
# PDF + Chat UI
# --------------------------------------------------------------------------
if st.session_state.pdf_processed:
    col1, col2 = st.columns([5, 5], gap="medium")

    # Left: PDF viewer
    with col1:
        # Toolbar
        toolbar1, toolbar2, toolbar3, toolbar4 = st.columns([1, 1, 2, 2])

        with toolbar1:
            if st.button("◀", help="이전 페이지"):
                if st.session_state.current_page > 1:
                    st.session_state.current_page -= 1
                    st.rerun()

        with toolbar2:
            if st.button("▶", help="다음 페이지"):
                if st.session_state.current_page < st.session_state.total_pages:
                    st.session_state.current_page += 1
                    st.rerun()

        with toolbar3:
            st.write(
                f"Page {st.session_state.current_page} / {st.session_state.total_pages}"
            )

        with toolbar4:
            new_zoom = st.slider(
                "Zoom",
                ZOOM_MIN,
                ZOOM_MAX,
                st.session_state.zoom_level,
                label_visibility="collapsed",
            )
            if new_zoom != st.session_state.zoom_level:
                st.session_state.zoom_level = new_zoom
                st.rerun()

        # PDF viewer: renders only the current page, with the stored highlights.
        pdf_viewer(
            input=st.session_state.pdf_bytes,
            width=st.session_state.zoom_level,
            annotations=st.session_state.annotations,
            pages_to_render=[st.session_state.current_page],
            render_text=True,
        )

    # Right: chat
    with col2:
        st.markdown("### 💬 PROBIN CHAT")

        # Chat container (fixed height, scrollable).
        chat_container = st.container(height=500)

        with chat_container:
            if not st.session_state.messages:
                # Show the usage guide while there is no chat history.
                st.markdown(
                    """
👋 반가워요! 이렇게 활용해보세요
  1. AI가 문서 내용을 분석하여 답변과 근거를 찾아줍니다.
  2. 답변의 노란색 하이라이트를 확인하세요.
""",
                    unsafe_allow_html=True,
                )
            else:
                # Render the chat history.
                for idx, msg in enumerate(st.session_state.messages):
                    with st.chat_message(msg["role"]):
                        st.markdown(msg["content"])

                        if msg.get("sources"):
                            render_sources_with_relevance(
                                sources=msg["sources"],
                                message_idx=idx,
                                move_to_page_callback=move_to_page,
                            )

        # Chat input.
        # NOTE(review): placed inside the chat column; confirm this matches the
        # intended layout.
        if query := st.chat_input("질문을 입력하세요..."):
            # Rerun only on success so a reported error stays on screen.
            if _handle_query(query):
                st.rerun()