# app.py
"""PROBIN - Intelligent Document Analysis System"""
import streamlit as st
import os
import sys
import uuid
from pathlib import Path
from streamlit_pdf_viewer import pdf_viewer
# 프로젝트 루트 경로 추가
sys.path.insert(0, str(Path(__file__).parent))
from core.pdf_loader import load_pdf
from core.chunker import chunk_text
from core.embedder import embed_chunks
from core.vectordb import VectorDB
from core.retriever import Retriever
from core.generator import Generator
from ui.styles import get_custom_css
from ui.components import render_sources_with_relevance
from utils.pdf_utils import get_text_coordinates
from config.settings import (
CHUNK_SIZE, CHUNK_OVERLAP, TOP_K,
APP_NAME, APP_SUBTITLE, APP_ICON, SHOW_STATS, PDF_HEIGHT
)
# 1. Page configuration: browser-tab title and icon, wide layout, sidebar collapsed.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="collapsed",
    page_icon=APP_ICON,
    page_title=f"{APP_NAME} - {APP_SUBTITLE}",
)
# 2. Session-state initialisation.
# Every key is seeded only when it is absent, so existing values are left untouched.
if "session_id" not in st.session_state:
    st.session_state.session_id = str(uuid.uuid4())[:8]
    print(f"🆔 새 세션 ID 생성: {st.session_state.session_id}")

# (key, factory) pairs in seeding order. Factories are used instead of plain values so
# that Generator() is only constructed when the key is missing and the list-valued keys
# each receive a new list object.
_SESSION_DEFAULTS = (
    ("vectordb", lambda: None),
    ("retriever", lambda: None),
    ("generator", Generator),
    ("pdf_processed", lambda: False),
    ("messages", list),
    ("current_page", lambda: 1),
    ("pdf_path", lambda: None),
    ("pdf_bytes", lambda: None),
    ("annotations", list),
    ("zoom_level", lambda: 500),
)
for _key, _make_default in _SESSION_DEFAULTS:
    if _key not in st.session_state:
        st.session_state[_key] = _make_default()
# 3. Apply the custom CSS returned by get_custom_css(), rendered with HTML allowed.
_custom_css = get_custom_css()
st.markdown(_custom_css, unsafe_allow_html=True)
# --------------------------------------------------------------------------
# 함수 정의
# --------------------------------------------------------------------------
def render_welcome_screen():
    """Render the welcome banner; does nothing once a PDF has been processed."""
    if st.session_state.pdf_processed:
        return

    # Built from explicit newlines so the emitted markdown text is exactly
    # "<icon> <name>" followed by the tagline, each on its own line.
    banner = f"\n{APP_ICON} {APP_NAME}\nExperience Intelligent Document Analysis with AI\n"
    st.markdown(banner, unsafe_allow_html=True)
def move_to_page(page_num, text_content):
    """Switch the viewer to ``page_num``, highlight ``text_content`` there, and rerun.

    Args:
        page_num: page to store in ``st.session_state.current_page``.
        text_content: text passed to ``get_text_coordinates`` to compute the highlights.

    The highlights are recomputed only when a PDF path is stored in the session;
    otherwise the existing annotations are left as they are.

    NOTE(review): Streamlit documents ``st.rerun()`` as a no-op inside widget
    callbacks — confirm how ``render_sources_with_relevance`` invokes this function.
    """
    state = st.session_state
    state.current_page = page_num

    pdf_path = state.pdf_path
    if pdf_path:
        state.annotations = get_text_coordinates(str(pdf_path), page_num, text_content)

    # Rerun right away so the page change is reflected immediately.
    st.rerun()
def reset_app():
    """Reset the app: drop the vector collection, wipe session state, then rerun.

    Steps:
        1. Delete the current vector-DB collection, if any (failures are only printed).
        2. Generate a new 8-character session id.
        3. Remove every session-state key, then re-seed ``session_id``,
           ``pdf_processed``, ``pdf_path`` and ``pdf_bytes``.
        4. Emit the success/info messages and request a rerun.
    """
    state = st.session_state
    print("\n🔄 앱 전체 초기화 시작...")

    # 1. Best-effort removal of the current collection.
    if state.vectordb is not None:
        try:
            print(f" 🗑️ 현재 컬렉션 삭제 (세션: {state.session_id})")
            state.vectordb.delete_collection()
            print(" ✅ 컬렉션 삭제 완료")
        except Exception as e:
            print(f" ⚠️ 컬렉션 삭제 오류: {e}")

    # 2. Create the replacement id while the old one is still readable for the log line.
    previous_id = state.session_id
    fresh_id = str(uuid.uuid4())[:8]
    print(f" 🆔 세션 ID 변경: {previous_id} → {fresh_id}")

    # 3. Wipe the session. The keys are snapshotted first because entries are
    #    deleted while looping.
    for stale_key in list(state.keys()):
        del state[stale_key]

    # Re-seed the minimal state for the new session.
    state.session_id = fresh_id
    state.pdf_processed = False
    state.pdf_path = None
    state.pdf_bytes = None
    print(" ✅ 세션 초기화 완료")
    print(f"🎉 초기화 완료! 새 세션: {fresh_id}\n")

    st.success("✅ 초기화 완료!")
    st.info("💡 **새 PDF를 업로드할 준비가 되었습니다!**")
    st.rerun()
def _store_upload(uploaded_file, session_id):
    """Write the uploaded PDF under ./data/uploads and return ``(path, raw_bytes)``.

    The stored file name is ``<session_id>_<basename>``:
    - only the final component of the client-supplied name is kept, so the name
      cannot address a location outside the upload directory;
    - the session prefix prevents two sessions that upload the same file name
      from overwriting each other's document on disk.
    """
    save_dir = Path("./data/uploads")
    save_dir.mkdir(parents=True, exist_ok=True)

    # Normalise backslashes first so Windows-style paths are reduced as well.
    base_name = os.path.basename(uploaded_file.name.replace("\\", "/")) or "upload.pdf"
    pdf_path = save_dir / f"{session_id}_{base_name}"

    pdf_bytes = uploaded_file.getvalue()
    pdf_path.write_bytes(pdf_bytes)
    return pdf_path, pdf_bytes


def process_pdf(uploaded_file):
    """Run the PDF pipeline: save → load → chunk → embed → index → build retriever.

    On success the session state is updated (stored path and bytes, page and chunk
    counts, vector DB, retriever, ``pdf_processed=True``, cleared chat history and
    annotations, current page 1) and a rerun is requested.

    Any exception raised by the pipeline is reported with ``st.error`` and its
    traceback is printed; nothing is re-raised.

    Args:
        uploaded_file: object returned by ``st.file_uploader``; only ``name`` and
            ``getvalue()`` are used here.
    """
    try:
        # Save the file and remember both its location and its raw bytes.
        pdf_path, pdf_bytes = _store_upload(uploaded_file, st.session_state.session_id)
        st.session_state.pdf_path = pdf_path
        st.session_state.pdf_bytes = pdf_bytes

        with st.spinner("🔄 문서를 분석하고 있습니다..."):
            # 1. Load the PDF.
            print(f"\n📄 PDF 로드 중: {uploaded_file.name}")
            pdf_data = load_pdf(str(pdf_path))
            st.session_state.total_pages = pdf_data["total_pages"]
            print(f" ✅ 총 {pdf_data['total_pages']} 페이지")

            # 2. Chunk the page texts.
            print("\n✂️ 청킹 중...")
            chunks = chunk_text(pdf_data["pages"], CHUNK_SIZE, CHUNK_OVERLAP)
            st.session_state.total_chunks = len(chunks)
            print(f" ✅ 총 {len(chunks)}개 청크 생성")

            # 3. Embed the chunks.
            print("\n🔢 임베딩 생성 중...")
            embedded_chunks = embed_chunks(chunks)
            print(" ✅ 임베딩 완료")

            # 4. (Re)create the vector DB for this session.
            print(f"\n💾 VectorDB 초기화 (세션: {st.session_state.session_id})...")
            if st.session_state.vectordb is not None:
                print(" 🗑️ 기존 컬렉션 삭제")
                try:
                    # Best effort: a failed delete must not block the new upload.
                    st.session_state.vectordb.delete_collection()
                except Exception as e:
                    print(f" ⚠️ 삭제 오류: {e}")
            print(" 🆕 새 VectorDB 생성")
            st.session_state.vectordb = VectorDB(
                session_id=st.session_state.session_id
            )
            initial_count = st.session_state.vectordb.count()
            print(f" 📊 초기 상태: {initial_count}개 청크")

            # 5. Store the embedded chunks.
            print(f"\n 💾 청크 저장: {len(embedded_chunks)}개")
            st.session_state.vectordb.add_chunks(embedded_chunks)

            # 6. Build the retriever on top of the populated DB.
            st.session_state.retriever = Retriever(st.session_state.vectordb)

            # 7. Mark the document as processed.
            st.session_state.pdf_processed = True

            # 8. Log the final chunk count.
            final_count = st.session_state.vectordb.count()
            print(f"\n✅ 최종: {final_count}개 청크 저장 완료")

            # 9. Start the new document with an empty chat and no highlights.
            st.session_state.messages = []
            st.session_state.annotations = []
            st.session_state.current_page = 1
            print(f"\n🎉 PDF 처리 완료! (세션: {st.session_state.session_id})\n")
    except Exception as e:
        st.error(f"❌ 오류 발생: {str(e)}")
        print("\n❌ 오류:")
        import traceback
        print(traceback.format_exc())
    else:
        # Success path lives outside the try body so the rerun request can never
        # be intercepted by the broad handler above.
        st.success("✅ 문서 분석 완료!")
        st.rerun()
# --------------------------------------------------------------------------
# 메인 UI
# --------------------------------------------------------------------------
# Show the welcome banner (render_welcome_screen skips itself once a PDF is processed).
render_welcome_screen()

# --------------------------------------------------------------------------
# Sidebar
# --------------------------------------------------------------------------
with st.sidebar:
    st.title(f"{APP_ICON} {APP_NAME}")

    # The widget key embeds the session id, so a new id produces a different key.
    uploader_key = f"pdf_uploader_{st.session_state.session_id}"
    uploaded_file = st.file_uploader("PDF 파일 업로드", type=["pdf"], key=uploader_key)

    # Run the pipeline only while no document is marked as processed.
    if not st.session_state.pdf_processed and uploaded_file:
        process_pdf(uploaded_file)

    st.divider()

    reset_clicked = st.button("🔄 초기화", use_container_width=True)
    if reset_clicked:
        reset_app()
# --------------------------------------------------------------------------
# PDF + Chat UI
# --------------------------------------------------------------------------
# Two-column workspace, rendered only after a PDF has been processed:
# left = single-page PDF viewer with toolbar, right = chat with source links.
if st.session_state.pdf_processed:
    col1, col2 = st.columns([5, 5], gap="medium")

    # Left: PDF viewer
    with col1:
        # Toolbar: previous / next / page indicator / zoom
        toolbar1, toolbar2, toolbar3, toolbar4 = st.columns([1, 1, 2, 2])
        with toolbar1:
            if st.button("◀", help="이전 페이지"):
                # Never go below the first page.
                if st.session_state.current_page > 1:
                    st.session_state.current_page -= 1
                    st.rerun()
        with toolbar2:
            if st.button("▶", help="다음 페이지"):
                # Never go past the last page.
                if st.session_state.current_page < st.session_state.total_pages:
                    st.session_state.current_page += 1
                    st.rerun()
        with toolbar3:
            st.write(f"Page {st.session_state.current_page} / {st.session_state.total_pages}")
        with toolbar4:
            # The slider value is passed to the viewer as its width.
            new_zoom = st.slider(
                "Zoom", 500, 1200, st.session_state.zoom_level,
                label_visibility="collapsed",
            )
            if new_zoom != st.session_state.zoom_level:
                st.session_state.zoom_level = new_zoom
                st.rerun()

        # Render only the current page, with the stored highlight annotations.
        pdf_viewer(
            input=st.session_state.pdf_bytes,
            width=st.session_state.zoom_level,
            annotations=st.session_state.annotations,
            pages_to_render=[st.session_state.current_page],
            render_text=True
        )

    # Right: chat
    # NOTE(review): the chat input below is assumed to belong to this column —
    # confirm against the original layout.
    with col2:
        st.markdown("### 💬 PROBIN CHAT")

        # Fixed-height container for the history.
        chat_container = st.container(height=500)
        with chat_container:
            if not st.session_state.messages:
                # No history yet: show a short usage guide.
                st.markdown(
                    "\n"
                    "👋 반가워요! 이렇게 활용해보세요\n"
                    "- AI가 문서 내용을 분석하여 답변과 근거를 찾아줍니다.\n"
                    "- 답변의 노란색 하이라이트를 확인하세요.\n",
                    unsafe_allow_html=True,
                )
            else:
                # Replay the stored conversation, attaching sources where present.
                for idx, msg in enumerate(st.session_state.messages):
                    with st.chat_message(msg["role"]):
                        st.markdown(msg["content"])
                        if msg.get("sources"):
                            render_sources_with_relevance(
                                sources=msg["sources"],
                                message_idx=idx,
                                move_to_page_callback=move_to_page
                            )

        # Chat input
        if query := st.chat_input("질문을 입력하세요..."):
            try:
                # Do all fallible work before touching the history, so a failure
                # cannot leave an unanswered question behind.
                with st.spinner("🔍 PROBIN이 검색중입니다..."):
                    print(f"\n🔍 질문: {query}")
                    retrieved_chunks = st.session_state.retriever.retrieve(query, TOP_K)
                    result = st.session_state.generator.generate_answer(query, retrieved_chunks)
                    answer = result["answer"]
                    sources = result["sources"]

                    # Pre-compute the highlight for the first source, if there is one.
                    target_page = None
                    highlights = None
                    if sources:
                        top_source = sources[0]
                        target_page = top_source["page_num"]
                        highlights = get_text_coordinates(
                            str(st.session_state.pdf_path),
                            target_page,
                            top_source["text"]
                        )
            except Exception as e:
                # Same reporting style as process_pdf: UI error plus printed traceback.
                # No rerun here, so the error message stays on screen.
                st.error(f"❌ 오류 발생: {str(e)}")
                print("\n❌ 오류:")
                import traceback
                print(traceback.format_exc())
            else:
                # Record the question and the answer together.
                st.session_state.messages.append({
                    "role": "user",
                    "content": query
                })
                st.session_state.messages.append({
                    "role": "assistant",
                    "content": answer,
                    "sources": sources
                })

                # Jump to the first source and highlight it.
                if sources:
                    st.session_state.annotations = highlights
                    st.session_state.current_page = target_page

                print("✅ 답변 완료\n")
                st.rerun()