Spaces:

cksleigen
/

kwmin_probin

Runtime error

App Files Files Community

kwmin_probin / app.py

cksleigen

Upload 8 files

36696b3 verified 5 months ago

raw

history blame contribute delete

13 kB

	# app.py
	"""PROBIN - Intelligent Document Analysis System"""
	import streamlit as st
	import os
	import sys
	import uuid
	from pathlib import Path
	from streamlit_pdf_viewer import pdf_viewer

	# Add the project root (this file's directory) to the front of sys.path
	sys.path.insert(0, str(Path(__file__).parent))

	from core.pdf_loader import load_pdf
	from core.chunker import chunk_text
	from core.embedder import embed_chunks
	from core.vectordb import VectorDB
	from core.retriever import Retriever
	from core.generator import Generator
	from ui.styles import get_custom_css
	from ui.components import render_sources_with_relevance
	from utils.pdf_utils import get_text_coordinates
	from config.settings import (
	CHUNK_SIZE, CHUNK_OVERLAP, TOP_K,
	APP_NAME, APP_SUBTITLE, APP_ICON, SHOW_STATS, PDF_HEIGHT
	)

	# 1. Page configuration
	_page_settings = {
	    "page_title": f"{APP_NAME} - {APP_SUBTITLE}",
	    "page_icon": APP_ICON,
	    "layout": "wide",
	    "initial_sidebar_state": "collapsed",
	}
	st.set_page_config(**_page_settings)

	# 2. Session-state initialisation: each key is created only when it is not
	#    already present in st.session_state.
	if "session_id" not in st.session_state:
	    _fresh_session_id = str(uuid.uuid4())[:8]
	    st.session_state.session_id = _fresh_session_id
	    print(f"🆔 새 세션 ID 생성: {_fresh_session_id}")

	# (key, zero-argument factory) pairs. A factory is called only when its key is
	# missing, so Generator() is not constructed again once it is stored.
	_STATE_DEFAULT_FACTORIES = (
	    ("vectordb", lambda: None),
	    ("retriever", lambda: None),
	    ("generator", Generator),
	    ("pdf_processed", lambda: False),
	    ("messages", list),
	    ("current_page", lambda: 1),
	    ("pdf_path", lambda: None),
	    ("pdf_bytes", lambda: None),
	    ("annotations", list),
	    ("zoom_level", lambda: 500),
	)
	for _state_key, _make_default in _STATE_DEFAULT_FACTORIES:
	    if _state_key not in st.session_state:
	        st.session_state[_state_key] = _make_default()

	# 3. Apply the custom CSS
	_custom_css = get_custom_css()
	st.markdown(_custom_css, unsafe_allow_html=True)

	# --------------------------------------------------------------------------
	# Function definitions
	# --------------------------------------------------------------------------

	def render_welcome_screen():
	    """Render the welcome (hero) banner; does nothing once a PDF is processed."""
	    if st.session_state.pdf_processed:
	        return

	    hero_html = f"""
	<div id="welcome" class="hero-container">
	<h1 class="hero-title">{APP_ICON} {APP_NAME}</h1>
	<p class="hero-subtitle">Experience Intelligent Document Analysis with AI</p>
	</div>
	"""
	    st.markdown(hero_html, unsafe_allow_html=True)


	def move_to_page(page_num, text_content):
	    """Jump the viewer to ``page_num`` and highlight ``text_content`` on it.

	    Stores ``page_num`` as the current page. When a PDF path is recorded in the
	    session, the highlight coordinates returned by ``get_text_coordinates`` are
	    stored as the viewer annotations. Always ends by triggering a rerun.
	    """
	    state = st.session_state
	    state.current_page = page_num

	    source_pdf = state.pdf_path
	    if source_pdf:
	        state.annotations = get_text_coordinates(
	            str(source_pdf), page_num, text_content
	        )

	    # Rerun right away so the page change is reflected immediately.
	    st.rerun()


	def reset_app():
	    """Fully reset the app: drop the vector collection and wipe session state.

	    The current collection is deleted on a best-effort basis (errors are only
	    printed). Every session-state key is then removed, a new 8-character
	    session id is stored together with cleared PDF fields, and a rerun is
	    triggered.
	    """
	    state = st.session_state
	    print("\n🔄 앱 전체 초기화 시작...")

	    # 1. Delete the current collection, if there is one.
	    active_db = state.vectordb
	    if active_db is not None:
	        try:
	            print(f" 🗑️ 현재 컬렉션 삭제 (세션: {state.session_id})")
	            active_db.delete_collection()
	            print(" ✅ 컬렉션 삭제 완료")
	        except Exception as err:
	            print(f" ⚠️ 컬렉션 삭제 오류: {err}")

	    # 2. Generate the replacement session id (the old one is read before the wipe).
	    previous_id = state.session_id
	    replacement_id = str(uuid.uuid4())[:8]
	    print(f" 🆔 세션 ID 변경: {previous_id} → {replacement_id}")

	    # 3. Remove every key; iterate over a snapshot because we delete as we go.
	    for stale_key in tuple(state.keys()):
	        del state[stale_key]

	    # Seed the fresh session with the new id and empty PDF fields.
	    fresh_values = {
	        "session_id": replacement_id,
	        "pdf_processed": False,
	        "pdf_path": None,
	        "pdf_bytes": None,
	    }
	    for field_name, field_value in fresh_values.items():
	        state[field_name] = field_value

	    print(" ✅ 세션 초기화 완료")
	    print(f"🎉 초기화 완료! 새 세션: {replacement_id}\n")

	    st.success("✅ 초기화 완료!")
	    st.info("💡 새 PDF를 업로드할 준비가 되었습니다!")
	    st.rerun()


	def _session_upload_path(session_id, filename):
	    """Return the on-disk save path for an upload, scoped to one session.

	    Only the final component of ``filename`` is kept, so a client-supplied name
	    containing directory parts (``../x.pdf``, ``C:\\dir\\x.pdf``) cannot point
	    outside the upload directory. The file is placed in a per-session
	    subdirectory so two sessions uploading the same filename do not overwrite
	    each other's copy.
	    """
	    base_name = Path(str(filename).replace("\\", "/")).name
	    # Path("..").name is ".." and Path(".").name is "", so reject those too.
	    if base_name in ("", ".", ".."):
	        base_name = "uploaded.pdf"
	    return Path("./data/uploads") / str(session_id) / base_name


	def process_pdf(uploaded_file):
	    """Run the PDF processing pipeline for an uploaded file.

	    Saves the upload to disk, then loads, chunks and embeds it, stores the
	    chunks in a new ``VectorDB`` for the current session (deleting any previous
	    collection first), builds a ``Retriever``, and resets the chat/viewer
	    state. On success the script is rerun; on any error the message is shown
	    with ``st.error`` and the traceback is printed.

	    Args:
	        uploaded_file: The object returned by ``st.file_uploader``; its
	            ``name`` and ``getvalue()`` are used.
	    """
	    try:
	        # Save the file under ./data/uploads/<session_id>/<basename>.
	        pdf_path = _session_upload_path(
	            st.session_state.session_id, uploaded_file.name
	        )
	        pdf_path.parent.mkdir(parents=True, exist_ok=True)

	        pdf_bytes = uploaded_file.getvalue()
	        pdf_path.write_bytes(pdf_bytes)

	        st.session_state.pdf_path = pdf_path
	        st.session_state.pdf_bytes = pdf_bytes

	        with st.spinner("🔄 문서를 분석하고 있습니다..."):
	            # 1. Load the PDF
	            print(f"\n📄 PDF 로드 중: {uploaded_file.name}")
	            pdf_data = load_pdf(str(pdf_path))
	            st.session_state.total_pages = pdf_data["total_pages"]
	            print(f" ✅ 총 {pdf_data['total_pages']} 페이지")

	            # 2. Chunking
	            print("\n✂️ 청킹 중...")
	            chunks = chunk_text(pdf_data["pages"], CHUNK_SIZE, CHUNK_OVERLAP)
	            st.session_state.total_chunks = len(chunks)
	            print(f" ✅ 총 {len(chunks)}개 청크 생성")

	            # 3. Embedding
	            print("\n🔢 임베딩 생성 중...")
	            embedded_chunks = embed_chunks(chunks)
	            print(" ✅ 임베딩 완료")

	            # 4. Create the VectorDB
	            print(f"\n💾 VectorDB 초기화 (세션: {st.session_state.session_id})...")

	            if st.session_state.vectordb is not None:
	                # Best effort: a failed delete is logged and processing continues.
	                print(" 🗑️ 기존 컬렉션 삭제")
	                try:
	                    st.session_state.vectordb.delete_collection()
	                except Exception as e:
	                    print(f" ⚠️ 삭제 오류: {e}")

	            print(" 🆕 새 VectorDB 생성")
	            st.session_state.vectordb = VectorDB(
	                session_id=st.session_state.session_id
	            )

	            initial_count = st.session_state.vectordb.count()
	            print(f" 📊 초기 상태: {initial_count}개 청크")

	            # 5. Store the chunks
	            print(f"\n 💾 청크 저장: {len(embedded_chunks)}개")
	            st.session_state.vectordb.add_chunks(embedded_chunks)

	            # 6. Create the retriever
	            st.session_state.retriever = Retriever(st.session_state.vectordb)

	            # 7. Mark the document as processed
	            st.session_state.pdf_processed = True

	            # 8. Final check
	            final_count = st.session_state.vectordb.count()
	            print(f"\n✅ 최종: {final_count}개 청크 저장 완료")

	            # 9. Reset chat and viewer state for the new document
	            st.session_state.messages = []
	            st.session_state.annotations = []
	            st.session_state.current_page = 1

	    except Exception as e:
	        st.error(f"❌ 오류 발생: {str(e)}")
	        print("\n❌ 오류:")
	        import traceback
	        print(traceback.format_exc())

	    else:
	        # Success path kept outside the try body so the rerun signal never
	        # passes through the broad exception handler above.
	        print(f"\n🎉 PDF 처리 완료! (세션: {st.session_state.session_id})\n")
	        st.success("✅ 문서 분석 완료!")
	        st.rerun()


	# --------------------------------------------------------------------------
	# Main UI
	# --------------------------------------------------------------------------

	# Render the welcome screen (render_welcome_screen checks pdf_processed itself)
	render_welcome_screen()

	# --------------------------------------------------------------------------
	# Sidebar
	# --------------------------------------------------------------------------
	with st.sidebar:
	st.title(f"{APP_ICON} {APP_NAME}")

	# The widget key embeds the session id, so it changes whenever reset_app()
	# assigns a new id — presumably so the uploader comes back empty; confirm.
	uploaded_file = st.file_uploader(
	"PDF 파일 업로드",
	type=["pdf"],
	key=f"pdf_uploader_{st.session_state.session_id}"
	)

	# Process the upload only while no document has been processed yet.
	if uploaded_file and not st.session_state.pdf_processed:
	process_pdf(uploaded_file)

	st.divider()

	# Reset button: runs the full application reset.
	if st.button("🔄 초기화", use_container_width=True):
	reset_app()

	# --------------------------------------------------------------------------
	# PDF + Chat UI
	# --------------------------------------------------------------------------
	# The viewer and chat are shown only after a PDF has been processed.
	if st.session_state.pdf_processed:

	col1, col2 = st.columns([5, 5], gap="medium")

	# Left column: PDF viewer
	with col1:
	# Toolbar: previous page, next page, page indicator, zoom slider
	toolbar1, toolbar2, toolbar3, toolbar4 = st.columns([1, 1, 2, 2])

	with toolbar1:
	if st.button("◀", help="이전 페이지"):
	# Do not go below page 1.
	if st.session_state.current_page > 1:
	st.session_state.current_page -= 1
	st.rerun()

	with toolbar2:
	if st.button("▶", help="다음 페이지"):
	# Do not go past the last page (total_pages is set by process_pdf).
	if st.session_state.current_page < st.session_state.total_pages:
	st.session_state.current_page += 1
	st.rerun()

	with toolbar3:
	st.write(f"Page {st.session_state.current_page} / {st.session_state.total_pages}")

	with toolbar4:
	# The slider value (500-1200) is passed to pdf_viewer as its width below.
	new_zoom = st.slider("Zoom", 500, 1200, st.session_state.zoom_level, label_visibility="collapsed")
	if new_zoom != st.session_state.zoom_level:
	st.session_state.zoom_level = new_zoom
	st.rerun()

	# PDF viewer: renders only the current page, with the stored annotations
	pdf_viewer(
	input=st.session_state.pdf_bytes,
	width=st.session_state.zoom_level,
	annotations=st.session_state.annotations,
	pages_to_render=[st.session_state.current_page],
	render_text=True
	)

	# Right column: chat
	with col2:
	st.markdown("### 💬 PROBIN CHAT")

	# Chat container (scrollable - height reduced)
	chat_container = st.container(height=500)
	with chat_container:
	# Show a usage guide while there is no chat history
	if not st.session_state.messages:
	st.markdown("""
	<div class="chat-placeholder">
	<div class="placeholder-title">👋 반가워요! 이렇게 활용해보세요</div>
	<ol class="placeholder-steps">
	<li>AI가 문서 내용을 분석하여 <strong>답변과 근거</strong>를 찾아줍니다.</li>
	<li>답변의 <span class="highlight-box">노란색 하이라이트</span>를 확인하세요.</li>
	</ol>
	</div>
	""", unsafe_allow_html=True)

	# Otherwise display the chat history
	else:
	for idx, msg in enumerate(st.session_state.messages):
	with st.chat_message(msg["role"]):
	st.markdown(msg["content"])

	# Messages that carry sources get the source list; move_to_page is
	# handed over as the callback for jumping to a source.
	if msg.get("sources"):
	render_sources_with_relevance(
	sources=msg["sources"],
	message_idx=idx,
	move_to_page_callback=move_to_page
	)

	# Chat input
	if query := st.chat_input("질문을 입력하세요..."):
	# Append the user message
	st.session_state.messages.append({
	"role": "user",
	"content": query
	})

	# Retrieve relevant chunks and generate the answer
	# NOTE(review): there is no error handling here — if retrieve() or
	# generate_answer() raises, the user message above stays in the history
	# without an answer.
	with st.spinner("🔍 PROBIN이 검색중입니다..."):
	print(f"\n🔍 질문: {query}")

	retrieved_chunks = st.session_state.retriever.retrieve(query, TOP_K)
	result = st.session_state.generator.generate_answer(query, retrieved_chunks)

	# Append the assistant answer together with its sources
	st.session_state.messages.append({
	"role": "assistant",
	"content": result["answer"],
	"sources": result["sources"]
	})

	# Jump to the first source: highlight its text and switch to its page
	if result["sources"]:
	top_source = result["sources"][0]
	highlights = get_text_coordinates(
	str(st.session_state.pdf_path),
	top_source["page_num"],
	top_source["text"]
	)
	st.session_state.annotations = highlights
	st.session_state.current_page = top_source["page_num"]

	print(f"✅ 답변 완료\n")

	# Rerun so the new messages and page/highlight state are rendered.
	st.rerun()