Spaces:
Sleeping
Sleeping
| """ | |
| UI RAG | |
| """ | |
| import streamlit as st | |
| import os | |
| from pathlib import Path | |
| from pdf_parser import PDFParser | |
| from vector_store import VectorStore | |
| from rag_system import VisualMultimodalRAG | |
| from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB | |
# Page chrome: the same caption is used for the browser tab and the on-page
# heading, so it is kept in one place.
_PAGE_TITLE = "Мультимодальная RAG система (PDF parsing)"

st.set_page_config(
    page_title=_PAGE_TITLE,
    layout="wide",
    initial_sidebar_state="expanded",
)

# Session keys and their initial values.  A key is seeded only when it is
# absent, so values already stored in the session are left untouched.
_SESSION_DEFAULTS = {
    'api_key_set': False,
    'api_key': None,
    'visual_rag_system': None,
    'vector_store': None,
    'parser': None,
    'current_document': None,
    'current_text': None,
    'current_images': None,
    'current_tables': None,
    'processing_results': None,
    'answering_rag': None,
}
for _state_key, _initial_value in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

st.title(_PAGE_TITLE)
st.markdown("""
Обрабатывает PDF документы и предоставляет информацию по ним
""")
# Sidebar: API-key entry, lazy construction of the RAG components, and
# vector-store status / maintenance controls.
with st.sidebar:
    st.header(" Конфигурация")
    st.subheader(" OpenAI API Ключ")
    api_key = st.text_input(
        "Введите OpenAI API ключ:",
        type="password",
        key="api_key_input",
    )

    if api_key:
        # A different key invalidates every object that was built with the
        # previous one; dropping them forces a rebuild below (and a lazy
        # rebuild of the answering component where it is used).
        if api_key != st.session_state.api_key:
            st.session_state.visual_rag_system = None
            st.session_state.answering_rag = None
        st.session_state.api_key = api_key

        component_keys = ('visual_rag_system', 'vector_store', 'parser')
        if any(st.session_state[name] is None for name in component_keys):
            try:
                # Build into locals first so a failure part-way through does
                # not leave the session with a half-initialised system that
                # is never retried.
                new_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True)
                new_vector_store = st.session_state.vector_store
                if new_vector_store is None:
                    new_vector_store = VectorStore()
                new_parser = st.session_state.parser
                if new_parser is None:
                    new_parser = PDFParser(debug=True)
            except Exception as e:
                # Keep the flag off so the other sections do not try to use
                # components that were never created.
                st.session_state.api_key_set = False
                st.error(f"Ошибка старта системы: {e}")
            else:
                st.session_state.visual_rag_system = new_rag_system
                st.session_state.vector_store = new_vector_store
                st.session_state.parser = new_parser
                st.session_state.api_key_set = True
                st.success("API ключ введен")
        else:
            st.session_state.api_key_set = True
    else:
        st.session_state.api_key_set = False
        st.warning("Введите OpenAI API ключ")

    st.divider()
    st.subheader("Векторное хранилище")
    if st.session_state.vector_store:
        try:
            info = st.session_state.vector_store.get_collection_info()
            st.metric("Документов в хранилище", info['count'])
            st.caption(f"Расположение: {info['persist_path']}")
        except Exception as e:
            st.error(f"Ошибка получения информации: {e}")
    else:
        st.info("Введите OpenAI API ключ")

    st.divider()
    st.subheader("Управление хранилищем")
    if st.button("Очистить хранилище"):
        if st.session_state.vector_store:
            try:
                st.session_state.vector_store.clear_all()
                st.success("Хранилище очищено")
            except Exception as e:
                st.error(f"Ошибка очистки хранилища: {e}")
        else:
            # Previously the click was silently ignored when no store exists.
            st.info("Введите OpenAI API ключ")
# Upload section: saves the chosen PDF under UPLOAD_FOLDER and, on request,
# parses it into text / images / tables kept in the session.
st.header("Загрузить PDF")
uploaded_file = st.file_uploader(
    "Выбрать...",
    type=['pdf'],
    help="Загрузите PDF файл",
)

if uploaded_file is not None:
    # Enforce the configured limit (assumes MAX_PDF_SIZE_MB is a number of
    # megabytes, as its name indicates).
    size_mb = uploaded_file.size / (1024 * 1024)
    if size_mb > MAX_PDF_SIZE_MB:
        st.error(
            f"Файл слишком большой: {size_mb:.1f} МБ "
            f"(максимум {MAX_PDF_SIZE_MB} МБ)"
        )
    else:
        upload_path = Path(UPLOAD_FOLDER)
        # parents=True: the configured folder may be nested and not exist yet.
        upload_path.mkdir(parents=True, exist_ok=True)
        # The file name comes from the client; keep only its last component
        # so a crafted name cannot write outside the upload folder.
        safe_name = Path(uploaded_file.name).name
        file_path = upload_path / safe_name
        with open(file_path, 'wb') as f:
            f.write(uploaded_file.getbuffer())
        st.success(f"Файл загружен: {uploaded_file.name}")

        if st.button("Распарсить PDF"):
            if not st.session_state.api_key_set:
                st.error("Введите OpenAI API ключ")
            else:
                try:
                    with st.spinner(" Парсинг PDF..."):
                        print(f"Парсинг PDF файла: {uploaded_file.name}")
                        parser = st.session_state.parser
                        text, images, tables = parser.parse_pdf(str(file_path))

                    # Keep the parse result for the analysis and Q&A sections.
                    st.session_state.current_document = uploaded_file.name
                    st.session_state.current_text = text
                    st.session_state.current_images = images
                    st.session_state.current_tables = tables

                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.metric("Текста", f"{len(text):,} chars")
                    with col2:
                        st.metric("Изображений", len(images))
                    with col3:
                        st.metric("Таблиц", len(tables))
                    st.success("Парсинг PDF завершен!")
                except Exception as e:
                    st.error(f"Парсинг PDF завершелся с ошибкой: {e}")
                    print(f"Ошибка: {e}")

st.divider()
# Analysis section: hands the parsed text, images and tables to the visual
# RAG system, which processes them and stores the output in the vector store.
st.header("Анализ документа")
st.info("""
Отправляет содержимое документа на анализ
""")

analyze_clicked = st.button("Проанализировать документ")
if analyze_clicked and not st.session_state.api_key_set:
    st.error("Введите OpenAI API ключ")
elif analyze_clicked and st.session_state.current_text is None:
    st.error("Распарсите документ")
elif analyze_clicked:
    try:
        with st.spinner("Анализ с gpt-4o-mini..."):
            state = st.session_state
            results = state.visual_rag_system.process_and_store_document(
                text=state.current_text,
                images=state.current_images,
                tables=state.current_tables,
                vector_store=state.vector_store,
                doc_id=state.current_document or "current_doc",
            )
        st.session_state.processing_results = results
        st.success("Анализ готов!")

        # One metric per column: (label, key of the list to count in results).
        summary_metrics = (
            ("Проанализировано изображений", 'image_visual_analyses'),
            ("Проанализировано чанков текста", 'text_summaries'),
            ("Проанализировано таблиц", 'table_summaries'),
        )
        for column, (label, result_key) in zip(st.columns(3), summary_metrics):
            with column:
                st.metric(label, len(results[result_key]))
        st.metric("Помещено в хранилище", results['total_stored'])
        print("Анализ завершен")
    except Exception as e:
        st.error(f"Ошибка в ходе: {e}")
        print(f"Ошибка: {e}")

st.divider()
# Q&A section: searches the vector store for the question, asks the answering
# component for a reply, and shows the reply together with its sources.
st.header("Работа с документом")
if 'answering_rag' not in st.session_state:
    st.session_state.answering_rag = None
# The latest answer is kept in the session: a button is True only for the run
# triggered by its click, so anything rendered solely inside the button branch
# vanishes as soon as another widget (e.g. the sources checkbox) is touched.
if 'last_answer' not in st.session_state:
    st.session_state.last_answer = None

if st.session_state.api_key_set and st.session_state.answering_rag is None:
    from rag_system import AnsweringRAG
    st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)

question = st.text_area(
    "Введите запрос:",
    height=100,
    placeholder="О чем данный документ?",
)

if st.button("Генерация ответа"):
    if not st.session_state.api_key_set:
        st.error("Введите OpenAI API ключ")
    elif st.session_state.current_text is None:
        st.error("Распарсите документ")
    elif not question:
        st.error("Введите запрос")
    else:
        try:
            with st.spinner("Поиск документов..."):
                store = st.session_state.vector_store
                doc_name = st.session_state.current_document or "current_doc"
                doc_data = {
                    'text': st.session_state.current_text,
                    'images': [],
                    'tables': [],
                }
                # NOTE(review): the raw text is added to the store on every
                # question; whether this duplicates entries depends on how
                # VectorStore.add_documents treats an existing doc id - confirm.
                store.add_documents(doc_data, doc_name)
                search_results = store.search(question, n_results=5)
                print(f"Найдено: {len(search_results)}")
                answering_rag = st.session_state.answering_rag
                result = answering_rag.analyze_and_answer(question, search_results)
            st.session_state.last_answer = {
                'document': st.session_state.current_document,
                'result': result,
                'search_results': search_results,
            }
            st.success("Поиск завершен!")
            print(" Ответ готов!")
        except Exception as e:
            # Do not keep showing an answer that belongs to an earlier query.
            st.session_state.last_answer = None
            st.error(f"Ошибка обработки запроса: {e}")
            print(f"Ошибка: {e}")

last_answer = st.session_state.last_answer
# Show the stored answer only while it still belongs to the current document.
if last_answer is not None and last_answer['document'] == st.session_state.current_document:
    try:
        result = last_answer['result']
        search_results = last_answer['search_results']

        st.subheader("Ответ")
        col1, col2, col3 = st.columns(3)
        with col1:
            confidence = result['confidence']
            confidence_color = {
                'high': '🟢',
                'medium': '🟡',
                'low': '🔴',
            }.get(confidence, '⚪')
            st.metric("Уверенность в ответе", f"{confidence_color} {confidence.upper()}")
        with col2:
            st.metric("Использовано источников", result['sources_used'])
        with col3:
            # Guard on search_results itself: it is the divisor below.
            if result['sources_used'] > 0 and search_results:
                average_relevance = (
                    sum(1 - r.get('distance', 0) for r in search_results)
                    / len(search_results)
                )
                st.metric("Среднняя релевантность", f"{average_relevance:.0%}")
        st.write(result['answer'])

        if st.checkbox("Показать исходные документы"):
            st.subheader("Использованы документы")
            for idx, source in enumerate(result['formatted_sources'], 1):
                relevance = source['relevance']
                # Clamp to 0..10 so an out-of-range relevance cannot produce
                # a bar of the wrong length.
                filled = max(0, min(10, int(relevance * 10)))
                # "\\/" is the properly escaped spelling of the two characters
                # backslash + slash used for the filled part of the bar.
                relevance_bar = "\\/" * filled + "|" * (10 - filled)
                with st.expander(
                    f"Источник {idx} - {source['type'].upper()} "
                    f"[{relevance_bar}] {relevance:.0%}"
                ):
                    st.write(source['content'])
    except Exception as e:
        st.error(f"Ошибка обработки запроса: {e}")
        print(f"Ошибка: {e}")
# Page footer: a separator followed by a one-line description of the app.
footer_text = "Мультимодальная RAG система для парсинга PDF документов"
st.divider()
st.caption(footer_text)