|
|
import streamlit as st |
|
|
import os |
|
|
import tempfile |
|
|
from pathlib import Path |
|
|
import torch |
|
|
from pdf_parser import PDFParser |
|
|
from embedder import ChromaDBManager |
|
|
from rag_pipeline import RAGPipeline |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Configure the browser tab title, icon and page layout.
# NOTE(review): the icon glyph was mis-encoded in the source text; the document
# emoji below is the presumed original - confirm.
st.set_page_config(
    page_title="Multimodal PDF RAG System",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded",
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Stylesheet for the app: main-area padding, larger tab labels, and the
# upload / success / error box classes.
_CUSTOM_CSS = """
<style>
.main {
    padding: 2rem;
}
.stTabs [data-baseweb="tab-list"] button [data-testid="stMarkdownContainer"] p {
    font-size: 1.2rem;
}
.upload-area {
    border: 2px dashed #ccc;
    border-radius: 5px;
    padding: 20px;
    text-align: center;
}
.success-box {
    background-color: #d4edda;
    border: 1px solid #28a745;
    border-radius: 4px;
    padding: 10px;
    margin: 10px 0;
}
.error-box {
    background-color: #f8d7da;
    border: 1px solid #f5c6cb;
    border-radius: 4px;
    padding: 10px;
    margin: 10px 0;
}
</style>
"""

# Raw HTML must be explicitly allowed for the <style> tag to be emitted.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@st.cache_resource
def _load_components():
    """Build the PDF parser, vector store manager and RAG pipeline.

    The result is cached by ``st.cache_resource`` so the components are built
    once and reused across reruns. Exceptions are deliberately NOT caught
    here: a value is only cached when this function returns, so a failed
    start-up is retried on the next rerun instead of being cached.

    Returns:
        A ``(parser, chroma, rag, device)`` tuple, where ``device`` is
        ``"cuda"`` when CUDA is available and ``"cpu"`` otherwise.
    """
    parser = PDFParser(extraction_dir="./pdf_extractions")
    chroma = ChromaDBManager(db_dir="./chroma_db")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    rag = RAGPipeline(chroma, device=device)
    return parser, chroma, rag, device


def initialize_system():
    """Initialize RAG system components once.

    Returns:
        The ``(parser, chroma, rag, device)`` tuple from the cached loader,
        or ``(None, None, None, None)`` if any component failed to
        initialize; in that case the error is reported in the UI.
    """
    try:
        return _load_components()
    except Exception as e:
        # Top-level boundary: report the failure and let the caller stop the app.
        st.error(f"Error initializing system: {e}")
        return None, None, None, None
|
|
|
|
|
|
|
|
|
|
|
# Build the shared components (or fetch them from the resource cache).
pdf_parser, chroma_manager, rag_pipeline, device = initialize_system()

# Without a parser nothing else on the page can work, so halt this run.
if pdf_parser is None:
    st.error("Failed to initialize RAG system. Please check your installation.")
    st.stop()

# Seed per-session state keys only when they are not present yet.
for _state_key, _initial_value in (("uploaded_files", []), ("processing_status", {})):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Page heading and the four top-level tabs.
# NOTE(review): the icon glyphs were restored from mis-encoded text; the
# title and "Ask Questions" icons are presumed - confirm the intended emoji.
st.title("📄 Multimodal PDF RAG System")
st.markdown("**Local AI-powered document analysis with Qwen2.5-VL and ChromaDB**")
st.markdown("*Upload PDFs directly and ask questions about them*")

tab_upload, tab_query, tab_manage, tab_about = st.tabs(
    ["📤 Upload PDFs", "🔍 Ask Questions", "🛠️ Manage", "ℹ️ About"]
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _ingest_upload(uploaded_file, work_dir):
    """Save one uploaded PDF into *work_dir*, parse it and index it.

    Args:
        uploaded_file: An item returned by ``st.file_uploader``.
        work_dir: Existing directory in which the PDF is written before parsing.

    Returns:
        A summary dict with the keys ``name``, ``size``, ``text_length``,
        ``tables`` and ``images`` used by the upload report.

    Raises:
        Any exception raised while writing, parsing or indexing the file.
    """
    # Keep only the final path component so a crafted file name cannot
    # point outside the working directory.
    safe_name = Path(uploaded_file.name).name
    pdf_path = os.path.join(work_dir, safe_name)
    with open(pdf_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    result = pdf_parser.process_pdf(pdf_path)
    chroma_manager.add_documents([result])

    return {
        'name': uploaded_file.name,
        'size': uploaded_file.size,
        'text_length': len(result.get('text', '')),
        'tables': len(result.get('tables', [])),
        'images': len(result.get('images', [])),
    }


# Upload tab: accept PDFs, process each one, then show a summary.
# NOTE(review): icon glyphs were restored from mis-encoded text; the
# "Documents in DB" icon is presumed - confirm.
with tab_upload:
    st.header("📤 Upload PDF Documents")

    col1, col2 = st.columns([3, 1])

    with col1:
        st.markdown("**Upload your PDF files below. They will be automatically processed and stored.**")

        uploaded_files = st.file_uploader(
            "Choose PDF files",
            type=["pdf"],
            accept_multiple_files=True,
            help="You can upload multiple PDF files at once",
        )

    with col2:
        st.info(f"📊 Documents in DB: {chroma_manager.get_collection_info()['document_count']}")

    # NOTE(review): Streamlit reruns the script on every interaction, so files
    # still listed in the uploader are processed again on each rerun. Consider
    # tracking already-processed files in st.session_state.
    if uploaded_files:
        st.divider()
        st.subheader("Processing Uploaded Files")

        progress_bar = st.progress(0)
        status_text = st.empty()
        results_container = st.container()  # created but not used below

        total_files = len(uploaded_files)
        processed_files = []
        failed_files = []

        # The working directory and the PDFs written into it are removed as
        # soon as processing finishes.
        # NOTE(review): assumes nothing downstream reopens the saved PDF by its
        # temporary path after process_pdf() returns - confirm.
        with tempfile.TemporaryDirectory() as temp_dir:
            for idx, uploaded_file in enumerate(uploaded_files, start=1):
                status_text.text(f"Processing {idx}/{total_files}: {uploaded_file.name}")
                try:
                    with st.spinner(f"Extracting content from {uploaded_file.name}..."):
                        file_info = _ingest_upload(uploaded_file, temp_dir)
                except Exception as e:
                    # One bad file must not abort the rest of the batch.
                    failed_files.append({
                        'name': uploaded_file.name,
                        'error': str(e),
                    })
                    st.error(f"❌ Error processing {uploaded_file.name}: {e}")
                else:
                    processed_files.append(file_info)
                    st.success(f"✅ {uploaded_file.name} processed successfully")
                finally:
                    # Advance the bar for failures too, so it always reaches 100%.
                    progress_bar.progress(idx / total_files)

        st.divider()
        st.subheader("Upload Summary")

        col1, col2, col3 = st.columns(3)

        with col1:
            st.metric("Successfully Processed", len(processed_files))

        with col2:
            st.metric("Failed", len(failed_files))

        with col3:
            collection_info = chroma_manager.get_collection_info()
            st.metric("Total in Database", collection_info['document_count'])

        if processed_files:
            st.markdown("#### ✅ Processed Files:")
            for file_info in processed_files:
                col1, col2, col3, col4 = st.columns(4)
                with col1:
                    st.text(file_info['name'])
                with col2:
                    st.text(f"{file_info['size'] / 1024:.1f} KB")
                with col3:
                    st.text(f"{file_info['text_length']:,} chars")
                with col4:
                    st.text(f"{file_info['tables']} tables, {file_info['images']} imgs")

        if failed_files:
            st.markdown("#### ❌ Failed Files:")
            for file_info in failed_files:
                st.error(f"**{file_info['name']}**: {file_info['error']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _show_answer(result):
    """Render a RAG result: the generated answer and the retrieved chunks.

    Args:
        result: Mapping returned by ``rag_pipeline.answer_question``. This
            function reads its ``answer``, ``doc_count`` and
            ``retrieved_docs`` entries; each retrieved doc is read for
            ``relevance_score``, ``document`` and ``metadata``.
    """
    st.success("✅ Answer Generated")
    st.markdown("### Answer")
    st.write(result['answer'])

    with st.expander("📚 Retrieved Documents", expanded=False):
        st.markdown(f"#### {result['doc_count']} Relevant Document Chunks:")
        for idx, doc in enumerate(result['retrieved_docs'], 1):
            with st.container():
                col_rel, col_score = st.columns([3, 1])
                with col_rel:
                    st.markdown(f"**Document {idx}**")
                with col_score:
                    st.caption(f"Score: {doc['relevance_score']:.1%}")

                # Show at most the first 400 characters of each chunk.
                preview = doc['document'][:400]
                if len(doc['document']) > 400:
                    preview += "..."
                st.write(preview)

                if doc['metadata']:
                    st.caption(f"Source: {doc['metadata'].get('filename', 'Unknown')}")


# Query tab: ask a question against the indexed documents.
# NOTE(review): icon glyphs were restored from mis-encoded text; the search
# and "Retrieved Documents" icons are presumed - confirm.
with tab_query:
    st.header("🔍 Ask Questions About Your Documents")

    collection_info = chroma_manager.get_collection_info()

    if collection_info['document_count'] == 0:
        st.warning("⚠️ No documents uploaded yet. Please upload PDFs in the 'Upload PDFs' tab first.")
    else:
        st.success(f"✅ {collection_info['document_count']} documents in database")

        col1, col2, col3 = st.columns([2, 1, 1])

        with col1:
            query = st.text_input(
                "Enter your question:",
                placeholder="Например: Какие ключевые моменты описаны в документе?",
                help="Ask any question about your uploaded documents",
            )

        with col2:
            n_docs = st.number_input("Retrieved docs:", value=3, min_value=1, max_value=10)

        with col3:
            max_tokens = st.number_input("Max tokens:", value=256, min_value=128, max_value=512, step=128)

        if st.button("🔍 Get Answer", use_container_width=True, type="primary"):
            if not query:
                st.warning("⚠️ Please enter a question.")
            else:
                try:
                    with st.spinner("🤖 Generating answer... (this may take 10-30 seconds)"):
                        st.info("Processing query - please wait...")

                        try:
                            result = rag_pipeline.answer_question(
                                query=query,
                                n_retrieved=n_docs,
                                max_new_tokens=max_tokens,
                            )

                            if "error" in result and result["error"]:
                                st.error(f"⚠️ {result['error']}")
                            else:
                                # Only claim success when the pipeline did
                                # not report an error.
                                _show_answer(result)

                        except Exception as e:
                            st.error(f"❌ Error during generation: {e}")
                            st.info("Possible causes:")
                            st.write("- Out of memory (try reducing 'Max tokens' or 'Retrieved docs')")
                            st.write("- Model inference timeout")
                            st.write("- Invalid input format")

                except Exception as e:
                    st.error(f"❌ Unexpected error: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _directory_size_mb(root):
    """Return the combined size of all files under *root*, in MiB.

    Args:
        root: Directory path to walk recursively.

    Returns:
        Total file size divided by 1024 * 1024, or ``0`` when *root* does
        not exist.
    """
    if not os.path.exists(root):
        return 0
    total_bytes = sum(
        os.path.getsize(os.path.join(dirpath, filename))
        for dirpath, _dirnames, filenames in os.walk(root)
        for filename in filenames
    )
    return total_bytes / (1024 * 1024)


# Manage tab: inspect, list and clear the vector store; show disk usage.
# NOTE(review): icon glyphs were restored from mis-encoded text; the
# "List Documents" and "Clear Database" icons are presumed - confirm.
with tab_manage:
    st.header("🛠️ Database Management")

    col1, col2, col3 = st.columns(3)

    with col1:
        if st.button("ℹ️ Database Info", use_container_width=True):
            try:
                info = chroma_manager.get_collection_info()
                st.json(info)
            except Exception as e:
                st.error(f"Error: {e}")

    with col2:
        if st.button("📋 List Documents", use_container_width=True):
            try:
                all_docs = chroma_manager.collection.get(include=['documents'])
                if all_docs['ids']:
                    st.write(f"Total documents: {len(all_docs['ids'])}")

                    col1_list, col2_list = st.columns(2)

                    with col1_list:
                        st.write("**First 10:**")
                        for idx, doc_id in enumerate(all_docs['ids'][:10], 1):
                            # Mark an id as shortened only when it really was cut.
                            shown_id = doc_id if len(doc_id) <= 50 else f"{doc_id[:50]}..."
                            st.write(f"{idx}. {shown_id}")

                    with col2_list:
                        if len(all_docs['ids']) > 10:
                            st.write(f"**... and {len(all_docs['ids']) - 10} more**")
                else:
                    st.info("No documents in database")
            except Exception as e:
                st.error(f"Error: {e}")

    with col3:
        if st.button("🗑️ Clear Database", use_container_width=True):
            try:
                collection_info = chroma_manager.get_collection_info()
                if collection_info['document_count'] > 0:
                    chroma_manager.clear_collection()
                    st.success("✅ Database cleared!")
                    # Rerun the script so document counts on the page are recomputed.
                    st.rerun()
                else:
                    st.info("Database is already empty")
            except Exception as e:
                st.error(f"Error: {e}")

    st.divider()

    st.markdown("### Storage Information")
    col1, col2 = st.columns(2)

    with col1:
        extraction_size = _directory_size_mb("./pdf_extractions")
        st.metric("PDF Extractions", f"{extraction_size:.1f} MB")

    with col2:
        chroma_size = _directory_size_mb("./chroma_db")
        st.metric("ChromaDB Storage", f"{chroma_size:.1f} MB")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# About tab: static description of the system, live system metrics and a
# short usage guide.
# NOTE(review): emoji in the headings were restored from mis-encoded text.
# Several (Smart Search, Russian & English, How It Works, Privacy & Security,
# System Info, How to Use, Tips, Troubleshooting) are presumed - confirm.
with tab_about:
    st.header("ℹ️ About This System")

    st.markdown("""
    ### Multimodal RAG System with PDF Upload

    This is a **local, privacy-first AI document analysis system** that allows you to:

    #### ✨ Features
    - **📤 Easy PDF Upload**: Drag & drop or select multiple PDF files
    - **🔍 Smart Search**: Semantic search across documents with CLIP embeddings
    - **🤖 AI-Powered Answers**: Ask questions and get answers from Qwen2.5-VL-3B
    - **🌍 Russian & English**: Full support for both languages
    - **💾 Local Storage**: All data stays on your machine
    - **⚡ Fast Processing**: Automatic caching to avoid re-processing

    #### 🏗️ How It Works
    1. Upload PDF documents
    2. System extracts text, tables, and images
    3. Content is embedded with CLIP and stored in ChromaDB
    4. Ask questions about your documents
    5. AI retrieves relevant sections and generates answers

    #### 🔒 Privacy & Security
    - ✅ All processing happens locally
    - ✅ No internet required (after model download)
    - ✅ No cloud APIs used
    - ✅ Full data control
    - ✅ Open-source code

    #### 💻 Technology Stack
    - **LLM**: Qwen2.5-VL-3B (multimodal)
    - **Embeddings**: CLIP (clip-vit-base-patch32)
    - **Vector DB**: ChromaDB
    - **UI**: Streamlit
    - **PDF Processing**: pdfplumber + PyMuPDF

    #### 📊 System Info
    """)

    col1, col2, col3 = st.columns(3)

    with col1:
        device_name = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
        st.metric("Device", device_name)

    with col2:
        collection_info = chroma_manager.get_collection_info()
        st.metric("Documents in DB", collection_info['document_count'])

    with col3:
        st.metric("Version", "1.2 (Upload)")

    st.divider()

    st.markdown("""
    #### 📖 How to Use

    1. **Upload PDFs**: Go to the "Upload PDFs" tab and select your files
    2. **Wait for Processing**: System automatically extracts content
    3. **Ask Questions**: Switch to "Ask Questions" tab and type your query
    4. **Review Results**: See generated answers and relevant document chunks
    5. **Manage**: Use "Manage" tab to view or clear database

    #### ⚙️ Tips for Best Results
    - Start with smaller PDFs to test
    - Ask specific questions for better answers
    - Reduce "Retrieved docs" if responses are slow
    - Use Russian for Russian documents (better accuracy)

    #### 🔧 Performance Tuning
    - **Slow responses**: Reduce "Max tokens" from 512 to 256
    - **Out of memory**: Use fewer "Retrieved docs" (1-3)
    - **Better quality**: Increase "Max tokens" to 512

    #### ❓ Troubleshooting
    - **App closes**: Reduce "Max tokens" and "Retrieved docs"
    - **Slow processing**: First upload takes time (model loading)
    - **Memory issues**: Use CPU mode (edit in sidebar)
    """)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Page footer: a divider followed by a centered, muted one-line caption.
_FOOTER_HTML = """
<div style='text-align: center; color: #666; font-size: 0.9rem;'>
Multimodal RAG System with PDF Upload | Qwen2.5-VL + ChromaDB + Streamlit | v1.2
</div>
"""

st.divider()
# Raw HTML must be explicitly allowed for the styled <div> to be emitted.
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)