Spaces:
Sleeping
Sleeping
Simplify
Browse files- src/app.py +240 -83
- src/config.py +20 -9
- src/pdf_parser.py +41 -11
- src/rag_system.py +155 -53
- src/vector_store.py +49 -18
src/app.py
CHANGED
|
@@ -1,25 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import os
|
| 3 |
from pathlib import Path
|
|
|
|
|
|
|
| 4 |
from pdf_parser import PDFParser
|
| 5 |
from vector_store import VectorStore
|
| 6 |
-
from rag_system import VisualMultimodalRAG
|
| 7 |
from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
st.set_page_config(
|
| 10 |
-
page_title="
|
| 11 |
-
page_icon="",
|
| 12 |
layout="wide",
|
| 13 |
initial_sidebar_state="expanded"
|
| 14 |
)
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
if 'api_key_set' not in st.session_state:
|
| 17 |
st.session_state.api_key_set = False
|
| 18 |
|
| 19 |
if 'api_key' not in st.session_state:
|
| 20 |
st.session_state.api_key = None
|
| 21 |
|
| 22 |
-
if 'visual_rag_system' not in st.session_state:
|
| 23 |
st.session_state.visual_rag_system = None
|
| 24 |
|
| 25 |
if 'vector_store' not in st.session_state:
|
|
@@ -40,26 +56,39 @@ if 'current_images' not in st.session_state:
|
|
| 40 |
if 'current_tables' not in st.session_state:
|
| 41 |
st.session_state.current_tables = None
|
| 42 |
|
| 43 |
-
if 'processing_results' not in st.session_state:
|
| 44 |
st.session_state.processing_results = None
|
| 45 |
|
| 46 |
if 'answering_rag' not in st.session_state:
|
| 47 |
st.session_state.answering_rag = None
|
| 48 |
|
| 49 |
-
st.title("Мультимодальная система RAG LLM")
|
| 50 |
|
| 51 |
-
|
|
|
|
|
|
|
| 52 |
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
""")
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
with st.sidebar:
|
| 57 |
-
st.header("
|
| 58 |
|
| 59 |
-
|
|
|
|
| 60 |
|
| 61 |
api_key = st.text_input(
|
| 62 |
-
"
|
| 63 |
type="password",
|
| 64 |
key="api_key_input"
|
| 65 |
)
|
|
@@ -68,116 +97,154 @@ with st.sidebar:
|
|
| 68 |
st.session_state.api_key = api_key
|
| 69 |
st.session_state.api_key_set = True
|
| 70 |
|
|
|
|
| 71 |
if st.session_state.visual_rag_system is None:
|
| 72 |
try:
|
| 73 |
-
st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True)
|
| 74 |
st.session_state.vector_store = VectorStore()
|
| 75 |
st.session_state.parser = PDFParser(debug=True)
|
| 76 |
-
st.success("
|
| 77 |
except Exception as e:
|
| 78 |
-
st.error(f"
|
| 79 |
else:
|
| 80 |
st.session_state.api_key_set = False
|
| 81 |
-
st.warning("
|
| 82 |
|
| 83 |
st.divider()
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
if st.session_state.vector_store:
|
| 88 |
try:
|
| 89 |
info = st.session_state.vector_store.get_collection_info()
|
| 90 |
-
st.metric("
|
| 91 |
-
st.
|
|
|
|
| 92 |
except Exception as e:
|
| 93 |
-
st.error(f"
|
| 94 |
else:
|
| 95 |
-
st.info("
|
| 96 |
|
| 97 |
st.divider()
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
if st.button("
|
| 102 |
if st.session_state.vector_store:
|
| 103 |
try:
|
| 104 |
st.session_state.vector_store.clear_all()
|
| 105 |
-
st.success("
|
| 106 |
except Exception as e:
|
| 107 |
-
st.error(f"
|
|
|
|
| 108 |
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
uploaded_file = st.file_uploader(
|
| 112 |
-
"
|
| 113 |
type=['pdf'],
|
| 114 |
-
help="PDF
|
| 115 |
)
|
| 116 |
|
| 117 |
if uploaded_file is not None:
|
|
|
|
| 118 |
upload_path = Path(UPLOAD_FOLDER)
|
| 119 |
upload_path.mkdir(exist_ok=True)
|
|
|
|
| 120 |
file_path = upload_path / uploaded_file.name
|
| 121 |
with open(file_path, 'wb') as f:
|
| 122 |
f.write(uploaded_file.getbuffer())
|
| 123 |
-
st.success(f"Файл сохранён: {uploaded_file.name}")
|
| 124 |
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
| 126 |
if not st.session_state.api_key_set:
|
| 127 |
-
st.error("
|
| 128 |
else:
|
| 129 |
try:
|
| 130 |
-
with st.spinner("
|
| 131 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
parser = st.session_state.parser
|
| 133 |
text, images, tables = parser.parse_pdf(str(file_path))
|
| 134 |
|
|
|
|
| 135 |
st.session_state.current_document = uploaded_file.name
|
| 136 |
st.session_state.current_text = text
|
| 137 |
st.session_state.current_images = images
|
| 138 |
st.session_state.current_tables = tables
|
| 139 |
|
|
|
|
| 140 |
col1, col2, col3 = st.columns(3)
|
| 141 |
with col1:
|
| 142 |
-
st.metric("
|
| 143 |
with col2:
|
| 144 |
-
st.metric("
|
| 145 |
with col3:
|
| 146 |
-
st.metric("
|
| 147 |
|
|
|
|
| 148 |
if images:
|
| 149 |
-
st.subheader("
|
| 150 |
for idx, img in enumerate(images):
|
| 151 |
ocr_text = img.get('ocr_text', '')
|
| 152 |
ocr_len = len(ocr_text)
|
|
|
|
| 153 |
if ocr_len > 0:
|
| 154 |
-
st.success(f"
|
| 155 |
else:
|
| 156 |
-
st.warning(f"
|
|
|
|
|
|
|
| 157 |
|
| 158 |
-
st.success("Парсинг PDF завершён!")
|
| 159 |
except Exception as e:
|
| 160 |
-
st.error(f"
|
|
|
|
| 161 |
|
| 162 |
-
st.divider()
|
| 163 |
|
| 164 |
-
|
|
|
|
|
|
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
-
if st.button("
|
| 168 |
if not st.session_state.api_key_set:
|
| 169 |
-
st.error("
|
| 170 |
elif st.session_state.current_text is None:
|
| 171 |
-
st.error("
|
| 172 |
else:
|
| 173 |
try:
|
| 174 |
-
with st.spinner("
|
| 175 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
visual_rag = st.session_state.visual_rag_system
|
| 177 |
vector_store = st.session_state.vector_store
|
|
|
|
| 178 |
results = visual_rag.process_and_store_document(
|
| 179 |
text=st.session_state.current_text,
|
| 180 |
-
images=st.session_state.current_images,
|
| 181 |
tables=st.session_state.current_tables,
|
| 182 |
vector_store=vector_store,
|
| 183 |
doc_id=st.session_state.current_document or "current_doc"
|
|
@@ -185,97 +252,187 @@ if st.button("Анализировать"):
|
|
| 185 |
|
| 186 |
st.session_state.processing_results = results
|
| 187 |
|
| 188 |
-
|
|
|
|
| 189 |
|
| 190 |
col1, col2, col3 = st.columns(3)
|
| 191 |
with col1:
|
| 192 |
-
st.metric("
|
| 193 |
with col2:
|
| 194 |
-
st.metric("
|
| 195 |
with col3:
|
| 196 |
-
st.metric("
|
| 197 |
|
| 198 |
-
st.metric("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
except Exception as e:
|
| 201 |
-
st.error(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
st.divider()
|
|
|
|
| 204 |
|
| 205 |
-
|
|
|
|
|
|
|
| 206 |
|
|
|
|
| 207 |
if st.session_state.api_key_set and st.session_state.answering_rag is None:
|
|
|
|
| 208 |
st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
|
| 209 |
|
| 210 |
question = st.text_area(
|
| 211 |
-
"
|
| 212 |
height=100,
|
| 213 |
-
placeholder="
|
| 214 |
)
|
| 215 |
|
| 216 |
-
if st.button("
|
| 217 |
if not st.session_state.api_key_set:
|
| 218 |
-
st.error("
|
| 219 |
elif st.session_state.current_text is None:
|
| 220 |
-
st.error("
|
| 221 |
elif not question:
|
| 222 |
-
st.error("
|
| 223 |
else:
|
| 224 |
try:
|
| 225 |
-
with st.spinner("
|
| 226 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
store = st.session_state.vector_store
|
| 228 |
|
|
|
|
| 229 |
doc_name = st.session_state.current_document or "current_doc"
|
| 230 |
doc_data = {
|
| 231 |
'text': st.session_state.current_text,
|
| 232 |
'images': [],
|
| 233 |
'tables': []
|
| 234 |
}
|
| 235 |
-
|
| 236 |
store.add_documents(doc_data, doc_name)
|
| 237 |
|
|
|
|
| 238 |
search_results = store.search(question, n_results=5)
|
| 239 |
|
|
|
|
|
|
|
|
|
|
| 240 |
answering_rag = st.session_state.answering_rag
|
| 241 |
result = answering_rag.analyze_and_answer(question, search_results)
|
| 242 |
|
| 243 |
-
|
|
|
|
| 244 |
|
| 245 |
-
st.subheader("
|
| 246 |
|
|
|
|
| 247 |
col1, col2, col3 = st.columns(3)
|
| 248 |
with col1:
|
| 249 |
-
|
| 250 |
-
'high': '
|
| 251 |
-
'medium': '
|
| 252 |
-
'low': '
|
| 253 |
-
}
|
| 254 |
-
|
| 255 |
-
st.metric("Уверенность", confidence_text)
|
| 256 |
with col2:
|
| 257 |
-
st.metric("
|
| 258 |
with col3:
|
| 259 |
if result['sources_used'] > 0:
|
| 260 |
-
st.metric("
|
| 261 |
|
|
|
|
| 262 |
st.write(result['answer'])
|
| 263 |
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
|
|
|
| 267 |
relevance = source['relevance']
|
| 268 |
relevance_bar = "█" * int(relevance * 10) + "░" * (10 - int(relevance * 10))
|
|
|
|
| 269 |
with st.expander(
|
| 270 |
-
f"
|
| 271 |
f"[{relevance_bar}] {relevance:.0%}"
|
| 272 |
):
|
| 273 |
st.write(source['content'])
|
|
|
|
|
|
|
|
|
|
| 274 |
except Exception as e:
|
| 275 |
-
st.error(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
|
| 277 |
st.divider()
|
| 278 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
st.caption(
|
| 280 |
-
"
|
|
|
|
|
|
|
|
|
|
| 281 |
)
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Multimodal RAG LLM System - Streamlit App
|
| 3 |
+
Complete working version with VISUAL image analysis using gpt-4o
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
import streamlit as st
|
| 7 |
import os
|
| 8 |
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
# Import optimized versions
|
| 11 |
from pdf_parser import PDFParser
|
| 12 |
from vector_store import VectorStore
|
| 13 |
+
from rag_system import VisualMultimodalRAG # NEW - Vision model
|
| 14 |
from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
|
| 15 |
|
| 16 |
+
|
| 17 |
+
# ============================================================================
|
| 18 |
+
# PAGE CONFIGURATION
|
| 19 |
+
# ============================================================================
|
| 20 |
+
|
| 21 |
st.set_page_config(
|
| 22 |
+
page_title="📄 Multimodal RAG LLM System",
|
| 23 |
+
page_icon="🤖",
|
| 24 |
layout="wide",
|
| 25 |
initial_sidebar_state="expanded"
|
| 26 |
)
|
| 27 |
|
| 28 |
+
# ============================================================================
|
| 29 |
+
# SESSION STATE INITIALIZATION
|
| 30 |
+
# ============================================================================
|
| 31 |
+
|
| 32 |
if 'api_key_set' not in st.session_state:
|
| 33 |
st.session_state.api_key_set = False
|
| 34 |
|
| 35 |
if 'api_key' not in st.session_state:
|
| 36 |
st.session_state.api_key = None
|
| 37 |
|
| 38 |
+
if 'visual_rag_system' not in st.session_state: # NEW - Vision model
|
| 39 |
st.session_state.visual_rag_system = None
|
| 40 |
|
| 41 |
if 'vector_store' not in st.session_state:
|
|
|
|
| 56 |
if 'current_tables' not in st.session_state:
|
| 57 |
st.session_state.current_tables = None
|
| 58 |
|
| 59 |
+
if 'processing_results' not in st.session_state: # NEW
|
| 60 |
st.session_state.processing_results = None
|
| 61 |
|
| 62 |
if 'answering_rag' not in st.session_state:
|
| 63 |
st.session_state.answering_rag = None
|
| 64 |
|
|
|
|
| 65 |
|
| 66 |
+
# ============================================================================
|
| 67 |
+
# MAIN HEADER
|
| 68 |
+
# ============================================================================
|
| 69 |
|
| 70 |
+
st.title("📄 Multimodal RAG LLM System")
|
| 71 |
+
st.markdown("""
|
| 72 |
+
Process PDF documents with visual image analysis:
|
| 73 |
+
- **PDF Parser** with OCR for Russian & English
|
| 74 |
+
- **Visual Analysis** (gpt-4o) for image understanding
|
| 75 |
+
- **Vector Store** (ChromaDB) for semantic search
|
| 76 |
+
- **Individual Component** summarization and storage
|
| 77 |
""")
|
| 78 |
|
| 79 |
+
|
| 80 |
+
# ============================================================================
|
| 81 |
+
# SIDEBAR - CONFIGURATION
|
| 82 |
+
# ============================================================================
|
| 83 |
+
|
| 84 |
with st.sidebar:
|
| 85 |
+
st.header("⚙️ Configuration")
|
| 86 |
|
| 87 |
+
# API Key Section
|
| 88 |
+
st.subheader("🔑 OpenAI API Key")
|
| 89 |
|
| 90 |
api_key = st.text_input(
|
| 91 |
+
"Enter your OpenAI API key:",
|
| 92 |
type="password",
|
| 93 |
key="api_key_input"
|
| 94 |
)
|
|
|
|
| 97 |
st.session_state.api_key = api_key
|
| 98 |
st.session_state.api_key_set = True
|
| 99 |
|
| 100 |
+
# Initialize RAG systems if not already done
|
| 101 |
if st.session_state.visual_rag_system is None:
|
| 102 |
try:
|
| 103 |
+
st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True) # NEW
|
| 104 |
st.session_state.vector_store = VectorStore()
|
| 105 |
st.session_state.parser = PDFParser(debug=True)
|
| 106 |
+
st.success("✅ API Key set & systems initialized")
|
| 107 |
except Exception as e:
|
| 108 |
+
st.error(f"Error initializing systems: {e}")
|
| 109 |
else:
|
| 110 |
st.session_state.api_key_set = False
|
| 111 |
+
st.warning("⚠️ Please enter your API key to continue")
|
| 112 |
|
| 113 |
st.divider()
|
| 114 |
|
| 115 |
+
# Vector Store Status
|
| 116 |
+
st.subheader("📊 Vector Store Status")
|
| 117 |
if st.session_state.vector_store:
|
| 118 |
try:
|
| 119 |
info = st.session_state.vector_store.get_collection_info()
|
| 120 |
+
st.metric("Items in Store", info['count'])
|
| 121 |
+
st.metric("Status", info['status'])
|
| 122 |
+
st.caption(f"Path: {info['persist_path']}")
|
| 123 |
except Exception as e:
|
| 124 |
+
st.error(f"Error getting store info: {e}")
|
| 125 |
else:
|
| 126 |
+
st.info("Set API key to initialize vector store")
|
| 127 |
|
| 128 |
st.divider()
|
| 129 |
|
| 130 |
+
# Document Management
|
| 131 |
+
st.subheader("📁 Document Management")
|
| 132 |
+
if st.button("🔄 Clear Vector Store"):
|
| 133 |
if st.session_state.vector_store:
|
| 134 |
try:
|
| 135 |
st.session_state.vector_store.clear_all()
|
| 136 |
+
st.success("✅ Vector store cleared")
|
| 137 |
except Exception as e:
|
| 138 |
+
st.error(f"Error clearing store: {e}")
|
| 139 |
+
|
| 140 |
|
| 141 |
+
# ============================================================================
|
| 142 |
+
# MAIN CONTENT
|
| 143 |
+
# ============================================================================
|
| 144 |
+
|
| 145 |
+
# Upload Section
|
| 146 |
+
st.header("📤 Upload PDF Document")
|
| 147 |
|
| 148 |
uploaded_file = st.file_uploader(
|
| 149 |
+
"Choose a PDF file",
|
| 150 |
type=['pdf'],
|
| 151 |
+
help="PDF with text, images, and tables"
|
| 152 |
)
|
| 153 |
|
| 154 |
if uploaded_file is not None:
|
| 155 |
+
# Save uploaded file
|
| 156 |
upload_path = Path(UPLOAD_FOLDER)
|
| 157 |
upload_path.mkdir(exist_ok=True)
|
| 158 |
+
|
| 159 |
file_path = upload_path / uploaded_file.name
|
| 160 |
with open(file_path, 'wb') as f:
|
| 161 |
f.write(uploaded_file.getbuffer())
|
|
|
|
| 162 |
|
| 163 |
+
st.success(f"✅ File saved: {uploaded_file.name}")
|
| 164 |
+
|
| 165 |
+
# Parse PDF
|
| 166 |
+
if st.button("🔍 Parse PDF"):
|
| 167 |
if not st.session_state.api_key_set:
|
| 168 |
+
st.error("❌ Please set OpenAI API key first")
|
| 169 |
else:
|
| 170 |
try:
|
| 171 |
+
with st.spinner("📄 Parsing PDF..."):
|
| 172 |
+
print(f"\n{'='*70}")
|
| 173 |
+
print(f"PARSING: {uploaded_file.name}")
|
| 174 |
+
print(f"{'='*70}")
|
| 175 |
+
|
| 176 |
+
# Parse PDF - returns text, images, tables
|
| 177 |
parser = st.session_state.parser
|
| 178 |
text, images, tables = parser.parse_pdf(str(file_path))
|
| 179 |
|
| 180 |
+
# Store in session state
|
| 181 |
st.session_state.current_document = uploaded_file.name
|
| 182 |
st.session_state.current_text = text
|
| 183 |
st.session_state.current_images = images
|
| 184 |
st.session_state.current_tables = tables
|
| 185 |
|
| 186 |
+
# Display results
|
| 187 |
col1, col2, col3 = st.columns(3)
|
| 188 |
with col1:
|
| 189 |
+
st.metric("📝 Text", f"{len(text):,} chars")
|
| 190 |
with col2:
|
| 191 |
+
st.metric("🖼️ Images", len(images))
|
| 192 |
with col3:
|
| 193 |
+
st.metric("📋 Tables", len(tables))
|
| 194 |
|
| 195 |
+
# Show image OCR details
|
| 196 |
if images:
|
| 197 |
+
st.subheader("🖼️ Extracted Images")
|
| 198 |
for idx, img in enumerate(images):
|
| 199 |
ocr_text = img.get('ocr_text', '')
|
| 200 |
ocr_len = len(ocr_text)
|
| 201 |
+
|
| 202 |
if ocr_len > 0:
|
| 203 |
+
st.success(f"✅ Image {idx}: {ocr_len} characters (OCR)")
|
| 204 |
else:
|
| 205 |
+
st.warning(f"⚠️ Image {idx}: No OCR text (will use visual analysis)")
|
| 206 |
+
|
| 207 |
+
st.success("✅ PDF parsing complete!")
|
| 208 |
|
|
|
|
| 209 |
except Exception as e:
|
| 210 |
+
st.error(f"❌ Error parsing PDF: {e}")
|
| 211 |
+
print(f"Error: {e}")
|
| 212 |
|
|
|
|
| 213 |
|
| 214 |
+
# ============================================================================
|
| 215 |
+
# VISUAL IMAGE ANALYSIS & COMPONENT STORAGE
|
| 216 |
+
# ============================================================================
|
| 217 |
|
| 218 |
+
st.divider()
|
| 219 |
+
st.header("🖼️ Visual Analysis & Storage")
|
| 220 |
+
|
| 221 |
+
st.info("""
|
| 222 |
+
**How it works:**
|
| 223 |
+
1. Images are sent to gpt-4o for visual analysis (not just text OCR)
|
| 224 |
+
2. Text is split into chunks and each chunk is summarized
|
| 225 |
+
3. Tables are analyzed individually
|
| 226 |
+
4. ALL summaries are stored in the vector store for semantic search
|
| 227 |
+
""")
|
| 228 |
|
| 229 |
+
if st.button("🖼️ Analyze Images Visually & Store Components"):
|
| 230 |
if not st.session_state.api_key_set:
|
| 231 |
+
st.error("❌ Please set OpenAI API key first")
|
| 232 |
elif st.session_state.current_text is None:
|
| 233 |
+
st.error("❌ Please parse a PDF document first")
|
| 234 |
else:
|
| 235 |
try:
|
| 236 |
+
with st.spinner("🖼️ Analyzing images visually with gpt-4o..."):
|
| 237 |
+
print(f"\n{'='*70}")
|
| 238 |
+
print(f"VISUAL IMAGE ANALYSIS")
|
| 239 |
+
print(f"{'='*70}")
|
| 240 |
+
|
| 241 |
+
# Process with visual analysis
|
| 242 |
visual_rag = st.session_state.visual_rag_system
|
| 243 |
vector_store = st.session_state.vector_store
|
| 244 |
+
|
| 245 |
results = visual_rag.process_and_store_document(
|
| 246 |
text=st.session_state.current_text,
|
| 247 |
+
images=st.session_state.current_images, # Actual images sent to gpt-4o
|
| 248 |
tables=st.session_state.current_tables,
|
| 249 |
vector_store=vector_store,
|
| 250 |
doc_id=st.session_state.current_document or "current_doc"
|
|
|
|
| 252 |
|
| 253 |
st.session_state.processing_results = results
|
| 254 |
|
| 255 |
+
# Display results
|
| 256 |
+
st.success("✅ Visual analysis complete & stored!")
|
| 257 |
|
| 258 |
col1, col2, col3 = st.columns(3)
|
| 259 |
with col1:
|
| 260 |
+
st.metric("🖼️ Images Analyzed", len(results['image_visual_analyses']))
|
| 261 |
with col2:
|
| 262 |
+
st.metric("📝 Text Chunks", len(results['text_summaries']))
|
| 263 |
with col3:
|
| 264 |
+
st.metric("📋 Tables Analyzed", len(results['table_summaries']))
|
| 265 |
|
| 266 |
+
st.metric("📊 Total Stored in Vector", results['total_stored'])
|
| 267 |
+
|
| 268 |
+
# Show image visual analyses
|
| 269 |
+
if results['image_visual_analyses']:
|
| 270 |
+
st.subheader("🖼️ Visual Image Analyses (gpt-4o)")
|
| 271 |
+
for img_analysis in results['image_visual_analyses']:
|
| 272 |
+
with st.expander(f"Image {img_analysis['image_index']} - Visual Analysis"):
|
| 273 |
+
st.write("**Visual Analysis by gpt-4o:**")
|
| 274 |
+
st.write(img_analysis['visual_analysis'])
|
| 275 |
+
|
| 276 |
+
st.write("**Image Path:**")
|
| 277 |
+
st.code(img_analysis['image_path'])
|
| 278 |
+
|
| 279 |
+
if img_analysis['ocr_text']:
|
| 280 |
+
st.write("**OCR Text (backup):**")
|
| 281 |
+
st.text(img_analysis['ocr_text'][:500])
|
| 282 |
+
|
| 283 |
+
# Show text chunk summaries
|
| 284 |
+
if results['text_summaries']:
|
| 285 |
+
st.subheader("📝 Text Chunk Summaries")
|
| 286 |
+
for chunk_summary in results['text_summaries']:
|
| 287 |
+
with st.expander(
|
| 288 |
+
f"Chunk {chunk_summary['chunk_index']} "
|
| 289 |
+
f"({chunk_summary['chunk_length']} chars)"
|
| 290 |
+
):
|
| 291 |
+
st.write("**Summary:**")
|
| 292 |
+
st.write(chunk_summary['summary'])
|
| 293 |
+
st.write("**Original Text (first 500 chars):**")
|
| 294 |
+
st.text(chunk_summary['original_text'])
|
| 295 |
+
|
| 296 |
+
# Show table analyses
|
| 297 |
+
if results['table_summaries']:
|
| 298 |
+
st.subheader("📋 Table Analyses")
|
| 299 |
+
for table_summary in results['table_summaries']:
|
| 300 |
+
with st.expander(
|
| 301 |
+
f"Table {table_summary['table_index']} "
|
| 302 |
+
f"({table_summary['table_length']} chars)"
|
| 303 |
+
):
|
| 304 |
+
st.write("**Analysis:**")
|
| 305 |
+
st.write(table_summary['summary'])
|
| 306 |
+
st.write("**Original Content (first 500 chars):**")
|
| 307 |
+
st.text(table_summary['original_content'])
|
| 308 |
+
|
| 309 |
+
print(f"\n✅ Visual analysis processing complete!")
|
| 310 |
|
| 311 |
except Exception as e:
|
| 312 |
+
st.error(f"❌ Error during visual analysis: {e}")
|
| 313 |
+
print(f"Error: {e}")
|
| 314 |
+
|
| 315 |
+
|
| 316 |
+
# ============================================================================
|
| 317 |
+
# QUESTION & ANSWERING
|
| 318 |
+
# ============================================================================
|
| 319 |
|
| 320 |
st.divider()
|
| 321 |
+
st.header("❓ Ask Questions About Document")
|
| 322 |
|
| 323 |
+
# Initialize answering system if not done
|
| 324 |
+
if 'answering_rag' not in st.session_state:
|
| 325 |
+
st.session_state.answering_rag = None
|
| 326 |
|
| 327 |
+
# Create answering system when API key is set
|
| 328 |
if st.session_state.api_key_set and st.session_state.answering_rag is None:
|
| 329 |
+
from rag_system import AnsweringRAG
|
| 330 |
st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
|
| 331 |
|
| 332 |
question = st.text_area(
|
| 333 |
+
"Enter your question:",
|
| 334 |
height=100,
|
| 335 |
+
placeholder="What does the document say about...?"
|
| 336 |
)
|
| 337 |
|
| 338 |
+
if st.button("🔍 Search & Generate Answer"):
|
| 339 |
if not st.session_state.api_key_set:
|
| 340 |
+
st.error("❌ Please set OpenAI API key first")
|
| 341 |
elif st.session_state.current_text is None:
|
| 342 |
+
st.error("❌ Please parse a PDF document first")
|
| 343 |
elif not question:
|
| 344 |
+
st.error("❌ Please enter a question")
|
| 345 |
else:
|
| 346 |
try:
|
| 347 |
+
with st.spinner("🔄 Searching document and analyzing..."):
|
| 348 |
+
print(f"\n{'='*70}")
|
| 349 |
+
print(f"QUESTION: {question}")
|
| 350 |
+
print(f"{'='*70}")
|
| 351 |
+
|
| 352 |
+
# Search vector store
|
| 353 |
store = st.session_state.vector_store
|
| 354 |
|
| 355 |
+
# Add documents to store if needed
|
| 356 |
doc_name = st.session_state.current_document or "current_doc"
|
| 357 |
doc_data = {
|
| 358 |
'text': st.session_state.current_text,
|
| 359 |
'images': [],
|
| 360 |
'tables': []
|
| 361 |
}
|
|
|
|
| 362 |
store.add_documents(doc_data, doc_name)
|
| 363 |
|
| 364 |
+
# Search for relevant results
|
| 365 |
search_results = store.search(question, n_results=5)
|
| 366 |
|
| 367 |
+
print(f"\n📊 Search Results Found: {len(search_results)}")
|
| 368 |
+
|
| 369 |
+
# Analyze results and generate answer
|
| 370 |
answering_rag = st.session_state.answering_rag
|
| 371 |
result = answering_rag.analyze_and_answer(question, search_results)
|
| 372 |
|
| 373 |
+
# Display answer prominently
|
| 374 |
+
st.success("✅ Analysis complete!")
|
| 375 |
|
| 376 |
+
st.subheader("📝 Answer")
|
| 377 |
|
| 378 |
+
# Show confidence level
|
| 379 |
col1, col2, col3 = st.columns(3)
|
| 380 |
with col1:
|
| 381 |
+
confidence_color = {
|
| 382 |
+
'high': '🟢',
|
| 383 |
+
'medium': '🟡',
|
| 384 |
+
'low': '🔴'
|
| 385 |
+
}.get(result['confidence'], '⚪')
|
| 386 |
+
st.metric("Confidence", f"{confidence_color} {result['confidence'].upper()}")
|
|
|
|
| 387 |
with col2:
|
| 388 |
+
st.metric("Sources Used", result['sources_used'])
|
| 389 |
with col3:
|
| 390 |
if result['sources_used'] > 0:
|
| 391 |
+
st.metric("Avg Relevance", f"{sum(1-r.get('distance',0) for r in search_results)/len(search_results):.0%}")
|
| 392 |
|
| 393 |
+
# Display the generated answer
|
| 394 |
st.write(result['answer'])
|
| 395 |
|
| 396 |
+
# Show sources
|
| 397 |
+
if st.checkbox("📚 Show Source Documents"):
|
| 398 |
+
st.subheader("Sources Used in Answer")
|
| 399 |
+
for idx, source in enumerate(result['formatted_sources'], 1):
|
| 400 |
relevance = source['relevance']
|
| 401 |
relevance_bar = "█" * int(relevance * 10) + "░" * (10 - int(relevance * 10))
|
| 402 |
+
|
| 403 |
with st.expander(
|
| 404 |
+
f"Source {idx} - {source['type'].upper()} "
|
| 405 |
f"[{relevance_bar}] {relevance:.0%}"
|
| 406 |
):
|
| 407 |
st.write(source['content'])
|
| 408 |
+
|
| 409 |
+
print(f"\n✅ Answer generation complete!")
|
| 410 |
+
|
| 411 |
except Exception as e:
|
| 412 |
+
st.error(f"❌ Error processing question: {e}")
|
| 413 |
+
print(f"Error: {e}")
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
# ============================================================================
|
| 417 |
+
# FOOTER
|
| 418 |
+
# ============================================================================
|
| 419 |
|
| 420 |
st.divider()
|
| 421 |
|
| 422 |
+
col1, col2, col3 = st.columns(3)
|
| 423 |
+
|
| 424 |
+
with col1:
|
| 425 |
+
st.info("📖 **Text Processing**: PyPDF2 extraction with UTF-8 support")
|
| 426 |
+
|
| 427 |
+
with col2:
|
| 428 |
+
st.info("🖼️ **Visual Analysis**: GPT-4o vision for image understanding")
|
| 429 |
+
|
| 430 |
+
with col3:
|
| 431 |
+
st.info("📊 **Vector Storage**: ChromaDB with auto-persist")
|
| 432 |
+
|
| 433 |
st.caption(
|
| 434 |
+
"Multimodal RAG System | "
|
| 435 |
+
"Visual Image Analysis | "
|
| 436 |
+
"Russian Language Support | "
|
| 437 |
+
"Individual Component Summarization"
|
| 438 |
)
|
src/config.py
CHANGED
|
@@ -1,31 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
from pathlib import Path
|
| 3 |
|
|
|
|
| 4 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 5 |
-
OPENAI_MODEL = "gpt-4o-mini"
|
| 6 |
-
USE_CACHE = True
|
| 7 |
|
|
|
|
| 8 |
CHROMA_DB_PATH = "./chroma_db"
|
| 9 |
DOCSTORE_PATH = "./docstore"
|
| 10 |
PROCESSED_FILES_LOG = "./processed_files.txt"
|
| 11 |
|
|
|
|
| 12 |
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
|
| 13 |
EMBEDDING_DIM = 768
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
| 19 |
|
|
|
|
| 20 |
LANGUAGE = "russian"
|
| 21 |
|
|
|
|
| 22 |
Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
|
| 23 |
Path(DOCSTORE_PATH).mkdir(exist_ok=True)
|
| 24 |
|
|
|
|
| 25 |
UPLOAD_FOLDER = "./uploaded_pdfs"
|
| 26 |
Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
|
| 27 |
MAX_PDF_SIZE_MB = 50
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Configuration file for Multimodal RAG LLM System
|
| 3 |
+
"""
|
| 4 |
import os
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
+
# API Configuration
|
| 8 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 9 |
+
OPENAI_MODEL = "gpt-4o-mini" # Cheaper model variant
|
| 10 |
+
USE_CACHE = True # Enable response caching
|
| 11 |
|
| 12 |
+
# Vector Store Configuration
|
| 13 |
CHROMA_DB_PATH = "./chroma_db"
|
| 14 |
DOCSTORE_PATH = "./docstore"
|
| 15 |
PROCESSED_FILES_LOG = "./processed_files.txt"
|
| 16 |
|
| 17 |
+
# Embedding Model Configuration
|
| 18 |
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
|
| 19 |
EMBEDDING_DIM = 768
|
| 20 |
|
| 21 |
+
# System Configuration
|
| 22 |
+
MAX_CHUNK_SIZE = 500 # Smaller chunks = fewer tokens
|
| 23 |
+
CHUNK_OVERLAP = 50 # Less overlap = fewer chunks
|
| 24 |
+
TEMPERATURE = 0.3 # Lower = faster, cheaper
|
| 25 |
+
MAX_TOKENS = 500 # Limit response size (vs 1500)
|
| 26 |
|
| 27 |
+
# Language Support
|
| 28 |
LANGUAGE = "russian"
|
| 29 |
|
| 30 |
+
# Create necessary directories
|
| 31 |
Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
|
| 32 |
Path(DOCSTORE_PATH).mkdir(exist_ok=True)
|
| 33 |
|
| 34 |
+
# PDF Upload Configuration
|
| 35 |
UPLOAD_FOLDER = "./uploaded_pdfs"
|
| 36 |
Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
|
| 37 |
MAX_PDF_SIZE_MB = 50
|
| 38 |
|
| 39 |
+
# TOKEN OPTIMIZATION SETTINGS
|
| 40 |
+
BATCH_SEARCH_RESULTS = 3 # Return only top 3 (not 5)
|
| 41 |
+
CACHE_RESPONSES = True # Cache Q&A responses
|
| 42 |
+
SUMMARIZE_FIRST = True # Summarize PDFs once, not per query
|
src/pdf_parser.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import hashlib
|
|
@@ -17,21 +20,29 @@ class PDFParser:
|
|
| 17 |
self.processed_files = self._load_processed_files()
|
| 18 |
self.debug = debug
|
| 19 |
|
|
|
|
| 20 |
self._configure_tesseract()
|
| 21 |
|
| 22 |
if self.debug:
|
| 23 |
-
print("PDFParser initialized")
|
| 24 |
|
| 25 |
def _configure_tesseract(self):
|
|
|
|
| 26 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
pytesseract.get_tesseract_version()
|
| 28 |
-
print("Tesseract configured successfully")
|
| 29 |
except Exception as e:
|
| 30 |
-
print(f"Tesseract configuration warning: {e}")
|
| 31 |
|
| 32 |
def _debug_print(self, label: str, data: any):
|
|
|
|
| 33 |
if self.debug:
|
| 34 |
-
print(f"[PDF Parser] {label}")
|
| 35 |
if isinstance(data, dict):
|
| 36 |
for key, val in data.items():
|
| 37 |
print(f" {key}: {val}")
|
|
@@ -43,6 +54,7 @@ class PDFParser:
|
|
| 43 |
print(f" {data}")
|
| 44 |
|
| 45 |
def _load_processed_files(self) -> Dict[str, str]:
|
|
|
|
| 46 |
if os.path.exists(PROCESSED_FILES_LOG):
|
| 47 |
try:
|
| 48 |
with open(PROCESSED_FILES_LOG, 'r') as f:
|
|
@@ -52,10 +64,12 @@ class PDFParser:
|
|
| 52 |
return {}
|
| 53 |
|
| 54 |
def _save_processed_files(self):
|
|
|
|
| 55 |
with open(PROCESSED_FILES_LOG, 'w') as f:
|
| 56 |
json.dump(self.processed_files, f, indent=2)
|
| 57 |
|
| 58 |
def _get_file_hash(self, file_path: str) -> str:
|
|
|
|
| 59 |
hash_md5 = hashlib.md5()
|
| 60 |
with open(file_path, "rb") as f:
|
| 61 |
for chunk in iter(lambda: f.read(4096), b""):
|
|
@@ -63,6 +77,7 @@ class PDFParser:
|
|
| 63 |
return hash_md5.hexdigest()
|
| 64 |
|
| 65 |
def _extract_text_from_pdf(self, pdf_path: str) -> str:
|
|
|
|
| 66 |
text = ""
|
| 67 |
try:
|
| 68 |
with open(pdf_path, 'rb') as file:
|
|
@@ -81,31 +96,36 @@ class PDFParser:
|
|
| 81 |
return text
|
| 82 |
|
| 83 |
def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
|
|
|
| 84 |
images_data = []
|
| 85 |
try:
|
| 86 |
self._debug_print("Image Extraction Started", f"File: {pdf_path}")
|
| 87 |
|
| 88 |
images = convert_from_path(pdf_path, dpi=150)
|
| 89 |
-
self._debug_print("PDF to Images", f"Total images: {len(images)}")
|
| 90 |
|
| 91 |
for idx, image in enumerate(images):
|
| 92 |
self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
|
| 93 |
|
|
|
|
| 94 |
image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
|
| 95 |
image.save(image_path)
|
| 96 |
self._debug_print(f"Image {idx} Saved", str(image_path))
|
| 97 |
|
| 98 |
-
|
|
|
|
| 99 |
|
| 100 |
try:
|
|
|
|
| 101 |
ocr_text = pytesseract.image_to_string(image, lang='rus')
|
| 102 |
|
|
|
|
| 103 |
ocr_text = ocr_text.strip()
|
| 104 |
|
| 105 |
if not ocr_text or len(ocr_text) < 5:
|
| 106 |
-
self._debug_print(f"Image {idx} OCR Result", f"EMPTY or very short ({len(ocr_text)} chars)")
|
| 107 |
else:
|
| 108 |
-
self._debug_print(f"Image {idx} OCR Result", f"Success - {len(ocr_text)} chars: {ocr_text[:150]}")
|
| 109 |
|
| 110 |
except Exception as ocr_error:
|
| 111 |
self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
|
|
@@ -124,6 +144,7 @@ class PDFParser:
|
|
| 124 |
return images_data
|
| 125 |
|
| 126 |
def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
|
|
|
| 127 |
tables_data = []
|
| 128 |
try:
|
| 129 |
text = self._extract_text_from_pdf(pdf_path)
|
|
@@ -156,22 +177,26 @@ class PDFParser:
|
|
| 156 |
return tables_data
|
| 157 |
|
| 158 |
def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
|
|
|
|
| 159 |
file_hash = self._get_file_hash(pdf_path)
|
| 160 |
doc_id = Path(pdf_path).stem
|
| 161 |
|
| 162 |
-
self._debug_print("PDF Parsing Started", f"File: {doc_id}")
|
| 163 |
|
|
|
|
| 164 |
if doc_id in self.processed_files:
|
| 165 |
if self.processed_files[doc_id] == file_hash:
|
| 166 |
-
self._debug_print("Status", f"File {doc_id} already processed")
|
| 167 |
return self._load_extracted_data(doc_id)
|
| 168 |
|
| 169 |
-
print(f"Processing PDF: {doc_id}")
|
| 170 |
|
|
|
|
| 171 |
text = self._extract_text_from_pdf(pdf_path)
|
| 172 |
images = self._extract_images_from_pdf(pdf_path, doc_id)
|
| 173 |
tables = self._extract_tables_from_pdf(pdf_path, doc_id)
|
| 174 |
|
|
|
|
| 175 |
self._debug_print("Extraction Summary", {
|
| 176 |
'text_length': len(text),
|
| 177 |
'images_count': len(images),
|
|
@@ -179,14 +204,17 @@ class PDFParser:
|
|
| 179 |
'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
|
| 180 |
})
|
| 181 |
|
|
|
|
| 182 |
self._save_extracted_data(doc_id, text, images, tables)
|
| 183 |
|
|
|
|
| 184 |
self.processed_files[doc_id] = file_hash
|
| 185 |
self._save_processed_files()
|
| 186 |
|
| 187 |
return text, images, tables
|
| 188 |
|
| 189 |
def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
|
|
|
|
| 190 |
data = {
|
| 191 |
'text': text,
|
| 192 |
'images': images,
|
|
@@ -199,6 +227,7 @@ class PDFParser:
|
|
| 199 |
self._debug_print("Data Saved", str(data_path))
|
| 200 |
|
| 201 |
def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
|
|
|
|
| 202 |
data_path = self.docstore_path / f"{doc_id}_data.json"
|
| 203 |
try:
|
| 204 |
with open(data_path, 'r', encoding='utf-8') as f:
|
|
@@ -208,6 +237,7 @@ class PDFParser:
|
|
| 208 |
return "", [], []
|
| 209 |
|
| 210 |
def get_all_documents(self) -> Dict:
|
|
|
|
| 211 |
all_docs = {}
|
| 212 |
for json_file in self.docstore_path.glob("*_data.json"):
|
| 213 |
doc_id = json_file.stem.replace("_data", "")
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PDF Parser Module with FIXED Russian OCR support
|
| 3 |
+
"""
|
| 4 |
import os
|
| 5 |
import json
|
| 6 |
import hashlib
|
|
|
|
| 20 |
self.processed_files = self._load_processed_files()
|
| 21 |
self.debug = debug
|
| 22 |
|
| 23 |
+
# Configure Tesseract for Russian + English
|
| 24 |
self._configure_tesseract()
|
| 25 |
|
| 26 |
if self.debug:
|
| 27 |
+
print("✅ PDFParser initialized with Russian OCR support")
|
| 28 |
|
| 29 |
def _configure_tesseract(self):
|
| 30 |
+
"""Configure Tesseract with proper paths and language support"""
|
| 31 |
try:
|
| 32 |
+
# Windows specific path
|
| 33 |
+
if os.name == 'nt':
|
| 34 |
+
pytesseract.pytesseract.pytesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
|
| 35 |
+
|
| 36 |
+
# Test Tesseract
|
| 37 |
pytesseract.get_tesseract_version()
|
| 38 |
+
print("✅ Tesseract configured successfully")
|
| 39 |
except Exception as e:
|
| 40 |
+
print(f"⚠️ Tesseract configuration warning: {e}")
|
| 41 |
|
| 42 |
def _debug_print(self, label: str, data: any):
|
| 43 |
+
"""Print debug information"""
|
| 44 |
if self.debug:
|
| 45 |
+
print(f"\n🔍 [PDF Parser] {label}")
|
| 46 |
if isinstance(data, dict):
|
| 47 |
for key, val in data.items():
|
| 48 |
print(f" {key}: {val}")
|
|
|
|
| 54 |
print(f" {data}")
|
| 55 |
|
| 56 |
def _load_processed_files(self) -> Dict[str, str]:
|
| 57 |
+
"""Load list of already processed files with their hashes"""
|
| 58 |
if os.path.exists(PROCESSED_FILES_LOG):
|
| 59 |
try:
|
| 60 |
with open(PROCESSED_FILES_LOG, 'r') as f:
|
|
|
|
| 64 |
return {}
|
| 65 |
|
| 66 |
def _save_processed_files(self):
|
| 67 |
+
"""Save processed files list to disk"""
|
| 68 |
with open(PROCESSED_FILES_LOG, 'w') as f:
|
| 69 |
json.dump(self.processed_files, f, indent=2)
|
| 70 |
|
| 71 |
def _get_file_hash(self, file_path: str) -> str:
|
| 72 |
+
"""Generate hash of file to detect changes"""
|
| 73 |
hash_md5 = hashlib.md5()
|
| 74 |
with open(file_path, "rb") as f:
|
| 75 |
for chunk in iter(lambda: f.read(4096), b""):
|
|
|
|
| 77 |
return hash_md5.hexdigest()
|
| 78 |
|
| 79 |
def _extract_text_from_pdf(self, pdf_path: str) -> str:
|
| 80 |
+
"""Extract text from PDF using PyPDF2"""
|
| 81 |
text = ""
|
| 82 |
try:
|
| 83 |
with open(pdf_path, 'rb') as file:
|
|
|
|
| 96 |
return text
|
| 97 |
|
| 98 |
def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 99 |
+
"""Extract images from PDF pages with Russian OCR support"""
|
| 100 |
images_data = []
|
| 101 |
try:
|
| 102 |
self._debug_print("Image Extraction Started", f"File: {pdf_path}")
|
| 103 |
|
| 104 |
images = convert_from_path(pdf_path, dpi=150)
|
| 105 |
+
self._debug_print("PDF to Images Conversion", f"Total images: {len(images)}")
|
| 106 |
|
| 107 |
for idx, image in enumerate(images):
|
| 108 |
self._debug_print(f"Processing Image {idx}", f"Size: {image.size}")
|
| 109 |
|
| 110 |
+
# Save image
|
| 111 |
image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
|
| 112 |
image.save(image_path)
|
| 113 |
self._debug_print(f"Image {idx} Saved", str(image_path))
|
| 114 |
|
| 115 |
+
# Extract text using OCR with Russian support
|
| 116 |
+
self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR with Russian+English...")
|
| 117 |
|
| 118 |
try:
|
| 119 |
+
# CRITICAL: Use 'rus+eng' for Russian + English support
|
| 120 |
ocr_text = pytesseract.image_to_string(image, lang='rus')
|
| 121 |
|
| 122 |
+
# Clean up text
|
| 123 |
ocr_text = ocr_text.strip()
|
| 124 |
|
| 125 |
if not ocr_text or len(ocr_text) < 5:
|
| 126 |
+
self._debug_print(f"Image {idx} OCR Result", f"⚠️ EMPTY or very short ({len(ocr_text)} chars)")
|
| 127 |
else:
|
| 128 |
+
self._debug_print(f"Image {idx} OCR Result", f"✅ Success - {len(ocr_text)} chars: {ocr_text[:150]}")
|
| 129 |
|
| 130 |
except Exception as ocr_error:
|
| 131 |
self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
|
|
|
|
| 144 |
return images_data
|
| 145 |
|
| 146 |
def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 147 |
+
"""Extract table content from PDF"""
|
| 148 |
tables_data = []
|
| 149 |
try:
|
| 150 |
text = self._extract_text_from_pdf(pdf_path)
|
|
|
|
| 177 |
return tables_data
|
| 178 |
|
| 179 |
def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 180 |
+
"""Parse PDF and extract text, images, and tables with debug output"""
|
| 181 |
file_hash = self._get_file_hash(pdf_path)
|
| 182 |
doc_id = Path(pdf_path).stem
|
| 183 |
|
| 184 |
+
self._debug_print("PDF Parsing Started", f"File: {doc_id}, Hash: {file_hash}")
|
| 185 |
|
| 186 |
+
# Check if file was already processed
|
| 187 |
if doc_id in self.processed_files:
|
| 188 |
if self.processed_files[doc_id] == file_hash:
|
| 189 |
+
self._debug_print("Status", f"File {doc_id} already processed, loading from cache")
|
| 190 |
return self._load_extracted_data(doc_id)
|
| 191 |
|
| 192 |
+
print(f"\n📄 Processing PDF: {doc_id}")
|
| 193 |
|
| 194 |
+
# Extract content
|
| 195 |
text = self._extract_text_from_pdf(pdf_path)
|
| 196 |
images = self._extract_images_from_pdf(pdf_path, doc_id)
|
| 197 |
tables = self._extract_tables_from_pdf(pdf_path, doc_id)
|
| 198 |
|
| 199 |
+
# Summary
|
| 200 |
self._debug_print("Extraction Summary", {
|
| 201 |
'text_length': len(text),
|
| 202 |
'images_count': len(images),
|
|
|
|
| 204 |
'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
|
| 205 |
})
|
| 206 |
|
| 207 |
+
# Save extracted data
|
| 208 |
self._save_extracted_data(doc_id, text, images, tables)
|
| 209 |
|
| 210 |
+
# Update processed files log
|
| 211 |
self.processed_files[doc_id] = file_hash
|
| 212 |
self._save_processed_files()
|
| 213 |
|
| 214 |
return text, images, tables
|
| 215 |
|
| 216 |
def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
|
| 217 |
+
"""Save extracted data to docstore"""
|
| 218 |
data = {
|
| 219 |
'text': text,
|
| 220 |
'images': images,
|
|
|
|
| 227 |
self._debug_print("Data Saved", str(data_path))
|
| 228 |
|
| 229 |
def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 230 |
+
"""Load previously extracted data from docstore"""
|
| 231 |
data_path = self.docstore_path / f"{doc_id}_data.json"
|
| 232 |
try:
|
| 233 |
with open(data_path, 'r', encoding='utf-8') as f:
|
|
|
|
| 237 |
return "", [], []
|
| 238 |
|
| 239 |
def get_all_documents(self) -> Dict:
|
| 240 |
+
"""Load all processed documents from docstore"""
|
| 241 |
all_docs = {}
|
| 242 |
for json_file in self.docstore_path.glob("*_data.json"):
|
| 243 |
doc_id = json_file.stem.replace("_data", "")
|
src/rag_system.py
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from typing import List, Dict
|
| 2 |
from langchain_openai import ChatOpenAI
|
| 3 |
from langchain_core.messages import HumanMessage, SystemMessage
|
|
@@ -11,14 +16,21 @@ from config import (
|
|
| 11 |
|
| 12 |
|
| 13 |
class VisualMultimodalRAG:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 16 |
api_key = api_key or OPENAI_API_KEY
|
| 17 |
self.debug = debug
|
| 18 |
|
| 19 |
-
|
| 20 |
self.llm = ChatOpenAI(
|
| 21 |
-
model_name="gpt-4o-mini",
|
| 22 |
api_key=api_key,
|
| 23 |
temperature=TEMPERATURE,
|
| 24 |
max_tokens=MAX_TOKENS,
|
|
@@ -28,11 +40,12 @@ class VisualMultimodalRAG:
|
|
| 28 |
self.visual_summaries_log = []
|
| 29 |
|
| 30 |
if self.debug:
|
| 31 |
-
print("VisualMultimodalRAG initialized")
|
| 32 |
|
| 33 |
def _debug_print(self, label: str, data: any):
|
|
|
|
| 34 |
if self.debug:
|
| 35 |
-
print(f"DEBUG [{label}]:")
|
| 36 |
if isinstance(data, (list, dict)):
|
| 37 |
print(f" Type: {type(data).__name__}")
|
| 38 |
print(f" Content: {str(data)[:300]}...")
|
|
@@ -40,6 +53,7 @@ class VisualMultimodalRAG:
|
|
| 40 |
print(f" {data}")
|
| 41 |
|
| 42 |
def _image_to_base64(self, image_path: str) -> str:
|
|
|
|
| 43 |
try:
|
| 44 |
with open(image_path, 'rb') as image_file:
|
| 45 |
image_data = base64.b64encode(image_file.read()).decode('utf-8')
|
|
@@ -49,14 +63,28 @@ class VisualMultimodalRAG:
|
|
| 49 |
return None
|
| 50 |
|
| 51 |
def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
if not os.path.exists(image_path):
|
| 53 |
return f"[Image {image_idx}: File not found - {image_path}]"
|
| 54 |
|
| 55 |
try:
|
|
|
|
| 56 |
image_base64 = self._image_to_base64(image_path)
|
| 57 |
if not image_base64:
|
| 58 |
return f"[Image {image_idx}: Could not convert to base64]"
|
| 59 |
|
|
|
|
| 60 |
file_ext = Path(image_path).suffix.lower()
|
| 61 |
media_type_map = {
|
| 62 |
'.jpg': 'image/jpeg',
|
|
@@ -67,8 +95,9 @@ class VisualMultimodalRAG:
|
|
| 67 |
}
|
| 68 |
media_type = media_type_map.get(file_ext, 'image/png')
|
| 69 |
|
| 70 |
-
print(f"Analyzing image {image_idx}...")
|
| 71 |
|
|
|
|
| 72 |
message = HumanMessage(
|
| 73 |
content=[
|
| 74 |
{
|
|
@@ -79,44 +108,52 @@ class VisualMultimodalRAG:
|
|
| 79 |
},
|
| 80 |
{
|
| 81 |
"type": "text",
|
| 82 |
-
"text": f"""
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
| 90 |
Analysis:"""
|
| 91 |
}
|
| 92 |
],
|
| 93 |
)
|
| 94 |
|
|
|
|
| 95 |
response = self.llm.invoke([message])
|
| 96 |
analysis = response.content.strip()
|
| 97 |
|
| 98 |
if self.debug:
|
| 99 |
self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
|
| 100 |
|
| 101 |
-
print(f"Image {image_idx} analyzed successfully")
|
| 102 |
return analysis
|
| 103 |
|
| 104 |
except Exception as e:
|
| 105 |
error_msg = f"[Image {image_idx}: Vision analysis failed - {str(e)}]"
|
| 106 |
-
print(f"Error analyzing image {image_idx}: {e}")
|
| 107 |
return error_msg
|
| 108 |
|
| 109 |
def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
|
| 110 |
-
|
|
|
|
|
|
|
|
|
|
| 111 |
visual_analyses = []
|
| 112 |
|
| 113 |
for idx, image in enumerate(images):
|
| 114 |
image_path = image.get('path', '')
|
| 115 |
|
| 116 |
if not image_path:
|
| 117 |
-
print(f"Image {idx}: No path provided")
|
| 118 |
continue
|
| 119 |
|
|
|
|
| 120 |
visual_analysis = self.analyze_image_visually(image_path, idx)
|
| 121 |
|
| 122 |
visual_analyses.append({
|
|
@@ -124,12 +161,15 @@ Analysis:"""
|
|
| 124 |
'image_index': idx,
|
| 125 |
'image_path': image_path,
|
| 126 |
'visual_analysis': visual_analysis,
|
| 127 |
-
'ocr_text': image.get('ocr_text', '')
|
| 128 |
})
|
| 129 |
|
| 130 |
return visual_analyses
|
| 131 |
|
| 132 |
def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
|
|
|
|
|
|
|
|
|
|
| 133 |
chunks = []
|
| 134 |
text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
|
| 135 |
|
|
@@ -141,12 +181,12 @@ Analysis:"""
|
|
| 141 |
|
| 142 |
try:
|
| 143 |
prompt = f"""Summarize this text chunk in {self.language}.
|
| 144 |
-
|
| 145 |
|
| 146 |
Text Chunk:
|
| 147 |
{chunk}
|
| 148 |
|
| 149 |
-
Summary:"""
|
| 150 |
|
| 151 |
message = HumanMessage(content=prompt)
|
| 152 |
response = self.llm.invoke([message])
|
|
@@ -169,6 +209,9 @@ Summary:"""
|
|
| 169 |
return chunks
|
| 170 |
|
| 171 |
def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
|
|
|
|
|
|
|
|
|
|
| 172 |
summaries = []
|
| 173 |
|
| 174 |
for idx, table in enumerate(tables):
|
|
@@ -179,12 +222,12 @@ Summary:"""
|
|
| 179 |
|
| 180 |
try:
|
| 181 |
prompt = f"""Analyze and summarize this table/structured data in {self.language}.
|
| 182 |
-
Extract key insights, row/column meanings, and important figures.
|
| 183 |
|
| 184 |
Table Content:
|
| 185 |
{table_content}
|
| 186 |
|
| 187 |
-
Summary:"""
|
| 188 |
|
| 189 |
message = HumanMessage(content=prompt)
|
| 190 |
response = self.llm.invoke([message])
|
|
@@ -214,9 +257,13 @@ Summary:"""
|
|
| 214 |
vector_store,
|
| 215 |
doc_id: str
|
| 216 |
) -> Dict:
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
print(f"PROCESSING WITH VISUAL IMAGE ANALYSIS: {doc_id}")
|
| 219 |
-
|
| 220 |
|
| 221 |
results = {
|
| 222 |
'doc_id': doc_id,
|
|
@@ -226,13 +273,14 @@ Summary:"""
|
|
| 226 |
'total_stored': 0
|
| 227 |
}
|
| 228 |
|
| 229 |
-
|
| 230 |
-
print(f"VISUAL IMAGE ANALYSIS ({len(images)} total)")
|
| 231 |
-
|
| 232 |
|
| 233 |
image_analyses = self.analyze_images_visually(images)
|
| 234 |
results['image_visual_analyses'] = image_analyses
|
| 235 |
|
|
|
|
| 236 |
image_docs = {
|
| 237 |
'text': ' | '.join([
|
| 238 |
f"Image {a['image_index']}: {a['visual_analysis']}"
|
|
@@ -243,7 +291,7 @@ Summary:"""
|
|
| 243 |
}
|
| 244 |
|
| 245 |
for analysis in image_analyses:
|
| 246 |
-
print(f"
|
| 247 |
print(f" Path: {analysis['image_path']}")
|
| 248 |
print(f" Analysis: {analysis['visual_analysis'][:100]}...")
|
| 249 |
|
|
@@ -254,11 +302,13 @@ Summary:"""
|
|
| 254 |
f"{doc_id}_images_visual"
|
| 255 |
)
|
| 256 |
results['total_stored'] += len(image_analyses)
|
| 257 |
-
print(f" Stored {len(image_analyses)} image visual analyses")
|
| 258 |
except Exception as e:
|
| 259 |
-
print(f" Error storing image analyses: {e}")
|
| 260 |
|
| 261 |
-
|
|
|
|
|
|
|
| 262 |
|
| 263 |
text_summaries = self.summarize_text_chunks(text)
|
| 264 |
results['text_summaries'] = text_summaries
|
|
@@ -271,7 +321,7 @@ Summary:"""
|
|
| 271 |
}
|
| 272 |
|
| 273 |
for summary in text_summaries:
|
| 274 |
-
print(f" Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")
|
| 275 |
|
| 276 |
if text_summaries:
|
| 277 |
try:
|
|
@@ -280,11 +330,13 @@ Summary:"""
|
|
| 280 |
f"{doc_id}_text_chunks"
|
| 281 |
)
|
| 282 |
results['total_stored'] += len(text_summaries)
|
| 283 |
-
print(f" Stored {len(text_summaries)} text chunk summaries")
|
| 284 |
except Exception as e:
|
| 285 |
-
print(f" Error storing text summaries: {e}")
|
| 286 |
|
| 287 |
-
|
|
|
|
|
|
|
| 288 |
|
| 289 |
table_summaries = self.summarize_tables(tables)
|
| 290 |
results['table_summaries'] = table_summaries
|
|
@@ -297,7 +349,7 @@ Summary:"""
|
|
| 297 |
}
|
| 298 |
|
| 299 |
for summary in table_summaries:
|
| 300 |
-
print(f"
|
| 301 |
|
| 302 |
if table_summaries:
|
| 303 |
try:
|
|
@@ -306,15 +358,19 @@ Summary:"""
|
|
| 306 |
f"{doc_id}_tables"
|
| 307 |
)
|
| 308 |
results['total_stored'] += len(table_summaries)
|
| 309 |
-
print(f" Stored {len(table_summaries)} table summaries")
|
| 310 |
except Exception as e:
|
| 311 |
-
print(f" Error storing table summaries: {e}")
|
| 312 |
|
| 313 |
-
|
|
|
|
|
|
|
|
|
|
| 314 |
print(f" Images analyzed visually & stored: {len(image_analyses)}")
|
| 315 |
print(f" Text chunks summarized & stored: {len(text_summaries)}")
|
| 316 |
print(f" Tables summarized & stored: {len(table_summaries)}")
|
| 317 |
print(f" Total items stored in vector: {results['total_stored']}")
|
|
|
|
| 318 |
|
| 319 |
self.visual_summaries_log.append(results)
|
| 320 |
return results
|
|
@@ -335,13 +391,19 @@ Summary:"""
|
|
| 335 |
|
| 336 |
|
| 337 |
class AnsweringRAG:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
|
| 339 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 340 |
api_key = api_key or OPENAI_API_KEY
|
| 341 |
self.debug = debug
|
| 342 |
|
| 343 |
self.llm = ChatOpenAI(
|
| 344 |
-
model_name="gpt-4o-mini",
|
| 345 |
api_key=api_key,
|
| 346 |
temperature=TEMPERATURE,
|
| 347 |
max_tokens=MAX_TOKENS,
|
|
@@ -351,11 +413,12 @@ class AnsweringRAG:
|
|
| 351 |
self.answer_log = []
|
| 352 |
|
| 353 |
if self.debug:
|
| 354 |
-
print("AnsweringRAG initialized ")
|
| 355 |
|
| 356 |
def _debug_print(self, label: str, data: any):
|
|
|
|
| 357 |
if self.debug:
|
| 358 |
-
print(f" DEBUG [{label}]:")
|
| 359 |
if isinstance(data, (list, dict)):
|
| 360 |
print(f" Type: {type(data).__name__}")
|
| 361 |
print(f" Content: {str(data)[:300]}...")
|
|
@@ -367,17 +430,35 @@ class AnsweringRAG:
|
|
| 367 |
question: str,
|
| 368 |
search_results: List[Dict]
|
| 369 |
) -> Dict:
|
| 370 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
|
|
|
|
| 372 |
print(f"ANALYZING QUESTION & GENERATING ANSWER")
|
|
|
|
| 373 |
|
| 374 |
-
print(f"Question: {question}")
|
| 375 |
-
print(f"Search Results Found: {len(search_results)}")
|
| 376 |
|
|
|
|
| 377 |
if not search_results:
|
| 378 |
-
print(f"No search results found!")
|
| 379 |
-
answer = f"""
|
| 380 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 381 |
|
| 382 |
result = {
|
| 383 |
'question': question,
|
|
@@ -389,6 +470,7 @@ class AnsweringRAG:
|
|
| 389 |
self.answer_log.append(result)
|
| 390 |
return result
|
| 391 |
|
|
|
|
| 392 |
context_parts = []
|
| 393 |
for idx, result in enumerate(search_results, 1):
|
| 394 |
content = result.get('content', '')
|
|
@@ -405,6 +487,7 @@ class AnsweringRAG:
|
|
| 405 |
|
| 406 |
self._debug_print("Context Prepared", f"{len(context_parts)} sources, {len(full_context)} chars")
|
| 407 |
|
|
|
|
| 408 |
analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.
|
| 409 |
|
| 410 |
USER QUESTION:
|
|
@@ -420,20 +503,23 @@ INSTRUCTIONS:
|
|
| 420 |
4. If the content doesn't fully answer the question, explain what information is available
|
| 421 |
5. Be specific and cite the content when relevant
|
| 422 |
6. Structure your answer clearly with key points
|
|
|
|
| 423 |
ANSWER:"""
|
| 424 |
|
| 425 |
-
print(f"Analyzing search results...")
|
| 426 |
print(f" Context size: {len(full_context)} characters")
|
| 427 |
print(f" Sources: {len(search_results)}")
|
| 428 |
|
| 429 |
try:
|
|
|
|
| 430 |
message = HumanMessage(content=analysis_prompt)
|
| 431 |
response = self.llm.invoke([message])
|
| 432 |
answer = response.content.strip()
|
| 433 |
|
|
|
|
| 434 |
confidence = self._estimate_confidence(len(search_results), answer)
|
| 435 |
|
| 436 |
-
print(f" Answer generated successfully")
|
| 437 |
print(f" Confidence: {confidence}")
|
| 438 |
print(f" Answer length: {len(answer)} characters")
|
| 439 |
|
|
@@ -449,7 +535,7 @@ ANSWER:"""
|
|
| 449 |
return result
|
| 450 |
|
| 451 |
except Exception as e:
|
| 452 |
-
print(f" Error generating answer: {e}")
|
| 453 |
answer = f"I encountered an error while analyzing the search results. Please try again."
|
| 454 |
|
| 455 |
result = {
|
|
@@ -465,14 +551,18 @@ ANSWER:"""
|
|
| 465 |
return result
|
| 466 |
|
| 467 |
def _estimate_confidence(self, sources_count: int, answer: str) -> str:
|
|
|
|
| 468 |
answer_length = len(answer)
|
| 469 |
|
|
|
|
| 470 |
if sources_count >= 3 and answer_length > 500:
|
| 471 |
return "high"
|
| 472 |
|
|
|
|
| 473 |
elif sources_count >= 2 and answer_length > 200:
|
| 474 |
return "medium"
|
| 475 |
|
|
|
|
| 476 |
else:
|
| 477 |
return "low"
|
| 478 |
|
|
@@ -481,9 +571,14 @@ ANSWER:"""
|
|
| 481 |
question: str,
|
| 482 |
search_results: List[Dict]
|
| 483 |
) -> Dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 484 |
|
| 485 |
result = self.analyze_and_answer(question, search_results)
|
| 486 |
|
|
|
|
| 487 |
formatted_sources = []
|
| 488 |
for idx, source in enumerate(result['search_results'], 1):
|
| 489 |
formatted_sources.append({
|
|
@@ -497,18 +592,25 @@ ANSWER:"""
|
|
| 497 |
return result
|
| 498 |
|
| 499 |
def get_answer_log(self) -> List[Dict]:
|
|
|
|
| 500 |
return self.answer_log
|
| 501 |
|
| 502 |
def print_answer_with_sources(self, result: Dict, max_source_length: int = 300):
|
|
|
|
| 503 |
|
|
|
|
| 504 |
print(f"ANSWER TO: {result['question']}")
|
|
|
|
| 505 |
|
| 506 |
-
print(f"ANSWER (Confidence: {result['confidence'].upper()}):")
|
|
|
|
| 507 |
print(result['answer'])
|
|
|
|
| 508 |
|
| 509 |
if result.get('formatted_sources'):
|
| 510 |
-
print(f"SOURCES USED ({len(result['formatted_sources'])} total):")
|
| 511 |
for source in result['formatted_sources']:
|
| 512 |
print(f"\n[Source {source['index']} - {source['type'].upper()} ({source['relevance']:.0%} relevant)]")
|
| 513 |
print(f"{source['content'][:max_source_length]}...")
|
| 514 |
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Enhanced RAG System - Visual Image Analysis
|
| 3 |
+
Sends base64 images directly to GPT-4o for visual analysis (not just OCR)
|
| 4 |
+
Then stores results in vector store
|
| 5 |
+
"""
|
| 6 |
from typing import List, Dict
|
| 7 |
from langchain_openai import ChatOpenAI
|
| 8 |
from langchain_core.messages import HumanMessage, SystemMessage
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
class VisualMultimodalRAG:
|
| 19 |
+
"""
|
| 20 |
+
RAG system that:
|
| 21 |
+
1. Sends images as base64 to GPT-4o for visual analysis
|
| 22 |
+
2. Gets detailed visual descriptions and insights
|
| 23 |
+
3. Stores visual analysis in vector store
|
| 24 |
+
4. Enables image-based semantic search
|
| 25 |
+
"""
|
| 26 |
|
| 27 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 28 |
api_key = api_key or OPENAI_API_KEY
|
| 29 |
self.debug = debug
|
| 30 |
|
| 31 |
+
# Use gpt-4o for vision capabilities
|
| 32 |
self.llm = ChatOpenAI(
|
| 33 |
+
model_name="gpt-4o-mini", # CRITICAL: gpt-4o has vision
|
| 34 |
api_key=api_key,
|
| 35 |
temperature=TEMPERATURE,
|
| 36 |
max_tokens=MAX_TOKENS,
|
|
|
|
| 40 |
self.visual_summaries_log = []
|
| 41 |
|
| 42 |
if self.debug:
|
| 43 |
+
print("✅ VisualMultimodalRAG initialized with gpt-4o (vision model)")
|
| 44 |
|
| 45 |
def _debug_print(self, label: str, data: any):
|
| 46 |
+
"""Print debug information"""
|
| 47 |
if self.debug:
|
| 48 |
+
print(f"\n🔍 DEBUG [{label}]:")
|
| 49 |
if isinstance(data, (list, dict)):
|
| 50 |
print(f" Type: {type(data).__name__}")
|
| 51 |
print(f" Content: {str(data)[:300]}...")
|
|
|
|
| 53 |
print(f" {data}")
|
| 54 |
|
| 55 |
def _image_to_base64(self, image_path: str) -> str:
|
| 56 |
+
"""Convert image file to base64 string"""
|
| 57 |
try:
|
| 58 |
with open(image_path, 'rb') as image_file:
|
| 59 |
image_data = base64.b64encode(image_file.read()).decode('utf-8')
|
|
|
|
| 63 |
return None
|
| 64 |
|
| 65 |
def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
|
| 66 |
+
"""
|
| 67 |
+
Send actual image (base64) to gpt-4o for visual analysis
|
| 68 |
+
Returns detailed visual analysis/description
|
| 69 |
+
|
| 70 |
+
gpt-4o can see:
|
| 71 |
+
- Charts, graphs, diagrams
|
| 72 |
+
- Tables and structured data
|
| 73 |
+
- Photos and drawings
|
| 74 |
+
- Handwritten text
|
| 75 |
+
- Screenshots
|
| 76 |
+
- Any visual content
|
| 77 |
+
"""
|
| 78 |
if not os.path.exists(image_path):
|
| 79 |
return f"[Image {image_idx}: File not found - {image_path}]"
|
| 80 |
|
| 81 |
try:
|
| 82 |
+
# Convert image to base64
|
| 83 |
image_base64 = self._image_to_base64(image_path)
|
| 84 |
if not image_base64:
|
| 85 |
return f"[Image {image_idx}: Could not convert to base64]"
|
| 86 |
|
| 87 |
+
# Determine image type
|
| 88 |
file_ext = Path(image_path).suffix.lower()
|
| 89 |
media_type_map = {
|
| 90 |
'.jpg': 'image/jpeg',
|
|
|
|
| 95 |
}
|
| 96 |
media_type = media_type_map.get(file_ext, 'image/png')
|
| 97 |
|
| 98 |
+
print(f"🔍 Analyzing image {image_idx} visually (as {media_type})...")
|
| 99 |
|
| 100 |
+
# Create message with image
|
| 101 |
message = HumanMessage(
|
| 102 |
content=[
|
| 103 |
{
|
|
|
|
| 108 |
},
|
| 109 |
{
|
| 110 |
"type": "text",
|
| 111 |
+
"text": f"""Analyze this image in detail in {self.language}.
|
| 112 |
+
|
| 113 |
+
Provide a comprehensive visual analysis including:
|
| 114 |
+
1. **What you see** - Main objects, elements, structure
|
| 115 |
+
2. **Data/Content** - Any numbers, text, charts, graphs
|
| 116 |
+
3. **Purpose** - What this image is showing or representing
|
| 117 |
+
4. **Key insights** - Important patterns, trends, or information
|
| 118 |
+
5. **Connections** - How this relates to document content
|
| 119 |
+
|
| 120 |
+
Be specific and detailed. Focus on visual information that cannot be extracted from text alone.
|
| 121 |
+
|
| 122 |
Analysis:"""
|
| 123 |
}
|
| 124 |
],
|
| 125 |
)
|
| 126 |
|
| 127 |
+
# Call gpt-4o with vision
|
| 128 |
response = self.llm.invoke([message])
|
| 129 |
analysis = response.content.strip()
|
| 130 |
|
| 131 |
if self.debug:
|
| 132 |
self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
|
| 133 |
|
| 134 |
+
print(f"✅ Image {image_idx} analyzed successfully")
|
| 135 |
return analysis
|
| 136 |
|
| 137 |
except Exception as e:
|
| 138 |
error_msg = f"[Image {image_idx}: Vision analysis failed - {str(e)}]"
|
| 139 |
+
print(f"❌ Error analyzing image {image_idx}: {e}")
|
| 140 |
return error_msg
|
| 141 |
|
| 142 |
def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
|
| 143 |
+
"""
|
| 144 |
+
Analyze each image visually using gpt-4o vision
|
| 145 |
+
Returns list of {image_index, visual_analysis, type}
|
| 146 |
+
"""
|
| 147 |
visual_analyses = []
|
| 148 |
|
| 149 |
for idx, image in enumerate(images):
|
| 150 |
image_path = image.get('path', '')
|
| 151 |
|
| 152 |
if not image_path:
|
| 153 |
+
print(f"⚠️ Image {idx}: No path provided")
|
| 154 |
continue
|
| 155 |
|
| 156 |
+
# Analyze image visually (not just OCR)
|
| 157 |
visual_analysis = self.analyze_image_visually(image_path, idx)
|
| 158 |
|
| 159 |
visual_analyses.append({
|
|
|
|
| 161 |
'image_index': idx,
|
| 162 |
'image_path': image_path,
|
| 163 |
'visual_analysis': visual_analysis,
|
| 164 |
+
'ocr_text': image.get('ocr_text', '') # Keep OCR as backup
|
| 165 |
})
|
| 166 |
|
| 167 |
return visual_analyses
|
| 168 |
|
| 169 |
def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
|
| 170 |
+
"""
|
| 171 |
+
Chunk text and summarize each chunk individually
|
| 172 |
+
"""
|
| 173 |
chunks = []
|
| 174 |
text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
|
| 175 |
|
|
|
|
| 181 |
|
| 182 |
try:
|
| 183 |
prompt = f"""Summarize this text chunk in {self.language}.
|
| 184 |
+
Keep it concise. Extract key points, facts, and main ideas.
|
| 185 |
|
| 186 |
Text Chunk:
|
| 187 |
{chunk}
|
| 188 |
|
| 189 |
+
Summary (2-3 sentences maximum):"""
|
| 190 |
|
| 191 |
message = HumanMessage(content=prompt)
|
| 192 |
response = self.llm.invoke([message])
|
|
|
|
| 209 |
return chunks
|
| 210 |
|
| 211 |
def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
|
| 212 |
+
"""
|
| 213 |
+
Summarize each table individually
|
| 214 |
+
"""
|
| 215 |
summaries = []
|
| 216 |
|
| 217 |
for idx, table in enumerate(tables):
|
|
|
|
| 222 |
|
| 223 |
try:
|
| 224 |
prompt = f"""Analyze and summarize this table/structured data in {self.language}.
|
| 225 |
+
Extract key insights, row/column meanings, and important figures.
|
| 226 |
|
| 227 |
Table Content:
|
| 228 |
{table_content}
|
| 229 |
|
| 230 |
+
Summary (2-3 sentences maximum):"""
|
| 231 |
|
| 232 |
message = HumanMessage(content=prompt)
|
| 233 |
response = self.llm.invoke([message])
|
|
|
|
| 257 |
vector_store,
|
| 258 |
doc_id: str
|
| 259 |
) -> Dict:
|
| 260 |
+
"""
|
| 261 |
+
Main function: Analyze all components visually and store in vector store
|
| 262 |
+
Images are analyzed using gpt-4o vision (not just OCR)
|
| 263 |
+
"""
|
| 264 |
+
print(f"\n{'='*70}")
|
| 265 |
print(f"PROCESSING WITH VISUAL IMAGE ANALYSIS: {doc_id}")
|
| 266 |
+
print(f"{'='*70}")
|
| 267 |
|
| 268 |
results = {
|
| 269 |
'doc_id': doc_id,
|
|
|
|
| 273 |
'total_stored': 0
|
| 274 |
}
|
| 275 |
|
| 276 |
+
# 1. Analyze images VISUALLY using gpt-4o
|
| 277 |
+
print(f"\n🖼️ VISUAL IMAGE ANALYSIS (gpt-4o vision) ({len(images)} total)")
|
| 278 |
+
print(f"{'─'*70}")
|
| 279 |
|
| 280 |
image_analyses = self.analyze_images_visually(images)
|
| 281 |
results['image_visual_analyses'] = image_analyses
|
| 282 |
|
| 283 |
+
# Store each image analysis in vector store
|
| 284 |
image_docs = {
|
| 285 |
'text': ' | '.join([
|
| 286 |
f"Image {a['image_index']}: {a['visual_analysis']}"
|
|
|
|
| 291 |
}
|
| 292 |
|
| 293 |
for analysis in image_analyses:
|
| 294 |
+
print(f" ✅ Image {analysis['image_index']} (visual analysis)")
|
| 295 |
print(f" Path: {analysis['image_path']}")
|
| 296 |
print(f" Analysis: {analysis['visual_analysis'][:100]}...")
|
| 297 |
|
|
|
|
| 302 |
f"{doc_id}_images_visual"
|
| 303 |
)
|
| 304 |
results['total_stored'] += len(image_analyses)
|
| 305 |
+
print(f"✅ Stored {len(image_analyses)} image visual analyses")
|
| 306 |
except Exception as e:
|
| 307 |
+
print(f"❌ Error storing image analyses: {e}")
|
| 308 |
|
| 309 |
+
# 2. Summarize and store text chunks
|
| 310 |
+
print(f"\n📝 TEXT CHUNK SUMMARIZATION")
|
| 311 |
+
print(f"{'─'*70}")
|
| 312 |
|
| 313 |
text_summaries = self.summarize_text_chunks(text)
|
| 314 |
results['text_summaries'] = text_summaries
|
|
|
|
| 321 |
}
|
| 322 |
|
| 323 |
for summary in text_summaries:
|
| 324 |
+
print(f" ✅ Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")
|
| 325 |
|
| 326 |
if text_summaries:
|
| 327 |
try:
|
|
|
|
| 330 |
f"{doc_id}_text_chunks"
|
| 331 |
)
|
| 332 |
results['total_stored'] += len(text_summaries)
|
| 333 |
+
print(f"✅ Stored {len(text_summaries)} text chunk summaries")
|
| 334 |
except Exception as e:
|
| 335 |
+
print(f"❌ Error storing text summaries: {e}")
|
| 336 |
|
| 337 |
+
# 3. Summarize and store tables
|
| 338 |
+
print(f"\n📋 TABLE SUMMARIZATION ({len(tables)} total)")
|
| 339 |
+
print(f"{'─'*70}")
|
| 340 |
|
| 341 |
table_summaries = self.summarize_tables(tables)
|
| 342 |
results['table_summaries'] = table_summaries
|
|
|
|
| 349 |
}
|
| 350 |
|
| 351 |
for summary in table_summaries:
|
| 352 |
+
print(f" ✅ Table {summary['table_index']}: {summary['summary'][:50]}...")
|
| 353 |
|
| 354 |
if table_summaries:
|
| 355 |
try:
|
|
|
|
| 358 |
f"{doc_id}_tables"
|
| 359 |
)
|
| 360 |
results['total_stored'] += len(table_summaries)
|
| 361 |
+
print(f"✅ Stored {len(table_summaries)} table summaries")
|
| 362 |
except Exception as e:
|
| 363 |
+
print(f"❌ Error storing table summaries: {e}")
|
| 364 |
|
| 365 |
+
# 4. Summary statistics
|
| 366 |
+
print(f"\n{'='*70}")
|
| 367 |
+
print(f"📊 STORAGE SUMMARY")
|
| 368 |
+
print(f"{'='*70}")
|
| 369 |
print(f" Images analyzed visually & stored: {len(image_analyses)}")
|
| 370 |
print(f" Text chunks summarized & stored: {len(text_summaries)}")
|
| 371 |
print(f" Tables summarized & stored: {len(table_summaries)}")
|
| 372 |
print(f" Total items stored in vector: {results['total_stored']}")
|
| 373 |
+
print(f"{'='*70}")
|
| 374 |
|
| 375 |
self.visual_summaries_log.append(results)
|
| 376 |
return results
|
|
|
|
| 391 |
|
| 392 |
|
| 393 |
class AnsweringRAG:
|
| 394 |
+
"""
|
| 395 |
+
RAG system that:
|
| 396 |
+
1. Searches vector store for relevant content
|
| 397 |
+
2. ANALYZES search results
|
| 398 |
+
3. Generates intelligent answers based on context
|
| 399 |
+
"""
|
| 400 |
|
| 401 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 402 |
api_key = api_key or OPENAI_API_KEY
|
| 403 |
self.debug = debug
|
| 404 |
|
| 405 |
self.llm = ChatOpenAI(
|
| 406 |
+
model_name="gpt-4o-mini", # Use gpt-4o for better understanding
|
| 407 |
api_key=api_key,
|
| 408 |
temperature=TEMPERATURE,
|
| 409 |
max_tokens=MAX_TOKENS,
|
|
|
|
| 413 |
self.answer_log = []
|
| 414 |
|
| 415 |
if self.debug:
|
| 416 |
+
print("✅ AnsweringRAG initialized with answer generation")
|
| 417 |
|
| 418 |
def _debug_print(self, label: str, data: any):
|
| 419 |
+
"""Print debug information"""
|
| 420 |
if self.debug:
|
| 421 |
+
print(f"\n🔍 DEBUG [{label}]:")
|
| 422 |
if isinstance(data, (list, dict)):
|
| 423 |
print(f" Type: {type(data).__name__}")
|
| 424 |
print(f" Content: {str(data)[:300]}...")
|
|
|
|
| 430 |
question: str,
|
| 431 |
search_results: List[Dict]
|
| 432 |
) -> Dict:
|
| 433 |
+
"""
|
| 434 |
+
Analyze search results and generate intelligent answer
|
| 435 |
+
|
| 436 |
+
Returns:
|
| 437 |
+
{
|
| 438 |
+
'question': user question,
|
| 439 |
+
'answer': detailed answer,
|
| 440 |
+
'sources_used': number of sources,
|
| 441 |
+
'confidence': low/medium/high,
|
| 442 |
+
'search_results': original search results
|
| 443 |
+
}
|
| 444 |
+
"""
|
| 445 |
|
| 446 |
+
print(f"\n{'='*70}")
|
| 447 |
print(f"ANALYZING QUESTION & GENERATING ANSWER")
|
| 448 |
+
print(f"{'='*70}")
|
| 449 |
|
| 450 |
+
print(f"\n❓ Question: {question}")
|
| 451 |
+
print(f"📊 Search Results Found: {len(search_results)}")
|
| 452 |
|
| 453 |
+
# Check if we have search results
|
| 454 |
if not search_results:
|
| 455 |
+
print(f"⚠️ No search results found!")
|
| 456 |
+
answer = f"""I could not find relevant information in the document to answer your question: "{question}"
|
| 457 |
+
|
| 458 |
+
Try:
|
| 459 |
+
- Using different keywords
|
| 460 |
+
- Breaking the question into smaller parts
|
| 461 |
+
- Asking about other topics in the document"""
|
| 462 |
|
| 463 |
result = {
|
| 464 |
'question': question,
|
|
|
|
| 470 |
self.answer_log.append(result)
|
| 471 |
return result
|
| 472 |
|
| 473 |
+
# Build context from search results
|
| 474 |
context_parts = []
|
| 475 |
for idx, result in enumerate(search_results, 1):
|
| 476 |
content = result.get('content', '')
|
|
|
|
| 487 |
|
| 488 |
self._debug_print("Context Prepared", f"{len(context_parts)} sources, {len(full_context)} chars")
|
| 489 |
|
| 490 |
+
# Build prompt to analyze results and answer question
|
| 491 |
analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.
|
| 492 |
|
| 493 |
USER QUESTION:
|
|
|
|
| 503 |
4. If the content doesn't fully answer the question, explain what information is available
|
| 504 |
5. Be specific and cite the content when relevant
|
| 505 |
6. Structure your answer clearly with key points
|
| 506 |
+
|
| 507 |
ANSWER:"""
|
| 508 |
|
| 509 |
+
print(f"\n🔍 Analyzing search results...")
|
| 510 |
print(f" Context size: {len(full_context)} characters")
|
| 511 |
print(f" Sources: {len(search_results)}")
|
| 512 |
|
| 513 |
try:
|
| 514 |
+
# Call LLM to analyze and answer
|
| 515 |
message = HumanMessage(content=analysis_prompt)
|
| 516 |
response = self.llm.invoke([message])
|
| 517 |
answer = response.content.strip()
|
| 518 |
|
| 519 |
+
# Determine confidence level
|
| 520 |
confidence = self._estimate_confidence(len(search_results), answer)
|
| 521 |
|
| 522 |
+
print(f"✅ Answer generated successfully")
|
| 523 |
print(f" Confidence: {confidence}")
|
| 524 |
print(f" Answer length: {len(answer)} characters")
|
| 525 |
|
|
|
|
| 535 |
return result
|
| 536 |
|
| 537 |
except Exception as e:
|
| 538 |
+
print(f"❌ Error generating answer: {e}")
|
| 539 |
answer = f"I encountered an error while analyzing the search results. Please try again."
|
| 540 |
|
| 541 |
result = {
|
|
|
|
| 551 |
return result
|
| 552 |
|
| 553 |
def _estimate_confidence(self, sources_count: int, answer: str) -> str:
|
| 554 |
+
"""Estimate confidence level of answer"""
|
| 555 |
answer_length = len(answer)
|
| 556 |
|
| 557 |
+
# High confidence: multiple sources, substantial answer
|
| 558 |
if sources_count >= 3 and answer_length > 500:
|
| 559 |
return "high"
|
| 560 |
|
| 561 |
+
# Medium confidence: some sources, decent answer
|
| 562 |
elif sources_count >= 2 and answer_length > 200:
|
| 563 |
return "medium"
|
| 564 |
|
| 565 |
+
# Low confidence: few sources or short answer
|
| 566 |
else:
|
| 567 |
return "low"
|
| 568 |
|
|
|
|
| 571 |
question: str,
|
| 572 |
search_results: List[Dict]
|
| 573 |
) -> Dict:
|
| 574 |
+
"""
|
| 575 |
+
Get answer AND properly formatted sources
|
| 576 |
+
Returns both answer and formatted source citations
|
| 577 |
+
"""
|
| 578 |
|
| 579 |
result = self.analyze_and_answer(question, search_results)
|
| 580 |
|
| 581 |
+
# Format sources for display
|
| 582 |
formatted_sources = []
|
| 583 |
for idx, source in enumerate(result['search_results'], 1):
|
| 584 |
formatted_sources.append({
|
|
|
|
| 592 |
return result
|
| 593 |
|
| 594 |
def get_answer_log(self) -> List[Dict]:
    """Return the accumulated answer-generation log entries.

    Note: this returns the live internal list (not a copy), so callers
    should treat it as read-only.
    """
    return self.answer_log
|
| 597 |
|
| 598 |
def print_answer_with_sources(self, result: Dict, max_source_length: int = 300):
    """Pretty-print an answer dict together with its formatted sources.

    Expects ``result`` to carry 'question', 'answer', 'confidence' and
    optionally 'formatted_sources'; each source's content is truncated to
    ``max_source_length`` characters for display.
    """
    heavy = '=' * 70
    light = '-' * 70

    print(f"\n{heavy}")
    print(f"ANSWER TO: {result['question']}")
    print(f"{heavy}")

    print(f"\n📝 ANSWER (Confidence: {result['confidence'].upper()}):")
    print(f"{light}")
    print(result['answer'])
    print(f"{light}")

    sources = result.get('formatted_sources')
    if sources:
        print(f"\n📚 SOURCES USED ({len(sources)} total):")
        for src in sources:
            print(f"\n[Source {src['index']} - {src['type'].upper()} ({src['relevance']:.0%} relevant)]")
            print(f"{src['content'][:max_source_length]}...")

    print(f"\n{heavy}")
|
src/vector_store.py
CHANGED
|
@@ -1,3 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
from typing import List, Dict
|
|
@@ -8,12 +12,14 @@ from config import CHROMA_DB_PATH, EMBEDDING_MODEL, EMBEDDING_DIM
|
|
| 8 |
|
| 9 |
|
| 10 |
class CLIPEmbedder:
|
|
|
|
| 11 |
def __init__(self, model_name: str = EMBEDDING_MODEL):
|
| 12 |
-
print(f" Loading embedding model: {model_name}")
|
| 13 |
self.model = SentenceTransformer(model_name)
|
| 14 |
-
print(f" Model loaded successfully")
|
| 15 |
|
| 16 |
def embed(self, text: str) -> List[float]:
|
|
|
|
| 17 |
try:
|
| 18 |
embedding = self.model.encode(text, convert_to_numpy=False)
|
| 19 |
return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
|
|
@@ -22,6 +28,7 @@ class CLIPEmbedder:
|
|
| 22 |
return [0.0] * EMBEDDING_DIM
|
| 23 |
|
| 24 |
def embed_batch(self, texts: List[str]) -> List[List[float]]:
|
|
|
|
| 25 |
try:
|
| 26 |
embeddings = self.model.encode(texts, convert_to_numpy=False)
|
| 27 |
return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
|
|
@@ -31,30 +38,34 @@ class CLIPEmbedder:
|
|
| 31 |
|
| 32 |
|
| 33 |
class VectorStore:
|
|
|
|
| 34 |
def __init__(self):
|
| 35 |
self.persist_directory = CHROMA_DB_PATH
|
| 36 |
self.embedder = CLIPEmbedder()
|
| 37 |
|
| 38 |
-
print(f" Initializing ChromaDB at: {self.persist_directory}")
|
| 39 |
|
|
|
|
| 40 |
try:
|
| 41 |
self.client = chromadb.PersistentClient(
|
| 42 |
path=self.persist_directory
|
| 43 |
)
|
| 44 |
-
print(f" ChromaDB initialized")
|
| 45 |
except Exception as e:
|
| 46 |
-
print(f" Error initializing ChromaDB: {e}")
|
|
|
|
| 47 |
self.client = chromadb.PersistentClient(
|
| 48 |
path=self.persist_directory
|
| 49 |
)
|
| 50 |
|
|
|
|
| 51 |
try:
|
| 52 |
self.collection = self.client.get_or_create_collection(
|
| 53 |
name="multimodal_rag",
|
| 54 |
metadata={"hnsw:space": "cosine"}
|
| 55 |
)
|
| 56 |
count = self.collection.count()
|
| 57 |
-
print(f" Collection loaded: {count} items in store")
|
| 58 |
except Exception as e:
|
| 59 |
print(f"Error with collection: {e}")
|
| 60 |
self.collection = self.client.get_or_create_collection(
|
|
@@ -62,12 +73,14 @@ class VectorStore:
|
|
| 62 |
)
|
| 63 |
|
| 64 |
def add_documents(self, documents: List[Dict], doc_id: str):
|
|
|
|
| 65 |
texts = []
|
| 66 |
metadatas = []
|
| 67 |
ids = []
|
| 68 |
|
| 69 |
-
print(f" Adding documents for: {doc_id}")
|
| 70 |
|
|
|
|
| 71 |
if 'text' in documents and documents['text']:
|
| 72 |
chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
|
| 73 |
for idx, chunk in enumerate(chunks):
|
|
@@ -78,8 +91,9 @@ class VectorStore:
|
|
| 78 |
'chunk_idx': str(idx)
|
| 79 |
})
|
| 80 |
ids.append(f"{doc_id}_text_{idx}")
|
| 81 |
-
print(f"
|
| 82 |
|
|
|
|
| 83 |
if 'images' in documents:
|
| 84 |
image_count = 0
|
| 85 |
for idx, image_data in enumerate(documents['images']):
|
|
@@ -94,8 +108,9 @@ class VectorStore:
|
|
| 94 |
ids.append(f"{doc_id}_image_{idx}")
|
| 95 |
image_count += 1
|
| 96 |
if image_count > 0:
|
| 97 |
-
print(f"
|
| 98 |
|
|
|
|
| 99 |
if 'tables' in documents:
|
| 100 |
table_count = 0
|
| 101 |
for idx, table_data in enumerate(documents['tables']):
|
|
@@ -109,12 +124,14 @@ class VectorStore:
|
|
| 109 |
ids.append(f"{doc_id}_table_{idx}")
|
| 110 |
table_count += 1
|
| 111 |
if table_count > 0:
|
| 112 |
-
print(f"
|
| 113 |
|
| 114 |
if texts:
|
| 115 |
-
|
|
|
|
| 116 |
embeddings = self.embedder.embed_batch(texts)
|
| 117 |
|
|
|
|
| 118 |
try:
|
| 119 |
self.collection.add(
|
| 120 |
ids=ids,
|
|
@@ -122,10 +139,11 @@ class VectorStore:
|
|
| 122 |
embeddings=embeddings,
|
| 123 |
metadatas=metadatas
|
| 124 |
)
|
| 125 |
-
print(f" Successfully added {len(texts)} items to vector store")
|
| 126 |
-
|
|
|
|
| 127 |
except Exception as e:
|
| 128 |
-
print(f" Error adding to collection: {e}")
|
| 129 |
|
| 130 |
def search(self, query: str, n_results: int = 5) -> List[Dict]:
|
| 131 |
"""Search vector store for similar documents"""
|
|
@@ -137,6 +155,7 @@ class VectorStore:
|
|
| 137 |
n_results=n_results
|
| 138 |
)
|
| 139 |
|
|
|
|
| 140 |
formatted_results = []
|
| 141 |
if results['documents']:
|
| 142 |
for i, doc in enumerate(results['documents'][0]):
|
|
@@ -156,6 +175,7 @@ class VectorStore:
|
|
| 156 |
return []
|
| 157 |
|
| 158 |
def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
|
|
|
|
| 159 |
chunks = []
|
| 160 |
start = 0
|
| 161 |
while start < len(text):
|
|
@@ -165,6 +185,7 @@ class VectorStore:
|
|
| 165 |
return chunks
|
| 166 |
|
| 167 |
def get_collection_info(self) -> Dict:
|
|
|
|
| 168 |
try:
|
| 169 |
count = self.collection.count()
|
| 170 |
return {
|
|
@@ -178,25 +199,35 @@ class VectorStore:
|
|
| 178 |
return {'status': 'error', 'message': str(e)}
|
| 179 |
|
| 180 |
def delete_by_doc_id(self, doc_id: str):
|
|
|
|
| 181 |
try:
|
|
|
|
| 182 |
results = self.collection.get(where={'doc_id': doc_id})
|
| 183 |
if results['ids']:
|
| 184 |
self.collection.delete(ids=results['ids'])
|
| 185 |
-
print(f" Deleted {len(results['ids'])} documents for {doc_id}")
|
|
|
|
|
|
|
| 186 |
except Exception as e:
|
| 187 |
print(f"Error deleting documents: {e}")
|
| 188 |
|
| 189 |
def persist(self):
|
| 190 |
-
|
| 191 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
def clear_all(self):
|
|
|
|
| 194 |
try:
|
|
|
|
| 195 |
self.client.delete_collection(name="multimodal_rag")
|
| 196 |
self.collection = self.client.get_or_create_collection(
|
| 197 |
name="multimodal_rag",
|
| 198 |
metadata={"hnsw:space": "cosine"}
|
| 199 |
)
|
| 200 |
-
print(" Collection cleared and reset")
|
| 201 |
except Exception as e:
|
| 202 |
print(f"Error clearing collection: {e}")
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Vector Store and Embeddings Module using ChromaDB with sentence-transformers
|
| 3 |
+
UPDATED for ChromaDB v0.4.22+ (auto-persist, no manual persist needed)
|
| 4 |
+
"""
|
| 5 |
import os
|
| 6 |
import json
|
| 7 |
from typing import List, Dict
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
class CLIPEmbedder:
|
| 15 |
+
"""Custom embedder using sentence-transformers for multimodal content"""
|
| 16 |
def __init__(self, model_name: str = EMBEDDING_MODEL):
    """Load the sentence-transformers model used for all embeddings.

    Defaults to EMBEDDING_MODEL from config.  NOTE(review): loading may
    download model weights on first run and can take several seconds —
    confirm this is acceptable at construction time.
    """
    print(f"🔄 Loading embedding model: {model_name}")
    self.model = SentenceTransformer(model_name)
    print(f"✅ Model loaded successfully")
|
| 20 |
|
| 21 |
def embed(self, text: str) -> List[float]:
|
| 22 |
+
"""Generate embedding for text"""
|
| 23 |
try:
|
| 24 |
embedding = self.model.encode(text, convert_to_numpy=False)
|
| 25 |
return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
|
|
|
|
| 28 |
return [0.0] * EMBEDDING_DIM
|
| 29 |
|
| 30 |
def embed_batch(self, texts: List[str]) -> List[List[float]]:
|
| 31 |
+
"""Generate embeddings for batch of texts"""
|
| 32 |
try:
|
| 33 |
embeddings = self.model.encode(texts, convert_to_numpy=False)
|
| 34 |
return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
class VectorStore:
|
| 41 |
+
"""Vector store manager using ChromaDB (v0.4.22+ with auto-persist)"""
|
| 42 |
def __init__(self):
|
| 43 |
self.persist_directory = CHROMA_DB_PATH
|
| 44 |
self.embedder = CLIPEmbedder()
|
| 45 |
|
| 46 |
+
print(f"\n🔄 Initializing ChromaDB at: {self.persist_directory}")
|
| 47 |
|
| 48 |
+
# NEW ChromaDB v0.4.22+ - PersistentClient auto-persists
|
| 49 |
try:
|
| 50 |
self.client = chromadb.PersistentClient(
|
| 51 |
path=self.persist_directory
|
| 52 |
)
|
| 53 |
+
print(f"✅ ChromaDB PersistentClient initialized")
|
| 54 |
except Exception as e:
|
| 55 |
+
print(f"❌ Error initializing ChromaDB: {e}")
|
| 56 |
+
print(f"Trying fallback initialization...")
|
| 57 |
self.client = chromadb.PersistentClient(
|
| 58 |
path=self.persist_directory
|
| 59 |
)
|
| 60 |
|
| 61 |
+
# Get or create collection
|
| 62 |
try:
|
| 63 |
self.collection = self.client.get_or_create_collection(
|
| 64 |
name="multimodal_rag",
|
| 65 |
metadata={"hnsw:space": "cosine"}
|
| 66 |
)
|
| 67 |
count = self.collection.count()
|
| 68 |
+
print(f"✅ Collection loaded: {count} items in store")
|
| 69 |
except Exception as e:
|
| 70 |
print(f"Error with collection: {e}")
|
| 71 |
self.collection = self.client.get_or_create_collection(
|
|
|
|
| 73 |
)
|
| 74 |
|
| 75 |
def add_documents(self, documents: List[Dict], doc_id: str):
|
| 76 |
+
"""Add documents to vector store"""
|
| 77 |
texts = []
|
| 78 |
metadatas = []
|
| 79 |
ids = []
|
| 80 |
|
| 81 |
+
print(f"\n📚 Adding documents for: {doc_id}")
|
| 82 |
|
| 83 |
+
# Add text chunks
|
| 84 |
if 'text' in documents and documents['text']:
|
| 85 |
chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
|
| 86 |
for idx, chunk in enumerate(chunks):
|
|
|
|
| 91 |
'chunk_idx': str(idx)
|
| 92 |
})
|
| 93 |
ids.append(f"{doc_id}_text_{idx}")
|
| 94 |
+
print(f" ✅ Text: {len(chunks)} chunks")
|
| 95 |
|
| 96 |
+
# Add image descriptions and OCR text
|
| 97 |
if 'images' in documents:
|
| 98 |
image_count = 0
|
| 99 |
for idx, image_data in enumerate(documents['images']):
|
|
|
|
| 108 |
ids.append(f"{doc_id}_image_{idx}")
|
| 109 |
image_count += 1
|
| 110 |
if image_count > 0:
|
| 111 |
+
print(f" ✅ Images: {image_count} with OCR text")
|
| 112 |
|
| 113 |
+
# Add table content
|
| 114 |
if 'tables' in documents:
|
| 115 |
table_count = 0
|
| 116 |
for idx, table_data in enumerate(documents['tables']):
|
|
|
|
| 124 |
ids.append(f"{doc_id}_table_{idx}")
|
| 125 |
table_count += 1
|
| 126 |
if table_count > 0:
|
| 127 |
+
print(f" ✅ Tables: {table_count}")
|
| 128 |
|
| 129 |
if texts:
|
| 130 |
+
# Generate embeddings
|
| 131 |
+
print(f" 🔄 Generating {len(texts)} embeddings...")
|
| 132 |
embeddings = self.embedder.embed_batch(texts)
|
| 133 |
|
| 134 |
+
# Add to collection
|
| 135 |
try:
|
| 136 |
self.collection.add(
|
| 137 |
ids=ids,
|
|
|
|
| 139 |
embeddings=embeddings,
|
| 140 |
metadatas=metadatas
|
| 141 |
)
|
| 142 |
+
print(f"✅ Successfully added {len(texts)} items to vector store")
|
| 143 |
+
# Auto-persist happens here
|
| 144 |
+
print(f"✅ Data persisted automatically to: {self.persist_directory}")
|
| 145 |
except Exception as e:
|
| 146 |
+
print(f"❌ Error adding to collection: {e}")
|
| 147 |
|
| 148 |
def search(self, query: str, n_results: int = 5) -> List[Dict]:
|
| 149 |
"""Search vector store for similar documents"""
|
|
|
|
| 155 |
n_results=n_results
|
| 156 |
)
|
| 157 |
|
| 158 |
+
# Format results
|
| 159 |
formatted_results = []
|
| 160 |
if results['documents']:
|
| 161 |
for i, doc in enumerate(results['documents'][0]):
|
|
|
|
| 175 |
return []
|
| 176 |
|
| 177 |
def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
|
| 178 |
+
"""Split text into chunks with overlap"""
|
| 179 |
chunks = []
|
| 180 |
start = 0
|
| 181 |
while start < len(text):
|
|
|
|
| 185 |
return chunks
|
| 186 |
|
| 187 |
def get_collection_info(self) -> Dict:
|
| 188 |
+
"""Get information about the collection"""
|
| 189 |
try:
|
| 190 |
count = self.collection.count()
|
| 191 |
return {
|
|
|
|
| 199 |
return {'status': 'error', 'message': str(e)}
|
| 200 |
|
| 201 |
def delete_by_doc_id(self, doc_id: str):
    """Remove every stored item whose metadata 'doc_id' matches ``doc_id``.

    Failures are reported to stdout rather than raised (best-effort
    cleanup, matching the rest of this class).
    """
    try:
        # Fetch the IDs of all entries tagged with this document.
        matching = self.collection.get(where={'doc_id': doc_id})
        ids = matching['ids']
        if ids:
            self.collection.delete(ids=ids)
            print(f"✅ Deleted {len(ids)} documents for {doc_id}")
            # PersistentClient writes the change to disk on its own.
            print("✅ Changes persisted automatically")
    except Exception as exc:
        print(f"Error deleting documents: {exc}")
|
| 213 |
|
| 214 |
def persist(self):
    """Backward-compatible no-op.

    ChromaDB v0.4.22+ persists automatically via PersistentClient, so
    there is nothing to flush here; this stub only keeps older call
    sites that invoke ``persist()`` working.
    """
    print("✅ Vector store is using auto-persist (no manual persist needed)")
|
| 221 |
|
| 222 |
def clear_all(self):
    """Drop every stored item by deleting and recreating the collection.

    Recreating from scratch is simpler and faster than deleting entries
    one by one; errors are reported to stdout, not raised.
    """
    try:
        self.client.delete_collection(name="multimodal_rag")
        self.collection = self.client.get_or_create_collection(
            name="multimodal_rag",
            metadata={"hnsw:space": "cosine"},
        )
        print("✅ Collection cleared and reset")
    except Exception as exc:
        print(f"Error clearing collection: {exc}")
|