Spaces:
Sleeping
Sleeping
Simplify
Browse files- src/app.py +77 -209
- src/config.py +10 -18
- src/pdf_parser.py +26 -48
- src/rag_system.py +89 -187
- src/vector_store.py +26 -50
src/app.py
CHANGED
|
@@ -1,33 +1,25 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
Complete working version with VISUAL image analysis using gpt-4o
|
| 4 |
"""
|
| 5 |
|
| 6 |
import streamlit as st
|
| 7 |
import os
|
| 8 |
from pathlib import Path
|
| 9 |
|
| 10 |
-
# Import optimized versions
|
| 11 |
from pdf_parser import PDFParser
|
| 12 |
from vector_store import VectorStore
|
| 13 |
-
from rag_system import VisualMultimodalRAG
|
| 14 |
from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
|
| 15 |
|
| 16 |
|
| 17 |
-
# ============================================================================
|
| 18 |
-
# PAGE CONFIGURATION
|
| 19 |
-
# ============================================================================
|
| 20 |
|
| 21 |
st.set_page_config(
|
| 22 |
-
page_title="
|
| 23 |
-
page_icon="🤖",
|
| 24 |
layout="wide",
|
| 25 |
initial_sidebar_state="expanded"
|
| 26 |
)
|
| 27 |
|
| 28 |
-
|
| 29 |
-
# SESSION STATE INITIALIZATION
|
| 30 |
-
# ============================================================================
|
| 31 |
|
| 32 |
if 'api_key_set' not in st.session_state:
|
| 33 |
st.session_state.api_key_set = False
|
|
@@ -35,7 +27,7 @@ if 'api_key_set' not in st.session_state:
|
|
| 35 |
if 'api_key' not in st.session_state:
|
| 36 |
st.session_state.api_key = None
|
| 37 |
|
| 38 |
-
if 'visual_rag_system' not in st.session_state:
|
| 39 |
st.session_state.visual_rag_system = None
|
| 40 |
|
| 41 |
if 'vector_store' not in st.session_state:
|
|
@@ -56,39 +48,28 @@ if 'current_images' not in st.session_state:
|
|
| 56 |
if 'current_tables' not in st.session_state:
|
| 57 |
st.session_state.current_tables = None
|
| 58 |
|
| 59 |
-
if 'processing_results' not in st.session_state:
|
| 60 |
st.session_state.processing_results = None
|
| 61 |
|
| 62 |
if 'answering_rag' not in st.session_state:
|
| 63 |
st.session_state.answering_rag = None
|
| 64 |
|
| 65 |
|
| 66 |
-
# ============================================================================
|
| 67 |
-
# MAIN HEADER
|
| 68 |
-
# ============================================================================
|
| 69 |
|
| 70 |
-
st.title("
|
| 71 |
st.markdown("""
|
| 72 |
-
|
| 73 |
-
- **PDF Parser** with OCR for Russian & English
|
| 74 |
-
- **Visual Analysis** (gpt-4o) for image understanding
|
| 75 |
-
- **Vector Store** (ChromaDB) for semantic search
|
| 76 |
-
- **Individual Component** summarization and storage
|
| 77 |
""")
|
| 78 |
|
| 79 |
|
| 80 |
-
# ============================================================================
|
| 81 |
-
# SIDEBAR - CONFIGURATION
|
| 82 |
-
# ============================================================================
|
| 83 |
|
| 84 |
with st.sidebar:
|
| 85 |
-
st.header("
|
| 86 |
|
| 87 |
-
|
| 88 |
-
st.subheader("🔑 OpenAI API Key")
|
| 89 |
|
| 90 |
api_key = st.text_input(
|
| 91 |
-
"
|
| 92 |
type="password",
|
| 93 |
key="api_key_input"
|
| 94 |
)
|
|
@@ -97,62 +78,53 @@ with st.sidebar:
|
|
| 97 |
st.session_state.api_key = api_key
|
| 98 |
st.session_state.api_key_set = True
|
| 99 |
|
| 100 |
-
# Initialize RAG systems if not already done
|
| 101 |
if st.session_state.visual_rag_system is None:
|
| 102 |
try:
|
| 103 |
st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True) # NEW
|
| 104 |
st.session_state.vector_store = VectorStore()
|
| 105 |
st.session_state.parser = PDFParser(debug=True)
|
| 106 |
-
st.success("
|
| 107 |
except Exception as e:
|
| 108 |
-
st.error(f"
|
| 109 |
else:
|
| 110 |
st.session_state.api_key_set = False
|
| 111 |
-
st.warning("
|
| 112 |
|
| 113 |
st.divider()
|
| 114 |
|
| 115 |
-
|
| 116 |
-
st.subheader("📊 Vector Store Status")
|
| 117 |
if st.session_state.vector_store:
|
| 118 |
try:
|
| 119 |
info = st.session_state.vector_store.get_collection_info()
|
| 120 |
-
st.metric("
|
| 121 |
-
st.
|
| 122 |
-
st.caption(f"Path: {info['persist_path']}")
|
| 123 |
except Exception as e:
|
| 124 |
-
st.error(f"
|
| 125 |
else:
|
| 126 |
-
st.info("
|
| 127 |
|
| 128 |
st.divider()
|
| 129 |
|
| 130 |
-
|
| 131 |
-
st.
|
| 132 |
-
if st.button("🔄 Clear Vector Store"):
|
| 133 |
if st.session_state.vector_store:
|
| 134 |
try:
|
| 135 |
st.session_state.vector_store.clear_all()
|
| 136 |
-
st.success("
|
| 137 |
except Exception as e:
|
| 138 |
-
st.error(f"
|
| 139 |
|
| 140 |
|
| 141 |
-
# ============================================================================
|
| 142 |
-
# MAIN CONTENT
|
| 143 |
-
# ============================================================================
|
| 144 |
|
| 145 |
-
|
| 146 |
-
st.header("📤 Upload PDF Document")
|
| 147 |
|
| 148 |
uploaded_file = st.file_uploader(
|
| 149 |
-
"
|
| 150 |
type=['pdf'],
|
| 151 |
-
help="PDF
|
| 152 |
)
|
| 153 |
|
| 154 |
if uploaded_file is not None:
|
| 155 |
-
# Save uploaded file
|
| 156 |
upload_path = Path(UPLOAD_FOLDER)
|
| 157 |
upload_path.mkdir(exist_ok=True)
|
| 158 |
|
|
@@ -160,91 +132,64 @@ if uploaded_file is not None:
|
|
| 160 |
with open(file_path, 'wb') as f:
|
| 161 |
f.write(uploaded_file.getbuffer())
|
| 162 |
|
| 163 |
-
st.success(f"
|
| 164 |
|
| 165 |
-
|
| 166 |
-
if st.button("🔍 Parse PDF"):
|
| 167 |
if not st.session_state.api_key_set:
|
| 168 |
-
st.error("
|
| 169 |
else:
|
| 170 |
try:
|
| 171 |
-
with st.spinner("
|
| 172 |
-
|
| 173 |
-
print(f"
|
| 174 |
-
|
| 175 |
|
| 176 |
-
# Parse PDF - returns text, images, tables
|
| 177 |
parser = st.session_state.parser
|
| 178 |
text, images, tables = parser.parse_pdf(str(file_path))
|
| 179 |
|
| 180 |
-
# Store in session state
|
| 181 |
st.session_state.current_document = uploaded_file.name
|
| 182 |
st.session_state.current_text = text
|
| 183 |
st.session_state.current_images = images
|
| 184 |
st.session_state.current_tables = tables
|
| 185 |
|
| 186 |
-
# Display results
|
| 187 |
col1, col2, col3 = st.columns(3)
|
| 188 |
with col1:
|
| 189 |
-
st.metric("
|
| 190 |
with col2:
|
| 191 |
-
st.metric("
|
| 192 |
with col3:
|
| 193 |
-
st.metric("
|
| 194 |
|
| 195 |
-
|
| 196 |
-
if images:
|
| 197 |
-
st.subheader("🖼️ Extracted Images")
|
| 198 |
-
for idx, img in enumerate(images):
|
| 199 |
-
ocr_text = img.get('ocr_text', '')
|
| 200 |
-
ocr_len = len(ocr_text)
|
| 201 |
-
|
| 202 |
-
if ocr_len > 0:
|
| 203 |
-
st.success(f"✅ Image {idx}: {ocr_len} characters (OCR)")
|
| 204 |
-
else:
|
| 205 |
-
st.warning(f"⚠️ Image {idx}: No OCR text (will use visual analysis)")
|
| 206 |
-
|
| 207 |
-
st.success("✅ PDF parsing complete!")
|
| 208 |
|
| 209 |
except Exception as e:
|
| 210 |
-
st.error(f"
|
| 211 |
-
print(f"
|
| 212 |
|
| 213 |
|
| 214 |
-
# ============================================================================
|
| 215 |
-
# VISUAL IMAGE ANALYSIS & COMPONENT STORAGE
|
| 216 |
-
# ============================================================================
|
| 217 |
|
| 218 |
st.divider()
|
| 219 |
-
st.header("
|
| 220 |
|
| 221 |
st.info("""
|
| 222 |
-
|
| 223 |
-
1. Images are sent to gpt-4o for visual analysis (not just text OCR)
|
| 224 |
-
2. Text is split into chunks and each chunk is summarized
|
| 225 |
-
3. Tables are analyzed individually
|
| 226 |
-
4. ALL summaries are stored in the vector store for semantic search
|
| 227 |
""")
|
| 228 |
|
| 229 |
-
if st.button("
|
| 230 |
if not st.session_state.api_key_set:
|
| 231 |
-
st.error("
|
| 232 |
elif st.session_state.current_text is None:
|
| 233 |
-
st.error("
|
| 234 |
else:
|
| 235 |
try:
|
| 236 |
-
with st.spinner("
|
| 237 |
-
print(f"\n{'='*70}")
|
| 238 |
-
print(f"VISUAL IMAGE ANALYSIS")
|
| 239 |
-
print(f"{'='*70}")
|
| 240 |
|
| 241 |
-
# Process with visual analysis
|
| 242 |
visual_rag = st.session_state.visual_rag_system
|
| 243 |
vector_store = st.session_state.vector_store
|
| 244 |
|
| 245 |
results = visual_rag.process_and_store_document(
|
| 246 |
text=st.session_state.current_text,
|
| 247 |
-
images=st.session_state.current_images,
|
| 248 |
tables=st.session_state.current_tables,
|
| 249 |
vector_store=vector_store,
|
| 250 |
doc_id=st.session_state.current_document or "current_doc"
|
|
@@ -252,107 +197,55 @@ if st.button("🖼️ Analyze Images Visually & Store Components"):
|
|
| 252 |
|
| 253 |
st.session_state.processing_results = results
|
| 254 |
|
| 255 |
-
|
| 256 |
-
st.success("✅ Visual analysis complete & stored!")
|
| 257 |
|
| 258 |
col1, col2, col3 = st.columns(3)
|
| 259 |
with col1:
|
| 260 |
-
st.metric("
|
| 261 |
with col2:
|
| 262 |
-
st.metric("
|
| 263 |
with col3:
|
| 264 |
-
st.metric("
|
| 265 |
-
|
| 266 |
-
st.metric("📊 Total Stored in Vector", results['total_stored'])
|
| 267 |
-
|
| 268 |
-
# Show image visual analyses
|
| 269 |
-
if results['image_visual_analyses']:
|
| 270 |
-
st.subheader("🖼️ Visual Image Analyses (gpt-4o)")
|
| 271 |
-
for img_analysis in results['image_visual_analyses']:
|
| 272 |
-
with st.expander(f"Image {img_analysis['image_index']} - Visual Analysis"):
|
| 273 |
-
st.write("**Visual Analysis by gpt-4o:**")
|
| 274 |
-
st.write(img_analysis['visual_analysis'])
|
| 275 |
-
|
| 276 |
-
st.write("**Image Path:**")
|
| 277 |
-
st.code(img_analysis['image_path'])
|
| 278 |
-
|
| 279 |
-
if img_analysis['ocr_text']:
|
| 280 |
-
st.write("**OCR Text (backup):**")
|
| 281 |
-
st.text(img_analysis['ocr_text'][:500])
|
| 282 |
-
|
| 283 |
-
# Show text chunk summaries
|
| 284 |
-
if results['text_summaries']:
|
| 285 |
-
st.subheader("📝 Text Chunk Summaries")
|
| 286 |
-
for chunk_summary in results['text_summaries']:
|
| 287 |
-
with st.expander(
|
| 288 |
-
f"Chunk {chunk_summary['chunk_index']} "
|
| 289 |
-
f"({chunk_summary['chunk_length']} chars)"
|
| 290 |
-
):
|
| 291 |
-
st.write("**Summary:**")
|
| 292 |
-
st.write(chunk_summary['summary'])
|
| 293 |
-
st.write("**Original Text (first 500 chars):**")
|
| 294 |
-
st.text(chunk_summary['original_text'])
|
| 295 |
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
st.subheader("📋 Table Analyses")
|
| 299 |
-
for table_summary in results['table_summaries']:
|
| 300 |
-
with st.expander(
|
| 301 |
-
f"Table {table_summary['table_index']} "
|
| 302 |
-
f"({table_summary['table_length']} chars)"
|
| 303 |
-
):
|
| 304 |
-
st.write("**Analysis:**")
|
| 305 |
-
st.write(table_summary['summary'])
|
| 306 |
-
st.write("**Original Content (first 500 chars):**")
|
| 307 |
-
st.text(table_summary['original_content'])
|
| 308 |
|
| 309 |
-
print(f"
|
| 310 |
|
| 311 |
except Exception as e:
|
| 312 |
-
st.error(f"
|
| 313 |
-
print(f"
|
| 314 |
|
| 315 |
|
| 316 |
-
# ============================================================================
|
| 317 |
-
# QUESTION & ANSWERING
|
| 318 |
-
# ============================================================================
|
| 319 |
|
| 320 |
st.divider()
|
| 321 |
-
st.header("
|
| 322 |
|
| 323 |
-
# Initialize answering system if not done
|
| 324 |
if 'answering_rag' not in st.session_state:
|
| 325 |
st.session_state.answering_rag = None
|
| 326 |
|
| 327 |
-
# Create answering system when API key is set
|
| 328 |
if st.session_state.api_key_set and st.session_state.answering_rag is None:
|
| 329 |
from rag_system import AnsweringRAG
|
| 330 |
st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
|
| 331 |
|
| 332 |
question = st.text_area(
|
| 333 |
-
"
|
| 334 |
height=100,
|
| 335 |
-
placeholder="
|
| 336 |
)
|
| 337 |
|
| 338 |
-
if st.button("
|
| 339 |
if not st.session_state.api_key_set:
|
| 340 |
-
st.error("
|
| 341 |
elif st.session_state.current_text is None:
|
| 342 |
-
st.error("
|
| 343 |
elif not question:
|
| 344 |
-
st.error("
|
| 345 |
else:
|
| 346 |
try:
|
| 347 |
-
with st.spinner("
|
| 348 |
-
print(f"\n{'='*70}")
|
| 349 |
-
print(f"QUESTION: {question}")
|
| 350 |
-
print(f"{'='*70}")
|
| 351 |
-
|
| 352 |
-
# Search vector store
|
| 353 |
store = st.session_state.vector_store
|
| 354 |
|
| 355 |
-
# Add documents to store if needed
|
| 356 |
doc_name = st.session_state.current_document or "current_doc"
|
| 357 |
doc_data = {
|
| 358 |
'text': st.session_state.current_text,
|
|
@@ -361,21 +254,17 @@ if st.button("🔍 Search & Generate Answer"):
|
|
| 361 |
}
|
| 362 |
store.add_documents(doc_data, doc_name)
|
| 363 |
|
| 364 |
-
# Search for relevant results
|
| 365 |
search_results = store.search(question, n_results=5)
|
| 366 |
|
| 367 |
-
print(f"
|
| 368 |
|
| 369 |
-
# Analyze results and generate answer
|
| 370 |
answering_rag = st.session_state.answering_rag
|
| 371 |
result = answering_rag.analyze_and_answer(question, search_results)
|
| 372 |
|
| 373 |
-
|
| 374 |
-
st.success("✅ Analysis complete!")
|
| 375 |
|
| 376 |
-
st.subheader("
|
| 377 |
|
| 378 |
-
# Show confidence level
|
| 379 |
col1, col2, col3 = st.columns(3)
|
| 380 |
with col1:
|
| 381 |
confidence_color = {
|
|
@@ -383,56 +272,35 @@ if st.button("🔍 Search & Generate Answer"):
|
|
| 383 |
'medium': '🟡',
|
| 384 |
'low': '🔴'
|
| 385 |
}.get(result['confidence'], '⚪')
|
| 386 |
-
st.metric("
|
| 387 |
with col2:
|
| 388 |
-
st.metric("
|
| 389 |
with col3:
|
| 390 |
if result['sources_used'] > 0:
|
| 391 |
-
st.metric("
|
| 392 |
|
| 393 |
-
# Display the generated answer
|
| 394 |
st.write(result['answer'])
|
| 395 |
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
st.subheader("Sources Used in Answer")
|
| 399 |
for idx, source in enumerate(result['formatted_sources'], 1):
|
| 400 |
relevance = source['relevance']
|
| 401 |
-
relevance_bar = "
|
| 402 |
|
| 403 |
with st.expander(
|
| 404 |
-
f"
|
| 405 |
f"[{relevance_bar}] {relevance:.0%}"
|
| 406 |
):
|
| 407 |
st.write(source['content'])
|
| 408 |
|
| 409 |
-
print(f"
|
| 410 |
|
| 411 |
except Exception as e:
|
| 412 |
-
st.error(f"
|
| 413 |
-
print(f"
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
# ============================================================================
|
| 417 |
-
# FOOTER
|
| 418 |
-
# ============================================================================
|
| 419 |
|
| 420 |
st.divider()
|
| 421 |
|
| 422 |
-
col1, col2, col3 = st.columns(3)
|
| 423 |
-
|
| 424 |
-
with col1:
|
| 425 |
-
st.info("📖 **Text Processing**: PyPDF2 extraction with UTF-8 support")
|
| 426 |
-
|
| 427 |
-
with col2:
|
| 428 |
-
st.info("🖼️ **Visual Analysis**: GPT-4o vision for image understanding")
|
| 429 |
-
|
| 430 |
-
with col3:
|
| 431 |
-
st.info("📊 **Vector Storage**: ChromaDB with auto-persist")
|
| 432 |
-
|
| 433 |
st.caption(
|
| 434 |
-
"
|
| 435 |
-
"Visual Image Analysis | "
|
| 436 |
-
"Russian Language Support | "
|
| 437 |
-
"Individual Component Summarization"
|
| 438 |
)
|
|
|
|
| 1 |
"""
|
| 2 |
+
UI RAG
|
|
|
|
| 3 |
"""
|
| 4 |
|
| 5 |
import streamlit as st
|
| 6 |
import os
|
| 7 |
from pathlib import Path
|
| 8 |
|
|
|
|
| 9 |
from pdf_parser import PDFParser
|
| 10 |
from vector_store import VectorStore
|
| 11 |
+
from rag_system import VisualMultimodalRAG
|
| 12 |
from config import UPLOAD_FOLDER, MAX_PDF_SIZE_MB
|
| 13 |
|
| 14 |
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
st.set_page_config(
|
| 17 |
+
page_title="Мультимодальная RAG система (PDF parsing)",
|
|
|
|
| 18 |
layout="wide",
|
| 19 |
initial_sidebar_state="expanded"
|
| 20 |
)
|
| 21 |
|
| 22 |
+
|
|
|
|
|
|
|
| 23 |
|
| 24 |
if 'api_key_set' not in st.session_state:
|
| 25 |
st.session_state.api_key_set = False
|
|
|
|
| 27 |
if 'api_key' not in st.session_state:
|
| 28 |
st.session_state.api_key = None
|
| 29 |
|
| 30 |
+
if 'visual_rag_system' not in st.session_state:
|
| 31 |
st.session_state.visual_rag_system = None
|
| 32 |
|
| 33 |
if 'vector_store' not in st.session_state:
|
|
|
|
| 48 |
if 'current_tables' not in st.session_state:
|
| 49 |
st.session_state.current_tables = None
|
| 50 |
|
| 51 |
+
if 'processing_results' not in st.session_state:
|
| 52 |
st.session_state.processing_results = None
|
| 53 |
|
| 54 |
if 'answering_rag' not in st.session_state:
|
| 55 |
st.session_state.answering_rag = None
|
| 56 |
|
| 57 |
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
+
st.title("Мультимодальная RAG система (PDF parsing)")
|
| 60 |
st.markdown("""
|
| 61 |
+
Обрабатывает PDF документы и предоставляет информацию по ним
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
""")
|
| 63 |
|
| 64 |
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
with st.sidebar:
|
| 67 |
+
st.header(" Конфигурация")
|
| 68 |
|
| 69 |
+
st.subheader(" OpenAI API Ключ")
|
|
|
|
| 70 |
|
| 71 |
api_key = st.text_input(
|
| 72 |
+
"Введите OpenAI API ключ:",
|
| 73 |
type="password",
|
| 74 |
key="api_key_input"
|
| 75 |
)
|
|
|
|
| 78 |
st.session_state.api_key = api_key
|
| 79 |
st.session_state.api_key_set = True
|
| 80 |
|
|
|
|
| 81 |
if st.session_state.visual_rag_system is None:
|
| 82 |
try:
|
| 83 |
st.session_state.visual_rag_system = VisualMultimodalRAG(api_key=api_key, debug=True) # NEW
|
| 84 |
st.session_state.vector_store = VectorStore()
|
| 85 |
st.session_state.parser = PDFParser(debug=True)
|
| 86 |
+
st.success("API ключ введен")
|
| 87 |
except Exception as e:
|
| 88 |
+
st.error(f"Ошибка старта системы: {e}")
|
| 89 |
else:
|
| 90 |
st.session_state.api_key_set = False
|
| 91 |
+
st.warning("Введите OpenAI API ключ")
|
| 92 |
|
| 93 |
st.divider()
|
| 94 |
|
| 95 |
+
st.subheader("Векторное хранилище")
|
|
|
|
| 96 |
if st.session_state.vector_store:
|
| 97 |
try:
|
| 98 |
info = st.session_state.vector_store.get_collection_info()
|
| 99 |
+
st.metric("Документов в хранилище", info['count'])
|
| 100 |
+
st.caption(f"Расположение: {info['persist_path']}")
|
|
|
|
| 101 |
except Exception as e:
|
| 102 |
+
st.error(f"Ошибка получения информации: {e}")
|
| 103 |
else:
|
| 104 |
+
st.info("Введите OpenAI API ключ")
|
| 105 |
|
| 106 |
st.divider()
|
| 107 |
|
| 108 |
+
st.subheader("Управление хранилищем")
|
| 109 |
+
if st.button("Очистить хранилище"):
|
|
|
|
| 110 |
if st.session_state.vector_store:
|
| 111 |
try:
|
| 112 |
st.session_state.vector_store.clear_all()
|
| 113 |
+
st.success("Хранилище очищено")
|
| 114 |
except Exception as e:
|
| 115 |
+
st.error(f"Ошибка очистки хранилища: {e}")
|
| 116 |
|
| 117 |
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
+
st.header("Загрузить PDF")
|
|
|
|
| 120 |
|
| 121 |
uploaded_file = st.file_uploader(
|
| 122 |
+
"Выбрать...",
|
| 123 |
type=['pdf'],
|
| 124 |
+
help="Загрузите PDF файл"
|
| 125 |
)
|
| 126 |
|
| 127 |
if uploaded_file is not None:
|
|
|
|
| 128 |
upload_path = Path(UPLOAD_FOLDER)
|
| 129 |
upload_path.mkdir(exist_ok=True)
|
| 130 |
|
|
|
|
| 132 |
with open(file_path, 'wb') as f:
|
| 133 |
f.write(uploaded_file.getbuffer())
|
| 134 |
|
| 135 |
+
st.success(f"Файл загружен: {uploaded_file.name}")
|
| 136 |
|
| 137 |
+
if st.button("Распарсить PDF"):
|
|
|
|
| 138 |
if not st.session_state.api_key_set:
|
| 139 |
+
st.error("Введите OpenAI API ключ")
|
| 140 |
else:
|
| 141 |
try:
|
| 142 |
+
with st.spinner(" Парсинг PDF..."):
|
| 143 |
+
|
| 144 |
+
print(f"Парсинг PDF файла: {uploaded_file.name}")
|
| 145 |
+
|
| 146 |
|
|
|
|
| 147 |
parser = st.session_state.parser
|
| 148 |
text, images, tables = parser.parse_pdf(str(file_path))
|
| 149 |
|
|
|
|
| 150 |
st.session_state.current_document = uploaded_file.name
|
| 151 |
st.session_state.current_text = text
|
| 152 |
st.session_state.current_images = images
|
| 153 |
st.session_state.current_tables = tables
|
| 154 |
|
|
|
|
| 155 |
col1, col2, col3 = st.columns(3)
|
| 156 |
with col1:
|
| 157 |
+
st.metric("Текста", f"{len(text):,} chars")
|
| 158 |
with col2:
|
| 159 |
+
st.metric("Изображений", len(images))
|
| 160 |
with col3:
|
| 161 |
+
st.metric("Таблиц", len(tables))
|
| 162 |
|
| 163 |
+
st.success("Парсинг PDF завершен!")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
except Exception as e:
|
| 166 |
+
st.error(f"Парсинг PDF завершелся с ошибкой: {e}")
|
| 167 |
+
print(f"Ошибка: {e}")
|
| 168 |
|
| 169 |
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
st.divider()
|
| 172 |
+
st.header("Анализ документа")
|
| 173 |
|
| 174 |
st.info("""
|
| 175 |
+
Отправляет содержимое документа на анализ
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
""")
|
| 177 |
|
| 178 |
+
if st.button("Проанализировать документ"):
|
| 179 |
if not st.session_state.api_key_set:
|
| 180 |
+
st.error("Введите OpenAI API ключ")
|
| 181 |
elif st.session_state.current_text is None:
|
| 182 |
+
st.error("Распарсите документ")
|
| 183 |
else:
|
| 184 |
try:
|
| 185 |
+
with st.spinner("Анализ с gpt-4o-mini..."):
|
|
|
|
|
|
|
|
|
|
| 186 |
|
|
|
|
| 187 |
visual_rag = st.session_state.visual_rag_system
|
| 188 |
vector_store = st.session_state.vector_store
|
| 189 |
|
| 190 |
results = visual_rag.process_and_store_document(
|
| 191 |
text=st.session_state.current_text,
|
| 192 |
+
images=st.session_state.current_images,
|
| 193 |
tables=st.session_state.current_tables,
|
| 194 |
vector_store=vector_store,
|
| 195 |
doc_id=st.session_state.current_document or "current_doc"
|
|
|
|
| 197 |
|
| 198 |
st.session_state.processing_results = results
|
| 199 |
|
| 200 |
+
st.success("Анализ готов!")
|
|
|
|
| 201 |
|
| 202 |
col1, col2, col3 = st.columns(3)
|
| 203 |
with col1:
|
| 204 |
+
st.metric("Проанализировано изображений", len(results['image_visual_analyses']))
|
| 205 |
with col2:
|
| 206 |
+
st.metric("Проанализировано чанков текста", len(results['text_summaries']))
|
| 207 |
with col3:
|
| 208 |
+
st.metric("Проанализировано таблиц", len(results['table_summaries']))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
+
st.metric("Помещено в хранилище", results['total_stored'])
|
| 211 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
+
print(f"Анализ завершен")
|
| 214 |
|
| 215 |
except Exception as e:
|
| 216 |
+
st.error(f"Ошибка в ходе: {e}")
|
| 217 |
+
print(f"Ошибка: {e}")
|
| 218 |
|
| 219 |
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
st.divider()
|
| 222 |
+
st.header("Работа с документом")
|
| 223 |
|
|
|
|
| 224 |
if 'answering_rag' not in st.session_state:
|
| 225 |
st.session_state.answering_rag = None
|
| 226 |
|
|
|
|
| 227 |
if st.session_state.api_key_set and st.session_state.answering_rag is None:
|
| 228 |
from rag_system import AnsweringRAG
|
| 229 |
st.session_state.answering_rag = AnsweringRAG(api_key=st.session_state.api_key, debug=True)
|
| 230 |
|
| 231 |
question = st.text_area(
|
| 232 |
+
"Введите запрос:",
|
| 233 |
height=100,
|
| 234 |
+
placeholder="О чем данный документ?"
|
| 235 |
)
|
| 236 |
|
| 237 |
+
if st.button("Генерация ответа"):
|
| 238 |
if not st.session_state.api_key_set:
|
| 239 |
+
st.error("Введите OpenAI API ключ")
|
| 240 |
elif st.session_state.current_text is None:
|
| 241 |
+
st.error("Распарсите документ")
|
| 242 |
elif not question:
|
| 243 |
+
st.error("Введите запрос")
|
| 244 |
else:
|
| 245 |
try:
|
| 246 |
+
with st.spinner("Поиск документов..."):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
store = st.session_state.vector_store
|
| 248 |
|
|
|
|
| 249 |
doc_name = st.session_state.current_document or "current_doc"
|
| 250 |
doc_data = {
|
| 251 |
'text': st.session_state.current_text,
|
|
|
|
| 254 |
}
|
| 255 |
store.add_documents(doc_data, doc_name)
|
| 256 |
|
|
|
|
| 257 |
search_results = store.search(question, n_results=5)
|
| 258 |
|
| 259 |
+
print(f"Найдено: {len(search_results)}")
|
| 260 |
|
|
|
|
| 261 |
answering_rag = st.session_state.answering_rag
|
| 262 |
result = answering_rag.analyze_and_answer(question, search_results)
|
| 263 |
|
| 264 |
+
st.success("Поиск завершен!")
|
|
|
|
| 265 |
|
| 266 |
+
st.subheader("Ответ")
|
| 267 |
|
|
|
|
| 268 |
col1, col2, col3 = st.columns(3)
|
| 269 |
with col1:
|
| 270 |
confidence_color = {
|
|
|
|
| 272 |
'medium': '🟡',
|
| 273 |
'low': '🔴'
|
| 274 |
}.get(result['confidence'], '⚪')
|
| 275 |
+
st.metric("Уверенность в ответе", f"{confidence_color} {result['confidence'].upper()}")
|
| 276 |
with col2:
|
| 277 |
+
st.metric("Использовано источников", result['sources_used'])
|
| 278 |
with col3:
|
| 279 |
if result['sources_used'] > 0:
|
| 280 |
+
st.metric("Среднняя релевантность", f"{sum(1-r.get('distance',0) for r in search_results)/len(search_results):.0%}")
|
| 281 |
|
|
|
|
| 282 |
st.write(result['answer'])
|
| 283 |
|
| 284 |
+
if st.checkbox("Показать исходные документы"):
|
| 285 |
+
st.subheader("Использованы докуме��ты")
|
|
|
|
| 286 |
for idx, source in enumerate(result['formatted_sources'], 1):
|
| 287 |
relevance = source['relevance']
|
| 288 |
+
relevance_bar = "\/" * int(relevance * 10) + "|" * (10 - int(relevance * 10))
|
| 289 |
|
| 290 |
with st.expander(
|
| 291 |
+
f"Источник {idx} - {source['type'].upper()} "
|
| 292 |
f"[{relevance_bar}] {relevance:.0%}"
|
| 293 |
):
|
| 294 |
st.write(source['content'])
|
| 295 |
|
| 296 |
+
print(f" Ответ готов!")
|
| 297 |
|
| 298 |
except Exception as e:
|
| 299 |
+
st.error(f"Ошибка обработки запроса: {e}")
|
| 300 |
+
print(f"Ошибка: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
|
| 302 |
st.divider()
|
| 303 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
st.caption(
|
| 305 |
+
"Мультимодальная RAG система для парсинга PDF документов"
|
|
|
|
|
|
|
|
|
|
| 306 |
)
|
src/config.py
CHANGED
|
@@ -1,42 +1,34 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
"""
|
| 4 |
import os
|
| 5 |
from pathlib import Path
|
| 6 |
|
| 7 |
-
# API Configuration
|
| 8 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 9 |
-
OPENAI_MODEL = "gpt-4o-mini"
|
| 10 |
-
USE_CACHE = True
|
| 11 |
|
| 12 |
-
# Vector Store Configuration
|
| 13 |
CHROMA_DB_PATH = "./chroma_db"
|
| 14 |
DOCSTORE_PATH = "./docstore"
|
| 15 |
PROCESSED_FILES_LOG = "./processed_files.txt"
|
| 16 |
|
| 17 |
-
# Embedding Model Configuration
|
| 18 |
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
|
| 19 |
EMBEDDING_DIM = 768
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
MAX_TOKENS = 500 # Limit response size (vs 1500)
|
| 26 |
|
| 27 |
-
# Language Support
|
| 28 |
LANGUAGE = "russian"
|
| 29 |
|
| 30 |
-
# Create necessary directories
|
| 31 |
Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
|
| 32 |
Path(DOCSTORE_PATH).mkdir(exist_ok=True)
|
| 33 |
|
| 34 |
-
# PDF Upload Configuration
|
| 35 |
UPLOAD_FOLDER = "./uploaded_pdfs"
|
| 36 |
Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
|
| 37 |
MAX_PDF_SIZE_MB = 50
|
| 38 |
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
SUMMARIZE_FIRST = True # Summarize PDFs once, not per query
|
|
|
|
| 1 |
"""
|
| 2 |
+
Конфигурационный файл
|
| 3 |
"""
|
| 4 |
import os
|
| 5 |
from pathlib import Path
|
| 6 |
|
|
|
|
| 7 |
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
| 8 |
+
OPENAI_MODEL = "gpt-4o-mini"
|
| 9 |
+
USE_CACHE = True
|
| 10 |
|
|
|
|
| 11 |
CHROMA_DB_PATH = "./chroma_db"
|
| 12 |
DOCSTORE_PATH = "./docstore"
|
| 13 |
PROCESSED_FILES_LOG = "./processed_files.txt"
|
| 14 |
|
|
|
|
| 15 |
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"
|
| 16 |
EMBEDDING_DIM = 768
|
| 17 |
|
| 18 |
+
MAX_CHUNK_SIZE = 500
|
| 19 |
+
CHUNK_OVERLAP = 50
|
| 20 |
+
TEMPERATURE = 0.3
|
| 21 |
+
MAX_TOKENS = 500
|
|
|
|
| 22 |
|
|
|
|
| 23 |
LANGUAGE = "russian"
|
| 24 |
|
|
|
|
| 25 |
Path(CHROMA_DB_PATH).mkdir(exist_ok=True)
|
| 26 |
Path(DOCSTORE_PATH).mkdir(exist_ok=True)
|
| 27 |
|
|
|
|
| 28 |
UPLOAD_FOLDER = "./uploaded_pdfs"
|
| 29 |
Path(UPLOAD_FOLDER).mkdir(exist_ok=True)
|
| 30 |
MAX_PDF_SIZE_MB = 50
|
| 31 |
|
| 32 |
+
BATCH_SEARCH_RESULTS = 3
|
| 33 |
+
CACHE_RESPONSES = True
|
| 34 |
+
SUMMARIZE_FIRST = True
|
|
|
src/pdf_parser.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
PDF
|
| 3 |
"""
|
| 4 |
import os
|
| 5 |
import json
|
|
@@ -20,27 +20,14 @@ class PDFParser:
|
|
| 20 |
self.processed_files = self._load_processed_files()
|
| 21 |
self.debug = debug
|
| 22 |
|
| 23 |
-
|
| 24 |
self._configure_tesseract()
|
| 25 |
|
| 26 |
if self.debug:
|
| 27 |
-
print("
|
| 28 |
-
|
| 29 |
-
def _configure_tesseract(self):
|
| 30 |
-
"""Configure Tesseract with proper paths and language support"""
|
| 31 |
-
try:
|
| 32 |
-
# Windows specific path
|
| 33 |
-
if os.name == 'nt':
|
| 34 |
-
pytesseract.pytesseract.pytesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
|
| 35 |
-
|
| 36 |
-
# Test Tesseract
|
| 37 |
-
pytesseract.get_tesseract_version()
|
| 38 |
-
print("✅ Tesseract configured successfully")
|
| 39 |
-
except Exception as e:
|
| 40 |
-
print(f"⚠️ Tesseract configuration warning: {e}")
|
| 41 |
|
| 42 |
def _debug_print(self, label: str, data: any):
|
| 43 |
-
"""
|
| 44 |
if self.debug:
|
| 45 |
print(f"\n🔍 [PDF Parser] {label}")
|
| 46 |
if isinstance(data, dict):
|
|
@@ -54,7 +41,7 @@ class PDFParser:
|
|
| 54 |
print(f" {data}")
|
| 55 |
|
| 56 |
def _load_processed_files(self) -> Dict[str, str]:
|
| 57 |
-
"""
|
| 58 |
if os.path.exists(PROCESSED_FILES_LOG):
|
| 59 |
try:
|
| 60 |
with open(PROCESSED_FILES_LOG, 'r') as f:
|
|
@@ -64,12 +51,12 @@ class PDFParser:
|
|
| 64 |
return {}
|
| 65 |
|
| 66 |
def _save_processed_files(self):
|
| 67 |
-
"""
|
| 68 |
with open(PROCESSED_FILES_LOG, 'w') as f:
|
| 69 |
json.dump(self.processed_files, f, indent=2)
|
| 70 |
|
| 71 |
def _get_file_hash(self, file_path: str) -> str:
|
| 72 |
-
"""
|
| 73 |
hash_md5 = hashlib.md5()
|
| 74 |
with open(file_path, "rb") as f:
|
| 75 |
for chunk in iter(lambda: f.read(4096), b""):
|
|
@@ -77,7 +64,7 @@ class PDFParser:
|
|
| 77 |
return hash_md5.hexdigest()
|
| 78 |
|
| 79 |
def _extract_text_from_pdf(self, pdf_path: str) -> str:
|
| 80 |
-
"""
|
| 81 |
text = ""
|
| 82 |
try:
|
| 83 |
with open(pdf_path, 'rb') as file:
|
|
@@ -96,40 +83,36 @@ class PDFParser:
|
|
| 96 |
return text
|
| 97 |
|
| 98 |
def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 99 |
-
"""
|
| 100 |
images_data = []
|
| 101 |
try:
|
| 102 |
-
self._debug_print("Image
|
| 103 |
|
| 104 |
images = convert_from_path(pdf_path, dpi=150)
|
| 105 |
-
self._debug_print(
|
| 106 |
|
| 107 |
for idx, image in enumerate(images):
|
| 108 |
-
self._debug_print(f"
|
| 109 |
|
| 110 |
-
# Save image
|
| 111 |
image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
|
| 112 |
image.save(image_path)
|
| 113 |
self._debug_print(f"Image {idx} Saved", str(image_path))
|
| 114 |
|
| 115 |
-
|
| 116 |
-
self._debug_print(f"Image {idx} OCR", "Running Tesseract OCR with Russian+English...")
|
| 117 |
|
| 118 |
try:
|
| 119 |
-
# CRITICAL: Use 'rus+eng' for Russian + English support
|
| 120 |
ocr_text = pytesseract.image_to_string(image, lang='rus')
|
| 121 |
|
| 122 |
-
# Clean up text
|
| 123 |
ocr_text = ocr_text.strip()
|
| 124 |
|
| 125 |
if not ocr_text or len(ocr_text) < 5:
|
| 126 |
-
self._debug_print(f"Image {idx} OCR Result", f"
|
| 127 |
else:
|
| 128 |
-
self._debug_print(f"Image {idx} OCR Result", f"
|
| 129 |
|
| 130 |
except Exception as ocr_error:
|
| 131 |
self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
|
| 132 |
-
ocr_text = f"[Image {idx}: OCR failed
|
| 133 |
|
| 134 |
images_data.append({
|
| 135 |
'page': idx,
|
|
@@ -144,13 +127,13 @@ class PDFParser:
|
|
| 144 |
return images_data
|
| 145 |
|
| 146 |
def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 147 |
-
"""
|
| 148 |
tables_data = []
|
| 149 |
try:
|
| 150 |
text = self._extract_text_from_pdf(pdf_path)
|
| 151 |
lines = text.split('\n')
|
| 152 |
|
| 153 |
-
self._debug_print("Table
|
| 154 |
|
| 155 |
current_table = []
|
| 156 |
for line in lines:
|
|
@@ -177,44 +160,39 @@ class PDFParser:
|
|
| 177 |
return tables_data
|
| 178 |
|
| 179 |
def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 180 |
-
"""
|
| 181 |
file_hash = self._get_file_hash(pdf_path)
|
| 182 |
doc_id = Path(pdf_path).stem
|
| 183 |
|
| 184 |
-
self._debug_print("PDF Parsing Started", f"File: {doc_id}
|
| 185 |
|
| 186 |
-
# Check if file was already processed
|
| 187 |
if doc_id in self.processed_files:
|
| 188 |
if self.processed_files[doc_id] == file_hash:
|
| 189 |
-
self._debug_print("Status", f"File {doc_id} already processed
|
| 190 |
return self._load_extracted_data(doc_id)
|
| 191 |
|
| 192 |
-
print(f"\
|
| 193 |
|
| 194 |
-
# Extract content
|
| 195 |
text = self._extract_text_from_pdf(pdf_path)
|
| 196 |
images = self._extract_images_from_pdf(pdf_path, doc_id)
|
| 197 |
tables = self._extract_tables_from_pdf(pdf_path, doc_id)
|
| 198 |
|
| 199 |
-
|
| 200 |
-
self._debug_print("Extraction Summary", {
|
| 201 |
'text_length': len(text),
|
| 202 |
'images_count': len(images),
|
| 203 |
'tables_count': len(tables),
|
| 204 |
'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
|
| 205 |
})
|
| 206 |
|
| 207 |
-
# Save extracted data
|
| 208 |
self._save_extracted_data(doc_id, text, images, tables)
|
| 209 |
|
| 210 |
-
# Update processed files log
|
| 211 |
self.processed_files[doc_id] = file_hash
|
| 212 |
self._save_processed_files()
|
| 213 |
|
| 214 |
return text, images, tables
|
| 215 |
|
| 216 |
def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
|
| 217 |
-
"""
|
| 218 |
data = {
|
| 219 |
'text': text,
|
| 220 |
'images': images,
|
|
@@ -227,7 +205,7 @@ class PDFParser:
|
|
| 227 |
self._debug_print("Data Saved", str(data_path))
|
| 228 |
|
| 229 |
def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 230 |
-
"""
|
| 231 |
data_path = self.docstore_path / f"{doc_id}_data.json"
|
| 232 |
try:
|
| 233 |
with open(data_path, 'r', encoding='utf-8') as f:
|
|
@@ -237,7 +215,7 @@ class PDFParser:
|
|
| 237 |
return "", [], []
|
| 238 |
|
| 239 |
def get_all_documents(self) -> Dict:
|
| 240 |
-
"""
|
| 241 |
all_docs = {}
|
| 242 |
for json_file in self.docstore_path.glob("*_data.json"):
|
| 243 |
doc_id = json_file.stem.replace("_data", "")
|
|
|
|
| 1 |
"""
|
| 2 |
+
PDF Парсер
|
| 3 |
"""
|
| 4 |
import os
|
| 5 |
import json
|
|
|
|
| 20 |
self.processed_files = self._load_processed_files()
|
| 21 |
self.debug = debug
|
| 22 |
|
| 23 |
+
|
| 24 |
self._configure_tesseract()
|
| 25 |
|
| 26 |
if self.debug:
|
| 27 |
+
print("PDFParser initialized")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
def _debug_print(self, label: str, data: any):
|
| 30 |
+
"""Debug"""
|
| 31 |
if self.debug:
|
| 32 |
print(f"\n🔍 [PDF Parser] {label}")
|
| 33 |
if isinstance(data, dict):
|
|
|
|
| 41 |
print(f" {data}")
|
| 42 |
|
| 43 |
def _load_processed_files(self) -> Dict[str, str]:
|
| 44 |
+
"""Подгрузка обработанных файлов"""
|
| 45 |
if os.path.exists(PROCESSED_FILES_LOG):
|
| 46 |
try:
|
| 47 |
with open(PROCESSED_FILES_LOG, 'r') as f:
|
|
|
|
| 51 |
return {}
|
| 52 |
|
| 53 |
def _save_processed_files(self):
|
| 54 |
+
"""Сохранение обработанных файлов"""
|
| 55 |
with open(PROCESSED_FILES_LOG, 'w') as f:
|
| 56 |
json.dump(self.processed_files, f, indent=2)
|
| 57 |
|
| 58 |
def _get_file_hash(self, file_path: str) -> str:
|
| 59 |
+
"""Проверка изменения файлов"""
|
| 60 |
hash_md5 = hashlib.md5()
|
| 61 |
with open(file_path, "rb") as f:
|
| 62 |
for chunk in iter(lambda: f.read(4096), b""):
|
|
|
|
| 64 |
return hash_md5.hexdigest()
|
| 65 |
|
| 66 |
def _extract_text_from_pdf(self, pdf_path: str) -> str:
|
| 67 |
+
"""Извлечение текста из PDF"""
|
| 68 |
text = ""
|
| 69 |
try:
|
| 70 |
with open(pdf_path, 'rb') as file:
|
|
|
|
| 83 |
return text
|
| 84 |
|
| 85 |
def _extract_images_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 86 |
+
"""Извлечение изображений из PDF"""
|
| 87 |
images_data = []
|
| 88 |
try:
|
| 89 |
+
self._debug_print("Image extraction", f"File: {pdf_path}")
|
| 90 |
|
| 91 |
images = convert_from_path(pdf_path, dpi=150)
|
| 92 |
+
self._debug_print(f"Total images: {len(images)}")
|
| 93 |
|
| 94 |
for idx, image in enumerate(images):
|
| 95 |
+
self._debug_print(f"Image {idx}", f"Size: {image.size}")
|
| 96 |
|
|
|
|
| 97 |
image_path = self.docstore_path / f"{doc_id}_image_{idx}.png"
|
| 98 |
image.save(image_path)
|
| 99 |
self._debug_print(f"Image {idx} Saved", str(image_path))
|
| 100 |
|
| 101 |
+
self._debug_print(f"Image {idx} OCR", "Running OCR...")
|
|
|
|
| 102 |
|
| 103 |
try:
|
|
|
|
| 104 |
ocr_text = pytesseract.image_to_string(image, lang='rus')
|
| 105 |
|
|
|
|
| 106 |
ocr_text = ocr_text.strip()
|
| 107 |
|
| 108 |
if not ocr_text or len(ocr_text) < 5:
|
| 109 |
+
self._debug_print(f"Image {idx} OCR Result", f"WARN ({len(ocr_text)} chars)")
|
| 110 |
else:
|
| 111 |
+
self._debug_print(f"Image {idx} OCR Result", f"SUCCESS {len(ocr_text)} chars: {ocr_text[:150]}")
|
| 112 |
|
| 113 |
except Exception as ocr_error:
|
| 114 |
self._debug_print(f"Image {idx} OCR ERROR", str(ocr_error))
|
| 115 |
+
ocr_text = f"[Image {idx}: OCR failed {str(ocr_error)}]"
|
| 116 |
|
| 117 |
images_data.append({
|
| 118 |
'page': idx,
|
|
|
|
| 127 |
return images_data
|
| 128 |
|
| 129 |
def _extract_tables_from_pdf(self, pdf_path: str, doc_id: str) -> List[Dict]:
|
| 130 |
+
"""Извлечение таблиц из PDF"""
|
| 131 |
tables_data = []
|
| 132 |
try:
|
| 133 |
text = self._extract_text_from_pdf(pdf_path)
|
| 134 |
lines = text.split('\n')
|
| 135 |
|
| 136 |
+
self._debug_print("Table extraction", f"Scanning {len(lines)} lines")
|
| 137 |
|
| 138 |
current_table = []
|
| 139 |
for line in lines:
|
|
|
|
| 160 |
return tables_data
|
| 161 |
|
| 162 |
def parse_pdf(self, pdf_path: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 163 |
+
"""Парсинг PDF"""
|
| 164 |
file_hash = self._get_file_hash(pdf_path)
|
| 165 |
doc_id = Path(pdf_path).stem
|
| 166 |
|
| 167 |
+
self._debug_print("PDF Parsing Started", f"File: {doc_id}")
|
| 168 |
|
|
|
|
| 169 |
if doc_id in self.processed_files:
|
| 170 |
if self.processed_files[doc_id] == file_hash:
|
| 171 |
+
self._debug_print("Status", f"File {doc_id} already processed")
|
| 172 |
return self._load_extracted_data(doc_id)
|
| 173 |
|
| 174 |
+
print(f"\nProcessing PDF: {doc_id}")
|
| 175 |
|
|
|
|
| 176 |
text = self._extract_text_from_pdf(pdf_path)
|
| 177 |
images = self._extract_images_from_pdf(pdf_path, doc_id)
|
| 178 |
tables = self._extract_tables_from_pdf(pdf_path, doc_id)
|
| 179 |
|
| 180 |
+
self._debug_print("Summary", {
|
|
|
|
| 181 |
'text_length': len(text),
|
| 182 |
'images_count': len(images),
|
| 183 |
'tables_count': len(tables),
|
| 184 |
'images_with_ocr': sum(1 for img in images if img.get('ocr_text', '').strip())
|
| 185 |
})
|
| 186 |
|
|
|
|
| 187 |
self._save_extracted_data(doc_id, text, images, tables)
|
| 188 |
|
|
|
|
| 189 |
self.processed_files[doc_id] = file_hash
|
| 190 |
self._save_processed_files()
|
| 191 |
|
| 192 |
return text, images, tables
|
| 193 |
|
| 194 |
def _save_extracted_data(self, doc_id: str, text: str, images: List[Dict], tables: List[Dict]):
|
| 195 |
+
"""Сохранение извелеченных данных в Docstore"""
|
| 196 |
data = {
|
| 197 |
'text': text,
|
| 198 |
'images': images,
|
|
|
|
| 205 |
self._debug_print("Data Saved", str(data_path))
|
| 206 |
|
| 207 |
def _load_extracted_data(self, doc_id: str) -> Tuple[str, List[Dict], List[Dict]]:
|
| 208 |
+
"""Подгрузка ранее извлеченных данных из Docstore"""
|
| 209 |
data_path = self.docstore_path / f"{doc_id}_data.json"
|
| 210 |
try:
|
| 211 |
with open(data_path, 'r', encoding='utf-8') as f:
|
|
|
|
| 215 |
return "", [], []
|
| 216 |
|
| 217 |
def get_all_documents(self) -> Dict:
|
| 218 |
+
"""Получение всех документов из Docstore"""
|
| 219 |
all_docs = {}
|
| 220 |
for json_file in self.docstore_path.glob("*_data.json"):
|
| 221 |
doc_id = json_file.stem.replace("_data", "")
|
src/rag_system.py
CHANGED
|
@@ -1,7 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
Sends base64 images directly to GPT-4o for visual analysis (not just OCR)
|
| 4 |
-
Then stores results in vector store
|
| 5 |
"""
|
| 6 |
from typing import List, Dict
|
| 7 |
from langchain_openai import ChatOpenAI
|
|
@@ -17,20 +15,18 @@ from config import (
|
|
| 17 |
|
| 18 |
class VisualMultimodalRAG:
|
| 19 |
"""
|
| 20 |
-
RAG
|
| 21 |
-
1.
|
| 22 |
-
2.
|
| 23 |
-
3.
|
| 24 |
-
4. Enables image-based semantic search
|
| 25 |
"""
|
| 26 |
|
| 27 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 28 |
api_key = api_key or OPENAI_API_KEY
|
| 29 |
self.debug = debug
|
| 30 |
|
| 31 |
-
# Use gpt-4o for vision capabilities
|
| 32 |
self.llm = ChatOpenAI(
|
| 33 |
-
model_name=
|
| 34 |
api_key=api_key,
|
| 35 |
temperature=TEMPERATURE,
|
| 36 |
max_tokens=MAX_TOKENS,
|
|
@@ -40,12 +36,12 @@ class VisualMultimodalRAG:
|
|
| 40 |
self.visual_summaries_log = []
|
| 41 |
|
| 42 |
if self.debug:
|
| 43 |
-
print("
|
| 44 |
|
| 45 |
def _debug_print(self, label: str, data: any):
|
| 46 |
-
"""
|
| 47 |
if self.debug:
|
| 48 |
-
print(f"\
|
| 49 |
if isinstance(data, (list, dict)):
|
| 50 |
print(f" Type: {type(data).__name__}")
|
| 51 |
print(f" Content: {str(data)[:300]}...")
|
|
@@ -53,7 +49,7 @@ class VisualMultimodalRAG:
|
|
| 53 |
print(f" {data}")
|
| 54 |
|
| 55 |
def _image_to_base64(self, image_path: str) -> str:
|
| 56 |
-
"""
|
| 57 |
try:
|
| 58 |
with open(image_path, 'rb') as image_file:
|
| 59 |
image_data = base64.b64encode(image_file.read()).decode('utf-8')
|
|
@@ -64,27 +60,16 @@ class VisualMultimodalRAG:
|
|
| 64 |
|
| 65 |
def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
|
| 66 |
"""
|
| 67 |
-
|
| 68 |
-
Returns detailed visual analysis/description
|
| 69 |
-
|
| 70 |
-
gpt-4o can see:
|
| 71 |
-
- Charts, graphs, diagrams
|
| 72 |
-
- Tables and structured data
|
| 73 |
-
- Photos and drawings
|
| 74 |
-
- Handwritten text
|
| 75 |
-
- Screenshots
|
| 76 |
-
- Any visual content
|
| 77 |
"""
|
| 78 |
if not os.path.exists(image_path):
|
| 79 |
return f"[Image {image_idx}: File not found - {image_path}]"
|
| 80 |
|
| 81 |
try:
|
| 82 |
-
# Convert image to base64
|
| 83 |
image_base64 = self._image_to_base64(image_path)
|
| 84 |
if not image_base64:
|
| 85 |
-
return f"[Image {image_idx}:
|
| 86 |
|
| 87 |
-
# Determine image type
|
| 88 |
file_ext = Path(image_path).suffix.lower()
|
| 89 |
media_type_map = {
|
| 90 |
'.jpg': 'image/jpeg',
|
|
@@ -95,9 +80,8 @@ class VisualMultimodalRAG:
|
|
| 95 |
}
|
| 96 |
media_type = media_type_map.get(file_ext, 'image/png')
|
| 97 |
|
| 98 |
-
print(f"
|
| 99 |
|
| 100 |
-
# Create message with image
|
| 101 |
message = HumanMessage(
|
| 102 |
content=[
|
| 103 |
{
|
|
@@ -108,41 +92,38 @@ class VisualMultimodalRAG:
|
|
| 108 |
},
|
| 109 |
{
|
| 110 |
"type": "text",
|
| 111 |
-
"text": f"""
|
| 112 |
|
| 113 |
-
|
| 114 |
-
1.
|
| 115 |
-
2.
|
| 116 |
-
3.
|
| 117 |
-
4.
|
| 118 |
-
5. **Connections** - How this relates to document content
|
| 119 |
|
| 120 |
-
|
| 121 |
|
| 122 |
-
|
| 123 |
}
|
| 124 |
],
|
| 125 |
)
|
| 126 |
|
| 127 |
-
# Call gpt-4o with vision
|
| 128 |
response = self.llm.invoke([message])
|
| 129 |
analysis = response.content.strip()
|
| 130 |
|
| 131 |
if self.debug:
|
| 132 |
self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
|
| 133 |
|
| 134 |
-
print(f"
|
| 135 |
return analysis
|
| 136 |
|
| 137 |
except Exception as e:
|
| 138 |
error_msg = f"[Image {image_idx}: Vision analysis failed - {str(e)}]"
|
| 139 |
-
print(f"
|
| 140 |
return error_msg
|
| 141 |
|
| 142 |
def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
|
| 143 |
"""
|
| 144 |
-
|
| 145 |
-
Returns list of {image_index, visual_analysis, type}
|
| 146 |
"""
|
| 147 |
visual_analyses = []
|
| 148 |
|
|
@@ -150,10 +131,9 @@ Analysis:"""
|
|
| 150 |
image_path = image.get('path', '')
|
| 151 |
|
| 152 |
if not image_path:
|
| 153 |
-
print(f"
|
| 154 |
continue
|
| 155 |
|
| 156 |
-
# Analyze image visually (not just OCR)
|
| 157 |
visual_analysis = self.analyze_image_visually(image_path, idx)
|
| 158 |
|
| 159 |
visual_analyses.append({
|
|
@@ -161,14 +141,14 @@ Analysis:"""
|
|
| 161 |
'image_index': idx,
|
| 162 |
'image_path': image_path,
|
| 163 |
'visual_analysis': visual_analysis,
|
| 164 |
-
'ocr_text': image.get('ocr_text', '')
|
| 165 |
})
|
| 166 |
|
| 167 |
return visual_analyses
|
| 168 |
|
| 169 |
def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
|
| 170 |
"""
|
| 171 |
-
|
| 172 |
"""
|
| 173 |
chunks = []
|
| 174 |
text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
|
|
@@ -180,13 +160,13 @@ Analysis:"""
|
|
| 180 |
continue
|
| 181 |
|
| 182 |
try:
|
| 183 |
-
prompt = f"""
|
| 184 |
-
|
| 185 |
|
| 186 |
-
|
| 187 |
{chunk}
|
| 188 |
|
| 189 |
-
|
| 190 |
|
| 191 |
message = HumanMessage(content=prompt)
|
| 192 |
response = self.llm.invoke([message])
|
|
@@ -210,7 +190,7 @@ Summary (2-3 sentences maximum):"""
|
|
| 210 |
|
| 211 |
def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
|
| 212 |
"""
|
| 213 |
-
|
| 214 |
"""
|
| 215 |
summaries = []
|
| 216 |
|
|
@@ -221,13 +201,13 @@ Summary (2-3 sentences maximum):"""
|
|
| 221 |
continue
|
| 222 |
|
| 223 |
try:
|
| 224 |
-
prompt = f"""
|
| 225 |
-
|
| 226 |
|
| 227 |
-
|
| 228 |
{table_content}
|
| 229 |
|
| 230 |
-
|
| 231 |
|
| 232 |
message = HumanMessage(content=prompt)
|
| 233 |
response = self.llm.invoke([message])
|
|
@@ -258,12 +238,10 @@ Summary (2-3 sentences maximum):"""
|
|
| 258 |
doc_id: str
|
| 259 |
) -> Dict:
|
| 260 |
"""
|
| 261 |
-
|
| 262 |
-
Images are analyzed using gpt-4o vision (not just OCR)
|
| 263 |
"""
|
| 264 |
-
|
| 265 |
-
print(f"PROCESSING
|
| 266 |
-
print(f"{'='*70}")
|
| 267 |
|
| 268 |
results = {
|
| 269 |
'doc_id': doc_id,
|
|
@@ -273,14 +251,12 @@ Summary (2-3 sentences maximum):"""
|
|
| 273 |
'total_stored': 0
|
| 274 |
}
|
| 275 |
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
print(f"{'─'*70}")
|
| 279 |
|
| 280 |
image_analyses = self.analyze_images_visually(images)
|
| 281 |
results['image_visual_analyses'] = image_analyses
|
| 282 |
|
| 283 |
-
# Store each image analysis in vector store
|
| 284 |
image_docs = {
|
| 285 |
'text': ' | '.join([
|
| 286 |
f"Image {a['image_index']}: {a['visual_analysis']}"
|
|
@@ -291,7 +267,7 @@ Summary (2-3 sentences maximum):"""
|
|
| 291 |
}
|
| 292 |
|
| 293 |
for analysis in image_analyses:
|
| 294 |
-
print(f"
|
| 295 |
print(f" Path: {analysis['image_path']}")
|
| 296 |
print(f" Analysis: {analysis['visual_analysis'][:100]}...")
|
| 297 |
|
|
@@ -302,13 +278,11 @@ Summary (2-3 sentences maximum):"""
|
|
| 302 |
f"{doc_id}_images_visual"
|
| 303 |
)
|
| 304 |
results['total_stored'] += len(image_analyses)
|
| 305 |
-
print(f"
|
| 306 |
except Exception as e:
|
| 307 |
-
print(f"
|
| 308 |
|
| 309 |
-
|
| 310 |
-
print(f"\n📝 TEXT CHUNK SUMMARIZATION")
|
| 311 |
-
print(f"{'─'*70}")
|
| 312 |
|
| 313 |
text_summaries = self.summarize_text_chunks(text)
|
| 314 |
results['text_summaries'] = text_summaries
|
|
@@ -321,7 +295,7 @@ Summary (2-3 sentences maximum):"""
|
|
| 321 |
}
|
| 322 |
|
| 323 |
for summary in text_summaries:
|
| 324 |
-
print(f"
|
| 325 |
|
| 326 |
if text_summaries:
|
| 327 |
try:
|
|
@@ -330,13 +304,11 @@ Summary (2-3 sentences maximum):"""
|
|
| 330 |
f"{doc_id}_text_chunks"
|
| 331 |
)
|
| 332 |
results['total_stored'] += len(text_summaries)
|
| 333 |
-
print(f"
|
| 334 |
except Exception as e:
|
| 335 |
-
print(f"
|
| 336 |
|
| 337 |
-
|
| 338 |
-
print(f"\n📋 TABLE SUMMARIZATION ({len(tables)} total)")
|
| 339 |
-
print(f"{'─'*70}")
|
| 340 |
|
| 341 |
table_summaries = self.summarize_tables(tables)
|
| 342 |
results['table_summaries'] = table_summaries
|
|
@@ -349,7 +321,7 @@ Summary (2-3 sentences maximum):"""
|
|
| 349 |
}
|
| 350 |
|
| 351 |
for summary in table_summaries:
|
| 352 |
-
print(f"
|
| 353 |
|
| 354 |
if table_summaries:
|
| 355 |
try:
|
|
@@ -358,25 +330,20 @@ Summary (2-3 sentences maximum):"""
|
|
| 358 |
f"{doc_id}_tables"
|
| 359 |
)
|
| 360 |
results['total_stored'] += len(table_summaries)
|
| 361 |
-
print(f"
|
| 362 |
except Exception as e:
|
| 363 |
-
print(f"
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
print(f"
|
| 367 |
-
print(f"
|
| 368 |
-
print(f"{
|
| 369 |
-
print(f" Images analyzed visually & stored: {len(image_analyses)}")
|
| 370 |
-
print(f" Text chunks summarized & stored: {len(text_summaries)}")
|
| 371 |
-
print(f" Tables summarized & stored: {len(table_summaries)}")
|
| 372 |
print(f" Total items stored in vector: {results['total_stored']}")
|
| 373 |
-
print(f"{'='*70}")
|
| 374 |
|
| 375 |
self.visual_summaries_log.append(results)
|
| 376 |
return results
|
| 377 |
|
| 378 |
def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
|
| 379 |
-
"""Split text into overlapping chunks"""
|
| 380 |
chunks = []
|
| 381 |
start = 0
|
| 382 |
while start < len(text):
|
|
@@ -386,16 +353,15 @@ Summary (2-3 sentences maximum):"""
|
|
| 386 |
return chunks
|
| 387 |
|
| 388 |
def get_visual_summaries_log(self) -> List[Dict]:
|
| 389 |
-
"""Get all visual analysis logs"""
|
| 390 |
return self.visual_summaries_log
|
| 391 |
|
| 392 |
|
| 393 |
class AnsweringRAG:
|
| 394 |
"""
|
| 395 |
-
RAG
|
| 396 |
-
1.
|
| 397 |
-
2.
|
| 398 |
-
3.
|
| 399 |
"""
|
| 400 |
|
| 401 |
def __init__(self, api_key: str = None, debug: bool = True):
|
|
@@ -403,7 +369,7 @@ class AnsweringRAG:
|
|
| 403 |
self.debug = debug
|
| 404 |
|
| 405 |
self.llm = ChatOpenAI(
|
| 406 |
-
model_name=
|
| 407 |
api_key=api_key,
|
| 408 |
temperature=TEMPERATURE,
|
| 409 |
max_tokens=MAX_TOKENS,
|
|
@@ -413,10 +379,10 @@ class AnsweringRAG:
|
|
| 413 |
self.answer_log = []
|
| 414 |
|
| 415 |
if self.debug:
|
| 416 |
-
print("
|
| 417 |
|
| 418 |
def _debug_print(self, label: str, data: any):
|
| 419 |
-
"""
|
| 420 |
if self.debug:
|
| 421 |
print(f"\n🔍 DEBUG [{label}]:")
|
| 422 |
if isinstance(data, (list, dict)):
|
|
@@ -431,9 +397,9 @@ class AnsweringRAG:
|
|
| 431 |
search_results: List[Dict]
|
| 432 |
) -> Dict:
|
| 433 |
"""
|
| 434 |
-
|
| 435 |
|
| 436 |
-
|
| 437 |
{
|
| 438 |
'question': user question,
|
| 439 |
'answer': detailed answer,
|
|
@@ -443,22 +409,15 @@ class AnsweringRAG:
|
|
| 443 |
}
|
| 444 |
"""
|
| 445 |
|
| 446 |
-
print(f"\n{'='*70}")
|
| 447 |
print(f"ANALYZING QUESTION & GENERATING ANSWER")
|
| 448 |
-
print(f"{'='*70}")
|
| 449 |
|
| 450 |
-
print(f"\n
|
| 451 |
-
print(f"
|
| 452 |
|
| 453 |
-
# Check if we have search results
|
| 454 |
if not search_results:
|
| 455 |
-
print(f"
|
| 456 |
-
answer = f"""
|
| 457 |
-
|
| 458 |
-
Try:
|
| 459 |
-
- Using different keywords
|
| 460 |
-
- Breaking the question into smaller parts
|
| 461 |
-
- Asking about other topics in the document"""
|
| 462 |
|
| 463 |
result = {
|
| 464 |
'question': question,
|
|
@@ -470,7 +429,6 @@ Try:
|
|
| 470 |
self.answer_log.append(result)
|
| 471 |
return result
|
| 472 |
|
| 473 |
-
# Build context from search results
|
| 474 |
context_parts = []
|
| 475 |
for idx, result in enumerate(search_results, 1):
|
| 476 |
content = result.get('content', '')
|
|
@@ -485,43 +443,39 @@ Try:
|
|
| 485 |
|
| 486 |
full_context = "\n".join(context_parts)
|
| 487 |
|
| 488 |
-
self._debug_print("Context Prepared", f"{len(context_parts)} sources
|
| 489 |
|
| 490 |
-
|
| 491 |
-
analysis_prompt = f"""You are a helpful assistant analyzing document content to answer user questions.
|
| 492 |
|
| 493 |
-
|
| 494 |
"{question}"
|
| 495 |
|
| 496 |
-
|
| 497 |
{full_context}
|
| 498 |
|
| 499 |
-
|
| 500 |
-
1.
|
| 501 |
-
2.
|
| 502 |
-
3.
|
| 503 |
-
4.
|
| 504 |
-
5.
|
| 505 |
-
6. Structure your answer clearly with key points
|
| 506 |
|
| 507 |
-
|
| 508 |
|
| 509 |
-
print(f"\n
|
| 510 |
-
print(f" Context size: {len(full_context)}
|
| 511 |
print(f" Sources: {len(search_results)}")
|
| 512 |
|
| 513 |
try:
|
| 514 |
-
# Call LLM to analyze and answer
|
| 515 |
message = HumanMessage(content=analysis_prompt)
|
| 516 |
response = self.llm.invoke([message])
|
| 517 |
answer = response.content.strip()
|
| 518 |
|
| 519 |
-
# Determine confidence level
|
| 520 |
confidence = self._estimate_confidence(len(search_results), answer)
|
| 521 |
|
| 522 |
-
print(f"
|
| 523 |
print(f" Confidence: {confidence}")
|
| 524 |
-
print(f" Answer length: {len(answer)}
|
| 525 |
|
| 526 |
result = {
|
| 527 |
'question': question,
|
|
@@ -535,8 +489,8 @@ ANSWER:"""
|
|
| 535 |
return result
|
| 536 |
|
| 537 |
except Exception as e:
|
| 538 |
-
print(f"
|
| 539 |
-
answer = f"
|
| 540 |
|
| 541 |
result = {
|
| 542 |
'question': question,
|
|
@@ -551,66 +505,14 @@ ANSWER:"""
|
|
| 551 |
return result
|
| 552 |
|
| 553 |
def _estimate_confidence(self, sources_count: int, answer: str) -> str:
|
| 554 |
-
"""
|
| 555 |
answer_length = len(answer)
|
| 556 |
|
| 557 |
-
# High confidence: multiple sources, substantial answer
|
| 558 |
if sources_count >= 3 and answer_length > 500:
|
| 559 |
return "high"
|
| 560 |
|
| 561 |
-
# Medium confidence: some sources, decent answer
|
| 562 |
elif sources_count >= 2 and answer_length > 200:
|
| 563 |
return "medium"
|
| 564 |
|
| 565 |
-
# Low confidence: few sources or short answer
|
| 566 |
else:
|
| 567 |
return "low"
|
| 568 |
-
|
| 569 |
-
def get_answer_with_sources(
|
| 570 |
-
self,
|
| 571 |
-
question: str,
|
| 572 |
-
search_results: List[Dict]
|
| 573 |
-
) -> Dict:
|
| 574 |
-
"""
|
| 575 |
-
Get answer AND properly formatted sources
|
| 576 |
-
Returns both answer and formatted source citations
|
| 577 |
-
"""
|
| 578 |
-
|
| 579 |
-
result = self.analyze_and_answer(question, search_results)
|
| 580 |
-
|
| 581 |
-
# Format sources for display
|
| 582 |
-
formatted_sources = []
|
| 583 |
-
for idx, source in enumerate(result['search_results'], 1):
|
| 584 |
-
formatted_sources.append({
|
| 585 |
-
'index': idx,
|
| 586 |
-
'type': source.get('type', 'unknown'),
|
| 587 |
-
'content': source.get('content', ''),
|
| 588 |
-
'relevance': 1 - source.get('distance', 0) if source.get('distance') else 0
|
| 589 |
-
})
|
| 590 |
-
|
| 591 |
-
result['formatted_sources'] = formatted_sources
|
| 592 |
-
return result
|
| 593 |
-
|
| 594 |
-
def get_answer_log(self) -> List[Dict]:
|
| 595 |
-
"""Get all answer generation logs"""
|
| 596 |
-
return self.answer_log
|
| 597 |
-
|
| 598 |
-
def print_answer_with_sources(self, result: Dict, max_source_length: int = 300):
|
| 599 |
-
"""Pretty print answer with sources"""
|
| 600 |
-
|
| 601 |
-
print(f"\n{'='*70}")
|
| 602 |
-
print(f"ANSWER TO: {result['question']}")
|
| 603 |
-
print(f"{'='*70}")
|
| 604 |
-
|
| 605 |
-
print(f"\n📝 ANSWER (Confidence: {result['confidence'].upper()}):")
|
| 606 |
-
print(f"{'-'*70}")
|
| 607 |
-
print(result['answer'])
|
| 608 |
-
print(f"{'-'*70}")
|
| 609 |
-
|
| 610 |
-
if result.get('formatted_sources'):
|
| 611 |
-
print(f"\n📚 SOURCES USED ({len(result['formatted_sources'])} total):")
|
| 612 |
-
for source in result['formatted_sources']:
|
| 613 |
-
print(f"\n[Source {source['index']} - {source['type'].upper()} ({source['relevance']:.0%} relevant)]")
|
| 614 |
-
print(f"{source['content'][:max_source_length]}...")
|
| 615 |
-
|
| 616 |
-
print(f"\n{'='*70}")
|
|
|
|
| 1 |
"""
|
| 2 |
+
RAG основной pipeline
|
|
|
|
|
|
|
| 3 |
"""
|
| 4 |
from typing import List, Dict
|
| 5 |
from langchain_openai import ChatOpenAI
|
|
|
|
| 15 |
|
| 16 |
class VisualMultimodalRAG:
|
| 17 |
"""
|
| 18 |
+
RAG - подготовительный этап:
|
| 19 |
+
1. Кодирует изображение в base64 и отправляет в gpt-4o-mini
|
| 20 |
+
2. Получает описание изображения
|
| 21 |
+
3. Сохраняет описание в векторное хранилище
|
|
|
|
| 22 |
"""
|
| 23 |
|
| 24 |
def __init__(self, api_key: str = None, debug: bool = True):
|
| 25 |
api_key = api_key or OPENAI_API_KEY
|
| 26 |
self.debug = debug
|
| 27 |
|
|
|
|
| 28 |
self.llm = ChatOpenAI(
|
| 29 |
+
model_name=OPENAI_MODEL,
|
| 30 |
api_key=api_key,
|
| 31 |
temperature=TEMPERATURE,
|
| 32 |
max_tokens=MAX_TOKENS,
|
|
|
|
| 36 |
self.visual_summaries_log = []
|
| 37 |
|
| 38 |
if self.debug:
|
| 39 |
+
print(f"VisualMultimodalRAG with {OPENAI_MODEL}")
|
| 40 |
|
| 41 |
def _debug_print(self, label: str, data: any):
|
| 42 |
+
"""Debug"""
|
| 43 |
if self.debug:
|
| 44 |
+
print(f"\nDEBUG [{label}]:")
|
| 45 |
if isinstance(data, (list, dict)):
|
| 46 |
print(f" Type: {type(data).__name__}")
|
| 47 |
print(f" Content: {str(data)[:300]}...")
|
|
|
|
| 49 |
print(f" {data}")
|
| 50 |
|
| 51 |
def _image_to_base64(self, image_path: str) -> str:
|
| 52 |
+
"""Конвертирует изображение в base64"""
|
| 53 |
try:
|
| 54 |
with open(image_path, 'rb') as image_file:
|
| 55 |
image_data = base64.b64encode(image_file.read()).decode('utf-8')
|
|
|
|
| 60 |
|
| 61 |
def analyze_image_visually(self, image_path: str, image_idx: int) -> str:
|
| 62 |
"""
|
| 63 |
+
Отправляет в модель изображение для суммаризации
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
"""
|
| 65 |
if not os.path.exists(image_path):
|
| 66 |
return f"[Image {image_idx}: File not found - {image_path}]"
|
| 67 |
|
| 68 |
try:
|
|
|
|
| 69 |
image_base64 = self._image_to_base64(image_path)
|
| 70 |
if not image_base64:
|
| 71 |
+
return f"[Image {image_idx}: Error converting to base64]"
|
| 72 |
|
|
|
|
| 73 |
file_ext = Path(image_path).suffix.lower()
|
| 74 |
media_type_map = {
|
| 75 |
'.jpg': 'image/jpeg',
|
|
|
|
| 80 |
}
|
| 81 |
media_type = media_type_map.get(file_ext, 'image/png')
|
| 82 |
|
| 83 |
+
print(f" Analyzing image {image_idx}...")
|
| 84 |
|
|
|
|
| 85 |
message = HumanMessage(
|
| 86 |
content=[
|
| 87 |
{
|
|
|
|
| 92 |
},
|
| 93 |
{
|
| 94 |
"type": "text",
|
| 95 |
+
"text": f"""Ты - ассистент по сбору и обобщению информации. Проанализируй изображение.
|
| 96 |
|
| 97 |
+
По результатам анализа предоставь информацию:
|
| 98 |
+
1. Что изображено на картинке - основные объекты и элементы
|
| 99 |
+
2. Тип данных и содержимое - числа, графики, зависимости.
|
| 100 |
+
3. Назначение изображения - для чего оно представлено и что отображает
|
| 101 |
+
4. Связь с текстом
|
|
|
|
| 102 |
|
| 103 |
+
Будь краток и содержателен. Фокусируйся на визуальной информации.
|
| 104 |
|
| 105 |
+
Результат:"""
|
| 106 |
}
|
| 107 |
],
|
| 108 |
)
|
| 109 |
|
|
|
|
| 110 |
response = self.llm.invoke([message])
|
| 111 |
analysis = response.content.strip()
|
| 112 |
|
| 113 |
if self.debug:
|
| 114 |
self._debug_print(f"Image {image_idx} Visual Analysis", analysis)
|
| 115 |
|
| 116 |
+
print(f" Image {image_idx} analyzed successfully")
|
| 117 |
return analysis
|
| 118 |
|
| 119 |
except Exception as e:
|
| 120 |
error_msg = f"[Image {image_idx}: Vision analysis failed - {str(e)}]"
|
| 121 |
+
print(f" Error analyzing image {image_idx}: {e}")
|
| 122 |
return error_msg
|
| 123 |
|
| 124 |
def analyze_images_visually(self, images: List[Dict]) -> List[Dict]:
|
| 125 |
"""
|
| 126 |
+
Считывает изображения и отправляет на анализ
|
|
|
|
| 127 |
"""
|
| 128 |
visual_analyses = []
|
| 129 |
|
|
|
|
| 131 |
image_path = image.get('path', '')
|
| 132 |
|
| 133 |
if not image_path:
|
| 134 |
+
print(f" Image {idx}: No path")
|
| 135 |
continue
|
| 136 |
|
|
|
|
| 137 |
visual_analysis = self.analyze_image_visually(image_path, idx)
|
| 138 |
|
| 139 |
visual_analyses.append({
|
|
|
|
| 141 |
'image_index': idx,
|
| 142 |
'image_path': image_path,
|
| 143 |
'visual_analysis': visual_analysis,
|
| 144 |
+
'ocr_text': image.get('ocr_text', '')
|
| 145 |
})
|
| 146 |
|
| 147 |
return visual_analyses
|
| 148 |
|
| 149 |
def summarize_text_chunks(self, text: str, chunk_size: int = 1500) -> List[Dict]:
|
| 150 |
"""
|
| 151 |
+
Отправляет куски текста на суммаризацию
|
| 152 |
"""
|
| 153 |
chunks = []
|
| 154 |
text_chunks = self._chunk_text(text, chunk_size=chunk_size, overlap=300)
|
|
|
|
| 160 |
continue
|
| 161 |
|
| 162 |
try:
|
| 163 |
+
prompt = f"""Ты - ассистент по обобщению и суммаризации информации. Проанализируй и суммаризируй следующий кусок текста.
|
| 164 |
+
Выдели основные моменты, факты и идеи. Будь краток.
|
| 165 |
|
| 166 |
+
Текст :
|
| 167 |
{chunk}
|
| 168 |
|
| 169 |
+
Результат:"""
|
| 170 |
|
| 171 |
message = HumanMessage(content=prompt)
|
| 172 |
response = self.llm.invoke([message])
|
|
|
|
| 190 |
|
| 191 |
def summarize_tables(self, tables: List[Dict]) -> List[Dict]:
|
| 192 |
"""
|
| 193 |
+
Отправляет таблицы на суммаризацию
|
| 194 |
"""
|
| 195 |
summaries = []
|
| 196 |
|
|
|
|
| 201 |
continue
|
| 202 |
|
| 203 |
try:
|
| 204 |
+
prompt = f"""Ты - ассистент по обобщению и суммаризации информации. Проанализируй и суммаризируй следующию таблицу.
|
| 205 |
+
Выдели основные моменты, числа, и значения строк/колонок. Будь краток.
|
| 206 |
|
| 207 |
+
Таблица:
|
| 208 |
{table_content}
|
| 209 |
|
| 210 |
+
Результат:"""
|
| 211 |
|
| 212 |
message = HumanMessage(content=prompt)
|
| 213 |
response = self.llm.invoke([message])
|
|
|
|
| 238 |
doc_id: str
|
| 239 |
) -> Dict:
|
| 240 |
"""
|
| 241 |
+
Основной pipeline анализирует и сохраняет документы в хранилище
|
|
|
|
| 242 |
"""
|
| 243 |
+
|
| 244 |
+
print(f"PROCESSING ANALYSIS: {doc_id}")
|
|
|
|
| 245 |
|
| 246 |
results = {
|
| 247 |
'doc_id': doc_id,
|
|
|
|
| 251 |
'total_stored': 0
|
| 252 |
}
|
| 253 |
|
| 254 |
+
print(f"\n VISUAL IMAGE ANALYSIS ({len(images)} )")
|
| 255 |
+
|
|
|
|
| 256 |
|
| 257 |
image_analyses = self.analyze_images_visually(images)
|
| 258 |
results['image_visual_analyses'] = image_analyses
|
| 259 |
|
|
|
|
| 260 |
image_docs = {
|
| 261 |
'text': ' | '.join([
|
| 262 |
f"Image {a['image_index']}: {a['visual_analysis']}"
|
|
|
|
| 267 |
}
|
| 268 |
|
| 269 |
for analysis in image_analyses:
|
| 270 |
+
print(f" Image {analysis['image_index']}")
|
| 271 |
print(f" Path: {analysis['image_path']}")
|
| 272 |
print(f" Analysis: {analysis['visual_analysis'][:100]}...")
|
| 273 |
|
|
|
|
| 278 |
f"{doc_id}_images_visual"
|
| 279 |
)
|
| 280 |
results['total_stored'] += len(image_analyses)
|
| 281 |
+
print(f" Stored {len(image_analyses)} imagу analyses")
|
| 282 |
except Exception as e:
|
| 283 |
+
print(f"Error storing image analyses: {e}")
|
| 284 |
|
| 285 |
+
print(f"\n TEXT CHUNK SUMMARIZATION")
|
|
|
|
|
|
|
| 286 |
|
| 287 |
text_summaries = self.summarize_text_chunks(text)
|
| 288 |
results['text_summaries'] = text_summaries
|
|
|
|
| 295 |
}
|
| 296 |
|
| 297 |
for summary in text_summaries:
|
| 298 |
+
print(f" Chunk {summary['chunk_index']}: {summary['summary'][:50]}...")
|
| 299 |
|
| 300 |
if text_summaries:
|
| 301 |
try:
|
|
|
|
| 304 |
f"{doc_id}_text_chunks"
|
| 305 |
)
|
| 306 |
results['total_stored'] += len(text_summaries)
|
| 307 |
+
print(f" Stored {len(text_summaries)} text chunk summaries")
|
| 308 |
except Exception as e:
|
| 309 |
+
print(f" Error text summaries: {e}")
|
| 310 |
|
| 311 |
+
print(f"\n TABLE SUMMARIZATION ({len(tables)}")
|
|
|
|
|
|
|
| 312 |
|
| 313 |
table_summaries = self.summarize_tables(tables)
|
| 314 |
results['table_summaries'] = table_summaries
|
|
|
|
| 321 |
}
|
| 322 |
|
| 323 |
for summary in table_summaries:
|
| 324 |
+
print(f" Table {summary['table_index']}: {summary['summary'][:50]}...")
|
| 325 |
|
| 326 |
if table_summaries:
|
| 327 |
try:
|
|
|
|
| 330 |
f"{doc_id}_tables"
|
| 331 |
)
|
| 332 |
results['total_stored'] += len(table_summaries)
|
| 333 |
+
print(f" Stored {len(table_summaries)} table summaries")
|
| 334 |
except Exception as e:
|
| 335 |
+
print(f" Error storing table summaries: {e}")
|
| 336 |
+
|
| 337 |
+
print(f" STORAGE SUMMARY")
|
| 338 |
+
print(f" Images analyzed: {len(image_analyses)}")
|
| 339 |
+
print(f" Text chunks summarized: {len(text_summaries)}")
|
| 340 |
+
print(f" Tables summarized: {len(table_summaries)}")
|
|
|
|
|
|
|
|
|
|
| 341 |
print(f" Total items stored in vector: {results['total_stored']}")
|
|
|
|
| 342 |
|
| 343 |
self.visual_summaries_log.append(results)
|
| 344 |
return results
|
| 345 |
|
| 346 |
def _chunk_text(self, text: str, chunk_size: int = 1500, overlap: int = 300) -> List[str]:
|
|
|
|
| 347 |
chunks = []
|
| 348 |
start = 0
|
| 349 |
while start < len(text):
|
|
|
|
| 353 |
return chunks
|
| 354 |
|
| 355 |
def get_visual_summaries_log(self) -> List[Dict]:
|
|
|
|
| 356 |
return self.visual_summaries_log
|
| 357 |
|
| 358 |
|
| 359 |
class AnsweringRAG:
|
| 360 |
"""
|
| 361 |
+
RAG - работа с ответом на запрос:
|
| 362 |
+
1. Поиск в векторном хранилище
|
| 363 |
+
2. Анализ результатов
|
| 364 |
+
3. Предоставление ответа
|
| 365 |
"""
|
| 366 |
|
| 367 |
def __init__(self, api_key: str = None, debug: bool = True):
|
|
|
|
| 369 |
self.debug = debug
|
| 370 |
|
| 371 |
self.llm = ChatOpenAI(
|
| 372 |
+
model_name=OPENAI_MODEL,
|
| 373 |
api_key=api_key,
|
| 374 |
temperature=TEMPERATURE,
|
| 375 |
max_tokens=MAX_TOKENS,
|
|
|
|
| 379 |
self.answer_log = []
|
| 380 |
|
| 381 |
if self.debug:
|
| 382 |
+
print(" AnsweringRAG initialized")
|
| 383 |
|
| 384 |
def _debug_print(self, label: str, data: any):
|
| 385 |
+
"""Debug"""
|
| 386 |
if self.debug:
|
| 387 |
print(f"\n🔍 DEBUG [{label}]:")
|
| 388 |
if isinstance(data, (list, dict)):
|
|
|
|
| 397 |
search_results: List[Dict]
|
| 398 |
) -> Dict:
|
| 399 |
"""
|
| 400 |
+
Проанализируй найденные документы и на их основе предоставь ответ на вопрос пользователя
|
| 401 |
|
| 402 |
+
Ответ:
|
| 403 |
{
|
| 404 |
'question': user question,
|
| 405 |
'answer': detailed answer,
|
|
|
|
| 409 |
}
|
| 410 |
"""
|
| 411 |
|
|
|
|
| 412 |
print(f"ANALYZING QUESTION & GENERATING ANSWER")
|
|
|
|
| 413 |
|
| 414 |
+
print(f"\n Question: {question}")
|
| 415 |
+
print(f" Search Results: {len(search_results)}")
|
| 416 |
|
|
|
|
| 417 |
if not search_results:
|
| 418 |
+
print(f" No search results found!")
|
| 419 |
+
answer = f"""Релевантная информация в документах отсутствует: "{question}"
|
| 420 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
|
| 422 |
result = {
|
| 423 |
'question': question,
|
|
|
|
| 429 |
self.answer_log.append(result)
|
| 430 |
return result
|
| 431 |
|
|
|
|
| 432 |
context_parts = []
|
| 433 |
for idx, result in enumerate(search_results, 1):
|
| 434 |
content = result.get('content', '')
|
|
|
|
| 443 |
|
| 444 |
full_context = "\n".join(context_parts)
|
| 445 |
|
| 446 |
+
self._debug_print("Context Prepared", f"{len(context_parts)} sources")
|
| 447 |
|
| 448 |
+
analysis_prompt = f"""Ты - ассистент по анализу документов и ответов на вопросы по ним.
|
|
|
|
| 449 |
|
| 450 |
+
ВОПРОС:
|
| 451 |
"{question}"
|
| 452 |
|
| 453 |
+
РЕЛЕВАНТНАЯ ИНФОРМАЦИЯ:
|
| 454 |
{full_context}
|
| 455 |
|
| 456 |
+
ИНСТРУКЦИИ:
|
| 457 |
+
1. Проанализируй предоставленный контент
|
| 458 |
+
2. Выдели информацию имеющую отношение к вопросу
|
| 459 |
+
3. Предоставь понятный и исчерпывающий ответ
|
| 460 |
+
4. Если контент полностью не отвечает на вопрос, предоставь информацию, которая доступна в контенте
|
| 461 |
+
5. Построй свой ответ опираясь на ключевые моменты
|
|
|
|
| 462 |
|
| 463 |
+
Ответ:"""
|
| 464 |
|
| 465 |
+
print(f"\n Analyzing search results...")
|
| 466 |
+
print(f" Context size: {len(full_context)} chars")
|
| 467 |
print(f" Sources: {len(search_results)}")
|
| 468 |
|
| 469 |
try:
|
|
|
|
| 470 |
message = HumanMessage(content=analysis_prompt)
|
| 471 |
response = self.llm.invoke([message])
|
| 472 |
answer = response.content.strip()
|
| 473 |
|
|
|
|
| 474 |
confidence = self._estimate_confidence(len(search_results), answer)
|
| 475 |
|
| 476 |
+
print(f" Answer generated successfully")
|
| 477 |
print(f" Confidence: {confidence}")
|
| 478 |
+
print(f" Answer length: {len(answer)} chars")
|
| 479 |
|
| 480 |
result = {
|
| 481 |
'question': question,
|
|
|
|
| 489 |
return result
|
| 490 |
|
| 491 |
except Exception as e:
|
| 492 |
+
print(f" Error generating answer: {e}")
|
| 493 |
+
answer = f"Error while analyzing the search results."
|
| 494 |
|
| 495 |
result = {
|
| 496 |
'question': question,
|
|
|
|
| 505 |
return result
|
| 506 |
|
| 507 |
def _estimate_confidence(self, sources_count: int, answer: str) -> str:
|
| 508 |
+
"""Уверенность в ответе на основании найденных источников информации"""
|
| 509 |
answer_length = len(answer)
|
| 510 |
|
|
|
|
| 511 |
if sources_count >= 3 and answer_length > 500:
|
| 512 |
return "high"
|
| 513 |
|
|
|
|
| 514 |
elif sources_count >= 2 and answer_length > 200:
|
| 515 |
return "medium"
|
| 516 |
|
|
|
|
| 517 |
else:
|
| 518 |
return "low"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/vector_store.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
-
UPDATED for ChromaDB v0.4.22+ (auto-persist, no manual persist needed)
|
| 4 |
"""
|
| 5 |
import os
|
| 6 |
import json
|
|
@@ -12,14 +11,14 @@ from config import CHROMA_DB_PATH, EMBEDDING_MODEL, EMBEDDING_DIM
|
|
| 12 |
|
| 13 |
|
| 14 |
class CLIPEmbedder:
|
| 15 |
-
"""
|
| 16 |
def __init__(self, model_name: str = EMBEDDING_MODEL):
|
| 17 |
-
print(f"
|
| 18 |
self.model = SentenceTransformer(model_name)
|
| 19 |
-
print(f"
|
| 20 |
|
| 21 |
def embed(self, text: str) -> List[float]:
|
| 22 |
-
"""
|
| 23 |
try:
|
| 24 |
embedding = self.model.encode(text, convert_to_numpy=False)
|
| 25 |
return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
|
|
@@ -28,7 +27,7 @@ class CLIPEmbedder:
|
|
| 28 |
return [0.0] * EMBEDDING_DIM
|
| 29 |
|
| 30 |
def embed_batch(self, texts: List[str]) -> List[List[float]]:
|
| 31 |
-
"""
|
| 32 |
try:
|
| 33 |
embeddings = self.model.encode(texts, convert_to_numpy=False)
|
| 34 |
return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
|
|
@@ -38,34 +37,31 @@ class CLIPEmbedder:
|
|
| 38 |
|
| 39 |
|
| 40 |
class VectorStore:
|
| 41 |
-
"""
|
| 42 |
def __init__(self):
|
| 43 |
self.persist_directory = CHROMA_DB_PATH
|
| 44 |
self.embedder = CLIPEmbedder()
|
| 45 |
|
| 46 |
-
print(f"\
|
| 47 |
|
| 48 |
-
# NEW ChromaDB v0.4.22+ - PersistentClient auto-persists
|
| 49 |
try:
|
| 50 |
self.client = chromadb.PersistentClient(
|
| 51 |
path=self.persist_directory
|
| 52 |
)
|
| 53 |
-
print(f"
|
| 54 |
except Exception as e:
|
| 55 |
-
print(f"
|
| 56 |
-
print(f"Trying fallback initialization...")
|
| 57 |
self.client = chromadb.PersistentClient(
|
| 58 |
path=self.persist_directory
|
| 59 |
)
|
| 60 |
|
| 61 |
-
# Get or create collection
|
| 62 |
try:
|
| 63 |
-
self.collection = self.client.
|
| 64 |
name="multimodal_rag",
|
| 65 |
metadata={"hnsw:space": "cosine"}
|
| 66 |
)
|
| 67 |
count = self.collection.count()
|
| 68 |
-
print(f"
|
| 69 |
except Exception as e:
|
| 70 |
print(f"Error with collection: {e}")
|
| 71 |
self.collection = self.client.get_or_create_collection(
|
|
@@ -73,14 +69,13 @@ class VectorStore:
|
|
| 73 |
)
|
| 74 |
|
| 75 |
def add_documents(self, documents: List[Dict], doc_id: str):
|
| 76 |
-
"""
|
| 77 |
texts = []
|
| 78 |
metadatas = []
|
| 79 |
ids = []
|
| 80 |
|
| 81 |
-
print(f"\
|
| 82 |
|
| 83 |
-
# Add text chunks
|
| 84 |
if 'text' in documents and documents['text']:
|
| 85 |
chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
|
| 86 |
for idx, chunk in enumerate(chunks):
|
|
@@ -91,9 +86,8 @@ class VectorStore:
|
|
| 91 |
'chunk_idx': str(idx)
|
| 92 |
})
|
| 93 |
ids.append(f"{doc_id}_text_{idx}")
|
| 94 |
-
print(f"
|
| 95 |
|
| 96 |
-
# Add image descriptions and OCR text
|
| 97 |
if 'images' in documents:
|
| 98 |
image_count = 0
|
| 99 |
for idx, image_data in enumerate(documents['images']):
|
|
@@ -108,9 +102,8 @@ class VectorStore:
|
|
| 108 |
ids.append(f"{doc_id}_image_{idx}")
|
| 109 |
image_count += 1
|
| 110 |
if image_count > 0:
|
| 111 |
-
print(f"
|
| 112 |
|
| 113 |
-
# Add table content
|
| 114 |
if 'tables' in documents:
|
| 115 |
table_count = 0
|
| 116 |
for idx, table_data in enumerate(documents['tables']):
|
|
@@ -124,14 +117,12 @@ class VectorStore:
|
|
| 124 |
ids.append(f"{doc_id}_table_{idx}")
|
| 125 |
table_count += 1
|
| 126 |
if table_count > 0:
|
| 127 |
-
print(f"
|
| 128 |
|
| 129 |
if texts:
|
| 130 |
-
# Generate embeddings
|
| 131 |
print(f" 🔄 Generating {len(texts)} embeddings...")
|
| 132 |
embeddings = self.embedder.embed_batch(texts)
|
| 133 |
|
| 134 |
-
# Add to collection
|
| 135 |
try:
|
| 136 |
self.collection.add(
|
| 137 |
ids=ids,
|
|
@@ -139,14 +130,12 @@ class VectorStore:
|
|
| 139 |
embeddings=embeddings,
|
| 140 |
metadatas=metadatas
|
| 141 |
)
|
| 142 |
-
print(f"
|
| 143 |
-
# Auto-persist happens here
|
| 144 |
-
print(f"✅ Data persisted automatically to: {self.persist_directory}")
|
| 145 |
except Exception as e:
|
| 146 |
-
print(f"
|
| 147 |
|
| 148 |
def search(self, query: str, n_results: int = 5) -> List[Dict]:
|
| 149 |
-
"""
|
| 150 |
try:
|
| 151 |
query_embedding = self.embedder.embed(query)
|
| 152 |
|
|
@@ -155,7 +144,6 @@ class VectorStore:
|
|
| 155 |
n_results=n_results
|
| 156 |
)
|
| 157 |
|
| 158 |
-
# Format results
|
| 159 |
formatted_results = []
|
| 160 |
if results['documents']:
|
| 161 |
for i, doc in enumerate(results['documents'][0]):
|
|
@@ -175,7 +163,7 @@ class VectorStore:
|
|
| 175 |
return []
|
| 176 |
|
| 177 |
def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
|
| 178 |
-
"""
|
| 179 |
chunks = []
|
| 180 |
start = 0
|
| 181 |
while start < len(text):
|
|
@@ -185,7 +173,7 @@ class VectorStore:
|
|
| 185 |
return chunks
|
| 186 |
|
| 187 |
def get_collection_info(self) -> Dict:
|
| 188 |
-
"""
|
| 189 |
try:
|
| 190 |
count = self.collection.count()
|
| 191 |
return {
|
|
@@ -199,35 +187,23 @@ class VectorStore:
|
|
| 199 |
return {'status': 'error', 'message': str(e)}
|
| 200 |
|
| 201 |
def delete_by_doc_id(self, doc_id: str):
|
| 202 |
-
"""
|
| 203 |
try:
|
| 204 |
-
# Get all IDs with this doc_id
|
| 205 |
results = self.collection.get(where={'doc_id': doc_id})
|
| 206 |
if results['ids']:
|
| 207 |
self.collection.delete(ids=results['ids'])
|
| 208 |
-
print(f"
|
| 209 |
-
# Auto-persist on delete
|
| 210 |
-
print(f"✅ Changes persisted automatically")
|
| 211 |
except Exception as e:
|
| 212 |
print(f"Error deleting documents: {e}")
|
| 213 |
|
| 214 |
-
def persist(self):
|
| 215 |
-
"""
|
| 216 |
-
No-op for compatibility with older code.
|
| 217 |
-
ChromaDB v0.4.22+ uses PersistentClient which auto-persists.
|
| 218 |
-
This method kept for backward compatibility.
|
| 219 |
-
"""
|
| 220 |
-
print("✅ Vector store is using auto-persist (no manual persist needed)")
|
| 221 |
-
|
| 222 |
def clear_all(self):
|
| 223 |
-
"""
|
| 224 |
try:
|
| 225 |
-
# Delete collection and recreate
|
| 226 |
self.client.delete_collection(name="multimodal_rag")
|
| 227 |
self.collection = self.client.get_or_create_collection(
|
| 228 |
name="multimodal_rag",
|
| 229 |
metadata={"hnsw:space": "cosine"}
|
| 230 |
)
|
| 231 |
-
print("
|
| 232 |
except Exception as e:
|
| 233 |
print(f"Error clearing collection: {e}")
|
|
|
|
| 1 |
"""
|
| 2 |
+
Векторное хранилище и Эмбеддер
|
|
|
|
| 3 |
"""
|
| 4 |
import os
|
| 5 |
import json
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
class CLIPEmbedder:
|
| 14 |
+
"""Эмбеддер"""
|
| 15 |
def __init__(self, model_name: str = EMBEDDING_MODEL):
|
| 16 |
+
print(f"Embedding model: {model_name}")
|
| 17 |
self.model = SentenceTransformer(model_name)
|
| 18 |
+
print(f"Model loaded successfully")
|
| 19 |
|
| 20 |
def embed(self, text: str) -> List[float]:
|
| 21 |
+
"""Эмбеддинг для текста"""
|
| 22 |
try:
|
| 23 |
embedding = self.model.encode(text, convert_to_numpy=False)
|
| 24 |
return embedding.tolist() if hasattr(embedding, 'tolist') else embedding
|
|
|
|
| 27 |
return [0.0] * EMBEDDING_DIM
|
| 28 |
|
| 29 |
def embed_batch(self, texts: List[str]) -> List[List[float]]:
|
| 30 |
+
"""Эмбеддинги для списка текстов (батч)"""
|
| 31 |
try:
|
| 32 |
embeddings = self.model.encode(texts, convert_to_numpy=False)
|
| 33 |
return [e.tolist() if hasattr(e, 'tolist') else e for e in embeddings]
|
|
|
|
| 37 |
|
| 38 |
|
| 39 |
class VectorStore:
|
| 40 |
+
"""Векторное хранилище"""
|
| 41 |
def __init__(self):
|
| 42 |
self.persist_directory = CHROMA_DB_PATH
|
| 43 |
self.embedder = CLIPEmbedder()
|
| 44 |
|
| 45 |
+
print(f"\nInitializing ChromaDB: {self.persist_directory}")
|
| 46 |
|
|
|
|
| 47 |
try:
|
| 48 |
self.client = chromadb.PersistentClient(
|
| 49 |
path=self.persist_directory
|
| 50 |
)
|
| 51 |
+
print(f"ChromaDB initialized")
|
| 52 |
except Exception as e:
|
| 53 |
+
print(f"Error initializing ChromaDB: {e}")
|
|
|
|
| 54 |
self.client = chromadb.PersistentClient(
|
| 55 |
path=self.persist_directory
|
| 56 |
)
|
| 57 |
|
|
|
|
| 58 |
try:
|
| 59 |
+
self.collection = self.client.get_or_create_collection(
|
| 60 |
name="multimodal_rag",
|
| 61 |
metadata={"hnsw:space": "cosine"}
|
| 62 |
)
|
| 63 |
count = self.collection.count()
|
| 64 |
+
print(f"Collection loaded: {count} items in store")
|
| 65 |
except Exception as e:
|
| 66 |
print(f"Error with collection: {e}")
|
| 67 |
self.collection = self.client.get_or_create_collection(
|
|
|
|
| 69 |
)
|
| 70 |
|
| 71 |
def add_documents(self, documents: List[Dict], doc_id: str):
|
| 72 |
+
"""Добавление документов в векторное хранилище"""
|
| 73 |
texts = []
|
| 74 |
metadatas = []
|
| 75 |
ids = []
|
| 76 |
|
| 77 |
+
print(f"\nAdding document: {doc_id}")
|
| 78 |
|
|
|
|
| 79 |
if 'text' in documents and documents['text']:
|
| 80 |
chunks = self._chunk_text(documents['text'], chunk_size=1000, overlap=200)
|
| 81 |
for idx, chunk in enumerate(chunks):
|
|
|
|
| 86 |
'chunk_idx': str(idx)
|
| 87 |
})
|
| 88 |
ids.append(f"{doc_id}_text_{idx}")
|
| 89 |
+
print(f" Text: {len(chunks)} chunks")
|
| 90 |
|
|
|
|
| 91 |
if 'images' in documents:
|
| 92 |
image_count = 0
|
| 93 |
for idx, image_data in enumerate(documents['images']):
|
|
|
|
| 102 |
ids.append(f"{doc_id}_image_{idx}")
|
| 103 |
image_count += 1
|
| 104 |
if image_count > 0:
|
| 105 |
+
print(f" Images: {image_count} with OCR text")
|
| 106 |
|
|
|
|
| 107 |
if 'tables' in documents:
|
| 108 |
table_count = 0
|
| 109 |
for idx, table_data in enumerate(documents['tables']):
|
|
|
|
| 117 |
ids.append(f"{doc_id}_table_{idx}")
|
| 118 |
table_count += 1
|
| 119 |
if table_count > 0:
|
| 120 |
+
print(f" Tables: {table_count}")
|
| 121 |
|
| 122 |
if texts:
|
|
|
|
| 123 |
print(f" 🔄 Generating {len(texts)} embeddings...")
|
| 124 |
embeddings = self.embedder.embed_batch(texts)
|
| 125 |
|
|
|
|
| 126 |
try:
|
| 127 |
self.collection.add(
|
| 128 |
ids=ids,
|
|
|
|
| 130 |
embeddings=embeddings,
|
| 131 |
metadatas=metadatas
|
| 132 |
)
|
| 133 |
+
print(f"Successfully added {len(texts)} items to vector store")
|
|
|
|
|
|
|
| 134 |
except Exception as e:
|
| 135 |
+
print(f"Error adding to collection: {e}")
|
| 136 |
|
| 137 |
def search(self, query: str, n_results: int = 5) -> List[Dict]:
|
| 138 |
+
"""Поиск в векторном хранилище"""
|
| 139 |
try:
|
| 140 |
query_embedding = self.embedder.embed(query)
|
| 141 |
|
|
|
|
| 144 |
n_results=n_results
|
| 145 |
)
|
| 146 |
|
|
|
|
| 147 |
formatted_results = []
|
| 148 |
if results['documents']:
|
| 149 |
for i, doc in enumerate(results['documents'][0]):
|
|
|
|
| 163 |
return []
|
| 164 |
|
| 165 |
def _chunk_text(self, text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
|
| 166 |
+
"""Сплит текста"""
|
| 167 |
chunks = []
|
| 168 |
start = 0
|
| 169 |
while start < len(text):
|
|
|
|
| 173 |
return chunks
|
| 174 |
|
| 175 |
def get_collection_info(self) -> Dict:
|
| 176 |
+
"""Получение информации о коллекции в векторном хранилище"""
|
| 177 |
try:
|
| 178 |
count = self.collection.count()
|
| 179 |
return {
|
|
|
|
| 187 |
return {'status': 'error', 'message': str(e)}
|
| 188 |
|
| 189 |
def delete_by_doc_id(self, doc_id: str):
|
| 190 |
+
"""Удаление документа из векторного хранилища"""
|
| 191 |
try:
|
|
|
|
| 192 |
results = self.collection.get(where={'doc_id': doc_id})
|
| 193 |
if results['ids']:
|
| 194 |
self.collection.delete(ids=results['ids'])
|
| 195 |
+
print(f"Deleted {len(results['ids'])} documents for {doc_id}")
|
|
|
|
|
|
|
| 196 |
except Exception as e:
|
| 197 |
print(f"Error deleting documents: {e}")
|
| 198 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
def clear_all(self):
|
| 200 |
+
"""Очистка хранилища"""
|
| 201 |
try:
|
|
|
|
| 202 |
self.client.delete_collection(name="multimodal_rag")
|
| 203 |
self.collection = self.client.get_or_create_collection(
|
| 204 |
name="multimodal_rag",
|
| 205 |
metadata={"hnsw:space": "cosine"}
|
| 206 |
)
|
| 207 |
+
print("Collection cleared")
|
| 208 |
except Exception as e:
|
| 209 |
print(f"Error clearing collection: {e}")
|