Juan Salas commited on
Commit ·
25ec886
1
Parent(s): 15ee652
FAISS file persistance
Browse files- .streamlit/config.toml +1 -2
- app.py +235 -406
- pyproject.toml +12 -3
- requirements.txt +7 -5
- src/__init__.py +14 -16
- src/ai/__init__.py +19 -75
- src/ai/agent_core.py +536 -33
- src/ai/agent_nodes.py +0 -173
- src/ai/llm_utilities.py +0 -432
- src/ai/prompts.py +71 -126
- src/config.py +294 -384
- src/document_processing.py +317 -745
- src/services.py +384 -542
- src/ui_components.py +20 -49
- src/utils.py +0 -640
- uv.lock +0 -0
.streamlit/config.toml
CHANGED
|
@@ -7,7 +7,6 @@ textColor = "#262730"
|
|
| 7 |
[server]
|
| 8 |
headless = true
|
| 9 |
port = 8501
|
| 10 |
-
enableCORS = false
|
| 11 |
|
| 12 |
[client]
|
| 13 |
-
showErrorDetails =
|
|
|
|
| 7 |
[server]
|
| 8 |
headless = true
|
| 9 |
port = 8501
|
|
|
|
| 10 |
|
| 11 |
[client]
|
| 12 |
+
showErrorDetails = true
|
app.py
CHANGED
|
@@ -7,56 +7,65 @@ using the new modular architecture for better maintainability.
|
|
| 7 |
"""
|
| 8 |
|
| 9 |
import os
|
|
|
|
| 10 |
# Fix tokenizers parallelism warning early
|
| 11 |
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
import streamlit as st
|
| 14 |
-
|
| 15 |
-
from sentence_transformers import SentenceTransformer
|
| 16 |
from pathlib import Path
|
| 17 |
-
from typing import Dict
|
| 18 |
|
| 19 |
# Import our refactored modules
|
| 20 |
from src import (
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
| 25 |
)
|
|
|
|
|
|
|
| 26 |
from src.ui_components import (
|
| 27 |
-
render_file_selector,
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
| 30 |
)
|
| 31 |
-
from src.services import ReportGenerator
|
| 32 |
-
from src.utils import ProgressTracker, show_success, show_error, show_info
|
| 33 |
|
| 34 |
-
|
| 35 |
-
try:
|
| 36 |
-
from src.ai import (
|
| 37 |
-
DDChecklistAgent,
|
| 38 |
-
LANGGRAPH_AVAILABLE,
|
| 39 |
-
batch_summarize_documents,
|
| 40 |
-
create_document_embeddings_with_summaries,
|
| 41 |
-
match_checklist_with_summaries,
|
| 42 |
-
generate_checklist_descriptions
|
| 43 |
-
)
|
| 44 |
-
LLM_AVAILABLE = LANGGRAPH_AVAILABLE
|
| 45 |
-
except ImportError:
|
| 46 |
-
LLM_AVAILABLE = False
|
| 47 |
-
DDChecklistAgent = None
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
class DDChecklistApp:
|
| 52 |
"""
|
| 53 |
Main application class that orchestrates all components
|
| 54 |
"""
|
| 55 |
-
|
| 56 |
def __init__(self):
|
| 57 |
"""Initialize the application"""
|
| 58 |
# Initialize configuration
|
| 59 |
-
self.config = init_config()
|
| 60 |
|
| 61 |
# Initialize session state
|
| 62 |
self._init_session_state()
|
|
@@ -69,63 +78,44 @@ class DDChecklistApp:
|
|
| 69 |
)
|
| 70 |
|
| 71 |
# Initialize services (will be loaded when needed)
|
| 72 |
-
self.
|
| 73 |
-
self.
|
| 74 |
self.agent = None
|
| 75 |
|
| 76 |
def _init_session_state(self):
|
| 77 |
-
"""Initialize
|
| 78 |
-
|
| 79 |
'documents': {},
|
| 80 |
'chunks': [],
|
| 81 |
'embeddings': None,
|
| 82 |
-
'checklist': {},
|
| 83 |
'checklist_results': {},
|
| 84 |
-
'questions': [],
|
| 85 |
'question_answers': {},
|
| 86 |
-
'strategy_text': "",
|
| 87 |
-
'strategy_analysis': "",
|
| 88 |
'company_summary': "",
|
|
|
|
| 89 |
'agent': None,
|
| 90 |
-
'
|
| 91 |
-
'just_processed': False,
|
| 92 |
-
'is_processing': False,
|
| 93 |
-
'trigger_processing': False,
|
| 94 |
-
'processing_path': None
|
| 95 |
}
|
| 96 |
|
| 97 |
-
for key, default_value in
|
| 98 |
if key not in st.session_state:
|
| 99 |
st.session_state[key] = default_value
|
| 100 |
-
|
| 101 |
-
@st.cache_resource
|
| 102 |
-
def load_model(_self) -> SentenceTransformer:
|
| 103 |
-
"""Load the sentence transformer model"""
|
| 104 |
-
with ErrorHandler("Failed to load AI model"):
|
| 105 |
-
return SentenceTransformer(_self.config.model.sentence_transformer_model)
|
| 106 |
-
|
| 107 |
def initialize_services(self):
|
| 108 |
"""Initialize core services"""
|
| 109 |
-
if self.
|
| 110 |
-
self.
|
| 111 |
-
|
| 112 |
-
if self.service is None:
|
| 113 |
-
self.service = DDChecklistService(self.model, self.agent)
|
| 114 |
|
| 115 |
# Restore document processor state from session state if available
|
| 116 |
if (hasattr(st.session_state, 'chunks') and st.session_state.chunks and
|
| 117 |
hasattr(st.session_state, 'embeddings') and st.session_state.embeddings is not None):
|
| 118 |
|
| 119 |
-
self.
|
| 120 |
-
self.
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
# Ensure the document processor has the model
|
| 124 |
-
self.service.document_processor.model = self.model
|
| 125 |
|
| 126 |
def setup_ai_agent(self, api_key: str, model_choice: str) -> bool:
|
| 127 |
"""
|
| 128 |
-
Setup AI agent
|
| 129 |
|
| 130 |
Args:
|
| 131 |
api_key: Anthropic API key
|
|
@@ -133,11 +123,7 @@ class DDChecklistApp:
|
|
| 133 |
|
| 134 |
Returns:
|
| 135 |
True if agent was successfully initialized
|
| 136 |
-
"""
|
| 137 |
-
if not LLM_AVAILABLE or not DDChecklistAgent:
|
| 138 |
-
show_error("AI packages not installed")
|
| 139 |
-
return False
|
| 140 |
-
|
| 141 |
try:
|
| 142 |
with st.spinner("Initializing AI agent..."):
|
| 143 |
agent = DDChecklistAgent(api_key, model_choice)
|
|
@@ -147,9 +133,6 @@ class DDChecklistApp:
|
|
| 147 |
self.agent = agent
|
| 148 |
show_success("✅ AI Agent ready")
|
| 149 |
|
| 150 |
-
# Update service with agent
|
| 151 |
-
if self.service:
|
| 152 |
-
self.service.report_generator = ReportGenerator(agent)
|
| 153 |
|
| 154 |
return True
|
| 155 |
else:
|
|
@@ -198,15 +181,13 @@ class DDChecklistApp:
|
|
| 198 |
self.agent = None
|
| 199 |
|
| 200 |
return selected_data_room_path, use_ai_features, process_button
|
| 201 |
-
|
| 202 |
|
| 203 |
def render_summary_tab(self):
|
| 204 |
-
"""Render
|
| 205 |
# Strategy selector
|
| 206 |
strategy_path, strategy_text = render_file_selector(
|
| 207 |
self.config.paths.strategy_dir, "Strategy", "tab"
|
| 208 |
)
|
| 209 |
-
st.session_state.strategy_text = strategy_text
|
| 210 |
|
| 211 |
# Check if we have documents to display summaries
|
| 212 |
if st.session_state.documents:
|
|
@@ -214,113 +195,115 @@ class DDChecklistApp:
|
|
| 214 |
overview_tab, analysis_tab = st.tabs(["🏢 Company Overview", "🎯 Strategic Analysis"])
|
| 215 |
|
| 216 |
with overview_tab:
|
| 217 |
-
self.
|
| 218 |
|
| 219 |
with analysis_tab:
|
| 220 |
-
self.
|
| 221 |
else:
|
| 222 |
show_info("👈 Configure and process data room to see analysis")
|
| 223 |
|
| 224 |
-
def
|
| 225 |
-
"""
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
)
|
| 236 |
|
| 237 |
-
# Display the
|
| 238 |
-
if st.session_state.
|
| 239 |
-
st.markdown(st.session_state
|
| 240 |
|
| 241 |
# Add export and regenerate buttons
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
col1, col2 = st.columns([1, 5])
|
| 243 |
with col1:
|
|
|
|
|
|
|
|
|
|
| 244 |
st.download_button(
|
| 245 |
"📥 Export Summary",
|
| 246 |
-
data=f"# Company Overview\n\n{st.session_state
|
| 247 |
-
file_name=
|
| 248 |
mime="text/markdown",
|
| 249 |
-
key="
|
| 250 |
)
|
| 251 |
with col2:
|
| 252 |
-
if st.button("🔄 Regenerate
|
| 253 |
-
st.session_state
|
| 254 |
st.rerun()
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
"""Render strategic analysis section"""
|
| 258 |
-
if not st.session_state.checklist_results:
|
| 259 |
-
st.warning("⚠️ Process data room with checklist first to enable strategic analysis")
|
| 260 |
-
return
|
| 261 |
-
|
| 262 |
-
# Auto-generate analysis if not already present and AI is available
|
| 263 |
-
if (not st.session_state.strategy_analysis and
|
| 264 |
-
hasattr(st.session_state, 'agent') and st.session_state.agent):
|
| 265 |
-
|
| 266 |
-
with st.spinner("🤖 Generating strategic analysis..."):
|
| 267 |
-
report_gen = ReportGenerator(st.session_state.agent)
|
| 268 |
-
st.session_state.strategy_analysis = report_gen.generate_strategic_analysis(
|
| 269 |
-
st.session_state.strategy_text,
|
| 270 |
-
st.session_state.checklist_results,
|
| 271 |
-
st.session_state.documents
|
| 272 |
-
)
|
| 273 |
-
|
| 274 |
-
if st.session_state.strategy_analysis:
|
| 275 |
-
st.markdown(st.session_state.strategy_analysis)
|
| 276 |
-
|
| 277 |
-
# Add export and regenerate buttons
|
| 278 |
-
col1, col2, col3 = st.columns([1, 1, 3])
|
| 279 |
with col1:
|
| 280 |
-
# Combined report export
|
| 281 |
combined_report = f"# Due Diligence Report\n\n"
|
| 282 |
-
combined_report += f"## Company Overview\n\n{st.session_state.
|
| 283 |
-
combined_report += f"## Strategic Analysis\n\n{st.session_state
|
| 284 |
|
|
|
|
|
|
|
|
|
|
| 285 |
st.download_button(
|
| 286 |
"📥 Export Report",
|
| 287 |
data=combined_report,
|
| 288 |
-
file_name=
|
| 289 |
mime="text/markdown",
|
| 290 |
-
key="
|
| 291 |
)
|
| 292 |
with col2:
|
| 293 |
-
if st.button("🔄 Regenerate
|
| 294 |
-
st.session_state
|
| 295 |
st.rerun()
|
| 296 |
|
| 297 |
-
def
|
| 298 |
-
"""
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
|
| 325 |
def render_qa_tab(self):
|
| 326 |
"""Render the Q&A with citations tab"""
|
|
@@ -346,13 +329,14 @@ class DDChecklistApp:
|
|
| 346 |
|
| 347 |
def _handle_qa_query(self, question: str):
|
| 348 |
"""Handle Q&A query and display results"""
|
| 349 |
-
if not self.
|
| 350 |
self.initialize_services()
|
| 351 |
|
| 352 |
# Use lower threshold for Q&A to get more relevant results
|
| 353 |
qa_threshold = 0.25
|
| 354 |
|
| 355 |
-
results =
|
|
|
|
| 356 |
question,
|
| 357 |
top_k=self.config.ui.top_k_search_results,
|
| 358 |
threshold=qa_threshold
|
|
@@ -369,7 +353,9 @@ class DDChecklistApp:
|
|
| 369 |
context = "\n\n".join([f"From {r['source']}:\n{r['text']}" for r in results[:3]])
|
| 370 |
# Use LLM directly for more reliable answers
|
| 371 |
from langchain_core.messages import HumanMessage
|
| 372 |
-
prompt = f"Question: {question}\n\
|
|
|
|
|
|
|
| 373 |
response = st.session_state.agent.llm.invoke([HumanMessage(content=prompt)])
|
| 374 |
# Clean up any leading whitespace and escape math characters
|
| 375 |
answer_text = escape_markdown_math(response.content.strip())
|
|
@@ -389,10 +375,7 @@ class DDChecklistApp:
|
|
| 389 |
# Create clickable link for the document
|
| 390 |
doc_path = result.get('path', result.get('full_path', ''))
|
| 391 |
doc_name = result['source']
|
| 392 |
-
|
| 393 |
-
doc_title = doc_name.rsplit('.', 1)[0].replace('_', ' ').replace('-', ' ').title()
|
| 394 |
-
else:
|
| 395 |
-
doc_title = doc_name.replace('_', ' ').replace('-', ' ').title()
|
| 396 |
|
| 397 |
if doc_path:
|
| 398 |
link_html = create_document_link(doc_path, doc_name, doc_title)
|
|
@@ -419,17 +402,7 @@ class DDChecklistApp:
|
|
| 419 |
file_bytes = f.read()
|
| 420 |
|
| 421 |
# Determine MIME type based on file extension
|
| 422 |
-
|
| 423 |
-
if file_extension == '.pdf':
|
| 424 |
-
mime_type = 'application/pdf'
|
| 425 |
-
elif file_extension in ['.doc', '.docx']:
|
| 426 |
-
mime_type = 'application/msword'
|
| 427 |
-
elif file_extension == '.txt':
|
| 428 |
-
mime_type = 'text/plain'
|
| 429 |
-
elif file_extension == '.md':
|
| 430 |
-
mime_type = 'text/markdown'
|
| 431 |
-
else:
|
| 432 |
-
mime_type = 'application/octet-stream'
|
| 433 |
|
| 434 |
button_key = f"qacit_dl_{idx}_{question[:20]}".replace(" ", "_").replace("?", "")
|
| 435 |
|
|
@@ -444,238 +417,107 @@ class DDChecklistApp:
|
|
| 444 |
except Exception as e:
|
| 445 |
st.error(f"Download failed: {str(e)}")
|
| 446 |
|
| 447 |
-
@handle_exceptions(show_error=True)
|
| 448 |
def process_data_room(self, data_room_path: str):
|
| 449 |
-
"""
|
| 450 |
-
Process the selected data room
|
| 451 |
-
|
| 452 |
-
Args:
|
| 453 |
-
data_room_path: Path to the data room to process
|
| 454 |
-
"""
|
| 455 |
if not Path(data_room_path).exists():
|
| 456 |
show_error(f"Data room path not found: {data_room_path}")
|
| 457 |
-
st.session_state.is_processing = False
|
| 458 |
return
|
| 459 |
|
| 460 |
-
|
| 461 |
-
|
| 462 |
self.initialize_services()
|
|
|
|
|
|
|
| 463 |
|
| 464 |
-
#
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
3: 8.0, # Generate AI summaries (very slow - depends on doc count)
|
| 475 |
-
4: 0.5, # AI summaries complete (instant)
|
| 476 |
-
5: 1.0, # Loading checklist and questions (fast)
|
| 477 |
-
6: 0.5, # Checklist and questions loaded (instant)
|
| 478 |
-
7: 3.0, # Generate checklist descriptions (moderate)
|
| 479 |
-
8: 0.5, # Descriptions generated (instant)
|
| 480 |
-
9: 2.0, # Match checklist to documents (moderate)
|
| 481 |
-
10: 0.5, # Checklist matching complete (instant)
|
| 482 |
-
11: 2.0, # Answer questions (moderate)
|
| 483 |
-
12: 0.5 # Complete (instant)
|
| 484 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
# Step 1: Load documents with parallel processing
|
| 489 |
-
tracker.update(1, f"Scanning data room: {Path(data_room_path).name}")
|
| 490 |
-
|
| 491 |
-
# Create a progress bar for detailed document loading progress
|
| 492 |
-
doc_progress_placeholder = st.empty()
|
| 493 |
-
with doc_progress_placeholder.container():
|
| 494 |
-
doc_progress_bar = st.progress(0, text="Initializing document scan...")
|
| 495 |
|
| 496 |
-
#
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
|
|
|
|
|
|
| 501 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
|
| 503 |
-
#
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
st.session_state.documents = self.service.document_processor.documents
|
| 507 |
-
st.session_state.chunks = self.service.document_processor.chunks
|
| 508 |
-
st.session_state.embeddings = self.service.document_processor.embeddings
|
| 509 |
-
|
| 510 |
-
# Display performance metrics
|
| 511 |
-
if 'performance' in load_results:
|
| 512 |
-
perf = load_results['performance']
|
| 513 |
-
tracker.update(2, f"Found {load_results['documents_count']} documents in {perf['total_time']:.1f}s "
|
| 514 |
-
f"({perf['documents_per_second']:.1f} docs/sec)")
|
| 515 |
-
logger.info(f"Document loading performance: {perf}")
|
| 516 |
-
else:
|
| 517 |
-
tracker.update(2, f"Found {load_results['documents_count']} documents")
|
| 518 |
-
|
| 519 |
-
# Step 2: Generate AI summaries if agent available
|
| 520 |
-
if hasattr(st.session_state, 'agent') and st.session_state.agent:
|
| 521 |
-
doc_count = len(st.session_state.documents)
|
| 522 |
-
tracker.update(3, f"Generating AI summaries for {doc_count} documents...")
|
| 523 |
-
|
| 524 |
-
# Adjust weight for step 3 based on actual document count
|
| 525 |
-
# More documents = longer processing time
|
| 526 |
-
if doc_count > 50:
|
| 527 |
-
step_weights[3] = min(15.0, doc_count * 0.15) # Scale with doc count, cap at 15
|
| 528 |
-
elif doc_count > 20:
|
| 529 |
-
step_weights[3] = doc_count * 0.2 # 4-10 weight for 20-50 docs
|
| 530 |
-
|
| 531 |
-
# Recalculate total weight
|
| 532 |
-
tracker.total_weight = sum(step_weights.values())
|
| 533 |
-
|
| 534 |
-
# Convert documents for summarization
|
| 535 |
-
docs_for_summary = []
|
| 536 |
-
for path, doc_info in st.session_state.documents.items():
|
| 537 |
-
docs_for_summary.append({
|
| 538 |
-
'name': doc_info['name'],
|
| 539 |
-
'path': doc_info['rel_path'],
|
| 540 |
-
'content': doc_info.get('content', '')[:1500],
|
| 541 |
-
'metadata': doc_info.get('metadata', {})
|
| 542 |
-
})
|
| 543 |
-
|
| 544 |
-
# Create a separate progress tracker for batch summarization
|
| 545 |
-
st.session_state.summary_progress = st.progress(0, text="📝 Starting document summarization...")
|
| 546 |
-
|
| 547 |
-
# Batch summarize
|
| 548 |
-
summarized_docs = batch_summarize_documents(
|
| 549 |
-
docs_for_summary,
|
| 550 |
-
st.session_state.agent.llm,
|
| 551 |
-
batch_size=self.config.processing.batch_size
|
| 552 |
-
)
|
| 553 |
-
|
| 554 |
-
# Clean up summary progress tracker
|
| 555 |
-
if 'summary_progress' in st.session_state:
|
| 556 |
-
st.session_state.summary_progress.progress(1.0, text="✅ Document summarization complete")
|
| 557 |
-
del st.session_state.summary_progress
|
| 558 |
-
|
| 559 |
-
# Store summaries
|
| 560 |
-
for doc in summarized_docs:
|
| 561 |
-
for path, doc_info in st.session_state.documents.items():
|
| 562 |
-
if doc_info['rel_path'] == doc['path']:
|
| 563 |
-
doc_info['summary'] = doc.get('summary', '')
|
| 564 |
-
|
| 565 |
-
# Create embeddings using summaries
|
| 566 |
-
st.session_state.doc_embeddings_data = create_document_embeddings_with_summaries(
|
| 567 |
-
summarized_docs, self.model
|
| 568 |
-
)
|
| 569 |
-
|
| 570 |
-
tracker.update(4, f"AI summaries complete ({doc_count} documents processed)")
|
| 571 |
-
else:
|
| 572 |
-
tracker.update(4, "Skipping AI summaries (not enabled)")
|
| 573 |
-
|
| 574 |
-
# Step 3: Parse checklist and questions
|
| 575 |
-
tracker.update(5, "Loading checklist and questions...")
|
| 576 |
-
|
| 577 |
-
# Load default checklist
|
| 578 |
-
checklist_text = self._load_default_file(self.config.paths.checklist_path, "*.md")
|
| 579 |
-
if checklist_text:
|
| 580 |
-
st.session_state.checklist = self.service.checklist_parser.parse_checklist(checklist_text)
|
| 581 |
-
|
| 582 |
-
# Load default questions
|
| 583 |
-
questions_text = self._load_default_file(self.config.paths.questions_path, "*.md")
|
| 584 |
-
if questions_text:
|
| 585 |
-
st.session_state.questions = self.service.question_parser.parse_questions(questions_text)
|
| 586 |
-
|
| 587 |
-
tracker.update(6, "Checklist and questions loaded")
|
| 588 |
-
|
| 589 |
-
# Step 7: Generate checklist descriptions if AI is available
|
| 590 |
-
if (hasattr(st.session_state, 'agent') and st.session_state.agent and
|
| 591 |
-
st.session_state.checklist):
|
| 592 |
-
|
| 593 |
-
tracker.update(7, "Generating checklist item descriptions...")
|
| 594 |
-
|
| 595 |
-
# Create progress tracker for descriptions
|
| 596 |
-
st.session_state.description_progress = st.progress(0, text="📝 Generating descriptions...")
|
| 597 |
-
|
| 598 |
-
# Generate enhanced descriptions for better matching
|
| 599 |
-
st.session_state.checklist = generate_checklist_descriptions(
|
| 600 |
-
st.session_state.checklist,
|
| 601 |
-
st.session_state.agent.llm,
|
| 602 |
-
batch_size=self.config.processing.batch_size
|
| 603 |
-
)
|
| 604 |
-
|
| 605 |
-
# Clean up progress tracker
|
| 606 |
-
if 'description_progress' in st.session_state:
|
| 607 |
-
st.session_state.description_progress.progress(1.0, text="✅ Descriptions generated")
|
| 608 |
-
del st.session_state.description_progress
|
| 609 |
-
|
| 610 |
-
tracker.update(8, "Checklist descriptions generated")
|
| 611 |
-
else:
|
| 612 |
-
tracker.update(8, "Skipping description generation (AI not enabled)")
|
| 613 |
-
|
| 614 |
-
# Step 9: Match checklist to documents
|
| 615 |
-
if st.session_state.checklist and st.session_state.chunks:
|
| 616 |
-
tracker.update(9, "Matching checklist to documents...")
|
| 617 |
-
|
| 618 |
-
if hasattr(st.session_state, 'doc_embeddings_data') and st.session_state.doc_embeddings_data:
|
| 619 |
-
# Use AI-enhanced matching with generated descriptions
|
| 620 |
-
st.session_state.checklist_results = match_checklist_with_summaries(
|
| 621 |
-
st.session_state.checklist,
|
| 622 |
-
st.session_state.doc_embeddings_data,
|
| 623 |
-
self.model,
|
| 624 |
-
self.config.processing.similarity_threshold
|
| 625 |
-
)
|
| 626 |
-
else:
|
| 627 |
-
# Use traditional matching
|
| 628 |
-
st.session_state.checklist_results = self.service.checklist_matcher.match_checklist_to_documents(
|
| 629 |
-
st.session_state.checklist,
|
| 630 |
-
st.session_state.chunks,
|
| 631 |
-
st.session_state.embeddings,
|
| 632 |
-
self.config.processing.similarity_threshold
|
| 633 |
-
)
|
| 634 |
-
|
| 635 |
-
tracker.update(10, "Checklist matching complete")
|
| 636 |
-
|
| 637 |
-
# Step 11: Answer questions
|
| 638 |
-
if (st.session_state.questions and st.session_state.chunks and
|
| 639 |
-
st.session_state.embeddings is not None):
|
| 640 |
-
|
| 641 |
-
tracker.update(11, "Answering due diligence questions...")
|
| 642 |
-
|
| 643 |
-
st.session_state.question_answers = self.service.question_answerer.answer_questions_with_chunks(
|
| 644 |
-
st.session_state.questions,
|
| 645 |
-
st.session_state.chunks,
|
| 646 |
-
st.session_state.embeddings,
|
| 647 |
-
self.config.processing.similarity_threshold
|
| 648 |
-
)
|
| 649 |
-
|
| 650 |
-
answered_count = sum(1 for a in st.session_state.question_answers.values() if a['has_answer'])
|
| 651 |
-
tracker.update(12, f"Answered {answered_count}/{len(st.session_state.questions)} questions")
|
| 652 |
-
|
| 653 |
-
tracker.complete("Processing complete!")
|
| 654 |
|
| 655 |
-
#
|
| 656 |
-
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
st.session_state.is_processing = False
|
| 668 |
-
raise # Let decorator handle error display
|
| 669 |
-
|
| 670 |
-
def _load_default_file(self, directory: Path, pattern: str) -> str:
|
| 671 |
-
"""Load the first file matching pattern from directory"""
|
| 672 |
-
try:
|
| 673 |
-
files = list(directory.glob(pattern))
|
| 674 |
-
if files:
|
| 675 |
-
return files[0].read_text(encoding='utf-8')
|
| 676 |
-
except Exception as e:
|
| 677 |
-
logger.warning(f"Could not load default file from {directory}: {e}")
|
| 678 |
-
return ""
|
| 679 |
|
| 680 |
def run(self):
|
| 681 |
"""Run the main application"""
|
|
@@ -698,33 +540,20 @@ class DDChecklistApp:
|
|
| 698 |
self.render_summary_tab()
|
| 699 |
|
| 700 |
with tab2:
|
| 701 |
-
self.
|
| 702 |
|
| 703 |
with tab3:
|
| 704 |
-
self.
|
| 705 |
|
| 706 |
with tab4:
|
| 707 |
self.render_qa_tab()
|
| 708 |
|
| 709 |
-
#
|
| 710 |
-
if st.session_state.just_processed:
|
| 711 |
-
show_success("✅ Data room processing complete! View results in the tabs above.")
|
| 712 |
-
st.session_state.just_processed = False
|
| 713 |
|
| 714 |
-
#
|
| 715 |
if process_button and selected_data_room_path and not st.session_state.is_processing:
|
| 716 |
-
# Set trigger and path for next render
|
| 717 |
-
st.session_state.trigger_processing = True
|
| 718 |
-
st.session_state.processing_path = selected_data_room_path
|
| 719 |
st.session_state.is_processing = True
|
| 720 |
-
|
| 721 |
-
|
| 722 |
-
# Execute processing if triggered
|
| 723 |
-
if st.session_state.trigger_processing and st.session_state.processing_path:
|
| 724 |
-
st.session_state.trigger_processing = False # Reset trigger
|
| 725 |
-
processing_path = st.session_state.processing_path
|
| 726 |
-
st.session_state.processing_path = None
|
| 727 |
-
self.process_data_room(processing_path)
|
| 728 |
|
| 729 |
|
| 730 |
def main():
|
|
|
|
| 7 |
"""
|
| 8 |
|
| 9 |
import os
|
| 10 |
+
import warnings
|
| 11 |
# Fix tokenizers parallelism warning early
|
| 12 |
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
| 13 |
|
| 14 |
+
# Suppress all LangChain verbose warnings globally
|
| 15 |
+
warnings.filterwarnings("ignore", category=UserWarning, module="langchain")
|
| 16 |
+
warnings.filterwarnings("ignore", category=UserWarning, module="langchain_core")
|
| 17 |
+
warnings.filterwarnings("ignore", category=UserWarning, module="langchain_community")
|
| 18 |
+
warnings.filterwarnings("ignore", category=UserWarning, module="langchain_huggingface")
|
| 19 |
+
warnings.filterwarnings("ignore", message=".*Relevance scores must be between.*")
|
| 20 |
+
warnings.filterwarnings("ignore", message=".*No relevant docs were retrieved.*")
|
| 21 |
+
|
| 22 |
+
# Set up LangChain logging levels early
|
| 23 |
+
import logging
|
| 24 |
+
logging.getLogger("langchain").setLevel(logging.ERROR)
|
| 25 |
+
logging.getLogger("langchain_core").setLevel(logging.ERROR)
|
| 26 |
+
logging.getLogger("langchain_community").setLevel(logging.ERROR)
|
| 27 |
+
logging.getLogger("langchain_huggingface").setLevel(logging.ERROR)
|
| 28 |
+
|
| 29 |
import streamlit as st
|
| 30 |
+
|
|
|
|
| 31 |
from pathlib import Path
|
| 32 |
+
from typing import Dict
|
| 33 |
|
| 34 |
# Import our refactored modules
|
| 35 |
from src import (
|
| 36 |
+
init_config, DocumentProcessor,
|
| 37 |
+
logger,
|
| 38 |
+
render_project_selector,
|
| 39 |
+
render_ai_settings, escape_markdown_math,
|
| 40 |
+
get_mime_type, format_document_title
|
| 41 |
)
|
| 42 |
+
from src.document_processing import safe_execute
|
| 43 |
+
# Using Streamlit directly for simplicity
|
| 44 |
from src.ui_components import (
|
| 45 |
+
render_file_selector, render_checklist_results, render_question_results,
|
| 46 |
+
render_quick_questions, create_document_link
|
| 47 |
+
)
|
| 48 |
+
from src.services import (
|
| 49 |
+
search_documents
|
| 50 |
)
|
|
|
|
|
|
|
| 51 |
|
| 52 |
+
from src.config import show_success, show_error, show_info
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
+
# Import LangGraph + Anthropic configuration
|
| 55 |
+
from src.ai import (
|
| 56 |
+
DDChecklistAgent
|
| 57 |
+
)
|
| 58 |
|
| 59 |
|
| 60 |
class DDChecklistApp:
|
| 61 |
"""
|
| 62 |
Main application class that orchestrates all components
|
| 63 |
"""
|
| 64 |
+
|
| 65 |
def __init__(self):
|
| 66 |
"""Initialize the application"""
|
| 67 |
# Initialize configuration
|
| 68 |
+
self.config = init_config()
|
| 69 |
|
| 70 |
# Initialize session state
|
| 71 |
self._init_session_state()
|
|
|
|
| 78 |
)
|
| 79 |
|
| 80 |
# Initialize services (will be loaded when needed)
|
| 81 |
+
self.model_name = self.config.model.sentence_transformer_model
|
| 82 |
+
self.document_processor = None
|
| 83 |
self.agent = None
|
| 84 |
|
| 85 |
def _init_session_state(self):
|
| 86 |
+
"""Initialize essential session state variables only"""
|
| 87 |
+
essential_defaults = {
|
| 88 |
'documents': {},
|
| 89 |
'chunks': [],
|
| 90 |
'embeddings': None,
|
|
|
|
| 91 |
'checklist_results': {},
|
|
|
|
| 92 |
'question_answers': {},
|
|
|
|
|
|
|
| 93 |
'company_summary': "",
|
| 94 |
+
'strategy_analysis': "",
|
| 95 |
'agent': None,
|
| 96 |
+
'is_processing': False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
}
|
| 98 |
|
| 99 |
+
for key, default_value in essential_defaults.items():
|
| 100 |
if key not in st.session_state:
|
| 101 |
st.session_state[key] = default_value
|
| 102 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
def initialize_services(self):
|
| 104 |
"""Initialize core services"""
|
| 105 |
+
if self.document_processor is None:
|
| 106 |
+
self.document_processor = DocumentProcessor(self.model_name)
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
# Restore document processor state from session state if available
|
| 109 |
if (hasattr(st.session_state, 'chunks') and st.session_state.chunks and
|
| 110 |
hasattr(st.session_state, 'embeddings') and st.session_state.embeddings is not None):
|
| 111 |
|
| 112 |
+
self.document_processor.chunks = st.session_state.chunks
|
| 113 |
+
self.document_processor.embeddings = st.session_state.embeddings
|
| 114 |
+
# Note: Don't restore documents here - they'll be recreated from chunks if needed
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
def setup_ai_agent(self, api_key: str, model_choice: str) -> bool:
|
| 117 |
"""
|
| 118 |
+
Setup AI agent
|
| 119 |
|
| 120 |
Args:
|
| 121 |
api_key: Anthropic API key
|
|
|
|
| 123 |
|
| 124 |
Returns:
|
| 125 |
True if agent was successfully initialized
|
| 126 |
+
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
try:
|
| 128 |
with st.spinner("Initializing AI agent..."):
|
| 129 |
agent = DDChecklistAgent(api_key, model_choice)
|
|
|
|
| 133 |
self.agent = agent
|
| 134 |
show_success("✅ AI Agent ready")
|
| 135 |
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
return True
|
| 138 |
else:
|
|
|
|
| 181 |
self.agent = None
|
| 182 |
|
| 183 |
return selected_data_room_path, use_ai_features, process_button
|
|
|
|
| 184 |
|
| 185 |
def render_summary_tab(self):
|
| 186 |
+
"""Render consolidated summary and analysis tab"""
|
| 187 |
# Strategy selector
|
| 188 |
strategy_path, strategy_text = render_file_selector(
|
| 189 |
self.config.paths.strategy_dir, "Strategy", "tab"
|
| 190 |
)
|
|
|
|
| 191 |
|
| 192 |
# Check if we have documents to display summaries
|
| 193 |
if st.session_state.documents:
|
|
|
|
| 195 |
overview_tab, analysis_tab = st.tabs(["🏢 Company Overview", "🎯 Strategic Analysis"])
|
| 196 |
|
| 197 |
with overview_tab:
|
| 198 |
+
self._render_report_section("overview", strategy_text=strategy_text)
|
| 199 |
|
| 200 |
with analysis_tab:
|
| 201 |
+
self._render_report_section("strategic", strategy_text=strategy_text)
|
| 202 |
else:
|
| 203 |
show_info("👈 Configure and process data room to see analysis")
|
| 204 |
|
| 205 |
+
def _render_report_section(self, report_type: str, strategy_text: str = ""):
|
| 206 |
+
"""Unified report rendering for both overview and strategic analysis"""
|
| 207 |
+
from src.services import generate_reports
|
| 208 |
+
|
| 209 |
+
summary_key = f"{report_type}_summary"
|
| 210 |
+
|
| 211 |
+
# Check prerequisites for strategic analysis
|
| 212 |
+
if report_type == "strategic" and not st.session_state.checklist_results:
|
| 213 |
+
st.warning("⚠️ Process data room with checklist first to enable strategic analysis")
|
| 214 |
+
return
|
| 215 |
+
|
| 216 |
+
# Auto-generate report if not already present and AI is available
|
| 217 |
+
if (not st.session_state.get(summary_key, "") and st.session_state.agent):
|
| 218 |
+
with st.spinner(f"🤖 Generating {report_type} analysis..."):
|
| 219 |
+
data_room_name = (Path(list(st.session_state.documents.keys())[0]).parent.name
|
| 220 |
+
if st.session_state.documents else "Unknown")
|
| 221 |
+
|
| 222 |
+
st.session_state[summary_key] = generate_reports(
|
| 223 |
+
st.session_state.documents,
|
| 224 |
+
data_room_name,
|
| 225 |
+
strategy_text,
|
| 226 |
+
st.session_state.checklist_results,
|
| 227 |
+
report_type,
|
| 228 |
+
st.session_state.agent.llm if st.session_state.agent else None
|
| 229 |
)
|
| 230 |
|
| 231 |
+
# Display the report if available
|
| 232 |
+
if st.session_state.get(summary_key, ""):
|
| 233 |
+
st.markdown(st.session_state[summary_key])
|
| 234 |
|
| 235 |
# Add export and regenerate buttons
|
| 236 |
+
self._render_report_actions(report_type, summary_key)
|
| 237 |
+
|
| 238 |
+
def _render_report_actions(self, report_type: str, summary_key: str):
|
| 239 |
+
"""Render export and regenerate actions for reports"""
|
| 240 |
+
if report_type == "overview":
|
| 241 |
col1, col2 = st.columns([1, 5])
|
| 242 |
with col1:
|
| 243 |
+
company_name = (Path(list(st.session_state.documents.keys())[0]).parent.name
|
| 244 |
+
if st.session_state.documents else 'export')
|
| 245 |
+
file_name = f"company_overview_{company_name}.md"
|
| 246 |
st.download_button(
|
| 247 |
"📥 Export Summary",
|
| 248 |
+
data=f"# Company Overview\n\n{st.session_state[summary_key]}",
|
| 249 |
+
file_name=file_name,
|
| 250 |
mime="text/markdown",
|
| 251 |
+
key=f"export_{summary_key}"
|
| 252 |
)
|
| 253 |
with col2:
|
| 254 |
+
if st.button(f"🔄 Regenerate {report_type.title()}"):
|
| 255 |
+
st.session_state[summary_key] = ""
|
| 256 |
st.rerun()
|
| 257 |
+
else:
|
| 258 |
+
col1, col2 = st.columns([1, 5])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
with col1:
|
| 260 |
+
# Combined report export for strategic analysis
|
| 261 |
combined_report = f"# Due Diligence Report\n\n"
|
| 262 |
+
combined_report += f"## Company Overview\n\n{st.session_state.get('overview_summary', '')}\n\n"
|
| 263 |
+
combined_report += f"## Strategic Analysis\n\n{st.session_state[summary_key]}"
|
| 264 |
|
| 265 |
+
company_name = (Path(list(st.session_state.documents.keys())[0]).parent.name
|
| 266 |
+
if st.session_state.documents else 'export')
|
| 267 |
+
file_name = f"dd_report_{company_name}.md"
|
| 268 |
st.download_button(
|
| 269 |
"📥 Export Report",
|
| 270 |
data=combined_report,
|
| 271 |
+
file_name=file_name,
|
| 272 |
mime="text/markdown",
|
| 273 |
+
key=f"export_combined_{summary_key}"
|
| 274 |
)
|
| 275 |
with col2:
|
| 276 |
+
if st.button(f"🔄 Regenerate {report_type.title()}"):
|
| 277 |
+
st.session_state[summary_key] = ""
|
| 278 |
st.rerun()
|
| 279 |
|
| 280 |
+
def render_analysis_tab(self, tab_type: str):
|
| 281 |
+
"""Unified rendering for checklist and questions tabs"""
|
| 282 |
+
if tab_type == "checklist":
|
| 283 |
+
# Checklist selector
|
| 284 |
+
file_path, file_text = render_file_selector(
|
| 285 |
+
self.config.paths.checklist_dir, "Checklist", "tab"
|
| 286 |
+
)
|
| 287 |
+
|
| 288 |
+
if not file_text:
|
| 289 |
+
show_error("No checklists found in data/checklist directory")
|
| 290 |
+
return
|
| 291 |
+
|
| 292 |
+
# Render results if available
|
| 293 |
+
render_checklist_results(st.session_state.checklist_results)
|
| 294 |
+
|
| 295 |
+
elif tab_type == "questions":
|
| 296 |
+
# Question list selector
|
| 297 |
+
file_path, file_text = render_file_selector(
|
| 298 |
+
self.config.paths.questions_dir, "Question List", "tab"
|
| 299 |
+
)
|
| 300 |
+
|
| 301 |
+
if not file_text:
|
| 302 |
+
show_info("No question lists found in data/questions/")
|
| 303 |
+
return
|
| 304 |
+
|
| 305 |
+
# Render results if available
|
| 306 |
+
render_question_results(st.session_state.question_answers)
|
| 307 |
|
| 308 |
def render_qa_tab(self):
|
| 309 |
"""Render the Q&A with citations tab"""
|
|
|
|
| 329 |
|
| 330 |
def _handle_qa_query(self, question: str):
|
| 331 |
"""Handle Q&A query and display results"""
|
| 332 |
+
if not self.document_processor:
|
| 333 |
self.initialize_services()
|
| 334 |
|
| 335 |
# Use lower threshold for Q&A to get more relevant results
|
| 336 |
qa_threshold = 0.25
|
| 337 |
|
| 338 |
+
results = search_documents(
|
| 339 |
+
self.document_processor,
|
| 340 |
question,
|
| 341 |
top_k=self.config.ui.top_k_search_results,
|
| 342 |
threshold=qa_threshold
|
|
|
|
| 353 |
context = "\n\n".join([f"From {r['source']}:\n{r['text']}" for r in results[:3]])
|
| 354 |
# Use LLM directly for more reliable answers
|
| 355 |
from langchain_core.messages import HumanMessage
|
| 356 |
+
prompt = (f"Question: {question}\n\n"
|
| 357 |
+
f"Relevant document excerpts:\n{context}\n\n"
|
| 358 |
+
f"Provide a comprehensive answer with citations to the sources.")
|
| 359 |
response = st.session_state.agent.llm.invoke([HumanMessage(content=prompt)])
|
| 360 |
# Clean up any leading whitespace and escape math characters
|
| 361 |
answer_text = escape_markdown_math(response.content.strip())
|
|
|
|
| 375 |
# Create clickable link for the document
|
| 376 |
doc_path = result.get('path', result.get('full_path', ''))
|
| 377 |
doc_name = result['source']
|
| 378 |
+
doc_title = format_document_title(doc_name)
|
|
|
|
|
|
|
|
|
|
| 379 |
|
| 380 |
if doc_path:
|
| 381 |
link_html = create_document_link(doc_path, doc_name, doc_title)
|
|
|
|
| 402 |
file_bytes = f.read()
|
| 403 |
|
| 404 |
# Determine MIME type based on file extension
|
| 405 |
+
mime_type = get_mime_type(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 406 |
|
| 407 |
button_key = f"qacit_dl_{idx}_{question[:20]}".replace(" ", "_").replace("?", "")
|
| 408 |
|
|
|
|
| 417 |
except Exception as e:
|
| 418 |
st.error(f"Download failed: {str(e)}")
|
| 419 |
|
|
|
|
| 420 |
def process_data_room(self, data_room_path: str):
|
| 421 |
+
"""Simplified data room processing"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
if not Path(data_room_path).exists():
|
| 423 |
show_error(f"Data room path not found: {data_room_path}")
|
| 424 |
+
st.session_state.is_processing = False
|
| 425 |
return
|
| 426 |
|
| 427 |
+
# Use safe_execute for the entire processing operation
|
| 428 |
+
def process_operation():
|
| 429 |
self.initialize_services()
|
| 430 |
+
# Simple processing - load documents
|
| 431 |
+
self.document_processor.load_data_room(data_room_path)
|
| 432 |
|
| 433 |
+
# Store results in session state with simplified structure
|
| 434 |
+
# Convert list of LangChain documents to dictionary format expected by UI
|
| 435 |
+
documents_dict = {}
|
| 436 |
+
for doc in self.document_processor.documents:
|
| 437 |
+
file_path = doc.metadata.get('source', doc.metadata.get('path', 'unknown'))
|
| 438 |
+
documents_dict[file_path] = {
|
| 439 |
+
'name': doc.metadata.get('name', Path(file_path).name if file_path != 'unknown' else 'unknown'),
|
| 440 |
+
'path': doc.metadata.get('path', ''),
|
| 441 |
+
'content': doc.page_content,
|
| 442 |
+
'metadata': doc.metadata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
}
|
| 444 |
+
|
| 445 |
+
st.session_state.documents = documents_dict
|
| 446 |
+
st.session_state.chunks = self.document_processor.chunks
|
| 447 |
+
st.session_state.embeddings = self.document_processor.embeddings
|
| 448 |
+
|
| 449 |
+
# Process checklist and questions if available
|
| 450 |
+
self._process_checklist_and_questions()
|
| 451 |
+
|
| 452 |
+
# Clear any existing analysis to trigger regeneration
|
| 453 |
+
st.session_state.company_summary = ""
|
| 454 |
+
st.session_state.strategy_analysis = ""
|
| 455 |
+
st.session_state.overview_summary = ""
|
| 456 |
+
st.session_state.strategic_summary = ""
|
| 457 |
+
|
| 458 |
+
show_success("✅ Data room processing complete! View results in the tabs above.")
|
| 459 |
+
st.rerun()
|
| 460 |
+
|
| 461 |
+
safe_execute(
|
| 462 |
+
process_operation,
|
| 463 |
+
None,
|
| 464 |
+
"Data room processing"
|
| 465 |
+
)
|
| 466 |
+
|
| 467 |
+
st.session_state.is_processing = False
|
| 468 |
+
|
| 469 |
+
def _process_checklist_and_questions(self):
|
| 470 |
+
"""Process checklist and questions after documents are loaded"""
|
| 471 |
+
from src.services import parse_checklist, parse_questions, create_vector_store, search_and_analyze, load_default_file
|
| 472 |
+
|
| 473 |
+
# Load default checklist if available
|
| 474 |
+
checklist_text = load_default_file(Path(self.config.paths.checklist_dir), "*.md")
|
| 475 |
+
if checklist_text and self.document_processor.chunks:
|
| 476 |
+
try:
|
| 477 |
+
# Parse checklist
|
| 478 |
+
checklist = parse_checklist(checklist_text)
|
| 479 |
+
st.session_state.checklist = checklist
|
| 480 |
|
| 481 |
+
# Create vector store from chunks for processing
|
| 482 |
+
vector_store = create_vector_store(self.document_processor.chunks, self.model_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 483 |
|
| 484 |
+
# Process checklist items
|
| 485 |
+
checklist_results = search_and_analyze(
|
| 486 |
+
checklist,
|
| 487 |
+
vector_store,
|
| 488 |
+
self.agent.llm if self.agent else None,
|
| 489 |
+
self.config.processing.similarity_threshold,
|
| 490 |
+
'items'
|
| 491 |
)
|
| 492 |
+
st.session_state.checklist_results = checklist_results
|
| 493 |
+
logger.info("✅ Checklist processing completed")
|
| 494 |
+
except Exception as e:
|
| 495 |
+
logger.error(f"Checklist processing failed: {e}")
|
| 496 |
+
|
| 497 |
+
# Load default questions if available
|
| 498 |
+
questions_text = load_default_file(Path(self.config.paths.questions_dir), "*.md")
|
| 499 |
+
if questions_text and self.document_processor.chunks:
|
| 500 |
+
try:
|
| 501 |
+
# Parse questions
|
| 502 |
+
questions = parse_questions(questions_text)
|
| 503 |
+
st.session_state.questions = questions
|
| 504 |
|
| 505 |
+
# Create vector store from chunks for processing (reuse if already created)
|
| 506 |
+
if 'vector_store' not in locals():
|
| 507 |
+
vector_store = create_vector_store(self.document_processor.chunks, self.model_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 508 |
|
| 509 |
+
# Process questions
|
| 510 |
+
question_answers = search_and_analyze(
|
| 511 |
+
questions,
|
| 512 |
+
vector_store,
|
| 513 |
+
self.agent.llm if self.agent else None,
|
| 514 |
+
self.config.processing.relevancy_threshold,
|
| 515 |
+
'questions'
|
| 516 |
+
)
|
| 517 |
+
st.session_state.question_answers = question_answers
|
| 518 |
+
logger.info("✅ Questions processing completed")
|
| 519 |
+
except Exception as e:
|
| 520 |
+
logger.error(f"Questions processing failed: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 521 |
|
| 522 |
def run(self):
|
| 523 |
"""Run the main application"""
|
|
|
|
| 540 |
self.render_summary_tab()
|
| 541 |
|
| 542 |
with tab2:
|
| 543 |
+
self.render_analysis_tab("checklist")
|
| 544 |
|
| 545 |
with tab3:
|
| 546 |
+
self.render_analysis_tab("questions")
|
| 547 |
|
| 548 |
with tab4:
|
| 549 |
self.render_qa_tab()
|
| 550 |
|
| 551 |
+
# Processing complete message is handled in process_data_room function
|
|
|
|
|
|
|
|
|
|
| 552 |
|
| 553 |
+
# Simplified processing trigger
|
| 554 |
if process_button and selected_data_room_path and not st.session_state.is_processing:
|
|
|
|
|
|
|
|
|
|
| 555 |
st.session_state.is_processing = True
|
| 556 |
+
self.process_data_room(selected_data_room_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 557 |
|
| 558 |
|
| 559 |
def main():
|
pyproject.toml
CHANGED
|
@@ -8,22 +8,31 @@ dependencies = [
|
|
| 8 |
"streamlit>=1.28.0",
|
| 9 |
"sentence-transformers>=2.2.0",
|
| 10 |
"numpy>=1.24.0",
|
| 11 |
-
"pandas>=2.0.0",
|
| 12 |
-
"watchdog>=3.0.0",
|
| 13 |
# Document processing
|
| 14 |
"pymupdf>=1.23.0",
|
| 15 |
"python-docx>=0.8.11",
|
| 16 |
# Environment and configuration
|
| 17 |
"python-dotenv>=1.0.0",
|
|
|
|
|
|
|
| 18 |
# Vector store
|
| 19 |
"faiss-cpu>=1.7.4",
|
| 20 |
-
# AI Enhancement
|
| 21 |
"langchain-anthropic>=0.1.0",
|
| 22 |
"langgraph>=0.0.20",
|
| 23 |
"langchain-core>=0.1.0",
|
| 24 |
"langchain-text-splitters>=0.3.10",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
]
|
| 26 |
|
| 27 |
[build-system]
|
| 28 |
requires = ["setuptools", "wheel"]
|
| 29 |
build-backend = "setuptools.build_meta"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
"streamlit>=1.28.0",
|
| 9 |
"sentence-transformers>=2.2.0",
|
| 10 |
"numpy>=1.24.0",
|
|
|
|
|
|
|
| 11 |
# Document processing
|
| 12 |
"pymupdf>=1.23.0",
|
| 13 |
"python-docx>=0.8.11",
|
| 14 |
# Environment and configuration
|
| 15 |
"python-dotenv>=1.0.0",
|
| 16 |
+
"pydantic-settings>=2.10.1",
|
| 17 |
+
"markdown>=3.8.2",
|
| 18 |
# Vector store
|
| 19 |
"faiss-cpu>=1.7.4",
|
| 20 |
+
# AI Enhancement - LangChain packages
|
| 21 |
"langchain-anthropic>=0.1.0",
|
| 22 |
"langgraph>=0.0.20",
|
| 23 |
"langchain-core>=0.1.0",
|
| 24 |
"langchain-text-splitters>=0.3.10",
|
| 25 |
+
"langchain-community>=0.3.29",
|
| 26 |
+
"langchain-huggingface>=0.3.1",
|
| 27 |
+
"pypdf>=6.0.0",
|
| 28 |
+
"watchdog>=6.0.0",
|
| 29 |
]
|
| 30 |
|
| 31 |
[build-system]
|
| 32 |
requires = ["setuptools", "wheel"]
|
| 33 |
build-backend = "setuptools.build_meta"
|
| 34 |
+
|
| 35 |
+
[dependency-groups]
|
| 36 |
+
dev = [
|
| 37 |
+
"autoflake>=2.3.1",
|
| 38 |
+
]
|
requirements.txt
CHANGED
|
@@ -2,22 +2,24 @@
|
|
| 2 |
streamlit==1.49.1
|
| 3 |
sentence-transformers==5.1.0
|
| 4 |
numpy==2.3.2
|
| 5 |
-
pandas==2.3.2
|
| 6 |
-
watchdog==6.0.0
|
| 7 |
|
| 8 |
# Document processing - pinned for deployment
|
| 9 |
PyMuPDF==1.23.18
|
| 10 |
python-docx==1.2.0
|
| 11 |
-
joblib==1.5.2
|
| 12 |
|
| 13 |
# Environment and configuration - pinned for deployment
|
| 14 |
python-dotenv==1.1.1
|
|
|
|
|
|
|
| 15 |
|
| 16 |
# Vector store - pinned for deployment
|
| 17 |
faiss-cpu==1.12.0
|
| 18 |
|
| 19 |
-
# AI Enhancement - pinned for deployment
|
| 20 |
langchain-anthropic==0.3.19
|
| 21 |
langgraph==0.6.6
|
| 22 |
langchain-core==0.3.75
|
| 23 |
-
langchain-text-splitters==0.3.10
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
streamlit==1.49.1
|
| 3 |
sentence-transformers==5.1.0
|
| 4 |
numpy==2.3.2
|
|
|
|
|
|
|
| 5 |
|
| 6 |
# Document processing - pinned for deployment
|
| 7 |
PyMuPDF==1.23.18
|
| 8 |
python-docx==1.2.0
|
|
|
|
| 9 |
|
| 10 |
# Environment and configuration - pinned for deployment
|
| 11 |
python-dotenv==1.1.1
|
| 12 |
+
pydantic-settings==2.8.1
|
| 13 |
+
markdown==3.9
|
| 14 |
|
| 15 |
# Vector store - pinned for deployment
|
| 16 |
faiss-cpu==1.12.0
|
| 17 |
|
| 18 |
+
# AI Enhancement - LangChain packages pinned for deployment
|
| 19 |
langchain-anthropic==0.3.19
|
| 20 |
langgraph==0.6.6
|
| 21 |
langchain-core==0.3.75
|
| 22 |
+
langchain-text-splitters==0.3.10
|
| 23 |
+
langchain-community==0.3.29
|
| 24 |
+
langchain-huggingface==0.3.1
|
| 25 |
+
|
src/__init__.py
CHANGED
|
@@ -5,10 +5,11 @@ DD-Checklist Source Package
|
|
| 5 |
This package contains the refactored components of the DD-Checklist application.
|
| 6 |
"""
|
| 7 |
|
| 8 |
-
from .config import
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
|
|
|
| 12 |
from .ui_components import render_project_selector, render_ai_settings
|
| 13 |
|
| 14 |
__version__ = "0.2.0"
|
|
@@ -17,24 +18,21 @@ __author__ = "DD-Checklist Team"
|
|
| 17 |
__all__ = [
|
| 18 |
# Configuration
|
| 19 |
"get_config",
|
| 20 |
-
"init_config",
|
| 21 |
-
"get_model_config",
|
| 22 |
-
"get_processing_config",
|
| 23 |
|
| 24 |
# Document Processing
|
| 25 |
"DocumentProcessor",
|
| 26 |
"escape_markdown_math",
|
|
|
|
| 27 |
|
| 28 |
-
#
|
| 29 |
-
"DDChecklistService",
|
| 30 |
-
"ChecklistParser",
|
| 31 |
-
"QuestionParser",
|
| 32 |
-
|
| 33 |
-
# Utilities
|
| 34 |
"logger",
|
| 35 |
-
"
|
| 36 |
-
"
|
| 37 |
-
"
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
# UI Components
|
| 40 |
"render_project_selector",
|
|
|
|
| 5 |
This package contains the refactored components of the DD-Checklist application.
|
| 6 |
"""
|
| 7 |
|
| 8 |
+
from .config import (
|
| 9 |
+
get_config, init_config, logger, show_success, show_error, show_info,
|
| 10 |
+
get_mime_type, format_document_title, count_documents_in_directory
|
| 11 |
+
)
|
| 12 |
+
from .document_processing import DocumentProcessor, escape_markdown_math, safe_execute
|
| 13 |
from .ui_components import render_project_selector, render_ai_settings
|
| 14 |
|
| 15 |
__version__ = "0.2.0"
|
|
|
|
| 18 |
__all__ = [
|
| 19 |
# Configuration
|
| 20 |
"get_config",
|
| 21 |
+
"init_config",
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# Document Processing
|
| 24 |
"DocumentProcessor",
|
| 25 |
"escape_markdown_math",
|
| 26 |
+
"safe_execute",
|
| 27 |
|
| 28 |
+
# Utilities (merged from utils.py)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
"logger",
|
| 30 |
+
"show_success",
|
| 31 |
+
"show_error",
|
| 32 |
+
"show_info",
|
| 33 |
+
"get_mime_type",
|
| 34 |
+
"format_document_title",
|
| 35 |
+
"count_documents_in_directory",
|
| 36 |
|
| 37 |
# UI Components
|
| 38 |
"render_project_selector",
|
src/ai/__init__.py
CHANGED
|
@@ -6,70 +6,23 @@ This module provides AI-powered functionality for the DD-Checklist application,
|
|
| 6 |
including LangGraph agents, document processing, and checklist matching.
|
| 7 |
"""
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
get_document_summarization_prompt
|
| 27 |
-
)
|
| 28 |
-
|
| 29 |
-
# Set availability flag based on successful imports
|
| 30 |
-
AI_MODULE_AVAILABLE = LANGGRAPH_AVAILABLE
|
| 31 |
-
|
| 32 |
-
except ImportError as e:
|
| 33 |
-
# Handle missing dependencies gracefully
|
| 34 |
-
print(f"AI module dependencies not available: {e}")
|
| 35 |
-
|
| 36 |
-
# Create placeholder classes/functions for graceful degradation
|
| 37 |
-
class DDChecklistAgent:
|
| 38 |
-
def __init__(self, *args, **kwargs):
|
| 39 |
-
self.app = None
|
| 40 |
-
self.llm = None
|
| 41 |
-
|
| 42 |
-
def is_available(self):
|
| 43 |
-
return False
|
| 44 |
-
|
| 45 |
-
def get_langgraph_agent(*args, **kwargs):
|
| 46 |
-
return None
|
| 47 |
-
|
| 48 |
-
def batch_summarize_documents(documents, *args, **kwargs):
|
| 49 |
-
return documents
|
| 50 |
-
|
| 51 |
-
def create_document_embeddings_with_summaries(documents, *args, **kwargs):
|
| 52 |
-
return {'embeddings': [], 'documents': []}
|
| 53 |
-
|
| 54 |
-
def match_checklist_with_summaries(*args, **kwargs):
|
| 55 |
-
return {}
|
| 56 |
-
|
| 57 |
-
def generate_checklist_descriptions(checklist, *args, **kwargs):
|
| 58 |
-
return checklist
|
| 59 |
-
|
| 60 |
-
def exponential_backoff_retry(func, *args, **kwargs):
|
| 61 |
-
return func()
|
| 62 |
-
|
| 63 |
-
# Set availability flags
|
| 64 |
-
LANGGRAPH_AVAILABLE = False
|
| 65 |
-
AI_MODULE_AVAILABLE = False
|
| 66 |
-
|
| 67 |
-
# Placeholder classes for type hints
|
| 68 |
-
class AgentState:
|
| 69 |
-
pass
|
| 70 |
-
|
| 71 |
-
class TaskType:
|
| 72 |
-
pass
|
| 73 |
|
| 74 |
# Export main public API
|
| 75 |
__all__ = [
|
|
@@ -77,14 +30,9 @@ __all__ = [
|
|
| 77 |
'DDChecklistAgent',
|
| 78 |
'get_langgraph_agent',
|
| 79 |
|
| 80 |
-
|
| 81 |
-
'batch_summarize_documents',
|
| 82 |
-
'create_document_embeddings_with_summaries',
|
| 83 |
-
'match_checklist_with_summaries',
|
| 84 |
-
'generate_checklist_descriptions',
|
| 85 |
-
'exponential_backoff_retry',
|
| 86 |
|
| 87 |
-
# Agent types and state
|
| 88 |
'AgentState',
|
| 89 |
'TaskType',
|
| 90 |
|
|
@@ -95,8 +43,4 @@ __all__ = [
|
|
| 95 |
'get_findings_summary_prompt',
|
| 96 |
'get_description_generation_prompt',
|
| 97 |
'get_document_summarization_prompt',
|
| 98 |
-
|
| 99 |
-
# Availability flags
|
| 100 |
-
'LANGGRAPH_AVAILABLE',
|
| 101 |
-
'AI_MODULE_AVAILABLE',
|
| 102 |
]
|
|
|
|
| 6 |
including LangGraph agents, document processing, and checklist matching.
|
| 7 |
"""
|
| 8 |
|
| 9 |
+
# Import core components
|
| 10 |
+
from .prompts import (
|
| 11 |
+
get_checklist_parsing_prompt,
|
| 12 |
+
get_document_relevance_prompt,
|
| 13 |
+
get_question_answering_prompt,
|
| 14 |
+
get_findings_summary_prompt,
|
| 15 |
+
get_description_generation_prompt,
|
| 16 |
+
get_document_summarization_prompt
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
# Direct imports for AI functionality - assuming dependencies are present
|
| 20 |
+
from .agent_core import (
|
| 21 |
+
DDChecklistAgent,
|
| 22 |
+
get_langgraph_agent,
|
| 23 |
+
AgentState,
|
| 24 |
+
TaskType
|
| 25 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
# Export main public API
|
| 28 |
__all__ = [
|
|
|
|
| 30 |
'DDChecklistAgent',
|
| 31 |
'get_langgraph_agent',
|
| 32 |
|
| 33 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
+
# Agent types and state (now in agent_core)
|
| 36 |
'AgentState',
|
| 37 |
'TaskType',
|
| 38 |
|
|
|
|
| 43 |
'get_findings_summary_prompt',
|
| 44 |
'get_description_generation_prompt',
|
| 45 |
'get_document_summarization_prompt',
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
]
|
src/ai/agent_core.py
CHANGED
|
@@ -2,40 +2,548 @@
|
|
| 2 |
"""
|
| 3 |
LangGraph Agent Core Module
|
| 4 |
|
| 5 |
-
This module contains the main LangGraph agent setup and the high-level
|
| 6 |
DDChecklistAgent class for interacting with the agent system.
|
|
|
|
|
|
|
| 7 |
"""
|
| 8 |
|
| 9 |
import os
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
|
| 30 |
from ..config import get_config
|
| 31 |
-
from .
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
)
|
| 36 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
"""
|
| 40 |
Create a LangGraph agent with Anthropic
|
| 41 |
|
|
@@ -47,9 +555,6 @@ def get_langgraph_agent(api_key: Optional[str] = None, model: Optional[str] = No
|
|
| 47 |
Tuple of (compiled_app, llm) or None if not available
|
| 48 |
"""
|
| 49 |
|
| 50 |
-
if not LANGGRAPH_AVAILABLE:
|
| 51 |
-
return None
|
| 52 |
-
|
| 53 |
# Get configuration
|
| 54 |
config = get_config()
|
| 55 |
|
|
@@ -165,7 +670,7 @@ class DDChecklistAgent:
|
|
| 165 |
|
| 166 |
def is_available(self) -> bool:
|
| 167 |
"""Check if the agent is available for use"""
|
| 168 |
-
return self.app is not None
|
| 169 |
|
| 170 |
def parse_checklist(self, checklist_text: str) -> Optional[Dict]:
|
| 171 |
"""
|
|
@@ -189,8 +694,7 @@ class DDChecklistAgent:
|
|
| 189 |
|
| 190 |
return result.get("checklist")
|
| 191 |
except Exception as e:
|
| 192 |
-
|
| 193 |
-
st.error(f"Agent error: {str(e)}")
|
| 194 |
return None
|
| 195 |
|
| 196 |
def match_documents(self, checklist: Dict, documents: List[Dict]) -> Dict:
|
|
@@ -223,8 +727,7 @@ class DDChecklistAgent:
|
|
| 223 |
|
| 224 |
return result.get("findings", {})
|
| 225 |
except Exception as e:
|
| 226 |
-
|
| 227 |
-
st.error(f"Agent error: {str(e)}")
|
| 228 |
return {}
|
| 229 |
|
| 230 |
def answer_question(self, question: str, documents: List[Dict]) -> str:
|
|
|
|
| 2 |
"""
|
| 3 |
LangGraph Agent Core Module
|
| 4 |
|
| 5 |
+
This module contains the main LangGraph agent setup, AI utilities, and the high-level
|
| 6 |
DDChecklistAgent class for interacting with the agent system.
|
| 7 |
+
|
| 8 |
+
Merged from: agent_core.py, agent_nodes.py, llm_utilities.py
|
| 9 |
"""
|
| 10 |
|
| 11 |
import os
|
| 12 |
+
import json
|
| 13 |
+
import time
|
| 14 |
+
import random
|
| 15 |
+
import logging
|
| 16 |
+
from typing import Optional, Dict, List, Any, Tuple, Sequence
|
| 17 |
+
from typing_extensions import TypedDict
|
| 18 |
+
from enum import Enum
|
| 19 |
+
import streamlit as st
|
| 20 |
+
from langchain_anthropic import ChatAnthropic
|
| 21 |
+
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, SystemMessage
|
| 22 |
+
from langchain_core.tools import tool
|
| 23 |
+
from langchain_core.output_parsers import PydanticOutputParser
|
| 24 |
+
from langchain_community.vectorstores import FAISS
|
| 25 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 26 |
+
from langchain_core.documents import Document
|
| 27 |
+
from langgraph.graph import StateGraph, END
|
| 28 |
+
|
| 29 |
+
from langgraph.checkpoint.memory import MemorySaver
|
| 30 |
+
from pydantic import BaseModel, Field
|
| 31 |
|
| 32 |
from ..config import get_config
|
| 33 |
+
from ..document_processing import safe_execute
|
| 34 |
+
from .prompts import (
|
| 35 |
+
get_checklist_parsing_prompt,
|
| 36 |
+
get_document_relevance_prompt,
|
| 37 |
+
get_question_answering_prompt,
|
| 38 |
+
get_findings_summary_prompt,
|
| 39 |
+
get_description_generation_prompt,
|
| 40 |
+
get_document_summarization_prompt
|
| 41 |
)
|
| 42 |
|
| 43 |
+
logger = logging.getLogger(__name__)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# =============================================================================
|
| 47 |
+
# TYPE DEFINITIONS - Merged from agent_nodes.py
|
| 48 |
+
# =============================================================================
|
| 49 |
+
|
| 50 |
+
# Simple Pydantic models for structured output parsing
|
| 51 |
+
class SimpleChecklist(BaseModel):
|
| 52 |
+
"""Simple model matching existing checklist structure"""
|
| 53 |
+
categories: Dict = Field(description="Checklist categories as they currently exist")
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
# Define the state for our agent
|
| 57 |
+
class AgentState(TypedDict):
|
| 58 |
+
"""State for the due diligence agent"""
|
| 59 |
+
messages: Sequence[BaseMessage]
|
| 60 |
+
checklist: Optional[Dict]
|
| 61 |
+
documents: Optional[List[Dict]]
|
| 62 |
+
current_task: Optional[str]
|
| 63 |
+
findings: Dict[str, List[str]]
|
| 64 |
+
next_action: Optional[str]
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
class TaskType(Enum):
|
| 68 |
+
"""Types of tasks the agent can perform"""
|
| 69 |
+
PARSE_CHECKLIST = "parse_checklist"
|
| 70 |
+
ANALYZE_DOCUMENT = "analyze_document"
|
| 71 |
+
MATCH_CHECKLIST = "match_checklist"
|
| 72 |
+
ANSWER_QUESTION = "answer_question"
|
| 73 |
+
SUMMARIZE_FINDINGS = "summarize_findings"
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# =============================================================================
|
| 77 |
+
# AGENT NODE FUNCTIONS - Merged from agent_nodes.py
|
| 78 |
+
# =============================================================================
|
| 79 |
+
|
| 80 |
+
def route_task(state: AgentState) -> AgentState:
|
| 81 |
+
"""Route to appropriate task based on current state"""
|
| 82 |
+
messages = state["messages"]
|
| 83 |
+
if not messages:
|
| 84 |
+
return state
|
| 85 |
+
|
| 86 |
+
last_message = messages[-1].content if messages else ""
|
| 87 |
+
|
| 88 |
+
# Determine next action based on message content
|
| 89 |
+
if "parse" in last_message.lower() and "checklist" in last_message.lower():
|
| 90 |
+
state["next_action"] = TaskType.PARSE_CHECKLIST.value
|
| 91 |
+
elif "analyze" in last_message.lower() or "match" in last_message.lower():
|
| 92 |
+
state["next_action"] = TaskType.MATCH_CHECKLIST.value
|
| 93 |
+
elif "?" in last_message:
|
| 94 |
+
state["next_action"] = TaskType.ANSWER_QUESTION.value
|
| 95 |
+
else:
|
| 96 |
+
state["next_action"] = TaskType.SUMMARIZE_FINDINGS.value
|
| 97 |
+
|
| 98 |
+
return state
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def parse_checklist_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
|
| 102 |
+
"""Parse checklist using structured output - much simpler!"""
|
| 103 |
+
messages = state["messages"]
|
| 104 |
+
checklist_text = messages[-1].content if messages else ""
|
| 105 |
+
|
| 106 |
+
# Set up simple parser
|
| 107 |
+
parser = PydanticOutputParser(pydantic_object=SimpleChecklist)
|
| 108 |
+
prompt = get_checklist_parsing_prompt(checklist_text)
|
| 109 |
+
|
| 110 |
+
# Create chain and parse - that's it!
|
| 111 |
+
chain = prompt | llm | parser
|
| 112 |
+
|
| 113 |
+
try:
|
| 114 |
+
result = chain.invoke({
|
| 115 |
+
"checklist_text": checklist_text[:3000],
|
| 116 |
+
"format_instructions": parser.get_format_instructions()
|
| 117 |
+
})
|
| 118 |
+
|
| 119 |
+
state["checklist"] = result.categories # Already in the right format!
|
| 120 |
+
state["messages"].append(AIMessage(content=f"Parsed {len(result.categories)} categories"))
|
| 121 |
+
|
| 122 |
+
except Exception as e:
|
| 123 |
+
state["messages"].append(AIMessage(content=f"Parsing failed: {str(e)}"))
|
| 124 |
+
|
| 125 |
+
return state
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def match_checklist_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
|
| 129 |
+
"""Match documents to checklist items - keep it simple"""
|
| 130 |
+
checklist = state.get("checklist", {})
|
| 131 |
+
documents = state.get("documents", [])
|
| 132 |
+
|
| 133 |
+
if not checklist or not documents:
|
| 134 |
+
state["messages"].append(AIMessage(content="Need both checklist and documents to match"))
|
| 135 |
+
return state
|
| 136 |
+
|
| 137 |
+
# For each checklist item, find relevant documents
|
| 138 |
+
findings = {}
|
| 139 |
+
for cat_letter, category in checklist.items():
|
| 140 |
+
cat_findings = []
|
| 141 |
+
for item in category.get("items", []):
|
| 142 |
+
# Use Claude to assess relevance
|
| 143 |
+
document_names = [d.get('name', 'Unknown') for d in documents[:10]]
|
| 144 |
+
prompt = get_document_relevance_prompt(item['text'], document_names)
|
| 145 |
+
|
| 146 |
+
response = llm.invoke([HumanMessage(content=str(prompt))])
|
| 147 |
+
cat_findings.append({
|
| 148 |
+
"item": item['text'],
|
| 149 |
+
"relevant_docs": response.content
|
| 150 |
+
})
|
| 151 |
+
|
| 152 |
+
findings[category['name']] = cat_findings
|
| 153 |
+
|
| 154 |
+
state["findings"] = findings
|
| 155 |
+
state["messages"].append(AIMessage(content=f"Matched checklist to {len(documents)} documents"))
|
| 156 |
+
|
| 157 |
+
return state
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def answer_question_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
|
| 161 |
+
"""Answer questions using document context"""
|
| 162 |
+
messages = state["messages"]
|
| 163 |
+
question = messages[-1].content if messages else ""
|
| 164 |
+
documents = state.get("documents", [])
|
| 165 |
+
|
| 166 |
+
# Create context from documents
|
| 167 |
+
context = "\n".join([f"- {d.get('name', 'Unknown')}: {d.get('text', '')[:200]}"
|
| 168 |
+
for d in documents[:5]])
|
| 169 |
+
|
| 170 |
+
prompt = get_question_answering_prompt(question, context)
|
| 171 |
+
response = llm.invoke([HumanMessage(content=prompt)])
|
| 172 |
+
state["messages"].append(AIMessage(content=response.content))
|
| 173 |
+
|
| 174 |
+
return state
|
| 175 |
+
|
| 176 |
|
| 177 |
+
def summarize_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
    """Produce an LLM-written summary of the accumulated findings.

    Appends either the summary or a "nothing to summarize" notice to the
    message history, then returns the (mutated) state.
    """
    collected = state.get("findings", {})

    if not collected:
        # Nothing gathered yet -- tell the user instead of calling the LLM.
        state["messages"].append(AIMessage(content="No findings to summarize"))
        return state

    summary_prompt = get_findings_summary_prompt(collected)
    llm_reply = llm.invoke([HumanMessage(content=summary_prompt)])
    state["messages"].append(AIMessage(content=llm_reply.content))

    return state
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def route_condition(state: AgentState) -> str:
    """Map the state's pending ``next_action`` onto a graph node name.

    Unknown or missing actions fall through to the "summarize" node, which
    matches the original ``else`` branch.
    """
    action_to_node = {
        TaskType.PARSE_CHECKLIST.value: "parse_checklist",
        TaskType.MATCH_CHECKLIST.value: "match_checklist",
        TaskType.ANSWER_QUESTION.value: "answer_question",
    }
    return action_to_node.get(state.get("next_action"), "summarize")
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
# =============================================================================
|
| 206 |
+
# LLM UTILITIES - Merged from llm_utilities.py
|
| 207 |
+
# =============================================================================
|
| 208 |
+
|
| 209 |
+
def simple_retry(func, max_retries: int = 3, base_delay: float = 1.0):
    """Call *func*, retrying rate-limit style failures with exponential backoff.

    Only errors whose message looks like throttling/overload (e.g. "rate",
    "limit", "quota", "429", "529", "too many") are retried; any other
    exception propagates immediately. The sleep before retry *k* is
    ``base_delay * 2**k`` plus jitter, capped at 60 seconds.

    Args:
        func: Zero-argument callable to execute.
        max_retries: Total number of attempts; must be at least 1.
        base_delay: Base backoff delay in seconds; also scales the jitter so
            small base delays are not swamped by a fixed-size jitter.

    Returns:
        Whatever ``func()`` returns on the first successful attempt.

    Raises:
        ValueError: If ``max_retries`` is less than 1 (the original code would
            have reached ``raise None`` and crashed with a TypeError here).
        Exception: The last retryable error after exhausting attempts, or the
            first non-retryable error immediately.
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    # Substrings that identify transient rate-limit / overload failures.
    retryable_markers = (
        'rate', 'limit', 'quota', 'throttl', '429', 'too many',
        'overload', '529', 'server_overloaded', 'overloaded_error',
    )

    for attempt in range(max_retries):
        try:
            return func()
        except Exception as exc:
            error_text = str(exc).lower()
            is_retryable = any(marker in error_text for marker in retryable_markers)

            # Non-retryable errors, or a retryable one on the final attempt,
            # propagate to the caller (bare raise preserves the traceback).
            if not is_retryable or attempt == max_retries - 1:
                raise

            # Exponential backoff with jitter proportional to base_delay.
            delay = base_delay * (2 ** attempt) + random.uniform(0, base_delay)
            time.sleep(min(delay, 60))  # Cap at 60 seconds
|
| 234 |
+
|
| 235 |
+
|
| 236 |
+
def generate_checklist_descriptions(checklist: Dict, llm: "ChatAnthropic", batch_size: Optional[int] = None) -> Dict:
    """
    Generate detailed descriptions for each checklist item explaining what documents should satisfy it.
    Returns checklist with added 'description' field for each item.

    The input checklist is not mutated: a new dict of categories is built, and
    each item is shallow-copied before the 'description' key is added.

    Args:
        checklist: Checklist dictionary to enhance
        llm: ChatAnthropic instance for generating descriptions
        batch_size: Number of items to process in each batch (uses config default if None)

    Returns:
        Enhanced checklist with descriptions
    """

    config = get_config()
    if batch_size is None:
        batch_size = config.processing.description_batch_size

    # Process all checklist items
    enhanced_checklist = {}
    all_items_to_process = []

    # Collect all items with their context (flattened so batching can span categories)
    for cat_letter, category in checklist.items():
        cat_name = category.get('name', '')
        enhanced_checklist[cat_letter] = {
            'name': cat_name,
            'letter': cat_letter,
            'items': []
        }

        for item in category.get('items', []):
            item_data = {
                'category_letter': cat_letter,
                'category_name': cat_name,
                'item_text': item.get('text', ''),
                'original_item': item,
                # .format() renders the prompt template to a plain string here
                # (NOTE(review): assumes the template has no unfilled variables — confirm)
                'prompt': get_description_generation_prompt(cat_name, item.get('text', '')).format()
            }
            all_items_to_process.append(item_data)

    # Process items in batches
    total_items = len(all_items_to_process)
    # Ceiling division: number of batches needed to cover all items.
    total_batches = (total_items + batch_size - 1) // batch_size

    for batch_num, i in enumerate(range(0, total_items, batch_size), 1):
        batch = all_items_to_process[i:i + batch_size]
        batch_end = min(i + batch_size, total_items)

        # Update progress if available (only when the UI registered a progress bar)
        if hasattr(st, 'progress') and 'description_progress' in st.session_state:
            progress = i / total_items
            st.session_state.description_progress.progress(
                progress,
                text=f"📝 Generating descriptions batch {batch_num}/{total_batches} (items {i+1}-{batch_end} of {total_items})"
            )

        # Create prompts for batch processing
        prompts = [item_data['prompt'] for item_data in batch]
        messages_batch = [[HumanMessage(content=prompt)] for prompt in prompts]

        # Process batch with simple retry logic.
        # The lambda closes over messages_batch and is invoked immediately by
        # simple_retry, so late binding is not an issue here.
        try:
            responses = simple_retry(
                lambda: llm.batch(
                    messages_batch,
                    config={"max_concurrency": min(batch_size, config.api.max_concurrent_requests)}
                ),
                max_retries=3,
                base_delay=0.5
            )

            # Extract descriptions from responses; falsy responses get a
            # generic placeholder so every item ends up with a description.
            batch_descriptions = [response.content.strip() if response else f"Documents related to {item_data['item_text']}"
                                  for response, item_data in zip(responses, batch)]
        except Exception as e:
            # Whole-batch failure: degrade gracefully with placeholder text
            # instead of aborting the run.
            logger.warning(f"Batch {batch_num} description generation failed: {e}. Using fallback descriptions.")
            batch_descriptions = [f"Documents related to {item_data['item_text']}" for item_data in batch]

        # Add descriptions to items (shallow copy keeps the caller's dicts untouched)
        for item_data, description in zip(batch, batch_descriptions):
            enhanced_item = item_data['original_item'].copy()
            enhanced_item['description'] = description
            enhanced_checklist[item_data['category_letter']]['items'].append(enhanced_item)

        # No delay between batches - using rate limiting with exponential backoff instead

    return enhanced_checklist
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
def batch_summarize_documents(documents: List[Dict], llm: "ChatAnthropic", batch_size: Optional[int] = None) -> List[Dict]:
    """
    Summarize documents using LangChain's built-in batch processing for true parallelization.
    Optimized with larger batches, higher concurrency, and exponential backoff rate limiting.
    Returns documents with added 'summary' field.

    NOTE: the input document dicts are mutated in place (a 'summary' key is
    written directly on each dict) and the same objects are returned.

    Args:
        documents: List of document dictionaries to summarize
        llm: ChatAnthropic instance for generating summaries
        batch_size: Number of documents to process in each batch (uses config default if None)

    Returns:
        List of documents with added summary field
    """

    config = get_config()
    if batch_size is None:
        batch_size = config.processing.batch_size

    # Process documents in batches
    summarized_docs = []
    total_docs = len(documents)
    # Ceiling division: number of batches needed to cover all documents.
    total_batches = (total_docs + batch_size - 1) // batch_size

    for batch_num, i in enumerate(range(0, total_docs, batch_size), 1):
        batch = documents[i:i + batch_size]
        batch_end = min(i + batch_size, total_docs)

        # Update progress with batch info (only when the UI registered a progress bar)
        if hasattr(st, 'progress') and 'summary_progress' in st.session_state:
            progress = i / total_docs
            st.session_state.summary_progress.progress(
                progress,
                text=f"📝 Processing batch {batch_num}/{total_batches} (docs {i+1}-{batch_end} of {total_docs})"
            )

        # Create prompts for all documents in the batch
        templates = [get_document_summarization_prompt(doc) for doc in batch]
        prompts = [template.format() for template in templates]

        # Convert prompts to HumanMessage format for batch processing
        messages_batch = [[HumanMessage(content=prompt)] for prompt in prompts]

        # Process batch with simple retry logic.
        # The lambda closes over messages_batch and is invoked immediately by
        # simple_retry, so late binding is not an issue here.
        try:
            responses = simple_retry(
                lambda: llm.batch(
                    messages_batch,
                    # Half the batch size (min 1) — summaries are long, so use
                    # lower concurrency than the description pass.
                    config={"max_concurrency": min(batch_size // 2 or 1, config.api.max_concurrent_requests)}
                ),
                max_retries=3,
                base_delay=0.5
            )

            # Extract summaries from responses; falsy responses get a generic
            # "Document: <name>" placeholder so every doc ends up with a summary.
            batch_summaries = [response.content.strip() if response else f"Document: {doc.get('name', 'Unknown')}"
                               for response, doc in zip(responses, batch)]
        except Exception as e:
            # Whole-batch failure: degrade gracefully with placeholder text
            # instead of aborting the run.
            logger.warning(f"Batch {batch_num} processing failed: {e}. Using fallback summaries.")
            batch_summaries = [f"Document: {doc.get('name', 'Unknown')}" for doc in batch]

        # Add summaries to documents (mutates the caller's dicts in place)
        for doc, summary in zip(batch, batch_summaries):
            doc['summary'] = summary
            summarized_docs.append(doc)

        # No delay between batches - using rate limiting with exponential backoff instead

    return summarized_docs
|
| 395 |
+
|
| 396 |
+
|
| 397 |
+
def create_document_embeddings_with_summaries(documents: List[Dict], model_name: Optional[str] = None) -> Dict[str, Any]:
    """
    Prepare document data for LangChain-based similarity matching.

    No embeddings are created here any more -- LangChain generates them later
    from the returned records.

    Args:
        documents: List of documents with summaries.
        model_name: Unused; retained only for backward compatibility with
            callers that still pass an embedding model name. (Annotation fixed
            from ``str = None`` to ``Optional[str] = None``.)

    Returns:
        Dictionary with a 'documents' key holding records formatted for
        LangChain matching. Each record carries name/path/summary plus a
        reference to the original document dict.
    """
    doc_info = [
        {
            'name': doc.get('name', 'Unknown'),
            'path': doc.get('path', ''),
            # Fall back to the relative path when no absolute path was stored.
            'full_path': doc.get('full_path', doc.get('path', '')),
            'summary': doc.get('summary', ''),
            'original_doc': doc,
        }
        for doc in documents
    ]

    return {
        'documents': doc_info
    }
|
| 427 |
+
|
| 428 |
+
|
| 429 |
+
def match_checklist_with_summaries(
    checklist: Dict,
    doc_embeddings_data: Dict,
    model_name: str,
    threshold: Optional[float] = None
) -> Dict:
    """
    Match checklist items against document summaries using LangChain FAISS.
    Enhanced to use LLM-generated descriptions for better semantic matching.

    Args:
        checklist: Checklist dictionary with items and descriptions
        doc_embeddings_data: Dictionary containing document info and embeddings
        model_name: Name of the HuggingFace model for embeddings
        threshold: Similarity threshold for matching (uses config default if None)

    Returns:
        Dictionary with matching results, keyed by category letter; each
        category carries its item results plus matched/total counters.
    """
    config = get_config()
    if threshold is None:
        threshold = config.processing.similarity_threshold

    doc_info = doc_embeddings_data['documents']

    # Create LangChain embeddings instance
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    # Convert document summaries to LangChain Documents.
    # page_content concatenates name, path, and summary so all three
    # contribute to the embedding; the same fields are mirrored in metadata
    # for recovery after retrieval.
    documents = [
        Document(
            page_content=f"{doc['name']}\n{doc['path']}\n{doc['summary']}",
            metadata={
                'name': doc['name'],
                'path': doc['path'],
                'full_path': doc.get('full_path', doc['path']),
                'summary': doc['summary'],
                **doc.get('original_doc', {}).get('metadata', {})
            }
        )
        for doc in doc_info
    ]

    # Create LangChain FAISS vector store (in-memory index over all summaries)
    vector_store = FAISS.from_documents(documents, embeddings)
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"score_threshold": threshold, "k": 5}
    )

    results = {}

    for cat_letter, category in checklist.items():
        cat_name = category.get('name', '')
        cat_results = {
            'name': cat_name,
            'letter': cat_letter,
            'total_items': len(category.get('items', [])),
            'matched_items': 0,
            'items': []
        }

        for item in category.get('items', []):
            item_text = item.get('text', '')
            item_description = item.get('description', '')

            # Create enhanced query using both item text and generated description
            if item_description:
                # Use the LLM-generated description for richer semantic matching
                query = f"{cat_name}: {item_text}\n{item_description}"
            else:
                # Fall back to original method if no description available
                query = f"{cat_name}: {item_text}"

            # Use LangChain retriever for similarity search.
            # safe_execute invokes the lambda immediately, so closing over the
            # loop-local `query` is safe; failures yield an empty match list.
            docs = safe_execute(
                lambda: retriever.invoke(query),
                default=[],
                context="Document matching with summaries"
            )

            # Convert LangChain documents to matches format
            matches = []
            for doc in docs[:5]:  # Keep top 5 matches
                match_data = {
                    'name': doc.metadata['name'],
                    'path': doc.metadata['path'],
                    'full_path': doc.metadata.get('full_path', doc.metadata['path']),
                    'summary': doc.metadata['summary'],
                    'score': 0.8,  # LangChain retriever doesn't return raw scores
                    # Pass through any extra metadata from the original document.
                    'metadata': {k: v for k, v in doc.metadata.items()
                                 if k not in ['name', 'path', 'full_path', 'summary']}
                }
                matches.append(match_data)

            item_result = {
                'text': item_text,
                'original': item.get('original', item_text),
                'description': item_description,  # Include the generated description
                'matches': matches
            }

            # Count items with matches toward category total
            if matches:
                cat_results['matched_items'] += 1

            cat_results['items'].append(item_result)

        results[cat_letter] = cat_results

    return results
|
| 540 |
+
|
| 541 |
+
|
| 542 |
+
# =============================================================================
|
| 543 |
+
# LANGGRAPH AGENT FUNCTIONS
|
| 544 |
+
# =============================================================================
|
| 545 |
+
|
| 546 |
+
def get_langgraph_agent(api_key: Optional[str] = None, model: Optional[str] = None) -> Optional[Tuple[Any, "ChatAnthropic"]]:
|
| 547 |
"""
|
| 548 |
Create a LangGraph agent with Anthropic
|
| 549 |
|
|
|
|
| 555 |
Tuple of (compiled_app, llm) or None if not available
|
| 556 |
"""
|
| 557 |
|
|
|
|
|
|
|
|
|
|
| 558 |
# Get configuration
|
| 559 |
config = get_config()
|
| 560 |
|
|
|
|
| 670 |
|
| 671 |
def is_available(self) -> bool:
|
| 672 |
"""Check if the agent is available for use"""
|
| 673 |
+
return self.app is not None and self.llm is not None
|
| 674 |
|
| 675 |
def parse_checklist(self, checklist_text: str) -> Optional[Dict]:
|
| 676 |
"""
|
|
|
|
| 694 |
|
| 695 |
return result.get("checklist")
|
| 696 |
except Exception as e:
|
| 697 |
+
st.error(f"Agent error: {str(e)}")
|
|
|
|
| 698 |
return None
|
| 699 |
|
| 700 |
def match_documents(self, checklist: Dict, documents: List[Dict]) -> Dict:
|
|
|
|
| 727 |
|
| 728 |
return result.get("findings", {})
|
| 729 |
except Exception as e:
|
| 730 |
+
st.error(f"Agent error: {str(e)}")
|
|
|
|
| 731 |
return {}
|
| 732 |
|
| 733 |
def answer_question(self, question: str, documents: List[Dict]) -> str:
|
src/ai/agent_nodes.py
DELETED
|
@@ -1,173 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
LangGraph Agent Nodes Module
|
| 4 |
-
|
| 5 |
-
This module contains all the individual node functions used in the
|
| 6 |
-
LangGraph workflow for the DD-Checklist agent.
|
| 7 |
-
"""
|
| 8 |
-
|
| 9 |
-
import json
|
| 10 |
-
from typing import Dict, List, Optional, Sequence, Any
|
| 11 |
-
from typing_extensions import TypedDict
|
| 12 |
-
from enum import Enum
|
| 13 |
-
|
| 14 |
-
try:
|
| 15 |
-
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
|
| 16 |
-
from langchain_anthropic import ChatAnthropic
|
| 17 |
-
LANGGRAPH_AVAILABLE = True
|
| 18 |
-
except ImportError:
|
| 19 |
-
LANGGRAPH_AVAILABLE = False
|
| 20 |
-
BaseMessage = object
|
| 21 |
-
HumanMessage = object
|
| 22 |
-
AIMessage = object
|
| 23 |
-
ChatAnthropic = object
|
| 24 |
-
|
| 25 |
-
from .prompts import (
|
| 26 |
-
get_checklist_parsing_prompt,
|
| 27 |
-
get_document_relevance_prompt,
|
| 28 |
-
get_question_answering_prompt,
|
| 29 |
-
get_findings_summary_prompt
|
| 30 |
-
)
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
# Define the state for our agent
|
| 34 |
-
class AgentState(TypedDict):
|
| 35 |
-
"""State for the due diligence agent"""
|
| 36 |
-
messages: Sequence[BaseMessage]
|
| 37 |
-
checklist: Optional[Dict]
|
| 38 |
-
documents: Optional[List[Dict]]
|
| 39 |
-
current_task: Optional[str]
|
| 40 |
-
findings: Dict[str, List[str]]
|
| 41 |
-
next_action: Optional[str]
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
class TaskType(Enum):
|
| 45 |
-
"""Types of tasks the agent can perform"""
|
| 46 |
-
PARSE_CHECKLIST = "parse_checklist"
|
| 47 |
-
ANALYZE_DOCUMENT = "analyze_document"
|
| 48 |
-
MATCH_CHECKLIST = "match_checklist"
|
| 49 |
-
ANSWER_QUESTION = "answer_question"
|
| 50 |
-
SUMMARIZE_FINDINGS = "summarize_findings"
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
def route_task(state: AgentState) -> AgentState:
|
| 54 |
-
"""Route to appropriate task based on current state"""
|
| 55 |
-
messages = state["messages"]
|
| 56 |
-
if not messages:
|
| 57 |
-
return state
|
| 58 |
-
|
| 59 |
-
last_message = messages[-1].content if messages else ""
|
| 60 |
-
|
| 61 |
-
# Determine next action based on message content
|
| 62 |
-
if "parse" in last_message.lower() and "checklist" in last_message.lower():
|
| 63 |
-
state["next_action"] = TaskType.PARSE_CHECKLIST.value
|
| 64 |
-
elif "analyze" in last_message.lower() or "match" in last_message.lower():
|
| 65 |
-
state["next_action"] = TaskType.MATCH_CHECKLIST.value
|
| 66 |
-
elif "?" in last_message:
|
| 67 |
-
state["next_action"] = TaskType.ANSWER_QUESTION.value
|
| 68 |
-
else:
|
| 69 |
-
state["next_action"] = TaskType.SUMMARIZE_FINDINGS.value
|
| 70 |
-
|
| 71 |
-
return state
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
def parse_checklist_node(state: AgentState, llm: ChatAnthropic) -> AgentState:
|
| 75 |
-
"""Parse checklist using Claude"""
|
| 76 |
-
messages = state["messages"]
|
| 77 |
-
checklist_text = messages[-1].content if messages else ""
|
| 78 |
-
|
| 79 |
-
prompt = get_checklist_parsing_prompt(checklist_text)
|
| 80 |
-
response = llm.invoke([HumanMessage(content=prompt)])
|
| 81 |
-
|
| 82 |
-
try:
|
| 83 |
-
# Parse JSON from response
|
| 84 |
-
json_str = response.content
|
| 85 |
-
if "```json" in json_str:
|
| 86 |
-
json_str = json_str.split("```json")[1].split("```")[0]
|
| 87 |
-
elif "```" in json_str:
|
| 88 |
-
json_str = json_str.split("```")[1].split("```")[0]
|
| 89 |
-
|
| 90 |
-
parsed = json.loads(json_str.strip())
|
| 91 |
-
state["checklist"] = parsed
|
| 92 |
-
state["messages"].append(AIMessage(content=f"Parsed {len(parsed)} categories"))
|
| 93 |
-
except Exception as e:
|
| 94 |
-
state["messages"].append(AIMessage(content=f"Parsing failed: {str(e)}"))
|
| 95 |
-
|
| 96 |
-
return state
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
def match_checklist_node(state: AgentState, llm: ChatAnthropic) -> AgentState:
|
| 100 |
-
"""Match documents to checklist items"""
|
| 101 |
-
checklist = state.get("checklist", {})
|
| 102 |
-
documents = state.get("documents", [])
|
| 103 |
-
|
| 104 |
-
if not checklist or not documents:
|
| 105 |
-
state["messages"].append(AIMessage(content="Need both checklist and documents to match"))
|
| 106 |
-
return state
|
| 107 |
-
|
| 108 |
-
# For each checklist item, find relevant documents
|
| 109 |
-
findings = {}
|
| 110 |
-
for cat_letter, category in checklist.items():
|
| 111 |
-
cat_findings = []
|
| 112 |
-
for item in category.get("items", []):
|
| 113 |
-
# Use Claude to assess relevance
|
| 114 |
-
document_names = [d.get('name', 'Unknown') for d in documents[:10]]
|
| 115 |
-
prompt = get_document_relevance_prompt(item['text'], document_names)
|
| 116 |
-
|
| 117 |
-
response = llm.invoke([HumanMessage(content=prompt)])
|
| 118 |
-
cat_findings.append({
|
| 119 |
-
"item": item['text'],
|
| 120 |
-
"relevant_docs": response.content
|
| 121 |
-
})
|
| 122 |
-
|
| 123 |
-
findings[category['name']] = cat_findings
|
| 124 |
-
|
| 125 |
-
state["findings"] = findings
|
| 126 |
-
state["messages"].append(AIMessage(content=f"Matched checklist to {len(documents)} documents"))
|
| 127 |
-
|
| 128 |
-
return state
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
def answer_question_node(state: AgentState, llm: ChatAnthropic) -> AgentState:
|
| 132 |
-
"""Answer questions using document context"""
|
| 133 |
-
messages = state["messages"]
|
| 134 |
-
question = messages[-1].content if messages else ""
|
| 135 |
-
documents = state.get("documents", [])
|
| 136 |
-
|
| 137 |
-
# Create context from documents
|
| 138 |
-
context = "\n".join([f"- {d.get('name', 'Unknown')}: {d.get('text', '')[:200]}"
|
| 139 |
-
for d in documents[:5]])
|
| 140 |
-
|
| 141 |
-
prompt = get_question_answering_prompt(question, context)
|
| 142 |
-
response = llm.invoke([HumanMessage(content=prompt)])
|
| 143 |
-
state["messages"].append(AIMessage(content=response.content))
|
| 144 |
-
|
| 145 |
-
return state
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
def summarize_node(state: AgentState, llm: ChatAnthropic) -> AgentState:
|
| 149 |
-
"""Summarize findings"""
|
| 150 |
-
findings = state.get("findings", {})
|
| 151 |
-
|
| 152 |
-
if not findings:
|
| 153 |
-
state["messages"].append(AIMessage(content="No findings to summarize"))
|
| 154 |
-
return state
|
| 155 |
-
|
| 156 |
-
prompt = get_findings_summary_prompt(findings)
|
| 157 |
-
response = llm.invoke([HumanMessage(content=prompt)])
|
| 158 |
-
state["messages"].append(AIMessage(content=response.content))
|
| 159 |
-
|
| 160 |
-
return state
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
def route_condition(state: AgentState) -> str:
|
| 164 |
-
"""Conditional routing function based on next_action"""
|
| 165 |
-
next_action = state.get("next_action")
|
| 166 |
-
if next_action == TaskType.PARSE_CHECKLIST.value:
|
| 167 |
-
return "parse_checklist"
|
| 168 |
-
elif next_action == TaskType.MATCH_CHECKLIST.value:
|
| 169 |
-
return "match_checklist"
|
| 170 |
-
elif next_action == TaskType.ANSWER_QUESTION.value:
|
| 171 |
-
return "answer_question"
|
| 172 |
-
else:
|
| 173 |
-
return "summarize"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/ai/llm_utilities.py
DELETED
|
@@ -1,432 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
LLM Utilities Module
|
| 4 |
-
|
| 5 |
-
This module contains utility functions for batch processing, document
|
| 6 |
-
summarization, embeddings, and checklist matching operations.
|
| 7 |
-
"""
|
| 8 |
-
|
| 9 |
-
import time
|
| 10 |
-
import random
|
| 11 |
-
from typing import Dict, List, Any, Optional
|
| 12 |
-
|
| 13 |
-
try:
|
| 14 |
-
import streamlit as st
|
| 15 |
-
from langchain_anthropic import ChatAnthropic
|
| 16 |
-
from langchain_core.messages import HumanMessage
|
| 17 |
-
import numpy as np
|
| 18 |
-
import faiss
|
| 19 |
-
DEPENDENCIES_AVAILABLE = True
|
| 20 |
-
except ImportError:
|
| 21 |
-
DEPENDENCIES_AVAILABLE = False
|
| 22 |
-
st = None
|
| 23 |
-
ChatAnthropic = object
|
| 24 |
-
HumanMessage = object
|
| 25 |
-
|
| 26 |
-
from ..config import get_config
|
| 27 |
-
from .prompts import get_description_generation_prompt, get_document_summarization_prompt
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
def exponential_backoff_retry(func, max_retries: Optional[int] = None, base_delay: Optional[float] = None):
|
| 31 |
-
"""
|
| 32 |
-
Execute function with exponential backoff retry logic for rate limiting.
|
| 33 |
-
|
| 34 |
-
Args:
|
| 35 |
-
func: Function to execute
|
| 36 |
-
max_retries: Maximum number of retries (uses config default if None)
|
| 37 |
-
base_delay: Base delay in seconds (uses config default if None)
|
| 38 |
-
|
| 39 |
-
Returns:
|
| 40 |
-
Result of the function call
|
| 41 |
-
"""
|
| 42 |
-
config = get_config()
|
| 43 |
-
if max_retries is None:
|
| 44 |
-
max_retries = config.api.max_retries
|
| 45 |
-
if base_delay is None:
|
| 46 |
-
base_delay = config.api.base_delay
|
| 47 |
-
|
| 48 |
-
for attempt in range(max_retries):
|
| 49 |
-
try:
|
| 50 |
-
return func()
|
| 51 |
-
except Exception as e:
|
| 52 |
-
error_str = str(e).lower()
|
| 53 |
-
# Check if it's a rate limiting error
|
| 54 |
-
if any(keyword in error_str for keyword in ['rate', 'limit', 'quota', 'throttl', '429', 'too many']):
|
| 55 |
-
if attempt < max_retries - 1:
|
| 56 |
-
# Calculate exponential backoff with jitter
|
| 57 |
-
delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
|
| 58 |
-
print(f"Rate limit hit, retrying in {delay:.2f}s (attempt {attempt + 1}/{max_retries})")
|
| 59 |
-
time.sleep(delay)
|
| 60 |
-
continue
|
| 61 |
-
else:
|
| 62 |
-
print(f"Rate limit exceeded after {max_retries} attempts")
|
| 63 |
-
raise e
|
| 64 |
-
else:
|
| 65 |
-
# Non-rate limit error, don't retry
|
| 66 |
-
raise e
|
| 67 |
-
return None
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
def generate_checklist_descriptions(checklist: Dict, llm: ChatAnthropic, batch_size: Optional[int] = None) -> Dict:
|
| 71 |
-
"""
|
| 72 |
-
Generate detailed descriptions for each checklist item explaining what documents should satisfy it.
|
| 73 |
-
Returns checklist with added 'description' field for each item.
|
| 74 |
-
|
| 75 |
-
Args:
|
| 76 |
-
checklist: Checklist dictionary to enhance
|
| 77 |
-
llm: ChatAnthropic instance for generating descriptions
|
| 78 |
-
batch_size: Number of items to process in each batch (uses config default if None)
|
| 79 |
-
|
| 80 |
-
Returns:
|
| 81 |
-
Enhanced checklist with descriptions
|
| 82 |
-
"""
|
| 83 |
-
if not DEPENDENCIES_AVAILABLE:
|
| 84 |
-
return checklist
|
| 85 |
-
|
| 86 |
-
config = get_config()
|
| 87 |
-
if batch_size is None:
|
| 88 |
-
batch_size = config.processing.description_batch_size
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
# Process all checklist items
|
| 93 |
-
enhanced_checklist = {}
|
| 94 |
-
all_items_to_process = []
|
| 95 |
-
|
| 96 |
-
# Collect all items with their context
|
| 97 |
-
for cat_letter, category in checklist.items():
|
| 98 |
-
cat_name = category.get('name', '')
|
| 99 |
-
enhanced_checklist[cat_letter] = {
|
| 100 |
-
'name': cat_name,
|
| 101 |
-
'letter': cat_letter,
|
| 102 |
-
'items': []
|
| 103 |
-
}
|
| 104 |
-
|
| 105 |
-
for item in category.get('items', []):
|
| 106 |
-
item_data = {
|
| 107 |
-
'category_letter': cat_letter,
|
| 108 |
-
'category_name': cat_name,
|
| 109 |
-
'item_text': item.get('text', ''),
|
| 110 |
-
'original_item': item,
|
| 111 |
-
'prompt': get_description_generation_prompt(cat_name, item.get('text', ''))
|
| 112 |
-
}
|
| 113 |
-
all_items_to_process.append(item_data)
|
| 114 |
-
|
| 115 |
-
# Process items in batches
|
| 116 |
-
total_items = len(all_items_to_process)
|
| 117 |
-
total_batches = (total_items + batch_size - 1) // batch_size
|
| 118 |
-
|
| 119 |
-
for batch_num, i in enumerate(range(0, total_items, batch_size), 1):
|
| 120 |
-
batch = all_items_to_process[i:i + batch_size]
|
| 121 |
-
batch_end = min(i + batch_size, total_items)
|
| 122 |
-
|
| 123 |
-
# Update progress if available
|
| 124 |
-
if st and hasattr(st, 'progress') and 'description_progress' in st.session_state:
|
| 125 |
-
progress = i / total_items
|
| 126 |
-
st.session_state.description_progress.progress(
|
| 127 |
-
progress,
|
| 128 |
-
text=f"📝 Generating descriptions batch {batch_num}/{total_batches} (items {i+1}-{batch_end} of {total_items})"
|
| 129 |
-
)
|
| 130 |
-
|
| 131 |
-
# Create prompts for batch processing
|
| 132 |
-
prompts = [item_data['prompt'] for item_data in batch]
|
| 133 |
-
messages_batch = [[HumanMessage(content=prompt)] for prompt in prompts]
|
| 134 |
-
|
| 135 |
-
# Use exponential backoff for batch processing
|
| 136 |
-
def process_descriptions_batch():
|
| 137 |
-
# Use higher concurrency for descriptions since they're short
|
| 138 |
-
max_concurrent = min(batch_size * 2, config.api.max_concurrent_requests)
|
| 139 |
-
return llm.batch(
|
| 140 |
-
messages_batch,
|
| 141 |
-
config={"max_concurrency": max_concurrent}
|
| 142 |
-
)
|
| 143 |
-
|
| 144 |
-
try:
|
| 145 |
-
responses = exponential_backoff_retry(
|
| 146 |
-
process_descriptions_batch,
|
| 147 |
-
max_retries=config.api.max_retries,
|
| 148 |
-
base_delay=config.api.batch_base_delay
|
| 149 |
-
)
|
| 150 |
-
|
| 151 |
-
# Extract descriptions from responses
|
| 152 |
-
batch_descriptions = [response.content.strip() if response else f"Documents related to {item_data['item_text']}"
|
| 153 |
-
for response, item_data in zip(responses, batch)]
|
| 154 |
-
except Exception as e:
|
| 155 |
-
# Fallback to sequential processing with individual retries if batch fails
|
| 156 |
-
print(f"Batch {batch_num} description generation failed: {e}. Falling back to sequential with retries.")
|
| 157 |
-
batch_descriptions = []
|
| 158 |
-
for item_data in batch:
|
| 159 |
-
def single_description_process():
|
| 160 |
-
return llm.invoke([HumanMessage(content=item_data['prompt'])])
|
| 161 |
-
|
| 162 |
-
try:
|
| 163 |
-
response = exponential_backoff_retry(
|
| 164 |
-
single_description_process,
|
| 165 |
-
max_retries=config.api.batch_retry_attempts,
|
| 166 |
-
base_delay=config.api.single_retry_base_delay
|
| 167 |
-
)
|
| 168 |
-
batch_descriptions.append(response.content.strip())
|
| 169 |
-
except Exception as inner_e:
|
| 170 |
-
print(f"Failed to generate description for {item_data['item_text']}: {inner_e}")
|
| 171 |
-
batch_descriptions.append(f"Documents related to {item_data['item_text']}")
|
| 172 |
-
|
| 173 |
-
# Add descriptions to items
|
| 174 |
-
for item_data, description in zip(batch, batch_descriptions):
|
| 175 |
-
enhanced_item = item_data['original_item'].copy()
|
| 176 |
-
enhanced_item['description'] = description
|
| 177 |
-
enhanced_checklist[item_data['category_letter']]['items'].append(enhanced_item)
|
| 178 |
-
|
| 179 |
-
# No delay between batches - using rate limiting with exponential backoff instead
|
| 180 |
-
|
| 181 |
-
return enhanced_checklist
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
def batch_summarize_documents(documents: List[Dict], llm: ChatAnthropic, batch_size: Optional[int] = None) -> List[Dict]:
|
| 185 |
-
"""
|
| 186 |
-
Summarize documents using LangChain's built-in batch processing for true parallelization.
|
| 187 |
-
Optimized with larger batches, higher concurrency, and exponential backoff rate limiting.
|
| 188 |
-
Returns documents with added 'summary' field.
|
| 189 |
-
|
| 190 |
-
Args:
|
| 191 |
-
documents: List of document dictionaries to summarize
|
| 192 |
-
llm: ChatAnthropic instance for generating summaries
|
| 193 |
-
batch_size: Number of documents to process in each batch (uses config default if None)
|
| 194 |
-
|
| 195 |
-
Returns:
|
| 196 |
-
List of documents with added summary field
|
| 197 |
-
"""
|
| 198 |
-
if not DEPENDENCIES_AVAILABLE:
|
| 199 |
-
return documents
|
| 200 |
-
|
| 201 |
-
config = get_config()
|
| 202 |
-
if batch_size is None:
|
| 203 |
-
batch_size = config.processing.batch_size
|
| 204 |
-
|
| 205 |
-
# Process documents in batches
|
| 206 |
-
summarized_docs = []
|
| 207 |
-
total_docs = len(documents)
|
| 208 |
-
total_batches = (total_docs + batch_size - 1) // batch_size
|
| 209 |
-
|
| 210 |
-
for batch_num, i in enumerate(range(0, total_docs, batch_size), 1):
|
| 211 |
-
batch = documents[i:i + batch_size]
|
| 212 |
-
batch_end = min(i + batch_size, total_docs)
|
| 213 |
-
|
| 214 |
-
# Update progress with batch info
|
| 215 |
-
if st and hasattr(st, 'progress') and 'summary_progress' in st.session_state:
|
| 216 |
-
progress = i / total_docs
|
| 217 |
-
st.session_state.summary_progress.progress(
|
| 218 |
-
progress,
|
| 219 |
-
text=f"📝 Processing batch {batch_num}/{total_batches} (docs {i+1}-{batch_end} of {total_docs})"
|
| 220 |
-
)
|
| 221 |
-
|
| 222 |
-
# Create prompts for all documents in the batch
|
| 223 |
-
prompts = [get_document_summarization_prompt(doc) for doc in batch]
|
| 224 |
-
|
| 225 |
-
# Convert prompts to HumanMessage format for batch processing
|
| 226 |
-
messages_batch = [[HumanMessage(content=prompt)] for prompt in prompts]
|
| 227 |
-
|
| 228 |
-
# Use exponential backoff for batch processing
|
| 229 |
-
def process_batch():
|
| 230 |
-
max_concurrent = min(batch_size, config.api.max_concurrent_requests)
|
| 231 |
-
return llm.batch(
|
| 232 |
-
messages_batch,
|
| 233 |
-
config={"max_concurrency": max_concurrent}
|
| 234 |
-
)
|
| 235 |
-
|
| 236 |
-
try:
|
| 237 |
-
responses = exponential_backoff_retry(
|
| 238 |
-
process_batch,
|
| 239 |
-
max_retries=config.api.max_retries,
|
| 240 |
-
base_delay=config.api.batch_base_delay
|
| 241 |
-
)
|
| 242 |
-
|
| 243 |
-
# Extract summaries from responses
|
| 244 |
-
batch_summaries = [response.content.strip() if response else f"Document: {doc.get('name', 'Unknown')}"
|
| 245 |
-
for response, doc in zip(responses, batch)]
|
| 246 |
-
except Exception as e:
|
| 247 |
-
# Fallback to sequential processing with individual retries if batch fails
|
| 248 |
-
print(f"Batch {batch_num} processing failed: {e}. Falling back to sequential with retries.")
|
| 249 |
-
batch_summaries = []
|
| 250 |
-
for doc_idx, doc in enumerate(batch):
|
| 251 |
-
prompt = get_document_summarization_prompt(doc)
|
| 252 |
-
|
| 253 |
-
def single_doc_process():
|
| 254 |
-
return llm.invoke([HumanMessage(content=prompt)])
|
| 255 |
-
|
| 256 |
-
try:
|
| 257 |
-
response = exponential_backoff_retry(
|
| 258 |
-
single_doc_process,
|
| 259 |
-
max_retries=config.api.batch_retry_attempts,
|
| 260 |
-
base_delay=config.api.single_retry_base_delay
|
| 261 |
-
)
|
| 262 |
-
batch_summaries.append(response.content.strip())
|
| 263 |
-
except Exception as inner_e:
|
| 264 |
-
print(f"Failed to summarize {doc.get('name', 'Unknown')}: {inner_e}")
|
| 265 |
-
batch_summaries.append(f"Document: {doc.get('name', 'Unknown')}")
|
| 266 |
-
|
| 267 |
-
# Update progress within fallback
|
| 268 |
-
if st and hasattr(st, 'progress') and 'summary_progress' in st.session_state:
|
| 269 |
-
sub_progress = (i + doc_idx + 1) / total_docs
|
| 270 |
-
st.session_state.summary_progress.progress(
|
| 271 |
-
sub_progress,
|
| 272 |
-
text=f"📝 Sequential fallback: {i + doc_idx + 1}/{total_docs}"
|
| 273 |
-
)
|
| 274 |
-
|
| 275 |
-
# Add summaries to documents
|
| 276 |
-
for doc, summary in zip(batch, batch_summaries):
|
| 277 |
-
doc['summary'] = summary
|
| 278 |
-
summarized_docs.append(doc)
|
| 279 |
-
|
| 280 |
-
# No delay between batches - using rate limiting with exponential backoff instead
|
| 281 |
-
|
| 282 |
-
return summarized_docs
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
def create_document_embeddings_with_summaries(documents: List[Dict], model) -> Dict[str, Any]:
|
| 286 |
-
"""
|
| 287 |
-
Create embeddings for documents using their LLM-generated summaries.
|
| 288 |
-
|
| 289 |
-
Args:
|
| 290 |
-
documents: List of documents with summaries
|
| 291 |
-
model: SentenceTransformer model for embeddings
|
| 292 |
-
|
| 293 |
-
Returns:
|
| 294 |
-
Dictionary with document info and embeddings
|
| 295 |
-
"""
|
| 296 |
-
doc_embeddings = []
|
| 297 |
-
doc_info = []
|
| 298 |
-
|
| 299 |
-
for doc in documents:
|
| 300 |
-
# Combine filename, path context, and LLM summary for rich embedding
|
| 301 |
-
doc_name = doc.get('name', 'Unknown')
|
| 302 |
-
doc_path = doc.get('path', '')
|
| 303 |
-
summary = doc.get('summary', '')
|
| 304 |
-
|
| 305 |
-
# Create rich text representation
|
| 306 |
-
embedding_text = f"{doc_name}\n{doc_path}\n{summary}"
|
| 307 |
-
|
| 308 |
-
# Generate embedding
|
| 309 |
-
embedding = model.encode(embedding_text)
|
| 310 |
-
|
| 311 |
-
doc_embeddings.append(embedding)
|
| 312 |
-
doc_info.append({
|
| 313 |
-
'name': doc_name,
|
| 314 |
-
'path': doc_path,
|
| 315 |
-
'full_path': doc.get('full_path', doc_path),
|
| 316 |
-
'summary': summary,
|
| 317 |
-
'embedding_text': embedding_text,
|
| 318 |
-
'original_doc': doc
|
| 319 |
-
})
|
| 320 |
-
|
| 321 |
-
return {
|
| 322 |
-
'embeddings': doc_embeddings,
|
| 323 |
-
'documents': doc_info
|
| 324 |
-
}
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
def match_checklist_with_summaries(
|
| 328 |
-
checklist: Dict,
|
| 329 |
-
doc_embeddings_data: Dict,
|
| 330 |
-
model,
|
| 331 |
-
threshold: Optional[float] = None
|
| 332 |
-
) -> Dict:
|
| 333 |
-
"""
|
| 334 |
-
Match checklist items against document summaries using FAISS for 10x faster similarity search.
|
| 335 |
-
Enhanced to use LLM-generated descriptions for better semantic matching.
|
| 336 |
-
|
| 337 |
-
Args:
|
| 338 |
-
checklist: Checklist dictionary with items and descriptions
|
| 339 |
-
doc_embeddings_data: Dictionary containing document embeddings and info
|
| 340 |
-
model: SentenceTransformer model for embeddings
|
| 341 |
-
threshold: Similarity threshold for matching (uses config default if None)
|
| 342 |
-
|
| 343 |
-
Returns:
|
| 344 |
-
Dictionary with matching results
|
| 345 |
-
"""
|
| 346 |
-
if not DEPENDENCIES_AVAILABLE:
|
| 347 |
-
return {}
|
| 348 |
-
|
| 349 |
-
config = get_config()
|
| 350 |
-
if threshold is None:
|
| 351 |
-
threshold = config.processing.similarity_threshold
|
| 352 |
-
|
| 353 |
-
doc_embeddings = np.array(doc_embeddings_data['embeddings'], dtype='float32')
|
| 354 |
-
doc_info = doc_embeddings_data['documents']
|
| 355 |
-
|
| 356 |
-
# Build FAISS index for fast similarity search
|
| 357 |
-
faiss.normalize_L2(doc_embeddings) # Normalize for cosine similarity
|
| 358 |
-
dimension = doc_embeddings.shape[1]
|
| 359 |
-
faiss_index = faiss.IndexFlatIP(dimension)
|
| 360 |
-
faiss_index.add(doc_embeddings)
|
| 361 |
-
|
| 362 |
-
results = {}
|
| 363 |
-
|
| 364 |
-
for cat_letter, category in checklist.items():
|
| 365 |
-
cat_name = category.get('name', '')
|
| 366 |
-
cat_results = {
|
| 367 |
-
'name': cat_name,
|
| 368 |
-
'letter': cat_letter,
|
| 369 |
-
'total_items': len(category.get('items', [])),
|
| 370 |
-
'matched_items': 0,
|
| 371 |
-
'items': []
|
| 372 |
-
}
|
| 373 |
-
|
| 374 |
-
for item in category.get('items', []):
|
| 375 |
-
item_text = item.get('text', '')
|
| 376 |
-
item_description = item.get('description', '')
|
| 377 |
-
|
| 378 |
-
# Create enhanced embedding text using both item text and generated description
|
| 379 |
-
if item_description:
|
| 380 |
-
# Use the LLM-generated description for richer semantic matching
|
| 381 |
-
checklist_embedding_text = f"{cat_name}: {item_text}\n{item_description}"
|
| 382 |
-
else:
|
| 383 |
-
# Fallback to original method if no description available
|
| 384 |
-
checklist_embedding_text = f"{cat_name}: {item_text}"
|
| 385 |
-
|
| 386 |
-
# Create and normalize item embedding
|
| 387 |
-
item_embedding = model.encode(checklist_embedding_text).astype('float32').reshape(1, -1)
|
| 388 |
-
faiss.normalize_L2(item_embedding)
|
| 389 |
-
|
| 390 |
-
# Use FAISS for fast similarity search
|
| 391 |
-
scores, indices = faiss_index.search(item_embedding, len(doc_info))
|
| 392 |
-
|
| 393 |
-
# Find matching documents above threshold
|
| 394 |
-
matches = []
|
| 395 |
-
min_display_threshold = config.processing.min_display_threshold
|
| 396 |
-
|
| 397 |
-
for score, idx in zip(scores[0], indices[0]):
|
| 398 |
-
if idx == -1: # No more results
|
| 399 |
-
break
|
| 400 |
-
if score < min_display_threshold: # Skip very low scoring documents
|
| 401 |
-
break # Scores are sorted, so we can stop here
|
| 402 |
-
|
| 403 |
-
match_data = {
|
| 404 |
-
'name': doc_info[idx]['name'],
|
| 405 |
-
'path': doc_info[idx]['path'],
|
| 406 |
-
'full_path': doc_info[idx].get('full_path', doc_info[idx]['path']),
|
| 407 |
-
'summary': doc_info[idx]['summary'],
|
| 408 |
-
'score': float(score),
|
| 409 |
-
'metadata': doc_info[idx].get('original_doc', {}).get('metadata', {})
|
| 410 |
-
}
|
| 411 |
-
|
| 412 |
-
matches.append(match_data)
|
| 413 |
-
|
| 414 |
-
# Keep top 5 matches for display
|
| 415 |
-
display_matches = matches[:5]
|
| 416 |
-
|
| 417 |
-
item_result = {
|
| 418 |
-
'text': item_text,
|
| 419 |
-
'original': item.get('original', item_text),
|
| 420 |
-
'description': item_description, # Include the generated description
|
| 421 |
-
'matches': display_matches
|
| 422 |
-
}
|
| 423 |
-
|
| 424 |
-
# Count items with ANY matches (both green and yellow) toward category total
|
| 425 |
-
if display_matches:
|
| 426 |
-
cat_results['matched_items'] += 1
|
| 427 |
-
|
| 428 |
-
cat_results['items'].append(item_result)
|
| 429 |
-
|
| 430 |
-
results[cat_letter] = cat_results
|
| 431 |
-
|
| 432 |
-
return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/ai/prompts.py
CHANGED
|
@@ -6,147 +6,92 @@ This module contains all prompt templates used for AI interactions
|
|
| 6 |
in the DD-Checklist application.
|
| 7 |
"""
|
| 8 |
|
|
|
|
| 9 |
from typing import Dict, List
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
-
def get_checklist_parsing_prompt(checklist_text: str) ->
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
Formatted prompt string
|
| 21 |
-
"""
|
| 22 |
-
return f"""Parse this due diligence checklist into a structured JSON format.
|
| 23 |
-
|
| 24 |
-
Extract categories (A., B., C.) and numbered items.
|
| 25 |
-
|
| 26 |
-
Return ONLY valid JSON:
|
| 27 |
-
{{
|
| 28 |
-
"A": {{
|
| 29 |
-
"name": "Category Name",
|
| 30 |
-
"items": [{{"text": "item", "number": 1}}]
|
| 31 |
-
}}
|
| 32 |
-
}}
|
| 33 |
-
|
| 34 |
-
Checklist:
|
| 35 |
-
{checklist_text[:3000]}
|
| 36 |
-
|
| 37 |
-
JSON:"""
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
def get_document_relevance_prompt(item_text: str, documents: List[str]) -> str:
|
| 41 |
-
"""
|
| 42 |
-
Generate prompt for assessing document relevance to checklist items
|
| 43 |
-
|
| 44 |
-
Args:
|
| 45 |
-
item_text: Checklist item text
|
| 46 |
-
documents: List of document names
|
| 47 |
-
|
| 48 |
-
Returns:
|
| 49 |
-
Formatted prompt string
|
| 50 |
-
"""
|
| 51 |
-
return f"""Which of these documents is relevant to: {item_text}
|
| 52 |
-
|
| 53 |
-
Documents: {documents}
|
| 54 |
-
|
| 55 |
-
List the relevant document names only."""
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
def get_question_answering_prompt(question: str, context: str) -> str:
|
| 59 |
-
"""
|
| 60 |
-
Generate prompt for answering questions based on document context
|
| 61 |
-
|
| 62 |
-
Args:
|
| 63 |
-
question: User question
|
| 64 |
-
context: Document context
|
| 65 |
-
|
| 66 |
-
Returns:
|
| 67 |
-
Formatted prompt string
|
| 68 |
-
"""
|
| 69 |
-
return f"""Answer this question based on the documents:
|
| 70 |
-
|
| 71 |
-
Question: {question}
|
| 72 |
-
|
| 73 |
-
Document Context:
|
| 74 |
-
{context}
|
| 75 |
|
| 76 |
-
|
|
|
|
|
|
|
| 77 |
|
|
|
|
| 78 |
|
| 79 |
-
|
| 80 |
-
"""
|
| 81 |
-
Generate prompt for summarizing due diligence findings
|
| 82 |
-
|
| 83 |
-
Args:
|
| 84 |
-
findings: Dictionary of findings to summarize
|
| 85 |
-
max_chars: Maximum characters to include from findings
|
| 86 |
-
|
| 87 |
-
Returns:
|
| 88 |
-
Formatted prompt string
|
| 89 |
-
"""
|
| 90 |
-
import json
|
| 91 |
-
findings_text = json.dumps(findings, indent=2)[:max_chars]
|
| 92 |
-
|
| 93 |
-
return f"""Provide an executive summary of the due diligence findings:
|
| 94 |
|
| 95 |
-
|
|
|
|
| 96 |
|
| 97 |
-
Focus on:
|
| 98 |
-
1. Completeness of documentation
|
| 99 |
-
2. Key gaps or concerns
|
| 100 |
-
3. Overall assessment"""
|
| 101 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
-
def get_description_generation_prompt(category_name: str, item_text: str) -> str:
|
| 104 |
-
"""
|
| 105 |
-
Generate prompt for creating checklist item descriptions
|
| 106 |
-
|
| 107 |
-
Args:
|
| 108 |
-
category_name: Name of the checklist category
|
| 109 |
-
item_text: Text of the checklist item
|
| 110 |
-
|
| 111 |
-
Returns:
|
| 112 |
-
Formatted prompt string
|
| 113 |
-
"""
|
| 114 |
-
return f"""For this due diligence checklist item, provide a concise description (1-2 sentences) explaining what types of documents or information would satisfy this requirement. Focus on the specific document types and key information that would be relevant.
|
| 115 |
-
|
| 116 |
-
Category: {category_name}
|
| 117 |
Checklist Item: {item_text}
|
| 118 |
|
| 119 |
-
|
| 120 |
-
|
| 121 |
|
| 122 |
-
|
| 123 |
-
"""
|
| 124 |
-
Generate prompt for document type identification and summarization
|
| 125 |
-
|
| 126 |
-
Args:
|
| 127 |
-
doc: Dictionary containing document information
|
| 128 |
-
|
| 129 |
-
Returns:
|
| 130 |
-
Formatted prompt string
|
| 131 |
-
"""
|
| 132 |
-
# Extract text preview (first 1000 chars)
|
| 133 |
-
text_preview = doc.get('content', '')[:1000] if doc.get('content') else ''
|
| 134 |
-
doc_name = doc.get('name', 'Unknown')
|
| 135 |
-
doc_path = doc.get('path', '')
|
| 136 |
-
|
| 137 |
-
return f"""Identify and describe what type of document this is in 1-2 sentences.
|
| 138 |
-
Focus specifically on the document type, category, and what kind of information it contains.
|
| 139 |
|
| 140 |
-
|
|
|
|
| 141 |
|
| 142 |
-
Document: {doc_name}
|
| 143 |
-
Path: {doc_path}
|
| 144 |
-
Content preview:
|
| 145 |
-
{text_preview}
|
| 146 |
|
| 147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
in the DD-Checklist application.
|
| 7 |
"""
|
| 8 |
|
| 9 |
+
import json
|
| 10 |
from typing import Dict, List
|
| 11 |
+
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
|
| 12 |
+
from langchain_core.messages import SystemMessage, HumanMessage
|
| 13 |
|
| 14 |
|
| 15 |
+
def get_checklist_parsing_prompt(checklist_text: str) -> ChatPromptTemplate:
|
| 16 |
+
"""Generate prompt for parsing due diligence checklists with structured output"""
|
| 17 |
+
return ChatPromptTemplate.from_messages([
|
| 18 |
+
SystemMessage(content="""
|
| 19 |
+
Parse this due diligence checklist into structured format. Extract:
|
| 20 |
+
- Categories (A., B., C., etc.) with their names
|
| 21 |
+
- Numbered items within each category (1., 2., 3., etc.)
|
| 22 |
+
- Total count of items
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
+
Follow the exact format specified in the format instructions.
|
| 25 |
+
"""),
|
| 26 |
+
HumanMessage(content="""Parse this checklist:
|
| 27 |
|
| 28 |
+
{checklist_text}
|
| 29 |
|
| 30 |
+
{format_instructions}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
+
Please provide the structured output:""")
|
| 33 |
+
])
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
+
def get_document_relevance_prompt(item_text: str, documents: List[str]) -> PromptTemplate:
|
| 37 |
+
"""Generate prompt for assessing document relevance to checklist items with structured output"""
|
| 38 |
+
return PromptTemplate.from_template(
|
| 39 |
+
"""Analyze which documents are relevant to the following checklist item:
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
Checklist Item: {item_text}
|
| 42 |
|
| 43 |
+
Available Documents:
|
| 44 |
+
{documents}
|
| 45 |
|
| 46 |
+
{format_instructions}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
+
Please provide your analysis in the specified format:"""
|
| 49 |
+
)
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
+
def get_question_answering_prompt(question: str, context: str) -> ChatPromptTemplate:
|
| 53 |
+
"""Generate prompt for answering questions based on document context"""
|
| 54 |
+
return ChatPromptTemplate.from_messages([
|
| 55 |
+
SystemMessage(content="Answer questions based on document context. Provide comprehensive answers with citations."),
|
| 56 |
+
HumanMessage(content=f"Question: {question}\n\nDocument Context:\n{context}\n\nAnswer:")
|
| 57 |
+
])
|
| 58 |
|
| 59 |
|
| 60 |
+
def get_findings_summary_prompt(findings: Dict, max_chars: int = 2000) -> PromptTemplate:
|
| 61 |
+
"""Generate prompt for summarizing due diligence findings"""
|
| 62 |
+
findings_text = json.dumps(findings, indent=2)[:max_chars]
|
| 63 |
+
return PromptTemplate.from_template(
|
| 64 |
+
"Provide an executive summary of these due diligence findings:\n\n"
|
| 65 |
+
"{findings_text}\n\n"
|
| 66 |
+
"Focus on:\n"
|
| 67 |
+
"1. Completeness of documentation\n"
|
| 68 |
+
"2. Key gaps or concerns\n"
|
| 69 |
+
"3. Overall assessment"
|
| 70 |
+
).partial(findings_text=findings_text)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def get_description_generation_prompt(category_name: str, item_text: str) -> PromptTemplate:
|
| 74 |
+
"""Generate prompt for creating checklist item descriptions"""
|
| 75 |
+
return PromptTemplate.from_template(
|
| 76 |
+
"For this due diligence checklist item, provide a concise description (1-2 sentences) "
|
| 77 |
+
"explaining what types of documents or information would satisfy this requirement.\n\n"
|
| 78 |
+
"Category: {category_name}\n"
|
| 79 |
+
"Checklist Item: {item_text}\n\n"
|
| 80 |
+
"Description:"
|
| 81 |
+
).partial(category_name=category_name, item_text=item_text)
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def get_document_summarization_prompt(doc: Dict) -> PromptTemplate:
|
| 85 |
+
"""Generate prompt for document type identification and summarization"""
|
| 86 |
+
doc_name = doc.get('name', 'Unknown')
|
| 87 |
+
doc_path = doc.get('path', '')
|
| 88 |
+
text_preview = doc.get('content', '')[:1000] if doc.get('content') else ''
|
| 89 |
+
|
| 90 |
+
return PromptTemplate.from_template(
|
| 91 |
+
"Identify and describe what type of document this is in 1-2 sentences.\n\n"
|
| 92 |
+
"Examples: financial statement, contract agreement, corporate governance document, etc.\n\n"
|
| 93 |
+
"Document: {doc_name}\n"
|
| 94 |
+
"Path: {doc_path}\n"
|
| 95 |
+
"Content preview:\n{text_preview}\n\n"
|
| 96 |
+
"Document type description:"
|
| 97 |
+
).partial(doc_name=doc_name, doc_path=doc_path, text_preview=text_preview)
|
src/config.py
CHANGED
|
@@ -1,463 +1,373 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
Configuration
|
| 4 |
|
| 5 |
-
|
| 6 |
-
Handles environment variables, default settings, and configuration validation.
|
| 7 |
"""
|
| 8 |
|
| 9 |
import os
|
|
|
|
|
|
|
|
|
|
| 10 |
from pathlib import Path
|
| 11 |
-
from typing import
|
| 12 |
-
from
|
| 13 |
-
from
|
|
|
|
| 14 |
|
| 15 |
# Fix tokenizers parallelism warning
|
| 16 |
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
|
| 20 |
-
class ModelConfig:
|
| 21 |
-
"""
|
| 22 |
sentence_transformer_model: str = "all-MiniLM-L6-v2"
|
| 23 |
claude_model: str = "claude-sonnet-4-20250514"
|
| 24 |
temperature: float = 0.3
|
| 25 |
max_tokens: int = 2000
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
@dataclass
|
| 38 |
-
class ProcessingConfig:
|
| 39 |
-
"""Configuration for document processing"""
|
| 40 |
-
chunk_size: int = 400
|
| 41 |
-
chunk_overlap: int = 50
|
| 42 |
-
max_text_length: int = 10000
|
| 43 |
-
batch_size: int = 100
|
| 44 |
-
description_batch_size: int = 100
|
| 45 |
similarity_threshold: float = 0.35
|
| 46 |
relevancy_threshold: float = 0.5
|
| 47 |
primary_threshold: float = 0.6
|
| 48 |
min_display_threshold: float = 0.15
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
supported_file_extensions: List[str] = field(
|
| 53 |
-
default_factory=lambda: ['.pdf', '.docx', '.doc', '.txt', '.md']
|
| 54 |
-
)
|
| 55 |
-
|
| 56 |
-
def __post_init__(self):
|
| 57 |
-
"""Load processing configuration from environment variables"""
|
| 58 |
-
self.chunk_size = int(os.getenv('CHUNK_SIZE', str(self.chunk_size)))
|
| 59 |
-
self.chunk_overlap = int(os.getenv('CHUNK_OVERLAP', str(self.chunk_overlap)))
|
| 60 |
-
self.max_text_length = int(os.getenv('MAX_TEXT_LENGTH', str(self.max_text_length)))
|
| 61 |
-
self.batch_size = int(os.getenv('BATCH_SIZE', str(self.batch_size)))
|
| 62 |
-
self.description_batch_size = int(os.getenv('DESCRIPTION_BATCH_SIZE', str(self.description_batch_size)))
|
| 63 |
-
self.similarity_threshold = float(os.getenv('SIMILARITY_THRESHOLD', str(self.similarity_threshold)))
|
| 64 |
-
self.relevancy_threshold = float(os.getenv('RELEVANCY_THRESHOLD', str(self.relevancy_threshold)))
|
| 65 |
-
self.primary_threshold = float(os.getenv('PRIMARY_THRESHOLD', str(self.primary_threshold)))
|
| 66 |
-
self.min_display_threshold = float(os.getenv('MIN_DISPLAY_THRESHOLD', str(self.min_display_threshold)))
|
| 67 |
-
self.max_workers = int(os.getenv('MAX_WORKERS', str(self.max_workers)))
|
| 68 |
-
self.file_timeout = int(os.getenv('FILE_TIMEOUT', str(self.file_timeout)))
|
| 69 |
-
self.skip_descriptions = os.getenv('SKIP_DESCRIPTIONS', 'false').lower() == 'true'
|
| 70 |
-
|
| 71 |
-
# Handle file extensions from environment (comma-separated)
|
| 72 |
-
extensions_env = os.getenv('SUPPORTED_FILE_EXTENSIONS')
|
| 73 |
-
if extensions_env:
|
| 74 |
-
self.supported_file_extensions = [ext.strip() for ext in extensions_env.split(',')]
|
| 75 |
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
"""Configuration for UI settings"""
|
| 80 |
page_title: str = "AI Due Diligence"
|
| 81 |
page_icon: str = "🤖"
|
| 82 |
layout: str = "wide"
|
| 83 |
top_k_search_results: int = 5
|
| 84 |
-
max_question_sources: int = 3
|
| 85 |
-
max_checklist_matches: int = 5
|
| 86 |
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
"""Configuration for file paths"""
|
| 91 |
data_dir: str = "data"
|
| 92 |
checklist_dir: str = "data/checklist"
|
| 93 |
questions_dir: str = "data/questions"
|
| 94 |
strategy_dir: str = "data/strategy"
|
| 95 |
vdrs_dir: str = "data/vdrs"
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
max_concurrent_requests: int = 50
|
| 114 |
-
request_timeout: int = 30
|
| 115 |
-
retry_attempts: int = 3
|
| 116 |
-
base_delay: float = 0.2
|
| 117 |
-
max_retries: int = 2
|
| 118 |
-
batch_retry_attempts: int = 1
|
| 119 |
-
batch_base_delay: float = 0.1
|
| 120 |
-
single_retry_base_delay: float = 0.05
|
| 121 |
-
|
| 122 |
-
def __post_init__(self):
|
| 123 |
-
"""Load API configuration from environment variables"""
|
| 124 |
-
if not self.anthropic_api_key:
|
| 125 |
-
self.anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
|
| 126 |
-
if not self.openai_api_key:
|
| 127 |
-
self.openai_api_key = os.getenv('OPENAI_API_KEY')
|
| 128 |
-
|
| 129 |
-
self.max_concurrent_requests = int(os.getenv('MAX_CONCURRENT_REQUESTS', str(self.max_concurrent_requests)))
|
| 130 |
-
self.request_timeout = int(os.getenv('REQUEST_TIMEOUT', str(self.request_timeout)))
|
| 131 |
-
self.retry_attempts = int(os.getenv('RETRY_ATTEMPTS', str(self.retry_attempts)))
|
| 132 |
-
self.base_delay = float(os.getenv('BASE_DELAY', str(self.base_delay)))
|
| 133 |
-
self.max_retries = int(os.getenv('MAX_RETRIES', str(self.max_retries)))
|
| 134 |
-
self.batch_retry_attempts = int(os.getenv('BATCH_RETRY_ATTEMPTS', str(self.batch_retry_attempts)))
|
| 135 |
-
self.batch_base_delay = float(os.getenv('BATCH_BASE_DELAY', str(self.batch_base_delay)))
|
| 136 |
-
self.single_retry_base_delay = float(os.getenv('SINGLE_RETRY_BASE_DELAY', str(self.single_retry_base_delay)))
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
@dataclass
|
| 140 |
-
class AppConfig:
|
| 141 |
-
"""Main application configuration"""
|
| 142 |
-
model: ModelConfig = field(default_factory=ModelConfig)
|
| 143 |
-
processing: ProcessingConfig = field(default_factory=ProcessingConfig)
|
| 144 |
-
ui: UIConfig = field(default_factory=UIConfig)
|
| 145 |
-
paths: PathConfig = field(default_factory=PathConfig)
|
| 146 |
-
api: APIConfig = field(default_factory=APIConfig)
|
| 147 |
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
log_level: str = "INFO"
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
self.environment = os.getenv('ENVIRONMENT', 'development')
|
| 157 |
-
self.log_level = os.getenv('LOG_LEVEL', 'INFO')
|
| 158 |
|
| 159 |
|
| 160 |
-
class
|
| 161 |
-
"""
|
| 162 |
-
|
| 163 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
|
| 165 |
-
|
| 166 |
-
""
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
# Load environment variables
|
| 173 |
-
load_dotenv()
|
| 174 |
-
|
| 175 |
-
# Initialize configuration
|
| 176 |
-
self.config = AppConfig()
|
| 177 |
-
|
| 178 |
-
# Load from file if provided
|
| 179 |
-
if config_file and Path(config_file).exists():
|
| 180 |
-
self._load_from_file(config_file)
|
| 181 |
-
|
| 182 |
-
# Validate configuration
|
| 183 |
-
self._validate_config()
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
config_file: Path to configuration file
|
| 191 |
-
"""
|
| 192 |
-
import json
|
| 193 |
-
|
| 194 |
-
config_path = Path(config_file)
|
| 195 |
-
|
| 196 |
-
try:
|
| 197 |
-
if config_path.suffix.lower() == '.json':
|
| 198 |
-
with open(config_path, 'r') as f:
|
| 199 |
-
config_data = json.load(f)
|
| 200 |
-
self._update_config_from_dict(config_data)
|
| 201 |
-
elif config_path.suffix.lower() in ['.yml', '.yaml']:
|
| 202 |
-
try:
|
| 203 |
-
import yaml
|
| 204 |
-
with open(config_path, 'r') as f:
|
| 205 |
-
config_data = yaml.safe_load(f)
|
| 206 |
-
self._update_config_from_dict(config_data)
|
| 207 |
-
except ImportError:
|
| 208 |
-
print("PyYAML not installed. Cannot load YAML configuration.")
|
| 209 |
-
except Exception as e:
|
| 210 |
-
print(f"Warning: Could not load configuration from {config_file}: {e}")
|
| 211 |
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
| 225 |
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
if not self.config.paths.data_path.exists():
|
| 230 |
-
print(f"Warning: Data directory does not exist: {self.config.paths.data_path}")
|
| 231 |
-
|
| 232 |
-
# Validate model settings
|
| 233 |
-
if self.config.processing.chunk_size <= self.config.processing.chunk_overlap:
|
| 234 |
-
print("Warning: Chunk size should be larger than chunk overlap")
|
| 235 |
-
|
| 236 |
-
# Validate thresholds
|
| 237 |
-
if not 0 <= self.config.processing.similarity_threshold <= 1:
|
| 238 |
-
print("Warning: Similarity threshold should be between 0 and 1")
|
| 239 |
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
|
|
|
|
|
|
| 243 |
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
for key, value in kwargs.items():
|
| 252 |
-
if hasattr(self.config, key):
|
| 253 |
-
setattr(self.config, key, value)
|
| 254 |
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
Args:
|
| 260 |
-
**kwargs: Processing configuration parameters to update
|
| 261 |
-
"""
|
| 262 |
-
for key, value in kwargs.items():
|
| 263 |
-
if hasattr(self.config.processing, key):
|
| 264 |
-
setattr(self.config.processing, key, value)
|
| 265 |
-
else:
|
| 266 |
-
print(f"Warning: Unknown processing config key: {key}")
|
| 267 |
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
setattr(self.config.api, key, value)
|
| 278 |
-
else:
|
| 279 |
-
print(f"Warning: Unknown API config key: {key}")
|
| 280 |
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
|
| 311 |
|
| 312 |
# Global configuration instance
|
| 313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
| 315 |
|
| 316 |
-
def get_config() -> AppConfig:
|
| 317 |
-
"""
|
| 318 |
-
Get the global configuration instance
|
| 319 |
-
|
| 320 |
-
Returns:
|
| 321 |
-
Application configuration
|
| 322 |
-
"""
|
| 323 |
-
global _config_manager
|
| 324 |
-
if _config_manager is None:
|
| 325 |
-
_config_manager = ConfigManager()
|
| 326 |
-
return _config_manager.get_config()
|
| 327 |
|
| 328 |
|
| 329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 330 |
"""
|
| 331 |
-
|
| 332 |
|
| 333 |
Args:
|
| 334 |
-
|
|
|
|
|
|
|
| 335 |
|
| 336 |
Returns:
|
| 337 |
-
|
| 338 |
"""
|
| 339 |
-
|
| 340 |
-
_config_manager = ConfigManager(config_file)
|
| 341 |
-
return _config_manager
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
def update_config(**kwargs) -> None:
|
| 345 |
-
"""
|
| 346 |
-
Update global configuration
|
| 347 |
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
global _config_manager
|
| 352 |
-
if _config_manager is None:
|
| 353 |
-
_config_manager = ConfigManager()
|
| 354 |
-
_config_manager.update_config(**kwargs)
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
# Environment-specific configurations
|
| 358 |
-
DEVELOPMENT_CONFIG = {
|
| 359 |
-
"processing": {
|
| 360 |
-
"batch_size": 50,
|
| 361 |
-
"similarity_threshold": 0.3
|
| 362 |
-
},
|
| 363 |
-
"ui": {
|
| 364 |
-
"layout": "wide"
|
| 365 |
-
}
|
| 366 |
-
}
|
| 367 |
-
|
| 368 |
-
PRODUCTION_CONFIG = {
|
| 369 |
-
"processing": {
|
| 370 |
-
"batch_size": 100,
|
| 371 |
-
"similarity_threshold": 0.35
|
| 372 |
-
},
|
| 373 |
-
"api": {
|
| 374 |
-
"max_concurrent_requests": 20,
|
| 375 |
-
"request_timeout": 60
|
| 376 |
-
}
|
| 377 |
-
}
|
| 378 |
-
|
| 379 |
-
STREAMLIT_CLOUD_CONFIG = {
|
| 380 |
-
"processing": {
|
| 381 |
-
"batch_size": 100, # Optimized for performance
|
| 382 |
-
"description_batch_size": 100, # Match summary batch size
|
| 383 |
-
"max_text_length": 8000, # Higher limit for better quality
|
| 384 |
-
"max_workers": 2, # Moderate parallelism for cloud
|
| 385 |
-
"file_timeout": 30 # Standard timeout
|
| 386 |
-
},
|
| 387 |
-
"api": {
|
| 388 |
-
"max_concurrent_requests": 30, # Good concurrency for cloud
|
| 389 |
-
"base_delay": 0.1, # Fast delays
|
| 390 |
-
"batch_base_delay": 0.05, # Very fast batches
|
| 391 |
-
"request_timeout": 30
|
| 392 |
-
}
|
| 393 |
-
}
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
def get_environment_config() -> Dict[str, Any]:
|
| 397 |
-
"""
|
| 398 |
-
Get environment-specific configuration
|
| 399 |
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
#
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
|
| 418 |
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
return get_config().processing
|
| 422 |
|
| 423 |
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
|
| 429 |
-
def get_path_config() -> PathConfig:
|
| 430 |
-
"""Get path configuration"""
|
| 431 |
-
return get_config().paths
|
| 432 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
|
| 434 |
-
def get_api_config() -> APIConfig:
|
| 435 |
-
"""Get API configuration"""
|
| 436 |
-
return get_config().api
|
| 437 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
|
| 439 |
-
def is_ai_enabled() -> bool:
|
| 440 |
-
"""Check if AI features are enabled (API key available)"""
|
| 441 |
-
api_config = get_api_config()
|
| 442 |
-
return api_config.anthropic_api_key is not None
|
| 443 |
|
|
|
|
|
|
|
|
|
|
| 444 |
|
| 445 |
-
def
|
| 446 |
-
"""Get
|
| 447 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 448 |
|
| 449 |
|
| 450 |
-
def
|
| 451 |
-
"""
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
|
|
|
| 456 |
|
| 457 |
|
| 458 |
-
def
|
| 459 |
-
"""
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
|
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
Configuration Module
|
| 4 |
|
| 5 |
+
Uses pydantic-settings for robust configuration management from environment variables.
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
import os
|
| 9 |
+
import sys
|
| 10 |
+
import logging
|
| 11 |
+
from datetime import datetime
|
| 12 |
from pathlib import Path
|
| 13 |
+
from typing import List, Optional
|
| 14 |
+
from logging.handlers import RotatingFileHandler
|
| 15 |
+
from pydantic import BaseModel, Field
|
| 16 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 17 |
|
| 18 |
# Fix tokenizers parallelism warning
|
| 19 |
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
| 20 |
|
| 21 |
+
# Streamlit import for utilities (conditional)
|
| 22 |
+
try:
|
| 23 |
+
import streamlit as st
|
| 24 |
+
STREAMLIT_AVAILABLE = True
|
| 25 |
+
except ImportError:
|
| 26 |
+
STREAMLIT_AVAILABLE = False
|
| 27 |
+
st = None
|
| 28 |
|
| 29 |
+
|
| 30 |
+
class ModelConfig(BaseModel):
|
| 31 |
+
"""Model configuration settings"""
|
| 32 |
sentence_transformer_model: str = "all-MiniLM-L6-v2"
|
| 33 |
claude_model: str = "claude-sonnet-4-20250514"
|
| 34 |
temperature: float = 0.3
|
| 35 |
max_tokens: int = 2000
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class ProcessingConfig(BaseModel):
|
| 39 |
+
"""Processing configuration settings"""
|
| 40 |
+
batch_size: int = 20
|
| 41 |
+
description_batch_size: int = 25
|
| 42 |
+
max_workers: int = 4
|
| 43 |
+
chunk_size: int = 1000
|
| 44 |
+
chunk_overlap: int = 200
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
similarity_threshold: float = 0.35
|
| 46 |
relevancy_threshold: float = 0.5
|
| 47 |
primary_threshold: float = 0.6
|
| 48 |
min_display_threshold: float = 0.15
|
| 49 |
+
supported_file_extensions: List[str] = ['.pdf', '.docx', '.doc', '.txt', '.md']
|
| 50 |
+
faiss_store_name: str = "default"
|
| 51 |
+
skip_processed_files: bool = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
+
class UIConfig(BaseModel):
|
| 55 |
+
"""UI configuration settings"""
|
|
|
|
| 56 |
page_title: str = "AI Due Diligence"
|
| 57 |
page_icon: str = "🤖"
|
| 58 |
layout: str = "wide"
|
| 59 |
top_k_search_results: int = 5
|
|
|
|
|
|
|
| 60 |
|
| 61 |
|
| 62 |
+
class PathsConfig(BaseModel):
|
| 63 |
+
"""Paths configuration with computed properties"""
|
|
|
|
| 64 |
data_dir: str = "data"
|
| 65 |
checklist_dir: str = "data/checklist"
|
| 66 |
questions_dir: str = "data/questions"
|
| 67 |
strategy_dir: str = "data/strategy"
|
| 68 |
vdrs_dir: str = "data/vdrs"
|
| 69 |
+
faiss_dir: str = "data/enhanced_faiss"
|
| 70 |
+
|
| 71 |
+
@property
|
| 72 |
+
def data_path(self) -> Path:
|
| 73 |
+
return Path(self.data_dir)
|
| 74 |
+
|
| 75 |
+
@property
|
| 76 |
+
def checklist_path(self) -> Path:
|
| 77 |
+
return Path(self.checklist_dir)
|
| 78 |
+
|
| 79 |
+
@property
|
| 80 |
+
def questions_path(self) -> Path:
|
| 81 |
+
return Path(self.questions_dir)
|
| 82 |
+
|
| 83 |
+
@property
|
| 84 |
+
def strategy_path(self) -> Path:
|
| 85 |
+
return Path(self.strategy_dir)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
+
@property
|
| 88 |
+
def vdrs_path(self) -> Path:
|
| 89 |
+
return Path(self.vdrs_dir)
|
|
|
|
| 90 |
|
| 91 |
+
@property
|
| 92 |
+
def faiss_path(self) -> Path:
|
| 93 |
+
return Path(self.faiss_dir)
|
|
|
|
|
|
|
| 94 |
|
| 95 |
|
| 96 |
+
class APIConfig(BaseModel):
|
| 97 |
+
"""API configuration settings"""
|
| 98 |
+
anthropic_api_key: Optional[str] = None
|
| 99 |
+
max_concurrent_requests: int = 10
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
class Config(BaseSettings):
|
| 103 |
+
"""Main application configuration using pydantic-settings"""
|
| 104 |
|
| 105 |
+
model_config = SettingsConfigDict(
|
| 106 |
+
env_file=".env",
|
| 107 |
+
env_file_encoding="utf-8",
|
| 108 |
+
env_nested_delimiter="__",
|
| 109 |
+
case_sensitive=False,
|
| 110 |
+
extra="ignore" # Allow extra environment variables to be ignored
|
| 111 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
+
# Model settings
|
| 114 |
+
sentence_transformer_model: str = Field(default="all-MiniLM-L6-v2", env="SENTENCE_TRANSFORMER_MODEL")
|
| 115 |
+
claude_model: str = Field(default="claude-sonnet-4-20250514", env="CLAUDE_MODEL")
|
| 116 |
+
temperature: float = Field(default=0.3, env="CLAUDE_TEMPERATURE")
|
| 117 |
+
max_tokens: int = Field(default=2000, env="CLAUDE_MAX_TOKENS")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
+
# Processing settings (optimized for large datasets)
|
| 120 |
+
batch_size: int = Field(default=20, env="BATCH_SIZE")
|
| 121 |
+
description_batch_size: int = Field(default=25, env="DESCRIPTION_BATCH_SIZE")
|
| 122 |
+
max_workers: int = Field(default=4, env="MAX_WORKERS")
|
| 123 |
+
chunk_size: int = Field(default=1000, env="CHUNK_SIZE")
|
| 124 |
+
chunk_overlap: int = Field(default=200, env="CHUNK_OVERLAP")
|
| 125 |
+
similarity_threshold: float = Field(default=0.35, env="SIMILARITY_THRESHOLD")
|
| 126 |
+
relevancy_threshold: float = Field(default=0.5, env="RELEVANCY_THRESHOLD")
|
| 127 |
+
primary_threshold: float = Field(default=0.6, env="PRIMARY_THRESHOLD")
|
| 128 |
+
min_display_threshold: float = Field(default=0.15, env="MIN_DISPLAY_THRESHOLD")
|
| 129 |
+
supported_file_extensions: List[str] = Field(
|
| 130 |
+
default=['.pdf', '.docx', '.doc', '.txt', '.md'],
|
| 131 |
+
env="SUPPORTED_FILE_EXTENSIONS"
|
| 132 |
+
)
|
| 133 |
+
faiss_store_name: str = Field(default="default", env="FAISS_STORE_NAME")
|
| 134 |
+
skip_processed_files: bool = Field(default=True, env="SKIP_PROCESSED_FILES")
|
| 135 |
|
| 136 |
+
# Logging settings
|
| 137 |
+
log_level: str = Field(default="INFO", env="LOG_LEVEL")
|
| 138 |
+
suppress_langchain_warnings: bool = Field(default=True, env="SUPPRESS_LANGCHAIN_WARNINGS")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
|
| 140 |
+
# UI settings
|
| 141 |
+
page_title: str = Field(default="AI Due Diligence", env="PAGE_TITLE")
|
| 142 |
+
page_icon: str = Field(default="🤖", env="PAGE_ICON")
|
| 143 |
+
layout: str = Field(default="wide", env="LAYOUT")
|
| 144 |
+
top_k_search_results: int = Field(default=5, env="TOP_K_SEARCH_RESULTS")
|
| 145 |
|
| 146 |
+
# Path settings
|
| 147 |
+
data_dir: str = Field(default="data", env="DATA_DIR")
|
| 148 |
+
checklist_dir: str = Field(default="data/checklist", env="CHECKLIST_DIR")
|
| 149 |
+
questions_dir: str = Field(default="data/questions", env="QUESTIONS_DIR")
|
| 150 |
+
strategy_dir: str = Field(default="data/strategy", env="STRATEGY_DIR")
|
| 151 |
+
vdrs_dir: str = Field(default="data/vdrs", env="VDRS_DIR")
|
| 152 |
+
faiss_dir: str = Field(default="data/enhanced_faiss", env="FAISS_DIR")
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
+
# API settings
|
| 155 |
+
anthropic_api_key: Optional[str] = Field(default=None, env="ANTHROPIC_API_KEY")
|
| 156 |
+
max_concurrent_requests: int = Field(default=10, env="MAX_CONCURRENT_REQUESTS")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
+
@property
|
| 159 |
+
def model(self) -> ModelConfig:
|
| 160 |
+
"""Get model configuration"""
|
| 161 |
+
return ModelConfig(
|
| 162 |
+
sentence_transformer_model=self.sentence_transformer_model,
|
| 163 |
+
claude_model=self.claude_model,
|
| 164 |
+
temperature=self.temperature,
|
| 165 |
+
max_tokens=self.max_tokens
|
| 166 |
+
)
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
+
@property
|
| 169 |
+
def processing(self) -> ProcessingConfig:
|
| 170 |
+
"""Get processing configuration"""
|
| 171 |
+
return ProcessingConfig(
|
| 172 |
+
batch_size=self.batch_size,
|
| 173 |
+
description_batch_size=self.description_batch_size,
|
| 174 |
+
max_workers=self.max_workers,
|
| 175 |
+
chunk_size=self.chunk_size,
|
| 176 |
+
chunk_overlap=self.chunk_overlap,
|
| 177 |
+
similarity_threshold=self.similarity_threshold,
|
| 178 |
+
relevancy_threshold=self.relevancy_threshold,
|
| 179 |
+
primary_threshold=self.primary_threshold,
|
| 180 |
+
min_display_threshold=self.min_display_threshold,
|
| 181 |
+
supported_file_extensions=self.supported_file_extensions,
|
| 182 |
+
faiss_store_name=self.faiss_store_name,
|
| 183 |
+
skip_processed_files=self.skip_processed_files
|
| 184 |
+
)
|
| 185 |
|
| 186 |
+
@property
|
| 187 |
+
def ui(self) -> UIConfig:
|
| 188 |
+
"""Get UI configuration"""
|
| 189 |
+
return UIConfig(
|
| 190 |
+
page_title=self.page_title,
|
| 191 |
+
page_icon=self.page_icon,
|
| 192 |
+
layout=self.layout,
|
| 193 |
+
top_k_search_results=self.top_k_search_results
|
| 194 |
+
)
|
| 195 |
+
|
| 196 |
+
@property
|
| 197 |
+
def paths(self) -> PathsConfig:
|
| 198 |
+
"""Get paths configuration"""
|
| 199 |
+
return PathsConfig(
|
| 200 |
+
data_dir=self.data_dir,
|
| 201 |
+
checklist_dir=self.checklist_dir,
|
| 202 |
+
questions_dir=self.questions_dir,
|
| 203 |
+
strategy_dir=self.strategy_dir,
|
| 204 |
+
vdrs_dir=self.vdrs_dir,
|
| 205 |
+
faiss_dir=self.faiss_dir
|
| 206 |
+
)
|
| 207 |
+
|
| 208 |
+
@property
|
| 209 |
+
def api(self) -> APIConfig:
|
| 210 |
+
"""Get API configuration"""
|
| 211 |
+
return APIConfig(
|
| 212 |
+
anthropic_api_key=self.anthropic_api_key,
|
| 213 |
+
max_concurrent_requests=self.max_concurrent_requests
|
| 214 |
+
)
|
| 215 |
|
| 216 |
|
| 217 |
# Global configuration instance
|
| 218 |
+
_config: Optional[Config] = None
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def get_config() -> Config:
|
| 222 |
+
"""Get the global configuration instance"""
|
| 223 |
+
global _config
|
| 224 |
+
if _config is None:
|
| 225 |
+
_config = Config()
|
| 226 |
+
return _config
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def init_config(config_file: Optional[str] = None) -> Config:
|
| 230 |
+
"""Initialize global configuration"""
|
| 231 |
+
global _config
|
| 232 |
+
_config = Config()
|
| 233 |
+
return _config
|
| 234 |
|
| 235 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
|
| 238 |
+
|
| 239 |
+
# =============================================================================
|
| 240 |
+
# LOGGING UTILITIES - Merged from utils.py
|
| 241 |
+
# =============================================================================
|
| 242 |
+
|
| 243 |
+
def setup_logging(
|
| 244 |
+
name: str = "dd_checklist",
|
| 245 |
+
log_level: Optional[str] = None,
|
| 246 |
+
log_file: Optional[str] = None
|
| 247 |
+
) -> logging.Logger:
|
| 248 |
"""
|
| 249 |
+
Set up standard Python logging with rotating file handler
|
| 250 |
|
| 251 |
Args:
|
| 252 |
+
name: Logger name
|
| 253 |
+
log_level: Logging level
|
| 254 |
+
log_file: Optional log file path
|
| 255 |
|
| 256 |
Returns:
|
| 257 |
+
Configured logger instance
|
| 258 |
"""
|
| 259 |
+
logger = logging.getLogger(name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
|
| 261 |
+
# Avoid duplicate setup if logger already has handlers
|
| 262 |
+
if logger.handlers:
|
| 263 |
+
return logger
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
+
# Use configured log level if not provided
|
| 266 |
+
if log_level is None:
|
| 267 |
+
try:
|
| 268 |
+
config = get_config()
|
| 269 |
+
log_level = config.log_level
|
| 270 |
+
except Exception:
|
| 271 |
+
log_level = "INFO" # fallback
|
| 272 |
+
|
| 273 |
+
logger.setLevel(getattr(logging, log_level.upper()))
|
| 274 |
|
| 275 |
+
# Console handler
|
| 276 |
+
console_handler = logging.StreamHandler(sys.stdout)
|
| 277 |
+
console_formatter = logging.Formatter(
|
| 278 |
+
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
| 279 |
+
)
|
| 280 |
+
console_handler.setFormatter(console_formatter)
|
| 281 |
+
logger.addHandler(console_handler)
|
| 282 |
+
|
| 283 |
+
# Rotating file handler (if possible)
|
| 284 |
+
if log_file or True: # Always try to set up file logging
|
| 285 |
+
try:
|
| 286 |
+
log_dir = Path(".logs")
|
| 287 |
+
log_dir.mkdir(exist_ok=True)
|
| 288 |
+
|
| 289 |
+
if not log_file:
|
| 290 |
+
log_file = log_dir / f"dd_checklist_{datetime.now().strftime('%Y%m%d')}.log"
|
| 291 |
+
|
| 292 |
+
# Use RotatingFileHandler for better log management
|
| 293 |
+
file_handler = RotatingFileHandler(
|
| 294 |
+
log_file,
|
| 295 |
+
maxBytes=10 * 1024 * 1024, # 10MB
|
| 296 |
+
backupCount=5
|
| 297 |
+
)
|
| 298 |
+
file_formatter = logging.Formatter(
|
| 299 |
+
'%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s'
|
| 300 |
+
)
|
| 301 |
+
file_handler.setFormatter(file_formatter)
|
| 302 |
+
logger.addHandler(file_handler)
|
| 303 |
+
except Exception:
|
| 304 |
+
# File logging not available (e.g., on Streamlit Cloud)
|
| 305 |
+
pass
|
| 306 |
+
|
| 307 |
+
return logger
|
| 308 |
|
| 309 |
|
| 310 |
+
# Global logger instance
|
| 311 |
+
logger = setup_logging()
|
|
|
|
| 312 |
|
| 313 |
|
| 314 |
+
# =============================================================================
|
| 315 |
+
# STREAMLIT UTILITIES - Merged from utils.py
|
| 316 |
+
# =============================================================================
|
| 317 |
|
| 318 |
+
def show_success(message: str):
|
| 319 |
+
"""Show success message in Streamlit"""
|
| 320 |
+
if STREAMLIT_AVAILABLE and st:
|
| 321 |
+
st.success(message)
|
| 322 |
+
logger.info(message)
|
| 323 |
|
|
|
|
|
|
|
|
|
|
| 324 |
|
| 325 |
+
def show_info(message: str):
|
| 326 |
+
"""Show info message in Streamlit"""
|
| 327 |
+
if STREAMLIT_AVAILABLE and st:
|
| 328 |
+
st.info(message)
|
| 329 |
+
logger.info(message)
|
| 330 |
|
|
|
|
|
|
|
|
|
|
| 331 |
|
| 332 |
+
def show_error(message: str):
|
| 333 |
+
"""Show error message in Streamlit"""
|
| 334 |
+
if STREAMLIT_AVAILABLE and st:
|
| 335 |
+
st.error(message)
|
| 336 |
+
logger.error(message)
|
| 337 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
|
| 339 |
+
# =============================================================================
|
| 340 |
+
# FILE UTILITIES - Common patterns extracted for reuse
|
| 341 |
+
# =============================================================================
|
| 342 |
|
| 343 |
+
def get_mime_type(file_path: Path) -> str:
|
| 344 |
+
"""Get MIME type based on file extension"""
|
| 345 |
+
file_extension = file_path.suffix.lower()
|
| 346 |
+
if file_extension == '.pdf':
|
| 347 |
+
return 'application/pdf'
|
| 348 |
+
elif file_extension in ['.doc', '.docx']:
|
| 349 |
+
return 'application/msword'
|
| 350 |
+
elif file_extension == '.txt':
|
| 351 |
+
return 'text/plain'
|
| 352 |
+
elif file_extension == '.md':
|
| 353 |
+
return 'text/markdown'
|
| 354 |
+
else:
|
| 355 |
+
return 'application/octet-stream'
|
| 356 |
|
| 357 |
|
| 358 |
+
def format_document_title(doc_name: str) -> str:
|
| 359 |
+
"""Format document name into a readable title"""
|
| 360 |
+
if '.' in doc_name:
|
| 361 |
+
doc_title = doc_name.rsplit('.', 1)[0].replace('_', ' ').replace('-', ' ').title()
|
| 362 |
+
else:
|
| 363 |
+
doc_title = doc_name.replace('_', ' ').replace('-', ' ').title()
|
| 364 |
+
return doc_title
|
| 365 |
|
| 366 |
|
| 367 |
+
def count_documents_in_directory(directory: Path, supported_extensions: Optional[List[str]] = None) -> int:
|
| 368 |
+
"""Count supported documents in a directory recursively"""
|
| 369 |
+
if supported_extensions is None:
|
| 370 |
+
supported_extensions = ['.pdf', '.docx', '.doc', '.txt', '.md']
|
| 371 |
+
|
| 372 |
+
return sum(1 for f in directory.rglob('*')
|
| 373 |
+
if f.is_file() and f.suffix.lower() in supported_extensions)
|
src/document_processing.py
CHANGED
|
@@ -1,52 +1,78 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
Document Processing Module
|
| 4 |
|
| 5 |
-
This module
|
| 6 |
-
-
|
| 7 |
-
-
|
| 8 |
-
- Semantic text chunking
|
| 9 |
-
-
|
| 10 |
"""
|
| 11 |
|
| 12 |
import os
|
|
|
|
| 13 |
# Fix tokenizers parallelism warning
|
| 14 |
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
| 15 |
|
| 16 |
-
import fitz # PyMuPDF
|
| 17 |
-
import docx
|
| 18 |
-
import io
|
| 19 |
-
import re
|
| 20 |
-
from pathlib import Path
|
| 21 |
-
from typing import Dict, List, Tuple, Optional
|
| 22 |
-
import streamlit as st
|
| 23 |
-
import numpy as np
|
| 24 |
-
from sentence_transformers import SentenceTransformer
|
| 25 |
-
import concurrent.futures
|
| 26 |
-
import threading
|
| 27 |
import logging
|
| 28 |
-
from functools import wraps
|
| 29 |
-
import joblib
|
| 30 |
-
import hashlib
|
| 31 |
-
import time
|
| 32 |
-
import faiss
|
| 33 |
|
| 34 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 36 |
|
| 37 |
# Import configuration
|
| 38 |
from .config import get_config
|
| 39 |
|
| 40 |
-
#
|
|
|
|
|
|
|
| 41 |
logger = logging.getLogger(__name__)
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
def escape_markdown_math(text: str) -> str:
|
|
@@ -63,755 +89,331 @@ def escape_markdown_math(text: str) -> str:
|
|
| 63 |
return text
|
| 64 |
|
| 65 |
|
| 66 |
-
|
| 67 |
-
"""
|
| 68 |
-
Extract text from file with metadata
|
| 69 |
-
|
| 70 |
-
Args:
|
| 71 |
-
file_path: Path to the file to extract text from
|
| 72 |
-
|
| 73 |
-
Returns:
|
| 74 |
-
Tuple of (text_content, metadata)
|
| 75 |
-
"""
|
| 76 |
-
metadata = {'pages': [], 'type': 'unknown'}
|
| 77 |
-
text_content = ""
|
| 78 |
-
|
| 79 |
-
try:
|
| 80 |
-
if file_path.suffix.lower() == '.pdf':
|
| 81 |
-
# Use PyMuPDF (fitz) for faster and more robust PDF processing
|
| 82 |
-
try:
|
| 83 |
-
pdf_document = fitz.open(str(file_path))
|
| 84 |
-
texts = []
|
| 85 |
-
|
| 86 |
-
for page_num in range(pdf_document.page_count):
|
| 87 |
-
try:
|
| 88 |
-
page = pdf_document[page_num]
|
| 89 |
-
page_text = page.get_text()
|
| 90 |
-
|
| 91 |
-
if page_text.strip(): # Only add non-empty pages
|
| 92 |
-
texts.append(page_text)
|
| 93 |
-
metadata['pages'].append(page_num + 1) # 1-based page numbering
|
| 94 |
-
except Exception as page_error:
|
| 95 |
-
# Handle individual page errors gracefully
|
| 96 |
-
logger.warning(f"Error reading page {page_num + 1} of {file_path.name}: {page_error}")
|
| 97 |
-
if st and hasattr(st, 'session_state'):
|
| 98 |
-
# Only use streamlit in main thread context
|
| 99 |
-
try:
|
| 100 |
-
st.warning(f"Error reading page {page_num + 1} of {file_path.name}: {page_error}")
|
| 101 |
-
except Exception:
|
| 102 |
-
pass
|
| 103 |
-
continue
|
| 104 |
-
|
| 105 |
-
pdf_document.close()
|
| 106 |
-
text_content = '\n'.join(texts)[:10000]
|
| 107 |
-
metadata['type'] = 'pdf'
|
| 108 |
-
|
| 109 |
-
except Exception as pdf_error:
|
| 110 |
-
# Handle corrupted or unsupported PDF files
|
| 111 |
-
error_msg = f"Error processing PDF {file_path.name}: {pdf_error}"
|
| 112 |
-
logger.error(error_msg)
|
| 113 |
-
if st and hasattr(st, 'session_state'):
|
| 114 |
-
# Only use streamlit in main thread context
|
| 115 |
-
try:
|
| 116 |
-
st.error(error_msg)
|
| 117 |
-
except Exception:
|
| 118 |
-
pass
|
| 119 |
-
# Try to return partial content if available
|
| 120 |
-
if 'pdf_document' in locals():
|
| 121 |
-
try:
|
| 122 |
-
pdf_document.close()
|
| 123 |
-
except:
|
| 124 |
-
pass
|
| 125 |
-
return "", metadata
|
| 126 |
-
|
| 127 |
-
elif file_path.suffix.lower() in ['.docx', '.doc']:
|
| 128 |
-
doc = docx.Document(str(file_path))
|
| 129 |
-
text_content = '\n'.join(p.text for p in doc.paragraphs)[:10000]
|
| 130 |
-
metadata['type'] = 'docx'
|
| 131 |
-
|
| 132 |
-
elif file_path.suffix.lower() in ['.txt', '.md']:
|
| 133 |
-
text_content = file_path.read_text(encoding='utf-8', errors='ignore')[:10000]
|
| 134 |
-
metadata['type'] = 'text'
|
| 135 |
-
|
| 136 |
-
except Exception as e:
|
| 137 |
-
error_msg = f"Could not read {file_path.name}: {e}"
|
| 138 |
-
logger.warning(error_msg)
|
| 139 |
-
if st and hasattr(st, 'session_state'): # Only use streamlit if available and in main thread
|
| 140 |
-
try:
|
| 141 |
-
st.warning(error_msg)
|
| 142 |
-
except Exception:
|
| 143 |
-
pass
|
| 144 |
-
|
| 145 |
-
# Call progress callback if provided (for parallel processing tracking)
|
| 146 |
-
if progress_callback:
|
| 147 |
-
try:
|
| 148 |
-
progress_callback(file_path.name)
|
| 149 |
-
except Exception:
|
| 150 |
-
pass # Don't let callback errors affect processing
|
| 151 |
-
|
| 152 |
-
return text_content, metadata
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
def _process_file_with_context(args):
|
| 156 |
-
"""
|
| 157 |
-
Thread-safe file processing function with proper context management
|
| 158 |
-
|
| 159 |
-
Args:
|
| 160 |
-
args: Tuple of (file_path, base_path, progress_callback)
|
| 161 |
-
|
| 162 |
-
Returns:
|
| 163 |
-
Tuple of (file_path_str, document_info) or None if failed
|
| 164 |
-
"""
|
| 165 |
-
file_path, base_path, progress_callback = args
|
| 166 |
-
|
| 167 |
-
try:
|
| 168 |
-
# Extract text from file
|
| 169 |
-
text, metadata = extract_text_from_file(file_path, progress_callback)
|
| 170 |
-
|
| 171 |
-
if text:
|
| 172 |
-
# Store relative path for display
|
| 173 |
-
rel_path = file_path.relative_to(base_path)
|
| 174 |
-
document_info = {
|
| 175 |
-
'text': text,
|
| 176 |
-
'content': text, # Alias for backward compatibility
|
| 177 |
-
'name': file_path.name,
|
| 178 |
-
'rel_path': str(rel_path),
|
| 179 |
-
'metadata': metadata
|
| 180 |
-
}
|
| 181 |
-
return str(file_path), document_info
|
| 182 |
-
except Exception as e:
|
| 183 |
-
logger.error(f"Error processing file {file_path.name}: {e}")
|
| 184 |
-
|
| 185 |
-
return None
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
def scan_data_room(data_room_path: str, max_workers: Optional[int] = None, progress_callback=None) -> Dict[str, Dict]:
|
| 189 |
-
"""
|
| 190 |
-
Scan entire data room directory for documents using parallel processing
|
| 191 |
-
|
| 192 |
-
Args:
|
| 193 |
-
data_room_path: Path to the data room directory
|
| 194 |
-
max_workers: Maximum number of worker threads (uses config default if None)
|
| 195 |
-
progress_callback: Optional callback function for progress updates
|
| 196 |
-
|
| 197 |
-
Returns:
|
| 198 |
-
Dictionary mapping file paths to document information
|
| 199 |
-
"""
|
| 200 |
-
config = get_config()
|
| 201 |
-
if max_workers is None:
|
| 202 |
-
max_workers = config.processing.max_workers
|
| 203 |
-
|
| 204 |
-
documents = {}
|
| 205 |
-
path = Path(data_room_path)
|
| 206 |
-
|
| 207 |
-
if not path.exists():
|
| 208 |
-
return documents
|
| 209 |
-
|
| 210 |
-
# Collect all document files first
|
| 211 |
-
file_paths = []
|
| 212 |
-
for file_path in path.rglob('*'):
|
| 213 |
-
if file_path.is_file() and not file_path.name.startswith('.'):
|
| 214 |
-
if file_path.suffix.lower() in config.processing.supported_file_extensions:
|
| 215 |
-
file_paths.append(file_path)
|
| 216 |
-
|
| 217 |
-
if not file_paths:
|
| 218 |
-
return documents
|
| 219 |
-
|
| 220 |
-
logger.info(f"Processing {len(file_paths)} files with {max_workers} workers")
|
| 221 |
-
|
| 222 |
-
# Prepare arguments for parallel processing
|
| 223 |
-
process_args = [(file_path, path, progress_callback) for file_path in file_paths]
|
| 224 |
-
|
| 225 |
-
# Process files in parallel
|
| 226 |
-
processed_count = 0
|
| 227 |
-
failed_count = 0
|
| 228 |
-
|
| 229 |
-
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
| 230 |
-
# Submit all tasks
|
| 231 |
-
future_to_file = {}
|
| 232 |
-
|
| 233 |
-
for args in process_args:
|
| 234 |
-
future = executor.submit(_process_file_with_context, args)
|
| 235 |
-
|
| 236 |
-
# Add Streamlit context if available
|
| 237 |
-
if STREAMLIT_CONTEXT_AVAILABLE:
|
| 238 |
-
try:
|
| 239 |
-
script_ctx = get_script_run_ctx()
|
| 240 |
-
if script_ctx:
|
| 241 |
-
add_script_run_ctx(future)
|
| 242 |
-
except Exception as e:
|
| 243 |
-
logger.warning(f"Could not add script context: {e}")
|
| 244 |
-
|
| 245 |
-
future_to_file[future] = args[0] # Store file_path for reference
|
| 246 |
-
|
| 247 |
-
# Collect results as they complete
|
| 248 |
-
for future in concurrent.futures.as_completed(future_to_file):
|
| 249 |
-
try:
|
| 250 |
-
result = future.result(timeout=config.processing.file_timeout)
|
| 251 |
-
if result:
|
| 252 |
-
file_path_str, document_info = result
|
| 253 |
-
documents[file_path_str] = document_info
|
| 254 |
-
processed_count += 1
|
| 255 |
-
else:
|
| 256 |
-
failed_count += 1
|
| 257 |
-
except concurrent.futures.TimeoutError:
|
| 258 |
-
file_path = future_to_file[future]
|
| 259 |
-
logger.error(f"Timeout processing file: {file_path.name}")
|
| 260 |
-
failed_count += 1
|
| 261 |
-
except Exception as e:
|
| 262 |
-
file_path = future_to_file[future]
|
| 263 |
-
logger.error(f"Error processing file {file_path.name}: {e}")
|
| 264 |
-
failed_count += 1
|
| 265 |
-
|
| 266 |
-
logger.info(f"Completed processing: {processed_count} successful, {failed_count} failed")
|
| 267 |
-
return documents
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
def create_chunks_with_metadata(documents: Dict[str, Dict], chunk_size: int = 2000, overlap: int = 200) -> List[Dict]:
|
| 271 |
-
"""
|
| 272 |
-
Create searchable chunks with semantic splitting and full metadata.
|
| 273 |
-
Uses RecursiveCharacterTextSplitter for better context preservation.
|
| 274 |
-
|
| 275 |
-
Args:
|
| 276 |
-
documents: Dictionary of documents
|
| 277 |
-
chunk_size: Size of each chunk in characters (default: 2000 for ~400 words)
|
| 278 |
-
overlap: Overlap between chunks in characters (default: 200 for ~50 words)
|
| 279 |
-
|
| 280 |
-
Returns:
|
| 281 |
-
List of chunk dictionaries with metadata
|
| 282 |
"""
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
# Initialize semantic text splitter with hierarchical separators
|
| 286 |
-
# This preserves document structure by prioritizing paragraph breaks,
|
| 287 |
-
# then sentences, then words
|
| 288 |
-
text_splitter = RecursiveCharacterTextSplitter(
|
| 289 |
-
chunk_size=chunk_size,
|
| 290 |
-
chunk_overlap=overlap,
|
| 291 |
-
separators=["\n\n", "\n", ".", "!", "?", ",", " "],
|
| 292 |
-
length_function=len,
|
| 293 |
-
is_separator_regex=False,
|
| 294 |
-
)
|
| 295 |
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
# Split text using semantic boundaries
|
| 303 |
-
semantic_chunks = text_splitter.split_text(text)
|
| 304 |
-
|
| 305 |
-
# Create chunks with metadata
|
| 306 |
-
for i, chunk_text in enumerate(semantic_chunks):
|
| 307 |
-
if chunk_text.strip():
|
| 308 |
-
chunks.append({
|
| 309 |
-
'text': chunk_text.strip(),
|
| 310 |
-
'source': doc_info['name'],
|
| 311 |
-
'path': doc_info['rel_path'],
|
| 312 |
-
'full_path': doc_path,
|
| 313 |
-
'chunk_id': f"semantic_chunk_{i}",
|
| 314 |
-
'metadata': doc_info['metadata']
|
| 315 |
-
})
|
| 316 |
-
|
| 317 |
-
return chunks
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
def create_embeddings_batch(texts: List[str], model: SentenceTransformer, batch_size: Optional[int] = None) -> np.ndarray:
|
| 321 |
"""
|
| 322 |
-
Create embeddings for texts in batches for better performance
|
| 323 |
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
batch_size: Batch size for processing
|
| 328 |
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
if batch_size is None:
|
| 334 |
config = get_config()
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
Args:
|
| 359 |
-
query: Search query
|
| 360 |
-
chunks: List of document chunks
|
| 361 |
-
faiss_index: FAISS index with embeddings
|
| 362 |
-
model: SentenceTransformer model
|
| 363 |
-
top_k: Number of top results to return
|
| 364 |
-
threshold: Minimum similarity threshold (uses config default if None)
|
| 365 |
-
|
| 366 |
-
Returns:
|
| 367 |
-
List of search results with citations
|
| 368 |
-
"""
|
| 369 |
-
if not chunks or faiss_index is None:
|
| 370 |
-
return []
|
| 371 |
-
|
| 372 |
-
config = get_config()
|
| 373 |
-
if threshold is None:
|
| 374 |
-
threshold = config.processing.similarity_threshold
|
| 375 |
-
|
| 376 |
-
# Encode query and normalize for inner product similarity
|
| 377 |
-
query_embedding = model.encode(query).astype('float32')
|
| 378 |
-
query_embedding = query_embedding.reshape(1, -1)
|
| 379 |
-
|
| 380 |
-
# Normalize for cosine similarity using inner product
|
| 381 |
-
faiss.normalize_L2(query_embedding)
|
| 382 |
-
|
| 383 |
-
# Search using FAISS (much faster than numpy)
|
| 384 |
-
scores, indices = faiss_index.search(query_embedding, min(top_k * 2, len(chunks)))
|
| 385 |
-
|
| 386 |
-
results = []
|
| 387 |
-
seen_texts = set()
|
| 388 |
-
|
| 389 |
-
for score, idx in zip(scores[0], indices[0]):
|
| 390 |
-
if idx == -1 or score < threshold: # -1 indicates no more results
|
| 391 |
-
continue
|
| 392 |
-
|
| 393 |
-
# Avoid duplicates
|
| 394 |
-
text_preview = chunks[idx]['text'][:100]
|
| 395 |
-
if text_preview not in seen_texts:
|
| 396 |
-
seen_texts.add(text_preview)
|
| 397 |
-
|
| 398 |
-
# Format citation based on file type
|
| 399 |
-
metadata = chunks[idx]['metadata']
|
| 400 |
-
if metadata['type'] == 'pdf' and metadata.get('pages'):
|
| 401 |
-
citation = f"page {metadata['pages'][0]}"
|
| 402 |
-
else:
|
| 403 |
-
citation = "document"
|
| 404 |
-
|
| 405 |
-
results.append({
|
| 406 |
-
'text': chunks[idx]['text'],
|
| 407 |
-
'source': chunks[idx]['source'],
|
| 408 |
-
'path': chunks[idx]['path'],
|
| 409 |
-
'full_path': chunks[idx].get('full_path', ''),
|
| 410 |
-
'citation': citation,
|
| 411 |
-
'score': float(score)
|
| 412 |
-
})
|
| 413 |
-
|
| 414 |
-
if len(results) >= top_k:
|
| 415 |
-
break
|
| 416 |
-
|
| 417 |
-
return results
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
def search_documents_with_citations(
|
| 421 |
-
query: str,
|
| 422 |
-
chunks: List[Dict],
|
| 423 |
-
embeddings: np.ndarray,
|
| 424 |
-
model: SentenceTransformer,
|
| 425 |
-
top_k: int = 5,
|
| 426 |
-
threshold: Optional[float] = None
|
| 427 |
-
) -> List[Dict]:
|
| 428 |
-
"""
|
| 429 |
-
Legacy search documents function - kept for backward compatibility
|
| 430 |
-
Creates temporary FAISS index and uses FAISS search for better performance
|
| 431 |
-
|
| 432 |
-
Args:
|
| 433 |
-
query: Search query
|
| 434 |
-
chunks: List of document chunks
|
| 435 |
-
embeddings: Precomputed embeddings for chunks
|
| 436 |
-
model: SentenceTransformer model
|
| 437 |
-
top_k: Number of top results to return
|
| 438 |
-
threshold: Minimum similarity threshold
|
| 439 |
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
"""
|
| 443 |
-
if not chunks:
|
| 444 |
-
return []
|
| 445 |
-
|
| 446 |
-
# Create temporary FAISS index for better performance
|
| 447 |
-
embeddings_f32 = embeddings.astype('float32')
|
| 448 |
-
faiss.normalize_L2(embeddings_f32) # Normalize for cosine similarity
|
| 449 |
-
|
| 450 |
-
index = faiss.IndexFlatIP(embeddings_f32.shape[1])
|
| 451 |
-
index.add(embeddings_f32)
|
| 452 |
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
|
|
|
| 463 |
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
lock = threading.Lock()
|
| 469 |
-
|
| 470 |
-
def progress_callback(filename: str = None):
|
| 471 |
-
with lock:
|
| 472 |
-
processed_count[0] += 1
|
| 473 |
-
progress = processed_count[0] / max(total_files, 1)
|
| 474 |
-
|
| 475 |
-
if streamlit_progress_bar and hasattr(st, 'session_state'):
|
| 476 |
-
try:
|
| 477 |
-
streamlit_progress_bar.progress(
|
| 478 |
-
min(progress, 1.0),
|
| 479 |
-
text=f"Processing {filename or 'documents'}... ({processed_count[0]}/{total_files})"
|
| 480 |
-
)
|
| 481 |
-
except Exception:
|
| 482 |
-
pass # Don't let UI errors affect processing
|
| 483 |
-
|
| 484 |
-
return progress_callback
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
def _generate_cache_key(documents: Dict[str, Dict]) -> str:
|
| 488 |
-
"""
|
| 489 |
-
Generate a cache key based on document paths and modification times
|
| 490 |
-
|
| 491 |
-
Args:
|
| 492 |
-
documents: Dictionary of documents with file paths
|
| 493 |
|
| 494 |
-
Returns:
|
| 495 |
-
Cache key string
|
| 496 |
-
"""
|
| 497 |
-
# Create a hash based on file paths and their modification times
|
| 498 |
-
cache_data = []
|
| 499 |
-
|
| 500 |
-
for file_path, doc_info in documents.items():
|
| 501 |
try:
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 506 |
except Exception as e:
|
| 507 |
-
logger.
|
| 508 |
-
|
| 509 |
-
cache_data.append(f"{file_path}:{time.time()}")
|
| 510 |
-
|
| 511 |
-
# Sort to ensure consistent hashing regardless of document order
|
| 512 |
-
cache_data.sort()
|
| 513 |
-
cache_string = "|".join(cache_data)
|
| 514 |
-
|
| 515 |
-
# Generate MD5 hash for the cache key
|
| 516 |
-
return hashlib.md5(cache_string.encode('utf-8')).hexdigest()
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
def _get_cache_dir() -> Path:
|
| 520 |
-
"""Get or create the cache directory"""
|
| 521 |
-
cache_dir = Path(".cache")
|
| 522 |
-
cache_dir.mkdir(exist_ok=True)
|
| 523 |
-
return cache_dir
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
def _save_embeddings_to_cache(cache_key: str, embeddings: np.ndarray, chunks: List[Dict]) -> bool:
    """
    Persist embeddings and their chunks to the on-disk cache.

    Args:
        cache_key: Key identifying this cache entry
        embeddings: Embeddings array to persist
        chunks: Document chunks associated with the embeddings

    Returns:
        True on success, False if the write failed
    """
    try:
        cache_file = _get_cache_dir() / f"embeddings_{cache_key}.joblib"

        # Bundle everything needed to validate and reuse the entry later
        payload = {
            'embeddings': embeddings,
            'chunks': chunks,
            'timestamp': time.time(),
            'cache_key': cache_key,
        }

        # compress=3 trades a little CPU for much smaller cache files
        joblib.dump(payload, cache_file, compress=3)
        logger.info(f"Saved embeddings to cache: {cache_file}")
        return True
    except Exception as e:
        logger.error(f"Failed to save embeddings to cache: {e}")
        return False
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
def _load_embeddings_from_cache(cache_key: str) -> Tuple[Optional[np.ndarray], Optional[List[Dict]]]:
|
| 559 |
-
"""
|
| 560 |
-
Load embeddings and chunks from cache
|
| 561 |
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
Tuple of (embeddings, chunks) or (None, None) if not found
|
| 567 |
-
"""
|
| 568 |
-
try:
|
| 569 |
-
cache_dir = _get_cache_dir()
|
| 570 |
-
cache_file = cache_dir / f"embeddings_{cache_key}.joblib"
|
| 571 |
-
|
| 572 |
-
if not cache_file.exists():
|
| 573 |
-
return None, None
|
| 574 |
-
|
| 575 |
-
cache_data = joblib.load(cache_file)
|
| 576 |
|
| 577 |
-
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
# Check if cache key matches (additional validation)
|
| 583 |
-
if cache_data['cache_key'] != cache_key:
|
| 584 |
-
logger.warning(f"Cache key mismatch in {cache_file}")
|
| 585 |
-
return None, None
|
| 586 |
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
def _invalidate_old_cache_files(max_age_days: int = 7) -> None:
    """
    Prune cache files older than ``max_age_days`` so the cache directory
    does not grow without bound.

    Args:
        max_age_days: Maximum allowed age of a cache file, in days
    """
    try:
        now = time.time()
        cutoff_seconds = max_age_days * 24 * 60 * 60

        for candidate in _get_cache_dir().glob("embeddings_*.joblib"):
            try:
                if now - candidate.stat().st_mtime > cutoff_seconds:
                    candidate.unlink()
                    logger.info(f"Removed old cache file: {candidate}")
            except Exception as e:
                # A single unremovable file should not stop the sweep
                logger.warning(f"Could not remove old cache file {candidate}: {e}")
    except Exception as e:
        logger.error(f"Failed to invalidate old cache files: {e}")
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
class DocumentProcessor:
|
| 621 |
-
"""
|
| 622 |
-
Main document processing class that orchestrates document operations with parallel processing support
|
| 623 |
-
Enhanced with FAISS for 10x faster similarity search
|
| 624 |
-
"""
|
| 625 |
-
|
| 626 |
-
def __init__(self, model: Optional[SentenceTransformer] = None):
|
| 627 |
-
"""
|
| 628 |
-
Initialize the document processor
|
| 629 |
-
|
| 630 |
-
Args:
|
| 631 |
-
model: SentenceTransformer model for embeddings (optional)
|
| 632 |
-
"""
|
| 633 |
-
self.model = model
|
| 634 |
-
self.documents = {}
|
| 635 |
-
self.chunks = []
|
| 636 |
-
self.embeddings = None
|
| 637 |
-
self.faiss_index = None # FAISS index for fast similarity search
|
| 638 |
-
self.performance_stats = {} # Track performance metrics
|
| 639 |
|
| 640 |
-
def load_data_room(self, data_room_path: str,
|
| 641 |
"""
|
| 642 |
-
Load and process an entire data room with
|
| 643 |
|
| 644 |
Args:
|
| 645 |
data_room_path: Path to the data room directory
|
| 646 |
-
|
| 647 |
-
progress_callback: Optional callback function for progress updates
|
| 648 |
|
| 649 |
Returns:
|
| 650 |
Dictionary with processing results including performance metrics
|
| 651 |
"""
|
| 652 |
import time
|
|
|
|
|
|
|
| 653 |
config = get_config()
|
| 654 |
-
|
| 655 |
-
max_workers = config.processing.max_workers
|
| 656 |
|
| 657 |
-
|
|
|
|
|
|
|
| 658 |
|
| 659 |
-
logger.info(f"Starting data room processing: {data_room_path}")
|
| 660 |
|
| 661 |
-
#
|
| 662 |
-
self.documents =
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 667 |
|
| 668 |
scan_time = time.time() - start_time
|
| 669 |
-
logger.info(f"Document
|
| 670 |
|
| 671 |
-
#
|
| 672 |
chunk_start = time.time()
|
| 673 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 674 |
chunk_time = time.time() - chunk_start
|
|
|
|
| 675 |
|
| 676 |
-
# Create
|
| 677 |
embedding_time = 0
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
if self.model and self.chunks:
|
| 681 |
embedding_start = time.time()
|
| 682 |
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
#
|
| 689 |
-
self.
|
| 690 |
-
|
| 691 |
-
if len(cached_chunks) == len(self.chunks):
|
| 692 |
-
self.chunks = cached_chunks
|
| 693 |
-
cache_hit = True
|
| 694 |
-
logger.info(f"Loaded embeddings from cache (key: {cache_key[:8]}...)")
|
| 695 |
-
# Build FAISS index from cached embeddings
|
| 696 |
-
self._build_faiss_index()
|
| 697 |
-
else:
|
| 698 |
-
logger.warning("Cached chunks length mismatch, regenerating embeddings")
|
| 699 |
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
texts = [chunk['text'] for chunk in self.chunks]
|
| 703 |
-
self.embeddings = create_embeddings_batch(texts, self.model)
|
| 704 |
-
|
| 705 |
-
# Save to cache
|
| 706 |
-
if _save_embeddings_to_cache(cache_key, self.embeddings, self.chunks):
|
| 707 |
-
logger.info(f"Saved new embeddings to cache (key: {cache_key[:8]}...)")
|
| 708 |
-
|
| 709 |
-
# Clean up old cache files
|
| 710 |
-
_invalidate_old_cache_files()
|
| 711 |
|
| 712 |
-
# Build FAISS index for fast similarity search
|
| 713 |
-
self._build_faiss_index()
|
| 714 |
-
|
| 715 |
embedding_time = time.time() - embedding_start
|
| 716 |
-
|
| 717 |
-
logger.info(f"Embeddings {cache_status} and FAISS index built in {embedding_time:.2f} seconds")
|
| 718 |
|
| 719 |
total_time = time.time() - start_time
|
| 720 |
logger.info(f"Total data room processing completed in {total_time:.2f} seconds")
|
| 721 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
return {
|
| 723 |
-
'documents_count':
|
| 724 |
-
'chunks_count': len(self.
|
| 725 |
-
'
|
| 726 |
-
'
|
| 727 |
-
|
| 728 |
-
'scan_time': scan_time,
|
| 729 |
-
'chunk_time': chunk_time,
|
| 730 |
-
'embedding_time': embedding_time,
|
| 731 |
-
'documents_per_second': len(self.documents) / scan_time if scan_time > 0 else 0,
|
| 732 |
-
'cache_hit': cache_hit,
|
| 733 |
-
'cache_key': cache_key[:8] + "..." if 'cache_key' in locals() else None
|
| 734 |
-
}
|
| 735 |
}
|
| 736 |
|
| 737 |
-
def
|
| 738 |
-
"""
|
| 739 |
-
Build FAISS IndexFlatIP for fast similarity search
|
| 740 |
-
"""
|
| 741 |
-
if self.embeddings is None:
|
| 742 |
-
logger.warning("No embeddings available to build FAISS index")
|
| 743 |
-
return
|
| 744 |
-
|
| 745 |
-
try:
|
| 746 |
-
# Convert to float32 and normalize for cosine similarity via inner product
|
| 747 |
-
embeddings_f32 = self.embeddings.astype('float32')
|
| 748 |
-
faiss.normalize_L2(embeddings_f32)
|
| 749 |
-
|
| 750 |
-
# Create FAISS index
|
| 751 |
-
dimension = embeddings_f32.shape[1]
|
| 752 |
-
self.faiss_index = faiss.IndexFlatIP(dimension)
|
| 753 |
-
self.faiss_index.add(embeddings_f32)
|
| 754 |
-
|
| 755 |
-
logger.info(f"Built FAISS index with {self.faiss_index.ntotal} vectors, dimension {dimension}")
|
| 756 |
-
|
| 757 |
-
except Exception as e:
|
| 758 |
-
logger.error(f"Failed to build FAISS index: {e}")
|
| 759 |
-
self.faiss_index = None
|
| 760 |
-
|
| 761 |
-
def faiss_search(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
|
| 762 |
"""
|
| 763 |
-
|
| 764 |
|
| 765 |
Args:
|
| 766 |
query: Search query
|
| 767 |
-
top_k: Number of top results
|
| 768 |
threshold: Minimum similarity threshold
|
| 769 |
|
| 770 |
Returns:
|
| 771 |
-
List of search results with
|
| 772 |
"""
|
| 773 |
-
if not self.
|
|
|
|
| 774 |
return []
|
| 775 |
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
def search(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
|
| 781 |
-
"""
|
| 782 |
-
Search documents using semantic similarity - uses FAISS if available, falls back to numpy
|
| 783 |
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
threshold: Minimum similarity threshold
|
| 788 |
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
| 795 |
-
|
| 796 |
-
|
| 797 |
-
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
| 803 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 804 |
return []
|
| 805 |
|
| 806 |
-
def get_statistics(self) -> Dict[str,
|
| 807 |
-
"""Get processing statistics
|
| 808 |
stats = {
|
| 809 |
'total_documents': len(self.documents),
|
| 810 |
-
'
|
| 811 |
-
'has_embeddings': self.
|
| 812 |
-
'
|
| 813 |
-
'
|
| 814 |
-
'embedding_dimension': self.embeddings.shape[1] if self.embeddings is not None else 0
|
| 815 |
}
|
| 816 |
|
| 817 |
# Add performance metrics if available
|
|
@@ -820,33 +422,3 @@ class DocumentProcessor:
|
|
| 820 |
|
| 821 |
return stats
|
| 822 |
|
| 823 |
-
def load_data_room_with_progress(self, data_room_path: str, max_workers: Optional[int] = None,
|
| 824 |
-
progress_bar=None) -> Dict[str, any]:
|
| 825 |
-
"""
|
| 826 |
-
Load data room with Streamlit progress bar support
|
| 827 |
-
|
| 828 |
-
Args:
|
| 829 |
-
data_room_path: Path to the data room directory
|
| 830 |
-
max_workers: Maximum number of worker threads
|
| 831 |
-
progress_bar: Streamlit progress bar object
|
| 832 |
-
|
| 833 |
-
Returns:
|
| 834 |
-
Dictionary with processing results
|
| 835 |
-
"""
|
| 836 |
-
# Count total files first for accurate progress tracking
|
| 837 |
-
path = Path(data_room_path)
|
| 838 |
-
if not path.exists():
|
| 839 |
-
return {'documents_count': 0, 'chunks_count': 0, 'has_embeddings': False}
|
| 840 |
-
|
| 841 |
-
total_files = sum(1 for file_path in path.rglob('*')
|
| 842 |
-
if file_path.is_file() and not file_path.name.startswith('.')
|
| 843 |
-
and file_path.suffix.lower() in ['.pdf', '.docx', '.doc', '.txt', '.md'])
|
| 844 |
-
|
| 845 |
-
# Create progress tracker
|
| 846 |
-
progress_callback = create_progress_tracker(total_files, progress_bar)
|
| 847 |
-
|
| 848 |
-
# Load with progress tracking
|
| 849 |
-
result = self.load_data_room(data_room_path, max_workers, progress_callback)
|
| 850 |
-
self.performance_stats = result.get('performance', {})
|
| 851 |
-
|
| 852 |
-
return result
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
Streamlined Document Processing Module
|
| 4 |
|
| 5 |
+
This module provides a simplified document processing pipeline with:
|
| 6 |
+
- Direct LangChain loader integration with glob patterns
|
| 7 |
+
- Built-in FAISS vector storage without external file tracking
|
| 8 |
+
- Semantic text chunking using RecursiveCharacterTextSplitter
|
| 9 |
+
- Consolidated document metadata handling
|
| 10 |
"""
|
| 11 |
|
| 12 |
import os
|
| 13 |
+
import warnings
|
| 14 |
# Fix tokenizers parallelism warning
|
| 15 |
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
import logging
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
# Suppress verbose LangChain warnings and output
|
| 20 |
+
warnings.filterwarnings("ignore", category=UserWarning, module="langchain")
|
| 21 |
+
warnings.filterwarnings("ignore", category=UserWarning, module="langchain_core")
|
| 22 |
+
warnings.filterwarnings("ignore", category=UserWarning, module="langchain_community")
|
| 23 |
+
warnings.filterwarnings("ignore", message=".*Relevance scores must be between.*")
|
| 24 |
+
warnings.filterwarnings("ignore", message=".*No relevant docs were retrieved.*")
|
| 25 |
+
|
| 26 |
+
# Set LangChain logging to WARNING level to reduce verbosity
|
| 27 |
+
logging.getLogger("langchain").setLevel(logging.WARNING)
|
| 28 |
+
logging.getLogger("langchain_core").setLevel(logging.WARNING)
|
| 29 |
+
logging.getLogger("langchain_community").setLevel(logging.WARNING)
|
| 30 |
+
logging.getLogger("langchain_huggingface").setLevel(logging.WARNING)
|
| 31 |
+
import re
|
| 32 |
+
|
| 33 |
+
from pathlib import Path
|
| 34 |
+
from typing import Dict, List, Optional, Any, Callable
|
| 35 |
+
from datetime import datetime
|
| 36 |
+
|
| 37 |
+
# LangChain imports
|
| 38 |
+
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, Docx2txtLoader, TextLoader
|
| 39 |
+
from langchain_community.vectorstores import FAISS
|
| 40 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 41 |
+
from langchain_core.documents import Document
|
| 42 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 43 |
|
| 44 |
# Import configuration
|
| 45 |
from .config import get_config
|
| 46 |
|
| 47 |
+
# Import error handling
|
| 48 |
+
|
| 49 |
+
|
| 50 |
logger = logging.getLogger(__name__)
|
| 51 |
|
| 52 |
+
|
| 53 |
+
# =============================================================================
|
| 54 |
+
# ERROR HANDLING UTILITIES - Merged from error_handlers.py
|
| 55 |
+
# =============================================================================
|
| 56 |
+
|
| 57 |
+
def safe_execute(func: Callable, default: Any = None, context: str = "", log_errors: bool = True) -> Any:
    """
    Run ``func`` and fall back to ``default`` on any exception.

    Args:
        func: Zero-argument callable to execute
        default: Value returned when ``func`` raises
        context: Short label used in the error log (falls back to the
            callable's ``__name__``)
        log_errors: When True, failures are logged at ERROR level

    Returns:
        The result of ``func()``, or ``default`` if it raised
    """
    try:
        result = func()
    except Exception as e:
        if log_errors:
            logger.error(f"{context or func.__name__}: {e}")
        return default
    return result
|
| 76 |
|
| 77 |
|
| 78 |
def escape_markdown_math(text: str) -> str:
|
|
|
|
| 89 |
return text
|
| 90 |
|
| 91 |
|
| 92 |
+
class DocumentProcessor:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
"""
|
| 94 |
+
Streamlined document processing class with integrated FAISS vector storage
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
+
This class consolidates all document processing functionality including:
|
| 97 |
+
- Document loading using LangChain's DirectoryLoader with glob patterns
|
| 98 |
+
- Semantic text chunking with RecursiveCharacterTextSplitter
|
| 99 |
+
- FAISS vector storage for similarity search
|
| 100 |
+
- Document metadata handling
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
"""
|
|
|
|
| 102 |
|
| 103 |
+
def __init__(self, model_name: Optional[str] = None, store_name: Optional[str] = None):
|
| 104 |
+
"""
|
| 105 |
+
Initialize the document processor
|
|
|
|
| 106 |
|
| 107 |
+
Args:
|
| 108 |
+
model_name: Name of the sentence transformer model for embeddings (optional)
|
| 109 |
+
store_name: Name for the FAISS store (optional, uses config default)
|
| 110 |
+
"""
|
|
|
|
| 111 |
config = get_config()
|
| 112 |
+
self.model_name = model_name or config.model.sentence_transformer_model
|
| 113 |
+
self.store_name = store_name or config.processing.faiss_store_name
|
| 114 |
+
|
| 115 |
+
# Initialize components
|
| 116 |
+
self.documents: List[Document] = []
|
| 117 |
+
self.vector_store: Optional[FAISS] = None
|
| 118 |
+
self.embeddings: Optional[HuggingFaceEmbeddings] = None
|
| 119 |
+
self.text_splitter: Optional[RecursiveCharacterTextSplitter] = None
|
| 120 |
+
self.performance_stats = {}
|
| 121 |
+
|
| 122 |
+
# Convenience properties for backward compatibility
|
| 123 |
+
self.chunks = [] # Will be populated after processing
|
| 124 |
+
|
| 125 |
+
# Initialize text splitter with semantic boundaries
|
| 126 |
+
self._init_text_splitter()
|
| 127 |
+
|
| 128 |
+
# Initialize embeddings if model name provided
|
| 129 |
+
if self.model_name:
|
| 130 |
+
self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
|
| 131 |
+
logger.info(f"Initialized embeddings with model: {self.model_name}")
|
| 132 |
+
else:
|
| 133 |
+
logger.warning("No model name provided - embeddings not initialized")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
+
# Try to load existing FAISS store
|
| 136 |
+
self._load_existing_store()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
+
def _init_text_splitter(self):
|
| 139 |
+
"""Initialize the text splitter with optimal settings for semantic chunking"""
|
| 140 |
+
config = get_config()
|
| 141 |
+
self.text_splitter = RecursiveCharacterTextSplitter(
|
| 142 |
+
chunk_size=config.processing.chunk_size,
|
| 143 |
+
chunk_overlap=config.processing.chunk_overlap,
|
| 144 |
+
separators=["\\n\\n", "\\n", ".", "!", "?", ",", " "],
|
| 145 |
+
length_function=len,
|
| 146 |
+
is_separator_regex=False,
|
| 147 |
+
)
|
| 148 |
+
logger.info(f"Initialized text splitter: {config.processing.chunk_size} chars, {config.processing.chunk_overlap} overlap")
|
| 149 |
|
| 150 |
+
def _load_existing_store(self):
|
| 151 |
+
"""Load existing FAISS store if available"""
|
| 152 |
+
if not self.embeddings:
|
| 153 |
+
return
|
| 154 |
|
| 155 |
+
config = get_config()
|
| 156 |
+
faiss_dir = Path(config.paths.data_dir) / "enhanced_faiss"
|
| 157 |
+
faiss_index_path = faiss_dir / f"{self.store_name}.faiss"
|
| 158 |
+
faiss_pkl_path = faiss_dir / f"{self.store_name}.pkl"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
try:
|
| 161 |
+
if faiss_index_path.exists() and faiss_pkl_path.exists():
|
| 162 |
+
self.vector_store = FAISS.load_local(
|
| 163 |
+
str(faiss_dir),
|
| 164 |
+
self.embeddings,
|
| 165 |
+
index_name=self.store_name,
|
| 166 |
+
allow_dangerous_deserialization=True # Safe: we created these files ourselves
|
| 167 |
+
)
|
| 168 |
+
logger.info(f"Loaded existing FAISS store: {self.store_name} with {self.vector_store.index.ntotal} vectors")
|
| 169 |
+
else:
|
| 170 |
+
logger.info(f"No existing FAISS store found for: {self.store_name}")
|
| 171 |
except Exception as e:
|
| 172 |
+
logger.error(f"Failed to load FAISS store: {e}")
|
| 173 |
+
self.vector_store = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
+
def _save_store(self):
|
| 176 |
+
"""Save FAISS store to disk"""
|
| 177 |
+
if not self.vector_store:
|
| 178 |
+
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
|
| 180 |
+
try:
|
| 181 |
+
config = get_config()
|
| 182 |
+
faiss_dir = Path(config.paths.data_dir) / "enhanced_faiss"
|
| 183 |
+
faiss_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
+
self.vector_store.save_local(
|
| 186 |
+
str(faiss_dir),
|
| 187 |
+
index_name=self.store_name
|
| 188 |
+
)
|
| 189 |
+
logger.info(f"Saved FAISS store: {self.store_name} with {self.vector_store.index.ntotal} vectors")
|
| 190 |
+
except Exception as e:
|
| 191 |
+
logger.error(f"Failed to save FAISS store: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
+
def load_data_room(self, data_room_path: str, progress_bar=None) -> Dict[str, Any]:
|
| 194 |
"""
|
| 195 |
+
Load and process an entire data room using DirectoryLoader with glob patterns
|
| 196 |
|
| 197 |
Args:
|
| 198 |
data_room_path: Path to the data room directory
|
| 199 |
+
progress_bar: Optional Streamlit progress bar object
|
|
|
|
| 200 |
|
| 201 |
Returns:
|
| 202 |
Dictionary with processing results including performance metrics
|
| 203 |
"""
|
| 204 |
import time
|
| 205 |
+
start_time = time.time()
|
| 206 |
+
|
| 207 |
config = get_config()
|
| 208 |
+
data_room_path = Path(data_room_path)
|
|
|
|
| 209 |
|
| 210 |
+
if not data_room_path.exists():
|
| 211 |
+
logger.error(f"Data room path does not exist: {data_room_path}")
|
| 212 |
+
return {'documents_count': 0, 'chunks_count': 0, 'has_embeddings': False}
|
| 213 |
|
| 214 |
+
logger.info(f"Starting streamlined data room processing: {data_room_path}")
|
| 215 |
|
| 216 |
+
# Clear existing documents
|
| 217 |
+
self.documents = []
|
| 218 |
+
documents_loaded = 0
|
| 219 |
+
|
| 220 |
+
# Load documents by file type using DirectoryLoader with glob patterns
|
| 221 |
+
supported_extensions = config.processing.supported_file_extensions
|
| 222 |
+
|
| 223 |
+
for ext in supported_extensions:
|
| 224 |
+
try:
|
| 225 |
+
# Create glob pattern for this extension
|
| 226 |
+
glob_pattern = f"**/*{ext}"
|
| 227 |
+
|
| 228 |
+
# Choose appropriate loader based on extension
|
| 229 |
+
if ext == '.pdf':
|
| 230 |
+
loader_cls = PyPDFLoader
|
| 231 |
+
elif ext in ['.docx', '.doc']:
|
| 232 |
+
loader_cls = Docx2txtLoader
|
| 233 |
+
elif ext in ['.txt', '.md']:
|
| 234 |
+
loader_cls = TextLoader
|
| 235 |
+
else:
|
| 236 |
+
continue
|
| 237 |
+
|
| 238 |
+
# Use DirectoryLoader with glob pattern
|
| 239 |
+
loader = DirectoryLoader(
|
| 240 |
+
str(data_room_path),
|
| 241 |
+
glob=glob_pattern,
|
| 242 |
+
loader_cls=loader_cls,
|
| 243 |
+
loader_kwargs={'encoding': 'utf-8'} if ext in ['.txt', '.md'] else {},
|
| 244 |
+
recursive=True,
|
| 245 |
+
show_progress=False, # Disable verbose progress output
|
| 246 |
+
use_multithreading=True
|
| 247 |
+
)
|
| 248 |
+
|
| 249 |
+
# Load documents for this extension
|
| 250 |
+
docs = safe_execute(
|
| 251 |
+
lambda: loader.load(),
|
| 252 |
+
default=[],
|
| 253 |
+
context=f"Loading {ext} files"
|
| 254 |
+
)
|
| 255 |
+
|
| 256 |
+
if docs:
|
| 257 |
+
# Add relative path information to metadata
|
| 258 |
+
for doc in docs:
|
| 259 |
+
if 'source' in doc.metadata:
|
| 260 |
+
source_path = Path(doc.metadata['source'])
|
| 261 |
+
if source_path.exists():
|
| 262 |
+
try:
|
| 263 |
+
rel_path = source_path.relative_to(data_room_path)
|
| 264 |
+
doc.metadata['path'] = str(rel_path)
|
| 265 |
+
doc.metadata['name'] = source_path.name
|
| 266 |
+
except ValueError:
|
| 267 |
+
# If relative path fails, use original source
|
| 268 |
+
doc.metadata['path'] = doc.metadata['source']
|
| 269 |
+
doc.metadata['name'] = source_path.name
|
| 270 |
+
|
| 271 |
+
self.documents.extend(docs)
|
| 272 |
+
documents_loaded += len(docs)
|
| 273 |
+
logger.info(f"Loaded {len(docs)} {ext} documents")
|
| 274 |
+
|
| 275 |
+
except Exception as e:
|
| 276 |
+
logger.error(f"Error loading {ext} files: {e}")
|
| 277 |
|
| 278 |
scan_time = time.time() - start_time
|
| 279 |
+
logger.info(f"Document loading completed in {scan_time:.2f} seconds")
|
| 280 |
|
| 281 |
+
# Split documents into chunks using the text splitter
|
| 282 |
chunk_start = time.time()
|
| 283 |
+
if self.documents and self.text_splitter:
|
| 284 |
+
self.documents = self.text_splitter.split_documents(self.documents)
|
| 285 |
+
|
| 286 |
+
# Add chunk metadata and populate chunks for backward compatibility
|
| 287 |
+
self.chunks = []
|
| 288 |
+
for i, doc in enumerate(self.documents):
|
| 289 |
+
doc.metadata['chunk_id'] = f"chunk_{i}"
|
| 290 |
+
doc.metadata['processed_at'] = datetime.now().isoformat()
|
| 291 |
+
|
| 292 |
+
# Add citation information if available
|
| 293 |
+
if 'page' in doc.metadata:
|
| 294 |
+
doc.metadata['citation'] = f"page {doc.metadata['page']}"
|
| 295 |
+
else:
|
| 296 |
+
doc.metadata['citation'] = doc.metadata.get('name', 'document')
|
| 297 |
+
|
| 298 |
+
# Create chunk dict for backward compatibility
|
| 299 |
+
chunk_dict = {
|
| 300 |
+
'text': doc.page_content,
|
| 301 |
+
'source': doc.metadata.get('name', ''),
|
| 302 |
+
'path': doc.metadata.get('path', ''),
|
| 303 |
+
'full_path': doc.metadata.get('source', ''),
|
| 304 |
+
'metadata': doc.metadata
|
| 305 |
+
}
|
| 306 |
+
self.chunks.append(chunk_dict)
|
| 307 |
+
|
| 308 |
chunk_time = time.time() - chunk_start
|
| 309 |
+
logger.info(f"Text splitting completed in {chunk_time:.2f} seconds")
|
| 310 |
|
| 311 |
+
# Create or update FAISS vector store
|
| 312 |
embedding_time = 0
|
| 313 |
+
if self.embeddings and self.documents:
|
|
|
|
|
|
|
| 314 |
embedding_start = time.time()
|
| 315 |
|
| 316 |
+
if self.vector_store is None:
|
| 317 |
+
# Create new FAISS store
|
| 318 |
+
self.vector_store = FAISS.from_documents(self.documents, self.embeddings)
|
| 319 |
+
logger.info(f"Created new FAISS store with {len(self.documents)} documents")
|
| 320 |
+
else:
|
| 321 |
+
# Add documents to existing store
|
| 322 |
+
self.vector_store.add_documents(self.documents)
|
| 323 |
+
logger.info(f"Added {len(self.documents)} documents to existing FAISS store")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
|
| 325 |
+
# Save the updated store
|
| 326 |
+
self._save_store()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 327 |
|
|
|
|
|
|
|
|
|
|
| 328 |
embedding_time = time.time() - embedding_start
|
| 329 |
+
logger.info(f"FAISS processing completed in {embedding_time:.2f} seconds")
|
|
|
|
| 330 |
|
| 331 |
total_time = time.time() - start_time
|
| 332 |
logger.info(f"Total data room processing completed in {total_time:.2f} seconds")
|
| 333 |
|
| 334 |
+
# Store performance stats
|
| 335 |
+
self.performance_stats = {
|
| 336 |
+
'total_time': total_time,
|
| 337 |
+
'scan_time': scan_time,
|
| 338 |
+
'chunk_time': chunk_time,
|
| 339 |
+
'embedding_time': embedding_time,
|
| 340 |
+
'documents_per_second': documents_loaded / scan_time if scan_time > 0 else 0
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
return {
|
| 344 |
+
'documents_count': documents_loaded,
|
| 345 |
+
'chunks_count': len(self.documents),
|
| 346 |
+
'total_chunks_in_store': self.vector_store.index.ntotal if self.vector_store else 0,
|
| 347 |
+
'has_embeddings': self.vector_store is not None,
|
| 348 |
+
'performance': self.performance_stats
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
}
|
| 350 |
|
| 351 |
+
def search(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
"""
|
| 353 |
+
Search documents using FAISS similarity search
|
| 354 |
|
| 355 |
Args:
|
| 356 |
query: Search query
|
| 357 |
+
top_k: Number of top results to return
|
| 358 |
threshold: Minimum similarity threshold
|
| 359 |
|
| 360 |
Returns:
|
| 361 |
+
List of search results with scores and metadata
|
| 362 |
"""
|
| 363 |
+
if not self.vector_store:
|
| 364 |
+
logger.warning("FAISS vector store not available for search")
|
| 365 |
return []
|
| 366 |
|
| 367 |
+
config = get_config()
|
| 368 |
+
if threshold is None:
|
| 369 |
+
threshold = config.processing.similarity_threshold
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
|
| 371 |
+
try:
|
| 372 |
+
# Perform similarity search with scores
|
| 373 |
+
docs_and_scores = self.vector_store.similarity_search_with_score(query, k=top_k*2)
|
|
|
|
| 374 |
|
| 375 |
+
results = []
|
| 376 |
+
seen_texts = set()
|
| 377 |
+
|
| 378 |
+
for doc, score in docs_and_scores:
|
| 379 |
+
# Convert FAISS distance to similarity score (higher is better)
|
| 380 |
+
similarity_score = 1.0 / (1.0 + score) if score >= 0 else 1.0
|
| 381 |
+
|
| 382 |
+
if similarity_score < threshold:
|
| 383 |
+
continue
|
| 384 |
+
|
| 385 |
+
# Avoid duplicates based on text content
|
| 386 |
+
text_preview = doc.page_content[:100]
|
| 387 |
+
if text_preview not in seen_texts:
|
| 388 |
+
seen_texts.add(text_preview)
|
| 389 |
+
|
| 390 |
+
results.append({
|
| 391 |
+
'text': doc.page_content,
|
| 392 |
+
'source': doc.metadata.get('name', ''),
|
| 393 |
+
'path': doc.metadata.get('path', ''),
|
| 394 |
+
'full_path': doc.metadata.get('source', ''),
|
| 395 |
+
'citation': doc.metadata.get('citation', 'document'),
|
| 396 |
+
'score': float(similarity_score),
|
| 397 |
+
'metadata': doc.metadata
|
| 398 |
+
})
|
| 399 |
+
|
| 400 |
+
if len(results) >= top_k:
|
| 401 |
+
break
|
| 402 |
+
|
| 403 |
+
return results
|
| 404 |
+
|
| 405 |
+
except Exception as e:
|
| 406 |
+
logger.error(f"Failed to search FAISS store: {e}")
|
| 407 |
return []
|
| 408 |
|
| 409 |
+
def get_statistics(self) -> Dict[str, Any]:
|
| 410 |
+
"""Get processing statistics"""
|
| 411 |
stats = {
|
| 412 |
'total_documents': len(self.documents),
|
| 413 |
+
'total_vectors_in_store': self.vector_store.index.ntotal if self.vector_store else 0,
|
| 414 |
+
'has_embeddings': self.vector_store is not None,
|
| 415 |
+
'store_name': self.store_name,
|
| 416 |
+
'model_name': self.model_name
|
|
|
|
| 417 |
}
|
| 418 |
|
| 419 |
# Add performance metrics if available
|
|
|
|
| 422 |
|
| 423 |
return stats
|
| 424 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/services.py
CHANGED
|
@@ -2,408 +2,346 @@
|
|
| 2 |
"""
|
| 3 |
Business Logic Services Module
|
| 4 |
|
| 5 |
-
|
| 6 |
-
Services handle specific domain operations and coordinate between different components.
|
| 7 |
"""
|
| 8 |
|
| 9 |
import re
|
| 10 |
-
import
|
|
|
|
| 11 |
from pathlib import Path
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
from .config import get_config
|
| 18 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
elif line.strip() and not line.startswith('\t') and not line.startswith(' '):
|
| 45 |
-
# Try plain format (no ##)
|
| 46 |
-
match = re.match(r'^([A-Z])\. (.+)', line.strip())
|
| 47 |
-
|
| 48 |
if match:
|
| 49 |
letter, name = match.groups()
|
| 50 |
-
current_category = letter
|
| 51 |
-
categories[letter] = {
|
| 52 |
-
'name': name.strip(),
|
| 53 |
-
'items': []
|
| 54 |
-
}
|
| 55 |
-
# Numbered items (may be indented with tabs or spaces)
|
| 56 |
-
elif current_category:
|
| 57 |
-
# Check for numbered items with various indentation
|
| 58 |
-
line_stripped = line.strip()
|
| 59 |
-
if re.match(r'^\d+\.', line_stripped):
|
| 60 |
-
item_text = re.sub(r'^\d+\.\s*', '', line_stripped)
|
| 61 |
-
if item_text:
|
| 62 |
-
# Clean up [bracketed] content but keep the text
|
| 63 |
-
clean_text = re.sub(r'\[.*?\]', '', item_text).strip()
|
| 64 |
-
if not clean_text:
|
| 65 |
-
clean_text = item_text
|
| 66 |
-
categories[current_category]['items'].append({
|
| 67 |
-
'text': clean_text,
|
| 68 |
-
'original': item_text
|
| 69 |
-
})
|
| 70 |
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
|
| 74 |
-
|
| 75 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
| 111 |
|
| 112 |
|
| 113 |
-
|
| 114 |
-
"""
|
|
|
|
|
|
|
| 115 |
|
| 116 |
-
|
| 117 |
-
""
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
Args:
|
| 121 |
-
model: SentenceTransformer model for embeddings
|
| 122 |
-
"""
|
| 123 |
-
self.model = model
|
| 124 |
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
) -> Dict:
|
| 132 |
-
"""
|
| 133 |
-
Match each checklist item to relevant documents using FAISS for 10x faster similarity search
|
| 134 |
-
|
| 135 |
-
Args:
|
| 136 |
-
checklist: Parsed checklist
|
| 137 |
-
chunks: Document chunks
|
| 138 |
-
embeddings: Precomputed embeddings
|
| 139 |
-
threshold: Similarity threshold (uses config default if None)
|
| 140 |
|
| 141 |
-
|
| 142 |
-
Matching results
|
| 143 |
-
"""
|
| 144 |
-
config = get_config()
|
| 145 |
-
if threshold is None:
|
| 146 |
-
threshold = config.processing.similarity_threshold
|
| 147 |
-
|
| 148 |
-
# Build FAISS index for fast similarity search
|
| 149 |
-
embeddings_f32 = embeddings.astype('float32')
|
| 150 |
-
faiss.normalize_L2(embeddings_f32) # Normalize for cosine similarity
|
| 151 |
-
dimension = embeddings_f32.shape[1]
|
| 152 |
-
faiss_index = faiss.IndexFlatIP(dimension)
|
| 153 |
-
faiss_index.add(embeddings_f32)
|
| 154 |
-
|
| 155 |
-
results = {}
|
| 156 |
-
|
| 157 |
-
for cat_letter, category in checklist.items():
|
| 158 |
-
cat_results = {
|
| 159 |
-
'name': category['name'],
|
| 160 |
-
'items': [],
|
| 161 |
-
'total_items': len(category['items']),
|
| 162 |
-
'matched_items': 0
|
| 163 |
-
}
|
| 164 |
|
| 165 |
-
|
| 166 |
-
# Encode checklist item with category context
|
| 167 |
-
item_text = f"{category['name']} {item['text']}"
|
| 168 |
-
item_embedding = self.model.encode(item_text).astype('float32').reshape(1, -1)
|
| 169 |
-
faiss.normalize_L2(item_embedding)
|
| 170 |
-
|
| 171 |
-
# Use FAISS for fast similarity search
|
| 172 |
-
scores, indices = faiss_index.search(item_embedding, len(chunks))
|
| 173 |
-
|
| 174 |
-
# Get unique documents that match
|
| 175 |
-
doc_matches = {}
|
| 176 |
-
for score, idx in zip(scores[0], indices[0]):
|
| 177 |
-
if idx == -1 or score < threshold:
|
| 178 |
-
continue
|
| 179 |
-
|
| 180 |
-
doc_path = chunks[idx]['path']
|
| 181 |
-
if doc_path not in doc_matches or score > doc_matches[doc_path]['score']:
|
| 182 |
-
doc_matches[doc_path] = {
|
| 183 |
-
'name': chunks[idx]['source'],
|
| 184 |
-
'path': doc_path,
|
| 185 |
-
'full_path': chunks[idx].get('full_path', doc_path),
|
| 186 |
-
'score': float(score),
|
| 187 |
-
'metadata': chunks[idx]['metadata']
|
| 188 |
-
}
|
| 189 |
-
|
| 190 |
-
# Sort by score
|
| 191 |
-
sorted_matches = sorted(doc_matches.values(), key=lambda x: x['score'], reverse=True)
|
| 192 |
-
|
| 193 |
-
item_result = {
|
| 194 |
-
'text': item['text'],
|
| 195 |
-
'original': item['original'],
|
| 196 |
-
'matches': sorted_matches
|
| 197 |
-
}
|
| 198 |
-
|
| 199 |
-
if sorted_matches:
|
| 200 |
-
cat_results['matched_items'] += 1
|
| 201 |
-
|
| 202 |
-
cat_results['items'].append(item_result)
|
| 203 |
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
dimension = doc_embeddings.shape[1]
|
| 231 |
-
faiss_index = faiss.IndexFlatIP(dimension)
|
| 232 |
-
faiss_index.add(doc_embeddings)
|
| 233 |
-
|
| 234 |
-
results = {}
|
| 235 |
-
|
| 236 |
-
for cat_letter, category in checklist.items():
|
| 237 |
-
cat_name = category.get('name', '')
|
| 238 |
-
cat_results = {
|
| 239 |
-
'name': cat_name,
|
| 240 |
-
'letter': cat_letter,
|
| 241 |
-
'total_items': len(category.get('items', [])),
|
| 242 |
-
'matched_items': 0,
|
| 243 |
-
'items': []
|
| 244 |
-
}
|
| 245 |
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
# Create embedding for checklist item with category context
|
| 250 |
-
checklist_embedding_text = f"{cat_name}: {item_text}"
|
| 251 |
-
item_embedding = self.model.encode(checklist_embedding_text).astype('float32').reshape(1, -1)
|
| 252 |
-
faiss.normalize_L2(item_embedding)
|
| 253 |
-
|
| 254 |
-
# Use FAISS for fast similarity search
|
| 255 |
-
scores, indices = faiss_index.search(item_embedding, len(doc_info))
|
| 256 |
-
|
| 257 |
-
# Find matching documents above threshold
|
| 258 |
-
matches = []
|
| 259 |
-
for score, idx in zip(scores[0], indices[0]):
|
| 260 |
-
if idx == -1: # No more results
|
| 261 |
-
break
|
| 262 |
-
if score > threshold:
|
| 263 |
-
matches.append({
|
| 264 |
-
'name': doc_info[idx]['name'],
|
| 265 |
-
'path': doc_info[idx]['path'],
|
| 266 |
-
'summary': doc_info[idx]['summary'],
|
| 267 |
-
'score': float(score),
|
| 268 |
-
'metadata': doc_info[idx].get('original_doc', {}).get('metadata', {})
|
| 269 |
-
})
|
| 270 |
-
else:
|
| 271 |
-
break # Scores are sorted, so we can stop here
|
| 272 |
-
|
| 273 |
-
# Keep top 5 matches (already sorted by FAISS)
|
| 274 |
-
matches = matches[:5]
|
| 275 |
-
|
| 276 |
-
item_result = {
|
| 277 |
-
'text': item_text,
|
| 278 |
-
'original': item.get('original', item_text),
|
| 279 |
-
'matches': matches
|
| 280 |
-
}
|
| 281 |
-
|
| 282 |
-
if matches:
|
| 283 |
-
cat_results['matched_items'] += 1
|
| 284 |
-
|
| 285 |
-
cat_results['items'].append(item_result)
|
| 286 |
|
| 287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
|
| 289 |
-
|
|
|
|
|
|
|
| 290 |
|
| 291 |
|
| 292 |
-
|
| 293 |
-
"""
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
"""
|
| 312 |
-
Answer questions using document chunks with FAISS for 10x faster similarity search
|
| 313 |
-
|
| 314 |
-
Args:
|
| 315 |
-
questions: List of parsed questions
|
| 316 |
-
chunks: Document chunks
|
| 317 |
-
embeddings: Precomputed embeddings
|
| 318 |
-
threshold: Similarity threshold (uses config default if None)
|
| 319 |
-
|
| 320 |
-
Returns:
|
| 321 |
-
Dictionary of answers with citations
|
| 322 |
-
"""
|
| 323 |
-
config = get_config()
|
| 324 |
-
if threshold is None:
|
| 325 |
-
threshold = config.processing.relevancy_threshold
|
| 326 |
-
|
| 327 |
-
# Build FAISS index for fast similarity search
|
| 328 |
-
embeddings_f32 = embeddings.astype('float32')
|
| 329 |
-
faiss.normalize_L2(embeddings_f32) # Normalize for cosine similarity
|
| 330 |
-
dimension = embeddings_f32.shape[1]
|
| 331 |
-
faiss_index = faiss.IndexFlatIP(dimension)
|
| 332 |
-
faiss_index.add(embeddings_f32)
|
| 333 |
-
|
| 334 |
-
answers = {}
|
| 335 |
-
|
| 336 |
-
for question in questions:
|
| 337 |
-
# Encode question
|
| 338 |
-
question_embedding = self.model.encode(question['question']).astype('float32').reshape(1, -1)
|
| 339 |
-
faiss.normalize_L2(question_embedding)
|
| 340 |
-
|
| 341 |
-
# Use FAISS for fast similarity search
|
| 342 |
-
scores, indices = faiss_index.search(question_embedding, min(10, len(chunks))) # Get top 10 candidates
|
| 343 |
-
|
| 344 |
-
# Get top matching chunks above threshold
|
| 345 |
-
relevant_chunks = []
|
| 346 |
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
'path': chunk_info['path'],
|
| 356 |
-
'score': float(score),
|
| 357 |
-
'metadata': chunk_info.get('metadata', {})
|
| 358 |
-
})
|
| 359 |
-
|
| 360 |
-
# Limit to top 5 chunks
|
| 361 |
-
if len(relevant_chunks) >= 5:
|
| 362 |
-
break
|
| 363 |
|
| 364 |
answers[question['id']] = {
|
| 365 |
'question': question['question'],
|
| 366 |
'category': question['category'],
|
| 367 |
-
'
|
| 368 |
-
'
|
|
|
|
| 369 |
}
|
| 370 |
-
|
| 371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
|
|
|
|
|
|
|
|
|
| 373 |
|
| 374 |
-
|
| 375 |
-
|
|
|
|
|
|
|
| 376 |
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
Initialize the report generator
|
| 380 |
-
|
| 381 |
-
Args:
|
| 382 |
-
agent: Optional AI agent for enhanced reporting
|
| 383 |
-
"""
|
| 384 |
-
self.agent = agent
|
| 385 |
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 399 |
|
| 400 |
-
#
|
| 401 |
doc_summaries = []
|
| 402 |
-
for path, doc_info in list(documents.items())[:10]:
|
| 403 |
if 'summary' in doc_info:
|
| 404 |
doc_summaries.append(f"{doc_info['name']}: {doc_info['summary']}")
|
| 405 |
else:
|
| 406 |
-
# Use first 500 chars of content if no summary
|
| 407 |
content_preview = doc_info.get('content', '')[:500]
|
| 408 |
if content_preview:
|
| 409 |
doc_summaries.append(f"{doc_info['name']}: {content_preview}")
|
|
@@ -411,34 +349,69 @@ class ReportGenerator:
|
|
| 411 |
if not doc_summaries:
|
| 412 |
return "No documents available for summary generation."
|
| 413 |
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
Company: {data_room_name}
|
| 419 |
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
|
| 432 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 433 |
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
return f"Failed to generate AI summary: {str(e)}"
|
| 439 |
|
| 440 |
-
|
| 441 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
doc_count = len(documents)
|
| 443 |
file_types = {}
|
| 444 |
|
|
@@ -446,7 +419,7 @@ class ReportGenerator:
|
|
| 446 |
doc_type = doc_info.get('metadata', {}).get('type', 'unknown')
|
| 447 |
file_types[doc_type] = file_types.get(doc_type, 0) + 1
|
| 448 |
|
| 449 |
-
|
| 450 |
|
| 451 |
## Document Analysis
|
| 452 |
- **Total Documents**: {doc_count}
|
|
@@ -457,70 +430,11 @@ Based on the document structure, this data room appears to cover standard due di
|
|
| 457 |
|
| 458 |
*Note: Enable AI features for detailed company analysis and insights.*
|
| 459 |
"""
|
| 460 |
-
return summary
|
| 461 |
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
checklist_results: Dict,
|
| 466 |
-
documents: Dict[str, Dict]
|
| 467 |
-
) -> str:
|
| 468 |
-
"""
|
| 469 |
-
Generate strategic analysis based on strategy and checklist results
|
| 470 |
-
|
| 471 |
-
Args:
|
| 472 |
-
strategy_text: Strategic document content
|
| 473 |
-
checklist_results: Results from checklist matching
|
| 474 |
-
documents: Document dictionary
|
| 475 |
-
|
| 476 |
-
Returns:
|
| 477 |
-
Strategic analysis text
|
| 478 |
-
"""
|
| 479 |
-
if not self.agent or not hasattr(self.agent, 'llm'):
|
| 480 |
-
return self._generate_basic_strategic_analysis(checklist_results)
|
| 481 |
-
|
| 482 |
-
# Build context from checklist results
|
| 483 |
-
checklist_context = []
|
| 484 |
-
for cat_id, cat_data in checklist_results.items():
|
| 485 |
-
cat_name = cat_data['name']
|
| 486 |
-
matched_items = sum(1 for item in cat_data['items'] if item['matches'])
|
| 487 |
-
total_items = len(cat_data['items'])
|
| 488 |
-
coverage = (matched_items / total_items * 100) if total_items > 0 else 0
|
| 489 |
-
|
| 490 |
-
checklist_context.append(f"- {cat_name}: {coverage:.0f}% coverage ({matched_items}/{total_items} items)")
|
| 491 |
|
| 492 |
-
# Add details about specific gaps
|
| 493 |
-
missing_items = [item['text'] for item in cat_data['items'] if not item['matches']]
|
| 494 |
-
if missing_items and len(missing_items) <= 3:
|
| 495 |
-
checklist_context.append(f" Missing: {', '.join(missing_items[:3])}")
|
| 496 |
-
|
| 497 |
-
# Build prompt
|
| 498 |
-
prompt = f"""Based on the due diligence checklist results and the selected strategy, provide a strategic analysis.
|
| 499 |
-
|
| 500 |
-
Strategy Document:
|
| 501 |
-
{strategy_text}
|
| 502 |
-
|
| 503 |
-
Checklist Coverage:
|
| 504 |
-
{chr(10).join(checklist_context)}
|
| 505 |
-
|
| 506 |
-
Please provide:
|
| 507 |
-
1. Strategic alignment assessment
|
| 508 |
-
2. Key risks and gaps identified
|
| 509 |
-
3. Opportunities and synergies
|
| 510 |
-
4. Recommended next steps
|
| 511 |
-
5. Overall recommendation
|
| 512 |
-
|
| 513 |
-
Format the response with clear sections and bullet points."""
|
| 514 |
-
|
| 515 |
-
try:
|
| 516 |
-
from langchain_core.messages import HumanMessage
|
| 517 |
-
response = self.agent.llm.invoke([HumanMessage(content=prompt)])
|
| 518 |
-
return escape_markdown_math(response.content.strip())
|
| 519 |
-
except Exception as e:
|
| 520 |
-
return f"Failed to generate strategic analysis: {str(e)}"
|
| 521 |
-
|
| 522 |
-
def _generate_basic_strategic_analysis(self, checklist_results: Dict) -> str:
|
| 523 |
-
"""Generate basic strategic analysis without AI"""
|
| 524 |
total_items = sum(cat['total_items'] for cat in checklist_results.values())
|
| 525 |
matched_items = sum(cat['matched_items'] for cat in checklist_results.values())
|
| 526 |
coverage = (matched_items / total_items * 100) if total_items > 0 else 0
|
|
@@ -547,102 +461,30 @@ Based on the document structure, this data room appears to cover standard due di
|
|
| 547 |
"""
|
| 548 |
|
| 549 |
return analysis
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 550 |
|
| 551 |
|
| 552 |
-
class DDChecklistService:
|
| 553 |
-
"""
|
| 554 |
-
Main service orchestrator for DD-Checklist operations
|
| 555 |
-
Coordinates between different services and manages the overall workflow
|
| 556 |
-
"""
|
| 557 |
-
|
| 558 |
-
def __init__(self, model: SentenceTransformer, agent=None):
|
| 559 |
-
"""
|
| 560 |
-
Initialize the service
|
| 561 |
-
|
| 562 |
-
Args:
|
| 563 |
-
model: SentenceTransformer model
|
| 564 |
-
agent: Optional AI agent
|
| 565 |
-
"""
|
| 566 |
-
self.model = model
|
| 567 |
-
self.agent = agent
|
| 568 |
-
self.document_processor = DocumentProcessor(model)
|
| 569 |
-
self.checklist_parser = ChecklistParser()
|
| 570 |
-
self.question_parser = QuestionParser()
|
| 571 |
-
self.checklist_matcher = ChecklistMatcher(model)
|
| 572 |
-
self.question_answerer = QuestionAnswerer(model)
|
| 573 |
-
self.report_generator = ReportGenerator(agent)
|
| 574 |
-
|
| 575 |
-
def process_data_room(
|
| 576 |
-
self,
|
| 577 |
-
data_room_path: str,
|
| 578 |
-
checklist_text: str = "",
|
| 579 |
-
questions_text: str = ""
|
| 580 |
-
) -> Dict[str, Any]:
|
| 581 |
-
"""
|
| 582 |
-
Process entire data room with checklist and questions
|
| 583 |
-
|
| 584 |
-
Args:
|
| 585 |
-
data_room_path: Path to data room
|
| 586 |
-
checklist_text: Optional checklist text
|
| 587 |
-
questions_text: Optional questions text
|
| 588 |
-
|
| 589 |
-
Returns:
|
| 590 |
-
Dictionary with all processing results
|
| 591 |
-
"""
|
| 592 |
-
results = {}
|
| 593 |
-
|
| 594 |
-
# Load data room
|
| 595 |
-
load_results = self.document_processor.load_data_room(data_room_path)
|
| 596 |
-
results['load_results'] = load_results
|
| 597 |
-
|
| 598 |
-
# Parse checklist if provided
|
| 599 |
-
checklist = {}
|
| 600 |
-
if checklist_text:
|
| 601 |
-
checklist = self.checklist_parser.parse_checklist(checklist_text)
|
| 602 |
-
results['checklist'] = checklist
|
| 603 |
-
|
| 604 |
-
# Parse questions if provided
|
| 605 |
-
questions = []
|
| 606 |
-
if questions_text:
|
| 607 |
-
questions = self.question_parser.parse_questions(questions_text)
|
| 608 |
-
results['questions'] = questions
|
| 609 |
-
|
| 610 |
-
# Match checklist to documents
|
| 611 |
-
checklist_results = {}
|
| 612 |
-
if checklist and self.document_processor.chunks:
|
| 613 |
-
checklist_results = self.checklist_matcher.match_checklist_to_documents(
|
| 614 |
-
checklist,
|
| 615 |
-
self.document_processor.chunks,
|
| 616 |
-
self.document_processor.embeddings
|
| 617 |
-
)
|
| 618 |
-
results['checklist_results'] = checklist_results
|
| 619 |
-
|
| 620 |
-
# Answer questions
|
| 621 |
-
question_answers = {}
|
| 622 |
-
if questions and self.document_processor.chunks and self.document_processor.embeddings is not None:
|
| 623 |
-
question_answers = self.question_answerer.answer_questions_with_chunks(
|
| 624 |
-
questions,
|
| 625 |
-
self.document_processor.chunks,
|
| 626 |
-
self.document_processor.embeddings
|
| 627 |
-
)
|
| 628 |
-
results['question_answers'] = question_answers
|
| 629 |
-
|
| 630 |
-
return results
|
| 631 |
-
|
| 632 |
-
def search_documents(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
|
| 633 |
-
"""
|
| 634 |
-
Search documents using the document processor
|
| 635 |
-
|
| 636 |
-
Args:
|
| 637 |
-
query: Search query
|
| 638 |
-
top_k: Number of results
|
| 639 |
-
threshold: Similarity threshold
|
| 640 |
-
|
| 641 |
-
Returns:
|
| 642 |
-
Search results
|
| 643 |
-
"""
|
| 644 |
-
return self.document_processor.search(query, top_k, threshold)
|
| 645 |
-
|
| 646 |
-
def get_processing_statistics(self) -> Dict[str, Any]:
|
| 647 |
-
"""Get comprehensive processing statistics"""
|
| 648 |
-
return self.document_processor.get_statistics()
|
|
|
|
| 2 |
"""
|
| 3 |
Business Logic Services Module
|
| 4 |
|
| 5 |
+
Simplified service layer with focused functions instead of over-abstracted classes.
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
import re
|
| 9 |
+
import logging
|
| 10 |
+
import warnings
|
| 11 |
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
# Suppress verbose LangChain warnings in services
|
| 14 |
+
warnings.filterwarnings("ignore", category=UserWarning, module="langchain")
|
| 15 |
+
warnings.filterwarnings("ignore", category=UserWarning, module="langchain_core")
|
| 16 |
+
warnings.filterwarnings("ignore", category=UserWarning, module="langchain_community")
|
| 17 |
+
warnings.filterwarnings("ignore", message=".*Relevance scores must be between.*")
|
| 18 |
+
warnings.filterwarnings("ignore", message=".*No relevant docs were retrieved.*")
|
| 19 |
+
from typing import Dict, List, Optional, Any
|
| 20 |
+
import markdown
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
from langchain_core.output_parsers import StrOutputParser
|
| 25 |
+
from langchain_core.prompts import PromptTemplate
|
| 26 |
+
from langchain_community.vectorstores import FAISS
|
| 27 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
| 28 |
+
from langchain_core.documents import Document
|
| 29 |
+
from langchain_core.messages import HumanMessage
|
| 30 |
|
| 31 |
from .config import get_config
|
| 32 |
+
from .document_processing import DocumentProcessor, escape_markdown_math
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
logger = logging.getLogger(__name__)
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# =============================================================================
|
| 39 |
+
# PARSING FUNCTIONS - Simplified from ChecklistParser and QuestionParser classes
|
| 40 |
+
# =============================================================================
|
| 41 |
+
|
| 42 |
+
def parse_checklist(checklist_text: str) -> Dict:
    """Parse a markdown due-diligence checklist into categories and items.

    Expected layout: category headers like ``A. Corporate Documents``
    followed by numbered items (``1. Articles of incorporation``).

    Args:
        checklist_text: Raw markdown checklist text.

    Returns:
        Dict keyed by category letter; each value holds the category
        ``name`` and a list of ``items`` dicts with ``text`` (bracketed
        annotations stripped) and ``original`` (as written).
    """
    categories = {}
    current_category = None

    for original_line in checklist_text.split('\n'):
        line = original_line.strip()

        # Skip empty lines and separator lines
        if not line or line.startswith('⸻') or line.startswith('---'):
            continue

        # Skip title lines
        if 'due diligence checklist' in line.lower() or line.startswith('#'):
            continue

        # Category headers - pattern "A. Category Name" (but never "1. ...")
        category_match = re.match(r'^([A-Z])\.\s+(.+)', line)
        if category_match and not re.match(r'^\d+\.\s+', line):
            letter, name = category_match.groups()
            current_category = letter
            categories[letter] = {
                'name': name.strip(),
                'items': []
            }
            continue

        # Numbered items within the current category.
        # FIX: the previous version computed an `is_indented` flag and tested
        # `item_match and (is_indented or current_category)`, but inside this
        # branch `current_category` is always truthy, so the extra condition
        # was dead code and has been removed (behavior unchanged).
        if current_category and line:
            item_match = re.match(r'^\d+\.\s+(.+)', line)
            if item_match:
                item_text = item_match.group(1).strip()
                if item_text and not item_text.lower().startswith('[other requests'):
                    # Strip bracketed annotations, but keep the raw text if
                    # stripping would empty the item entirely.
                    clean_text = re.sub(r'\[.*?\]', '', item_text).strip()
                    if not clean_text:
                        clean_text = item_text

                    categories[current_category]['items'].append({
                        'text': clean_text,
                        'original': item_text
                    })

    return categories
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def parse_questions(questions_text: str) -> List[Dict]:
    """Parse markdown question text into a flat list of question dicts.

    Recognizes ``### A. Category`` headers and numbered questions under
    them.

    FIX: the previous version first ran the text through the third-party
    ``markdown`` package, but the resulting HTML was never used; that dead
    code (and the dependency it implied) has been removed.

    Args:
        questions_text: Raw markdown question list.

    Returns:
        List of dicts with ``category``, ``question`` (bold/italic markers
        stripped) and a stable ``id`` of the form ``q_<index>``.
    """
    questions = []
    current_category = None

    # Parse line by line for reliable extraction
    for line in questions_text.split('\n'):
        line = line.strip()

        # Category headers (### format)
        if line.startswith('### '):
            match = re.match(r'###\s+([A-Z])\.\s+(.+)', line)
            if match:
                letter, name = match.groups()
                current_category = f"{letter}. {name.strip()}"

        # Question items (numbered lists) under the current category
        elif current_category and line:
            match = re.match(r'^\d+\.\s+(.+)', line)
            if match:
                question_text = match.group(1).strip()
                if question_text:
                    # Remove markdown emphasis markers (bold, then italics)
                    clean_question = re.sub(r'\*\*(.*?)\*\*', r'\1', question_text)
                    clean_question = re.sub(r'\*(.*?)\*', r'\1', clean_question)

                    questions.append({
                        'category': current_category,
                        'question': clean_question,
                        'id': f"q_{len(questions)}"
                    })

    return questions
|
| 131 |
|
| 132 |
|
| 133 |
+
# =============================================================================
|
| 134 |
+
# SEARCH FUNCTIONS - Consolidated from ChecklistMatcher and QuestionAnswerer
|
| 135 |
+
# =============================================================================
|
| 136 |
+
|
| 137 |
+
def create_vector_store(source_data, model_name: str) -> FAISS:
    """Build a FAISS vector store from one of several supported inputs.

    Supported shapes of ``source_data``:
      * list of LangChain ``Document`` objects - indexed directly;
      * list of chunk dicts (``text`` plus source/path metadata);
      * dict with a ``documents`` key of summarized document records.

    Raises:
        ValueError: for any other input shape.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model_name)

    if isinstance(source_data, list):
        if all(isinstance(entry, Document) for entry in source_data):
            # Already LangChain documents - index them as-is.
            return FAISS.from_documents(source_data, embeddings)
        if all(isinstance(entry, dict) for entry in source_data):
            # Raw chunk dicts: wrap each chunk in a Document, letting the
            # chunk's own metadata override the standard keys.
            docs = []
            for chunk in source_data:
                meta = {
                    'source': chunk.get('source', ''),
                    'path': chunk.get('path', ''),
                    'full_path': chunk.get('full_path', ''),
                }
                meta.update(chunk.get('metadata', {}))
                docs.append(Document(page_content=chunk['text'], metadata=meta))
            return FAISS.from_documents(docs, embeddings)
    elif isinstance(source_data, dict) and 'documents' in source_data:
        # Summarized document records: embed name, path and summary together
        # so retrieval can match on any of them.
        docs = []
        for record in source_data['documents']:
            meta = {
                'name': record['name'],
                'path': record['path'],
                'summary': record['summary'],
            }
            meta.update(record.get('original_doc', {}).get('metadata', {}))
            docs.append(Document(
                page_content=f"{record['name']}\n{record['path']}\n{record['summary']}",
                metadata=meta,
            ))
        return FAISS.from_documents(docs, embeddings)

    raise ValueError("Unsupported data type for vector store creation")
|
| 176 |
|
| 177 |
|
| 178 |
+
def search_and_analyze(queries: List[Dict], vector_store: FAISS, llm=None, threshold: float = 0.7, search_type: str = 'items') -> Dict:
    """Unified retrieval entry point for checklist items and questions.

    Builds a similarity-score-threshold retriever over ``vector_store`` and,
    when an LLM is supplied, a RetrievalQA chain for answer generation, then
    dispatches to the checklist or question processor.

    FIX: removed the local ``from langchain.prompts import PromptTemplate``,
    which shadowed the identical module-level ``langchain_core`` import.

    Args:
        queries: Parsed checklist dict (search_type='items') or question list.
        vector_store: FAISS index to search.
        llm: Optional chat model; enables RAG answer generation.
        threshold: Minimum similarity score for retrieved chunks.
        search_type: 'items' for checklist matching, anything else for Q&A.

    Returns:
        Per-category checklist results or per-question answers.
    """
    # Lazy import keeps the heavier chain machinery off the module import path.
    from langchain.chains import RetrievalQA

    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"score_threshold": threshold, "k": 5 if search_type == 'questions' else 10}
    )

    # Create RAG chain if LLM is provided
    qa_chain = None
    if llm:
        prompt_template = PromptTemplate(
            input_variables=["context", "question"],
            template="""Use the provided context to answer the question. Be concise and factual.

Context: {context}

Question: {question}

Answer:"""
        )
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            chain_type_kwargs={"prompt": prompt_template}
        )

    if search_type == 'items':
        return _process_checklist_items(queries, retriever, qa_chain)
    return _process_questions(queries, retriever, qa_chain)
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def _process_checklist_items(checklist: Dict, retriever, qa_chain=None) -> Dict:
|
| 215 |
+
"""Process checklist items with unified search logic"""
|
| 216 |
+
results = {}
|
| 217 |
+
for cat_letter, category in checklist.items():
|
| 218 |
+
cat_results = {
|
| 219 |
+
'name': category['name'],
|
| 220 |
+
'items': [],
|
| 221 |
+
'total_items': len(category['items']),
|
| 222 |
+
'matched_items': 0
|
| 223 |
+
}
|
| 224 |
+
|
| 225 |
+
for item in category['items']:
|
| 226 |
+
query = f"{category['name']}: {item['text']}"
|
| 227 |
+
try:
|
| 228 |
+
docs = retriever.invoke(query)
|
| 229 |
+
except Exception as e:
|
| 230 |
+
logger.error(f"Error in document matching: {e}")
|
| 231 |
+
docs = []
|
| 232 |
|
| 233 |
+
matches = [{
|
| 234 |
+
'name': doc.metadata.get('source', ''),
|
| 235 |
+
'path': doc.metadata.get('path', ''),
|
| 236 |
+
'full_path': doc.metadata.get('full_path', ''),
|
| 237 |
+
'score': 0.8, # LangChain similarity scores not directly accessible
|
| 238 |
+
'metadata': {k: v for k, v in doc.metadata.items()
|
| 239 |
+
if k not in ['source', 'path', 'full_path']}
|
| 240 |
+
} for doc in docs[:5]]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
|
| 242 |
+
if matches:
|
| 243 |
+
cat_results['matched_items'] += 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 244 |
|
| 245 |
+
cat_results['items'].append({
|
| 246 |
+
'text': item['text'],
|
| 247 |
+
'original': item['original'],
|
| 248 |
+
'matches': matches
|
| 249 |
+
})
|
| 250 |
|
| 251 |
+
results[cat_letter] = cat_results
|
| 252 |
+
|
| 253 |
+
return results
|
| 254 |
|
| 255 |
|
| 256 |
+
def _process_questions(questions: List[Dict], retriever, qa_chain=None) -> Dict:
|
| 257 |
+
"""Process questions with unified search logic"""
|
| 258 |
+
answers = {}
|
| 259 |
+
for question in questions:
|
| 260 |
+
try:
|
| 261 |
+
docs = retriever.invoke(question['question'])
|
| 262 |
+
except Exception as e:
|
| 263 |
+
logger.error(f"Error in question answering: {e}")
|
| 264 |
+
docs = []
|
| 265 |
+
|
| 266 |
+
if docs:
|
| 267 |
+
chunks_data = [{
|
| 268 |
+
'text': doc.page_content[:500],
|
| 269 |
+
'source': doc.metadata.get('source', ''),
|
| 270 |
+
'path': doc.metadata.get('path', ''),
|
| 271 |
+
'score': 0.8,
|
| 272 |
+
'metadata': {k: v for k, v in doc.metadata.items()
|
| 273 |
+
if k not in ['source', 'path']}
|
| 274 |
+
} for doc in docs]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
|
| 276 |
+
# Generate answer using RAG chain if available
|
| 277 |
+
answer_text = "Retrieved relevant document chunks."
|
| 278 |
+
if qa_chain:
|
| 279 |
+
try:
|
| 280 |
+
answer_text = qa_chain.run(question['question'])
|
| 281 |
+
except Exception as e:
|
| 282 |
+
logger.error(f"RAG chain failed: {e}")
|
| 283 |
+
answer_text = "Retrieved relevant document chunks."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
answers[question['id']] = {
|
| 286 |
'question': question['question'],
|
| 287 |
'category': question['category'],
|
| 288 |
+
'answer': answer_text,
|
| 289 |
+
'chunks': chunks_data,
|
| 290 |
+
'has_answer': True
|
| 291 |
}
|
| 292 |
+
else:
|
| 293 |
+
answers[question['id']] = {
|
| 294 |
+
'question': question['question'],
|
| 295 |
+
'category': question['category'],
|
| 296 |
+
'answer': "No relevant documents found",
|
| 297 |
+
'chunks': [],
|
| 298 |
+
'has_answer': False
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
return answers
|
| 302 |
+
|
| 303 |
|
| 304 |
+
# =============================================================================
|
| 305 |
+
# REPORT GENERATION FUNCTIONS - Simplified from ReportGenerator class
|
| 306 |
+
# =============================================================================
|
| 307 |
|
| 308 |
+
def generate_reports(documents: Dict[str, Dict], data_room_name: str = "Unknown",
                     strategy_text: str = "", checklist_results: Dict = None,
                     report_type: str = "overview", llm=None) -> str:
    """Generate an 'overview' or 'strategic' report via LangChain prompts.

    Falls back to a non-AI basic report when no LLM is supplied.

    FIX: an unrecognized ``report_type`` previously left ``template`` and
    ``inputs`` undefined and surfaced as a confusing NameError-based failure
    string; it now returns an explicit "Invalid report type" message.

    Args:
        documents: Mapping of path -> document info (name/summary/content).
        data_room_name: Company / data room label for the report.
        strategy_text: Strategy document text (strategic report only).
        checklist_results: Output of checklist matching (strategic only).
        report_type: "overview" or "strategic".
        llm: Optional LangChain chat model.

    Returns:
        The generated report text, or a human-readable error message.
    """
    if not llm:
        return _generate_basic_report(documents, data_room_name, checklist_results, report_type)

    if report_type == "overview":
        template = PromptTemplate(
            input_variables=["company_name", "document_summaries"],
            template="""Based on the following document summaries from a due diligence data room, provide a comprehensive company overview.

Company: {company_name}

Document Summaries:
{document_summaries}

Please provide:
1. Company name and industry
2. Business model and key products/services
3. Market position and competitive advantages
4. Key financials (if available)
5. Organizational structure
6. Notable risks or concerns
7. Overall assessment for M&A consideration

Format the response in clear sections with bullet points where appropriate."""
        )

        # Prepare document summaries (cap at 10 documents)
        doc_summaries = []
        for path, doc_info in list(documents.items())[:10]:
            if 'summary' in doc_info:
                doc_summaries.append(f"{doc_info['name']}: {doc_info['summary']}")
            else:
                # No precomputed summary: fall back to a content preview.
                content_preview = doc_info.get('content', '')[:500]
                if content_preview:
                    doc_summaries.append(f"{doc_info['name']}: {content_preview}")

        if not doc_summaries:
            return "No documents available for summary generation."

        inputs = {
            "company_name": data_room_name,
            "document_summaries": "\n".join(doc_summaries[:10])
        }

    elif report_type == "strategic":
        template = PromptTemplate(
            input_variables=["strategy_text", "checklist_context"],
            template="""Based on the due diligence checklist results and the selected strategy, provide a strategic analysis.

Strategy Document:
{strategy_text}

Checklist Coverage:
{checklist_context}

Please provide:
1. Strategic alignment assessment
2. Key risks and gaps identified
3. Opportunities and synergies
4. Recommended next steps
5. Overall recommendation

Format the response with clear sections and bullet points."""
        )

        if not checklist_results:
            return "No checklist results available for strategic analysis."

        # Build a coverage summary per category for the prompt.
        checklist_context = []
        for cat_id, cat_data in checklist_results.items():
            cat_name = cat_data['name']
            matched_items = cat_data['matched_items']
            total_items = cat_data['total_items']
            coverage = (matched_items / total_items * 100) if total_items > 0 else 0

            checklist_context.append(f"- {cat_name}: {coverage:.0f}% coverage ({matched_items}/{total_items} items)")

            # Surface gaps only when there are few enough to be actionable.
            missing_items = [item['text'] for item in cat_data['items'] if not item['matches']]
            if missing_items and len(missing_items) <= 3:
                checklist_context.append(f"  Missing: {', '.join(missing_items[:3])}")

        inputs = {
            "strategy_text": strategy_text,
            "checklist_context": "\n".join(checklist_context)
        }

    else:
        # Guard against unknown report types before touching `template`.
        return "Invalid report type specified."

    # Execute the chain
    try:
        chain = template | llm | StrOutputParser()
        response = chain.invoke(inputs)
        return escape_markdown_math(response.strip())
    except Exception as e:
        logger.error(f"LLM report generation failed: {e}")
        return f"Failed to generate {report_type} report: {str(e)}"
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
def _generate_basic_report(documents: Dict[str, Dict], data_room_name: str,
|
| 412 |
+
checklist_results: Dict, report_type: str) -> str:
|
| 413 |
+
"""Generate basic reports without AI"""
|
| 414 |
+
if report_type == "overview":
|
| 415 |
doc_count = len(documents)
|
| 416 |
file_types = {}
|
| 417 |
|
|
|
|
| 419 |
doc_type = doc_info.get('metadata', {}).get('type', 'unknown')
|
| 420 |
file_types[doc_type] = file_types.get(doc_type, 0) + 1
|
| 421 |
|
| 422 |
+
return f"""# Company Overview: {data_room_name}
|
| 423 |
|
| 424 |
## Document Analysis
|
| 425 |
- **Total Documents**: {doc_count}
|
|
|
|
| 430 |
|
| 431 |
*Note: Enable AI features for detailed company analysis and insights.*
|
| 432 |
"""
|
|
|
|
| 433 |
|
| 434 |
+
elif report_type == "strategic":
|
| 435 |
+
if not checklist_results:
|
| 436 |
+
return "No checklist results available for strategic analysis."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 438 |
total_items = sum(cat['total_items'] for cat in checklist_results.values())
|
| 439 |
matched_items = sum(cat['matched_items'] for cat in checklist_results.values())
|
| 440 |
coverage = (matched_items / total_items * 100) if total_items > 0 else 0
|
|
|
|
| 461 |
"""
|
| 462 |
|
| 463 |
return analysis
|
| 464 |
+
|
| 465 |
+
return "Invalid report type specified."
|
| 466 |
+
|
| 467 |
+
|
| 468 |
+
# =============================================================================
|
| 469 |
+
# MAIN SERVICE FUNCTIONS - Simplified orchestration
|
| 470 |
+
# =============================================================================
|
| 471 |
+
|
| 472 |
+
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
def search_documents(doc_processor: DocumentProcessor, query: str, top_k: int = 5,
                     threshold: Optional[float] = None) -> List[Dict]:
    """Thin pass-through to the document processor's semantic search.

    Kept as a module-level function so callers don't need to know the
    processor's API; all arguments are forwarded unchanged.
    """
    return doc_processor.search(query, top_k, threshold)
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
def load_default_file(directory: Path, pattern: str) -> str:
    """Return the text of the first file in ``directory`` matching ``pattern``.

    Returns an empty string when nothing matches or reading fails; failures
    are logged rather than raised.
    """
    try:
        matching = list(directory.glob(pattern))
        if not matching:
            return ""
        return matching[0].read_text(encoding='utf-8')
    except Exception as exc:
        logger.error(f"File loading failed: {exc}")
        return ""
|
| 489 |
|
| 490 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/ui_components.py
CHANGED
|
@@ -9,10 +9,11 @@ Separates UI logic from business logic for better maintainability.
|
|
| 9 |
import streamlit as st
|
| 10 |
from pathlib import Path
|
| 11 |
from typing import Dict, List, Optional, Tuple, Any
|
| 12 |
-
|
| 13 |
import base64
|
| 14 |
|
| 15 |
-
from .config import get_config
|
|
|
|
| 16 |
|
| 17 |
|
| 18 |
def create_document_link(file_path: str, doc_name: str, doc_title: str) -> str:
|
|
@@ -89,8 +90,7 @@ def render_project_selector() -> Tuple[Optional[str], Optional[str]]:
|
|
| 89 |
subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
|
| 90 |
if subdirs:
|
| 91 |
# Count total documents in all data rooms
|
| 92 |
-
total_docs =
|
| 93 |
-
if f.is_file() and f.suffix.lower() in ['.pdf', '.docx', '.doc', '.txt', '.md'])
|
| 94 |
if total_docs > 0:
|
| 95 |
projects.append({
|
| 96 |
'name': project_dir.name.replace('-', ' ').replace('_', ' ').title(),
|
|
@@ -106,8 +106,7 @@ def render_project_selector() -> Tuple[Optional[str], Optional[str]]:
|
|
| 106 |
subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
|
| 107 |
if subdirs:
|
| 108 |
# Count total documents in all data rooms
|
| 109 |
-
total_docs =
|
| 110 |
-
if f.is_file() and f.suffix.lower() in ['.pdf', '.docx', '.doc', '.txt', '.md'])
|
| 111 |
if total_docs > 0:
|
| 112 |
projects.append({
|
| 113 |
'name': project_dir.name.replace('-', ' ').replace('_', ' ').title(),
|
|
@@ -169,8 +168,7 @@ def render_data_room_selector(project_path: str) -> Optional[str]:
|
|
| 169 |
for data_room_dir in project_path_obj.iterdir():
|
| 170 |
if data_room_dir.is_dir() and not data_room_dir.name.startswith('.'):
|
| 171 |
# Count documents for display
|
| 172 |
-
doc_count =
|
| 173 |
-
if f.is_file() and f.suffix.lower() in ['.pdf', '.docx', '.doc', '.txt', '.md'])
|
| 174 |
if doc_count > 0: # Only show directories with documents
|
| 175 |
data_rooms.append({
|
| 176 |
'name': data_room_dir.name.replace('-', ' ').replace('_', ' ').title(),
|
|
@@ -221,12 +219,11 @@ def render_ai_settings() -> Tuple[bool, Optional[str], str]:
|
|
| 221 |
model_choice = config.model.claude_model
|
| 222 |
|
| 223 |
if use_ai_features:
|
| 224 |
-
# Check if API key is in
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
if env_key:
|
| 228 |
st.success("✅ API key loaded from .env file")
|
| 229 |
-
api_key =
|
| 230 |
else:
|
| 231 |
api_key = st.text_input(
|
| 232 |
"Anthropic API Key",
|
|
@@ -276,11 +273,11 @@ def render_file_selector(directory: str, file_type: str, key_suffix: str) -> Tup
|
|
| 276 |
if dir_path.exists():
|
| 277 |
for file in dir_path.glob("*.md"):
|
| 278 |
if not file.name.startswith('.'):
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
|
| 285 |
file_content = ""
|
| 286 |
selected_file_path = None
|
|
@@ -483,10 +480,7 @@ def render_document_match(match: Dict, item_idx: int, primary_threshold: float)
|
|
| 483 |
"""
|
| 484 |
# Get document title (use name without extension)
|
| 485 |
doc_name = match.get('name', match.get('path', 'Unknown'))
|
| 486 |
-
|
| 487 |
-
doc_title = doc_name.rsplit('.', 1)[0].replace('_', ' ').replace('-', ' ').title()
|
| 488 |
-
else:
|
| 489 |
-
doc_title = doc_name.replace('_', ' ').replace('-', ' ').title()
|
| 490 |
|
| 491 |
# Compact display with columns
|
| 492 |
col1, col2, col3 = st.columns([0.8, 3.5, 0.5])
|
|
@@ -535,17 +529,7 @@ def render_download_button(match: Dict, item_idx: int, doc_name: str, doc_title:
|
|
| 535 |
file_bytes = f.read()
|
| 536 |
|
| 537 |
# Determine MIME type based on file extension
|
| 538 |
-
|
| 539 |
-
if file_extension == '.pdf':
|
| 540 |
-
mime_type = 'application/pdf'
|
| 541 |
-
elif file_extension in ['.doc', '.docx']:
|
| 542 |
-
mime_type = 'application/msword'
|
| 543 |
-
elif file_extension == '.txt':
|
| 544 |
-
mime_type = 'text/plain'
|
| 545 |
-
elif file_extension == '.md':
|
| 546 |
-
mime_type = 'text/markdown'
|
| 547 |
-
else:
|
| 548 |
-
mime_type = 'application/octet-stream'
|
| 549 |
|
| 550 |
button_key = f"dl_{item_idx}_{match['score']:.0f}_{doc_name[:20]}".replace(" ", "_").replace("/", "_").replace(".", "_")
|
| 551 |
|
|
@@ -648,10 +632,7 @@ def render_question_source(chunk: Dict, chunk_idx: int, question: str) -> None:
|
|
| 648 |
with col2:
|
| 649 |
# Get clean document title
|
| 650 |
doc_name = chunk['source']
|
| 651 |
-
|
| 652 |
-
doc_title = doc_name.rsplit('.', 1)[0].replace('_', ' ').replace('-', ' ').title()
|
| 653 |
-
else:
|
| 654 |
-
doc_title = doc_name.replace('_', ' ').replace('-', ' ').title()
|
| 655 |
|
| 656 |
# Document title as clickable link
|
| 657 |
doc_path = chunk.get('path', '')
|
|
@@ -675,17 +656,7 @@ def render_question_source(chunk: Dict, chunk_idx: int, question: str) -> None:
|
|
| 675 |
file_bytes = f.read()
|
| 676 |
|
| 677 |
# Determine MIME type based on file extension
|
| 678 |
-
|
| 679 |
-
if file_extension == '.pdf':
|
| 680 |
-
mime_type = 'application/pdf'
|
| 681 |
-
elif file_extension in ['.doc', '.docx']:
|
| 682 |
-
mime_type = 'application/msword'
|
| 683 |
-
elif file_extension == '.txt':
|
| 684 |
-
mime_type = 'text/plain'
|
| 685 |
-
elif file_extension == '.md':
|
| 686 |
-
mime_type = 'text/markdown'
|
| 687 |
-
else:
|
| 688 |
-
mime_type = 'application/octet-stream'
|
| 689 |
|
| 690 |
button_key = f"qa_dl_{question[:20]}_{chunk_idx}".replace(" ", "_").replace("?", "").replace("/", "_")
|
| 691 |
|
|
@@ -718,7 +689,7 @@ def render_ai_answer_button(answer_data: Dict, chunks: List[Dict]) -> None:
|
|
| 718 |
context = "\n\n".join([f"From {c['source']}: {c['text']}" for c in chunks[:3]])
|
| 719 |
# Use LLM directly for more reliable answers
|
| 720 |
from langchain_core.messages import HumanMessage
|
| 721 |
-
|
| 722 |
|
| 723 |
prompt = f"Question: {answer_data['question']}\n\nContext from documents:\n{context}\n\nProvide a comprehensive answer based on the context."
|
| 724 |
response = st.session_state.agent.llm.invoke([HumanMessage(content=prompt)])
|
|
|
|
| 9 |
import streamlit as st
|
| 10 |
from pathlib import Path
|
| 11 |
from typing import Dict, List, Optional, Tuple, Any
|
| 12 |
+
|
| 13 |
import base64
|
| 14 |
|
| 15 |
+
from .config import get_config, get_mime_type, format_document_title, count_documents_in_directory
|
| 16 |
+
from .document_processing import escape_markdown_math
|
| 17 |
|
| 18 |
|
| 19 |
def create_document_link(file_path: str, doc_name: str, doc_title: str) -> str:
|
|
|
|
| 90 |
subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
|
| 91 |
if subdirs:
|
| 92 |
# Count total documents in all data rooms
|
| 93 |
+
total_docs = count_documents_in_directory(project_dir)
|
|
|
|
| 94 |
if total_docs > 0:
|
| 95 |
projects.append({
|
| 96 |
'name': project_dir.name.replace('-', ' ').replace('_', ' ').title(),
|
|
|
|
| 106 |
subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
|
| 107 |
if subdirs:
|
| 108 |
# Count total documents in all data rooms
|
| 109 |
+
total_docs = count_documents_in_directory(project_dir)
|
|
|
|
| 110 |
if total_docs > 0:
|
| 111 |
projects.append({
|
| 112 |
'name': project_dir.name.replace('-', ' ').replace('_', ' ').title(),
|
|
|
|
| 168 |
for data_room_dir in project_path_obj.iterdir():
|
| 169 |
if data_room_dir.is_dir() and not data_room_dir.name.startswith('.'):
|
| 170 |
# Count documents for display
|
| 171 |
+
doc_count = count_documents_in_directory(data_room_dir)
|
|
|
|
| 172 |
if doc_count > 0: # Only show directories with documents
|
| 173 |
data_rooms.append({
|
| 174 |
'name': data_room_dir.name.replace('-', ' ').replace('_', ' ').title(),
|
|
|
|
| 219 |
model_choice = config.model.claude_model
|
| 220 |
|
| 221 |
if use_ai_features:
|
| 222 |
+
# Check if API key is available in config (which loads from .env)
|
| 223 |
+
config_api_key = config.anthropic_api_key
|
| 224 |
+
if config_api_key:
|
|
|
|
| 225 |
st.success("✅ API key loaded from .env file")
|
| 226 |
+
api_key = config_api_key
|
| 227 |
else:
|
| 228 |
api_key = st.text_input(
|
| 229 |
"Anthropic API Key",
|
|
|
|
| 273 |
if dir_path.exists():
|
| 274 |
for file in dir_path.glob("*.md"):
|
| 275 |
if not file.name.startswith('.'):
|
| 276 |
+
files.append({
|
| 277 |
+
'name': format_document_title(file.stem),
|
| 278 |
+
'path': str(file),
|
| 279 |
+
'filename': file.name
|
| 280 |
+
})
|
| 281 |
|
| 282 |
file_content = ""
|
| 283 |
selected_file_path = None
|
|
|
|
| 480 |
"""
|
| 481 |
# Get document title (use name without extension)
|
| 482 |
doc_name = match.get('name', match.get('path', 'Unknown'))
|
| 483 |
+
doc_title = format_document_title(doc_name)
|
|
|
|
|
|
|
|
|
|
| 484 |
|
| 485 |
# Compact display with columns
|
| 486 |
col1, col2, col3 = st.columns([0.8, 3.5, 0.5])
|
|
|
|
| 529 |
file_bytes = f.read()
|
| 530 |
|
| 531 |
# Determine MIME type based on file extension
|
| 532 |
+
mime_type = get_mime_type(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 533 |
|
| 534 |
button_key = f"dl_{item_idx}_{match['score']:.0f}_{doc_name[:20]}".replace(" ", "_").replace("/", "_").replace(".", "_")
|
| 535 |
|
|
|
|
| 632 |
with col2:
|
| 633 |
# Get clean document title
|
| 634 |
doc_name = chunk['source']
|
| 635 |
+
doc_title = format_document_title(doc_name)
|
|
|
|
|
|
|
|
|
|
| 636 |
|
| 637 |
# Document title as clickable link
|
| 638 |
doc_path = chunk.get('path', '')
|
|
|
|
| 656 |
file_bytes = f.read()
|
| 657 |
|
| 658 |
# Determine MIME type based on file extension
|
| 659 |
+
mime_type = get_mime_type(file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 660 |
|
| 661 |
button_key = f"qa_dl_{question[:20]}_{chunk_idx}".replace(" ", "_").replace("?", "").replace("/", "_")
|
| 662 |
|
|
|
|
| 689 |
context = "\n\n".join([f"From {c['source']}: {c['text']}" for c in chunks[:3]])
|
| 690 |
# Use LLM directly for more reliable answers
|
| 691 |
from langchain_core.messages import HumanMessage
|
| 692 |
+
|
| 693 |
|
| 694 |
prompt = f"Question: {answer_data['question']}\n\nContext from documents:\n{context}\n\nProvide a comprehensive answer based on the context."
|
| 695 |
response = st.session_state.agent.llm.invoke([HumanMessage(content=prompt)])
|
src/utils.py
DELETED
|
@@ -1,640 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Utilities Module
|
| 4 |
-
|
| 5 |
-
This module contains error handling, logging, and other utility functions
|
| 6 |
-
for the DD-Checklist application.
|
| 7 |
-
"""
|
| 8 |
-
|
| 9 |
-
import logging
|
| 10 |
-
import functools
|
| 11 |
-
import traceback
|
| 12 |
-
from pathlib import Path
|
| 13 |
-
from typing import Any, Callable, Optional, Dict, List, Union
|
| 14 |
-
import streamlit as st
|
| 15 |
-
from datetime import datetime
|
| 16 |
-
import sys
|
| 17 |
-
import os
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
class DDChecklistLogger:
    """Application logger writing to stdout and, best-effort, a daily file.

    Warning/error/exception messages are additionally mirrored to Streamlit
    when the ``st`` module is present in this module's globals.
    """

    def __init__(self, name: str = "dd_checklist", log_level: str = "INFO"):
        """Create (or reuse) the named logger at the given level.

        Args:
            name: Logger name.
            log_level: Logging level name (e.g. "INFO", "DEBUG").
        """
        self.logger = logging.getLogger(name)
        self.logger.setLevel(getattr(logging, log_level.upper()))

        # Attach handlers only once per named logger to avoid duplicates.
        if not self.logger.handlers:
            self._setup_handlers()

    def _setup_handlers(self):
        """Attach a stdout handler and, when the filesystem allows, a
        dated log file under ``.logs/``."""
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setFormatter(logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        ))
        self.logger.addHandler(stream_handler)

        try:
            log_dir = Path(".logs")
            log_dir.mkdir(exist_ok=True)

            log_file = log_dir / f"dd_checklist_{datetime.now().strftime('%Y%m%d')}.log"
            file_handler = logging.FileHandler(log_file)
            file_handler.setFormatter(logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s'
            ))
            self.logger.addHandler(file_handler)
        except Exception:
            # File logging not available (e.g. on Streamlit Cloud).
            pass

    def info(self, message: str, **kwargs):
        """Log an info message."""
        self.logger.info(message, **kwargs)

    def warning(self, message: str, **kwargs):
        """Log a warning and surface it in Streamlit when available."""
        self.logger.warning(message, **kwargs)
        if 'st' in globals() and st:
            st.warning(message)

    def error(self, message: str, **kwargs):
        """Log an error and surface it in Streamlit when available."""
        self.logger.error(message, **kwargs)
        if 'st' in globals() and st:
            st.error(message)

    def debug(self, message: str, **kwargs):
        """Log a debug message."""
        self.logger.debug(message, **kwargs)

    def exception(self, message: str, **kwargs):
        """Log an exception with traceback; show a short Streamlit error."""
        self.logger.exception(message, **kwargs)
        if 'st' in globals() and st:
            st.error(f"{message} - Check logs for details.")
| 94 |
-
st.error(f"{message} - Check logs for details.")
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
# Global logger instance shared by every helper in this module.
logger = DDChecklistLogger()
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
def handle_exceptions(
    return_value: Any = None,
    show_error: bool = True,
    log_error: bool = True
) -> Callable:
    """
    Decorator factory that converts uncaught exceptions into a fallback value.

    Args:
        return_value: Value returned by the wrapped function when it raises.
        show_error: Whether to surface the error in the Streamlit UI.
        log_error: Whether to record the error via the module logger.

    Returns:
        A decorator applying the above policy.
    """
    def decorator(func: Callable) -> Callable:
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                error_msg = f"Error in {func.__name__}: {str(e)}"
                if log_error:
                    logger.exception(error_msg)
                if show_error and 'st' in globals() and st:
                    st.error(error_msg)
            # Reached only on exception: the success path returned above.
            return return_value
        return wrapper
    return decorator
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
def safe_execute(
    func: Callable,
    *args,
    default_return: Any = None,
    error_message: Optional[str] = None,
    show_error: bool = True,
    **kwargs
) -> Any:
    """
    Call ``func(*args, **kwargs)``, falling back to a default on failure.

    Args:
        func: Function to execute.
        *args: Positional arguments forwarded to ``func``.
        default_return: Value returned when ``func`` raises.
        error_message: Custom error message (otherwise derived from the exception).
        show_error: Whether to surface the error in the Streamlit UI.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        The function result, or ``default_return`` on error.
    """
    try:
        return func(*args, **kwargs)
    except Exception as e:
        if error_message:
            msg = error_message
        else:
            msg = f"Error executing {func.__name__}: {str(e)}"
        logger.exception(msg)
        if show_error and 'st' in globals() and st:
            st.error(msg)
        return default_return
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
class ErrorHandler:
    """
    Context manager that logs (and optionally surfaces) exceptions raised
    inside its ``with`` block, suppressing them unless ``reraise`` is set.
    """

    def __init__(
        self,
        error_message: str = "An error occurred",
        show_error: bool = True,
        reraise: bool = False
    ):
        """
        Initialize the handler.

        Args:
            error_message: Prefix logged/displayed when an exception occurs.
            show_error: Whether to show the error in the Streamlit UI.
            reraise: Whether to let the exception propagate after logging.
        """
        self.error_message = error_message
        self.show_error = show_error
        self.reraise = reraise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # No exception: nothing to do (returning True here is harmless).
        if exc_type is None:
            return True

        error_msg = f"{self.error_message}: {str(exc_val)}"
        logger.exception(error_msg)

        if self.show_error and 'st' in globals() and st:
            st.error(error_msg)

        # True suppresses the exception; False lets it propagate.
        return not self.reraise
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
def validate_file_path(file_path: Union[str, Path]) -> bool:
    """
    Check that ``file_path`` refers to an existing regular file.

    Args:
        file_path: Path to validate.

    Returns:
        True when the path exists and is a file, False otherwise
        (including when the path cannot be inspected at all).
    """
    try:
        candidate = Path(file_path)
        return candidate.exists() and candidate.is_file()
    except Exception as e:
        logger.warning(f"Invalid file path {file_path}: {e}")
        return False
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
def validate_directory_path(dir_path: Union[str, Path]) -> bool:
    """
    Check that ``dir_path`` refers to an existing directory.

    Args:
        dir_path: Directory path to validate.

    Returns:
        True when the path exists and is a directory, False otherwise
        (including when the path cannot be inspected at all).
    """
    try:
        candidate = Path(dir_path)
        return candidate.exists() and candidate.is_dir()
    except Exception as e:
        logger.warning(f"Invalid directory path {dir_path}: {e}")
        return False
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
def ensure_directory(dir_path: Union[str, Path]) -> bool:
    """
    Create ``dir_path`` (including parents) if it does not exist yet.

    Args:
        dir_path: Directory path.

    Returns:
        True when the directory exists afterwards, False when creation failed.
    """
    try:
        Path(dir_path).mkdir(parents=True, exist_ok=True)
    except Exception as e:
        logger.error(f"Could not create directory {dir_path}: {e}")
        return False
    return True
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
def get_file_size(file_path: Union[str, Path]) -> Optional[int]:
    """
    Return the size of ``file_path`` in bytes.

    Args:
        file_path: Path to the file.

    Returns:
        Size in bytes, or None when the file cannot be stat'ed.
    """
    try:
        return Path(file_path).stat().st_size
    except Exception as e:
        logger.warning(f"Could not get size for {file_path}: {e}")
        return None
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
def format_file_size(size_bytes: int) -> str:
    """
    Format a byte count as a human-readable string.

    Args:
        size_bytes: Size in bytes.

    Returns:
        Strings like "0 B", "512.0 B", "1.5 KB", "2.0 MB". Sizes of a
        gibibyte or more are always reported in GB (there is no TB unit),
        so very large values yield e.g. "1024.0 GB".
    """
    if size_bytes == 0:
        return "0 B"

    units = ("B", "KB", "MB", "GB")
    size = float(size_bytes)
    # Divide down through B/KB/MB; whatever remains is expressed in GB.
    # (The original ended with an unreachable duplicate GB return; this
    # formulation produces identical output without it.)
    for unit in units[:-1]:
        if size < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024
    return f"{size:.1f} GB"
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
def sanitize_filename(filename: str) -> str:
    """
    Sanitize a filename for safe file-system use.

    Replaces characters that are invalid on common filesystems with
    underscores, collapses runs of underscores, trims leading/trailing
    separators, and falls back to "untitled" when nothing usable remains.

    Args:
        filename: Original filename.

    Returns:
        Sanitized filename (never empty).
    """
    import re

    cleaned = re.sub(r'[<>:"/\\|?*]', '_', filename)  # replace reserved characters
    cleaned = re.sub(r'_+', '_', cleaned)             # collapse repeated underscores
    cleaned = cleaned.strip('_. ')                    # tidy the edges
    return cleaned or "untitled"
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
def get_memory_usage() -> Dict[str, float]:
    """
    Get current memory usage information for this process.

    Returns:
        Dictionary with 'rss_mb', 'vms_mb', and 'percent' keys, or an
        empty dict when psutil is unavailable or the query fails.
    """
    try:
        import psutil
        proc = psutil.Process(os.getpid())
        mem = proc.memory_info()
        return {
            'rss_mb': mem.rss / 1024 / 1024,  # Resident Set Size
            'vms_mb': mem.vms / 1024 / 1024,  # Virtual Memory Size
            'percent': proc.memory_percent()
        }
    except ImportError:
        logger.warning("psutil not available, cannot get memory usage")
        return {}
    except Exception as e:
        logger.warning(f"Could not get memory usage: {e}")
        return {}
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
def timing_decorator(func: Callable) -> Callable:
    """
    Decorator that logs how long each call of the wrapped function takes.

    Args:
        func: Function to time.

    Returns:
        A wrapper with identical behavior that emits a debug-level
        timing message after every call.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        import time
        started = time.time()
        result = func(*args, **kwargs)
        execution_time = time.time() - started
        logger.debug(f"{func.__name__} executed in {execution_time:.2f} seconds")
        return result
    return wrapper
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
class ProgressTracker:
    """
    Progress tracking utility for long-running operations with weighted ETA calculation.

    Steps may carry unequal weights so the progress bar and ETA reflect the
    expected cost of each step rather than a plain step count. Renders via a
    Streamlit progress bar when ``st`` is available; otherwise only logs.
    """

    def __init__(self, total_steps: int, description: str = "Processing", step_weights: Optional[Dict[int, float]] = None):
        """
        Initialize progress tracker

        Args:
            total_steps: Total number of steps
            description: Description of the operation
            step_weights: Optional dict mapping step numbers (1-based) to relative weights (default: all steps equal weight)
        """
        self.total_steps = total_steps
        self.current_step = 0
        self.description = description
        self.start_time = datetime.now()
        self.step_start_times = {}  # step number -> datetime when the step started
        self.step_durations = {}  # step number -> seconds the completed step took

        # Set up step weights (default: equal weight for all steps)
        if step_weights:
            self.step_weights = step_weights
        else:
            self.step_weights = {i: 1.0 for i in range(1, total_steps + 1)}

        # Calculate total weight for progress calculation
        self.total_weight = sum(self.step_weights.values())

        # Initialize Streamlit progress bar if available
        if 'st' in globals() and st:
            self.progress_bar = st.progress(0, text=f"{description}...")
            self.status_text = st.empty()
        else:
            self.progress_bar = None
            self.status_text = None

    def update(self, step: int, message: str = ""):
        """
        Update progress with weighted ETA calculation

        Args:
            step: Current step number (1-based)
            message: Optional status message
        """
        now = datetime.now()

        # Record step timing
        if self.current_step != step:
            # Mark completion of previous step
            if self.current_step > 0 and self.current_step in self.step_start_times:
                self.step_durations[self.current_step] = (now - self.step_start_times[self.current_step]).total_seconds()

            # Mark start of new step
            self.step_start_times[step] = now
            self.current_step = step

        # Calculate weighted progress
        completed_weight = sum(self.step_weights.get(i, 1.0) for i in range(1, step))
        current_step_weight = self.step_weights.get(step, 1.0)

        # For current step, assume 50% completion unless we have sub-progress info
        # (update_step_progress() uses the real sub-progress instead).
        current_progress_weight = completed_weight + (current_step_weight * 0.5)
        progress = current_progress_weight / self.total_weight if self.total_weight > 0 else 0
        progress = min(progress, 1.0)  # Cap at 100%

        # Calculate improved ETA using weighted approach
        elapsed = (now - self.start_time).total_seconds()
        eta_str = ""

        if step > 1 and completed_weight > 0:
            # Use actual timing data from completed steps
            avg_time_per_weight = elapsed / completed_weight
            remaining_weight = self.total_weight - current_progress_weight
            eta = avg_time_per_weight * remaining_weight

            if eta > 1:
                if eta < 60:
                    eta_str = f" (ETA: {eta:.0f}s)"
                elif eta < 3600:
                    eta_str = f" (ETA: {eta/60:.1f}m)"
                else:
                    eta_str = f" (ETA: {eta/3600:.1f}h)"
        elif step == 1 and elapsed > 5:  # Only show ETA after 5 seconds
            # For first step, make a rough estimate based on step weights
            estimated_time_per_weight = elapsed / self.step_weights.get(1, 1.0)
            remaining_weight = self.total_weight - current_progress_weight
            eta = estimated_time_per_weight * remaining_weight

            if eta > 10:  # Only show if meaningful
                if eta < 60:
                    eta_str = f" (ETA: ~{eta:.0f}s)"
                else:
                    eta_str = f" (ETA: ~{eta/60:.1f}m)"

        status_msg = f"{self.description}: {step}/{self.total_steps}{eta_str}"
        if message:
            status_msg += f" - {message}"

        # Update Streamlit components
        if self.progress_bar:
            self.progress_bar.progress(progress, text=status_msg)

        # Log progress at key milestones
        if step == 1 or step % max(1, self.total_steps // 5) == 0:  # Log every 20%
            logger.info(status_msg)

    def update_step_progress(self, step: int, sub_progress: float, message: str = ""):
        """
        Update progress within a specific step (for long-running operations)

        Args:
            step: Current step number (1-based)
            sub_progress: Progress within the step (0.0 to 1.0)
            message: Optional status message
        """
        now = datetime.now()

        # Ensure we're tracking this step
        if step not in self.step_start_times:
            self.step_start_times[step] = now
            self.current_step = step

        # Calculate weighted progress with sub-progress
        completed_weight = sum(self.step_weights.get(i, 1.0) for i in range(1, step))
        current_step_weight = self.step_weights.get(step, 1.0)

        # Use actual sub-progress instead of assuming 50%
        current_progress_weight = completed_weight + (current_step_weight * sub_progress)
        progress = current_progress_weight / self.total_weight if self.total_weight > 0 else 0
        progress = min(progress, 1.0)  # Cap at 100%

        # Calculate improved ETA
        elapsed = (now - self.start_time).total_seconds()
        eta_str = ""

        if step > 1 and completed_weight > 0:
            # Use actual timing data from completed steps
            avg_time_per_weight = elapsed / completed_weight
            remaining_weight = self.total_weight - current_progress_weight
            eta = avg_time_per_weight * remaining_weight

            if eta > 1:
                if eta < 60:
                    eta_str = f" (ETA: {eta:.0f}s)"
                elif eta < 3600:
                    eta_str = f" (ETA: {eta/60:.1f}m)"
                else:
                    eta_str = f" (ETA: {eta/3600:.1f}h)"
        elif step == 1 and elapsed > 5:
            # For first step, estimate based on current progress
            if sub_progress > 0.1:  # Only estimate if we have meaningful progress
                step_elapsed = (now - self.step_start_times[step]).total_seconds()
                estimated_step_time = step_elapsed / sub_progress
                remaining_step_time = estimated_step_time * (1 - sub_progress)

                # Add estimated time for remaining steps
                remaining_weight = self.total_weight - self.step_weights.get(step, 1.0)
                estimated_time_per_weight = estimated_step_time / self.step_weights.get(step, 1.0)
                eta = remaining_step_time + (estimated_time_per_weight * remaining_weight)

                if eta > 10:
                    if eta < 60:
                        eta_str = f" (ETA: ~{eta:.0f}s)"
                    else:
                        eta_str = f" (ETA: ~{eta/60:.1f}m)"

        status_msg = f"{self.description}: {step}/{self.total_steps}{eta_str}"
        if message:
            status_msg += f" - {message}"

        # Update Streamlit components
        if self.progress_bar:
            self.progress_bar.progress(progress, text=status_msg)

    def complete(self, message: str = "Complete"):
        """
        Mark progress as complete

        Args:
            message: Completion message
        """
        if self.progress_bar:
            self.progress_bar.progress(1.0, text=f"{self.description}: {message}")

        elapsed = (datetime.now() - self.start_time).total_seconds()
        logger.info(f"{self.description} completed in {elapsed:.1f} seconds")
|
| 570 |
-
|
| 571 |
-
|
| 572 |
-
def batch_process(
    items: List[Any],
    process_func: Callable,
    batch_size: int = 10,
    description: str = "Processing"
) -> List[Any]:
    """
    Process items in batches with progress tracking.

    A batch that raises is skipped entirely (its partial results are
    discarded) and processing continues with the next batch.

    Args:
        items: List of items to process.
        process_func: Function applied to each item.
        batch_size: Number of items per batch.
        description: Description for the progress tracker.

    Returns:
        List of processed results (may be shorter than ``items`` when
        batches failed).
    """
    results: List[Any] = []
    total_batches = (len(items) + batch_size - 1) // batch_size
    tracker = ProgressTracker(total_batches, description)

    for start in range(0, len(items), batch_size):
        batch_num = start // batch_size + 1
        chunk = items[start:start + batch_size]
        try:
            # Materialize the whole batch first so a mid-batch failure
            # leaves `results` untouched for this batch.
            batch_results = [process_func(item) for item in chunk]
            results.extend(batch_results)
            tracker.update(batch_num, f"Batch {batch_num}/{total_batches}")
        except Exception as e:
            logger.error(f"Error processing batch {batch_num}: {e}")
            continue

    tracker.complete()
    return results
|
| 612 |
-
|
| 613 |
-
|
| 614 |
-
# Streamlit-specific utilities
|
| 615 |
-
def show_success(message: str):
    """Display a success message in Streamlit (when available) and log it."""
    streamlit_ready = 'st' in globals() and st
    if streamlit_ready:
        st.success(message)
    logger.info(message)
|
| 620 |
-
|
| 621 |
-
|
| 622 |
-
def show_info(message: str):
    """Display an info message in Streamlit (when available) and log it."""
    streamlit_ready = 'st' in globals() and st
    if streamlit_ready:
        st.info(message)
    logger.info(message)
|
| 627 |
-
|
| 628 |
-
|
| 629 |
-
def show_warning(message: str):
    """Display a warning message in Streamlit (when available) and log it."""
    streamlit_ready = 'st' in globals() and st
    if streamlit_ready:
        st.warning(message)
    logger.warning(message)
|
| 634 |
-
|
| 635 |
-
|
| 636 |
-
def show_error(message: str):
    """Display an error message in Streamlit (when available) and log it."""
    streamlit_ready = 'st' in globals() and st
    if streamlit_ready:
        st.error(message)
    logger.error(message)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uv.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|