Juan Salas committed on
Commit 25ec886 · 1 Parent(s): 15ee652

FAISS file persistence
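The commit title, "FAISS file persistence", refers to saving the FAISS vector index to disk and reloading it instead of re-embedding every document on each run. The hunks below wire in langchain-community's FAISS store and langchain-huggingface embeddings, but the persistence call sites themselves are not visible in this diff, so the snippet below is only a sketch of the usual save/load round-trip; the helper name build_or_load_index and the faiss_index directory are illustrative, not taken from the commit.

from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

def build_or_load_index(chunks, model_name, index_dir="faiss_index"):
    """Reload a persisted FAISS index if one exists; otherwise build it from chunks and save it."""
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    try:
        # Recent langchain-community versions require an explicit opt-in to
        # deserialize the pickled docstore that save_local() writes.
        return FAISS.load_local(index_dir, embeddings, allow_dangerous_deserialization=True)
    except Exception:
        store = FAISS.from_documents(chunks, embeddings)  # chunks: LangChain Document objects
        store.save_local(index_dir)
        return store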
.streamlit/config.toml CHANGED
@@ -7,7 +7,6 @@ textColor = "#262730"
  [server]
  headless = true
  port = 8501
- enableCORS = false
 
  [client]
- showErrorDetails = false
+ showErrorDetails = true
app.py CHANGED
@@ -7,56 +7,65 @@ using the new modular architecture for better maintainability.
7
  """
8
 
9
  import os
 
10
  # Fix tokenizers parallelism warning early
11
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
12
13
  import streamlit as st
14
- import numpy as np
15
- from sentence_transformers import SentenceTransformer
16
  from pathlib import Path
17
- from typing import Dict, List, Optional, Any
18
 
19
  # Import our refactored modules
20
  from src import (
21
- get_config, init_config,
22
- DocumentProcessor, DDChecklistService,
23
- logger, handle_exceptions, safe_execute, ErrorHandler,
24
- render_project_selector, render_ai_settings, escape_markdown_math
 
25
  )
 
 
26
  from src.ui_components import (
27
- render_file_selector, render_progress_section, render_metrics_row,
28
- render_checklist_results, render_question_results, render_quick_questions,
29
- create_document_link
 
 
30
  )
31
- from src.services import ReportGenerator
32
- from src.utils import ProgressTracker, show_success, show_error, show_info
33
 
34
- # Import LangGraph + Anthropic configuration
35
- try:
36
- from src.ai import (
37
- DDChecklistAgent,
38
- LANGGRAPH_AVAILABLE,
39
- batch_summarize_documents,
40
- create_document_embeddings_with_summaries,
41
- match_checklist_with_summaries,
42
- generate_checklist_descriptions
43
- )
44
- LLM_AVAILABLE = LANGGRAPH_AVAILABLE
45
- except ImportError:
46
- LLM_AVAILABLE = False
47
- DDChecklistAgent = None
48
 
 
 
 
 
49
 
50
 
51
  class DDChecklistApp:
52
  """
53
  Main application class that orchestrates all components
54
  """
55
-
56
  def __init__(self):
57
  """Initialize the application"""
58
  # Initialize configuration
59
- self.config = init_config().get_config()
60
 
61
  # Initialize session state
62
  self._init_session_state()
@@ -69,63 +78,44 @@ class DDChecklistApp:
69
  )
70
 
71
  # Initialize services (will be loaded when needed)
72
- self.model = None
73
- self.service = None
74
  self.agent = None
75
 
76
  def _init_session_state(self):
77
- """Initialize Streamlit session state variables"""
78
- defaults = {
79
  'documents': {},
80
  'chunks': [],
81
  'embeddings': None,
82
- 'checklist': {},
83
  'checklist_results': {},
84
- 'questions': [],
85
  'question_answers': {},
86
- 'strategy_text': "",
87
- 'strategy_analysis': "",
88
  'company_summary': "",
 
89
  'agent': None,
90
- 'doc_embeddings_data': None,
91
- 'just_processed': False,
92
- 'is_processing': False,
93
- 'trigger_processing': False,
94
- 'processing_path': None
95
  }
96
 
97
- for key, default_value in defaults.items():
98
  if key not in st.session_state:
99
  st.session_state[key] = default_value
100
-
101
- @st.cache_resource
102
- def load_model(_self) -> SentenceTransformer:
103
- """Load the sentence transformer model"""
104
- with ErrorHandler("Failed to load AI model"):
105
- return SentenceTransformer(_self.config.model.sentence_transformer_model)
106
-
107
  def initialize_services(self):
108
  """Initialize core services"""
109
- if self.model is None:
110
- self.model = self.load_model()
111
-
112
- if self.service is None:
113
- self.service = DDChecklistService(self.model, self.agent)
114
 
115
  # Restore document processor state from session state if available
116
  if (hasattr(st.session_state, 'chunks') and st.session_state.chunks and
117
  hasattr(st.session_state, 'embeddings') and st.session_state.embeddings is not None):
118
 
119
- self.service.document_processor.chunks = st.session_state.chunks
120
- self.service.document_processor.embeddings = st.session_state.embeddings
121
- self.service.document_processor.documents = st.session_state.get('documents', {})
122
-
123
- # Ensure the document processor has the model
124
- self.service.document_processor.model = self.model
125
 
126
  def setup_ai_agent(self, api_key: str, model_choice: str) -> bool:
127
  """
128
- Setup AI agent if enabled
129
 
130
  Args:
131
  api_key: Anthropic API key
@@ -133,11 +123,7 @@ class DDChecklistApp:
133
 
134
  Returns:
135
  True if agent was successfully initialized
136
- """
137
- if not LLM_AVAILABLE or not DDChecklistAgent:
138
- show_error("AI packages not installed")
139
- return False
140
-
141
  try:
142
  with st.spinner("Initializing AI agent..."):
143
  agent = DDChecklistAgent(api_key, model_choice)
@@ -147,9 +133,6 @@ class DDChecklistApp:
147
  self.agent = agent
148
  show_success("✅ AI Agent ready")
149
 
150
- # Update service with agent
151
- if self.service:
152
- self.service.report_generator = ReportGenerator(agent)
153
 
154
  return True
155
  else:
@@ -198,15 +181,13 @@ class DDChecklistApp:
198
  self.agent = None
199
 
200
  return selected_data_room_path, use_ai_features, process_button
201
-
202
 
203
  def render_summary_tab(self):
204
- """Render the summary and analysis tab"""
205
  # Strategy selector
206
  strategy_path, strategy_text = render_file_selector(
207
  self.config.paths.strategy_dir, "Strategy", "tab"
208
  )
209
- st.session_state.strategy_text = strategy_text
210
 
211
  # Check if we have documents to display summaries
212
  if st.session_state.documents:
@@ -214,113 +195,115 @@ class DDChecklistApp:
214
  overview_tab, analysis_tab = st.tabs(["🏢 Company Overview", "🎯 Strategic Analysis"])
215
 
216
  with overview_tab:
217
- self._render_company_overview()
218
 
219
  with analysis_tab:
220
- self._render_strategic_analysis()
221
  else:
222
  show_info("👈 Configure and process data room to see analysis")
223
 
224
- def _render_company_overview(self):
225
- """Render company overview section"""
226
- # Auto-generate summary if not already present and AI is available
227
- if (not st.session_state.company_summary and
228
- hasattr(st.session_state, 'agent') and st.session_state.agent):
229
-
230
- with st.spinner("🤖 Generating company overview..."):
231
- report_gen = ReportGenerator(st.session_state.agent)
232
- data_room_name = Path(list(st.session_state.documents.keys())[0]).parent.name if st.session_state.documents else "Unknown"
233
- st.session_state.company_summary = report_gen.generate_company_summary(
234
- st.session_state.documents, data_room_name
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  )
236
 
237
- # Display the company summary if available
238
- if st.session_state.company_summary:
239
- st.markdown(st.session_state.company_summary)
240
 
241
  # Add export and regenerate buttons
 
 
 
 
 
242
  col1, col2 = st.columns([1, 5])
243
  with col1:
 
 
 
244
  st.download_button(
245
  "📥 Export Summary",
246
- data=f"# Company Overview\n\n{st.session_state.company_summary}",
247
- file_name=f"company_overview_{Path(list(st.session_state.documents.keys())[0]).parent.name if st.session_state.documents else 'export'}.md",
248
  mime="text/markdown",
249
- key="export_company_summary"
250
  )
251
  with col2:
252
- if st.button("🔄 Regenerate Overview"):
253
- st.session_state.company_summary = ""
254
  st.rerun()
255
-
256
- def _render_strategic_analysis(self):
257
- """Render strategic analysis section"""
258
- if not st.session_state.checklist_results:
259
- st.warning("⚠️ Process data room with checklist first to enable strategic analysis")
260
- return
261
-
262
- # Auto-generate analysis if not already present and AI is available
263
- if (not st.session_state.strategy_analysis and
264
- hasattr(st.session_state, 'agent') and st.session_state.agent):
265
-
266
- with st.spinner("🤖 Generating strategic analysis..."):
267
- report_gen = ReportGenerator(st.session_state.agent)
268
- st.session_state.strategy_analysis = report_gen.generate_strategic_analysis(
269
- st.session_state.strategy_text,
270
- st.session_state.checklist_results,
271
- st.session_state.documents
272
- )
273
-
274
- if st.session_state.strategy_analysis:
275
- st.markdown(st.session_state.strategy_analysis)
276
-
277
- # Add export and regenerate buttons
278
- col1, col2, col3 = st.columns([1, 1, 3])
279
  with col1:
280
- # Combined report export
281
  combined_report = f"# Due Diligence Report\n\n"
282
- combined_report += f"## Company Overview\n\n{st.session_state.company_summary}\n\n"
283
- combined_report += f"## Strategic Analysis\n\n{st.session_state.strategy_analysis}"
284
 
 
 
 
285
  st.download_button(
286
  "📥 Export Report",
287
  data=combined_report,
288
- file_name=f"dd_report_{Path(list(st.session_state.documents.keys())[0]).parent.name if st.session_state.documents else 'export'}.md",
289
  mime="text/markdown",
290
- key="export_combined_report"
291
  )
292
  with col2:
293
- if st.button("🔄 Regenerate Analysis"):
294
- st.session_state.strategy_analysis = ""
295
  st.rerun()
296
 
297
- def render_checklist_tab(self):
298
- """Render the checklist matching tab"""
299
- # Checklist selector
300
- checklist_path, checklist_text = render_file_selector(
301
- self.config.paths.checklist_dir, "Checklist", "tab"
302
- )
303
-
304
- if not checklist_text:
305
- show_error("No checklists found in data/checklist directory")
306
- return
307
-
308
- # Render results if available
309
- render_checklist_results(st.session_state.checklist_results)
310
-
311
- def render_questions_tab(self):
312
- """Render the questions tab"""
313
- # Question list selector
314
- questions_path, questions_text = render_file_selector(
315
- self.config.paths.questions_dir, "Question List", "tab"
316
- )
317
-
318
- if not questions_text:
319
- show_info("No question lists found in data/questions/")
320
- return
321
-
322
- # Render results if available
323
- render_question_results(st.session_state.question_answers)
324
 
325
  def render_qa_tab(self):
326
  """Render the Q&A with citations tab"""
@@ -346,13 +329,14 @@ class DDChecklistApp:
346
 
347
  def _handle_qa_query(self, question: str):
348
  """Handle Q&A query and display results"""
349
- if not self.service:
350
  self.initialize_services()
351
 
352
  # Use lower threshold for Q&A to get more relevant results
353
  qa_threshold = 0.25
354
 
355
- results = self.service.search_documents(
 
356
  question,
357
  top_k=self.config.ui.top_k_search_results,
358
  threshold=qa_threshold
@@ -369,7 +353,9 @@ class DDChecklistApp:
369
  context = "\n\n".join([f"From {r['source']}:\n{r['text']}" for r in results[:3]])
370
  # Use LLM directly for more reliable answers
371
  from langchain_core.messages import HumanMessage
372
- prompt = f"Question: {question}\n\nRelevant document excerpts:\n{context}\n\nProvide a comprehensive answer with citations to the sources."
 
 
373
  response = st.session_state.agent.llm.invoke([HumanMessage(content=prompt)])
374
  # Clean up any leading whitespace and escape math characters
375
  answer_text = escape_markdown_math(response.content.strip())
@@ -389,10 +375,7 @@ class DDChecklistApp:
389
  # Create clickable link for the document
390
  doc_path = result.get('path', result.get('full_path', ''))
391
  doc_name = result['source']
392
- if '.' in doc_name:
393
- doc_title = doc_name.rsplit('.', 1)[0].replace('_', ' ').replace('-', ' ').title()
394
- else:
395
- doc_title = doc_name.replace('_', ' ').replace('-', ' ').title()
396
 
397
  if doc_path:
398
  link_html = create_document_link(doc_path, doc_name, doc_title)
@@ -419,17 +402,7 @@ class DDChecklistApp:
419
  file_bytes = f.read()
420
 
421
  # Determine MIME type based on file extension
422
- file_extension = file_path.suffix.lower()
423
- if file_extension == '.pdf':
424
- mime_type = 'application/pdf'
425
- elif file_extension in ['.doc', '.docx']:
426
- mime_type = 'application/msword'
427
- elif file_extension == '.txt':
428
- mime_type = 'text/plain'
429
- elif file_extension == '.md':
430
- mime_type = 'text/markdown'
431
- else:
432
- mime_type = 'application/octet-stream'
433
 
434
  button_key = f"qacit_dl_{idx}_{question[:20]}".replace(" ", "_").replace("?", "")
435
 
@@ -444,238 +417,107 @@ class DDChecklistApp:
444
  except Exception as e:
445
  st.error(f"Download failed: {str(e)}")
446
 
447
- @handle_exceptions(show_error=True)
448
  def process_data_room(self, data_room_path: str):
449
- """
450
- Process the selected data room
451
-
452
- Args:
453
- data_room_path: Path to the data room to process
454
- """
455
  if not Path(data_room_path).exists():
456
  show_error(f"Data room path not found: {data_room_path}")
457
- st.session_state.is_processing = False # Reset flag on error
458
  return
459
 
460
- try:
461
- # Initialize services
462
  self.initialize_services()
 
 
463
 
464
- # Create progress container
465
- progress_container = st.container()
466
-
467
- with progress_container:
468
- st.markdown("### 🚀 Processing Data Room")
469
-
470
- # Define step weights based on expected complexity/duration
471
- step_weights = {
472
- 1: 1.0, # Scanning data room (fast)
473
- 2: 0.5, # Found documents (instant)
474
- 3: 8.0, # Generate AI summaries (very slow - depends on doc count)
475
- 4: 0.5, # AI summaries complete (instant)
476
- 5: 1.0, # Loading checklist and questions (fast)
477
- 6: 0.5, # Checklist and questions loaded (instant)
478
- 7: 3.0, # Generate checklist descriptions (moderate)
479
- 8: 0.5, # Descriptions generated (instant)
480
- 9: 2.0, # Match checklist to documents (moderate)
481
- 10: 0.5, # Checklist matching complete (instant)
482
- 11: 2.0, # Answer questions (moderate)
483
- 12: 0.5 # Complete (instant)
484
  }
485
 
486
- tracker = ProgressTracker(12, "Processing", step_weights)
487
-
488
- # Step 1: Load documents with parallel processing
489
- tracker.update(1, f"Scanning data room: {Path(data_room_path).name}")
490
-
491
- # Create a progress bar for detailed document loading progress
492
- doc_progress_placeholder = st.empty()
493
- with doc_progress_placeholder.container():
494
- doc_progress_bar = st.progress(0, text="Initializing document scan...")
495
 
496
- # Use parallel processing with progress tracking (max_workers=4 as specified)
497
- load_results = self.service.document_processor.load_data_room_with_progress(
498
- data_room_path,
499
- max_workers=4,
500
- progress_bar=doc_progress_bar
 
 
501
  )
502
 
503
- # Clear the detailed progress bar
504
- doc_progress_placeholder.empty()
505
-
506
- st.session_state.documents = self.service.document_processor.documents
507
- st.session_state.chunks = self.service.document_processor.chunks
508
- st.session_state.embeddings = self.service.document_processor.embeddings
509
-
510
- # Display performance metrics
511
- if 'performance' in load_results:
512
- perf = load_results['performance']
513
- tracker.update(2, f"Found {load_results['documents_count']} documents in {perf['total_time']:.1f}s "
514
- f"({perf['documents_per_second']:.1f} docs/sec)")
515
- logger.info(f"Document loading performance: {perf}")
516
- else:
517
- tracker.update(2, f"Found {load_results['documents_count']} documents")
518
-
519
- # Step 2: Generate AI summaries if agent available
520
- if hasattr(st.session_state, 'agent') and st.session_state.agent:
521
- doc_count = len(st.session_state.documents)
522
- tracker.update(3, f"Generating AI summaries for {doc_count} documents...")
523
-
524
- # Adjust weight for step 3 based on actual document count
525
- # More documents = longer processing time
526
- if doc_count > 50:
527
- step_weights[3] = min(15.0, doc_count * 0.15) # Scale with doc count, cap at 15
528
- elif doc_count > 20:
529
- step_weights[3] = doc_count * 0.2 # 4-10 weight for 20-50 docs
530
-
531
- # Recalculate total weight
532
- tracker.total_weight = sum(step_weights.values())
533
-
534
- # Convert documents for summarization
535
- docs_for_summary = []
536
- for path, doc_info in st.session_state.documents.items():
537
- docs_for_summary.append({
538
- 'name': doc_info['name'],
539
- 'path': doc_info['rel_path'],
540
- 'content': doc_info.get('content', '')[:1500],
541
- 'metadata': doc_info.get('metadata', {})
542
- })
543
-
544
- # Create a separate progress tracker for batch summarization
545
- st.session_state.summary_progress = st.progress(0, text="📝 Starting document summarization...")
546
-
547
- # Batch summarize
548
- summarized_docs = batch_summarize_documents(
549
- docs_for_summary,
550
- st.session_state.agent.llm,
551
- batch_size=self.config.processing.batch_size
552
- )
553
-
554
- # Clean up summary progress tracker
555
- if 'summary_progress' in st.session_state:
556
- st.session_state.summary_progress.progress(1.0, text="✅ Document summarization complete")
557
- del st.session_state.summary_progress
558
-
559
- # Store summaries
560
- for doc in summarized_docs:
561
- for path, doc_info in st.session_state.documents.items():
562
- if doc_info['rel_path'] == doc['path']:
563
- doc_info['summary'] = doc.get('summary', '')
564
-
565
- # Create embeddings using summaries
566
- st.session_state.doc_embeddings_data = create_document_embeddings_with_summaries(
567
- summarized_docs, self.model
568
- )
569
-
570
- tracker.update(4, f"AI summaries complete ({doc_count} documents processed)")
571
- else:
572
- tracker.update(4, "Skipping AI summaries (not enabled)")
573
-
574
- # Step 3: Parse checklist and questions
575
- tracker.update(5, "Loading checklist and questions...")
576
-
577
- # Load default checklist
578
- checklist_text = self._load_default_file(self.config.paths.checklist_path, "*.md")
579
- if checklist_text:
580
- st.session_state.checklist = self.service.checklist_parser.parse_checklist(checklist_text)
581
-
582
- # Load default questions
583
- questions_text = self._load_default_file(self.config.paths.questions_path, "*.md")
584
- if questions_text:
585
- st.session_state.questions = self.service.question_parser.parse_questions(questions_text)
586
-
587
- tracker.update(6, "Checklist and questions loaded")
588
-
589
- # Step 7: Generate checklist descriptions if AI is available
590
- if (hasattr(st.session_state, 'agent') and st.session_state.agent and
591
- st.session_state.checklist):
592
-
593
- tracker.update(7, "Generating checklist item descriptions...")
594
-
595
- # Create progress tracker for descriptions
596
- st.session_state.description_progress = st.progress(0, text="📝 Generating descriptions...")
597
-
598
- # Generate enhanced descriptions for better matching
599
- st.session_state.checklist = generate_checklist_descriptions(
600
- st.session_state.checklist,
601
- st.session_state.agent.llm,
602
- batch_size=self.config.processing.batch_size
603
- )
604
-
605
- # Clean up progress tracker
606
- if 'description_progress' in st.session_state:
607
- st.session_state.description_progress.progress(1.0, text="✅ Descriptions generated")
608
- del st.session_state.description_progress
609
-
610
- tracker.update(8, "Checklist descriptions generated")
611
- else:
612
- tracker.update(8, "Skipping description generation (AI not enabled)")
613
-
614
- # Step 9: Match checklist to documents
615
- if st.session_state.checklist and st.session_state.chunks:
616
- tracker.update(9, "Matching checklist to documents...")
617
-
618
- if hasattr(st.session_state, 'doc_embeddings_data') and st.session_state.doc_embeddings_data:
619
- # Use AI-enhanced matching with generated descriptions
620
- st.session_state.checklist_results = match_checklist_with_summaries(
621
- st.session_state.checklist,
622
- st.session_state.doc_embeddings_data,
623
- self.model,
624
- self.config.processing.similarity_threshold
625
- )
626
- else:
627
- # Use traditional matching
628
- st.session_state.checklist_results = self.service.checklist_matcher.match_checklist_to_documents(
629
- st.session_state.checklist,
630
- st.session_state.chunks,
631
- st.session_state.embeddings,
632
- self.config.processing.similarity_threshold
633
- )
634
-
635
- tracker.update(10, "Checklist matching complete")
636
-
637
- # Step 11: Answer questions
638
- if (st.session_state.questions and st.session_state.chunks and
639
- st.session_state.embeddings is not None):
640
-
641
- tracker.update(11, "Answering due diligence questions...")
642
-
643
- st.session_state.question_answers = self.service.question_answerer.answer_questions_with_chunks(
644
- st.session_state.questions,
645
- st.session_state.chunks,
646
- st.session_state.embeddings,
647
- self.config.processing.similarity_threshold
648
- )
649
-
650
- answered_count = sum(1 for a in st.session_state.question_answers.values() if a['has_answer'])
651
- tracker.update(12, f"Answered {answered_count}/{len(st.session_state.questions)} questions")
652
-
653
- tracker.complete("Processing complete!")
654
 
655
- # Small delay before clearing
656
- import time
657
- time.sleep(1.5)
658
- progress_container.empty()
659
-
660
- # Reset processing flag and mark as just processed on success
661
- st.session_state.is_processing = False
662
- st.session_state.just_processed = True
663
- st.rerun()
664
-
665
- except Exception:
666
- # Reset processing flag on any error
667
- st.session_state.is_processing = False
668
- raise # Let decorator handle error display
669
-
670
- def _load_default_file(self, directory: Path, pattern: str) -> str:
671
- """Load the first file matching pattern from directory"""
672
- try:
673
- files = list(directory.glob(pattern))
674
- if files:
675
- return files[0].read_text(encoding='utf-8')
676
- except Exception as e:
677
- logger.warning(f"Could not load default file from {directory}: {e}")
678
- return ""
679
 
680
  def run(self):
681
  """Run the main application"""
@@ -698,33 +540,20 @@ class DDChecklistApp:
698
  self.render_summary_tab()
699
 
700
  with tab2:
701
- self.render_checklist_tab()
702
 
703
  with tab3:
704
- self.render_questions_tab()
705
 
706
  with tab4:
707
  self.render_qa_tab()
708
 
709
- # Show success message if just processed
710
- if st.session_state.just_processed:
711
- show_success("✅ Data room processing complete! View results in the tabs above.")
712
- st.session_state.just_processed = False
713
 
714
- # Handle processing trigger
715
  if process_button and selected_data_room_path and not st.session_state.is_processing:
716
- # Set trigger and path for next render
717
- st.session_state.trigger_processing = True
718
- st.session_state.processing_path = selected_data_room_path
719
  st.session_state.is_processing = True
720
- st.rerun()
721
-
722
- # Execute processing if triggered
723
- if st.session_state.trigger_processing and st.session_state.processing_path:
724
- st.session_state.trigger_processing = False # Reset trigger
725
- processing_path = st.session_state.processing_path
726
- st.session_state.processing_path = None
727
- self.process_data_room(processing_path)
728
 
729
 
730
  def main():
 
7
  """
8
 
9
  import os
10
+ import warnings
11
  # Fix tokenizers parallelism warning early
12
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
13
 
14
+ # Suppress all LangChain verbose warnings globally
15
+ warnings.filterwarnings("ignore", category=UserWarning, module="langchain")
16
+ warnings.filterwarnings("ignore", category=UserWarning, module="langchain_core")
17
+ warnings.filterwarnings("ignore", category=UserWarning, module="langchain_community")
18
+ warnings.filterwarnings("ignore", category=UserWarning, module="langchain_huggingface")
19
+ warnings.filterwarnings("ignore", message=".*Relevance scores must be between.*")
20
+ warnings.filterwarnings("ignore", message=".*No relevant docs were retrieved.*")
21
+
22
+ # Set up LangChain logging levels early
23
+ import logging
24
+ logging.getLogger("langchain").setLevel(logging.ERROR)
25
+ logging.getLogger("langchain_core").setLevel(logging.ERROR)
26
+ logging.getLogger("langchain_community").setLevel(logging.ERROR)
27
+ logging.getLogger("langchain_huggingface").setLevel(logging.ERROR)
28
+
29
  import streamlit as st
30
+
 
31
  from pathlib import Path
32
+ from typing import Dict
33
 
34
  # Import our refactored modules
35
  from src import (
36
+ init_config, DocumentProcessor,
37
+ logger,
38
+ render_project_selector,
39
+ render_ai_settings, escape_markdown_math,
40
+ get_mime_type, format_document_title
41
  )
42
+ from src.document_processing import safe_execute
43
+ # Using Streamlit directly for simplicity
44
  from src.ui_components import (
45
+ render_file_selector, render_checklist_results, render_question_results,
46
+ render_quick_questions, create_document_link
47
+ )
48
+ from src.services import (
49
+ search_documents
50
  )
 
 
51
 
52
+ from src.config import show_success, show_error, show_info
53
 
54
+ # Import LangGraph + Anthropic configuration
55
+ from src.ai import (
56
+ DDChecklistAgent
57
+ )
58
 
59
 
60
  class DDChecklistApp:
61
  """
62
  Main application class that orchestrates all components
63
  """
64
+
65
  def __init__(self):
66
  """Initialize the application"""
67
  # Initialize configuration
68
+ self.config = init_config()
69
 
70
  # Initialize session state
71
  self._init_session_state()
 
78
  )
79
 
80
  # Initialize services (will be loaded when needed)
81
+ self.model_name = self.config.model.sentence_transformer_model
82
+ self.document_processor = None
83
  self.agent = None
84
 
85
  def _init_session_state(self):
86
+ """Initialize essential session state variables only"""
87
+ essential_defaults = {
88
  'documents': {},
89
  'chunks': [],
90
  'embeddings': None,
 
91
  'checklist_results': {},
 
92
  'question_answers': {},
 
 
93
  'company_summary': "",
94
+ 'strategy_analysis': "",
95
  'agent': None,
96
+ 'is_processing': False
 
 
 
 
97
  }
98
 
99
+ for key, default_value in essential_defaults.items():
100
  if key not in st.session_state:
101
  st.session_state[key] = default_value
102
+
 
 
 
 
 
 
103
  def initialize_services(self):
104
  """Initialize core services"""
105
+ if self.document_processor is None:
106
+ self.document_processor = DocumentProcessor(self.model_name)
 
 
 
107
 
108
  # Restore document processor state from session state if available
109
  if (hasattr(st.session_state, 'chunks') and st.session_state.chunks and
110
  hasattr(st.session_state, 'embeddings') and st.session_state.embeddings is not None):
111
 
112
+ self.document_processor.chunks = st.session_state.chunks
113
+ self.document_processor.embeddings = st.session_state.embeddings
114
+ # Note: Don't restore documents here - they'll be recreated from chunks if needed
 
 
 
115
 
116
  def setup_ai_agent(self, api_key: str, model_choice: str) -> bool:
117
  """
118
+ Setup AI agent
119
 
120
  Args:
121
  api_key: Anthropic API key
 
123
 
124
  Returns:
125
  True if agent was successfully initialized
126
+ """
 
 
 
 
127
  try:
128
  with st.spinner("Initializing AI agent..."):
129
  agent = DDChecklistAgent(api_key, model_choice)
 
133
  self.agent = agent
134
  show_success("✅ AI Agent ready")
135
 
 
 
 
136
 
137
  return True
138
  else:
 
181
  self.agent = None
182
 
183
  return selected_data_room_path, use_ai_features, process_button
 
184
 
185
  def render_summary_tab(self):
186
+ """Render consolidated summary and analysis tab"""
187
  # Strategy selector
188
  strategy_path, strategy_text = render_file_selector(
189
  self.config.paths.strategy_dir, "Strategy", "tab"
190
  )
 
191
 
192
  # Check if we have documents to display summaries
193
  if st.session_state.documents:
 
195
  overview_tab, analysis_tab = st.tabs(["🏢 Company Overview", "🎯 Strategic Analysis"])
196
 
197
  with overview_tab:
198
+ self._render_report_section("overview", strategy_text=strategy_text)
199
 
200
  with analysis_tab:
201
+ self._render_report_section("strategic", strategy_text=strategy_text)
202
  else:
203
  show_info("👈 Configure and process data room to see analysis")
204
 
205
+ def _render_report_section(self, report_type: str, strategy_text: str = ""):
206
+ """Unified report rendering for both overview and strategic analysis"""
207
+ from src.services import generate_reports
208
+
209
+ summary_key = f"{report_type}_summary"
210
+
211
+ # Check prerequisites for strategic analysis
212
+ if report_type == "strategic" and not st.session_state.checklist_results:
213
+ st.warning("⚠️ Process data room with checklist first to enable strategic analysis")
214
+ return
215
+
216
+ # Auto-generate report if not already present and AI is available
217
+ if (not st.session_state.get(summary_key, "") and st.session_state.agent):
218
+ with st.spinner(f"🤖 Generating {report_type} analysis..."):
219
+ data_room_name = (Path(list(st.session_state.documents.keys())[0]).parent.name
220
+ if st.session_state.documents else "Unknown")
221
+
222
+ st.session_state[summary_key] = generate_reports(
223
+ st.session_state.documents,
224
+ data_room_name,
225
+ strategy_text,
226
+ st.session_state.checklist_results,
227
+ report_type,
228
+ st.session_state.agent.llm if st.session_state.agent else None
229
  )
230
 
231
+ # Display the report if available
232
+ if st.session_state.get(summary_key, ""):
233
+ st.markdown(st.session_state[summary_key])
234
 
235
  # Add export and regenerate buttons
236
+ self._render_report_actions(report_type, summary_key)
237
+
238
+ def _render_report_actions(self, report_type: str, summary_key: str):
239
+ """Render export and regenerate actions for reports"""
240
+ if report_type == "overview":
241
  col1, col2 = st.columns([1, 5])
242
  with col1:
243
+ company_name = (Path(list(st.session_state.documents.keys())[0]).parent.name
244
+ if st.session_state.documents else 'export')
245
+ file_name = f"company_overview_{company_name}.md"
246
  st.download_button(
247
  "📥 Export Summary",
248
+ data=f"# Company Overview\n\n{st.session_state[summary_key]}",
249
+ file_name=file_name,
250
  mime="text/markdown",
251
+ key=f"export_{summary_key}"
252
  )
253
  with col2:
254
+ if st.button(f"🔄 Regenerate {report_type.title()}"):
255
+ st.session_state[summary_key] = ""
256
  st.rerun()
257
+ else:
258
+ col1, col2 = st.columns([1, 5])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  with col1:
260
+ # Combined report export for strategic analysis
261
  combined_report = f"# Due Diligence Report\n\n"
262
+ combined_report += f"## Company Overview\n\n{st.session_state.get('overview_summary', '')}\n\n"
263
+ combined_report += f"## Strategic Analysis\n\n{st.session_state[summary_key]}"
264
 
265
+ company_name = (Path(list(st.session_state.documents.keys())[0]).parent.name
266
+ if st.session_state.documents else 'export')
267
+ file_name = f"dd_report_{company_name}.md"
268
  st.download_button(
269
  "📥 Export Report",
270
  data=combined_report,
271
+ file_name=file_name,
272
  mime="text/markdown",
273
+ key=f"export_combined_{summary_key}"
274
  )
275
  with col2:
276
+ if st.button(f"🔄 Regenerate {report_type.title()}"):
277
+ st.session_state[summary_key] = ""
278
  st.rerun()
279
 
280
+ def render_analysis_tab(self, tab_type: str):
281
+ """Unified rendering for checklist and questions tabs"""
282
+ if tab_type == "checklist":
283
+ # Checklist selector
284
+ file_path, file_text = render_file_selector(
285
+ self.config.paths.checklist_dir, "Checklist", "tab"
286
+ )
287
+
288
+ if not file_text:
289
+ show_error("No checklists found in data/checklist directory")
290
+ return
291
+
292
+ # Render results if available
293
+ render_checklist_results(st.session_state.checklist_results)
294
+
295
+ elif tab_type == "questions":
296
+ # Question list selector
297
+ file_path, file_text = render_file_selector(
298
+ self.config.paths.questions_dir, "Question List", "tab"
299
+ )
300
+
301
+ if not file_text:
302
+ show_info("No question lists found in data/questions/")
303
+ return
304
+
305
+ # Render results if available
306
+ render_question_results(st.session_state.question_answers)
307
 
308
  def render_qa_tab(self):
309
  """Render the Q&A with citations tab"""
 
329
 
330
  def _handle_qa_query(self, question: str):
331
  """Handle Q&A query and display results"""
332
+ if not self.document_processor:
333
  self.initialize_services()
334
 
335
  # Use lower threshold for Q&A to get more relevant results
336
  qa_threshold = 0.25
337
 
338
+ results = search_documents(
339
+ self.document_processor,
340
  question,
341
  top_k=self.config.ui.top_k_search_results,
342
  threshold=qa_threshold
 
353
  context = "\n\n".join([f"From {r['source']}:\n{r['text']}" for r in results[:3]])
354
  # Use LLM directly for more reliable answers
355
  from langchain_core.messages import HumanMessage
356
+ prompt = (f"Question: {question}\n\n"
357
+ f"Relevant document excerpts:\n{context}\n\n"
358
+ f"Provide a comprehensive answer with citations to the sources.")
359
  response = st.session_state.agent.llm.invoke([HumanMessage(content=prompt)])
360
  # Clean up any leading whitespace and escape math characters
361
  answer_text = escape_markdown_math(response.content.strip())
 
375
  # Create clickable link for the document
376
  doc_path = result.get('path', result.get('full_path', ''))
377
  doc_name = result['source']
378
+ doc_title = format_document_title(doc_name)
 
 
 
379
 
380
  if doc_path:
381
  link_html = create_document_link(doc_path, doc_name, doc_title)
 
402
  file_bytes = f.read()
403
 
404
  # Determine MIME type based on file extension
405
+ mime_type = get_mime_type(file_path)
406
 
407
  button_key = f"qacit_dl_{idx}_{question[:20]}".replace(" ", "_").replace("?", "")
408
 
 
417
  except Exception as e:
418
  st.error(f"Download failed: {str(e)}")
419
 
 
420
  def process_data_room(self, data_room_path: str):
421
+ """Simplified data room processing"""
 
 
 
 
 
422
  if not Path(data_room_path).exists():
423
  show_error(f"Data room path not found: {data_room_path}")
424
+ st.session_state.is_processing = False
425
  return
426
 
427
+ # Use safe_execute for the entire processing operation
428
+ def process_operation():
429
  self.initialize_services()
430
+ # Simple processing - load documents
431
+ self.document_processor.load_data_room(data_room_path)
432
 
433
+ # Store results in session state with simplified structure
434
+ # Convert list of LangChain documents to dictionary format expected by UI
435
+ documents_dict = {}
436
+ for doc in self.document_processor.documents:
437
+ file_path = doc.metadata.get('source', doc.metadata.get('path', 'unknown'))
438
+ documents_dict[file_path] = {
439
+ 'name': doc.metadata.get('name', Path(file_path).name if file_path != 'unknown' else 'unknown'),
440
+ 'path': doc.metadata.get('path', ''),
441
+ 'content': doc.page_content,
442
+ 'metadata': doc.metadata
 
 
 
 
 
 
 
 
 
 
443
  }
444
+
445
+ st.session_state.documents = documents_dict
446
+ st.session_state.chunks = self.document_processor.chunks
447
+ st.session_state.embeddings = self.document_processor.embeddings
448
+
449
+ # Process checklist and questions if available
450
+ self._process_checklist_and_questions()
451
+
452
+ # Clear any existing analysis to trigger regeneration
453
+ st.session_state.company_summary = ""
454
+ st.session_state.strategy_analysis = ""
455
+ st.session_state.overview_summary = ""
456
+ st.session_state.strategic_summary = ""
457
+
458
+ show_success("✅ Data room processing complete! View results in the tabs above.")
459
+ st.rerun()
460
+
461
+ safe_execute(
462
+ process_operation,
463
+ None,
464
+ "Data room processing"
465
+ )
466
+
467
+ st.session_state.is_processing = False
468
+
469
+ def _process_checklist_and_questions(self):
470
+ """Process checklist and questions after documents are loaded"""
471
+ from src.services import parse_checklist, parse_questions, create_vector_store, search_and_analyze, load_default_file
472
+
473
+ # Load default checklist if available
474
+ checklist_text = load_default_file(Path(self.config.paths.checklist_dir), "*.md")
475
+ if checklist_text and self.document_processor.chunks:
476
+ try:
477
+ # Parse checklist
478
+ checklist = parse_checklist(checklist_text)
479
+ st.session_state.checklist = checklist
480
 
481
+ # Create vector store from chunks for processing
482
+ vector_store = create_vector_store(self.document_processor.chunks, self.model_name)
483
 
484
+ # Process checklist items
485
+ checklist_results = search_and_analyze(
486
+ checklist,
487
+ vector_store,
488
+ self.agent.llm if self.agent else None,
489
+ self.config.processing.similarity_threshold,
490
+ 'items'
491
  )
492
+ st.session_state.checklist_results = checklist_results
493
+ logger.info("✅ Checklist processing completed")
494
+ except Exception as e:
495
+ logger.error(f"Checklist processing failed: {e}")
496
+
497
+ # Load default questions if available
498
+ questions_text = load_default_file(Path(self.config.paths.questions_dir), "*.md")
499
+ if questions_text and self.document_processor.chunks:
500
+ try:
501
+ # Parse questions
502
+ questions = parse_questions(questions_text)
503
+ st.session_state.questions = questions
504
 
505
+ # Create vector store from chunks for processing (reuse if already created)
506
+ if 'vector_store' not in locals():
507
+ vector_store = create_vector_store(self.document_processor.chunks, self.model_name)
508
 
509
+ # Process questions
510
+ question_answers = search_and_analyze(
511
+ questions,
512
+ vector_store,
513
+ self.agent.llm if self.agent else None,
514
+ self.config.processing.relevancy_threshold,
515
+ 'questions'
516
+ )
517
+ st.session_state.question_answers = question_answers
518
+ logger.info("✅ Questions processing completed")
519
+ except Exception as e:
520
+ logger.error(f"Questions processing failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
521
 
522
  def run(self):
523
  """Run the main application"""
 
540
  self.render_summary_tab()
541
 
542
  with tab2:
543
+ self.render_analysis_tab("checklist")
544
 
545
  with tab3:
546
+ self.render_analysis_tab("questions")
547
 
548
  with tab4:
549
  self.render_qa_tab()
550
 
551
+ # Processing complete message is handled in process_data_room function
 
 
 
552
 
553
+ # Simplified processing trigger
554
  if process_button and selected_data_room_path and not st.session_state.is_processing:
 
 
 
555
  st.session_state.is_processing = True
556
+ self.process_data_room(selected_data_room_path)
557
 
558
 
559
  def main():
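The reworked _handle_qa_query above now calls search_documents(self.document_processor, question, top_k=..., threshold=qa_threshold) from src.services, whose implementation is not part of this commit. A rough sketch of the contract the caller relies on (a list of dicts with source, path and text keys, filtered by a relevance threshold), assuming the document processor exposes a LangChain FAISS store as vector_store:

def search_documents(document_processor, query, top_k=5, threshold=0.25):
    """Return the chunks most relevant to the query, keeping only hits above the threshold."""
    # Assumption: document_processor.vector_store is a langchain_community FAISS instance.
    hits = document_processor.vector_store.similarity_search_with_relevance_scores(query, k=top_k)
    return [
        {
            "source": doc.metadata.get("name", "unknown"),
            "path": doc.metadata.get("source", ""),
            "text": doc.page_content,
            "score": score,
        }
        for doc, score in hits
        if score >= threshold
    ]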
pyproject.toml CHANGED
@@ -8,22 +8,31 @@ dependencies = [
      "streamlit>=1.28.0",
      "sentence-transformers>=2.2.0",
      "numpy>=1.24.0",
-     "pandas>=2.0.0",
-     "watchdog>=3.0.0",
      # Document processing
      "pymupdf>=1.23.0",
      "python-docx>=0.8.11",
      # Environment and configuration
      "python-dotenv>=1.0.0",
+     "pydantic-settings>=2.10.1",
+     "markdown>=3.8.2",
      # Vector store
      "faiss-cpu>=1.7.4",
-     # AI Enhancement
+     # AI Enhancement - LangChain packages
      "langchain-anthropic>=0.1.0",
      "langgraph>=0.0.20",
      "langchain-core>=0.1.0",
      "langchain-text-splitters>=0.3.10",
+     "langchain-community>=0.3.29",
+     "langchain-huggingface>=0.3.1",
+     "pypdf>=6.0.0",
+     "watchdog>=6.0.0",
  ]
 
  [build-system]
  requires = ["setuptools", "wheel"]
  build-backend = "setuptools.build_meta"
+
+ [dependency-groups]
+ dev = [
+     "autoflake>=2.3.1",
+ ]
requirements.txt CHANGED
@@ -2,22 +2,24 @@
  streamlit==1.49.1
  sentence-transformers==5.1.0
  numpy==2.3.2
- pandas==2.3.2
- watchdog==6.0.0
 
  # Document processing - pinned for deployment
  PyMuPDF==1.23.18
  python-docx==1.2.0
- joblib==1.5.2
 
  # Environment and configuration - pinned for deployment
  python-dotenv==1.1.1
+ pydantic-settings==2.8.1
+ markdown==3.9
 
  # Vector store - pinned for deployment
  faiss-cpu==1.12.0
 
- # AI Enhancement - pinned for deployment
+ # AI Enhancement - LangChain packages pinned for deployment
  langchain-anthropic==0.3.19
  langgraph==0.6.6
  langchain-core==0.3.75
- langchain-text-splitters==0.3.10
+ langchain-text-splitters==0.3.10
+ langchain-community==0.3.29
+ langchain-huggingface==0.3.1
+
src/__init__.py CHANGED
@@ -5,10 +5,11 @@ DD-Checklist Source Package
  This package contains the refactored components of the DD-Checklist application.
  """
 
- from .config import get_config, init_config, get_model_config, get_processing_config
- from .document_processing import DocumentProcessor, escape_markdown_math
- from .services import DDChecklistService, ChecklistParser, QuestionParser
- from .utils import logger, handle_exceptions, safe_execute, ErrorHandler
+ from .config import (
+     get_config, init_config, logger, show_success, show_error, show_info,
+     get_mime_type, format_document_title, count_documents_in_directory
+ )
+ from .document_processing import DocumentProcessor, escape_markdown_math, safe_execute
  from .ui_components import render_project_selector, render_ai_settings
 
  __version__ = "0.2.0"
@@ -17,24 +18,21 @@ __author__ = "DD-Checklist Team"
  __all__ = [
      # Configuration
      "get_config",
-     "init_config",
-     "get_model_config",
-     "get_processing_config",
+     "init_config",
 
      # Document Processing
      "DocumentProcessor",
      "escape_markdown_math",
+     "safe_execute",
 
-     # Services
-     "DDChecklistService",
-     "ChecklistParser",
-     "QuestionParser",
-
-     # Utilities
+     # Utilities (merged from utils.py)
      "logger",
-     "handle_exceptions",
-     "safe_execute",
-     "ErrorHandler",
+     "show_success",
+     "show_error",
+     "show_info",
+     "get_mime_type",
+     "format_document_title",
+     "count_documents_in_directory",
 
      # UI Components
      "render_project_selector",
src/ai/__init__.py CHANGED
@@ -6,70 +6,23 @@ This module provides AI-powered functionality for the DD-Checklist application,
6
  including LangGraph agents, document processing, and checklist matching.
7
  """
8
 
9
- # Try to import core components and set availability flag
10
- try:
11
- from .agent_core import DDChecklistAgent, get_langgraph_agent, LANGGRAPH_AVAILABLE
12
- from .llm_utilities import (
13
- batch_summarize_documents,
14
- create_document_embeddings_with_summaries,
15
- match_checklist_with_summaries,
16
- generate_checklist_descriptions,
17
- exponential_backoff_retry
18
- )
19
- from .agent_nodes import AgentState, TaskType
20
- from .prompts import (
21
- get_checklist_parsing_prompt,
22
- get_document_relevance_prompt,
23
- get_question_answering_prompt,
24
- get_findings_summary_prompt,
25
- get_description_generation_prompt,
26
- get_document_summarization_prompt
27
- )
28
-
29
- # Set availability flag based on successful imports
30
- AI_MODULE_AVAILABLE = LANGGRAPH_AVAILABLE
31
-
32
- except ImportError as e:
33
- # Handle missing dependencies gracefully
34
- print(f"AI module dependencies not available: {e}")
35
-
36
- # Create placeholder classes/functions for graceful degradation
37
- class DDChecklistAgent:
38
- def __init__(self, *args, **kwargs):
39
- self.app = None
40
- self.llm = None
41
-
42
- def is_available(self):
43
- return False
44
-
45
- def get_langgraph_agent(*args, **kwargs):
46
- return None
47
-
48
- def batch_summarize_documents(documents, *args, **kwargs):
49
- return documents
50
-
51
- def create_document_embeddings_with_summaries(documents, *args, **kwargs):
52
- return {'embeddings': [], 'documents': []}
53
-
54
- def match_checklist_with_summaries(*args, **kwargs):
55
- return {}
56
-
57
- def generate_checklist_descriptions(checklist, *args, **kwargs):
58
- return checklist
59
-
60
- def exponential_backoff_retry(func, *args, **kwargs):
61
- return func()
62
-
63
- # Set availability flags
64
- LANGGRAPH_AVAILABLE = False
65
- AI_MODULE_AVAILABLE = False
66
-
67
- # Placeholder classes for type hints
68
- class AgentState:
69
- pass
70
-
71
- class TaskType:
72
- pass
73
 
74
  # Export main public API
75
  __all__ = [
@@ -77,14 +30,9 @@ __all__ = [
77
  'DDChecklistAgent',
78
  'get_langgraph_agent',
79
 
80
- # LLM utility functions
81
- 'batch_summarize_documents',
82
- 'create_document_embeddings_with_summaries',
83
- 'match_checklist_with_summaries',
84
- 'generate_checklist_descriptions',
85
- 'exponential_backoff_retry',
86
 
87
- # Agent types and state
88
  'AgentState',
89
  'TaskType',
90
 
@@ -95,8 +43,4 @@ __all__ = [
95
  'get_findings_summary_prompt',
96
  'get_description_generation_prompt',
97
  'get_document_summarization_prompt',
98
-
99
- # Availability flags
100
- 'LANGGRAPH_AVAILABLE',
101
- 'AI_MODULE_AVAILABLE',
102
  ]
 
6
  including LangGraph agents, document processing, and checklist matching.
7
  """
8
 
9
+ # Import core components
10
+ from .prompts import (
11
+ get_checklist_parsing_prompt,
12
+ get_document_relevance_prompt,
13
+ get_question_answering_prompt,
14
+ get_findings_summary_prompt,
15
+ get_description_generation_prompt,
16
+ get_document_summarization_prompt
17
+ )
18
+
19
+ # Direct imports for AI functionality - assuming dependencies are present
20
+ from .agent_core import (
21
+ DDChecklistAgent,
22
+ get_langgraph_agent,
23
+ AgentState,
24
+ TaskType
25
+ )
26
 
27
  # Export main public API
28
  __all__ = [
 
30
  'DDChecklistAgent',
31
  'get_langgraph_agent',
32
 
33
+
 
 
 
 
 
34
 
35
+ # Agent types and state (now in agent_core)
36
  'AgentState',
37
  'TaskType',
38
 
 
43
  'get_findings_summary_prompt',
44
  'get_description_generation_prompt',
45
  'get_document_summarization_prompt',
 
 
 
 
46
  ]
src/ai/agent_core.py CHANGED
@@ -2,40 +2,548 @@
2
  """
3
  LangGraph Agent Core Module
4
 
5
- This module contains the main LangGraph agent setup and the high-level
6
  DDChecklistAgent class for interacting with the agent system.
 
 
7
  """
8
 
9
  import os
10
- from typing import Optional, Dict, List, Any, Tuple
11
-
12
- try:
13
- import streamlit as st
14
- from langchain_anthropic import ChatAnthropic
15
- from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, SystemMessage
16
- from langchain_core.tools import tool
17
- from langgraph.graph import StateGraph, END
18
- from langgraph.prebuilt import ToolNode
19
- from langgraph.checkpoint.memory import MemorySaver
20
- LANGGRAPH_AVAILABLE = True
21
- except ImportError:
22
- LANGGRAPH_AVAILABLE = False
23
- st = None
24
- ChatAnthropic = object
25
- BaseMessage = object
26
- HumanMessage = object
27
- AIMessage = object
28
- SystemMessage = object
29
 
30
  from ..config import get_config
31
- from .agent_nodes import (
32
- AgentState, TaskType,
33
- route_task, parse_checklist_node, match_checklist_node,
34
- answer_question_node, summarize_node, route_condition
 
 
 
 
35
  )
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- def get_langgraph_agent(api_key: Optional[str] = None, model: Optional[str] = None) -> Optional[Tuple[Any, ChatAnthropic]]:
39
  """
40
  Create a LangGraph agent with Anthropic
41
 
@@ -47,9 +555,6 @@ def get_langgraph_agent(api_key: Optional[str] = None, model: Optional[str] = No
47
  Tuple of (compiled_app, llm) or None if not available
48
  """
49
 
50
- if not LANGGRAPH_AVAILABLE:
51
- return None
52
-
53
  # Get configuration
54
  config = get_config()
55
 
@@ -165,7 +670,7 @@ class DDChecklistAgent:
165
 
166
  def is_available(self) -> bool:
167
  """Check if the agent is available for use"""
168
- return self.app is not None
169
 
170
  def parse_checklist(self, checklist_text: str) -> Optional[Dict]:
171
  """
@@ -189,8 +694,7 @@ class DDChecklistAgent:
189
 
190
  return result.get("checklist")
191
  except Exception as e:
192
- if st:
193
- st.error(f"Agent error: {str(e)}")
194
  return None
195
 
196
  def match_documents(self, checklist: Dict, documents: List[Dict]) -> Dict:
@@ -223,8 +727,7 @@ class DDChecklistAgent:
223
 
224
  return result.get("findings", {})
225
  except Exception as e:
226
- if st:
227
- st.error(f"Agent error: {str(e)}")
228
  return {}
229
 
230
  def answer_question(self, question: str, documents: List[Dict]) -> str:
 
2
  """
3
  LangGraph Agent Core Module
4
 
5
+ This module contains the main LangGraph agent setup, AI utilities, and the high-level
6
  DDChecklistAgent class for interacting with the agent system.
7
+
8
+ Merged from: agent_core.py, agent_nodes.py, llm_utilities.py
9
  """
10
 
11
  import os
12
+ import json
13
+ import time
14
+ import random
15
+ import logging
16
+ from typing import Optional, Dict, List, Any, Tuple, Sequence
17
+ from typing_extensions import TypedDict
18
+ from enum import Enum
19
+ import streamlit as st
20
+ from langchain_anthropic import ChatAnthropic
21
+ from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, SystemMessage
22
+ from langchain_core.tools import tool
23
+ from langchain_core.output_parsers import PydanticOutputParser
24
+ from langchain_community.vectorstores import FAISS
25
+ from langchain_huggingface import HuggingFaceEmbeddings
26
+ from langchain_core.documents import Document
27
+ from langgraph.graph import StateGraph, END
28
+
29
+ from langgraph.checkpoint.memory import MemorySaver
30
+ from pydantic import BaseModel, Field
31
 
32
  from ..config import get_config
33
+ from ..document_processing import safe_execute
34
+ from .prompts import (
35
+ get_checklist_parsing_prompt,
36
+ get_document_relevance_prompt,
37
+ get_question_answering_prompt,
38
+ get_findings_summary_prompt,
39
+ get_description_generation_prompt,
40
+ get_document_summarization_prompt
41
  )
42
 
43
+ logger = logging.getLogger(__name__)
44
+
45
+
46
+ # =============================================================================
47
+ # TYPE DEFINITIONS - Merged from agent_nodes.py
48
+ # =============================================================================
49
+
50
+ # Simple Pydantic models for structured output parsing
51
+ class SimpleChecklist(BaseModel):
52
+ """Simple model matching existing checklist structure"""
53
+ categories: Dict = Field(description="Checklist categories as they currently exist")
54
+
55
+
56
+ # Define the state for our agent
57
+ class AgentState(TypedDict):
58
+ """State for the due diligence agent"""
59
+ messages: Sequence[BaseMessage]
60
+ checklist: Optional[Dict]
61
+ documents: Optional[List[Dict]]
62
+ current_task: Optional[str]
63
+ findings: Dict[str, List[str]]
64
+ next_action: Optional[str]
65
+
66
+
67
+ class TaskType(Enum):
68
+ """Types of tasks the agent can perform"""
69
+ PARSE_CHECKLIST = "parse_checklist"
70
+ ANALYZE_DOCUMENT = "analyze_document"
71
+ MATCH_CHECKLIST = "match_checklist"
72
+ ANSWER_QUESTION = "answer_question"
73
+ SUMMARIZE_FINDINGS = "summarize_findings"
74
+
75
+
76
+ # =============================================================================
77
+ # AGENT NODE FUNCTIONS - Merged from agent_nodes.py
78
+ # =============================================================================
79
+
80
+ def route_task(state: AgentState) -> AgentState:
81
+ """Route to appropriate task based on current state"""
82
+ messages = state["messages"]
83
+ if not messages:
84
+ return state
85
+
86
+ last_message = messages[-1].content if messages else ""
87
+
88
+ # Determine next action based on message content
89
+ if "parse" in last_message.lower() and "checklist" in last_message.lower():
90
+ state["next_action"] = TaskType.PARSE_CHECKLIST.value
91
+ elif "analyze" in last_message.lower() or "match" in last_message.lower():
92
+ state["next_action"] = TaskType.MATCH_CHECKLIST.value
93
+ elif "?" in last_message:
94
+ state["next_action"] = TaskType.ANSWER_QUESTION.value
95
+ else:
96
+ state["next_action"] = TaskType.SUMMARIZE_FINDINGS.value
97
+
98
+ return state
99
+
100
+
101
+ def parse_checklist_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
102
+ """Parse checklist using structured output - much simpler!"""
103
+ messages = state["messages"]
104
+ checklist_text = messages[-1].content if messages else ""
105
+
106
+ # Set up simple parser
107
+ parser = PydanticOutputParser(pydantic_object=SimpleChecklist)
108
+ prompt = get_checklist_parsing_prompt(checklist_text)
109
+
110
+ # Create chain and parse - that's it!
111
+ chain = prompt | llm | parser
112
+
113
+ try:
114
+ result = chain.invoke({
115
+ "checklist_text": checklist_text[:3000],
116
+ "format_instructions": parser.get_format_instructions()
117
+ })
118
+
119
+ state["checklist"] = result.categories # Already in the right format!
120
+ state["messages"].append(AIMessage(content=f"Parsed {len(result.categories)} categories"))
121
+
122
+ except Exception as e:
123
+ state["messages"].append(AIMessage(content=f"Parsing failed: {str(e)}"))
124
+
125
+ return state
126
+
127
+
128
+ def match_checklist_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
129
+ """Match documents to checklist items - keep it simple"""
130
+ checklist = state.get("checklist", {})
131
+ documents = state.get("documents", [])
132
+
133
+ if not checklist or not documents:
134
+ state["messages"].append(AIMessage(content="Need both checklist and documents to match"))
135
+ return state
136
+
137
+ # For each checklist item, find relevant documents
138
+ findings = {}
139
+ for cat_letter, category in checklist.items():
140
+ cat_findings = []
141
+ for item in category.get("items", []):
142
+ # Use Claude to assess relevance
143
+ document_names = [d.get('name', 'Unknown') for d in documents[:10]]
144
+ prompt = get_document_relevance_prompt(item['text'], document_names)
145
+
146
+ response = llm.invoke([HumanMessage(content=str(prompt))])
147
+ cat_findings.append({
148
+ "item": item['text'],
149
+ "relevant_docs": response.content
150
+ })
151
+
152
+ findings[category['name']] = cat_findings
153
+
154
+ state["findings"] = findings
155
+ state["messages"].append(AIMessage(content=f"Matched checklist to {len(documents)} documents"))
156
+
157
+ return state
158
+
159
+
160
+ def answer_question_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
161
+ """Answer questions using document context"""
162
+ messages = state["messages"]
163
+ question = messages[-1].content if messages else ""
164
+ documents = state.get("documents", [])
165
+
166
+ # Create context from documents
167
+ context = "\n".join([f"- {d.get('name', 'Unknown')}: {d.get('text', '')[:200]}"
168
+ for d in documents[:5]])
169
+
170
+ prompt = get_question_answering_prompt(question, context)
171
+ response = llm.invoke([HumanMessage(content=prompt)])
172
+ state["messages"].append(AIMessage(content=response.content))
173
+
174
+ return state
175
+
176
 
177
+ def summarize_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
178
+ """Summarize findings"""
179
+ findings = state.get("findings", {})
180
+
181
+ if not findings:
182
+ state["messages"].append(AIMessage(content="No findings to summarize"))
183
+ return state
184
+
185
+ prompt = get_findings_summary_prompt(findings)
186
+ response = llm.invoke([HumanMessage(content=prompt)])
187
+ state["messages"].append(AIMessage(content=response.content))
188
+
189
+ return state
190
+
191
+
192
+ def route_condition(state: AgentState) -> str:
193
+ """Conditional routing function based on next_action"""
194
+ next_action = state.get("next_action")
195
+ if next_action == TaskType.PARSE_CHECKLIST.value:
196
+ return "parse_checklist"
197
+ elif next_action == TaskType.MATCH_CHECKLIST.value:
198
+ return "match_checklist"
199
+ elif next_action == TaskType.ANSWER_QUESTION.value:
200
+ return "answer_question"
201
+ else:
202
+ return "summarize"
203
+
204
+
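These node and routing functions are compiled into a graph by get_langgraph_agent further below; a minimal sketch of that wiring, assuming langgraph is installed and binding the llm with functools.partial (the node names here may not match the committed graph exactly):

from functools import partial
from langgraph.graph import StateGraph, END

def build_agent_graph(llm):
    graph = StateGraph(AgentState)
    graph.add_node("route", route_task)
    graph.add_node("parse_checklist", partial(parse_checklist_node, llm=llm))
    graph.add_node("match_checklist", partial(match_checklist_node, llm=llm))
    graph.add_node("answer_question", partial(answer_question_node, llm=llm))
    graph.add_node("summarize", partial(summarize_node, llm=llm))
    graph.set_entry_point("route")
    graph.add_conditional_edges("route", route_condition, {
        "parse_checklist": "parse_checklist",
        "match_checklist": "match_checklist",
        "answer_question": "answer_question",
        "summarize": "summarize",
    })
    for node in ("parse_checklist", "match_checklist", "answer_question", "summarize"):
        graph.add_edge(node, END)
    return graph.compile()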
205
+ # =============================================================================
206
+ # LLM UTILITIES - Merged from llm_utilities.py
207
+ # =============================================================================
208
+
209
+ def simple_retry(func, max_retries: int = 3, base_delay: float = 1.0):
210
+ """Simple exponential backoff retry with jitter"""
211
+ last_exception = None
212
+ for attempt in range(max_retries):
213
+ try:
214
+ return func()
215
+ except Exception as e:
216
+ last_exception = e
217
+
218
+ # Check if it's a rate limit error that should be retried
219
+ error_str = str(e).lower()
220
+ if any(keyword in error_str for keyword in [
221
+ 'rate', 'limit', 'quota', 'throttl', '429', 'too many',
222
+ 'overload', '529', 'server_overloaded', 'overloaded_error'
223
+ ]):
224
+ if attempt < max_retries - 1: # Don't wait on last attempt
225
+ delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
226
+ time.sleep(min(delay, 60)) # Cap at 60 seconds
227
+ continue
228
+
229
+ # For non-retryable errors, raise immediately
230
+ raise e
231
+
232
+ # If we get here, all retries failed
233
+ raise last_exception
234
+
235
+
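A usage sketch of simple_retry wrapping a single Anthropic call (llm is assumed to be an already-configured ChatAnthropic instance):

from langchain_core.messages import HumanMessage

# Only rate-limit/overload style errors are retried; anything else raises immediately
response = simple_retry(
    lambda: llm.invoke([HumanMessage(content="Classify this document in one sentence.")]),
    max_retries=3,
    base_delay=1.0,
)
print(response.content)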
236
+ def generate_checklist_descriptions(checklist: Dict, llm: "ChatAnthropic", batch_size: Optional[int] = None) -> Dict:
237
+ """
238
+ Generate detailed descriptions for each checklist item explaining what documents should satisfy it.
239
+ Returns checklist with added 'description' field for each item.
240
+
241
+ Args:
242
+ checklist: Checklist dictionary to enhance
243
+ llm: ChatAnthropic instance for generating descriptions
244
+ batch_size: Number of items to process in each batch (uses config default if None)
245
+
246
+ Returns:
247
+ Enhanced checklist with descriptions
248
+ """
249
+
250
+ config = get_config()
251
+ if batch_size is None:
252
+ batch_size = config.processing.description_batch_size
253
+
254
+ # Process all checklist items
255
+ enhanced_checklist = {}
256
+ all_items_to_process = []
257
+
258
+ # Collect all items with their context
259
+ for cat_letter, category in checklist.items():
260
+ cat_name = category.get('name', '')
261
+ enhanced_checklist[cat_letter] = {
262
+ 'name': cat_name,
263
+ 'letter': cat_letter,
264
+ 'items': []
265
+ }
266
+
267
+ for item in category.get('items', []):
268
+ item_data = {
269
+ 'category_letter': cat_letter,
270
+ 'category_name': cat_name,
271
+ 'item_text': item.get('text', ''),
272
+ 'original_item': item,
273
+ 'prompt': get_description_generation_prompt(cat_name, item.get('text', '')).format()
274
+ }
275
+ all_items_to_process.append(item_data)
276
+
277
+ # Process items in batches
278
+ total_items = len(all_items_to_process)
279
+ total_batches = (total_items + batch_size - 1) // batch_size
280
+
281
+ for batch_num, i in enumerate(range(0, total_items, batch_size), 1):
282
+ batch = all_items_to_process[i:i + batch_size]
283
+ batch_end = min(i + batch_size, total_items)
284
+
285
+ # Update progress if available
286
+ if hasattr(st, 'progress') and 'description_progress' in st.session_state:
287
+ progress = i / total_items
288
+ st.session_state.description_progress.progress(
289
+ progress,
290
+ text=f"📝 Generating descriptions batch {batch_num}/{total_batches} (items {i+1}-{batch_end} of {total_items})"
291
+ )
292
+
293
+ # Create prompts for batch processing
294
+ prompts = [item_data['prompt'] for item_data in batch]
295
+ messages_batch = [[HumanMessage(content=prompt)] for prompt in prompts]
296
+
297
+ # Process batch with simple retry logic
298
+ try:
299
+ responses = simple_retry(
300
+ lambda: llm.batch(
301
+ messages_batch,
302
+ config={"max_concurrency": min(batch_size, config.api.max_concurrent_requests)}
303
+ ),
304
+ max_retries=3,
305
+ base_delay=0.5
306
+ )
307
+
308
+ # Extract descriptions from responses
309
+ batch_descriptions = [response.content.strip() if response else f"Documents related to {item_data['item_text']}"
310
+ for response, item_data in zip(responses, batch)]
311
+ except Exception as e:
312
+ logger.warning(f"Batch {batch_num} description generation failed: {e}. Using fallback descriptions.")
313
+ batch_descriptions = [f"Documents related to {item_data['item_text']}" for item_data in batch]
314
+
315
+ # Add descriptions to items
316
+ for item_data, description in zip(batch, batch_descriptions):
317
+ enhanced_item = item_data['original_item'].copy()
318
+ enhanced_item['description'] = description
319
+ enhanced_checklist[item_data['category_letter']]['items'].append(enhanced_item)
320
+
321
+ # No delay between batches - using rate limiting with exponential backoff instead
322
+
323
+ return enhanced_checklist
324
+
325
+
326
+ def batch_summarize_documents(documents: List[Dict], llm: "ChatAnthropic", batch_size: Optional[int] = None) -> List[Dict]:
327
+ """
328
+ Summarize documents using LangChain's built-in batch processing for true parallelization.
329
+ Optimized with larger batches, higher concurrency, and exponential backoff rate limiting.
330
+ Returns documents with added 'summary' field.
331
+
332
+ Args:
333
+ documents: List of document dictionaries to summarize
334
+ llm: ChatAnthropic instance for generating summaries
335
+ batch_size: Number of documents to process in each batch (uses config default if None)
336
+
337
+ Returns:
338
+ List of documents with added summary field
339
+ """
340
+
341
+ config = get_config()
342
+ if batch_size is None:
343
+ batch_size = config.processing.batch_size
344
+
345
+ # Process documents in batches
346
+ summarized_docs = []
347
+ total_docs = len(documents)
348
+ total_batches = (total_docs + batch_size - 1) // batch_size
349
+
350
+ for batch_num, i in enumerate(range(0, total_docs, batch_size), 1):
351
+ batch = documents[i:i + batch_size]
352
+ batch_end = min(i + batch_size, total_docs)
353
+
354
+ # Update progress with batch info
355
+ if hasattr(st, 'progress') and 'summary_progress' in st.session_state:
356
+ progress = i / total_docs
357
+ st.session_state.summary_progress.progress(
358
+ progress,
359
+ text=f"📝 Processing batch {batch_num}/{total_batches} (docs {i+1}-{batch_end} of {total_docs})"
360
+ )
361
+
362
+ # Create prompts for all documents in the batch
363
+ templates = [get_document_summarization_prompt(doc) for doc in batch]
364
+ prompts = [template.format() for template in templates]
365
+
366
+ # Convert prompts to HumanMessage format for batch processing
367
+ messages_batch = [[HumanMessage(content=prompt)] for prompt in prompts]
368
+
369
+ # Process batch with simple retry logic
370
+ try:
371
+ responses = simple_retry(
372
+ lambda: llm.batch(
373
+ messages_batch,
374
+ config={"max_concurrency": min(batch_size // 2 or 1, config.api.max_concurrent_requests)}
375
+ ),
376
+ max_retries=3,
377
+ base_delay=0.5
378
+ )
379
+
380
+ # Extract summaries from responses
381
+ batch_summaries = [response.content.strip() if response else f"Document: {doc.get('name', 'Unknown')}"
382
+ for response, doc in zip(responses, batch)]
383
+ except Exception as e:
384
+ logger.warning(f"Batch {batch_num} processing failed: {e}. Using fallback summaries.")
385
+ batch_summaries = [f"Document: {doc.get('name', 'Unknown')}" for doc in batch]
386
+
387
+ # Add summaries to documents
388
+ for doc, summary in zip(batch, batch_summaries):
389
+ doc['summary'] = summary
390
+ summarized_docs.append(doc)
391
+
392
+ # No delay between batches - using rate limiting with exponential backoff instead
393
+
394
+ return summarized_docs
395
+
396
+
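A sketch of calling batch_summarize_documents on a small, hand-made document list (llm is again an assumed ChatAnthropic instance; the names and contents are placeholders, real dicts come from the document processing pipeline):

docs = [
    {"name": "bylaws.pdf", "path": "corporate/bylaws.pdf", "content": "These bylaws govern ..."},
    {"name": "lease.pdf", "path": "contracts/lease.pdf", "content": "This lease agreement ..."},
]
summarized = batch_summarize_documents(docs, llm, batch_size=2)
for d in summarized:
    print(d["name"], "->", d["summary"][:80])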
397
+ def create_document_embeddings_with_summaries(documents: List[Dict], model_name: str = None) -> Dict[str, Any]:
398
+ """
399
+ Prepare document data for LangChain-based similarity matching.
400
+ No longer creates embeddings directly - LangChain handles embedding generation.
401
+
402
+ Args:
403
+ documents: List of documents with summaries
404
+
405
+ Returns:
406
+ Dictionary with document info formatted for LangChain matching
407
+ """
408
+ doc_info = []
409
+
410
+ for doc in documents:
411
+ # Prepare document info for LangChain matching
412
+ doc_name = doc.get('name', 'Unknown')
413
+ doc_path = doc.get('path', '')
414
+ summary = doc.get('summary', '')
415
+
416
+ doc_info.append({
417
+ 'name': doc_name,
418
+ 'path': doc_path,
419
+ 'full_path': doc.get('full_path', doc_path),
420
+ 'summary': summary,
421
+ 'original_doc': doc
422
+ })
423
+
424
+ return {
425
+ 'documents': doc_info
426
+ }
427
+
428
+
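Sketch of the structure this helper now returns: no embedding vectors, only the metadata that match_checklist_with_summaries hands to LangChain (values are placeholders):

doc_data = create_document_embeddings_with_summaries([
    {"name": "bylaws.pdf", "path": "corporate/bylaws.pdf",
     "summary": "Corporate governance document setting out board procedures."},
])
# -> {'documents': [{'name': 'bylaws.pdf', 'path': 'corporate/bylaws.pdf',
#                    'full_path': 'corporate/bylaws.pdf', 'summary': '...', 'original_doc': {...}}]}
print(doc_data["documents"][0]["summary"])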
429
+ def match_checklist_with_summaries(
430
+ checklist: Dict,
431
+ doc_embeddings_data: Dict,
432
+ model_name: str,
433
+ threshold: Optional[float] = None
434
+ ) -> Dict:
435
+ """
436
+ Match checklist items against document summaries using LangChain FAISS.
437
+ Enhanced to use LLM-generated descriptions for better semantic matching.
438
+
439
+ Args:
440
+ checklist: Checklist dictionary with items and descriptions
441
+ doc_embeddings_data: Dictionary containing document info and embeddings
442
+ model_name: Name of the HuggingFace model for embeddings
443
+ threshold: Similarity threshold for matching (uses config default if None)
444
+
445
+ Returns:
446
+ Dictionary with matching results
447
+ """
448
+ config = get_config()
449
+ if threshold is None:
450
+ threshold = config.processing.similarity_threshold
451
+
452
+ doc_info = doc_embeddings_data['documents']
453
+
454
+ # Create LangChain embeddings instance
455
+ embeddings = HuggingFaceEmbeddings(model_name=model_name)
456
+
457
+ # Convert document summaries to LangChain Documents
458
+ documents = [
459
+ Document(
460
+ page_content=f"{doc['name']}\n{doc['path']}\n{doc['summary']}",
461
+ metadata={
462
+ 'name': doc['name'],
463
+ 'path': doc['path'],
464
+ 'full_path': doc.get('full_path', doc['path']),
465
+ 'summary': doc['summary'],
466
+ **doc.get('original_doc', {}).get('metadata', {})
467
+ }
468
+ )
469
+ for doc in doc_info
470
+ ]
471
+
472
+ # Create LangChain FAISS vector store
473
+ vector_store = FAISS.from_documents(documents, embeddings)
474
+ retriever = vector_store.as_retriever(
475
+ search_type="similarity_score_threshold",
476
+ search_kwargs={"score_threshold": threshold, "k": 5}
477
+ )
478
+
479
+ results = {}
480
+
481
+ for cat_letter, category in checklist.items():
482
+ cat_name = category.get('name', '')
483
+ cat_results = {
484
+ 'name': cat_name,
485
+ 'letter': cat_letter,
486
+ 'total_items': len(category.get('items', [])),
487
+ 'matched_items': 0,
488
+ 'items': []
489
+ }
490
+
491
+ for item in category.get('items', []):
492
+ item_text = item.get('text', '')
493
+ item_description = item.get('description', '')
494
+
495
+ # Create enhanced query using both item text and generated description
496
+ if item_description:
497
+ # Use the LLM-generated description for richer semantic matching
498
+ query = f"{cat_name}: {item_text}\n{item_description}"
499
+ else:
500
+ # Fall back to original method if no description available
501
+ query = f"{cat_name}: {item_text}"
502
+
503
+ # Use LangChain retriever for similarity search
504
+ docs = safe_execute(
505
+ lambda: retriever.invoke(query),
506
+ default=[],
507
+ context="Document matching with summaries"
508
+ )
509
+
510
+ # Convert LangChain documents to matches format
511
+ matches = []
512
+ for doc in docs[:5]: # Keep top 5 matches
513
+ match_data = {
514
+ 'name': doc.metadata['name'],
515
+ 'path': doc.metadata['path'],
516
+ 'full_path': doc.metadata.get('full_path', doc.metadata['path']),
517
+ 'summary': doc.metadata['summary'],
518
+ 'score': 0.8, # LangChain retriever doesn't return raw scores
519
+ 'metadata': {k: v for k, v in doc.metadata.items()
520
+ if k not in ['name', 'path', 'full_path', 'summary']}
521
+ }
522
+ matches.append(match_data)
523
+
524
+ item_result = {
525
+ 'text': item_text,
526
+ 'original': item.get('original', item_text),
527
+ 'description': item_description, # Include the generated description
528
+ 'matches': matches
529
+ }
530
+
531
+ # Count items with matches toward category total
532
+ if matches:
533
+ cat_results['matched_items'] += 1
534
+
535
+ cat_results['items'].append(item_result)
536
+
537
+ results[cat_letter] = cat_results
538
+
539
+ return results
540
+
541
+
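Putting the helpers together, a rough end-to-end sketch of the summary-based matching pipeline; documents, checklist, and llm are assumed inputs, and the model name and threshold shown are the configuration defaults, written out explicitly here for illustration:

summarized = batch_summarize_documents(documents, llm)
doc_data = create_document_embeddings_with_summaries(summarized)
enhanced = generate_checklist_descriptions(checklist, llm)
results = match_checklist_with_summaries(
    enhanced,
    doc_data,
    model_name="all-MiniLM-L6-v2",  # HuggingFace model used for embeddings
    threshold=0.35,                 # similarity_threshold default
)
for letter, cat in results.items():
    print(letter, cat["name"], f"{cat['matched_items']}/{cat['total_items']} matched")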
542
+ # =============================================================================
543
+ # LANGGRAPH AGENT FUNCTIONS
544
+ # =============================================================================
545
+
546
+ def get_langgraph_agent(api_key: Optional[str] = None, model: Optional[str] = None) -> Optional[Tuple[Any, "ChatAnthropic"]]:
547
  """
548
  Create a LangGraph agent with Anthropic
549
 
 
555
  Tuple of (compiled_app, llm) or None if not available
556
  """
557
 
 
 
 
558
  # Get configuration
559
  config = get_config()
560
 
 
670
 
671
  def is_available(self) -> bool:
672
  """Check if the agent is available for use"""
673
+ return self.app is not None and self.llm is not None
674
 
675
  def parse_checklist(self, checklist_text: str) -> Optional[Dict]:
676
  """
 
694
 
695
  return result.get("checklist")
696
  except Exception as e:
697
+ st.error(f"Agent error: {str(e)}")
 
698
  return None
699
 
700
  def match_documents(self, checklist: Dict, documents: List[Dict]) -> Dict:
 
727
 
728
  return result.get("findings", {})
729
  except Exception as e:
730
+ st.error(f"Agent error: {str(e)}")
 
731
  return {}
732
 
733
  def answer_question(self, question: str, documents: List[Dict]) -> str:
src/ai/agent_nodes.py DELETED
@@ -1,173 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- LangGraph Agent Nodes Module
4
-
5
- This module contains all the individual node functions used in the
6
- LangGraph workflow for the DD-Checklist agent.
7
- """
8
-
9
- import json
10
- from typing import Dict, List, Optional, Sequence, Any
11
- from typing_extensions import TypedDict
12
- from enum import Enum
13
-
14
- try:
15
- from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
16
- from langchain_anthropic import ChatAnthropic
17
- LANGGRAPH_AVAILABLE = True
18
- except ImportError:
19
- LANGGRAPH_AVAILABLE = False
20
- BaseMessage = object
21
- HumanMessage = object
22
- AIMessage = object
23
- ChatAnthropic = object
24
-
25
- from .prompts import (
26
- get_checklist_parsing_prompt,
27
- get_document_relevance_prompt,
28
- get_question_answering_prompt,
29
- get_findings_summary_prompt
30
- )
31
-
32
-
33
- # Define the state for our agent
34
- class AgentState(TypedDict):
35
- """State for the due diligence agent"""
36
- messages: Sequence[BaseMessage]
37
- checklist: Optional[Dict]
38
- documents: Optional[List[Dict]]
39
- current_task: Optional[str]
40
- findings: Dict[str, List[str]]
41
- next_action: Optional[str]
42
-
43
-
44
- class TaskType(Enum):
45
- """Types of tasks the agent can perform"""
46
- PARSE_CHECKLIST = "parse_checklist"
47
- ANALYZE_DOCUMENT = "analyze_document"
48
- MATCH_CHECKLIST = "match_checklist"
49
- ANSWER_QUESTION = "answer_question"
50
- SUMMARIZE_FINDINGS = "summarize_findings"
51
-
52
-
53
- def route_task(state: AgentState) -> AgentState:
54
- """Route to appropriate task based on current state"""
55
- messages = state["messages"]
56
- if not messages:
57
- return state
58
-
59
- last_message = messages[-1].content if messages else ""
60
-
61
- # Determine next action based on message content
62
- if "parse" in last_message.lower() and "checklist" in last_message.lower():
63
- state["next_action"] = TaskType.PARSE_CHECKLIST.value
64
- elif "analyze" in last_message.lower() or "match" in last_message.lower():
65
- state["next_action"] = TaskType.MATCH_CHECKLIST.value
66
- elif "?" in last_message:
67
- state["next_action"] = TaskType.ANSWER_QUESTION.value
68
- else:
69
- state["next_action"] = TaskType.SUMMARIZE_FINDINGS.value
70
-
71
- return state
72
-
73
-
74
- def parse_checklist_node(state: AgentState, llm: ChatAnthropic) -> AgentState:
75
- """Parse checklist using Claude"""
76
- messages = state["messages"]
77
- checklist_text = messages[-1].content if messages else ""
78
-
79
- prompt = get_checklist_parsing_prompt(checklist_text)
80
- response = llm.invoke([HumanMessage(content=prompt)])
81
-
82
- try:
83
- # Parse JSON from response
84
- json_str = response.content
85
- if "```json" in json_str:
86
- json_str = json_str.split("```json")[1].split("```")[0]
87
- elif "```" in json_str:
88
- json_str = json_str.split("```")[1].split("```")[0]
89
-
90
- parsed = json.loads(json_str.strip())
91
- state["checklist"] = parsed
92
- state["messages"].append(AIMessage(content=f"Parsed {len(parsed)} categories"))
93
- except Exception as e:
94
- state["messages"].append(AIMessage(content=f"Parsing failed: {str(e)}"))
95
-
96
- return state
97
-
98
-
99
- def match_checklist_node(state: AgentState, llm: ChatAnthropic) -> AgentState:
100
- """Match documents to checklist items"""
101
- checklist = state.get("checklist", {})
102
- documents = state.get("documents", [])
103
-
104
- if not checklist or not documents:
105
- state["messages"].append(AIMessage(content="Need both checklist and documents to match"))
106
- return state
107
-
108
- # For each checklist item, find relevant documents
109
- findings = {}
110
- for cat_letter, category in checklist.items():
111
- cat_findings = []
112
- for item in category.get("items", []):
113
- # Use Claude to assess relevance
114
- document_names = [d.get('name', 'Unknown') for d in documents[:10]]
115
- prompt = get_document_relevance_prompt(item['text'], document_names)
116
-
117
- response = llm.invoke([HumanMessage(content=prompt)])
118
- cat_findings.append({
119
- "item": item['text'],
120
- "relevant_docs": response.content
121
- })
122
-
123
- findings[category['name']] = cat_findings
124
-
125
- state["findings"] = findings
126
- state["messages"].append(AIMessage(content=f"Matched checklist to {len(documents)} documents"))
127
-
128
- return state
129
-
130
-
131
- def answer_question_node(state: AgentState, llm: ChatAnthropic) -> AgentState:
132
- """Answer questions using document context"""
133
- messages = state["messages"]
134
- question = messages[-1].content if messages else ""
135
- documents = state.get("documents", [])
136
-
137
- # Create context from documents
138
- context = "\n".join([f"- {d.get('name', 'Unknown')}: {d.get('text', '')[:200]}"
139
- for d in documents[:5]])
140
-
141
- prompt = get_question_answering_prompt(question, context)
142
- response = llm.invoke([HumanMessage(content=prompt)])
143
- state["messages"].append(AIMessage(content=response.content))
144
-
145
- return state
146
-
147
-
148
- def summarize_node(state: AgentState, llm: ChatAnthropic) -> AgentState:
149
- """Summarize findings"""
150
- findings = state.get("findings", {})
151
-
152
- if not findings:
153
- state["messages"].append(AIMessage(content="No findings to summarize"))
154
- return state
155
-
156
- prompt = get_findings_summary_prompt(findings)
157
- response = llm.invoke([HumanMessage(content=prompt)])
158
- state["messages"].append(AIMessage(content=response.content))
159
-
160
- return state
161
-
162
-
163
- def route_condition(state: AgentState) -> str:
164
- """Conditional routing function based on next_action"""
165
- next_action = state.get("next_action")
166
- if next_action == TaskType.PARSE_CHECKLIST.value:
167
- return "parse_checklist"
168
- elif next_action == TaskType.MATCH_CHECKLIST.value:
169
- return "match_checklist"
170
- elif next_action == TaskType.ANSWER_QUESTION.value:
171
- return "answer_question"
172
- else:
173
- return "summarize"
 
 
 
 
 
 
 
 
src/ai/llm_utilities.py DELETED
@@ -1,432 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- LLM Utilities Module
4
-
5
- This module contains utility functions for batch processing, document
6
- summarization, embeddings, and checklist matching operations.
7
- """
8
-
9
- import time
10
- import random
11
- from typing import Dict, List, Any, Optional
12
-
13
- try:
14
- import streamlit as st
15
- from langchain_anthropic import ChatAnthropic
16
- from langchain_core.messages import HumanMessage
17
- import numpy as np
18
- import faiss
19
- DEPENDENCIES_AVAILABLE = True
20
- except ImportError:
21
- DEPENDENCIES_AVAILABLE = False
22
- st = None
23
- ChatAnthropic = object
24
- HumanMessage = object
25
-
26
- from ..config import get_config
27
- from .prompts import get_description_generation_prompt, get_document_summarization_prompt
28
-
29
-
30
- def exponential_backoff_retry(func, max_retries: Optional[int] = None, base_delay: Optional[float] = None):
31
- """
32
- Execute function with exponential backoff retry logic for rate limiting.
33
-
34
- Args:
35
- func: Function to execute
36
- max_retries: Maximum number of retries (uses config default if None)
37
- base_delay: Base delay in seconds (uses config default if None)
38
-
39
- Returns:
40
- Result of the function call
41
- """
42
- config = get_config()
43
- if max_retries is None:
44
- max_retries = config.api.max_retries
45
- if base_delay is None:
46
- base_delay = config.api.base_delay
47
-
48
- for attempt in range(max_retries):
49
- try:
50
- return func()
51
- except Exception as e:
52
- error_str = str(e).lower()
53
- # Check if it's a rate limiting error
54
- if any(keyword in error_str for keyword in ['rate', 'limit', 'quota', 'throttl', '429', 'too many']):
55
- if attempt < max_retries - 1:
56
- # Calculate exponential backoff with jitter
57
- delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
58
- print(f"Rate limit hit, retrying in {delay:.2f}s (attempt {attempt + 1}/{max_retries})")
59
- time.sleep(delay)
60
- continue
61
- else:
62
- print(f"Rate limit exceeded after {max_retries} attempts")
63
- raise e
64
- else:
65
- # Non-rate limit error, don't retry
66
- raise e
67
- return None
68
-
69
-
70
- def generate_checklist_descriptions(checklist: Dict, llm: ChatAnthropic, batch_size: Optional[int] = None) -> Dict:
71
- """
72
- Generate detailed descriptions for each checklist item explaining what documents should satisfy it.
73
- Returns checklist with added 'description' field for each item.
74
-
75
- Args:
76
- checklist: Checklist dictionary to enhance
77
- llm: ChatAnthropic instance for generating descriptions
78
- batch_size: Number of items to process in each batch (uses config default if None)
79
-
80
- Returns:
81
- Enhanced checklist with descriptions
82
- """
83
- if not DEPENDENCIES_AVAILABLE:
84
- return checklist
85
-
86
- config = get_config()
87
- if batch_size is None:
88
- batch_size = config.processing.description_batch_size
89
-
90
-
91
-
92
- # Process all checklist items
93
- enhanced_checklist = {}
94
- all_items_to_process = []
95
-
96
- # Collect all items with their context
97
- for cat_letter, category in checklist.items():
98
- cat_name = category.get('name', '')
99
- enhanced_checklist[cat_letter] = {
100
- 'name': cat_name,
101
- 'letter': cat_letter,
102
- 'items': []
103
- }
104
-
105
- for item in category.get('items', []):
106
- item_data = {
107
- 'category_letter': cat_letter,
108
- 'category_name': cat_name,
109
- 'item_text': item.get('text', ''),
110
- 'original_item': item,
111
- 'prompt': get_description_generation_prompt(cat_name, item.get('text', ''))
112
- }
113
- all_items_to_process.append(item_data)
114
-
115
- # Process items in batches
116
- total_items = len(all_items_to_process)
117
- total_batches = (total_items + batch_size - 1) // batch_size
118
-
119
- for batch_num, i in enumerate(range(0, total_items, batch_size), 1):
120
- batch = all_items_to_process[i:i + batch_size]
121
- batch_end = min(i + batch_size, total_items)
122
-
123
- # Update progress if available
124
- if st and hasattr(st, 'progress') and 'description_progress' in st.session_state:
125
- progress = i / total_items
126
- st.session_state.description_progress.progress(
127
- progress,
128
- text=f"📝 Generating descriptions batch {batch_num}/{total_batches} (items {i+1}-{batch_end} of {total_items})"
129
- )
130
-
131
- # Create prompts for batch processing
132
- prompts = [item_data['prompt'] for item_data in batch]
133
- messages_batch = [[HumanMessage(content=prompt)] for prompt in prompts]
134
-
135
- # Use exponential backoff for batch processing
136
- def process_descriptions_batch():
137
- # Use higher concurrency for descriptions since they're short
138
- max_concurrent = min(batch_size * 2, config.api.max_concurrent_requests)
139
- return llm.batch(
140
- messages_batch,
141
- config={"max_concurrency": max_concurrent}
142
- )
143
-
144
- try:
145
- responses = exponential_backoff_retry(
146
- process_descriptions_batch,
147
- max_retries=config.api.max_retries,
148
- base_delay=config.api.batch_base_delay
149
- )
150
-
151
- # Extract descriptions from responses
152
- batch_descriptions = [response.content.strip() if response else f"Documents related to {item_data['item_text']}"
153
- for response, item_data in zip(responses, batch)]
154
- except Exception as e:
155
- # Fallback to sequential processing with individual retries if batch fails
156
- print(f"Batch {batch_num} description generation failed: {e}. Falling back to sequential with retries.")
157
- batch_descriptions = []
158
- for item_data in batch:
159
- def single_description_process():
160
- return llm.invoke([HumanMessage(content=item_data['prompt'])])
161
-
162
- try:
163
- response = exponential_backoff_retry(
164
- single_description_process,
165
- max_retries=config.api.batch_retry_attempts,
166
- base_delay=config.api.single_retry_base_delay
167
- )
168
- batch_descriptions.append(response.content.strip())
169
- except Exception as inner_e:
170
- print(f"Failed to generate description for {item_data['item_text']}: {inner_e}")
171
- batch_descriptions.append(f"Documents related to {item_data['item_text']}")
172
-
173
- # Add descriptions to items
174
- for item_data, description in zip(batch, batch_descriptions):
175
- enhanced_item = item_data['original_item'].copy()
176
- enhanced_item['description'] = description
177
- enhanced_checklist[item_data['category_letter']]['items'].append(enhanced_item)
178
-
179
- # No delay between batches - using rate limiting with exponential backoff instead
180
-
181
- return enhanced_checklist
182
-
183
-
184
- def batch_summarize_documents(documents: List[Dict], llm: ChatAnthropic, batch_size: Optional[int] = None) -> List[Dict]:
185
- """
186
- Summarize documents using LangChain's built-in batch processing for true parallelization.
187
- Optimized with larger batches, higher concurrency, and exponential backoff rate limiting.
188
- Returns documents with added 'summary' field.
189
-
190
- Args:
191
- documents: List of document dictionaries to summarize
192
- llm: ChatAnthropic instance for generating summaries
193
- batch_size: Number of documents to process in each batch (uses config default if None)
194
-
195
- Returns:
196
- List of documents with added summary field
197
- """
198
- if not DEPENDENCIES_AVAILABLE:
199
- return documents
200
-
201
- config = get_config()
202
- if batch_size is None:
203
- batch_size = config.processing.batch_size
204
-
205
- # Process documents in batches
206
- summarized_docs = []
207
- total_docs = len(documents)
208
- total_batches = (total_docs + batch_size - 1) // batch_size
209
-
210
- for batch_num, i in enumerate(range(0, total_docs, batch_size), 1):
211
- batch = documents[i:i + batch_size]
212
- batch_end = min(i + batch_size, total_docs)
213
-
214
- # Update progress with batch info
215
- if st and hasattr(st, 'progress') and 'summary_progress' in st.session_state:
216
- progress = i / total_docs
217
- st.session_state.summary_progress.progress(
218
- progress,
219
- text=f"📝 Processing batch {batch_num}/{total_batches} (docs {i+1}-{batch_end} of {total_docs})"
220
- )
221
-
222
- # Create prompts for all documents in the batch
223
- prompts = [get_document_summarization_prompt(doc) for doc in batch]
224
-
225
- # Convert prompts to HumanMessage format for batch processing
226
- messages_batch = [[HumanMessage(content=prompt)] for prompt in prompts]
227
-
228
- # Use exponential backoff for batch processing
229
- def process_batch():
230
- max_concurrent = min(batch_size, config.api.max_concurrent_requests)
231
- return llm.batch(
232
- messages_batch,
233
- config={"max_concurrency": max_concurrent}
234
- )
235
-
236
- try:
237
- responses = exponential_backoff_retry(
238
- process_batch,
239
- max_retries=config.api.max_retries,
240
- base_delay=config.api.batch_base_delay
241
- )
242
-
243
- # Extract summaries from responses
244
- batch_summaries = [response.content.strip() if response else f"Document: {doc.get('name', 'Unknown')}"
245
- for response, doc in zip(responses, batch)]
246
- except Exception as e:
247
- # Fallback to sequential processing with individual retries if batch fails
248
- print(f"Batch {batch_num} processing failed: {e}. Falling back to sequential with retries.")
249
- batch_summaries = []
250
- for doc_idx, doc in enumerate(batch):
251
- prompt = get_document_summarization_prompt(doc)
252
-
253
- def single_doc_process():
254
- return llm.invoke([HumanMessage(content=prompt)])
255
-
256
- try:
257
- response = exponential_backoff_retry(
258
- single_doc_process,
259
- max_retries=config.api.batch_retry_attempts,
260
- base_delay=config.api.single_retry_base_delay
261
- )
262
- batch_summaries.append(response.content.strip())
263
- except Exception as inner_e:
264
- print(f"Failed to summarize {doc.get('name', 'Unknown')}: {inner_e}")
265
- batch_summaries.append(f"Document: {doc.get('name', 'Unknown')}")
266
-
267
- # Update progress within fallback
268
- if st and hasattr(st, 'progress') and 'summary_progress' in st.session_state:
269
- sub_progress = (i + doc_idx + 1) / total_docs
270
- st.session_state.summary_progress.progress(
271
- sub_progress,
272
- text=f"📝 Sequential fallback: {i + doc_idx + 1}/{total_docs}"
273
- )
274
-
275
- # Add summaries to documents
276
- for doc, summary in zip(batch, batch_summaries):
277
- doc['summary'] = summary
278
- summarized_docs.append(doc)
279
-
280
- # No delay between batches - using rate limiting with exponential backoff instead
281
-
282
- return summarized_docs
283
-
284
-
285
- def create_document_embeddings_with_summaries(documents: List[Dict], model) -> Dict[str, Any]:
286
- """
287
- Create embeddings for documents using their LLM-generated summaries.
288
-
289
- Args:
290
- documents: List of documents with summaries
291
- model: SentenceTransformer model for embeddings
292
-
293
- Returns:
294
- Dictionary with document info and embeddings
295
- """
296
- doc_embeddings = []
297
- doc_info = []
298
-
299
- for doc in documents:
300
- # Combine filename, path context, and LLM summary for rich embedding
301
- doc_name = doc.get('name', 'Unknown')
302
- doc_path = doc.get('path', '')
303
- summary = doc.get('summary', '')
304
-
305
- # Create rich text representation
306
- embedding_text = f"{doc_name}\n{doc_path}\n{summary}"
307
-
308
- # Generate embedding
309
- embedding = model.encode(embedding_text)
310
-
311
- doc_embeddings.append(embedding)
312
- doc_info.append({
313
- 'name': doc_name,
314
- 'path': doc_path,
315
- 'full_path': doc.get('full_path', doc_path),
316
- 'summary': summary,
317
- 'embedding_text': embedding_text,
318
- 'original_doc': doc
319
- })
320
-
321
- return {
322
- 'embeddings': doc_embeddings,
323
- 'documents': doc_info
324
- }
325
-
326
-
327
- def match_checklist_with_summaries(
328
- checklist: Dict,
329
- doc_embeddings_data: Dict,
330
- model,
331
- threshold: Optional[float] = None
332
- ) -> Dict:
333
- """
334
- Match checklist items against document summaries using FAISS for 10x faster similarity search.
335
- Enhanced to use LLM-generated descriptions for better semantic matching.
336
-
337
- Args:
338
- checklist: Checklist dictionary with items and descriptions
339
- doc_embeddings_data: Dictionary containing document embeddings and info
340
- model: SentenceTransformer model for embeddings
341
- threshold: Similarity threshold for matching (uses config default if None)
342
-
343
- Returns:
344
- Dictionary with matching results
345
- """
346
- if not DEPENDENCIES_AVAILABLE:
347
- return {}
348
-
349
- config = get_config()
350
- if threshold is None:
351
- threshold = config.processing.similarity_threshold
352
-
353
- doc_embeddings = np.array(doc_embeddings_data['embeddings'], dtype='float32')
354
- doc_info = doc_embeddings_data['documents']
355
-
356
- # Build FAISS index for fast similarity search
357
- faiss.normalize_L2(doc_embeddings) # Normalize for cosine similarity
358
- dimension = doc_embeddings.shape[1]
359
- faiss_index = faiss.IndexFlatIP(dimension)
360
- faiss_index.add(doc_embeddings)
361
-
362
- results = {}
363
-
364
- for cat_letter, category in checklist.items():
365
- cat_name = category.get('name', '')
366
- cat_results = {
367
- 'name': cat_name,
368
- 'letter': cat_letter,
369
- 'total_items': len(category.get('items', [])),
370
- 'matched_items': 0,
371
- 'items': []
372
- }
373
-
374
- for item in category.get('items', []):
375
- item_text = item.get('text', '')
376
- item_description = item.get('description', '')
377
-
378
- # Create enhanced embedding text using both item text and generated description
379
- if item_description:
380
- # Use the LLM-generated description for richer semantic matching
381
- checklist_embedding_text = f"{cat_name}: {item_text}\n{item_description}"
382
- else:
383
- # Fallback to original method if no description available
384
- checklist_embedding_text = f"{cat_name}: {item_text}"
385
-
386
- # Create and normalize item embedding
387
- item_embedding = model.encode(checklist_embedding_text).astype('float32').reshape(1, -1)
388
- faiss.normalize_L2(item_embedding)
389
-
390
- # Use FAISS for fast similarity search
391
- scores, indices = faiss_index.search(item_embedding, len(doc_info))
392
-
393
- # Find matching documents above threshold
394
- matches = []
395
- min_display_threshold = config.processing.min_display_threshold
396
-
397
- for score, idx in zip(scores[0], indices[0]):
398
- if idx == -1: # No more results
399
- break
400
- if score < min_display_threshold: # Skip very low scoring documents
401
- break # Scores are sorted, so we can stop here
402
-
403
- match_data = {
404
- 'name': doc_info[idx]['name'],
405
- 'path': doc_info[idx]['path'],
406
- 'full_path': doc_info[idx].get('full_path', doc_info[idx]['path']),
407
- 'summary': doc_info[idx]['summary'],
408
- 'score': float(score),
409
- 'metadata': doc_info[idx].get('original_doc', {}).get('metadata', {})
410
- }
411
-
412
- matches.append(match_data)
413
-
414
- # Keep top 5 matches for display
415
- display_matches = matches[:5]
416
-
417
- item_result = {
418
- 'text': item_text,
419
- 'original': item.get('original', item_text),
420
- 'description': item_description, # Include the generated description
421
- 'matches': display_matches
422
- }
423
-
424
- # Count items with ANY matches (both green and yellow) toward category total
425
- if display_matches:
426
- cat_results['matched_items'] += 1
427
-
428
- cat_results['items'].append(item_result)
429
-
430
- results[cat_letter] = cat_results
431
-
432
- return results
 
 
 
 
 
 
 
 
 
 
src/ai/prompts.py CHANGED
@@ -6,147 +6,92 @@ This module contains all prompt templates used for AI interactions
6
  in the DD-Checklist application.
7
  """
8
 
 
9
  from typing import Dict, List
 
 
10
 
11
 
12
- def get_checklist_parsing_prompt(checklist_text: str) -> str:
13
- """
14
- Generate prompt for parsing due diligence checklists
15
-
16
- Args:
17
- checklist_text: Raw checklist text to parse
18
-
19
- Returns:
20
- Formatted prompt string
21
- """
22
- return f"""Parse this due diligence checklist into a structured JSON format.
23
-
24
- Extract categories (A., B., C.) and numbered items.
25
-
26
- Return ONLY valid JSON:
27
- {{
28
- "A": {{
29
- "name": "Category Name",
30
- "items": [{{"text": "item", "number": 1}}]
31
- }}
32
- }}
33
-
34
- Checklist:
35
- {checklist_text[:3000]}
36
-
37
- JSON:"""
38
-
39
-
40
- def get_document_relevance_prompt(item_text: str, documents: List[str]) -> str:
41
- """
42
- Generate prompt for assessing document relevance to checklist items
43
-
44
- Args:
45
- item_text: Checklist item text
46
- documents: List of document names
47
-
48
- Returns:
49
- Formatted prompt string
50
- """
51
- return f"""Which of these documents is relevant to: {item_text}
52
-
53
- Documents: {documents}
54
-
55
- List the relevant document names only."""
56
-
57
-
58
- def get_question_answering_prompt(question: str, context: str) -> str:
59
- """
60
- Generate prompt for answering questions based on document context
61
-
62
- Args:
63
- question: User question
64
- context: Document context
65
-
66
- Returns:
67
- Formatted prompt string
68
- """
69
- return f"""Answer this question based on the documents:
70
-
71
- Question: {question}
72
-
73
- Document Context:
74
- {context}
75
 
76
- Provide a comprehensive answer with citations."""
 
 
77
 
 
78
 
79
- def get_findings_summary_prompt(findings: Dict, max_chars: int = 2000) -> str:
80
- """
81
- Generate prompt for summarizing due diligence findings
82
-
83
- Args:
84
- findings: Dictionary of findings to summarize
85
- max_chars: Maximum characters to include from findings
86
-
87
- Returns:
88
- Formatted prompt string
89
- """
90
- import json
91
- findings_text = json.dumps(findings, indent=2)[:max_chars]
92
-
93
- return f"""Provide an executive summary of the due diligence findings:
94
 
95
- {findings_text}
 
96
 
97
- Focus on:
98
- 1. Completeness of documentation
99
- 2. Key gaps or concerns
100
- 3. Overall assessment"""
101
 
 
 
 
 
102
 
103
- def get_description_generation_prompt(category_name: str, item_text: str) -> str:
104
- """
105
- Generate prompt for creating checklist item descriptions
106
-
107
- Args:
108
- category_name: Name of the checklist category
109
- item_text: Text of the checklist item
110
-
111
- Returns:
112
- Formatted prompt string
113
- """
114
- return f"""For this due diligence checklist item, provide a concise description (1-2 sentences) explaining what types of documents or information would satisfy this requirement. Focus on the specific document types and key information that would be relevant.
115
-
116
- Category: {category_name}
117
  Checklist Item: {item_text}
118
 
119
- Description (1-2 sentences explaining what documents/information satisfy this requirement):"""
120
-
121
 
122
- def get_document_summarization_prompt(doc: Dict) -> str:
123
- """
124
- Generate prompt for document type identification and summarization
125
-
126
- Args:
127
- doc: Dictionary containing document information
128
-
129
- Returns:
130
- Formatted prompt string
131
- """
132
- # Extract text preview (first 1000 chars)
133
- text_preview = doc.get('content', '')[:1000] if doc.get('content') else ''
134
- doc_name = doc.get('name', 'Unknown')
135
- doc_path = doc.get('path', '')
136
-
137
- return f"""Identify and describe what type of document this is in 1-2 sentences.
138
- Focus specifically on the document type, category, and what kind of information it contains.
139
 
140
- Examples of document types: financial statement, contract agreement, corporate governance document, employee handbook, technical specification, compliance report, audit report, etc.
 
141
 
142
- Document: {doc_name}
143
- Path: {doc_path}
144
- Content preview:
145
- {text_preview}
146
 
147
- Document type description (1-2 sentences only):"""
 
 
 
 
 
148
 
149
 
150
- # Template constants for common patterns
151
- DEFAULT_TEMPERATURE = 0.3
152
- DEFAULT_MAX_TOKENS = 2000
 
 
 
 
 
 
 
 
 
 
 
6
  in the DD-Checklist application.
7
  """
8
 
9
+ import json
10
  from typing import Dict, List
11
+ from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
12
+ from langchain_core.messages import SystemMessage, HumanMessage
13
 
14
 
15
+ def get_checklist_parsing_prompt(checklist_text: str) -> ChatPromptTemplate:
16
+ """Generate prompt for parsing due diligence checklists with structured output"""
17
+ return ChatPromptTemplate.from_messages([
18
+ SystemMessage(content="""
19
+ Parse this due diligence checklist into structured format. Extract:
20
+ - Categories (A., B., C., etc.) with their names
21
+ - Numbered items within each category (1., 2., 3., etc.)
22
+ - Total count of items
 
 
 
 
 
 
 
 
23
 
24
+ Follow the exact format specified in the format instructions.
25
+ """),
26
+ HumanMessage(content="""Parse this checklist:
27
 
28
+ {checklist_text}
29
 
30
+ {format_instructions}
 
 
 
 
 
 
31
 
32
+ Please provide the structured output:""")
33
+ ])
34
 
 
 
 
 
35
 
36
+ def get_document_relevance_prompt(item_text: str, documents: List[str]) -> PromptTemplate:
37
+ """Generate prompt for assessing document relevance to checklist items with structured output"""
38
+ return PromptTemplate.from_template(
39
+ """Analyze which documents are relevant to the following checklist item:
40
 
 
 
 
 
 
41
  Checklist Item: {item_text}
42
 
43
+ Available Documents:
44
+ {documents}
45
 
46
+ {format_instructions}
 
 
 
 
 
 
47
 
48
+ Please provide your analysis in the specified format:"""
49
+ )
50
 
 
 
 
 
51
 
52
+ def get_question_answering_prompt(question: str, context: str) -> ChatPromptTemplate:
53
+ """Generate prompt for answering questions based on document context"""
54
+ return ChatPromptTemplate.from_messages([
55
+ SystemMessage(content="Answer questions based on document context. Provide comprehensive answers with citations."),
56
+ HumanMessage(content=f"Question: {question}\n\nDocument Context:\n{context}\n\nAnswer:")
57
+ ])
58
 
59
 
60
+ def get_findings_summary_prompt(findings: Dict, max_chars: int = 2000) -> PromptTemplate:
61
+ """Generate prompt for summarizing due diligence findings"""
62
+ findings_text = json.dumps(findings, indent=2)[:max_chars]
63
+ return PromptTemplate.from_template(
64
+ "Provide an executive summary of these due diligence findings:\n\n"
65
+ "{findings_text}\n\n"
66
+ "Focus on:\n"
67
+ "1. Completeness of documentation\n"
68
+ "2. Key gaps or concerns\n"
69
+ "3. Overall assessment"
70
+ ).partial(findings_text=findings_text)
71
+
72
+
73
+ def get_description_generation_prompt(category_name: str, item_text: str) -> PromptTemplate:
74
+ """Generate prompt for creating checklist item descriptions"""
75
+ return PromptTemplate.from_template(
76
+ "For this due diligence checklist item, provide a concise description (1-2 sentences) "
77
+ "explaining what types of documents or information would satisfy this requirement.\n\n"
78
+ "Category: {category_name}\n"
79
+ "Checklist Item: {item_text}\n\n"
80
+ "Description:"
81
+ ).partial(category_name=category_name, item_text=item_text)
82
+
83
+
84
+ def get_document_summarization_prompt(doc: Dict) -> PromptTemplate:
85
+ """Generate prompt for document type identification and summarization"""
86
+ doc_name = doc.get('name', 'Unknown')
87
+ doc_path = doc.get('path', '')
88
+ text_preview = doc.get('content', '')[:1000] if doc.get('content') else ''
89
+
90
+ return PromptTemplate.from_template(
91
+ "Identify and describe what type of document this is in 1-2 sentences.\n\n"
92
+ "Examples: financial statement, contract agreement, corporate governance document, etc.\n\n"
93
+ "Document: {doc_name}\n"
94
+ "Path: {doc_path}\n"
95
+ "Content preview:\n{text_preview}\n\n"
96
+ "Document type description:"
97
+ ).partial(doc_name=doc_name, doc_path=doc_path, text_preview=text_preview)
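A quick sketch of how these template objects are consumed: .partial() pre-binds the known variables, so downstream code only calls .format() (or pipes the template into an LLM) without re-supplying them; the document dict below is a placeholder:

template = get_document_summarization_prompt({
    "name": "financials_q3.pdf",
    "path": "finance/financials_q3.pdf",
    "content": "Consolidated balance sheet as of ...",
})
prompt_text = template.format()  # all variables already bound via .partial()
# prompt_text can now be wrapped in a HumanMessage and sent to the model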
src/config.py CHANGED
@@ -1,463 +1,373 @@
1
  #!/usr/bin/env python3
2
  """
3
- Configuration Management Module
4
 
5
- This module centralizes all configuration settings for the DD-Checklist application.
6
- Handles environment variables, default settings, and configuration validation.
7
  """
8
 
9
  import os
 
 
 
10
  from pathlib import Path
11
- from typing import Dict, Any, Optional, List
12
- from dataclasses import dataclass, field
13
- from dotenv import load_dotenv
 
14
 
15
  # Fix tokenizers parallelism warning
16
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
17
 
 
 
 
 
 
 
 
18
 
19
- @dataclass
20
- class ModelConfig:
21
- """Configuration for AI models"""
22
  sentence_transformer_model: str = "all-MiniLM-L6-v2"
23
  claude_model: str = "claude-sonnet-4-20250514"
24
  temperature: float = 0.3
25
  max_tokens: int = 2000
26
- embedding_dimension: int = 384
27
-
28
- def __post_init__(self):
29
- """Load model configuration from environment variables"""
30
- self.sentence_transformer_model = os.getenv('SENTENCE_TRANSFORMER_MODEL', self.sentence_transformer_model)
31
- self.claude_model = os.getenv('CLAUDE_MODEL', self.claude_model)
32
- self.temperature = float(os.getenv('CLAUDE_TEMPERATURE', str(self.temperature)))
33
- self.max_tokens = int(os.getenv('CLAUDE_MAX_TOKENS', str(self.max_tokens)))
34
- self.embedding_dimension = int(os.getenv('EMBEDDING_DIMENSION', str(self.embedding_dimension)))
35
-
36
-
37
- @dataclass
38
- class ProcessingConfig:
39
- """Configuration for document processing"""
40
- chunk_size: int = 400
41
- chunk_overlap: int = 50
42
- max_text_length: int = 10000
43
- batch_size: int = 100
44
- description_batch_size: int = 100
45
  similarity_threshold: float = 0.35
46
  relevancy_threshold: float = 0.5
47
  primary_threshold: float = 0.6
48
  min_display_threshold: float = 0.15
49
- max_workers: int = 4
50
- file_timeout: int = 30
51
- skip_descriptions: bool = False
52
- supported_file_extensions: List[str] = field(
53
- default_factory=lambda: ['.pdf', '.docx', '.doc', '.txt', '.md']
54
- )
55
-
56
- def __post_init__(self):
57
- """Load processing configuration from environment variables"""
58
- self.chunk_size = int(os.getenv('CHUNK_SIZE', str(self.chunk_size)))
59
- self.chunk_overlap = int(os.getenv('CHUNK_OVERLAP', str(self.chunk_overlap)))
60
- self.max_text_length = int(os.getenv('MAX_TEXT_LENGTH', str(self.max_text_length)))
61
- self.batch_size = int(os.getenv('BATCH_SIZE', str(self.batch_size)))
62
- self.description_batch_size = int(os.getenv('DESCRIPTION_BATCH_SIZE', str(self.description_batch_size)))
63
- self.similarity_threshold = float(os.getenv('SIMILARITY_THRESHOLD', str(self.similarity_threshold)))
64
- self.relevancy_threshold = float(os.getenv('RELEVANCY_THRESHOLD', str(self.relevancy_threshold)))
65
- self.primary_threshold = float(os.getenv('PRIMARY_THRESHOLD', str(self.primary_threshold)))
66
- self.min_display_threshold = float(os.getenv('MIN_DISPLAY_THRESHOLD', str(self.min_display_threshold)))
67
- self.max_workers = int(os.getenv('MAX_WORKERS', str(self.max_workers)))
68
- self.file_timeout = int(os.getenv('FILE_TIMEOUT', str(self.file_timeout)))
69
- self.skip_descriptions = os.getenv('SKIP_DESCRIPTIONS', 'false').lower() == 'true'
70
-
71
- # Handle file extensions from environment (comma-separated)
72
- extensions_env = os.getenv('SUPPORTED_FILE_EXTENSIONS')
73
- if extensions_env:
74
- self.supported_file_extensions = [ext.strip() for ext in extensions_env.split(',')]
75
 
76
 
77
- @dataclass
78
- class UIConfig:
79
- """Configuration for UI settings"""
80
  page_title: str = "AI Due Diligence"
81
  page_icon: str = "🤖"
82
  layout: str = "wide"
83
  top_k_search_results: int = 5
84
- max_question_sources: int = 3
85
- max_checklist_matches: int = 5
86
 
87
 
88
- @dataclass
89
- class PathConfig:
90
- """Configuration for file paths"""
91
  data_dir: str = "data"
92
  checklist_dir: str = "data/checklist"
93
  questions_dir: str = "data/questions"
94
  strategy_dir: str = "data/strategy"
95
  vdrs_dir: str = "data/vdrs"
96
- cache_dir: str = ".cache"
97
-
98
- def __post_init__(self):
99
- """Convert string paths to Path objects and ensure they exist"""
100
- self.data_path = Path(self.data_dir)
101
- self.checklist_path = Path(self.checklist_dir)
102
- self.questions_path = Path(self.questions_dir)
103
- self.strategy_path = Path(self.strategy_dir)
104
- self.vdrs_path = Path(self.vdrs_dir)
105
- self.cache_path = Path(self.cache_dir)
106
-
107
-
108
- @dataclass
109
- class APIConfig:
110
- """Configuration for API settings"""
111
- anthropic_api_key: Optional[str] = None
112
- openai_api_key: Optional[str] = None
113
- max_concurrent_requests: int = 50
114
- request_timeout: int = 30
115
- retry_attempts: int = 3
116
- base_delay: float = 0.2
117
- max_retries: int = 2
118
- batch_retry_attempts: int = 1
119
- batch_base_delay: float = 0.1
120
- single_retry_base_delay: float = 0.05
121
-
122
- def __post_init__(self):
123
- """Load API configuration from environment variables"""
124
- if not self.anthropic_api_key:
125
- self.anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
126
- if not self.openai_api_key:
127
- self.openai_api_key = os.getenv('OPENAI_API_KEY')
128
-
129
- self.max_concurrent_requests = int(os.getenv('MAX_CONCURRENT_REQUESTS', str(self.max_concurrent_requests)))
130
- self.request_timeout = int(os.getenv('REQUEST_TIMEOUT', str(self.request_timeout)))
131
- self.retry_attempts = int(os.getenv('RETRY_ATTEMPTS', str(self.retry_attempts)))
132
- self.base_delay = float(os.getenv('BASE_DELAY', str(self.base_delay)))
133
- self.max_retries = int(os.getenv('MAX_RETRIES', str(self.max_retries)))
134
- self.batch_retry_attempts = int(os.getenv('BATCH_RETRY_ATTEMPTS', str(self.batch_retry_attempts)))
135
- self.batch_base_delay = float(os.getenv('BATCH_BASE_DELAY', str(self.batch_base_delay)))
136
- self.single_retry_base_delay = float(os.getenv('SINGLE_RETRY_BASE_DELAY', str(self.single_retry_base_delay)))
137
-
138
-
139
- @dataclass
140
- class AppConfig:
141
- """Main application configuration"""
142
- model: ModelConfig = field(default_factory=ModelConfig)
143
- processing: ProcessingConfig = field(default_factory=ProcessingConfig)
144
- ui: UIConfig = field(default_factory=UIConfig)
145
- paths: PathConfig = field(default_factory=PathConfig)
146
- api: APIConfig = field(default_factory=APIConfig)
147
 
148
- # Environment settings
149
- debug: bool = False
150
- environment: str = "development"
151
- log_level: str = "INFO"
152
 
153
- def __post_init__(self):
154
- """Load environment-specific settings"""
155
- self.debug = os.getenv('DEBUG', 'false').lower() == 'true'
156
- self.environment = os.getenv('ENVIRONMENT', 'development')
157
- self.log_level = os.getenv('LOG_LEVEL', 'INFO')
158
 
159
 
160
- class ConfigManager:
161
- """
162
- Configuration manager that handles loading and validating configuration
163
- """
 
 
 
 
164
 
165
- def __init__(self, config_file: Optional[str] = None):
166
- """
167
- Initialize configuration manager
168
-
169
- Args:
170
- config_file: Optional path to configuration file
171
- """
172
- # Load environment variables
173
- load_dotenv()
174
-
175
- # Initialize configuration
176
- self.config = AppConfig()
177
-
178
- # Load from file if provided
179
- if config_file and Path(config_file).exists():
180
- self._load_from_file(config_file)
181
-
182
- # Validate configuration
183
- self._validate_config()
184
 
185
- def _load_from_file(self, config_file: str) -> None:
186
- """
187
- Load configuration from file (JSON or YAML)
188
-
189
- Args:
190
- config_file: Path to configuration file
191
- """
192
- import json
193
-
194
- config_path = Path(config_file)
195
-
196
- try:
197
- if config_path.suffix.lower() == '.json':
198
- with open(config_path, 'r') as f:
199
- config_data = json.load(f)
200
- self._update_config_from_dict(config_data)
201
- elif config_path.suffix.lower() in ['.yml', '.yaml']:
202
- try:
203
- import yaml
204
- with open(config_path, 'r') as f:
205
- config_data = yaml.safe_load(f)
206
- self._update_config_from_dict(config_data)
207
- except ImportError:
208
- print("PyYAML not installed. Cannot load YAML configuration.")
209
- except Exception as e:
210
- print(f"Warning: Could not load configuration from {config_file}: {e}")
211
 
212
- def _update_config_from_dict(self, config_data: Dict[str, Any]) -> None:
213
- """
214
- Update configuration from dictionary
215
-
216
- Args:
217
- config_data: Configuration dictionary
218
- """
219
- for section, values in config_data.items():
220
- if hasattr(self.config, section) and isinstance(values, dict):
221
- config_section = getattr(self.config, section)
222
- for key, value in values.items():
223
- if hasattr(config_section, key):
224
- setattr(config_section, key, value)
 
 
 
225
 
226
- def _validate_config(self) -> None:
227
- """Validate configuration settings"""
228
- # Validate paths
229
- if not self.config.paths.data_path.exists():
230
- print(f"Warning: Data directory does not exist: {self.config.paths.data_path}")
231
-
232
- # Validate model settings
233
- if self.config.processing.chunk_size <= self.config.processing.chunk_overlap:
234
- print("Warning: Chunk size should be larger than chunk overlap")
235
-
236
- # Validate thresholds
237
- if not 0 <= self.config.processing.similarity_threshold <= 1:
238
- print("Warning: Similarity threshold should be between 0 and 1")
239
 
240
- def get_config(self) -> AppConfig:
241
- """Get the current configuration"""
242
- return self.config
 
 
243
 
244
- def update_config(self, **kwargs) -> None:
245
- """
246
- Update configuration settings
247
-
248
- Args:
249
- **kwargs: Configuration updates
250
- """
251
- for key, value in kwargs.items():
252
- if hasattr(self.config, key):
253
- setattr(self.config, key, value)
254
 
255
- def update_processing_config(self, **kwargs) -> None:
256
- """
257
- Update processing configuration dynamically
258
-
259
- Args:
260
- **kwargs: Processing configuration parameters to update
261
- """
262
- for key, value in kwargs.items():
263
- if hasattr(self.config.processing, key):
264
- setattr(self.config.processing, key, value)
265
- else:
266
- print(f"Warning: Unknown processing config key: {key}")
267
 
268
- def update_api_config(self, **kwargs) -> None:
269
- """
270
- Update API configuration dynamically
271
-
272
- Args:
273
- **kwargs: API configuration parameters to update
274
- """
275
- for key, value in kwargs.items():
276
- if hasattr(self.config, key):
277
- setattr(self.config.api, key, value)
278
- else:
279
- print(f"Warning: Unknown API config key: {key}")
280
 
281
- def save_config(self, config_file: str) -> None:
282
- """
283
- Save current configuration to file
284
-
285
- Args:
286
- config_file: Path to save configuration
287
- """
288
- import json
289
- from dataclasses import asdict
290
-
291
- config_dict = asdict(self.config)
292
-
293
- # Remove Path objects and other non-serializable items
294
- config_dict = self._make_serializable(config_dict)
295
-
296
- with open(config_file, 'w') as f:
297
- json.dump(config_dict, f, indent=2)
298
 
299
- def _make_serializable(self, obj: Any) -> Any:
300
- """Make configuration dictionary serializable"""
301
- if isinstance(obj, dict):
302
- return {k: self._make_serializable(v) for k, v in obj.items()
303
- if not k.endswith('_path')} # Skip Path objects
304
- elif isinstance(obj, list):
305
- return [self._make_serializable(item) for item in obj]
306
- elif isinstance(obj, Path):
307
- return str(obj)
308
- else:
309
- return obj
310
 
311
 
312
  # Global configuration instance
313
- _config_manager: Optional[ConfigManager] = None
314
 
315
 
316
- def get_config() -> AppConfig:
317
- """
318
- Get the global configuration instance
319
-
320
- Returns:
321
- Application configuration
322
- """
323
- global _config_manager
324
- if _config_manager is None:
325
- _config_manager = ConfigManager()
326
- return _config_manager.get_config()
327
 
328
 
329
- def init_config(config_file: Optional[str] = None) -> ConfigManager:
330
  """
331
- Initialize global configuration
332
 
333
  Args:
334
- config_file: Optional configuration file path
 
 
335
 
336
  Returns:
337
- Configuration manager instance
338
  """
339
- global _config_manager
340
- _config_manager = ConfigManager(config_file)
341
- return _config_manager
342
-
343
-
344
- def update_config(**kwargs) -> None:
345
- """
346
- Update global configuration
347
 
348
- Args:
349
- **kwargs: Configuration updates
350
- """
351
- global _config_manager
352
- if _config_manager is None:
353
- _config_manager = ConfigManager()
354
- _config_manager.update_config(**kwargs)
355
-
356
-
357
- # Environment-specific configurations
358
- DEVELOPMENT_CONFIG = {
359
- "processing": {
360
- "batch_size": 50,
361
- "similarity_threshold": 0.3
362
- },
363
- "ui": {
364
- "layout": "wide"
365
- }
366
- }
367
-
368
- PRODUCTION_CONFIG = {
369
- "processing": {
370
- "batch_size": 100,
371
- "similarity_threshold": 0.35
372
- },
373
- "api": {
374
- "max_concurrent_requests": 20,
375
- "request_timeout": 60
376
- }
377
- }
378
-
379
- STREAMLIT_CLOUD_CONFIG = {
380
- "processing": {
381
- "batch_size": 100, # Optimized for performance
382
- "description_batch_size": 100, # Match summary batch size
383
- "max_text_length": 8000, # Higher limit for better quality
384
- "max_workers": 2, # Moderate parallelism for cloud
385
- "file_timeout": 30 # Standard timeout
386
- },
387
- "api": {
388
- "max_concurrent_requests": 30, # Good concurrency for cloud
389
- "base_delay": 0.1, # Fast delays
390
- "batch_base_delay": 0.05, # Very fast batches
391
- "request_timeout": 30
392
- }
393
- }
394
-
395
-
396
- def get_environment_config() -> Dict[str, Any]:
397
- """
398
- Get environment-specific configuration
399
 
400
- Returns:
401
- Environment configuration dictionary
402
- """
403
- env = os.getenv('ENVIRONMENT', 'development').lower()
404
 
405
- if env == 'production':
406
- return PRODUCTION_CONFIG
407
- elif env == 'streamlit_cloud':
408
- return STREAMLIT_CLOUD_CONFIG
409
- else:
410
- return DEVELOPMENT_CONFIG
411
-
412
-
413
- # Utility functions for common configuration access
414
- def get_model_config() -> ModelConfig:
415
- """Get model configuration"""
416
- return get_config().model
417
 
418
 
419
- def get_processing_config() -> ProcessingConfig:
420
- """Get processing configuration"""
421
- return get_config().processing
422
 
423
 
424
- def get_ui_config() -> UIConfig:
425
- """Get UI configuration"""
426
- return get_config().ui
427
 
428
 
429
- def get_path_config() -> PathConfig:
430
- """Get path configuration"""
431
- return get_config().paths
432
 
433
 
434
- def get_api_config() -> APIConfig:
435
- """Get API configuration"""
436
- return get_config().api
437
 
438
 
439
- def is_ai_enabled() -> bool:
440
- """Check if AI features are enabled (API key available)"""
441
- api_config = get_api_config()
442
- return api_config.anthropic_api_key is not None
443
 
444
 
445
- def get_supported_extensions() -> List[str]:
446
- """Get list of supported file extensions"""
447
- return get_processing_config().supported_file_extensions
448
 
449
 
450
- def update_processing_config(**kwargs) -> None:
451
- """Update processing configuration dynamically"""
452
- global _config_manager
453
- if _config_manager is None:
454
- _config_manager = ConfigManager()
455
- _config_manager.update_processing_config(**kwargs)
 
456
 
457
 
458
- def update_api_config(**kwargs) -> None:
459
- """Update API configuration dynamically"""
460
- global _config_manager
461
- if _config_manager is None:
462
- _config_manager = ConfigManager()
463
- _config_manager.update_api_config(**kwargs)
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ Configuration Module
4
 
5
+ Uses pydantic-settings for robust configuration management from environment variables.
 
6
  """
7
 
8
  import os
9
+ import sys
10
+ import logging
11
+ from datetime import datetime
12
  from pathlib import Path
13
+ from typing import List, Optional
14
+ from logging.handlers import RotatingFileHandler
15
+ from pydantic import BaseModel, Field
16
+ from pydantic_settings import BaseSettings, SettingsConfigDict
17
 
18
  # Fix tokenizers parallelism warning
19
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
20
 
21
+ # Streamlit import for utilities (conditional)
22
+ try:
23
+ import streamlit as st
24
+ STREAMLIT_AVAILABLE = True
25
+ except ImportError:
26
+ STREAMLIT_AVAILABLE = False
27
+ st = None
28
 
29
+
30
+ class ModelConfig(BaseModel):
31
+ """Model configuration settings"""
32
  sentence_transformer_model: str = "all-MiniLM-L6-v2"
33
  claude_model: str = "claude-sonnet-4-20250514"
34
  temperature: float = 0.3
35
  max_tokens: int = 2000
36
+
37
+
38
+ class ProcessingConfig(BaseModel):
39
+ """Processing configuration settings"""
40
+ batch_size: int = 20
41
+ description_batch_size: int = 25
42
+ max_workers: int = 4
43
+ chunk_size: int = 1000
44
+ chunk_overlap: int = 200
45
  similarity_threshold: float = 0.35
46
  relevancy_threshold: float = 0.5
47
  primary_threshold: float = 0.6
48
  min_display_threshold: float = 0.15
49
+ supported_file_extensions: List[str] = ['.pdf', '.docx', '.doc', '.txt', '.md']
50
+ faiss_store_name: str = "default"
51
+ skip_processed_files: bool = True
52
 
53
 
54
+ class UIConfig(BaseModel):
55
+ """UI configuration settings"""
 
56
  page_title: str = "AI Due Diligence"
57
  page_icon: str = "🤖"
58
  layout: str = "wide"
59
  top_k_search_results: int = 5
 
 
60
 
61
 
62
+ class PathsConfig(BaseModel):
63
+ """Paths configuration with computed properties"""
 
64
  data_dir: str = "data"
65
  checklist_dir: str = "data/checklist"
66
  questions_dir: str = "data/questions"
67
  strategy_dir: str = "data/strategy"
68
  vdrs_dir: str = "data/vdrs"
69
+ faiss_dir: str = "data/enhanced_faiss"
70
+
71
+ @property
72
+ def data_path(self) -> Path:
73
+ return Path(self.data_dir)
74
+
75
+ @property
76
+ def checklist_path(self) -> Path:
77
+ return Path(self.checklist_dir)
78
+
79
+ @property
80
+ def questions_path(self) -> Path:
81
+ return Path(self.questions_dir)
82
+
83
+ @property
84
+ def strategy_path(self) -> Path:
85
+ return Path(self.strategy_dir)
86
 
87
+ @property
88
+ def vdrs_path(self) -> Path:
89
+ return Path(self.vdrs_dir)
 
90
 
91
+ @property
92
+ def faiss_path(self) -> Path:
93
+ return Path(self.faiss_dir)
 
 
94
 
95
 
96
+ class APIConfig(BaseModel):
97
+ """API configuration settings"""
98
+ anthropic_api_key: Optional[str] = None
99
+ max_concurrent_requests: int = 10
100
+
101
+
102
+ class Config(BaseSettings):
103
+ """Main application configuration using pydantic-settings"""
104
 
105
+ model_config = SettingsConfigDict(
106
+ env_file=".env",
107
+ env_file_encoding="utf-8",
108
+ env_nested_delimiter="__",
109
+ case_sensitive=False,
110
+ extra="ignore" # Allow extra environment variables to be ignored
111
+ )
112
 
113
+ # Model settings
114
+ sentence_transformer_model: str = Field(default="all-MiniLM-L6-v2", env="SENTENCE_TRANSFORMER_MODEL")
115
+ claude_model: str = Field(default="claude-sonnet-4-20250514", env="CLAUDE_MODEL")
116
+ temperature: float = Field(default=0.3, env="CLAUDE_TEMPERATURE")
117
+ max_tokens: int = Field(default=2000, env="CLAUDE_MAX_TOKENS")
118
 
119
+ # Processing settings (optimized for large datasets)
120
+ batch_size: int = Field(default=20, env="BATCH_SIZE")
121
+ description_batch_size: int = Field(default=25, env="DESCRIPTION_BATCH_SIZE")
122
+ max_workers: int = Field(default=4, env="MAX_WORKERS")
123
+ chunk_size: int = Field(default=1000, env="CHUNK_SIZE")
124
+ chunk_overlap: int = Field(default=200, env="CHUNK_OVERLAP")
125
+ similarity_threshold: float = Field(default=0.35, env="SIMILARITY_THRESHOLD")
126
+ relevancy_threshold: float = Field(default=0.5, env="RELEVANCY_THRESHOLD")
127
+ primary_threshold: float = Field(default=0.6, env="PRIMARY_THRESHOLD")
128
+ min_display_threshold: float = Field(default=0.15, env="MIN_DISPLAY_THRESHOLD")
129
+ supported_file_extensions: List[str] = Field(
130
+ default=['.pdf', '.docx', '.doc', '.txt', '.md'],
131
+ env="SUPPORTED_FILE_EXTENSIONS"
132
+ )
133
+ faiss_store_name: str = Field(default="default", env="FAISS_STORE_NAME")
134
+ skip_processed_files: bool = Field(default=True, env="SKIP_PROCESSED_FILES")
135
 
136
+ # Logging settings
137
+ log_level: str = Field(default="INFO", env="LOG_LEVEL")
138
+ suppress_langchain_warnings: bool = Field(default=True, env="SUPPRESS_LANGCHAIN_WARNINGS")
139
 
140
+ # UI settings
141
+ page_title: str = Field(default="AI Due Diligence", env="PAGE_TITLE")
142
+ page_icon: str = Field(default="🤖", env="PAGE_ICON")
143
+ layout: str = Field(default="wide", env="LAYOUT")
144
+ top_k_search_results: int = Field(default=5, env="TOP_K_SEARCH_RESULTS")
145
 
146
+ # Path settings
147
+ data_dir: str = Field(default="data", env="DATA_DIR")
148
+ checklist_dir: str = Field(default="data/checklist", env="CHECKLIST_DIR")
149
+ questions_dir: str = Field(default="data/questions", env="QUESTIONS_DIR")
150
+ strategy_dir: str = Field(default="data/strategy", env="STRATEGY_DIR")
151
+ vdrs_dir: str = Field(default="data/vdrs", env="VDRS_DIR")
152
+ faiss_dir: str = Field(default="data/enhanced_faiss", env="FAISS_DIR")
153
 
154
+ # API settings
155
+ anthropic_api_key: Optional[str] = Field(default=None, env="ANTHROPIC_API_KEY")
156
+ max_concurrent_requests: int = Field(default=10, env="MAX_CONCURRENT_REQUESTS")
157
 
158
+ @property
159
+ def model(self) -> ModelConfig:
160
+ """Get model configuration"""
161
+ return ModelConfig(
162
+ sentence_transformer_model=self.sentence_transformer_model,
163
+ claude_model=self.claude_model,
164
+ temperature=self.temperature,
165
+ max_tokens=self.max_tokens
166
+ )
167
 
168
+ @property
169
+ def processing(self) -> ProcessingConfig:
170
+ """Get processing configuration"""
171
+ return ProcessingConfig(
172
+ batch_size=self.batch_size,
173
+ description_batch_size=self.description_batch_size,
174
+ max_workers=self.max_workers,
175
+ chunk_size=self.chunk_size,
176
+ chunk_overlap=self.chunk_overlap,
177
+ similarity_threshold=self.similarity_threshold,
178
+ relevancy_threshold=self.relevancy_threshold,
179
+ primary_threshold=self.primary_threshold,
180
+ min_display_threshold=self.min_display_threshold,
181
+ supported_file_extensions=self.supported_file_extensions,
182
+ faiss_store_name=self.faiss_store_name,
183
+ skip_processed_files=self.skip_processed_files
184
+ )
185
 
186
+ @property
187
+ def ui(self) -> UIConfig:
188
+ """Get UI configuration"""
189
+ return UIConfig(
190
+ page_title=self.page_title,
191
+ page_icon=self.page_icon,
192
+ layout=self.layout,
193
+ top_k_search_results=self.top_k_search_results
194
+ )
195
+
196
+ @property
197
+ def paths(self) -> PathsConfig:
198
+ """Get paths configuration"""
199
+ return PathsConfig(
200
+ data_dir=self.data_dir,
201
+ checklist_dir=self.checklist_dir,
202
+ questions_dir=self.questions_dir,
203
+ strategy_dir=self.strategy_dir,
204
+ vdrs_dir=self.vdrs_dir,
205
+ faiss_dir=self.faiss_dir
206
+ )
207
+
208
+ @property
209
+ def api(self) -> APIConfig:
210
+ """Get API configuration"""
211
+ return APIConfig(
212
+ anthropic_api_key=self.anthropic_api_key,
213
+ max_concurrent_requests=self.max_concurrent_requests
214
+ )
215
 
216
 
217
  # Global configuration instance
218
+ _config: Optional[Config] = None
219
+
220
+
221
+ def get_config() -> Config:
222
+ """Get the global configuration instance"""
223
+ global _config
224
+ if _config is None:
225
+ _config = Config()
226
+ return _config
227
+
228
+
229
+ def init_config(config_file: Optional[str] = None) -> Config:
230
+ """Initialize global configuration"""
231
+ global _config
232
+ _config = Config()
233
+ return _config
234
 
235
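For reference, a minimal usage sketch (not part of the diff; values are illustrative) of how the pydantic-settings Config above is consumed: environment variables override the flat field defaults, and the grouped properties expose the same values in the sectioned shape used elsewhere in the code. It assumes the package re-exports get_config from src, as the app module's imports suggest.

import os

# Hypothetical overrides -- set them before src is first imported, because the
# Config object reads the environment when it is instantiated.
os.environ["CHUNK_SIZE"] = "1500"            # default is 1000
os.environ["SIMILARITY_THRESHOLD"] = "0.40"  # default is 0.35

from src import get_config

config = get_config()
print(config.chunk_size)             # 1500 (flat field)
print(config.processing.chunk_size)  # 1500 (grouped ProcessingConfig view)
print(config.paths.faiss_path)       # Path("data/enhanced_faiss")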
 
236
 
237
 
238
+
239
+ # =============================================================================
240
+ # LOGGING UTILITIES - Merged from utils.py
241
+ # =============================================================================
242
+
243
+ def setup_logging(
244
+ name: str = "dd_checklist",
245
+ log_level: Optional[str] = None,
246
+ log_file: Optional[str] = None
247
+ ) -> logging.Logger:
248
  """
249
+ Set up standard Python logging with rotating file handler
250
 
251
  Args:
252
+ name: Logger name
253
+ log_level: Logging level
254
+ log_file: Optional log file path
255
 
256
  Returns:
257
+ Configured logger instance
258
  """
259
+ logger = logging.getLogger(name)
 
260
 
261
+ # Avoid duplicate setup if logger already has handlers
262
+ if logger.handlers:
263
+ return logger
264
 
265
+ # Use configured log level if not provided
266
+ if log_level is None:
267
+ try:
268
+ config = get_config()
269
+ log_level = config.log_level
270
+ except Exception:
271
+ log_level = "INFO" # fallback
272
+
273
+ logger.setLevel(getattr(logging, log_level.upper()))
274
 
275
+ # Console handler
276
+ console_handler = logging.StreamHandler(sys.stdout)
277
+ console_formatter = logging.Formatter(
278
+ '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
279
+ )
280
+ console_handler.setFormatter(console_formatter)
281
+ logger.addHandler(console_handler)
282
+
283
+ # Rotating file handler (if possible)
284
+ if log_file or True: # Always try to set up file logging
285
+ try:
286
+ log_dir = Path(".logs")
287
+ log_dir.mkdir(exist_ok=True)
288
+
289
+ if not log_file:
290
+ log_file = log_dir / f"dd_checklist_{datetime.now().strftime('%Y%m%d')}.log"
291
+
292
+ # Use RotatingFileHandler for better log management
293
+ file_handler = RotatingFileHandler(
294
+ log_file,
295
+ maxBytes=10 * 1024 * 1024, # 10MB
296
+ backupCount=5
297
+ )
298
+ file_formatter = logging.Formatter(
299
+ '%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s'
300
+ )
301
+ file_handler.setFormatter(file_formatter)
302
+ logger.addHandler(file_handler)
303
+ except Exception:
304
+ # File logging not available (e.g., on Streamlit Cloud)
305
+ pass
306
+
307
+ return logger
308
 
309
 
310
+ # Global logger instance
311
+ logger = setup_logging()
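Any module can then reuse this shared logger; messages go to stdout and, when the filesystem is writable, to a rotating file under .logs/. A tiny illustrative sketch (assumes logger is re-exported from the src package, as in the app module):

from src import logger

logger.info("Data room processing started")  # console + .logs/dd_checklist_<YYYYMMDD>.log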
 
312
 
313
 
314
+ # =============================================================================
315
+ # STREAMLIT UTILITIES - Merged from utils.py
316
+ # =============================================================================
317
 
318
+ def show_success(message: str):
319
+ """Show success message in Streamlit"""
320
+ if STREAMLIT_AVAILABLE and st:
321
+ st.success(message)
322
+ logger.info(message)
323
 
 
 
 
324
 
325
+ def show_info(message: str):
326
+ """Show info message in Streamlit"""
327
+ if STREAMLIT_AVAILABLE and st:
328
+ st.info(message)
329
+ logger.info(message)
330
 
 
 
 
331
 
332
+ def show_error(message: str):
333
+ """Show error message in Streamlit"""
334
+ if STREAMLIT_AVAILABLE and st:
335
+ st.error(message)
336
+ logger.error(message)
337
 
 
 
 
 
338
 
339
+ # =============================================================================
340
+ # FILE UTILITIES - Common patterns extracted for reuse
341
+ # =============================================================================
342
 
343
+ def get_mime_type(file_path: Path) -> str:
344
+ """Get MIME type based on file extension"""
345
+ file_extension = file_path.suffix.lower()
346
+ if file_extension == '.pdf':
347
+ return 'application/pdf'
348
+ elif file_extension in ['.doc', '.docx']:
349
+ return 'application/msword'
350
+ elif file_extension == '.txt':
351
+ return 'text/plain'
352
+ elif file_extension == '.md':
353
+ return 'text/markdown'
354
+ else:
355
+ return 'application/octet-stream'
356
 
357
 
358
+ def format_document_title(doc_name: str) -> str:
359
+ """Format document name into a readable title"""
360
+ if '.' in doc_name:
361
+ doc_title = doc_name.rsplit('.', 1)[0].replace('_', ' ').replace('-', ' ').title()
362
+ else:
363
+ doc_title = doc_name.replace('_', ' ').replace('-', ' ').title()
364
+ return doc_title
365
 
366
 
367
+ def count_documents_in_directory(directory: Path, supported_extensions: Optional[List[str]] = None) -> int:
368
+ """Count supported documents in a directory recursively"""
369
+ if supported_extensions is None:
370
+ supported_extensions = ['.pdf', '.docx', '.doc', '.txt', '.md']
371
+
372
+ return sum(1 for f in directory.rglob('*')
373
+ if f.is_file() and f.suffix.lower() in supported_extensions)
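To illustrate the merged file helpers above (the paths below are hypothetical, and the import path assumes these helpers live in src/config.py as this diff suggests):

from pathlib import Path
from src.config import get_mime_type, format_document_title, count_documents_in_directory

doc = Path("data/vdrs/acme/Share_Purchase-Agreement.pdf")   # hypothetical file
print(get_mime_type(doc))               # 'application/pdf'
print(format_document_title(doc.name))  # 'Share Purchase Agreement'
print(count_documents_in_directory(Path("data/vdrs")))  # recursive count of supported files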
src/document_processing.py CHANGED
@@ -1,52 +1,78 @@
1
  #!/usr/bin/env python3
2
  """
3
- Document Processing Module
4
 
5
- This module handles all document-related operations including:
6
- - File text extraction from various formats (PDF, DOCX, TXT, MD)
7
- - Document scanning and indexing
8
- - Semantic text chunking for RAG with better context preservation
9
- - Document metadata handling
10
  """
11
 
12
  import os
 
13
  # Fix tokenizers parallelism warning
14
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
15
 
16
- import fitz # PyMuPDF
17
- import docx
18
- import io
19
- import re
20
- from pathlib import Path
21
- from typing import Dict, List, Tuple, Optional
22
- import streamlit as st
23
- import numpy as np
24
- from sentence_transformers import SentenceTransformer
25
- import concurrent.futures
26
- import threading
27
  import logging
28
- from functools import wraps
29
- import joblib
30
- import hashlib
31
- import time
32
- import faiss
33
 
34
- # Semantic chunking
35
  from langchain_text_splitters import RecursiveCharacterTextSplitter
36
 
37
  # Import configuration
38
  from .config import get_config
39
 
40
- # Setup logging for thread-safe error handling
 
 
41
  logger = logging.getLogger(__name__)
42
 
43
- # Thread-safe context management for Streamlit
44
- try:
45
- from streamlit.runtime.scriptrunner import add_script_run_ctx, get_script_run_ctx
46
- STREAMLIT_CONTEXT_AVAILABLE = True
47
- except ImportError:
48
- STREAMLIT_CONTEXT_AVAILABLE = False
49
- logger.warning("Streamlit context management not available")
50
 
51
 
52
  def escape_markdown_math(text: str) -> str:
@@ -63,755 +89,331 @@ def escape_markdown_math(text: str) -> str:
63
  return text
64
 
65
 
66
- def extract_text_from_file(file_path: Path, progress_callback=None) -> Tuple[str, Dict]:
67
- """
68
- Extract text from file with metadata
69
-
70
- Args:
71
- file_path: Path to the file to extract text from
72
-
73
- Returns:
74
- Tuple of (text_content, metadata)
75
- """
76
- metadata = {'pages': [], 'type': 'unknown'}
77
- text_content = ""
78
-
79
- try:
80
- if file_path.suffix.lower() == '.pdf':
81
- # Use PyMuPDF (fitz) for faster and more robust PDF processing
82
- try:
83
- pdf_document = fitz.open(str(file_path))
84
- texts = []
85
-
86
- for page_num in range(pdf_document.page_count):
87
- try:
88
- page = pdf_document[page_num]
89
- page_text = page.get_text()
90
-
91
- if page_text.strip(): # Only add non-empty pages
92
- texts.append(page_text)
93
- metadata['pages'].append(page_num + 1) # 1-based page numbering
94
- except Exception as page_error:
95
- # Handle individual page errors gracefully
96
- logger.warning(f"Error reading page {page_num + 1} of {file_path.name}: {page_error}")
97
- if st and hasattr(st, 'session_state'):
98
- # Only use streamlit in main thread context
99
- try:
100
- st.warning(f"Error reading page {page_num + 1} of {file_path.name}: {page_error}")
101
- except Exception:
102
- pass
103
- continue
104
-
105
- pdf_document.close()
106
- text_content = '\n'.join(texts)[:10000]
107
- metadata['type'] = 'pdf'
108
-
109
- except Exception as pdf_error:
110
- # Handle corrupted or unsupported PDF files
111
- error_msg = f"Error processing PDF {file_path.name}: {pdf_error}"
112
- logger.error(error_msg)
113
- if st and hasattr(st, 'session_state'):
114
- # Only use streamlit in main thread context
115
- try:
116
- st.error(error_msg)
117
- except Exception:
118
- pass
119
- # Try to return partial content if available
120
- if 'pdf_document' in locals():
121
- try:
122
- pdf_document.close()
123
- except:
124
- pass
125
- return "", metadata
126
-
127
- elif file_path.suffix.lower() in ['.docx', '.doc']:
128
- doc = docx.Document(str(file_path))
129
- text_content = '\n'.join(p.text for p in doc.paragraphs)[:10000]
130
- metadata['type'] = 'docx'
131
-
132
- elif file_path.suffix.lower() in ['.txt', '.md']:
133
- text_content = file_path.read_text(encoding='utf-8', errors='ignore')[:10000]
134
- metadata['type'] = 'text'
135
-
136
- except Exception as e:
137
- error_msg = f"Could not read {file_path.name}: {e}"
138
- logger.warning(error_msg)
139
- if st and hasattr(st, 'session_state'): # Only use streamlit if available and in main thread
140
- try:
141
- st.warning(error_msg)
142
- except Exception:
143
- pass
144
-
145
- # Call progress callback if provided (for parallel processing tracking)
146
- if progress_callback:
147
- try:
148
- progress_callback(file_path.name)
149
- except Exception:
150
- pass # Don't let callback errors affect processing
151
-
152
- return text_content, metadata
153
-
154
-
155
- def _process_file_with_context(args):
156
- """
157
- Thread-safe file processing function with proper context management
158
-
159
- Args:
160
- args: Tuple of (file_path, base_path, progress_callback)
161
-
162
- Returns:
163
- Tuple of (file_path_str, document_info) or None if failed
164
- """
165
- file_path, base_path, progress_callback = args
166
-
167
- try:
168
- # Extract text from file
169
- text, metadata = extract_text_from_file(file_path, progress_callback)
170
-
171
- if text:
172
- # Store relative path for display
173
- rel_path = file_path.relative_to(base_path)
174
- document_info = {
175
- 'text': text,
176
- 'content': text, # Alias for backward compatibility
177
- 'name': file_path.name,
178
- 'rel_path': str(rel_path),
179
- 'metadata': metadata
180
- }
181
- return str(file_path), document_info
182
- except Exception as e:
183
- logger.error(f"Error processing file {file_path.name}: {e}")
184
-
185
- return None
186
-
187
-
188
- def scan_data_room(data_room_path: str, max_workers: Optional[int] = None, progress_callback=None) -> Dict[str, Dict]:
189
- """
190
- Scan entire data room directory for documents using parallel processing
191
-
192
- Args:
193
- data_room_path: Path to the data room directory
194
- max_workers: Maximum number of worker threads (uses config default if None)
195
- progress_callback: Optional callback function for progress updates
196
-
197
- Returns:
198
- Dictionary mapping file paths to document information
199
- """
200
- config = get_config()
201
- if max_workers is None:
202
- max_workers = config.processing.max_workers
203
-
204
- documents = {}
205
- path = Path(data_room_path)
206
-
207
- if not path.exists():
208
- return documents
209
-
210
- # Collect all document files first
211
- file_paths = []
212
- for file_path in path.rglob('*'):
213
- if file_path.is_file() and not file_path.name.startswith('.'):
214
- if file_path.suffix.lower() in config.processing.supported_file_extensions:
215
- file_paths.append(file_path)
216
-
217
- if not file_paths:
218
- return documents
219
-
220
- logger.info(f"Processing {len(file_paths)} files with {max_workers} workers")
221
-
222
- # Prepare arguments for parallel processing
223
- process_args = [(file_path, path, progress_callback) for file_path in file_paths]
224
-
225
- # Process files in parallel
226
- processed_count = 0
227
- failed_count = 0
228
-
229
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
230
- # Submit all tasks
231
- future_to_file = {}
232
-
233
- for args in process_args:
234
- future = executor.submit(_process_file_with_context, args)
235
-
236
- # Add Streamlit context if available
237
- if STREAMLIT_CONTEXT_AVAILABLE:
238
- try:
239
- script_ctx = get_script_run_ctx()
240
- if script_ctx:
241
- add_script_run_ctx(future)
242
- except Exception as e:
243
- logger.warning(f"Could not add script context: {e}")
244
-
245
- future_to_file[future] = args[0] # Store file_path for reference
246
-
247
- # Collect results as they complete
248
- for future in concurrent.futures.as_completed(future_to_file):
249
- try:
250
- result = future.result(timeout=config.processing.file_timeout)
251
- if result:
252
- file_path_str, document_info = result
253
- documents[file_path_str] = document_info
254
- processed_count += 1
255
- else:
256
- failed_count += 1
257
- except concurrent.futures.TimeoutError:
258
- file_path = future_to_file[future]
259
- logger.error(f"Timeout processing file: {file_path.name}")
260
- failed_count += 1
261
- except Exception as e:
262
- file_path = future_to_file[future]
263
- logger.error(f"Error processing file {file_path.name}: {e}")
264
- failed_count += 1
265
-
266
- logger.info(f"Completed processing: {processed_count} successful, {failed_count} failed")
267
- return documents
268
-
269
-
270
- def create_chunks_with_metadata(documents: Dict[str, Dict], chunk_size: int = 2000, overlap: int = 200) -> List[Dict]:
271
- """
272
- Create searchable chunks with semantic splitting and full metadata.
273
- Uses RecursiveCharacterTextSplitter for better context preservation.
274
-
275
- Args:
276
- documents: Dictionary of documents
277
- chunk_size: Size of each chunk in characters (default: 2000 for ~400 words)
278
- overlap: Overlap between chunks in characters (default: 200 for ~50 words)
279
-
280
- Returns:
281
- List of chunk dictionaries with metadata
282
  """
283
- chunks = []
284
-
285
- # Initialize semantic text splitter with hierarchical separators
286
- # This preserves document structure by prioritizing paragraph breaks,
287
- # then sentences, then words
288
- text_splitter = RecursiveCharacterTextSplitter(
289
- chunk_size=chunk_size,
290
- chunk_overlap=overlap,
291
- separators=["\n\n", "\n", ".", "!", "?", ",", " "],
292
- length_function=len,
293
- is_separator_regex=False,
294
- )
295
 
296
- for doc_path, doc_info in documents.items():
297
- text = doc_info['text']
298
-
299
- if not text.strip():
300
- continue
301
-
302
- # Split text using semantic boundaries
303
- semantic_chunks = text_splitter.split_text(text)
304
-
305
- # Create chunks with metadata
306
- for i, chunk_text in enumerate(semantic_chunks):
307
- if chunk_text.strip():
308
- chunks.append({
309
- 'text': chunk_text.strip(),
310
- 'source': doc_info['name'],
311
- 'path': doc_info['rel_path'],
312
- 'full_path': doc_path,
313
- 'chunk_id': f"semantic_chunk_{i}",
314
- 'metadata': doc_info['metadata']
315
- })
316
-
317
- return chunks
318
-
319
-
320
- def create_embeddings_batch(texts: List[str], model: SentenceTransformer, batch_size: Optional[int] = None) -> np.ndarray:
321
  """
322
- Create embeddings for texts in batches for better performance
323
 
324
- Args:
325
- texts: List of texts to embed
326
- model: SentenceTransformer model
327
- batch_size: Batch size for processing
328
 
329
- Returns:
330
- NumPy array of embeddings
331
- """
332
- # Set default batch_size from config if None
333
- if batch_size is None:
334
  config = get_config()
335
- batch_size = config.processing.batch_size
336
-
337
- embeddings_list = []
338
-
339
- for i in range(0, len(texts), batch_size):
340
- batch = texts[i:i + batch_size]
341
- batch_embeddings = model.encode(batch)
342
- embeddings_list.append(batch_embeddings)
343
-
344
- return np.vstack(embeddings_list) if embeddings_list else np.array([])
345
-
346
-
347
- def search_documents_with_faiss(
348
- query: str,
349
- chunks: List[Dict],
350
- faiss_index: faiss.IndexFlatIP,
351
- model: SentenceTransformer,
352
- top_k: int = 5,
353
- threshold: Optional[float] = None
354
- ) -> List[Dict]:
355
- """
356
- Search documents using FAISS IndexFlatIP for fast similarity search
357
-
358
- Args:
359
- query: Search query
360
- chunks: List of document chunks
361
- faiss_index: FAISS index with embeddings
362
- model: SentenceTransformer model
363
- top_k: Number of top results to return
364
- threshold: Minimum similarity threshold (uses config default if None)
365
-
366
- Returns:
367
- List of search results with citations
368
- """
369
- if not chunks or faiss_index is None:
370
- return []
371
-
372
- config = get_config()
373
- if threshold is None:
374
- threshold = config.processing.similarity_threshold
375
-
376
- # Encode query and normalize for inner product similarity
377
- query_embedding = model.encode(query).astype('float32')
378
- query_embedding = query_embedding.reshape(1, -1)
379
-
380
- # Normalize for cosine similarity using inner product
381
- faiss.normalize_L2(query_embedding)
382
-
383
- # Search using FAISS (much faster than numpy)
384
- scores, indices = faiss_index.search(query_embedding, min(top_k * 2, len(chunks)))
385
-
386
- results = []
387
- seen_texts = set()
388
-
389
- for score, idx in zip(scores[0], indices[0]):
390
- if idx == -1 or score < threshold: # -1 indicates no more results
391
- continue
392
-
393
- # Avoid duplicates
394
- text_preview = chunks[idx]['text'][:100]
395
- if text_preview not in seen_texts:
396
- seen_texts.add(text_preview)
397
-
398
- # Format citation based on file type
399
- metadata = chunks[idx]['metadata']
400
- if metadata['type'] == 'pdf' and metadata.get('pages'):
401
- citation = f"page {metadata['pages'][0]}"
402
- else:
403
- citation = "document"
404
-
405
- results.append({
406
- 'text': chunks[idx]['text'],
407
- 'source': chunks[idx]['source'],
408
- 'path': chunks[idx]['path'],
409
- 'full_path': chunks[idx].get('full_path', ''),
410
- 'citation': citation,
411
- 'score': float(score)
412
- })
413
-
414
- if len(results) >= top_k:
415
- break
416
-
417
- return results
418
-
419
-
420
- def search_documents_with_citations(
421
- query: str,
422
- chunks: List[Dict],
423
- embeddings: np.ndarray,
424
- model: SentenceTransformer,
425
- top_k: int = 5,
426
- threshold: Optional[float] = None
427
- ) -> List[Dict]:
428
- """
429
- Legacy search documents function - kept for backward compatibility
430
- Creates temporary FAISS index and uses FAISS search for better performance
431
-
432
- Args:
433
- query: Search query
434
- chunks: List of document chunks
435
- embeddings: Precomputed embeddings for chunks
436
- model: SentenceTransformer model
437
- top_k: Number of top results to return
438
- threshold: Minimum similarity threshold
439
 
440
- Returns:
441
- List of search results with citations
442
- """
443
- if not chunks:
444
- return []
445
-
446
- # Create temporary FAISS index for better performance
447
- embeddings_f32 = embeddings.astype('float32')
448
- faiss.normalize_L2(embeddings_f32) # Normalize for cosine similarity
449
-
450
- index = faiss.IndexFlatIP(embeddings_f32.shape[1])
451
- index.add(embeddings_f32)
452
 
453
- return search_documents_with_faiss(query, chunks, index, model, top_k, threshold)
454
-
455
-
456
- def create_progress_tracker(total_files: int = 0, streamlit_progress_bar=None):
457
- """
458
- Create a thread-safe progress tracking function
459
 
460
- Args:
461
- total_files: Total number of files to process
462
- streamlit_progress_bar: Optional Streamlit progress bar
 
463
 
464
- Returns:
465
- Progress callback function
466
- """
467
- processed_count = [0] # Use list for mutable counter in closure
468
- lock = threading.Lock()
469
-
470
- def progress_callback(filename: str = None):
471
- with lock:
472
- processed_count[0] += 1
473
- progress = processed_count[0] / max(total_files, 1)
474
-
475
- if streamlit_progress_bar and hasattr(st, 'session_state'):
476
- try:
477
- streamlit_progress_bar.progress(
478
- min(progress, 1.0),
479
- text=f"Processing {filename or 'documents'}... ({processed_count[0]}/{total_files})"
480
- )
481
- except Exception:
482
- pass # Don't let UI errors affect processing
483
-
484
- return progress_callback
485
-
486
-
487
- def _generate_cache_key(documents: Dict[str, Dict]) -> str:
488
- """
489
- Generate a cache key based on document paths and modification times
490
-
491
- Args:
492
- documents: Dictionary of documents with file paths
493
 
494
- Returns:
495
- Cache key string
496
- """
497
- # Create a hash based on file paths and their modification times
498
- cache_data = []
499
-
500
- for file_path, doc_info in documents.items():
501
  try:
502
- path_obj = Path(file_path)
503
- if path_obj.exists():
504
- mtime = path_obj.stat().st_mtime
505
- cache_data.append(f"{file_path}:{mtime}")
506
  except Exception as e:
507
- logger.warning(f"Could not get modification time for {file_path}: {e}")
508
- # Use current time as fallback
509
- cache_data.append(f"{file_path}:{time.time()}")
510
-
511
- # Sort to ensure consistent hashing regardless of document order
512
- cache_data.sort()
513
- cache_string = "|".join(cache_data)
514
-
515
- # Generate MD5 hash for the cache key
516
- return hashlib.md5(cache_string.encode('utf-8')).hexdigest()
517
-
518
-
519
- def _get_cache_dir() -> Path:
520
- """Get or create the cache directory"""
521
- cache_dir = Path(".cache")
522
- cache_dir.mkdir(exist_ok=True)
523
- return cache_dir
524
-
525
-
526
- def _save_embeddings_to_cache(cache_key: str, embeddings: np.ndarray, chunks: List[Dict]) -> bool:
527
- """
528
- Save embeddings and chunks to cache
529
-
530
- Args:
531
- cache_key: Cache key for the data
532
- embeddings: Embeddings array to cache
533
- chunks: Document chunks to cache
534
-
535
- Returns:
536
- True if successful, False otherwise
537
- """
538
- try:
539
- cache_dir = _get_cache_dir()
540
- cache_file = cache_dir / f"embeddings_{cache_key}.joblib"
541
-
542
- cache_data = {
543
- 'embeddings': embeddings,
544
- 'chunks': chunks,
545
- 'timestamp': time.time(),
546
- 'cache_key': cache_key
547
- }
548
-
549
- joblib.dump(cache_data, cache_file, compress=3)
550
- logger.info(f"Saved embeddings to cache: {cache_file}")
551
- return True
552
-
553
- except Exception as e:
554
- logger.error(f"Failed to save embeddings to cache: {e}")
555
- return False
556
-
557
-
558
- def _load_embeddings_from_cache(cache_key: str) -> Tuple[Optional[np.ndarray], Optional[List[Dict]]]:
559
- """
560
- Load embeddings and chunks from cache
561
 
562
- Args:
563
- cache_key: Cache key for the data
564
-
565
- Returns:
566
- Tuple of (embeddings, chunks) or (None, None) if not found
567
- """
568
- try:
569
- cache_dir = _get_cache_dir()
570
- cache_file = cache_dir / f"embeddings_{cache_key}.joblib"
571
-
572
- if not cache_file.exists():
573
- return None, None
574
-
575
- cache_data = joblib.load(cache_file)
576
 
577
- # Validate cache data structure
578
- if not all(key in cache_data for key in ['embeddings', 'chunks', 'timestamp', 'cache_key']):
579
- logger.warning(f"Invalid cache data structure in {cache_file}")
580
- return None, None
581
-
582
- # Check if cache key matches (additional validation)
583
- if cache_data['cache_key'] != cache_key:
584
- logger.warning(f"Cache key mismatch in {cache_file}")
585
- return None, None
586
 
587
- logger.info(f"Loaded embeddings from cache: {cache_file}")
588
- return cache_data['embeddings'], cache_data['chunks']
589
-
590
- except Exception as e:
591
- logger.error(f"Failed to load embeddings from cache: {e}")
592
- return None, None
593
-
594
-
595
- def _invalidate_old_cache_files(max_age_days: int = 7) -> None:
596
- """
597
- Remove old cache files to prevent cache directory from growing too large
598
-
599
- Args:
600
- max_age_days: Maximum age of cache files in days
601
- """
602
- try:
603
- cache_dir = _get_cache_dir()
604
- current_time = time.time()
605
- max_age_seconds = max_age_days * 24 * 60 * 60
606
-
607
- for cache_file in cache_dir.glob("embeddings_*.joblib"):
608
- try:
609
- file_age = current_time - cache_file.stat().st_mtime
610
- if file_age > max_age_seconds:
611
- cache_file.unlink()
612
- logger.info(f"Removed old cache file: {cache_file}")
613
- except Exception as e:
614
- logger.warning(f"Could not remove old cache file {cache_file}: {e}")
615
-
616
- except Exception as e:
617
- logger.error(f"Failed to invalidate old cache files: {e}")
618
-
619
-
620
- class DocumentProcessor:
621
- """
622
- Main document processing class that orchestrates document operations with parallel processing support
623
- Enhanced with FAISS for 10x faster similarity search
624
- """
625
-
626
- def __init__(self, model: Optional[SentenceTransformer] = None):
627
- """
628
- Initialize the document processor
629
-
630
- Args:
631
- model: SentenceTransformer model for embeddings (optional)
632
- """
633
- self.model = model
634
- self.documents = {}
635
- self.chunks = []
636
- self.embeddings = None
637
- self.faiss_index = None # FAISS index for fast similarity search
638
- self.performance_stats = {} # Track performance metrics
639
 
640
- def load_data_room(self, data_room_path: str, max_workers: Optional[int] = None, progress_callback=None) -> Dict[str, any]:
641
  """
642
- Load and process an entire data room with parallel processing
643
 
644
  Args:
645
  data_room_path: Path to the data room directory
646
- max_workers: Maximum number of worker threads (uses config default if None)
647
- progress_callback: Optional callback function for progress updates
648
 
649
  Returns:
650
  Dictionary with processing results including performance metrics
651
  """
652
  import time
 
 
653
  config = get_config()
654
- if max_workers is None:
655
- max_workers = config.processing.max_workers
656
 
657
- start_time = time.time()
 
 
658
 
659
- logger.info(f"Starting data room processing: {data_room_path}")
660
 
661
- # Scan documents with parallel processing
662
- self.documents = scan_data_room(
663
- data_room_path,
664
- max_workers=max_workers,
665
- progress_callback=progress_callback
666
- )
667
 
668
  scan_time = time.time() - start_time
669
- logger.info(f"Document scanning completed in {scan_time:.2f} seconds")
670
 
671
- # Create chunks
672
  chunk_start = time.time()
673
- self.chunks = create_chunks_with_metadata(self.documents)
674
  chunk_time = time.time() - chunk_start
 
675
 
676
- # Create embeddings if model is available
677
  embedding_time = 0
678
- cache_hit = False
679
-
680
- if self.model and self.chunks:
681
  embedding_start = time.time()
682
 
683
- # Try to load from cache first
684
- cache_key = _generate_cache_key(self.documents)
685
- cached_embeddings, cached_chunks = _load_embeddings_from_cache(cache_key)
686
-
687
- if cached_embeddings is not None and cached_chunks is not None:
688
- # Cache hit - use cached data
689
- self.embeddings = cached_embeddings
690
- # Verify chunks match (safety check)
691
- if len(cached_chunks) == len(self.chunks):
692
- self.chunks = cached_chunks
693
- cache_hit = True
694
- logger.info(f"Loaded embeddings from cache (key: {cache_key[:8]}...)")
695
- # Build FAISS index from cached embeddings
696
- self._build_faiss_index()
697
- else:
698
- logger.warning("Cached chunks length mismatch, regenerating embeddings")
699
 
700
- if not cache_hit:
701
- # Cache miss or invalid - generate new embeddings
702
- texts = [chunk['text'] for chunk in self.chunks]
703
- self.embeddings = create_embeddings_batch(texts, self.model)
704
-
705
- # Save to cache
706
- if _save_embeddings_to_cache(cache_key, self.embeddings, self.chunks):
707
- logger.info(f"Saved new embeddings to cache (key: {cache_key[:8]}...)")
708
-
709
- # Clean up old cache files
710
- _invalidate_old_cache_files()
711
 
712
- # Build FAISS index for fast similarity search
713
- self._build_faiss_index()
714
-
715
  embedding_time = time.time() - embedding_start
716
- cache_status = "from cache" if cache_hit else "generated"
717
- logger.info(f"Embeddings {cache_status} and FAISS index built in {embedding_time:.2f} seconds")
718
 
719
  total_time = time.time() - start_time
720
  logger.info(f"Total data room processing completed in {total_time:.2f} seconds")
721
 
722
  return {
723
- 'documents_count': len(self.documents),
724
- 'chunks_count': len(self.chunks),
725
- 'has_embeddings': self.embeddings is not None,
726
- 'performance': {
727
- 'total_time': total_time,
728
- 'scan_time': scan_time,
729
- 'chunk_time': chunk_time,
730
- 'embedding_time': embedding_time,
731
- 'documents_per_second': len(self.documents) / scan_time if scan_time > 0 else 0,
732
- 'cache_hit': cache_hit,
733
- 'cache_key': cache_key[:8] + "..." if 'cache_key' in locals() else None
734
- }
735
  }
736
 
737
- def _build_faiss_index(self) -> None:
738
- """
739
- Build FAISS IndexFlatIP for fast similarity search
740
- """
741
- if self.embeddings is None:
742
- logger.warning("No embeddings available to build FAISS index")
743
- return
744
-
745
- try:
746
- # Convert to float32 and normalize for cosine similarity via inner product
747
- embeddings_f32 = self.embeddings.astype('float32')
748
- faiss.normalize_L2(embeddings_f32)
749
-
750
- # Create FAISS index
751
- dimension = embeddings_f32.shape[1]
752
- self.faiss_index = faiss.IndexFlatIP(dimension)
753
- self.faiss_index.add(embeddings_f32)
754
-
755
- logger.info(f"Built FAISS index with {self.faiss_index.ntotal} vectors, dimension {dimension}")
756
-
757
- except Exception as e:
758
- logger.error(f"Failed to build FAISS index: {e}")
759
- self.faiss_index = None
760
-
761
- def faiss_search(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
762
  """
763
- Fast similarity search using FAISS IndexFlatIP
764
 
765
  Args:
766
  query: Search query
767
- top_k: Number of top results
768
  threshold: Minimum similarity threshold
769
 
770
  Returns:
771
- List of search results with citations
772
  """
773
- if not self.model or self.faiss_index is None:
 
774
  return []
775
 
776
- return search_documents_with_faiss(
777
- query, self.chunks, self.faiss_index, self.model, top_k, threshold
778
- )
779
-
780
- def search(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
781
- """
782
- Search documents using semantic similarity - uses FAISS if available, falls back to numpy
783
 
784
- Args:
785
- query: Search query
786
- top_k: Number of top results
787
- threshold: Minimum similarity threshold
788
 
789
- Returns:
790
- List of search results
791
- """
792
- if not self.model:
793
- return []
794
-
795
- # Use FAISS search if index is available (10x faster)
796
- if self.faiss_index is not None:
797
- return self.faiss_search(query, top_k, threshold)
798
- elif self.embeddings is not None:
799
- # Fallback to numpy-based search
800
- return search_documents_with_citations(
801
- query, self.chunks, self.embeddings, self.model, top_k, threshold
802
- )
803
- else:
804
  return []
805
 
806
- def get_statistics(self) -> Dict[str, any]:
807
- """Get processing statistics including performance metrics"""
808
  stats = {
809
  'total_documents': len(self.documents),
810
- 'total_chunks': len(self.chunks),
811
- 'has_embeddings': self.embeddings is not None,
812
- 'has_faiss_index': self.faiss_index is not None,
813
- 'faiss_index_size': self.faiss_index.ntotal if self.faiss_index is not None else 0,
814
- 'embedding_dimension': self.embeddings.shape[1] if self.embeddings is not None else 0
815
  }
816
 
817
  # Add performance metrics if available
@@ -820,33 +422,3 @@ class DocumentProcessor:
820
 
821
  return stats
822
 
823
- def load_data_room_with_progress(self, data_room_path: str, max_workers: Optional[int] = None,
824
- progress_bar=None) -> Dict[str, any]:
825
- """
826
- Load data room with Streamlit progress bar support
827
-
828
- Args:
829
- data_room_path: Path to the data room directory
830
- max_workers: Maximum number of worker threads
831
- progress_bar: Streamlit progress bar object
832
-
833
- Returns:
834
- Dictionary with processing results
835
- """
836
- # Count total files first for accurate progress tracking
837
- path = Path(data_room_path)
838
- if not path.exists():
839
- return {'documents_count': 0, 'chunks_count': 0, 'has_embeddings': False}
840
-
841
- total_files = sum(1 for file_path in path.rglob('*')
842
- if file_path.is_file() and not file_path.name.startswith('.')
843
- and file_path.suffix.lower() in ['.pdf', '.docx', '.doc', '.txt', '.md'])
844
-
845
- # Create progress tracker
846
- progress_callback = create_progress_tracker(total_files, progress_bar)
847
-
848
- # Load with progress tracking
849
- result = self.load_data_room(data_room_path, max_workers, progress_callback)
850
- self.performance_stats = result.get('performance', {})
851
-
852
- return result
 
1
  #!/usr/bin/env python3
2
  """
3
+ Streamlined Document Processing Module
4
 
5
+ This module provides a simplified document processing pipeline with:
6
+ - Direct LangChain loader integration with glob patterns
7
+ - Built-in FAISS vector storage without external file tracking
8
+ - Semantic text chunking using RecursiveCharacterTextSplitter
9
+ - Consolidated document metadata handling
10
  """
11
 
12
  import os
13
+ import warnings
14
  # Fix tokenizers parallelism warning
15
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
16
 
17
  import logging
18
 
19
+ # Suppress verbose LangChain warnings and output
20
+ warnings.filterwarnings("ignore", category=UserWarning, module="langchain")
21
+ warnings.filterwarnings("ignore", category=UserWarning, module="langchain_core")
22
+ warnings.filterwarnings("ignore", category=UserWarning, module="langchain_community")
23
+ warnings.filterwarnings("ignore", message=".*Relevance scores must be between.*")
24
+ warnings.filterwarnings("ignore", message=".*No relevant docs were retrieved.*")
25
+
26
+ # Set LangChain logging to WARNING level to reduce verbosity
27
+ logging.getLogger("langchain").setLevel(logging.WARNING)
28
+ logging.getLogger("langchain_core").setLevel(logging.WARNING)
29
+ logging.getLogger("langchain_community").setLevel(logging.WARNING)
30
+ logging.getLogger("langchain_huggingface").setLevel(logging.WARNING)
31
+ import re
32
+
33
+ from pathlib import Path
34
+ from typing import Dict, List, Optional, Any, Callable
35
+ from datetime import datetime
36
+
37
+ # LangChain imports
38
+ from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, Docx2txtLoader, TextLoader
39
+ from langchain_community.vectorstores import FAISS
40
+ from langchain_huggingface import HuggingFaceEmbeddings
41
+ from langchain_core.documents import Document
42
  from langchain_text_splitters import RecursiveCharacterTextSplitter
43
 
44
  # Import configuration
45
  from .config import get_config
46
 
47
+ # Error handling helpers (merged from error_handlers.py) are defined below
48
+
49
+
50
  logger = logging.getLogger(__name__)
51
 
52
+
53
+ # =============================================================================
54
+ # ERROR HANDLING UTILITIES - Merged from error_handlers.py
55
+ # =============================================================================
56
+
57
+ def safe_execute(func: Callable, default: Any = None, context: str = "", log_errors: bool = True) -> Any:
58
+ """
59
+ Execute a function with basic error handling and logging
60
+
61
+ Args:
62
+ func: Function to execute
63
+ default: Value to return on error
64
+ context: Brief description for logs
65
+ log_errors: Whether to log errors
66
+
67
+ Returns:
68
+ Function result or default value on error
69
+ """
70
+ try:
71
+ return func()
72
+ except Exception as e:
73
+ if log_errors:
74
+ logger.error(f"{context or func.__name__}: {e}")
75
+ return default
76
 
77
 
78
  def escape_markdown_math(text: str) -> str:
 
89
  return text
90
 
91
 
92
+ class DocumentProcessor:
93
  """
94
+ Streamlined document processing class with integrated FAISS vector storage
95
 
96
+ This class consolidates all document processing functionality including:
97
+ - Document loading using LangChain's DirectoryLoader with glob patterns
98
+ - Semantic text chunking with RecursiveCharacterTextSplitter
99
+ - FAISS vector storage for similarity search
100
+ - Document metadata handling
101
  """
 
102
 
103
+ def __init__(self, model_name: Optional[str] = None, store_name: Optional[str] = None):
104
+ """
105
+ Initialize the document processor
 
106
 
107
+ Args:
108
+ model_name: Name of the sentence transformer model for embeddings (optional)
109
+ store_name: Name for the FAISS store (optional, uses config default)
110
+ """
 
111
  config = get_config()
112
+ self.model_name = model_name or config.model.sentence_transformer_model
113
+ self.store_name = store_name or config.processing.faiss_store_name
114
+
115
+ # Initialize components
116
+ self.documents: List[Document] = []
117
+ self.vector_store: Optional[FAISS] = None
118
+ self.embeddings: Optional[HuggingFaceEmbeddings] = None
119
+ self.text_splitter: Optional[RecursiveCharacterTextSplitter] = None
120
+ self.performance_stats = {}
121
+
122
+ # Convenience properties for backward compatibility
123
+ self.chunks = [] # Will be populated after processing
124
+
125
+ # Initialize text splitter with semantic boundaries
126
+ self._init_text_splitter()
127
+
128
+ # Initialize embeddings if model name provided
129
+ if self.model_name:
130
+ self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
131
+ logger.info(f"Initialized embeddings with model: {self.model_name}")
132
+ else:
133
+ logger.warning("No model name provided - embeddings not initialized")
134
 
135
+ # Try to load existing FAISS store
136
+ self._load_existing_store()
 
138
+ def _init_text_splitter(self):
139
+ """Initialize the text splitter with optimal settings for semantic chunking"""
140
+ config = get_config()
141
+ self.text_splitter = RecursiveCharacterTextSplitter(
142
+ chunk_size=config.processing.chunk_size,
143
+ chunk_overlap=config.processing.chunk_overlap,
144
+ separators=["\\n\\n", "\\n", ".", "!", "?", ",", " "],
145
+ length_function=len,
146
+ is_separator_regex=False,
147
+ )
148
+ logger.info(f"Initialized text splitter: {config.processing.chunk_size} chars, {config.processing.chunk_overlap} overlap")
149
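The splitter configured above falls back through the separator list, so paragraph breaks are preferred over sentence and word breaks. A minimal standalone sketch with the default sizes (1000-character chunks, 200-character overlap; the sample text is invented):

from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ".", "!", "?", ",", " "],
)
sample = ("Overview of the target company.\n\n"
          "Key contracts and customer concentration details.\n\n") * 30
chunks = splitter.split_text(sample)
print(len(chunks), max(len(c) for c in chunks))  # a few chunks, each at most ~1000 characters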
 
150
+ def _load_existing_store(self):
151
+ """Load existing FAISS store if available"""
152
+ if not self.embeddings:
153
+ return
154
 
155
+ config = get_config()
156
+ faiss_dir = Path(config.paths.data_dir) / "enhanced_faiss"
157
+ faiss_index_path = faiss_dir / f"{self.store_name}.faiss"
158
+ faiss_pkl_path = faiss_dir / f"{self.store_name}.pkl"
159
 
 
 
 
 
 
 
 
160
  try:
161
+ if faiss_index_path.exists() and faiss_pkl_path.exists():
162
+ self.vector_store = FAISS.load_local(
163
+ str(faiss_dir),
164
+ self.embeddings,
165
+ index_name=self.store_name,
166
+ allow_dangerous_deserialization=True # Safe: we created these files ourselves
167
+ )
168
+ logger.info(f"Loaded existing FAISS store: {self.store_name} with {self.vector_store.index.ntotal} vectors")
169
+ else:
170
+ logger.info(f"No existing FAISS store found for: {self.store_name}")
171
  except Exception as e:
172
+ logger.error(f"Failed to load FAISS store: {e}")
173
+ self.vector_store = None
174
 
175
+ def _save_store(self):
176
+ """Save FAISS store to disk"""
177
+ if not self.vector_store:
178
+ return
179
 
180
+ try:
181
+ config = get_config()
182
+ faiss_dir = Path(config.paths.data_dir) / "enhanced_faiss"
183
+ faiss_dir.mkdir(parents=True, exist_ok=True)
 
185
+ self.vector_store.save_local(
186
+ str(faiss_dir),
187
+ index_name=self.store_name
188
+ )
189
+ logger.info(f"Saved FAISS store: {self.store_name} with {self.vector_store.index.ntotal} vectors")
190
+ except Exception as e:
191
+ logger.error(f"Failed to save FAISS store: {e}")
 
193
+ def load_data_room(self, data_room_path: str, progress_bar=None) -> Dict[str, Any]:
194
  """
195
+ Load and process an entire data room using DirectoryLoader with glob patterns
196
 
197
  Args:
198
  data_room_path: Path to the data room directory
199
+ progress_bar: Optional Streamlit progress bar object
 
200
 
201
  Returns:
202
  Dictionary with processing results including performance metrics
203
  """
204
  import time
205
+ start_time = time.time()
206
+
207
  config = get_config()
208
+ data_room_path = Path(data_room_path)
 
209
 
210
+ if not data_room_path.exists():
211
+ logger.error(f"Data room path does not exist: {data_room_path}")
212
+ return {'documents_count': 0, 'chunks_count': 0, 'has_embeddings': False}
213
 
214
+ logger.info(f"Starting streamlined data room processing: {data_room_path}")
215
 
216
+ # Clear existing documents
217
+ self.documents = []
218
+ documents_loaded = 0
219
+
220
+ # Load documents by file type using DirectoryLoader with glob patterns
221
+ supported_extensions = config.processing.supported_file_extensions
222
+
223
+ for ext in supported_extensions:
224
+ try:
225
+ # Create glob pattern for this extension
226
+ glob_pattern = f"**/*{ext}"
227
+
228
+ # Choose appropriate loader based on extension
229
+ if ext == '.pdf':
230
+ loader_cls = PyPDFLoader
231
+ elif ext in ['.docx', '.doc']:
232
+ loader_cls = Docx2txtLoader
233
+ elif ext in ['.txt', '.md']:
234
+ loader_cls = TextLoader
235
+ else:
236
+ continue
237
+
238
+ # Use DirectoryLoader with glob pattern
239
+ loader = DirectoryLoader(
240
+ str(data_room_path),
241
+ glob=glob_pattern,
242
+ loader_cls=loader_cls,
243
+ loader_kwargs={'encoding': 'utf-8'} if ext in ['.txt', '.md'] else {},
244
+ recursive=True,
245
+ show_progress=False, # Disable verbose progress output
246
+ use_multithreading=True
247
+ )
248
+
249
+ # Load documents for this extension
250
+ docs = safe_execute(
251
+ lambda: loader.load(),
252
+ default=[],
253
+ context=f"Loading {ext} files"
254
+ )
255
+
256
+ if docs:
257
+ # Add relative path information to metadata
258
+ for doc in docs:
259
+ if 'source' in doc.metadata:
260
+ source_path = Path(doc.metadata['source'])
261
+ if source_path.exists():
262
+ try:
263
+ rel_path = source_path.relative_to(data_room_path)
264
+ doc.metadata['path'] = str(rel_path)
265
+ doc.metadata['name'] = source_path.name
266
+ except ValueError:
267
+ # If relative path fails, use original source
268
+ doc.metadata['path'] = doc.metadata['source']
269
+ doc.metadata['name'] = source_path.name
270
+
271
+ self.documents.extend(docs)
272
+ documents_loaded += len(docs)
273
+ logger.info(f"Loaded {len(docs)} {ext} documents")
274
+
275
+ except Exception as e:
276
+ logger.error(f"Error loading {ext} files: {e}")
277
 
278
  scan_time = time.time() - start_time
279
+ logger.info(f"Document loading completed in {scan_time:.2f} seconds")
280
 
281
+ # Split documents into chunks using the text splitter
282
  chunk_start = time.time()
283
+ if self.documents and self.text_splitter:
284
+ self.documents = self.text_splitter.split_documents(self.documents)
285
+
286
+ # Add chunk metadata and populate chunks for backward compatibility
287
+ self.chunks = []
288
+ for i, doc in enumerate(self.documents):
289
+ doc.metadata['chunk_id'] = f"chunk_{i}"
290
+ doc.metadata['processed_at'] = datetime.now().isoformat()
291
+
292
+ # Add citation information if available
293
+ if 'page' in doc.metadata:
294
+ doc.metadata['citation'] = f"page {doc.metadata['page']}"
295
+ else:
296
+ doc.metadata['citation'] = doc.metadata.get('name', 'document')
297
+
298
+ # Create chunk dict for backward compatibility
299
+ chunk_dict = {
300
+ 'text': doc.page_content,
301
+ 'source': doc.metadata.get('name', ''),
302
+ 'path': doc.metadata.get('path', ''),
303
+ 'full_path': doc.metadata.get('source', ''),
304
+ 'metadata': doc.metadata
305
+ }
306
+ self.chunks.append(chunk_dict)
307
+
308
  chunk_time = time.time() - chunk_start
309
+ logger.info(f"Text splitting completed in {chunk_time:.2f} seconds")
310
 
311
+ # Create or update FAISS vector store
312
  embedding_time = 0
313
+ if self.embeddings and self.documents:
 
 
314
  embedding_start = time.time()
315
 
316
+ if self.vector_store is None:
317
+ # Create new FAISS store
318
+ self.vector_store = FAISS.from_documents(self.documents, self.embeddings)
319
+ logger.info(f"Created new FAISS store with {len(self.documents)} documents")
320
+ else:
321
+ # Add documents to existing store
322
+ self.vector_store.add_documents(self.documents)
323
+ logger.info(f"Added {len(self.documents)} documents to existing FAISS store")
 
 
 
 
 
 
 
 
324
 
325
+ # Save the updated store
326
+ self._save_store()
 
 
 
 
 
 
 
 
 
327
 
 
 
 
328
  embedding_time = time.time() - embedding_start
329
+ logger.info(f"FAISS processing completed in {embedding_time:.2f} seconds")
 
330
 
331
  total_time = time.time() - start_time
332
  logger.info(f"Total data room processing completed in {total_time:.2f} seconds")
333
 
334
+ # Store performance stats
335
+ self.performance_stats = {
336
+ 'total_time': total_time,
337
+ 'scan_time': scan_time,
338
+ 'chunk_time': chunk_time,
339
+ 'embedding_time': embedding_time,
340
+ 'documents_per_second': documents_loaded / scan_time if scan_time > 0 else 0
341
+ }
342
+
343
  return {
344
+ 'documents_count': documents_loaded,
345
+ 'chunks_count': len(self.documents),
346
+ 'total_chunks_in_store': self.vector_store.index.ntotal if self.vector_store else 0,
347
+ 'has_embeddings': self.vector_store is not None,
348
+ 'performance': self.performance_stats
 
 
 
 
 
 
 
349
  }
350
 
351
+ def search(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  """
353
+ Search documents using FAISS similarity search
354
 
355
  Args:
356
  query: Search query
357
+ top_k: Number of top results to return
358
  threshold: Minimum similarity threshold
359
 
360
  Returns:
361
+ List of search results with scores and metadata
362
  """
363
+ if not self.vector_store:
364
+ logger.warning("FAISS vector store not available for search")
365
  return []
366
 
367
+ config = get_config()
368
+ if threshold is None:
369
+ threshold = config.processing.similarity_threshold
 
 
 
 
370
 
371
+ try:
372
+ # Perform similarity search with scores
373
+ docs_and_scores = self.vector_store.similarity_search_with_score(query, k=top_k*2)
 
374
 
375
+ results = []
376
+ seen_texts = set()
377
+
378
+ for doc, score in docs_and_scores:
379
+ # Convert FAISS distance to similarity score (higher is better)
380
+ similarity_score = 1.0 / (1.0 + score) if score >= 0 else 1.0
381
+
382
+ if similarity_score < threshold:
383
+ continue
384
+
385
+ # Avoid duplicates based on text content
386
+ text_preview = doc.page_content[:100]
387
+ if text_preview not in seen_texts:
388
+ seen_texts.add(text_preview)
389
+
390
+ results.append({
391
+ 'text': doc.page_content,
392
+ 'source': doc.metadata.get('name', ''),
393
+ 'path': doc.metadata.get('path', ''),
394
+ 'full_path': doc.metadata.get('source', ''),
395
+ 'citation': doc.metadata.get('citation', 'document'),
396
+ 'score': float(similarity_score),
397
+ 'metadata': doc.metadata
398
+ })
399
+
400
+ if len(results) >= top_k:
401
+ break
402
+
403
+ return results
404
+
405
+ except Exception as e:
406
+ logger.error(f"Failed to search FAISS store: {e}")
407
  return []
408
 
409
+ def get_statistics(self) -> Dict[str, Any]:
410
+ """Get processing statistics"""
411
  stats = {
412
  'total_documents': len(self.documents),
413
+ 'total_vectors_in_store': self.vector_store.index.ntotal if self.vector_store else 0,
414
+ 'has_embeddings': self.vector_store is not None,
415
+ 'store_name': self.store_name,
416
+ 'model_name': self.model_name
 
417
  }
418
 
419
  # Add performance metrics if available
 
422
 
423
  return stats
424
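Note: the persistence round-trip these helpers rely on can be exercised directly with LangChain's FAISS wrapper. A minimal sketch, assuming HuggingFaceEmbeddings and an illustrative store_dir / store_name (not the app's actual config values):

from pathlib import Path
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # assumed model
store_dir = Path("data/enhanced_faiss")   # hypothetical folder
store_name = "example_store"              # hypothetical index name

# save_local writes <store_name>.faiss and <store_name>.pkl into store_dir
store = FAISS.from_documents([Document(page_content="Articles of Incorporation")], embeddings)
store_dir.mkdir(parents=True, exist_ok=True)
store.save_local(str(store_dir), index_name=store_name)

# On the next run, load_local restores the index; the pickle load must be opted into
restored = FAISS.load_local(
    str(store_dir),
    embeddings,
    index_name=store_name,
    allow_dangerous_deserialization=True,  # acceptable only for files we wrote ourselves
)
print(restored.index.ntotal)  # 1 vector persisted and reloaded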
src/services.py CHANGED
@@ -2,408 +2,346 @@
2
  """
3
  Business Logic Services Module
4
 
5
- This module contains the core business logic services for the DD-Checklist application.
6
- Services handle specific domain operations and coordinate between different components.
7
  """
8
 
9
  import re
10
- import json
 
11
  from pathlib import Path
12
- from typing import Dict, List, Optional, Any, Tuple
13
- import numpy as np
14
- from sentence_transformers import SentenceTransformer
15
- import faiss
16
 
17
  from .config import get_config
18
- from src.document_processing import DocumentProcessor, escape_markdown_math
19
 
 
 
 
20
 
21
- class ChecklistParser:
22
- """Service for parsing due diligence checklists"""
23
 
24
- @staticmethod
25
- def parse_checklist(checklist_text: str) -> Dict:
26
- """
27
- Parse markdown checklist into categories and items
28
-
29
- Args:
30
- checklist_text: Raw checklist text in markdown format
31
-
32
- Returns:
33
- Dictionary with parsed categories and items
34
- """
35
- categories = {}
36
- current_category = None
37
-
38
- for line in checklist_text.split('\n'):
39
- # Category header (e.g., "A. Corporate Organization" or "## A. Corporate Organization")
40
- # Try both formats
41
- match = None
42
- if line.startswith('## '):
43
- match = re.match(r'## ([A-Z])\. (.+)', line)
44
- elif line.strip() and not line.startswith('\t') and not line.startswith(' '):
45
- # Try plain format (no ##)
46
- match = re.match(r'^([A-Z])\. (.+)', line.strip())
47
-
48
  if match:
49
  letter, name = match.groups()
50
- current_category = letter
51
- categories[letter] = {
52
- 'name': name.strip(),
53
- 'items': []
54
- }
55
- # Numbered items (may be indented with tabs or spaces)
56
- elif current_category:
57
- # Check for numbered items with various indentation
58
- line_stripped = line.strip()
59
- if re.match(r'^\d+\.', line_stripped):
60
- item_text = re.sub(r'^\d+\.\s*', '', line_stripped)
61
- if item_text:
62
- # Clean up [bracketed] content but keep the text
63
- clean_text = re.sub(r'\[.*?\]', '', item_text).strip()
64
- if not clean_text:
65
- clean_text = item_text
66
- categories[current_category]['items'].append({
67
- 'text': clean_text,
68
- 'original': item_text
69
- })
70
 
71
- return categories
72
 
73
 
74
- class QuestionParser:
75
- """Service for parsing due diligence questions"""
 
 
 
 
 
76
 
77
- @staticmethod
78
- def parse_questions(questions_text: str) -> List[Dict]:
79
- """
80
- Parse markdown questions into a list of questions with categories
81
-
82
- Args:
83
- questions_text: Raw questions text in markdown format
84
-
85
- Returns:
86
- List of parsed questions with categories
87
- """
88
- questions = []
89
- current_category = None
90
-
91
- for line in questions_text.split('\n'):
92
- # Category header (e.g., "### A. Organizational and Corporate Documents")
93
- if line.startswith('### '):
94
- match = re.match(r'### ([A-Z])\. (.+)', line)
95
- if match:
96
- letter, name = match.groups()
97
- current_category = f"{letter}. {name.strip()}"
98
- # Question lines (numbered items)
99
- elif current_category and line.strip():
100
- match = re.match(r'^\d+\.\s+(.+)', line.strip())
101
- if match:
102
- question_text = match.group(1).strip()
103
- if question_text:
104
- questions.append({
105
- 'category': current_category,
106
- 'question': question_text,
107
- 'id': f"q_{len(questions)}"
108
- })
109
-
110
- return questions
 
111
 
112
 
113
- class ChecklistMatcher:
114
- """Service for matching checklists to documents"""
 
 
115
 
116
- def __init__(self, model: SentenceTransformer):
117
- """
118
- Initialize the matcher
119
-
120
- Args:
121
- model: SentenceTransformer model for embeddings
122
- """
123
- self.model = model
124
 
125
- def match_checklist_to_documents(
126
- self,
127
- checklist: Dict,
128
- chunks: List[Dict],
129
- embeddings: np.ndarray,
130
- threshold: Optional[float] = None
131
- ) -> Dict:
132
- """
133
- Match each checklist item to relevant documents using FAISS for 10x faster similarity search
134
-
135
- Args:
136
- checklist: Parsed checklist
137
- chunks: Document chunks
138
- embeddings: Precomputed embeddings
139
- threshold: Similarity threshold (uses config default if None)
140
 
141
- Returns:
142
- Matching results
143
- """
144
- config = get_config()
145
- if threshold is None:
146
- threshold = config.processing.similarity_threshold
147
-
148
- # Build FAISS index for fast similarity search
149
- embeddings_f32 = embeddings.astype('float32')
150
- faiss.normalize_L2(embeddings_f32) # Normalize for cosine similarity
151
- dimension = embeddings_f32.shape[1]
152
- faiss_index = faiss.IndexFlatIP(dimension)
153
- faiss_index.add(embeddings_f32)
154
-
155
- results = {}
156
-
157
- for cat_letter, category in checklist.items():
158
- cat_results = {
159
- 'name': category['name'],
160
- 'items': [],
161
- 'total_items': len(category['items']),
162
- 'matched_items': 0
163
- }
164
 
165
- for item_idx, item in enumerate(category['items']):
166
- # Encode checklist item with category context
167
- item_text = f"{category['name']} {item['text']}"
168
- item_embedding = self.model.encode(item_text).astype('float32').reshape(1, -1)
169
- faiss.normalize_L2(item_embedding)
170
-
171
- # Use FAISS for fast similarity search
172
- scores, indices = faiss_index.search(item_embedding, len(chunks))
173
-
174
- # Get unique documents that match
175
- doc_matches = {}
176
- for score, idx in zip(scores[0], indices[0]):
177
- if idx == -1 or score < threshold:
178
- continue
179
-
180
- doc_path = chunks[idx]['path']
181
- if doc_path not in doc_matches or score > doc_matches[doc_path]['score']:
182
- doc_matches[doc_path] = {
183
- 'name': chunks[idx]['source'],
184
- 'path': doc_path,
185
- 'full_path': chunks[idx].get('full_path', doc_path),
186
- 'score': float(score),
187
- 'metadata': chunks[idx]['metadata']
188
- }
189
-
190
- # Sort by score
191
- sorted_matches = sorted(doc_matches.values(), key=lambda x: x['score'], reverse=True)
192
-
193
- item_result = {
194
- 'text': item['text'],
195
- 'original': item['original'],
196
- 'matches': sorted_matches
197
- }
198
-
199
- if sorted_matches:
200
- cat_results['matched_items'] += 1
201
-
202
- cat_results['items'].append(item_result)
203
 
204
- results[cat_letter] = cat_results
205
-
206
- return results
 
 
 
 
 
207
 
208
- def match_checklist_with_summaries(
209
- self,
210
- checklist: Dict,
211
- doc_embeddings_data: Dict,
212
- threshold: Optional[float] = None
213
- ) -> Dict:
214
- """
215
- Match checklist items against document summaries using FAISS for 10x faster similarity search
216
-
217
- Args:
218
- checklist: Parsed checklist
219
- doc_embeddings_data: Document embeddings with summaries
220
- threshold: Similarity threshold
221
 
222
- Returns:
223
- Matching results using AI summaries
224
- """
225
- doc_embeddings = np.array(doc_embeddings_data['embeddings'], dtype='float32')
226
- doc_info = doc_embeddings_data['documents']
227
-
228
- # Build FAISS index for fast similarity search
229
- faiss.normalize_L2(doc_embeddings) # Normalize for cosine similarity
230
- dimension = doc_embeddings.shape[1]
231
- faiss_index = faiss.IndexFlatIP(dimension)
232
- faiss_index.add(doc_embeddings)
233
-
234
- results = {}
235
-
236
- for cat_letter, category in checklist.items():
237
- cat_name = category.get('name', '')
238
- cat_results = {
239
- 'name': cat_name,
240
- 'letter': cat_letter,
241
- 'total_items': len(category.get('items', [])),
242
- 'matched_items': 0,
243
- 'items': []
244
- }
245
 
246
- for item in category.get('items', []):
247
- item_text = item.get('text', '')
248
-
249
- # Create embedding for checklist item with category context
250
- checklist_embedding_text = f"{cat_name}: {item_text}"
251
- item_embedding = self.model.encode(checklist_embedding_text).astype('float32').reshape(1, -1)
252
- faiss.normalize_L2(item_embedding)
253
-
254
- # Use FAISS for fast similarity search
255
- scores, indices = faiss_index.search(item_embedding, len(doc_info))
256
-
257
- # Find matching documents above threshold
258
- matches = []
259
- for score, idx in zip(scores[0], indices[0]):
260
- if idx == -1: # No more results
261
- break
262
- if score > threshold:
263
- matches.append({
264
- 'name': doc_info[idx]['name'],
265
- 'path': doc_info[idx]['path'],
266
- 'summary': doc_info[idx]['summary'],
267
- 'score': float(score),
268
- 'metadata': doc_info[idx].get('original_doc', {}).get('metadata', {})
269
- })
270
- else:
271
- break # Scores are sorted, so we can stop here
272
-
273
- # Keep top 5 matches (already sorted by FAISS)
274
- matches = matches[:5]
275
-
276
- item_result = {
277
- 'text': item_text,
278
- 'original': item.get('original', item_text),
279
- 'matches': matches
280
- }
281
-
282
- if matches:
283
- cat_results['matched_items'] += 1
284
-
285
- cat_results['items'].append(item_result)
286
 
287
- results[cat_letter] = cat_results
 
 
 
 
288
 
289
- return results
 
 
290
 
291
 
292
- class QuestionAnswerer:
293
- """Service for answering questions using document chunks"""
294
-
295
- def __init__(self, model: SentenceTransformer):
296
- """
297
- Initialize the question answerer
298
-
299
- Args:
300
- model: SentenceTransformer model for embeddings
301
- """
302
- self.model = model
303
-
304
- def answer_questions_with_chunks(
305
- self,
306
- questions: List[Dict],
307
- chunks: List[Dict],
308
- embeddings: np.ndarray,
309
- threshold: Optional[float] = None
310
- ) -> Dict:
311
- """
312
- Answer questions using document chunks with FAISS for 10x faster similarity search
313
-
314
- Args:
315
- questions: List of parsed questions
316
- chunks: Document chunks
317
- embeddings: Precomputed embeddings
318
- threshold: Similarity threshold (uses config default if None)
319
-
320
- Returns:
321
- Dictionary of answers with citations
322
- """
323
- config = get_config()
324
- if threshold is None:
325
- threshold = config.processing.relevancy_threshold
326
-
327
- # Build FAISS index for fast similarity search
328
- embeddings_f32 = embeddings.astype('float32')
329
- faiss.normalize_L2(embeddings_f32) # Normalize for cosine similarity
330
- dimension = embeddings_f32.shape[1]
331
- faiss_index = faiss.IndexFlatIP(dimension)
332
- faiss_index.add(embeddings_f32)
333
-
334
- answers = {}
335
-
336
- for question in questions:
337
- # Encode question
338
- question_embedding = self.model.encode(question['question']).astype('float32').reshape(1, -1)
339
- faiss.normalize_L2(question_embedding)
340
-
341
- # Use FAISS for fast similarity search
342
- scores, indices = faiss_index.search(question_embedding, min(10, len(chunks))) # Get top 10 candidates
343
-
344
- # Get top matching chunks above threshold
345
- relevant_chunks = []
346
 
347
- for score, idx in zip(scores[0], indices[0]):
348
- if idx == -1 or score < threshold:
349
- continue
350
-
351
- chunk_info = chunks[idx]
352
- relevant_chunks.append({
353
- 'text': chunk_info['text'][:500], # Limit text length
354
- 'source': chunk_info['source'],
355
- 'path': chunk_info['path'],
356
- 'score': float(score),
357
- 'metadata': chunk_info.get('metadata', {})
358
- })
359
-
360
- # Limit to top 5 chunks
361
- if len(relevant_chunks) >= 5:
362
- break
363
 
364
  answers[question['id']] = {
365
  'question': question['question'],
366
  'category': question['category'],
367
- 'chunks': relevant_chunks,
368
- 'has_answer': len(relevant_chunks) > 0
 
369
  }
370
-
371
- return answers
 
 
 
 
 
 
 
 
 
372
 
 
 
 
373
 
374
- class ReportGenerator:
375
- """Service for generating reports and summaries"""
 
 
376
 
377
- def __init__(self, agent=None):
378
- """
379
- Initialize the report generator
380
-
381
- Args:
382
- agent: Optional AI agent for enhanced reporting
383
- """
384
- self.agent = agent
385
 
386
- def generate_company_summary(self, documents: Dict[str, Dict], data_room_name: str = "Unknown") -> str:
387
- """
388
- Generate company overview summary
389
-
390
- Args:
391
- documents: Dictionary of processed documents
392
- data_room_name: Name of the data room/company
393
-
394
- Returns:
395
- Company summary text
396
- """
397
- if not self.agent or not hasattr(self.agent, 'llm'):
398
- return self._generate_basic_summary(documents, data_room_name)
 
 
 
 
 
 
 
 
 
399
 
400
- # Gather key information from documents
401
  doc_summaries = []
402
- for path, doc_info in list(documents.items())[:10]: # Use top 10 docs
403
  if 'summary' in doc_info:
404
  doc_summaries.append(f"{doc_info['name']}: {doc_info['summary']}")
405
  else:
406
- # Use first 500 chars of content if no summary
407
  content_preview = doc_info.get('content', '')[:500]
408
  if content_preview:
409
  doc_summaries.append(f"{doc_info['name']}: {content_preview}")
@@ -411,34 +349,69 @@ class ReportGenerator:
411
  if not doc_summaries:
412
  return "No documents available for summary generation."
413
 
414
- # Create prompt for company summary
415
- from langchain_core.messages import HumanMessage
416
- prompt = f"""Based on the following document summaries from a due diligence data room, provide a comprehensive company overview.
417
-
418
- Company: {data_room_name}
419
 
420
- Document Summaries:
421
- {chr(10).join(doc_summaries[:10])}
422
-
423
- Please provide:
424
- 1. Company name and industry
425
- 2. Business model and key products/services
426
- 3. Market position and competitive advantages
427
- 4. Key financials (if available)
428
- 5. Organizational structure
429
- 6. Notable risks or concerns
430
- 7. Overall assessment for M&A consideration
 
 
 
 
 
 
 
 
 
431
 
432
- Format the response in clear sections with bullet points where appropriate."""
433
 
434
- try:
435
- response = self.agent.llm.invoke([HumanMessage(content=prompt)])
436
- return escape_markdown_math(response.content.strip())
437
- except Exception as e:
438
- return f"Failed to generate AI summary: {str(e)}"
439
 
440
- def _generate_basic_summary(self, documents: Dict[str, Dict], data_room_name: str) -> str:
441
- """Generate basic summary without AI"""
442
  doc_count = len(documents)
443
  file_types = {}
444
 
@@ -446,7 +419,7 @@ class ReportGenerator:
446
  doc_type = doc_info.get('metadata', {}).get('type', 'unknown')
447
  file_types[doc_type] = file_types.get(doc_type, 0) + 1
448
 
449
- summary = f"""# Company Overview: {data_room_name}
450
 
451
  ## Document Analysis
452
  - **Total Documents**: {doc_count}
@@ -457,70 +430,11 @@ Based on the document structure, this data room appears to cover standard due di
457
 
458
  *Note: Enable AI features for detailed company analysis and insights.*
459
  """
460
- return summary
461
 
462
- def generate_strategic_analysis(
463
- self,
464
- strategy_text: str,
465
- checklist_results: Dict,
466
- documents: Dict[str, Dict]
467
- ) -> str:
468
- """
469
- Generate strategic analysis based on strategy and checklist results
470
-
471
- Args:
472
- strategy_text: Strategic document content
473
- checklist_results: Results from checklist matching
474
- documents: Document dictionary
475
-
476
- Returns:
477
- Strategic analysis text
478
- """
479
- if not self.agent or not hasattr(self.agent, 'llm'):
480
- return self._generate_basic_strategic_analysis(checklist_results)
481
-
482
- # Build context from checklist results
483
- checklist_context = []
484
- for cat_id, cat_data in checklist_results.items():
485
- cat_name = cat_data['name']
486
- matched_items = sum(1 for item in cat_data['items'] if item['matches'])
487
- total_items = len(cat_data['items'])
488
- coverage = (matched_items / total_items * 100) if total_items > 0 else 0
489
-
490
- checklist_context.append(f"- {cat_name}: {coverage:.0f}% coverage ({matched_items}/{total_items} items)")
491
 
492
- # Add details about specific gaps
493
- missing_items = [item['text'] for item in cat_data['items'] if not item['matches']]
494
- if missing_items and len(missing_items) <= 3:
495
- checklist_context.append(f" Missing: {', '.join(missing_items[:3])}")
496
-
497
- # Build prompt
498
- prompt = f"""Based on the due diligence checklist results and the selected strategy, provide a strategic analysis.
499
-
500
- Strategy Document:
501
- {strategy_text}
502
-
503
- Checklist Coverage:
504
- {chr(10).join(checklist_context)}
505
-
506
- Please provide:
507
- 1. Strategic alignment assessment
508
- 2. Key risks and gaps identified
509
- 3. Opportunities and synergies
510
- 4. Recommended next steps
511
- 5. Overall recommendation
512
-
513
- Format the response with clear sections and bullet points."""
514
-
515
- try:
516
- from langchain_core.messages import HumanMessage
517
- response = self.agent.llm.invoke([HumanMessage(content=prompt)])
518
- return escape_markdown_math(response.content.strip())
519
- except Exception as e:
520
- return f"Failed to generate strategic analysis: {str(e)}"
521
-
522
- def _generate_basic_strategic_analysis(self, checklist_results: Dict) -> str:
523
- """Generate basic strategic analysis without AI"""
524
  total_items = sum(cat['total_items'] for cat in checklist_results.values())
525
  matched_items = sum(cat['matched_items'] for cat in checklist_results.values())
526
  coverage = (matched_items / total_items * 100) if total_items > 0 else 0
@@ -547,102 +461,30 @@ Based on the document structure, this data room appears to cover standard due di
547
  """
548
 
549
  return analysis
550
 
551
 
552
- class DDChecklistService:
553
- """
554
- Main service orchestrator for DD-Checklist operations
555
- Coordinates between different services and manages the overall workflow
556
- """
557
-
558
- def __init__(self, model: SentenceTransformer, agent=None):
559
- """
560
- Initialize the service
561
-
562
- Args:
563
- model: SentenceTransformer model
564
- agent: Optional AI agent
565
- """
566
- self.model = model
567
- self.agent = agent
568
- self.document_processor = DocumentProcessor(model)
569
- self.checklist_parser = ChecklistParser()
570
- self.question_parser = QuestionParser()
571
- self.checklist_matcher = ChecklistMatcher(model)
572
- self.question_answerer = QuestionAnswerer(model)
573
- self.report_generator = ReportGenerator(agent)
574
-
575
- def process_data_room(
576
- self,
577
- data_room_path: str,
578
- checklist_text: str = "",
579
- questions_text: str = ""
580
- ) -> Dict[str, Any]:
581
- """
582
- Process entire data room with checklist and questions
583
-
584
- Args:
585
- data_room_path: Path to data room
586
- checklist_text: Optional checklist text
587
- questions_text: Optional questions text
588
-
589
- Returns:
590
- Dictionary with all processing results
591
- """
592
- results = {}
593
-
594
- # Load data room
595
- load_results = self.document_processor.load_data_room(data_room_path)
596
- results['load_results'] = load_results
597
-
598
- # Parse checklist if provided
599
- checklist = {}
600
- if checklist_text:
601
- checklist = self.checklist_parser.parse_checklist(checklist_text)
602
- results['checklist'] = checklist
603
-
604
- # Parse questions if provided
605
- questions = []
606
- if questions_text:
607
- questions = self.question_parser.parse_questions(questions_text)
608
- results['questions'] = questions
609
-
610
- # Match checklist to documents
611
- checklist_results = {}
612
- if checklist and self.document_processor.chunks:
613
- checklist_results = self.checklist_matcher.match_checklist_to_documents(
614
- checklist,
615
- self.document_processor.chunks,
616
- self.document_processor.embeddings
617
- )
618
- results['checklist_results'] = checklist_results
619
-
620
- # Answer questions
621
- question_answers = {}
622
- if questions and self.document_processor.chunks and self.document_processor.embeddings is not None:
623
- question_answers = self.question_answerer.answer_questions_with_chunks(
624
- questions,
625
- self.document_processor.chunks,
626
- self.document_processor.embeddings
627
- )
628
- results['question_answers'] = question_answers
629
-
630
- return results
631
-
632
- def search_documents(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
633
- """
634
- Search documents using the document processor
635
-
636
- Args:
637
- query: Search query
638
- top_k: Number of results
639
- threshold: Similarity threshold
640
-
641
- Returns:
642
- Search results
643
- """
644
- return self.document_processor.search(query, top_k, threshold)
645
-
646
- def get_processing_statistics(self) -> Dict[str, Any]:
647
- """Get comprehensive processing statistics"""
648
- return self.document_processor.get_statistics()
 
 """
 Business Logic Services Module
 
+Simplified service layer with focused functions instead of over-abstracted classes.
 """
 
 import re
+import logging
+import warnings
 from pathlib import Path
+
+# Suppress verbose LangChain warnings in services
+warnings.filterwarnings("ignore", category=UserWarning, module="langchain")
+warnings.filterwarnings("ignore", category=UserWarning, module="langchain_core")
+warnings.filterwarnings("ignore", category=UserWarning, module="langchain_community")
+warnings.filterwarnings("ignore", message=".*Relevance scores must be between.*")
+warnings.filterwarnings("ignore", message=".*No relevant docs were retrieved.*")
+from typing import Dict, List, Optional, Any
+import markdown
+
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import PromptTemplate
+from langchain_community.vectorstores import FAISS
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_core.documents import Document
+from langchain_core.messages import HumanMessage
 
 from .config import get_config
+from .document_processing import DocumentProcessor, escape_markdown_math
+
+
+logger = logging.getLogger(__name__)
+
+
+# =============================================================================
+# PARSING FUNCTIONS - Simplified from ChecklistParser and QuestionParser classes
+# =============================================================================
+
+def parse_checklist(checklist_text: str) -> Dict:
+    """Parse markdown checklist into categories and items using standard markdown parser"""
+    categories = {}
+    current_category = None
+
+    # Parse line by line for reliable extraction
+    lines = checklist_text.split('\n')
+    for line_num, original_line in enumerate(lines):
+        line = original_line.strip()
+
+        # Skip empty lines and separator lines
+        if not line or line.startswith('⸻') or line.startswith('---'):
+            continue
+
+        # Skip title lines
+        if 'due diligence checklist' in line.lower() or line.startswith('#'):
+            continue
+
+        # Category headers - look for pattern "A. Category Name"
+        category_match = re.match(r'^([A-Z])\.\s+(.+)', line)
+        if category_match and not re.match(r'^\d+\.\s+', line):
+            letter, name = category_match.groups()
+            current_category = letter
+            categories[letter] = {
+                'name': name.strip(),
+                'items': []
+            }
+            continue
+
+        # Numbered items within categories - look for indented items
+        if current_category and line:
+            # Check if original line was indented (starts with tab or multiple spaces)
+            is_indented = original_line.startswith(('\t', '    ', '  '))
+            item_match = re.match(r'^\d+\.\s+(.+)', line)
+
+            if item_match and (is_indented or current_category):
+                item_text = item_match.group(1).strip()
+                if item_text and not item_text.lower().startswith('[other requests'):
+                    # Clean up markdown formatting but preserve content
+                    clean_text = re.sub(r'\[.*?\]', '', item_text).strip()
+                    if not clean_text:
+                        clean_text = item_text
+
+                    categories[current_category]['items'].append({
+                        'text': clean_text,
+                        'original': item_text
+                    })
 
+    return categories
 
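For reference, a sketch of the input shape parse_checklist is written for, with made-up checklist text (not from the repository):

sample = """A. Corporate Organization
    1. Articles of Incorporation [most recent version]
    2. Bylaws
B. Financial Information
    1. Audited financial statements
"""
parsed = parse_checklist(sample)
# parsed['A']['name'] == 'Corporate Organization'
# parsed['A']['items'][0]['text'] == 'Articles of Incorporation'  (bracketed note stripped)
# parsed['B']['items'] holds one item: 'Audited financial statements'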
+
+
+def parse_questions(questions_text: str) -> List[Dict]:
+    """Parse markdown questions into a list using standard markdown parser"""
+    # Convert markdown to understand structure
+    md = markdown.Markdown(extensions=['toc'])
+    html = md.convert(questions_text)
+
+    questions = []
+    current_category = None
+
+    # Parse line by line for reliable extraction
+    lines = questions_text.split('\n')
+    for line in lines:
+        line = line.strip()
+
+        # Category headers (### format)
+        if line.startswith('### '):
+            match = re.match(r'###\s+([A-Z])\.\s+(.+)', line)
             if match:
                 letter, name = match.groups()
+                current_category = f"{letter}. {name.strip()}"
 
+        # Question items (numbered lists)
+        elif current_category and line:
+            match = re.match(r'^\d+\.\s+(.+)', line)
+            if match:
+                question_text = match.group(1).strip()
+                if question_text:
+                    # Clean markdown formatting
+                    clean_question = re.sub(r'\*\*(.*?)\*\*', r'\1', question_text)  # Remove bold
+                    clean_question = re.sub(r'\*(.*?)\*', r'\1', clean_question)  # Remove italics
+
+                    questions.append({
+                        'category': current_category,
+                        'question': clean_question,
+                        'id': f"q_{len(questions)}"
+                    })
+
+    return questions
 
 
+# =============================================================================
+# SEARCH FUNCTIONS - Consolidated from ChecklistMatcher and QuestionAnswerer
+# =============================================================================
+
+def create_vector_store(source_data, model_name: str) -> FAISS:
+    """Unified vector store creation from various data sources"""
+    embeddings = HuggingFaceEmbeddings(model_name=model_name)
+
+    # Handle different input types
+    if isinstance(source_data, list):
+        if all(isinstance(item, Document) for item in source_data):
+            # Already LangChain documents
+            return FAISS.from_documents(source_data, embeddings)
+        elif all(isinstance(item, dict) for item in source_data):
+            # Document chunks
+            documents = [
+                Document(
+                    page_content=chunk['text'],
+                    metadata={
+                        'source': chunk.get('source', ''),
+                        'path': chunk.get('path', ''),
+                        'full_path': chunk.get('full_path', ''),
+                        **chunk.get('metadata', {})
+                    }
+                ) for chunk in source_data
+            ]
+            return FAISS.from_documents(documents, embeddings)
+    elif isinstance(source_data, dict) and 'documents' in source_data:
+        # Document embeddings data with summaries
+        documents = [
+            Document(
+                page_content=f"{doc['name']}\n{doc['path']}\n{doc['summary']}",
+                metadata={
+                    'name': doc['name'],
+                    'path': doc['path'],
+                    'summary': doc['summary'],
+                    **doc.get('original_doc', {}).get('metadata', {})
+                }
+            ) for doc in source_data['documents']
+        ]
+        return FAISS.from_documents(documents, embeddings)
+
+    raise ValueError("Unsupported data type for vector store creation")
 
 
+def search_and_analyze(queries: List[Dict], vector_store: FAISS, llm=None, threshold: float = 0.7, search_type: str = 'items') -> Dict:
+    """Unified search function for both checklist items and questions using LangChain RAG"""
+    from langchain.chains import RetrievalQA
+    from langchain.prompts import PromptTemplate
+
+    retriever = vector_store.as_retriever(
+        search_type="similarity_score_threshold",
+        search_kwargs={"score_threshold": threshold, "k": 5 if search_type == 'questions' else 10}
+    )
+
+    # Create RAG chain if LLM is provided
+    qa_chain = None
+    if llm:
+        prompt_template = PromptTemplate(
+            input_variables=["context", "question"],
+            template="""Use the provided context to answer the question. Be concise and factual.
+
+Context: {context}
+
+Question: {question}
+
+Answer:"""
+        )
+        qa_chain = RetrievalQA.from_chain_type(
+            llm=llm,
+            chain_type="stuff",
+            retriever=retriever,
+            chain_type_kwargs={"prompt": prompt_template}
+        )
+
+    if search_type == 'items':
+        return _process_checklist_items(queries, retriever, qa_chain)
+    else:
+        return _process_questions(queries, retriever, qa_chain)
+
+
+def _process_checklist_items(checklist: Dict, retriever, qa_chain=None) -> Dict:
+    """Process checklist items with unified search logic"""
+    results = {}
+    for cat_letter, category in checklist.items():
+        cat_results = {
+            'name': category['name'],
+            'items': [],
+            'total_items': len(category['items']),
+            'matched_items': 0
+        }
+
+        for item in category['items']:
+            query = f"{category['name']}: {item['text']}"
+            try:
+                docs = retriever.invoke(query)
+            except Exception as e:
+                logger.error(f"Error in document matching: {e}")
+                docs = []
+
+            matches = [{
+                'name': doc.metadata.get('source', ''),
+                'path': doc.metadata.get('path', ''),
+                'full_path': doc.metadata.get('full_path', ''),
+                'score': 0.8,  # LangChain similarity scores not directly accessible
+                'metadata': {k: v for k, v in doc.metadata.items()
+                             if k not in ['source', 'path', 'full_path']}
+            } for doc in docs[:5]]
+
+            if matches:
+                cat_results['matched_items'] += 1
+
+            cat_results['items'].append({
+                'text': item['text'],
+                'original': item['original'],
+                'matches': matches
+            })
+
+        results[cat_letter] = cat_results
+
+    return results
 
 
+def _process_questions(questions: List[Dict], retriever, qa_chain=None) -> Dict:
+    """Process questions with unified search logic"""
+    answers = {}
+    for question in questions:
+        try:
+            docs = retriever.invoke(question['question'])
+        except Exception as e:
+            logger.error(f"Error in question answering: {e}")
+            docs = []
+
+        if docs:
+            chunks_data = [{
+                'text': doc.page_content[:500],
+                'source': doc.metadata.get('source', ''),
+                'path': doc.metadata.get('path', ''),
+                'score': 0.8,
+                'metadata': {k: v for k, v in doc.metadata.items()
+                             if k not in ['source', 'path']}
+            } for doc in docs]
+
+            # Generate answer using RAG chain if available
+            answer_text = "Retrieved relevant document chunks."
+            if qa_chain:
+                try:
+                    answer_text = qa_chain.run(question['question'])
+                except Exception as e:
+                    logger.error(f"RAG chain failed: {e}")
+                    answer_text = "Retrieved relevant document chunks."
+
             answers[question['id']] = {
                 'question': question['question'],
                 'category': question['category'],
+                'answer': answer_text,
+                'chunks': chunks_data,
+                'has_answer': True
             }
+        else:
+            answers[question['id']] = {
+                'question': question['question'],
+                'category': question['category'],
+                'answer': "No relevant documents found",
+                'chunks': [],
+                'has_answer': False
+            }
+
+    return answers
 
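Taken together, a retrieval-only sketch of how create_vector_store and search_and_analyze compose; the chunk dict and checklist below are hypothetical, and passing llm=None skips the RetrievalQA chain:

chunks = [
    {'text': 'The company was incorporated in Delaware in 2015.',
     'source': 'articles.pdf', 'path': 'corporate/articles.pdf', 'metadata': {}},
]
store = create_vector_store(chunks, model_name='sentence-transformers/all-MiniLM-L6-v2')  # assumed model

checklist = {'A': {'name': 'Corporate Organization',
                   'items': [{'text': 'Articles of Incorporation',
                              'original': 'Articles of Incorporation'}]}}
results = search_and_analyze(checklist, store, llm=None, threshold=0.3, search_type='items')
print(results['A']['matched_items'])  # 1 if the retriever clears the score threshold, else 0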
 
+# =============================================================================
+# REPORT GENERATION FUNCTIONS - Simplified from ReportGenerator class
+# =============================================================================
+
+def generate_reports(documents: Dict[str, Dict], data_room_name: str = "Unknown",
+                     strategy_text: str = "", checklist_results: Dict = None,
+                     report_type: str = "overview", llm=None) -> str:
+    """Unified report generation using LangChain prompt templates"""
+
+    if not llm:
+        return _generate_basic_report(documents, data_room_name, checklist_results, report_type)
+
+    # Define prompt templates
+    if report_type == "overview":
+        template = PromptTemplate(
+            input_variables=["company_name", "document_summaries"],
+            template="""Based on the following document summaries from a due diligence data room, provide a comprehensive company overview.
+
+Company: {company_name}
+
+Document Summaries:
+{document_summaries}
+
+Please provide:
+1. Company name and industry
+2. Business model and key products/services
+3. Market position and competitive advantages
+4. Key financials (if available)
+5. Organizational structure
+6. Notable risks or concerns
+7. Overall assessment for M&A consideration
+
+Format the response in clear sections with bullet points where appropriate."""
+        )
+
+        # Prepare document summaries
         doc_summaries = []
+        for path, doc_info in list(documents.items())[:10]:
             if 'summary' in doc_info:
                 doc_summaries.append(f"{doc_info['name']}: {doc_info['summary']}")
             else:
                 content_preview = doc_info.get('content', '')[:500]
                 if content_preview:
                     doc_summaries.append(f"{doc_info['name']}: {content_preview}")
 
         if not doc_summaries:
             return "No documents available for summary generation."
 
+        inputs = {
+            "company_name": data_room_name,
+            "document_summaries": "\n".join(doc_summaries[:10])
+        }
 
+    elif report_type == "strategic":
+        template = PromptTemplate(
+            input_variables=["strategy_text", "checklist_context"],
+            template="""Based on the due diligence checklist results and the selected strategy, provide a strategic analysis.
+
+Strategy Document:
+{strategy_text}
+
+Checklist Coverage:
+{checklist_context}
+
+Please provide:
+1. Strategic alignment assessment
+2. Key risks and gaps identified
+3. Opportunities and synergies
+4. Recommended next steps
+5. Overall recommendation
+
+Format the response with clear sections and bullet points."""
+        )
+
+        # Build checklist context
+        if not checklist_results:
+            return "No checklist results available for strategic analysis."
+
+        checklist_context = []
+        for cat_id, cat_data in checklist_results.items():
+            cat_name = cat_data['name']
+            matched_items = cat_data['matched_items']
+            total_items = cat_data['total_items']
+            coverage = (matched_items / total_items * 100) if total_items > 0 else 0
+
+            checklist_context.append(f"- {cat_name}: {coverage:.0f}% coverage ({matched_items}/{total_items} items)")
+
+            # Add details about gaps
+            missing_items = [item['text'] for item in cat_data['items'] if not item['matches']]
+            if missing_items and len(missing_items) <= 3:
+                checklist_context.append(f"  Missing: {', '.join(missing_items[:3])}")
+
+        inputs = {
+            "strategy_text": strategy_text,
+            "checklist_context": "\n".join(checklist_context)
+        }
+
+    # Execute the chain
+    try:
+        chain = template | llm | StrOutputParser()
+        response = chain.invoke(inputs)
+        return escape_markdown_math(response.strip())
+    except Exception as e:
+        logger.error(f"LLM report generation failed: {e}")
+        return f"Failed to generate {report_type} report: {str(e)}"
+
+
+def _generate_basic_report(documents: Dict[str, Dict], data_room_name: str,
+                           checklist_results: Dict, report_type: str) -> str:
+    """Generate basic reports without AI"""
+    if report_type == "overview":
         doc_count = len(documents)
         file_types = {}
 
             doc_type = doc_info.get('metadata', {}).get('type', 'unknown')
             file_types[doc_type] = file_types.get(doc_type, 0) + 1
 
+        return f"""# Company Overview: {data_room_name}
 
 ## Document Analysis
 - **Total Documents**: {doc_count}
 
 *Note: Enable AI features for detailed company analysis and insights.*
 """
 
+    elif report_type == "strategic":
+        if not checklist_results:
+            return "No checklist results available for strategic analysis."
 
         total_items = sum(cat['total_items'] for cat in checklist_results.values())
         matched_items = sum(cat['matched_items'] for cat in checklist_results.values())
         coverage = (matched_items / total_items * 100) if total_items > 0 else 0
 
 """
 
         return analysis
+
+    return "Invalid report type specified."
+
+
+# =============================================================================
+# MAIN SERVICE FUNCTIONS - Simplified orchestration
+# =============================================================================
+
+
+def search_documents(doc_processor: DocumentProcessor, query: str, top_k: int = 5,
+                     threshold: Optional[float] = None) -> List[Dict]:
+    """Search documents using the document processor"""
+    return doc_processor.search(query, top_k, threshold)
+
+
+def load_default_file(directory: Path, pattern: str) -> str:
+    """Load the first file matching pattern from directory"""
+    try:
+        files = list(directory.glob(pattern))
+        return files[0].read_text(encoding='utf-8') if files else ""
+    except Exception as e:
+        logger.error(f"File loading failed: {e}")
+        return ""
 
src/ui_components.py CHANGED
@@ -9,10 +9,11 @@ Separates UI logic from business logic for better maintainability.
9
  import streamlit as st
10
  from pathlib import Path
11
  from typing import Dict, List, Optional, Tuple, Any
12
- import numpy as np
13
  import base64
14
 
15
- from .config import get_config
 
16
 
17
 
18
  def create_document_link(file_path: str, doc_name: str, doc_title: str) -> str:
@@ -89,8 +90,7 @@ def render_project_selector() -> Tuple[Optional[str], Optional[str]]:
89
  subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
90
  if subdirs:
91
  # Count total documents in all data rooms
92
- total_docs = sum(1 for f in project_dir.rglob('*')
93
- if f.is_file() and f.suffix.lower() in ['.pdf', '.docx', '.doc', '.txt', '.md'])
94
  if total_docs > 0:
95
  projects.append({
96
  'name': project_dir.name.replace('-', ' ').replace('_', ' ').title(),
@@ -106,8 +106,7 @@ def render_project_selector() -> Tuple[Optional[str], Optional[str]]:
106
  subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
107
  if subdirs:
108
  # Count total documents in all data rooms
109
- total_docs = sum(1 for f in project_dir.rglob('*')
110
- if f.is_file() and f.suffix.lower() in ['.pdf', '.docx', '.doc', '.txt', '.md'])
111
  if total_docs > 0:
112
  projects.append({
113
  'name': project_dir.name.replace('-', ' ').replace('_', ' ').title(),
@@ -169,8 +168,7 @@ def render_data_room_selector(project_path: str) -> Optional[str]:
169
  for data_room_dir in project_path_obj.iterdir():
170
  if data_room_dir.is_dir() and not data_room_dir.name.startswith('.'):
171
  # Count documents for display
172
- doc_count = sum(1 for f in data_room_dir.rglob('*')
173
- if f.is_file() and f.suffix.lower() in ['.pdf', '.docx', '.doc', '.txt', '.md'])
174
  if doc_count > 0: # Only show directories with documents
175
  data_rooms.append({
176
  'name': data_room_dir.name.replace('-', ' ').replace('_', ' ').title(),
@@ -221,12 +219,11 @@ def render_ai_settings() -> Tuple[bool, Optional[str], str]:
221
  model_choice = config.model.claude_model
222
 
223
  if use_ai_features:
224
- # Check if API key is in environment
225
- import os
226
- env_key = os.getenv('ANTHROPIC_API_KEY')
227
- if env_key:
228
  st.success("✅ API key loaded from .env file")
229
- api_key = env_key
230
  else:
231
  api_key = st.text_input(
232
  "Anthropic API Key",
@@ -276,11 +273,11 @@ def render_file_selector(directory: str, file_type: str, key_suffix: str) -> Tup
276
  if dir_path.exists():
277
  for file in dir_path.glob("*.md"):
278
  if not file.name.startswith('.'):
279
- files.append({
280
- 'name': file.stem.replace('_', ' ').replace('-', ' ').title(),
281
- 'path': str(file),
282
- 'filename': file.name
283
- })
284
 
285
  file_content = ""
286
  selected_file_path = None
@@ -483,10 +480,7 @@ def render_document_match(match: Dict, item_idx: int, primary_threshold: float)
483
  """
484
  # Get document title (use name without extension)
485
  doc_name = match.get('name', match.get('path', 'Unknown'))
486
- if '.' in doc_name:
487
- doc_title = doc_name.rsplit('.', 1)[0].replace('_', ' ').replace('-', ' ').title()
488
- else:
489
- doc_title = doc_name.replace('_', ' ').replace('-', ' ').title()
490
 
491
  # Compact display with columns
492
  col1, col2, col3 = st.columns([0.8, 3.5, 0.5])
@@ -535,17 +529,7 @@ def render_download_button(match: Dict, item_idx: int, doc_name: str, doc_title:
535
  file_bytes = f.read()
536
 
537
  # Determine MIME type based on file extension
538
- file_extension = file_path.suffix.lower()
539
- if file_extension == '.pdf':
540
- mime_type = 'application/pdf'
541
- elif file_extension in ['.doc', '.docx']:
542
- mime_type = 'application/msword'
543
- elif file_extension == '.txt':
544
- mime_type = 'text/plain'
545
- elif file_extension == '.md':
546
- mime_type = 'text/markdown'
547
- else:
548
- mime_type = 'application/octet-stream'
549
 
550
  button_key = f"dl_{item_idx}_{match['score']:.0f}_{doc_name[:20]}".replace(" ", "_").replace("/", "_").replace(".", "_")
551
 
@@ -648,10 +632,7 @@ def render_question_source(chunk: Dict, chunk_idx: int, question: str) -> None:
648
  with col2:
649
  # Get clean document title
650
  doc_name = chunk['source']
651
- if '.' in doc_name:
652
- doc_title = doc_name.rsplit('.', 1)[0].replace('_', ' ').replace('-', ' ').title()
653
- else:
654
- doc_title = doc_name.replace('_', ' ').replace('-', ' ').title()
655
 
656
  # Document title as clickable link
657
  doc_path = chunk.get('path', '')
@@ -675,17 +656,7 @@ def render_question_source(chunk: Dict, chunk_idx: int, question: str) -> None:
675
  file_bytes = f.read()
676
 
677
  # Determine MIME type based on file extension
678
- file_extension = file_path.suffix.lower()
679
- if file_extension == '.pdf':
680
- mime_type = 'application/pdf'
681
- elif file_extension in ['.doc', '.docx']:
682
- mime_type = 'application/msword'
683
- elif file_extension == '.txt':
684
- mime_type = 'text/plain'
685
- elif file_extension == '.md':
686
- mime_type = 'text/markdown'
687
- else:
688
- mime_type = 'application/octet-stream'
689
 
690
  button_key = f"qa_dl_{question[:20]}_{chunk_idx}".replace(" ", "_").replace("?", "").replace("/", "_")
691
 
@@ -718,7 +689,7 @@ def render_ai_answer_button(answer_data: Dict, chunks: List[Dict]) -> None:
718
  context = "\n\n".join([f"From {c['source']}: {c['text']}" for c in chunks[:3]])
719
  # Use LLM directly for more reliable answers
720
  from langchain_core.messages import HumanMessage
721
- from src.document_processing import escape_markdown_math
722
 
723
  prompt = f"Question: {answer_data['question']}\n\nContext from documents:\n{context}\n\nProvide a comprehensive answer based on the context."
724
  response = st.session_state.agent.llm.invoke([HumanMessage(content=prompt)])
 
9
  import streamlit as st
10
  from pathlib import Path
11
  from typing import Dict, List, Optional, Tuple, Any
12
+
13
  import base64
14
 
15
+ from .config import get_config, get_mime_type, format_document_title, count_documents_in_directory
16
+ from .document_processing import escape_markdown_math
17
 
18
 
19
  def create_document_link(file_path: str, doc_name: str, doc_title: str) -> str:
 
90
  subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
91
  if subdirs:
92
  # Count total documents in all data rooms
93
+ total_docs = count_documents_in_directory(project_dir)
 
94
  if total_docs > 0:
95
  projects.append({
96
  'name': project_dir.name.replace('-', ' ').replace('_', ' ').title(),
 
106
  subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
107
  if subdirs:
108
  # Count total documents in all data rooms
109
+ total_docs = count_documents_in_directory(project_dir)
 
110
  if total_docs > 0:
111
  projects.append({
112
  'name': project_dir.name.replace('-', ' ').replace('_', ' ').title(),
 
168
  for data_room_dir in project_path_obj.iterdir():
169
  if data_room_dir.is_dir() and not data_room_dir.name.startswith('.'):
170
  # Count documents for display
171
+ doc_count = count_documents_in_directory(data_room_dir)
 
172
  if doc_count > 0: # Only show directories with documents
173
  data_rooms.append({
174
  'name': data_room_dir.name.replace('-', ' ').replace('_', ' ').title(),
 
219
  model_choice = config.model.claude_model
220
 
221
  if use_ai_features:
222
+ # Check if API key is available in config (which loads from .env)
223
+ config_api_key = config.anthropic_api_key
224
+ if config_api_key:
 
225
  st.success("✅ API key loaded from .env file")
226
+ api_key = config_api_key
227
  else:
228
  api_key = st.text_input(
229
  "Anthropic API Key",
 
273
  if dir_path.exists():
274
  for file in dir_path.glob("*.md"):
275
  if not file.name.startswith('.'):
276
+ files.append({
277
+ 'name': format_document_title(file.stem),
278
+ 'path': str(file),
279
+ 'filename': file.name
280
+ })
281
 
282
  file_content = ""
283
  selected_file_path = None
 
480
  """
481
  # Get document title (use name without extension)
482
  doc_name = match.get('name', match.get('path', 'Unknown'))
483
+ doc_title = format_document_title(doc_name)
 
 
 
484
 
485
  # Compact display with columns
486
  col1, col2, col3 = st.columns([0.8, 3.5, 0.5])
 
529
  file_bytes = f.read()
530
 
531
  # Determine MIME type based on file extension
532
+ mime_type = get_mime_type(file_path)
 
534
  button_key = f"dl_{item_idx}_{match['score']:.0f}_{doc_name[:20]}".replace(" ", "_").replace("/", "_").replace(".", "_")
535
 
 
632
  with col2:
633
  # Get clean document title
634
  doc_name = chunk['source']
635
+ doc_title = format_document_title(doc_name)
 
 
 
636
 
637
  # Document title as clickable link
638
  doc_path = chunk.get('path', '')
 
656
  file_bytes = f.read()
657
 
658
  # Determine MIME type based on file extension
659
+ mime_type = get_mime_type(file_path)
660
 
661
  button_key = f"qa_dl_{question[:20]}_{chunk_idx}".replace(" ", "_").replace("?", "").replace("/", "_")
662
 
 
689
  context = "\n\n".join([f"From {c['source']}: {c['text']}" for c in chunks[:3]])
690
  # Use LLM directly for more reliable answers
691
  from langchain_core.messages import HumanMessage
692
+
693
 
694
  prompt = f"Question: {answer_data['question']}\n\nContext from documents:\n{context}\n\nProvide a comprehensive answer based on the context."
695
  response = st.session_state.agent.llm.invoke([HumanMessage(content=prompt)])
src/utils.py DELETED
@@ -1,640 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Utilities Module
4
-
5
- This module contains error handling, logging, and other utility functions
6
- for the DD-Checklist application.
7
- """
8
-
9
- import logging
10
- import functools
11
- import traceback
12
- from pathlib import Path
13
- from typing import Any, Callable, Optional, Dict, List, Union
14
- import streamlit as st
15
- from datetime import datetime
16
- import sys
17
- import os
18
-
19
-
20
- class DDChecklistLogger:
21
- """
22
- Custom logger for DD-Checklist application
23
- Handles both file and console logging with Streamlit integration
24
- """
25
-
26
- def __init__(self, name: str = "dd_checklist", log_level: str = "INFO"):
27
- """
28
- Initialize logger
29
-
30
- Args:
31
- name: Logger name
32
- log_level: Logging level
33
- """
34
- self.logger = logging.getLogger(name)
35
- self.logger.setLevel(getattr(logging, log_level.upper()))
36
-
37
- # Prevent duplicate handlers
38
- if not self.logger.handlers:
39
- self._setup_handlers()
40
-
41
- def _setup_handlers(self):
42
- """Setup logging handlers"""
43
- # Console handler
44
- console_handler = logging.StreamHandler(sys.stdout)
45
- console_formatter = logging.Formatter(
46
- '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
47
- )
48
- console_handler.setFormatter(console_formatter)
49
- self.logger.addHandler(console_handler)
50
-
51
- # File handler (if possible)
52
- try:
53
- log_dir = Path(".logs")
54
- log_dir.mkdir(exist_ok=True)
55
-
56
- log_file = log_dir / f"dd_checklist_{datetime.now().strftime('%Y%m%d')}.log"
57
- file_handler = logging.FileHandler(log_file)
58
- file_formatter = logging.Formatter(
59
- '%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s'
60
- )
61
- file_handler.setFormatter(file_formatter)
62
- self.logger.addHandler(file_handler)
63
- except Exception:
64
- # File logging not available (e.g., on Streamlit Cloud)
65
- pass
66
-
67
- def info(self, message: str, **kwargs):
68
- """Log info message"""
69
- self.logger.info(message, **kwargs)
70
-
71
- def warning(self, message: str, **kwargs):
72
- """Log warning message"""
73
- self.logger.warning(message, **kwargs)
74
- # Also show in Streamlit if available
75
- if 'st' in globals() and st:
76
- st.warning(message)
77
-
78
- def error(self, message: str, **kwargs):
79
- """Log error message"""
80
- self.logger.error(message, **kwargs)
81
- # Also show in Streamlit if available
82
- if 'st' in globals() and st:
83
- st.error(message)
84
-
85
- def debug(self, message: str, **kwargs):
86
- """Log debug message"""
87
- self.logger.debug(message, **kwargs)
88
-
89
- def exception(self, message: str, **kwargs):
90
- """Log exception with traceback"""
91
- self.logger.exception(message, **kwargs)
92
- # Show error in Streamlit if available
93
- if 'st' in globals() and st:
94
- st.error(f"{message} - Check logs for details.")
95
-
96
-
97
- # Global logger instance
98
- logger = DDChecklistLogger()
99
-
100
-
101
- def handle_exceptions(
102
- return_value: Any = None,
103
- show_error: bool = True,
104
- log_error: bool = True
105
- ) -> Callable:
106
- """
107
- Decorator for handling exceptions in functions
108
-
109
- Args:
110
- return_value: Value to return on exception
111
- show_error: Whether to show error in UI
112
- log_error: Whether to log the error
113
-
114
- Returns:
115
- Decorated function
116
- """
117
- def decorator(func: Callable) -> Callable:
118
- @functools.wraps(func)
119
- def wrapper(*args, **kwargs):
120
- try:
121
- return func(*args, **kwargs)
122
- except Exception as e:
123
- error_msg = f"Error in {func.__name__}: {str(e)}"
124
-
125
- if log_error:
126
- logger.exception(error_msg)
127
-
128
- if show_error and 'st' in globals() and st:
129
- st.error(error_msg)
130
-
131
- return return_value
132
- return wrapper
133
- return decorator
134
-
135
-
136
- def safe_execute(
137
- func: Callable,
138
- *args,
139
- default_return: Any = None,
140
- error_message: Optional[str] = None,
141
- show_error: bool = True,
142
- **kwargs
143
- ) -> Any:
144
- """
145
- Safely execute a function with error handling
146
-
147
- Args:
148
- func: Function to execute
149
- *args: Function arguments
150
- default_return: Default return value on error
151
- error_message: Custom error message
152
- show_error: Whether to show error in UI
153
- **kwargs: Function keyword arguments
154
-
155
- Returns:
156
- Function result or default_return on error
157
- """
158
- try:
159
- return func(*args, **kwargs)
160
- except Exception as e:
161
- msg = error_message or f"Error executing {func.__name__}: {str(e)}"
162
- logger.exception(msg)
163
-
164
- if show_error and 'st' in globals() and st:
165
- st.error(msg)
166
-
167
- return default_return
168
-
169
-
170
- class ErrorHandler:
171
- """
172
- Context manager for error handling
173
- """
174
-
175
- def __init__(
176
- self,
177
- error_message: str = "An error occurred",
178
- show_error: bool = True,
179
- reraise: bool = False
180
- ):
181
- """
182
- Initialize error handler
183
-
184
- Args:
185
- error_message: Message to display on error
186
- show_error: Whether to show error in UI
187
- reraise: Whether to reraise the exception
188
- """
189
- self.error_message = error_message
190
- self.show_error = show_error
191
- self.reraise = reraise
192
-
193
- def __enter__(self):
194
- return self
195
-
196
- def __exit__(self, exc_type, exc_val, exc_tb):
197
- if exc_type is not None:
198
- error_msg = f"{self.error_message}: {str(exc_val)}"
199
- logger.exception(error_msg)
200
-
201
- if self.show_error and 'st' in globals() and st:
202
- st.error(error_msg)
203
-
204
- if self.reraise:
205
- return False # Reraise the exception
206
-
207
- return True # Suppress the exception
208
-
209
-
210
- def validate_file_path(file_path: Union[str, Path]) -> bool:
211
- """
212
- Validate that a file path exists and is readable
213
-
214
- Args:
215
- file_path: Path to validate
216
-
217
- Returns:
218
- True if valid, False otherwise
219
- """
220
- try:
221
- path = Path(file_path)
222
- return path.exists() and path.is_file()
223
- except Exception as e:
224
- logger.warning(f"Invalid file path {file_path}: {e}")
225
- return False
226
-
227
-
228
- def validate_directory_path(dir_path: Union[str, Path]) -> bool:
229
- """
230
- Validate that a directory path exists
231
-
232
- Args:
233
- dir_path: Directory path to validate
234
-
235
- Returns:
236
- True if valid, False otherwise
237
- """
238
- try:
239
- path = Path(dir_path)
240
- return path.exists() and path.is_dir()
241
- except Exception as e:
242
- logger.warning(f"Invalid directory path {dir_path}: {e}")
243
- return False
244
-
245
-
246
- def ensure_directory(dir_path: Union[str, Path]) -> bool:
247
- """
248
- Ensure directory exists, create if it doesn't
249
-
250
- Args:
251
- dir_path: Directory path
252
-
253
- Returns:
254
- True if directory exists or was created, False otherwise
255
- """
256
- try:
257
- path = Path(dir_path)
258
- path.mkdir(parents=True, exist_ok=True)
259
- return True
260
- except Exception as e:
261
- logger.error(f"Could not create directory {dir_path}: {e}")
262
- return False
263
-
264
-
265
- def get_file_size(file_path: Union[str, Path]) -> Optional[int]:
266
- """
267
- Get file size in bytes
268
-
269
- Args:
270
- file_path: Path to file
271
-
272
- Returns:
273
- File size in bytes or None if error
274
- """
275
- try:
276
- return Path(file_path).stat().st_size
277
- except Exception as e:
278
- logger.warning(f"Could not get size for {file_path}: {e}")
279
- return None
280
-
281
-
282
- def format_file_size(size_bytes: int) -> str:
283
- """
284
- Format file size in human-readable format
285
-
286
- Args:
287
- size_bytes: Size in bytes
288
-
289
- Returns:
290
- Formatted size string
291
- """
292
- if size_bytes == 0:
293
- return "0 B"
294
-
295
- size_names = ["B", "KB", "MB", "GB"]
296
- size = size_bytes
297
-
298
- for i, unit in enumerate(size_names):
299
- if size < 1024 or i == len(size_names) - 1:
300
- return f"{size:.1f} {unit}"
301
- size /= 1024
302
-
303
- return f"{size:.1f} GB"
304
-
305
-
306
- def sanitize_filename(filename: str) -> str:
307
- """
308
- Sanitize filename for safe file operations
309
-
310
- Args:
311
- filename: Original filename
312
-
313
- Returns:
314
- Sanitized filename
315
- """
316
- import re
317
-
318
- # Remove or replace invalid characters
319
- sanitized = re.sub(r'[<>:"/\\|?*]', '_', filename)
320
-
321
- # Remove multiple underscores
322
- sanitized = re.sub(r'_+', '_', sanitized)
323
-
324
- # Trim and ensure not empty
325
- sanitized = sanitized.strip('_. ')
326
-
327
- if not sanitized:
328
- sanitized = "untitled"
329
-
330
- return sanitized
331
-
332
-
333
- def get_memory_usage() -> Dict[str, float]:
334
- """
335
- Get current memory usage information
336
-
337
- Returns:
338
- Dictionary with memory usage stats
339
- """
340
- try:
341
- import psutil
342
- process = psutil.Process(os.getpid())
343
- memory_info = process.memory_info()
344
-
345
- return {
346
- 'rss_mb': memory_info.rss / 1024 / 1024, # Resident Set Size
347
- 'vms_mb': memory_info.vms / 1024 / 1024, # Virtual Memory Size
348
- 'percent': process.memory_percent()
349
- }
350
- except ImportError:
351
- logger.warning("psutil not available, cannot get memory usage")
352
- return {}
353
- except Exception as e:
354
- logger.warning(f"Could not get memory usage: {e}")
355
- return {}
356
-
357
-
358
- def timing_decorator(func: Callable) -> Callable:
359
- """
360
- Decorator to time function execution
361
-
362
- Args:
363
- func: Function to time
364
-
365
- Returns:
366
- Decorated function
367
- """
368
- @functools.wraps(func)
369
- def wrapper(*args, **kwargs):
370
- import time
371
- start_time = time.time()
372
- result = func(*args, **kwargs)
373
- end_time = time.time()
374
-
375
- execution_time = end_time - start_time
376
- logger.debug(f"{func.__name__} executed in {execution_time:.2f} seconds")
377
-
378
- return result
379
- return wrapper
380
-
381
-
382
- class ProgressTracker:
383
- """
384
- Progress tracking utility for long-running operations with weighted ETA calculation
385
- """
386
-
387
- def __init__(self, total_steps: int, description: str = "Processing", step_weights: Optional[Dict[int, float]] = None):
388
- """
389
- Initialize progress tracker
390
-
391
- Args:
392
- total_steps: Total number of steps
393
- description: Description of the operation
394
- step_weights: Optional dict mapping step numbers to relative weights (default: all steps equal weight)
395
- """
396
- self.total_steps = total_steps
397
- self.current_step = 0
398
- self.description = description
399
- self.start_time = datetime.now()
400
- self.step_start_times = {} # Track when each step started
401
- self.step_durations = {} # Track actual duration of completed steps
402
-
403
- # Set up step weights (default: equal weight for all steps)
404
- if step_weights:
405
- self.step_weights = step_weights
406
- else:
407
- self.step_weights = {i: 1.0 for i in range(1, total_steps + 1)}
408
-
409
- # Calculate total weight for progress calculation
410
- self.total_weight = sum(self.step_weights.values())
411
-
412
- # Initialize Streamlit progress bar if available
413
- if 'st' in globals() and st:
414
- self.progress_bar = st.progress(0, text=f"{description}...")
415
- self.status_text = st.empty()
416
- else:
417
- self.progress_bar = None
418
- self.status_text = None
419
-
420
- def update(self, step: int, message: str = ""):
421
- """
422
- Update progress with weighted ETA calculation
423
-
424
- Args:
425
- step: Current step number
426
- message: Optional status message
427
- """
428
- now = datetime.now()
429
-
430
- # Record step timing
431
- if self.current_step != step:
432
- # Mark completion of previous step
433
- if self.current_step > 0 and self.current_step in self.step_start_times:
434
- self.step_durations[self.current_step] = (now - self.step_start_times[self.current_step]).total_seconds()
435
-
436
- # Mark start of new step
437
- self.step_start_times[step] = now
438
- self.current_step = step
439
-
440
- # Calculate weighted progress
441
- completed_weight = sum(self.step_weights.get(i, 1.0) for i in range(1, step))
442
- current_step_weight = self.step_weights.get(step, 1.0)
443
-
444
- # For current step, assume 50% completion unless we have sub-progress info
445
- current_progress_weight = completed_weight + (current_step_weight * 0.5)
446
- progress = current_progress_weight / self.total_weight if self.total_weight > 0 else 0
447
- progress = min(progress, 1.0) # Cap at 100%
448
-
449
- # Calculate improved ETA using weighted approach
450
- elapsed = (now - self.start_time).total_seconds()
451
- eta_str = ""
452
-
453
- if step > 1 and completed_weight > 0:
454
- # Use actual timing data from completed steps
455
- avg_time_per_weight = elapsed / completed_weight
456
- remaining_weight = self.total_weight - current_progress_weight
457
- eta = avg_time_per_weight * remaining_weight
458
-
459
- if eta > 1:
460
- if eta < 60:
461
- eta_str = f" (ETA: {eta:.0f}s)"
462
- elif eta < 3600:
463
- eta_str = f" (ETA: {eta/60:.1f}m)"
464
- else:
465
- eta_str = f" (ETA: {eta/3600:.1f}h)"
466
- elif step == 1 and elapsed > 5: # Only show ETA after 5 seconds
467
- # For first step, make a rough estimate based on step weights
468
- estimated_time_per_weight = elapsed / self.step_weights.get(1, 1.0)
469
- remaining_weight = self.total_weight - current_progress_weight
470
- eta = estimated_time_per_weight * remaining_weight
471
-
472
- if eta > 10: # Only show if meaningful
473
- if eta < 60:
474
- eta_str = f" (ETA: ~{eta:.0f}s)"
475
- else:
476
- eta_str = f" (ETA: ~{eta/60:.1f}m)"
477
-
478
- status_msg = f"{self.description}: {step}/{self.total_steps}{eta_str}"
479
- if message:
480
- status_msg += f" - {message}"
481
-
482
- # Update Streamlit components
483
- if self.progress_bar:
484
- self.progress_bar.progress(progress, text=status_msg)
485
-
486
- # Log progress at key milestones
487
- if step == 1 or step % max(1, self.total_steps // 5) == 0: # Log every 20%
488
- logger.info(status_msg)
489
-
490
- def update_step_progress(self, step: int, sub_progress: float, message: str = ""):
491
- """
492
- Update progress within a specific step (for long-running operations)
493
-
494
- Args:
495
- step: Current step number
496
- sub_progress: Progress within the step (0.0 to 1.0)
497
- message: Optional status message
498
- """
499
- now = datetime.now()
500
-
501
- # Ensure we're tracking this step
502
- if step not in self.step_start_times:
503
- self.step_start_times[step] = now
504
- self.current_step = step
505
-
506
- # Calculate weighted progress with sub-progress
507
- completed_weight = sum(self.step_weights.get(i, 1.0) for i in range(1, step))
508
- current_step_weight = self.step_weights.get(step, 1.0)
509
-
510
- # Use actual sub-progress instead of assuming 50%
511
- current_progress_weight = completed_weight + (current_step_weight * sub_progress)
512
- progress = current_progress_weight / self.total_weight if self.total_weight > 0 else 0
513
- progress = min(progress, 1.0) # Cap at 100%
514
-
515
- # Calculate improved ETA
516
- elapsed = (now - self.start_time).total_seconds()
517
- eta_str = ""
518
-
519
- if step > 1 and completed_weight > 0:
520
- # Use actual timing data from completed steps
521
- avg_time_per_weight = elapsed / completed_weight
522
- remaining_weight = self.total_weight - current_progress_weight
523
- eta = avg_time_per_weight * remaining_weight
524
-
525
- if eta > 1:
526
- if eta < 60:
527
- eta_str = f" (ETA: {eta:.0f}s)"
528
- elif eta < 3600:
529
- eta_str = f" (ETA: {eta/60:.1f}m)"
530
- else:
531
- eta_str = f" (ETA: {eta/3600:.1f}h)"
532
- elif step == 1 and elapsed > 5:
533
- # For first step, estimate based on current progress
534
- if sub_progress > 0.1: # Only estimate if we have meaningful progress
535
- step_elapsed = (now - self.step_start_times[step]).total_seconds()
536
- estimated_step_time = step_elapsed / sub_progress
537
- remaining_step_time = estimated_step_time * (1 - sub_progress)
538
-
539
- # Add estimated time for remaining steps
540
- remaining_weight = self.total_weight - self.step_weights.get(step, 1.0)
541
- estimated_time_per_weight = estimated_step_time / self.step_weights.get(step, 1.0)
542
- eta = remaining_step_time + (estimated_time_per_weight * remaining_weight)
543
-
544
- if eta > 10:
545
- if eta < 60:
546
- eta_str = f" (ETA: ~{eta:.0f}s)"
547
- else:
548
- eta_str = f" (ETA: ~{eta/60:.1f}m)"
549
-
550
- status_msg = f"{self.description}: {step}/{self.total_steps}{eta_str}"
551
- if message:
552
- status_msg += f" - {message}"
553
-
554
- # Update Streamlit components
555
- if self.progress_bar:
556
- self.progress_bar.progress(progress, text=status_msg)
557
-
558
- def complete(self, message: str = "Complete"):
559
- """
560
- Mark progress as complete
561
-
562
- Args:
563
- message: Completion message
564
- """
565
- if self.progress_bar:
566
- self.progress_bar.progress(1.0, text=f"{self.description}: {message}")
567
-
568
- elapsed = (datetime.now() - self.start_time).total_seconds()
569
- logger.info(f"{self.description} completed in {elapsed:.1f} seconds")
570
-
571
-
572
- def batch_process(
573
- items: List[Any],
574
- process_func: Callable,
575
- batch_size: int = 10,
576
- description: str = "Processing"
577
- ) -> List[Any]:
578
- """
579
- Process items in batches with progress tracking
580
-
581
- Args:
582
- items: List of items to process
583
- process_func: Function to process each item
584
- batch_size: Size of each batch
585
- description: Description for progress tracking
586
-
587
- Returns:
588
- List of processed results
589
- """
590
- results = []
591
- total_batches = (len(items) + batch_size - 1) // batch_size
592
-
593
- tracker = ProgressTracker(total_batches, description)
594
-
595
- for i in range(0, len(items), batch_size):
596
- batch = items[i:i + batch_size]
597
- batch_num = i // batch_size + 1
598
-
599
- try:
600
- batch_results = [process_func(item) for item in batch]
601
- results.extend(batch_results)
602
-
603
- tracker.update(batch_num, f"Batch {batch_num}/{total_batches}")
604
-
605
- except Exception as e:
606
- logger.error(f"Error processing batch {batch_num}: {e}")
607
- # Continue with remaining batches
608
- continue
609
-
610
- tracker.complete()
611
- return results
612
-
613
-
614
- # Streamlit-specific utilities
615
- def show_success(message: str):
616
- """Show success message in Streamlit"""
617
- if 'st' in globals() and st:
618
- st.success(message)
619
- logger.info(message)
620
-
621
-
622
- def show_info(message: str):
623
- """Show info message in Streamlit"""
624
- if 'st' in globals() and st:
625
- st.info(message)
626
- logger.info(message)
627
-
628
-
629
- def show_warning(message: str):
630
- """Show warning message in Streamlit"""
631
- if 'st' in globals() and st:
632
- st.warning(message)
633
- logger.warning(message)
634
-
635
-
636
- def show_error(message: str):
637
- """Show error message in Streamlit"""
638
- if 'st' in globals() and st:
639
- st.error(message)
640
- logger.error(message)
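For reviewers of this removal: the module above bundled three reusable patterns, an exception-swallowing decorator (handle_exceptions), an ErrorHandler context manager, and a weighted-ETA ProgressTracker. A short sketch of how they were typically combined by calling code, using the signatures from the deleted file (the calling code itself is illustrative and assumed, not part of this commit):

# Illustrative caller of the deleted utilities; signatures taken from the removed src/utils.py.
@handle_exceptions(return_value=[], show_error=True)
def load_documents(paths):
    with ErrorHandler("Document loading failed", reraise=False):
        tracker = ProgressTracker(total_steps=len(paths), description="Loading documents")
        results = []
        for i, path in enumerate(paths, start=1):
            results.append(path.read_text())
            tracker.update(i, message=path.name)
        tracker.complete("All documents loaded")
        return results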
 
uv.lock CHANGED
The diff for this file is too large to render. See raw diff