Juan Salas committed on
Commit 25ec886 · 1 Parent(s): 15ee652

FAISS file persistence
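The commit title, "FAISS file persistence", refers to saving the FAISS vector index to disk and reloading it instead of re-embedding every document on each run. The hunks below wire in langchain-community's FAISS store and langchain-huggingface embeddings, but the persistence call sites themselves are not visible in this diff, so the snippet below is only a sketch of the usual save/load round-trip; the helper name build_or_load_index and the faiss_index directory are illustrative, not taken from the commit.

from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

def build_or_load_index(chunks, model_name, index_dir="faiss_index"):
    """Reload a persisted FAISS index if one exists; otherwise build it from chunks and save it."""
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    try:
        # Recent langchain-community versions require an explicit opt-in to
        # deserialize the pickled docstore that save_local() writes.
        return FAISS.load_local(index_dir, embeddings, allow_dangerous_deserialization=True)
    except Exception:
        store = FAISS.from_documents(chunks, embeddings)  # chunks: LangChain Document objects
        store.save_local(index_dir)
        return store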
.streamlit/config.toml CHANGED
@@ -7,7 +7,6 @@ textColor = "#262730"
  [server]
  headless = true
  port = 8501
- enableCORS = false
 
  [client]
- showErrorDetails = false
+ showErrorDetails = true
app.py CHANGED
@@ -7,56 +7,65 @@ using the new modular architecture for better maintainability.
7
  """
8
 
9
  import os
 
10
  # Fix tokenizers parallelism warning early
11
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
12
13
  import streamlit as st
14
- import numpy as np
15
- from sentence_transformers import SentenceTransformer
16
  from pathlib import Path
17
- from typing import Dict, List, Optional, Any
18
 
19
  # Import our refactored modules
20
  from src import (
21
- get_config, init_config,
22
- DocumentProcessor, DDChecklistService,
23
- logger, handle_exceptions, safe_execute, ErrorHandler,
24
- render_project_selector, render_ai_settings, escape_markdown_math
 
25
  )
 
 
26
  from src.ui_components import (
27
- render_file_selector, render_progress_section, render_metrics_row,
28
- render_checklist_results, render_question_results, render_quick_questions,
29
- create_document_link
 
 
30
  )
31
- from src.services import ReportGenerator
32
- from src.utils import ProgressTracker, show_success, show_error, show_info
33
 
34
- # Import LangGraph + Anthropic configuration
35
- try:
36
- from src.ai import (
37
- DDChecklistAgent,
38
- LANGGRAPH_AVAILABLE,
39
- batch_summarize_documents,
40
- create_document_embeddings_with_summaries,
41
- match_checklist_with_summaries,
42
- generate_checklist_descriptions
43
- )
44
- LLM_AVAILABLE = LANGGRAPH_AVAILABLE
45
- except ImportError:
46
- LLM_AVAILABLE = False
47
- DDChecklistAgent = None
48
 
 
 
 
 
49
 
50
 
51
  class DDChecklistApp:
52
  """
53
  Main application class that orchestrates all components
54
  """
55
-
56
  def __init__(self):
57
  """Initialize the application"""
58
  # Initialize configuration
59
- self.config = init_config().get_config()
60
 
61
  # Initialize session state
62
  self._init_session_state()
@@ -69,63 +78,44 @@ class DDChecklistApp:
69
  )
70
 
71
  # Initialize services (will be loaded when needed)
72
- self.model = None
73
- self.service = None
74
  self.agent = None
75
 
76
  def _init_session_state(self):
77
- """Initialize Streamlit session state variables"""
78
- defaults = {
79
  'documents': {},
80
  'chunks': [],
81
  'embeddings': None,
82
- 'checklist': {},
83
  'checklist_results': {},
84
- 'questions': [],
85
  'question_answers': {},
86
- 'strategy_text': "",
87
- 'strategy_analysis': "",
88
  'company_summary': "",
 
89
  'agent': None,
90
- 'doc_embeddings_data': None,
91
- 'just_processed': False,
92
- 'is_processing': False,
93
- 'trigger_processing': False,
94
- 'processing_path': None
95
  }
96
 
97
- for key, default_value in defaults.items():
98
  if key not in st.session_state:
99
  st.session_state[key] = default_value
100
-
101
- @st.cache_resource
102
- def load_model(_self) -> SentenceTransformer:
103
- """Load the sentence transformer model"""
104
- with ErrorHandler("Failed to load AI model"):
105
- return SentenceTransformer(_self.config.model.sentence_transformer_model)
106
-
107
  def initialize_services(self):
108
  """Initialize core services"""
109
- if self.model is None:
110
- self.model = self.load_model()
111
-
112
- if self.service is None:
113
- self.service = DDChecklistService(self.model, self.agent)
114
 
115
  # Restore document processor state from session state if available
116
  if (hasattr(st.session_state, 'chunks') and st.session_state.chunks and
117
  hasattr(st.session_state, 'embeddings') and st.session_state.embeddings is not None):
118
 
119
- self.service.document_processor.chunks = st.session_state.chunks
120
- self.service.document_processor.embeddings = st.session_state.embeddings
121
- self.service.document_processor.documents = st.session_state.get('documents', {})
122
-
123
- # Ensure the document processor has the model
124
- self.service.document_processor.model = self.model
125
 
126
  def setup_ai_agent(self, api_key: str, model_choice: str) -> bool:
127
  """
128
- Setup AI agent if enabled
129
 
130
  Args:
131
  api_key: Anthropic API key
@@ -133,11 +123,7 @@ class DDChecklistApp:
133
 
134
  Returns:
135
  True if agent was successfully initialized
136
- """
137
- if not LLM_AVAILABLE or not DDChecklistAgent:
138
- show_error("AI packages not installed")
139
- return False
140
-
141
  try:
142
  with st.spinner("Initializing AI agent..."):
143
  agent = DDChecklistAgent(api_key, model_choice)
@@ -147,9 +133,6 @@ class DDChecklistApp:
147
  self.agent = agent
148
  show_success("✅ AI Agent ready")
149
 
150
- # Update service with agent
151
- if self.service:
152
- self.service.report_generator = ReportGenerator(agent)
153
 
154
  return True
155
  else:
@@ -198,15 +181,13 @@ class DDChecklistApp:
198
  self.agent = None
199
 
200
  return selected_data_room_path, use_ai_features, process_button
201
-
202
 
203
  def render_summary_tab(self):
204
- """Render the summary and analysis tab"""
205
  # Strategy selector
206
  strategy_path, strategy_text = render_file_selector(
207
  self.config.paths.strategy_dir, "Strategy", "tab"
208
  )
209
- st.session_state.strategy_text = strategy_text
210
 
211
  # Check if we have documents to display summaries
212
  if st.session_state.documents:
@@ -214,113 +195,115 @@ class DDChecklistApp:
214
  overview_tab, analysis_tab = st.tabs(["🏢 Company Overview", "🎯 Strategic Analysis"])
215
 
216
  with overview_tab:
217
- self._render_company_overview()
218
 
219
  with analysis_tab:
220
- self._render_strategic_analysis()
221
  else:
222
  show_info("👈 Configure and process data room to see analysis")
223
 
224
- def _render_company_overview(self):
225
- """Render company overview section"""
226
- # Auto-generate summary if not already present and AI is available
227
- if (not st.session_state.company_summary and
228
- hasattr(st.session_state, 'agent') and st.session_state.agent):
229
-
230
- with st.spinner("🤖 Generating company overview..."):
231
- report_gen = ReportGenerator(st.session_state.agent)
232
- data_room_name = Path(list(st.session_state.documents.keys())[0]).parent.name if st.session_state.documents else "Unknown"
233
- st.session_state.company_summary = report_gen.generate_company_summary(
234
- st.session_state.documents, data_room_name
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  )
236
 
237
- # Display the company summary if available
238
- if st.session_state.company_summary:
239
- st.markdown(st.session_state.company_summary)
240
 
241
  # Add export and regenerate buttons
 
 
 
 
 
242
  col1, col2 = st.columns([1, 5])
243
  with col1:
 
 
 
244
  st.download_button(
245
  "📥 Export Summary",
246
- data=f"# Company Overview\n\n{st.session_state.company_summary}",
247
- file_name=f"company_overview_{Path(list(st.session_state.documents.keys())[0]).parent.name if st.session_state.documents else 'export'}.md",
248
  mime="text/markdown",
249
- key="export_company_summary"
250
  )
251
  with col2:
252
- if st.button("🔄 Regenerate Overview"):
253
- st.session_state.company_summary = ""
254
  st.rerun()
255
-
256
- def _render_strategic_analysis(self):
257
- """Render strategic analysis section"""
258
- if not st.session_state.checklist_results:
259
- st.warning("⚠️ Process data room with checklist first to enable strategic analysis")
260
- return
261
-
262
- # Auto-generate analysis if not already present and AI is available
263
- if (not st.session_state.strategy_analysis and
264
- hasattr(st.session_state, 'agent') and st.session_state.agent):
265
-
266
- with st.spinner("🤖 Generating strategic analysis..."):
267
- report_gen = ReportGenerator(st.session_state.agent)
268
- st.session_state.strategy_analysis = report_gen.generate_strategic_analysis(
269
- st.session_state.strategy_text,
270
- st.session_state.checklist_results,
271
- st.session_state.documents
272
- )
273
-
274
- if st.session_state.strategy_analysis:
275
- st.markdown(st.session_state.strategy_analysis)
276
-
277
- # Add export and regenerate buttons
278
- col1, col2, col3 = st.columns([1, 1, 3])
279
  with col1:
280
- # Combined report export
281
  combined_report = f"# Due Diligence Report\n\n"
282
- combined_report += f"## Company Overview\n\n{st.session_state.company_summary}\n\n"
283
- combined_report += f"## Strategic Analysis\n\n{st.session_state.strategy_analysis}"
284
 
 
 
 
285
  st.download_button(
286
  "📥 Export Report",
287
  data=combined_report,
288
- file_name=f"dd_report_{Path(list(st.session_state.documents.keys())[0]).parent.name if st.session_state.documents else 'export'}.md",
289
  mime="text/markdown",
290
- key="export_combined_report"
291
  )
292
  with col2:
293
- if st.button("🔄 Regenerate Analysis"):
294
- st.session_state.strategy_analysis = ""
295
  st.rerun()
296
 
297
- def render_checklist_tab(self):
298
- """Render the checklist matching tab"""
299
- # Checklist selector
300
- checklist_path, checklist_text = render_file_selector(
301
- self.config.paths.checklist_dir, "Checklist", "tab"
302
- )
303
-
304
- if not checklist_text:
305
- show_error("No checklists found in data/checklist directory")
306
- return
307
-
308
- # Render results if available
309
- render_checklist_results(st.session_state.checklist_results)
310
-
311
- def render_questions_tab(self):
312
- """Render the questions tab"""
313
- # Question list selector
314
- questions_path, questions_text = render_file_selector(
315
- self.config.paths.questions_dir, "Question List", "tab"
316
- )
317
-
318
- if not questions_text:
319
- show_info("No question lists found in data/questions/")
320
- return
321
-
322
- # Render results if available
323
- render_question_results(st.session_state.question_answers)
324
 
325
  def render_qa_tab(self):
326
  """Render the Q&A with citations tab"""
@@ -346,13 +329,14 @@ class DDChecklistApp:
346
 
347
  def _handle_qa_query(self, question: str):
348
  """Handle Q&A query and display results"""
349
- if not self.service:
350
  self.initialize_services()
351
 
352
  # Use lower threshold for Q&A to get more relevant results
353
  qa_threshold = 0.25
354
 
355
- results = self.service.search_documents(
 
356
  question,
357
  top_k=self.config.ui.top_k_search_results,
358
  threshold=qa_threshold
@@ -369,7 +353,9 @@ class DDChecklistApp:
369
  context = "\n\n".join([f"From {r['source']}:\n{r['text']}" for r in results[:3]])
370
  # Use LLM directly for more reliable answers
371
  from langchain_core.messages import HumanMessage
372
- prompt = f"Question: {question}\n\nRelevant document excerpts:\n{context}\n\nProvide a comprehensive answer with citations to the sources."
 
 
373
  response = st.session_state.agent.llm.invoke([HumanMessage(content=prompt)])
374
  # Clean up any leading whitespace and escape math characters
375
  answer_text = escape_markdown_math(response.content.strip())
@@ -389,10 +375,7 @@ class DDChecklistApp:
389
  # Create clickable link for the document
390
  doc_path = result.get('path', result.get('full_path', ''))
391
  doc_name = result['source']
392
- if '.' in doc_name:
393
- doc_title = doc_name.rsplit('.', 1)[0].replace('_', ' ').replace('-', ' ').title()
394
- else:
395
- doc_title = doc_name.replace('_', ' ').replace('-', ' ').title()
396
 
397
  if doc_path:
398
  link_html = create_document_link(doc_path, doc_name, doc_title)
@@ -419,17 +402,7 @@ class DDChecklistApp:
419
  file_bytes = f.read()
420
 
421
  # Determine MIME type based on file extension
422
- file_extension = file_path.suffix.lower()
423
- if file_extension == '.pdf':
424
- mime_type = 'application/pdf'
425
- elif file_extension in ['.doc', '.docx']:
426
- mime_type = 'application/msword'
427
- elif file_extension == '.txt':
428
- mime_type = 'text/plain'
429
- elif file_extension == '.md':
430
- mime_type = 'text/markdown'
431
- else:
432
- mime_type = 'application/octet-stream'
433
 
434
  button_key = f"qacit_dl_{idx}_{question[:20]}".replace(" ", "_").replace("?", "")
435
 
@@ -444,238 +417,107 @@ class DDChecklistApp:
444
  except Exception as e:
445
  st.error(f"Download failed: {str(e)}")
446
 
447
- @handle_exceptions(show_error=True)
448
  def process_data_room(self, data_room_path: str):
449
- """
450
- Process the selected data room
451
-
452
- Args:
453
- data_room_path: Path to the data room to process
454
- """
455
  if not Path(data_room_path).exists():
456
  show_error(f"Data room path not found: {data_room_path}")
457
- st.session_state.is_processing = False # Reset flag on error
458
  return
459
 
460
- try:
461
- # Initialize services
462
  self.initialize_services()
 
 
463
 
464
- # Create progress container
465
- progress_container = st.container()
466
-
467
- with progress_container:
468
- st.markdown("### 🚀 Processing Data Room")
469
-
470
- # Define step weights based on expected complexity/duration
471
- step_weights = {
472
- 1: 1.0, # Scanning data room (fast)
473
- 2: 0.5, # Found documents (instant)
474
- 3: 8.0, # Generate AI summaries (very slow - depends on doc count)
475
- 4: 0.5, # AI summaries complete (instant)
476
- 5: 1.0, # Loading checklist and questions (fast)
477
- 6: 0.5, # Checklist and questions loaded (instant)
478
- 7: 3.0, # Generate checklist descriptions (moderate)
479
- 8: 0.5, # Descriptions generated (instant)
480
- 9: 2.0, # Match checklist to documents (moderate)
481
- 10: 0.5, # Checklist matching complete (instant)
482
- 11: 2.0, # Answer questions (moderate)
483
- 12: 0.5 # Complete (instant)
484
  }
485
 
486
- tracker = ProgressTracker(12, "Processing", step_weights)
487
-
488
- # Step 1: Load documents with parallel processing
489
- tracker.update(1, f"Scanning data room: {Path(data_room_path).name}")
490
-
491
- # Create a progress bar for detailed document loading progress
492
- doc_progress_placeholder = st.empty()
493
- with doc_progress_placeholder.container():
494
- doc_progress_bar = st.progress(0, text="Initializing document scan...")
495
 
496
- # Use parallel processing with progress tracking (max_workers=4 as specified)
497
- load_results = self.service.document_processor.load_data_room_with_progress(
498
- data_room_path,
499
- max_workers=4,
500
- progress_bar=doc_progress_bar
 
 
501
  )
502
 
503
- # Clear the detailed progress bar
504
- doc_progress_placeholder.empty()
505
-
506
- st.session_state.documents = self.service.document_processor.documents
507
- st.session_state.chunks = self.service.document_processor.chunks
508
- st.session_state.embeddings = self.service.document_processor.embeddings
509
-
510
- # Display performance metrics
511
- if 'performance' in load_results:
512
- perf = load_results['performance']
513
- tracker.update(2, f"Found {load_results['documents_count']} documents in {perf['total_time']:.1f}s "
514
- f"({perf['documents_per_second']:.1f} docs/sec)")
515
- logger.info(f"Document loading performance: {perf}")
516
- else:
517
- tracker.update(2, f"Found {load_results['documents_count']} documents")
518
-
519
- # Step 2: Generate AI summaries if agent available
520
- if hasattr(st.session_state, 'agent') and st.session_state.agent:
521
- doc_count = len(st.session_state.documents)
522
- tracker.update(3, f"Generating AI summaries for {doc_count} documents...")
523
-
524
- # Adjust weight for step 3 based on actual document count
525
- # More documents = longer processing time
526
- if doc_count > 50:
527
- step_weights[3] = min(15.0, doc_count * 0.15) # Scale with doc count, cap at 15
528
- elif doc_count > 20:
529
- step_weights[3] = doc_count * 0.2 # 4-10 weight for 20-50 docs
530
-
531
- # Recalculate total weight
532
- tracker.total_weight = sum(step_weights.values())
533
-
534
- # Convert documents for summarization
535
- docs_for_summary = []
536
- for path, doc_info in st.session_state.documents.items():
537
- docs_for_summary.append({
538
- 'name': doc_info['name'],
539
- 'path': doc_info['rel_path'],
540
- 'content': doc_info.get('content', '')[:1500],
541
- 'metadata': doc_info.get('metadata', {})
542
- })
543
-
544
- # Create a separate progress tracker for batch summarization
545
- st.session_state.summary_progress = st.progress(0, text="📝 Starting document summarization...")
546
-
547
- # Batch summarize
548
- summarized_docs = batch_summarize_documents(
549
- docs_for_summary,
550
- st.session_state.agent.llm,
551
- batch_size=self.config.processing.batch_size
552
- )
553
-
554
- # Clean up summary progress tracker
555
- if 'summary_progress' in st.session_state:
556
- st.session_state.summary_progress.progress(1.0, text="✅ Document summarization complete")
557
- del st.session_state.summary_progress
558
-
559
- # Store summaries
560
- for doc in summarized_docs:
561
- for path, doc_info in st.session_state.documents.items():
562
- if doc_info['rel_path'] == doc['path']:
563
- doc_info['summary'] = doc.get('summary', '')
564
-
565
- # Create embeddings using summaries
566
- st.session_state.doc_embeddings_data = create_document_embeddings_with_summaries(
567
- summarized_docs, self.model
568
- )
569
-
570
- tracker.update(4, f"AI summaries complete ({doc_count} documents processed)")
571
- else:
572
- tracker.update(4, "Skipping AI summaries (not enabled)")
573
-
574
- # Step 3: Parse checklist and questions
575
- tracker.update(5, "Loading checklist and questions...")
576
-
577
- # Load default checklist
578
- checklist_text = self._load_default_file(self.config.paths.checklist_path, "*.md")
579
- if checklist_text:
580
- st.session_state.checklist = self.service.checklist_parser.parse_checklist(checklist_text)
581
-
582
- # Load default questions
583
- questions_text = self._load_default_file(self.config.paths.questions_path, "*.md")
584
- if questions_text:
585
- st.session_state.questions = self.service.question_parser.parse_questions(questions_text)
586
-
587
- tracker.update(6, "Checklist and questions loaded")
588
-
589
- # Step 7: Generate checklist descriptions if AI is available
590
- if (hasattr(st.session_state, 'agent') and st.session_state.agent and
591
- st.session_state.checklist):
592
-
593
- tracker.update(7, "Generating checklist item descriptions...")
594
-
595
- # Create progress tracker for descriptions
596
- st.session_state.description_progress = st.progress(0, text="📝 Generating descriptions...")
597
-
598
- # Generate enhanced descriptions for better matching
599
- st.session_state.checklist = generate_checklist_descriptions(
600
- st.session_state.checklist,
601
- st.session_state.agent.llm,
602
- batch_size=self.config.processing.batch_size
603
- )
604
-
605
- # Clean up progress tracker
606
- if 'description_progress' in st.session_state:
607
- st.session_state.description_progress.progress(1.0, text="✅ Descriptions generated")
608
- del st.session_state.description_progress
609
-
610
- tracker.update(8, "Checklist descriptions generated")
611
- else:
612
- tracker.update(8, "Skipping description generation (AI not enabled)")
613
-
614
- # Step 9: Match checklist to documents
615
- if st.session_state.checklist and st.session_state.chunks:
616
- tracker.update(9, "Matching checklist to documents...")
617
-
618
- if hasattr(st.session_state, 'doc_embeddings_data') and st.session_state.doc_embeddings_data:
619
- # Use AI-enhanced matching with generated descriptions
620
- st.session_state.checklist_results = match_checklist_with_summaries(
621
- st.session_state.checklist,
622
- st.session_state.doc_embeddings_data,
623
- self.model,
624
- self.config.processing.similarity_threshold
625
- )
626
- else:
627
- # Use traditional matching
628
- st.session_state.checklist_results = self.service.checklist_matcher.match_checklist_to_documents(
629
- st.session_state.checklist,
630
- st.session_state.chunks,
631
- st.session_state.embeddings,
632
- self.config.processing.similarity_threshold
633
- )
634
-
635
- tracker.update(10, "Checklist matching complete")
636
-
637
- # Step 11: Answer questions
638
- if (st.session_state.questions and st.session_state.chunks and
639
- st.session_state.embeddings is not None):
640
-
641
- tracker.update(11, "Answering due diligence questions...")
642
-
643
- st.session_state.question_answers = self.service.question_answerer.answer_questions_with_chunks(
644
- st.session_state.questions,
645
- st.session_state.chunks,
646
- st.session_state.embeddings,
647
- self.config.processing.similarity_threshold
648
- )
649
-
650
- answered_count = sum(1 for a in st.session_state.question_answers.values() if a['has_answer'])
651
- tracker.update(12, f"Answered {answered_count}/{len(st.session_state.questions)} questions")
652
-
653
- tracker.complete("Processing complete!")
654
 
655
- # Small delay before clearing
656
- import time
657
- time.sleep(1.5)
658
- progress_container.empty()
659
-
660
- # Reset processing flag and mark as just processed on success
661
- st.session_state.is_processing = False
662
- st.session_state.just_processed = True
663
- st.rerun()
664
-
665
- except Exception:
666
- # Reset processing flag on any error
667
- st.session_state.is_processing = False
668
- raise # Let decorator handle error display
669
-
670
- def _load_default_file(self, directory: Path, pattern: str) -> str:
671
- """Load the first file matching pattern from directory"""
672
- try:
673
- files = list(directory.glob(pattern))
674
- if files:
675
- return files[0].read_text(encoding='utf-8')
676
- except Exception as e:
677
- logger.warning(f"Could not load default file from {directory}: {e}")
678
- return ""
679
 
680
  def run(self):
681
  """Run the main application"""
@@ -698,33 +540,20 @@ class DDChecklistApp:
698
  self.render_summary_tab()
699
 
700
  with tab2:
701
- self.render_checklist_tab()
702
 
703
  with tab3:
704
- self.render_questions_tab()
705
 
706
  with tab4:
707
  self.render_qa_tab()
708
 
709
- # Show success message if just processed
710
- if st.session_state.just_processed:
711
- show_success("✅ Data room processing complete! View results in the tabs above.")
712
- st.session_state.just_processed = False
713
 
714
- # Handle processing trigger
715
  if process_button and selected_data_room_path and not st.session_state.is_processing:
716
- # Set trigger and path for next render
717
- st.session_state.trigger_processing = True
718
- st.session_state.processing_path = selected_data_room_path
719
  st.session_state.is_processing = True
720
- st.rerun()
721
-
722
- # Execute processing if triggered
723
- if st.session_state.trigger_processing and st.session_state.processing_path:
724
- st.session_state.trigger_processing = False # Reset trigger
725
- processing_path = st.session_state.processing_path
726
- st.session_state.processing_path = None
727
- self.process_data_room(processing_path)
728
 
729
 
730
  def main():
 
7
  """
8
 
9
  import os
10
+ import warnings
11
  # Fix tokenizers parallelism warning early
12
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
13
 
14
+ # Suppress all LangChain verbose warnings globally
15
+ warnings.filterwarnings("ignore", category=UserWarning, module="langchain")
16
+ warnings.filterwarnings("ignore", category=UserWarning, module="langchain_core")
17
+ warnings.filterwarnings("ignore", category=UserWarning, module="langchain_community")
18
+ warnings.filterwarnings("ignore", category=UserWarning, module="langchain_huggingface")
19
+ warnings.filterwarnings("ignore", message=".*Relevance scores must be between.*")
20
+ warnings.filterwarnings("ignore", message=".*No relevant docs were retrieved.*")
21
+
22
+ # Set up LangChain logging levels early
23
+ import logging
24
+ logging.getLogger("langchain").setLevel(logging.ERROR)
25
+ logging.getLogger("langchain_core").setLevel(logging.ERROR)
26
+ logging.getLogger("langchain_community").setLevel(logging.ERROR)
27
+ logging.getLogger("langchain_huggingface").setLevel(logging.ERROR)
28
+
29
  import streamlit as st
30
+
 
31
  from pathlib import Path
32
+ from typing import Dict
33
 
34
  # Import our refactored modules
35
  from src import (
36
+ init_config, DocumentProcessor,
37
+ logger,
38
+ render_project_selector,
39
+ render_ai_settings, escape_markdown_math,
40
+ get_mime_type, format_document_title
41
  )
42
+ from src.document_processing import safe_execute
43
+ # Using Streamlit directly for simplicity
44
  from src.ui_components import (
45
+ render_file_selector, render_checklist_results, render_question_results,
46
+ render_quick_questions, create_document_link
47
+ )
48
+ from src.services import (
49
+ search_documents
50
  )
 
 
51
 
52
+ from src.config import show_success, show_error, show_info
53
 
54
+ # Import LangGraph + Anthropic configuration
55
+ from src.ai import (
56
+ DDChecklistAgent
57
+ )
58
 
59
 
60
  class DDChecklistApp:
61
  """
62
  Main application class that orchestrates all components
63
  """
64
+
65
  def __init__(self):
66
  """Initialize the application"""
67
  # Initialize configuration
68
+ self.config = init_config()
69
 
70
  # Initialize session state
71
  self._init_session_state()
 
78
  )
79
 
80
  # Initialize services (will be loaded when needed)
81
+ self.model_name = self.config.model.sentence_transformer_model
82
+ self.document_processor = None
83
  self.agent = None
84
 
85
  def _init_session_state(self):
86
+ """Initialize essential session state variables only"""
87
+ essential_defaults = {
88
  'documents': {},
89
  'chunks': [],
90
  'embeddings': None,
 
91
  'checklist_results': {},
 
92
  'question_answers': {},
 
 
93
  'company_summary': "",
94
+ 'strategy_analysis': "",
95
  'agent': None,
96
+ 'is_processing': False
 
 
 
 
97
  }
98
 
99
+ for key, default_value in essential_defaults.items():
100
  if key not in st.session_state:
101
  st.session_state[key] = default_value
102
+
 
 
 
 
 
 
103
  def initialize_services(self):
104
  """Initialize core services"""
105
+ if self.document_processor is None:
106
+ self.document_processor = DocumentProcessor(self.model_name)
 
 
 
107
 
108
  # Restore document processor state from session state if available
109
  if (hasattr(st.session_state, 'chunks') and st.session_state.chunks and
110
  hasattr(st.session_state, 'embeddings') and st.session_state.embeddings is not None):
111
 
112
+ self.document_processor.chunks = st.session_state.chunks
113
+ self.document_processor.embeddings = st.session_state.embeddings
114
+ # Note: Don't restore documents here - they'll be recreated from chunks if needed
 
 
 
115
 
116
  def setup_ai_agent(self, api_key: str, model_choice: str) -> bool:
117
  """
118
+ Setup AI agent
119
 
120
  Args:
121
  api_key: Anthropic API key
 
123
 
124
  Returns:
125
  True if agent was successfully initialized
126
+ """
 
 
 
 
127
  try:
128
  with st.spinner("Initializing AI agent..."):
129
  agent = DDChecklistAgent(api_key, model_choice)
 
133
  self.agent = agent
134
  show_success("✅ AI Agent ready")
135
 
 
 
 
136
 
137
  return True
138
  else:
 
181
  self.agent = None
182
 
183
  return selected_data_room_path, use_ai_features, process_button
 
184
 
185
  def render_summary_tab(self):
186
+ """Render consolidated summary and analysis tab"""
187
  # Strategy selector
188
  strategy_path, strategy_text = render_file_selector(
189
  self.config.paths.strategy_dir, "Strategy", "tab"
190
  )
 
191
 
192
  # Check if we have documents to display summaries
193
  if st.session_state.documents:
 
195
  overview_tab, analysis_tab = st.tabs(["🏢 Company Overview", "🎯 Strategic Analysis"])
196
 
197
  with overview_tab:
198
+ self._render_report_section("overview", strategy_text=strategy_text)
199
 
200
  with analysis_tab:
201
+ self._render_report_section("strategic", strategy_text=strategy_text)
202
  else:
203
  show_info("👈 Configure and process data room to see analysis")
204
 
205
+ def _render_report_section(self, report_type: str, strategy_text: str = ""):
206
+ """Unified report rendering for both overview and strategic analysis"""
207
+ from src.services import generate_reports
208
+
209
+ summary_key = f"{report_type}_summary"
210
+
211
+ # Check prerequisites for strategic analysis
212
+ if report_type == "strategic" and not st.session_state.checklist_results:
213
+ st.warning("⚠️ Process data room with checklist first to enable strategic analysis")
214
+ return
215
+
216
+ # Auto-generate report if not already present and AI is available
217
+ if (not st.session_state.get(summary_key, "") and st.session_state.agent):
218
+ with st.spinner(f"🤖 Generating {report_type} analysis..."):
219
+ data_room_name = (Path(list(st.session_state.documents.keys())[0]).parent.name
220
+ if st.session_state.documents else "Unknown")
221
+
222
+ st.session_state[summary_key] = generate_reports(
223
+ st.session_state.documents,
224
+ data_room_name,
225
+ strategy_text,
226
+ st.session_state.checklist_results,
227
+ report_type,
228
+ st.session_state.agent.llm if st.session_state.agent else None
229
  )
230
 
231
+ # Display the report if available
232
+ if st.session_state.get(summary_key, ""):
233
+ st.markdown(st.session_state[summary_key])
234
 
235
  # Add export and regenerate buttons
236
+ self._render_report_actions(report_type, summary_key)
237
+
238
+ def _render_report_actions(self, report_type: str, summary_key: str):
239
+ """Render export and regenerate actions for reports"""
240
+ if report_type == "overview":
241
  col1, col2 = st.columns([1, 5])
242
  with col1:
243
+ company_name = (Path(list(st.session_state.documents.keys())[0]).parent.name
244
+ if st.session_state.documents else 'export')
245
+ file_name = f"company_overview_{company_name}.md"
246
  st.download_button(
247
  "📥 Export Summary",
248
+ data=f"# Company Overview\n\n{st.session_state[summary_key]}",
249
+ file_name=file_name,
250
  mime="text/markdown",
251
+ key=f"export_{summary_key}"
252
  )
253
  with col2:
254
+ if st.button(f"🔄 Regenerate {report_type.title()}"):
255
+ st.session_state[summary_key] = ""
256
  st.rerun()
257
+ else:
258
+ col1, col2 = st.columns([1, 5])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  with col1:
260
+ # Combined report export for strategic analysis
261
  combined_report = f"# Due Diligence Report\n\n"
262
+ combined_report += f"## Company Overview\n\n{st.session_state.get('overview_summary', '')}\n\n"
263
+ combined_report += f"## Strategic Analysis\n\n{st.session_state[summary_key]}"
264
 
265
+ company_name = (Path(list(st.session_state.documents.keys())[0]).parent.name
266
+ if st.session_state.documents else 'export')
267
+ file_name = f"dd_report_{company_name}.md"
268
  st.download_button(
269
  "📥 Export Report",
270
  data=combined_report,
271
+ file_name=file_name,
272
  mime="text/markdown",
273
+ key=f"export_combined_{summary_key}"
274
  )
275
  with col2:
276
+ if st.button(f"🔄 Regenerate {report_type.title()}"):
277
+ st.session_state[summary_key] = ""
278
  st.rerun()
279
 
280
+ def render_analysis_tab(self, tab_type: str):
281
+ """Unified rendering for checklist and questions tabs"""
282
+ if tab_type == "checklist":
283
+ # Checklist selector
284
+ file_path, file_text = render_file_selector(
285
+ self.config.paths.checklist_dir, "Checklist", "tab"
286
+ )
287
+
288
+ if not file_text:
289
+ show_error("No checklists found in data/checklist directory")
290
+ return
291
+
292
+ # Render results if available
293
+ render_checklist_results(st.session_state.checklist_results)
294
+
295
+ elif tab_type == "questions":
296
+ # Question list selector
297
+ file_path, file_text = render_file_selector(
298
+ self.config.paths.questions_dir, "Question List", "tab"
299
+ )
300
+
301
+ if not file_text:
302
+ show_info("No question lists found in data/questions/")
303
+ return
304
+
305
+ # Render results if available
306
+ render_question_results(st.session_state.question_answers)
307
 
308
  def render_qa_tab(self):
309
  """Render the Q&A with citations tab"""
 
329
 
330
  def _handle_qa_query(self, question: str):
331
  """Handle Q&A query and display results"""
332
+ if not self.document_processor:
333
  self.initialize_services()
334
 
335
  # Use lower threshold for Q&A to get more relevant results
336
  qa_threshold = 0.25
337
 
338
+ results = search_documents(
339
+ self.document_processor,
340
  question,
341
  top_k=self.config.ui.top_k_search_results,
342
  threshold=qa_threshold
 
353
  context = "\n\n".join([f"From {r['source']}:\n{r['text']}" for r in results[:3]])
354
  # Use LLM directly for more reliable answers
355
  from langchain_core.messages import HumanMessage
356
+ prompt = (f"Question: {question}\n\n"
357
+ f"Relevant document excerpts:\n{context}\n\n"
358
+ f"Provide a comprehensive answer with citations to the sources.")
359
  response = st.session_state.agent.llm.invoke([HumanMessage(content=prompt)])
360
  # Clean up any leading whitespace and escape math characters
361
  answer_text = escape_markdown_math(response.content.strip())
 
375
  # Create clickable link for the document
376
  doc_path = result.get('path', result.get('full_path', ''))
377
  doc_name = result['source']
378
+ doc_title = format_document_title(doc_name)
 
 
 
379
 
380
  if doc_path:
381
  link_html = create_document_link(doc_path, doc_name, doc_title)
 
402
  file_bytes = f.read()
403
 
404
  # Determine MIME type based on file extension
405
+ mime_type = get_mime_type(file_path)
406
 
407
  button_key = f"qacit_dl_{idx}_{question[:20]}".replace(" ", "_").replace("?", "")
408
 
 
417
  except Exception as e:
418
  st.error(f"Download failed: {str(e)}")
419
 
 
420
  def process_data_room(self, data_room_path: str):
421
+ """Simplified data room processing"""
 
 
 
 
 
422
  if not Path(data_room_path).exists():
423
  show_error(f"Data room path not found: {data_room_path}")
424
+ st.session_state.is_processing = False
425
  return
426
 
427
+ # Use safe_execute for the entire processing operation
428
+ def process_operation():
429
  self.initialize_services()
430
+ # Simple processing - load documents
431
+ self.document_processor.load_data_room(data_room_path)
432
 
433
+ # Store results in session state with simplified structure
434
+ # Convert list of LangChain documents to dictionary format expected by UI
435
+ documents_dict = {}
436
+ for doc in self.document_processor.documents:
437
+ file_path = doc.metadata.get('source', doc.metadata.get('path', 'unknown'))
438
+ documents_dict[file_path] = {
439
+ 'name': doc.metadata.get('name', Path(file_path).name if file_path != 'unknown' else 'unknown'),
440
+ 'path': doc.metadata.get('path', ''),
441
+ 'content': doc.page_content,
442
+ 'metadata': doc.metadata
 
 
 
 
 
 
 
 
 
 
443
  }
444
+
445
+ st.session_state.documents = documents_dict
446
+ st.session_state.chunks = self.document_processor.chunks
447
+ st.session_state.embeddings = self.document_processor.embeddings
448
+
449
+ # Process checklist and questions if available
450
+ self._process_checklist_and_questions()
451
+
452
+ # Clear any existing analysis to trigger regeneration
453
+ st.session_state.company_summary = ""
454
+ st.session_state.strategy_analysis = ""
455
+ st.session_state.overview_summary = ""
456
+ st.session_state.strategic_summary = ""
457
+
458
+ show_success("✅ Data room processing complete! View results in the tabs above.")
459
+ st.rerun()
460
+
461
+ safe_execute(
462
+ process_operation,
463
+ None,
464
+ "Data room processing"
465
+ )
466
+
467
+ st.session_state.is_processing = False
468
+
469
+ def _process_checklist_and_questions(self):
470
+ """Process checklist and questions after documents are loaded"""
471
+ from src.services import parse_checklist, parse_questions, create_vector_store, search_and_analyze, load_default_file
472
+
473
+ # Load default checklist if available
474
+ checklist_text = load_default_file(Path(self.config.paths.checklist_dir), "*.md")
475
+ if checklist_text and self.document_processor.chunks:
476
+ try:
477
+ # Parse checklist
478
+ checklist = parse_checklist(checklist_text)
479
+ st.session_state.checklist = checklist
480
 
481
+ # Create vector store from chunks for processing
482
+ vector_store = create_vector_store(self.document_processor.chunks, self.model_name)
483
 
484
+ # Process checklist items
485
+ checklist_results = search_and_analyze(
486
+ checklist,
487
+ vector_store,
488
+ self.agent.llm if self.agent else None,
489
+ self.config.processing.similarity_threshold,
490
+ 'items'
491
  )
492
+ st.session_state.checklist_results = checklist_results
493
+ logger.info("✅ Checklist processing completed")
494
+ except Exception as e:
495
+ logger.error(f"Checklist processing failed: {e}")
496
+
497
+ # Load default questions if available
498
+ questions_text = load_default_file(Path(self.config.paths.questions_dir), "*.md")
499
+ if questions_text and self.document_processor.chunks:
500
+ try:
501
+ # Parse questions
502
+ questions = parse_questions(questions_text)
503
+ st.session_state.questions = questions
504
 
505
+ # Create vector store from chunks for processing (reuse if already created)
506
+ if 'vector_store' not in locals():
507
+ vector_store = create_vector_store(self.document_processor.chunks, self.model_name)
508
 
509
+ # Process questions
510
+ question_answers = search_and_analyze(
511
+ questions,
512
+ vector_store,
513
+ self.agent.llm if self.agent else None,
514
+ self.config.processing.relevancy_threshold,
515
+ 'questions'
516
+ )
517
+ st.session_state.question_answers = question_answers
518
+ logger.info("✅ Questions processing completed")
519
+ except Exception as e:
520
+ logger.error(f"Questions processing failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
521
 
522
  def run(self):
523
  """Run the main application"""
 
540
  self.render_summary_tab()
541
 
542
  with tab2:
543
+ self.render_analysis_tab("checklist")
544
 
545
  with tab3:
546
+ self.render_analysis_tab("questions")
547
 
548
  with tab4:
549
  self.render_qa_tab()
550
 
551
+ # Processing complete message is handled in process_data_room function
 
 
 
552
 
553
+ # Simplified processing trigger
554
  if process_button and selected_data_room_path and not st.session_state.is_processing:
 
 
 
555
  st.session_state.is_processing = True
556
+ self.process_data_room(selected_data_room_path)
557
 
558
 
559
  def main():
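The reworked _handle_qa_query above now calls search_documents(self.document_processor, question, top_k=..., threshold=qa_threshold) from src.services, whose implementation is not part of this commit. A rough sketch of the contract the caller relies on (a list of dicts with source, path and text keys, filtered by a relevance threshold), assuming the document processor exposes a LangChain FAISS store as vector_store:

def search_documents(document_processor, query, top_k=5, threshold=0.25):
    """Return the chunks most relevant to the query, keeping only hits above the threshold."""
    # Assumption: document_processor.vector_store is a langchain_community FAISS instance.
    hits = document_processor.vector_store.similarity_search_with_relevance_scores(query, k=top_k)
    return [
        {
            "source": doc.metadata.get("name", "unknown"),
            "path": doc.metadata.get("source", ""),
            "text": doc.page_content,
            "score": score,
        }
        for doc, score in hits
        if score >= threshold
    ]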
pyproject.toml CHANGED
@@ -8,22 +8,31 @@ dependencies = [
      "streamlit>=1.28.0",
      "sentence-transformers>=2.2.0",
      "numpy>=1.24.0",
-     "pandas>=2.0.0",
-     "watchdog>=3.0.0",
      # Document processing
      "pymupdf>=1.23.0",
      "python-docx>=0.8.11",
      # Environment and configuration
      "python-dotenv>=1.0.0",
+     "pydantic-settings>=2.10.1",
+     "markdown>=3.8.2",
      # Vector store
      "faiss-cpu>=1.7.4",
-     # AI Enhancement
+     # AI Enhancement - LangChain packages
      "langchain-anthropic>=0.1.0",
      "langgraph>=0.0.20",
      "langchain-core>=0.1.0",
      "langchain-text-splitters>=0.3.10",
+     "langchain-community>=0.3.29",
+     "langchain-huggingface>=0.3.1",
+     "pypdf>=6.0.0",
+     "watchdog>=6.0.0",
  ]
 
  [build-system]
  requires = ["setuptools", "wheel"]
  build-backend = "setuptools.build_meta"
+
+ [dependency-groups]
+ dev = [
+     "autoflake>=2.3.1",
+ ]
requirements.txt CHANGED
@@ -2,22 +2,24 @@
  streamlit==1.49.1
  sentence-transformers==5.1.0
  numpy==2.3.2
- pandas==2.3.2
- watchdog==6.0.0
 
  # Document processing - pinned for deployment
  PyMuPDF==1.23.18
  python-docx==1.2.0
- joblib==1.5.2
 
  # Environment and configuration - pinned for deployment
  python-dotenv==1.1.1
+ pydantic-settings==2.8.1
+ markdown==3.9
 
  # Vector store - pinned for deployment
  faiss-cpu==1.12.0
 
- # AI Enhancement - pinned for deployment
+ # AI Enhancement - LangChain packages pinned for deployment
  langchain-anthropic==0.3.19
  langgraph==0.6.6
  langchain-core==0.3.75
- langchain-text-splitters==0.3.10
+ langchain-text-splitters==0.3.10
+ langchain-community==0.3.29
+ langchain-huggingface==0.3.1
+
src/__init__.py CHANGED
@@ -5,10 +5,11 @@ DD-Checklist Source Package
  This package contains the refactored components of the DD-Checklist application.
  """
 
- from .config import get_config, init_config, get_model_config, get_processing_config
- from .document_processing import DocumentProcessor, escape_markdown_math
- from .services import DDChecklistService, ChecklistParser, QuestionParser
- from .utils import logger, handle_exceptions, safe_execute, ErrorHandler
+ from .config import (
+     get_config, init_config, logger, show_success, show_error, show_info,
+     get_mime_type, format_document_title, count_documents_in_directory
+ )
+ from .document_processing import DocumentProcessor, escape_markdown_math, safe_execute
  from .ui_components import render_project_selector, render_ai_settings
 
  __version__ = "0.2.0"
@@ -17,24 +18,21 @@ __author__ = "DD-Checklist Team"
  __all__ = [
      # Configuration
      "get_config",
-     "init_config",
-     "get_model_config",
-     "get_processing_config",
+     "init_config",
 
      # Document Processing
      "DocumentProcessor",
      "escape_markdown_math",
+     "safe_execute",
 
-     # Services
-     "DDChecklistService",
-     "ChecklistParser",
-     "QuestionParser",
-
-     # Utilities
+     # Utilities (merged from utils.py)
      "logger",
-     "handle_exceptions",
-     "safe_execute",
-     "ErrorHandler",
+     "show_success",
+     "show_error",
+     "show_info",
+     "get_mime_type",
+     "format_document_title",
+     "count_documents_in_directory",
 
      # UI Components
      "render_project_selector",
src/ai/__init__.py CHANGED
@@ -6,70 +6,23 @@ This module provides AI-powered functionality for the DD-Checklist application,
6
  including LangGraph agents, document processing, and checklist matching.
7
  """
8
 
9
- # Try to import core components and set availability flag
10
- try:
11
- from .agent_core import DDChecklistAgent, get_langgraph_agent, LANGGRAPH_AVAILABLE
12
- from .llm_utilities import (
13
- batch_summarize_documents,
14
- create_document_embeddings_with_summaries,
15
- match_checklist_with_summaries,
16
- generate_checklist_descriptions,
17
- exponential_backoff_retry
18
- )
19
- from .agent_nodes import AgentState, TaskType
20
- from .prompts import (
21
- get_checklist_parsing_prompt,
22
- get_document_relevance_prompt,
23
- get_question_answering_prompt,
24
- get_findings_summary_prompt,
25
- get_description_generation_prompt,
26
- get_document_summarization_prompt
27
- )
28
-
29
- # Set availability flag based on successful imports
30
- AI_MODULE_AVAILABLE = LANGGRAPH_AVAILABLE
31
-
32
- except ImportError as e:
33
- # Handle missing dependencies gracefully
34
- print(f"AI module dependencies not available: {e}")
35
-
36
- # Create placeholder classes/functions for graceful degradation
37
- class DDChecklistAgent:
38
- def __init__(self, *args, **kwargs):
39
- self.app = None
40
- self.llm = None
41
-
42
- def is_available(self):
43
- return False
44
-
45
- def get_langgraph_agent(*args, **kwargs):
46
- return None
47
-
48
- def batch_summarize_documents(documents, *args, **kwargs):
49
- return documents
50
-
51
- def create_document_embeddings_with_summaries(documents, *args, **kwargs):
52
- return {'embeddings': [], 'documents': []}
53
-
54
- def match_checklist_with_summaries(*args, **kwargs):
55
- return {}
56
-
57
- def generate_checklist_descriptions(checklist, *args, **kwargs):
58
- return checklist
59
-
60
- def exponential_backoff_retry(func, *args, **kwargs):
61
- return func()
62
-
63
- # Set availability flags
64
- LANGGRAPH_AVAILABLE = False
65
- AI_MODULE_AVAILABLE = False
66
-
67
- # Placeholder classes for type hints
68
- class AgentState:
69
- pass
70
-
71
- class TaskType:
72
- pass
73
 
74
  # Export main public API
75
  __all__ = [
@@ -77,14 +30,9 @@ __all__ = [
77
  'DDChecklistAgent',
78
  'get_langgraph_agent',
79
 
80
- # LLM utility functions
81
- 'batch_summarize_documents',
82
- 'create_document_embeddings_with_summaries',
83
- 'match_checklist_with_summaries',
84
- 'generate_checklist_descriptions',
85
- 'exponential_backoff_retry',
86
 
87
- # Agent types and state
88
  'AgentState',
89
  'TaskType',
90
 
@@ -95,8 +43,4 @@ __all__ = [
95
  'get_findings_summary_prompt',
96
  'get_description_generation_prompt',
97
  'get_document_summarization_prompt',
98
-
99
- # Availability flags
100
- 'LANGGRAPH_AVAILABLE',
101
- 'AI_MODULE_AVAILABLE',
102
  ]
 
6
  including LangGraph agents, document processing, and checklist matching.
7
  """
8
 
9
+ # Import core components
10
+ from .prompts import (
11
+ get_checklist_parsing_prompt,
12
+ get_document_relevance_prompt,
13
+ get_question_answering_prompt,
14
+ get_findings_summary_prompt,
15
+ get_description_generation_prompt,
16
+ get_document_summarization_prompt
17
+ )
18
+
19
+ # Direct imports for AI functionality - assuming dependencies are present
20
+ from .agent_core import (
21
+ DDChecklistAgent,
22
+ get_langgraph_agent,
23
+ AgentState,
24
+ TaskType
25
+ )
26
 
27
  # Export main public API
28
  __all__ = [
 
30
  'DDChecklistAgent',
31
  'get_langgraph_agent',
32
 
33
+
 
 
 
 
 
34
 
35
+ # Agent types and state (now in agent_core)
36
  'AgentState',
37
  'TaskType',
38
 
 
43
  'get_findings_summary_prompt',
44
  'get_description_generation_prompt',
45
  'get_document_summarization_prompt',
 
 
 
 
46
  ]
src/ai/agent_core.py CHANGED
@@ -2,40 +2,548 @@
2
  """
3
  LangGraph Agent Core Module
4
 
5
- This module contains the main LangGraph agent setup and the high-level
6
  DDChecklistAgent class for interacting with the agent system.
 
 
7
  """
8
 
9
  import os
10
- from typing import Optional, Dict, List, Any, Tuple
11
-
12
- try:
13
- import streamlit as st
14
- from langchain_anthropic import ChatAnthropic
15
- from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, SystemMessage
16
- from langchain_core.tools import tool
17
- from langgraph.graph import StateGraph, END
18
- from langgraph.prebuilt import ToolNode
19
- from langgraph.checkpoint.memory import MemorySaver
20
- LANGGRAPH_AVAILABLE = True
21
- except ImportError:
22
- LANGGRAPH_AVAILABLE = False
23
- st = None
24
- ChatAnthropic = object
25
- BaseMessage = object
26
- HumanMessage = object
27
- AIMessage = object
28
- SystemMessage = object
29
 
30
  from ..config import get_config
31
- from .agent_nodes import (
32
- AgentState, TaskType,
33
- route_task, parse_checklist_node, match_checklist_node,
34
- answer_question_node, summarize_node, route_condition
 
 
 
 
35
  )
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
- def get_langgraph_agent(api_key: Optional[str] = None, model: Optional[str] = None) -> Optional[Tuple[Any, ChatAnthropic]]:
39
  """
40
  Create a LangGraph agent with Anthropic
41
 
@@ -47,9 +555,6 @@ def get_langgraph_agent(api_key: Optional[str] = None, model: Optional[str] = No
47
  Tuple of (compiled_app, llm) or None if not available
48
  """
49
 
50
- if not LANGGRAPH_AVAILABLE:
51
- return None
52
-
53
  # Get configuration
54
  config = get_config()
55
 
@@ -165,7 +670,7 @@ class DDChecklistAgent:
165
 
166
  def is_available(self) -> bool:
167
  """Check if the agent is available for use"""
168
- return self.app is not None
169
 
170
  def parse_checklist(self, checklist_text: str) -> Optional[Dict]:
171
  """
@@ -189,8 +694,7 @@ class DDChecklistAgent:
189
 
190
  return result.get("checklist")
191
  except Exception as e:
192
- if st:
193
- st.error(f"Agent error: {str(e)}")
194
  return None
195
 
196
  def match_documents(self, checklist: Dict, documents: List[Dict]) -> Dict:
@@ -223,8 +727,7 @@ class DDChecklistAgent:
223
 
224
  return result.get("findings", {})
225
  except Exception as e:
226
- if st:
227
- st.error(f"Agent error: {str(e)}")
228
  return {}
229
 
230
  def answer_question(self, question: str, documents: List[Dict]) -> str:
 
2
  """
3
  LangGraph Agent Core Module
4
 
5
+ This module contains the main LangGraph agent setup, AI utilities, and the high-level
6
  DDChecklistAgent class for interacting with the agent system.
7
+
8
+ Merged from: agent_core.py, agent_nodes.py, llm_utilities.py
9
  """
10
 
11
  import os
12
+ import json
13
+ import time
14
+ import random
15
+ import logging
16
+ from typing import Optional, Dict, List, Any, Tuple, Sequence
17
+ from typing_extensions import TypedDict
18
+ from enum import Enum
19
+ import streamlit as st
20
+ from langchain_anthropic import ChatAnthropic
21
+ from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, SystemMessage
22
+ from langchain_core.tools import tool
23
+ from langchain_core.output_parsers import PydanticOutputParser
24
+ from langchain_community.vectorstores import FAISS
25
+ from langchain_huggingface import HuggingFaceEmbeddings
26
+ from langchain_core.documents import Document
27
+ from langgraph.graph import StateGraph, END
28
+
29
+ from langgraph.checkpoint.memory import MemorySaver
30
+ from pydantic import BaseModel, Field
31
 
32
  from ..config import get_config
33
+ from ..document_processing import safe_execute
34
+ from .prompts import (
35
+ get_checklist_parsing_prompt,
36
+ get_document_relevance_prompt,
37
+ get_question_answering_prompt,
38
+ get_findings_summary_prompt,
39
+ get_description_generation_prompt,
40
+ get_document_summarization_prompt
41
  )
42
 
43
+ logger = logging.getLogger(__name__)
44
+
45
+
46
+ # =============================================================================
47
+ # TYPE DEFINITIONS - Merged from agent_nodes.py
48
+ # =============================================================================
49
+
50
+ # Simple Pydantic models for structured output parsing
51
+ class SimpleChecklist(BaseModel):
52
+ """Simple model matching existing checklist structure"""
53
+ categories: Dict = Field(description="Checklist categories as they currently exist")
54
+
55
+
56
+ # Define the state for our agent
57
+ class AgentState(TypedDict):
58
+ """State for the due diligence agent"""
59
+ messages: Sequence[BaseMessage]
60
+ checklist: Optional[Dict]
61
+ documents: Optional[List[Dict]]
62
+ current_task: Optional[str]
63
+ findings: Dict[str, List[str]]
64
+ next_action: Optional[str]
65
+
66
+
67
+ class TaskType(Enum):
68
+ """Types of tasks the agent can perform"""
69
+ PARSE_CHECKLIST = "parse_checklist"
70
+ ANALYZE_DOCUMENT = "analyze_document"
71
+ MATCH_CHECKLIST = "match_checklist"
72
+ ANSWER_QUESTION = "answer_question"
73
+ SUMMARIZE_FINDINGS = "summarize_findings"
74
+
75
+
76
+ # =============================================================================
77
+ # AGENT NODE FUNCTIONS - Merged from agent_nodes.py
78
+ # =============================================================================
79
+
80
+ def route_task(state: AgentState) -> AgentState:
81
+ """Route to appropriate task based on current state"""
82
+ messages = state["messages"]
83
+ if not messages:
84
+ return state
85
+
86
+ last_message = messages[-1].content if messages else ""
87
+
88
+ # Determine next action based on message content
89
+ if "parse" in last_message.lower() and "checklist" in last_message.lower():
90
+ state["next_action"] = TaskType.PARSE_CHECKLIST.value
91
+ elif "analyze" in last_message.lower() or "match" in last_message.lower():
92
+ state["next_action"] = TaskType.MATCH_CHECKLIST.value
93
+ elif "?" in last_message:
94
+ state["next_action"] = TaskType.ANSWER_QUESTION.value
95
+ else:
96
+ state["next_action"] = TaskType.SUMMARIZE_FINDINGS.value
97
+
98
+ return state
99
+
100
+
101
+ def parse_checklist_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
102
+ """Parse checklist using structured output - much simpler!"""
103
+ messages = state["messages"]
104
+ checklist_text = messages[-1].content if messages else ""
105
+
106
+ # Set up simple parser
107
+ parser = PydanticOutputParser(pydantic_object=SimpleChecklist)
108
+ prompt = get_checklist_parsing_prompt(checklist_text)
109
+
110
+ # Create chain and parse - that's it!
111
+ chain = prompt | llm | parser
112
+
113
+ try:
114
+ result = chain.invoke({
115
+ "checklist_text": checklist_text[:3000],
116
+ "format_instructions": parser.get_format_instructions()
117
+ })
118
+
119
+ state["checklist"] = result.categories # Already in the right format!
120
+ state["messages"].append(AIMessage(content=f"Parsed {len(result.categories)} categories"))
121
+
122
+ except Exception as e:
123
+ state["messages"].append(AIMessage(content=f"Parsing failed: {str(e)}"))
124
+
125
+ return state
126
+
127
+
128
+ def match_checklist_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
129
+ """Match documents to checklist items - keep it simple"""
130
+ checklist = state.get("checklist", {})
131
+ documents = state.get("documents", [])
132
+
133
+ if not checklist or not documents:
134
+ state["messages"].append(AIMessage(content="Need both checklist and documents to match"))
135
+ return state
136
+
137
+ # For each checklist item, find relevant documents
138
+ findings = {}
139
+ for cat_letter, category in checklist.items():
140
+ cat_findings = []
141
+ for item in category.get("items", []):
142
+ # Use Claude to assess relevance
143
+ document_names = [d.get('name', 'Unknown') for d in documents[:10]]
144
+ prompt = get_document_relevance_prompt(item['text'], document_names)
145
+
146
+ response = llm.invoke([HumanMessage(content=str(prompt))])
147
+ cat_findings.append({
148
+ "item": item['text'],
149
+ "relevant_docs": response.content
150
+ })
151
+
152
+ findings[category['name']] = cat_findings
153
+
154
+ state["findings"] = findings
155
+ state["messages"].append(AIMessage(content=f"Matched checklist to {len(documents)} documents"))
156
+
157
+ return state
158
+
159
+
160
+ def answer_question_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
161
+ """Answer questions using document context"""
162
+ messages = state["messages"]
163
+ question = messages[-1].content if messages else ""
164
+ documents = state.get("documents", [])
165
+
166
+ # Create context from documents
167
+ context = "\n".join([f"- {d.get('name', 'Unknown')}: {d.get('text', '')[:200]}"
168
+ for d in documents[:5]])
169
+
170
+ prompt = get_question_answering_prompt(question, context)
171
+ response = llm.invoke([HumanMessage(content=prompt)])
172
+ state["messages"].append(AIMessage(content=response.content))
173
+
174
+ return state
175
+
176
 
177
+ def summarize_node(state: AgentState, llm: "ChatAnthropic") -> AgentState:
178
+ """Summarize findings"""
179
+ findings = state.get("findings", {})
180
+
181
+ if not findings:
182
+ state["messages"].append(AIMessage(content="No findings to summarize"))
183
+ return state
184
+
185
+ prompt = get_findings_summary_prompt(findings)
186
+ response = llm.invoke([HumanMessage(content=prompt)])
187
+ state["messages"].append(AIMessage(content=response.content))
188
+
189
+ return state
190
+
191
+
192
+ def route_condition(state: AgentState) -> str:
193
+ """Conditional routing function based on next_action"""
194
+ next_action = state.get("next_action")
195
+ if next_action == TaskType.PARSE_CHECKLIST.value:
196
+ return "parse_checklist"
197
+ elif next_action == TaskType.MATCH_CHECKLIST.value:
198
+ return "match_checklist"
199
+ elif next_action == TaskType.ANSWER_QUESTION.value:
200
+ return "answer_question"
201
+ else:
202
+ return "summarize"
203
+
204
+
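These node and routing functions are compiled into a graph by get_langgraph_agent further below; a minimal sketch of that wiring, assuming langgraph is installed and binding the llm with functools.partial (the node names here may not match the committed graph exactly):

from functools import partial
from langgraph.graph import StateGraph, END

def build_agent_graph(llm):
    graph = StateGraph(AgentState)
    graph.add_node("route", route_task)
    graph.add_node("parse_checklist", partial(parse_checklist_node, llm=llm))
    graph.add_node("match_checklist", partial(match_checklist_node, llm=llm))
    graph.add_node("answer_question", partial(answer_question_node, llm=llm))
    graph.add_node("summarize", partial(summarize_node, llm=llm))
    graph.set_entry_point("route")
    graph.add_conditional_edges("route", route_condition, {
        "parse_checklist": "parse_checklist",
        "match_checklist": "match_checklist",
        "answer_question": "answer_question",
        "summarize": "summarize",
    })
    for node in ("parse_checklist", "match_checklist", "answer_question", "summarize"):
        graph.add_edge(node, END)
    return graph.compile()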
205
+ # =============================================================================
206
+ # LLM UTILITIES - Merged from llm_utilities.py
207
+ # =============================================================================
208
+
209
+ def simple_retry(func, max_retries: int = 3, base_delay: float = 1.0):
210
+ """Simple exponential backoff retry with jitter"""
211
+ last_exception = None
212
+ for attempt in range(max_retries):
213
+ try:
214
+ return func()
215
+ except Exception as e:
216
+ last_exception = e
217
+
218
+ # Check if it's a rate limit error that should be retried
219
+ error_str = str(e).lower()
220
+ if any(keyword in error_str for keyword in [
221
+ 'rate', 'limit', 'quota', 'throttl', '429', 'too many',
222
+ 'overload', '529', 'server_overloaded', 'overloaded_error'
223
+ ]):
224
+ if attempt < max_retries - 1: # Don't wait on last attempt
225
+ delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
226
+ time.sleep(min(delay, 60)) # Cap at 60 seconds
227
+ continue
228
+
229
+ # For non-retryable errors, raise immediately
230
+ raise e
231
+
232
+ # If we get here, all retries failed
233
+ raise last_exception
234
+
235
+
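A usage sketch of simple_retry wrapping a single Anthropic call (llm is assumed to be an already-configured ChatAnthropic instance):

from langchain_core.messages import HumanMessage

# Only rate-limit/overload style errors are retried; anything else raises immediately
response = simple_retry(
    lambda: llm.invoke([HumanMessage(content="Classify this document in one sentence.")]),
    max_retries=3,
    base_delay=1.0,
)
print(response.content)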
236
+ def generate_checklist_descriptions(checklist: Dict, llm: "ChatAnthropic", batch_size: Optional[int] = None) -> Dict:
237
+ """
238
+ Generate detailed descriptions for each checklist item explaining what documents should satisfy it.
239
+ Returns checklist with added 'description' field for each item.
240
+
241
+ Args:
242
+ checklist: Checklist dictionary to enhance
243
+ llm: ChatAnthropic instance for generating descriptions
244
+ batch_size: Number of items to process in each batch (uses config default if None)
245
+
246
+ Returns:
247
+ Enhanced checklist with descriptions
248
+ """
249
+
250
+ config = get_config()
251
+ if batch_size is None:
252
+ batch_size = config.processing.description_batch_size
253
+
254
+ # Process all checklist items
255
+ enhanced_checklist = {}
256
+ all_items_to_process = []
257
+
258
+ # Collect all items with their context
259
+ for cat_letter, category in checklist.items():
260
+ cat_name = category.get('name', '')
261
+ enhanced_checklist[cat_letter] = {
262
+ 'name': cat_name,
263
+ 'letter': cat_letter,
264
+ 'items': []
265
+ }
266
+
267
+ for item in category.get('items', []):
268
+ item_data = {
269
+ 'category_letter': cat_letter,
270
+ 'category_name': cat_name,
271
+ 'item_text': item.get('text', ''),
272
+ 'original_item': item,
273
+ 'prompt': get_description_generation_prompt(cat_name, item.get('text', '')).format()
274
+ }
275
+ all_items_to_process.append(item_data)
276
+
277
+ # Process items in batches
278
+ total_items = len(all_items_to_process)
279
+ total_batches = (total_items + batch_size - 1) // batch_size
280
+
281
+ for batch_num, i in enumerate(range(0, total_items, batch_size), 1):
282
+ batch = all_items_to_process[i:i + batch_size]
283
+ batch_end = min(i + batch_size, total_items)
284
+
285
+ # Update progress if available
286
+ if hasattr(st, 'progress') and 'description_progress' in st.session_state:
287
+ progress = i / total_items
288
+ st.session_state.description_progress.progress(
289
+ progress,
290
+ text=f"📝 Generating descriptions batch {batch_num}/{total_batches} (items {i+1}-{batch_end} of {total_items})"
291
+ )
292
+
293
+ # Create prompts for batch processing
294
+ prompts = [item_data['prompt'] for item_data in batch]
295
+ messages_batch = [[HumanMessage(content=prompt)] for prompt in prompts]
296
+
297
+ # Process batch with simple retry logic
298
+ try:
299
+ responses = simple_retry(
300
+ lambda: llm.batch(
301
+ messages_batch,
302
+ config={"max_concurrency": min(batch_size, config.api.max_concurrent_requests)}
303
+ ),
304
+ max_retries=3,
305
+ base_delay=0.5
306
+ )
307
+
308
+ # Extract descriptions from responses
309
+ batch_descriptions = [response.content.strip() if response else f"Documents related to {item_data['item_text']}"
310
+ for response, item_data in zip(responses, batch)]
311
+ except Exception as e:
312
+ logger.warning(f"Batch {batch_num} description generation failed: {e}. Using fallback descriptions.")
313
+ batch_descriptions = [f"Documents related to {item_data['item_text']}" for item_data in batch]
314
+
315
+ # Add descriptions to items
316
+ for item_data, description in zip(batch, batch_descriptions):
317
+ enhanced_item = item_data['original_item'].copy()
318
+ enhanced_item['description'] = description
319
+ enhanced_checklist[item_data['category_letter']]['items'].append(enhanced_item)
320
+
321
+ # No delay between batches - using rate limiting with exponential backoff instead
322
+
323
+ return enhanced_checklist
324
+
325
+
326
+ def batch_summarize_documents(documents: List[Dict], llm: "ChatAnthropic", batch_size: Optional[int] = None) -> List[Dict]:
327
+ """
328
+ Summarize documents using LangChain's built-in batch processing for true parallelization.
329
+ Optimized with larger batches, higher concurrency, and exponential backoff rate limiting.
330
+ Returns documents with added 'summary' field.
331
+
332
+ Args:
333
+ documents: List of document dictionaries to summarize
334
+ llm: ChatAnthropic instance for generating summaries
335
+ batch_size: Number of documents to process in each batch (uses config default if None)
336
+
337
+ Returns:
338
+ List of documents with added summary field
339
+ """
340
+
341
+ config = get_config()
342
+ if batch_size is None:
343
+ batch_size = config.processing.batch_size
344
+
345
+ # Process documents in batches
346
+ summarized_docs = []
347
+ total_docs = len(documents)
348
+ total_batches = (total_docs + batch_size - 1) // batch_size
349
+
350
+ for batch_num, i in enumerate(range(0, total_docs, batch_size), 1):
351
+ batch = documents[i:i + batch_size]
352
+ batch_end = min(i + batch_size, total_docs)
353
+
354
+ # Update progress with batch info
355
+ if hasattr(st, 'progress') and 'summary_progress' in st.session_state:
356
+ progress = i / total_docs
357
+ st.session_state.summary_progress.progress(
358
+ progress,
359
+ text=f"📝 Processing batch {batch_num}/{total_batches} (docs {i+1}-{batch_end} of {total_docs})"
360
+ )
361
+
362
+ # Create prompts for all documents in the batch
363
+ templates = [get_document_summarization_prompt(doc) for doc in batch]
364
+ prompts = [template.format() for template in templates]
365
+
366
+ # Convert prompts to HumanMessage format for batch processing
367
+ messages_batch = [[HumanMessage(content=prompt)] for prompt in prompts]
368
+
369
+ # Process batch with simple retry logic
370
+ try:
371
+ responses = simple_retry(
372
+ lambda: llm.batch(
373
+ messages_batch,
374
+ config={"max_concurrency": min(batch_size // 2 or 1, config.api.max_concurrent_requests)}
375
+ ),
376
+ max_retries=3,
377
+ base_delay=0.5
378
+ )
379
+
380
+ # Extract summaries from responses
381
+ batch_summaries = [response.content.strip() if response else f"Document: {doc.get('name', 'Unknown')}"
382
+ for response, doc in zip(responses, batch)]
383
+ except Exception as e:
384
+ logger.warning(f"Batch {batch_num} processing failed: {e}. Using fallback summaries.")
385
+ batch_summaries = [f"Document: {doc.get('name', 'Unknown')}" for doc in batch]
386
+
387
+ # Add summaries to documents
388
+ for doc, summary in zip(batch, batch_summaries):
389
+ doc['summary'] = summary
390
+ summarized_docs.append(doc)
391
+
392
+ # No delay between batches - using rate limiting with exponential backoff instead
393
+
394
+ return summarized_docs
395
+
396
+
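A sketch of calling batch_summarize_documents on a small, hand-made document list (llm is again an assumed ChatAnthropic instance; the names and contents are placeholders, real dicts come from the document processing pipeline):

docs = [
    {"name": "bylaws.pdf", "path": "corporate/bylaws.pdf", "content": "These bylaws govern ..."},
    {"name": "lease.pdf", "path": "contracts/lease.pdf", "content": "This lease agreement ..."},
]
summarized = batch_summarize_documents(docs, llm, batch_size=2)
for d in summarized:
    print(d["name"], "->", d["summary"][:80])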
397
+ def create_document_embeddings_with_summaries(documents: List[Dict], model_name: str = None) -> Dict[str, Any]:
398
+ """
399
+ Prepare document data for LangChain-based similarity matching.
400
+ No longer creates embeddings directly - LangChain handles embedding generation.
401
+
402
+ Args:
403
+ documents: List of documents with summaries
404
+
405
+ Returns:
406
+ Dictionary with document info formatted for LangChain matching
407
+ """
408
+ doc_info = []
409
+
410
+ for doc in documents:
411
+ # Prepare document info for LangChain matching
412
+ doc_name = doc.get('name', 'Unknown')
413
+ doc_path = doc.get('path', '')
414
+ summary = doc.get('summary', '')
415
+
416
+ doc_info.append({
417
+ 'name': doc_name,
418
+ 'path': doc_path,
419
+ 'full_path': doc.get('full_path', doc_path),
420
+ 'summary': summary,
421
+ 'original_doc': doc
422
+ })
423
+
424
+ return {
425
+ 'documents': doc_info
426
+ }
427
+
428
+
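Sketch of the structure this helper now returns: no embedding vectors, only the metadata that match_checklist_with_summaries hands to LangChain (values are placeholders):

doc_data = create_document_embeddings_with_summaries([
    {"name": "bylaws.pdf", "path": "corporate/bylaws.pdf",
     "summary": "Corporate governance document setting out board procedures."},
])
# -> {'documents': [{'name': 'bylaws.pdf', 'path': 'corporate/bylaws.pdf',
#                    'full_path': 'corporate/bylaws.pdf', 'summary': '...', 'original_doc': {...}}]}
print(doc_data["documents"][0]["summary"])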
429
+ def match_checklist_with_summaries(
430
+ checklist: Dict,
431
+ doc_embeddings_data: Dict,
432
+ model_name: str,
433
+ threshold: Optional[float] = None
434
+ ) -> Dict:
435
+ """
436
+ Match checklist items against document summaries using LangChain FAISS.
437
+ Enhanced to use LLM-generated descriptions for better semantic matching.
438
+
439
+ Args:
440
+ checklist: Checklist dictionary with items and descriptions
441
+ doc_embeddings_data: Dictionary containing document info and embeddings
442
+ model_name: Name of the HuggingFace model for embeddings
443
+ threshold: Similarity threshold for matching (uses config default if None)
444
+
445
+ Returns:
446
+ Dictionary with matching results
447
+ """
448
+ config = get_config()
449
+ if threshold is None:
450
+ threshold = config.processing.similarity_threshold
451
+
452
+ doc_info = doc_embeddings_data['documents']
453
+
454
+ # Create LangChain embeddings instance
455
+ embeddings = HuggingFaceEmbeddings(model_name=model_name)
456
+
457
+ # Convert document summaries to LangChain Documents
458
+ documents = [
459
+ Document(
460
+ page_content=f"{doc['name']}\n{doc['path']}\n{doc['summary']}",
461
+ metadata={
462
+ 'name': doc['name'],
463
+ 'path': doc['path'],
464
+ 'full_path': doc.get('full_path', doc['path']),
465
+ 'summary': doc['summary'],
466
+ **doc.get('original_doc', {}).get('metadata', {})
467
+ }
468
+ )
469
+ for doc in doc_info
470
+ ]
471
+
472
+ # Create LangChain FAISS vector store
473
+ vector_store = FAISS.from_documents(documents, embeddings)
474
+ retriever = vector_store.as_retriever(
475
+ search_type="similarity_score_threshold",
476
+ search_kwargs={"score_threshold": threshold, "k": 5}
477
+ )
478
+
479
+ results = {}
480
+
481
+ for cat_letter, category in checklist.items():
482
+ cat_name = category.get('name', '')
483
+ cat_results = {
484
+ 'name': cat_name,
485
+ 'letter': cat_letter,
486
+ 'total_items': len(category.get('items', [])),
487
+ 'matched_items': 0,
488
+ 'items': []
489
+ }
490
+
491
+ for item in category.get('items', []):
492
+ item_text = item.get('text', '')
493
+ item_description = item.get('description', '')
494
+
495
+ # Create enhanced query using both item text and generated description
496
+ if item_description:
497
+ # Use the LLM-generated description for richer semantic matching
498
+ query = f"{cat_name}: {item_text}\n{item_description}"
499
+ else:
500
+ # Fall back to original method if no description available
501
+ query = f"{cat_name}: {item_text}"
502
+
503
+ # Use LangChain retriever for similarity search
504
+ docs = safe_execute(
505
+ lambda: retriever.invoke(query),
506
+ default=[],
507
+ context="Document matching with summaries"
508
+ )
509
+
510
+ # Convert LangChain documents to matches format
511
+ matches = []
512
+ for doc in docs[:5]: # Keep top 5 matches
513
+ match_data = {
514
+ 'name': doc.metadata['name'],
515
+ 'path': doc.metadata['path'],
516
+ 'full_path': doc.metadata.get('full_path', doc.metadata['path']),
517
+ 'summary': doc.metadata['summary'],
518
+ 'score': 0.8, # LangChain retriever doesn't return raw scores
519
+ 'metadata': {k: v for k, v in doc.metadata.items()
520
+ if k not in ['name', 'path', 'full_path', 'summary']}
521
+ }
522
+ matches.append(match_data)
523
+
524
+ item_result = {
525
+ 'text': item_text,
526
+ 'original': item.get('original', item_text),
527
+ 'description': item_description, # Include the generated description
528
+ 'matches': matches
529
+ }
530
+
531
+ # Count items with matches toward category total
532
+ if matches:
533
+ cat_results['matched_items'] += 1
534
+
535
+ cat_results['items'].append(item_result)
536
+
537
+ results[cat_letter] = cat_results
538
+
539
+ return results
540
+
541
+
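Putting the helpers together, a rough end-to-end sketch of the summary-based matching pipeline; documents, checklist, and llm are assumed inputs, and the model name and threshold shown are the configuration defaults, written out explicitly here for illustration:

summarized = batch_summarize_documents(documents, llm)
doc_data = create_document_embeddings_with_summaries(summarized)
enhanced = generate_checklist_descriptions(checklist, llm)
results = match_checklist_with_summaries(
    enhanced,
    doc_data,
    model_name="all-MiniLM-L6-v2",  # HuggingFace model used for embeddings
    threshold=0.35,                 # similarity_threshold default
)
for letter, cat in results.items():
    print(letter, cat["name"], f"{cat['matched_items']}/{cat['total_items']} matched")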
542
+ # =============================================================================
543
+ # LANGGRAPH AGENT FUNCTIONS
544
+ # =============================================================================
545
+
546
+ def get_langgraph_agent(api_key: Optional[str] = None, model: Optional[str] = None) -> Optional[Tuple[Any, "ChatAnthropic"]]:
547
  """
548
  Create a LangGraph agent with Anthropic
549
 
 
555
  Tuple of (compiled_app, llm) or None if not available
556
  """
557
 
 
 
 
558
  # Get configuration
559
  config = get_config()
560
 
 
670
 
671
  def is_available(self) -> bool:
672
  """Check if the agent is available for use"""
673
+ return self.app is not None and self.llm is not None
674
 
675
  def parse_checklist(self, checklist_text: str) -> Optional[Dict]:
676
  """
 
694
 
695
  return result.get("checklist")
696
  except Exception as e:
697
+ st.error(f"Agent error: {str(e)}")
 
698
  return None
699
 
700
  def match_documents(self, checklist: Dict, documents: List[Dict]) -> Dict:
 
727
 
728
  return result.get("findings", {})
729
  except Exception as e:
730
+ st.error(f"Agent error: {str(e)}")
 
731
  return {}
732
 
733
  def answer_question(self, question: str, documents: List[Dict]) -> str:
src/ai/agent_nodes.py DELETED
@@ -1,173 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- LangGraph Agent Nodes Module
4
-
5
- This module contains all the individual node functions used in the
6
- LangGraph workflow for the DD-Checklist agent.
7
- """
8
-
9
- import json
10
- from typing import Dict, List, Optional, Sequence, Any
11
- from typing_extensions import TypedDict
12
- from enum import Enum
13
-
14
- try:
15
- from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
16
- from langchain_anthropic import ChatAnthropic
17
- LANGGRAPH_AVAILABLE = True
18
- except ImportError:
19
- LANGGRAPH_AVAILABLE = False
20
- BaseMessage = object
21
- HumanMessage = object
22
- AIMessage = object
23
- ChatAnthropic = object
24
-
25
- from .prompts import (
26
- get_checklist_parsing_prompt,
27
- get_document_relevance_prompt,
28
- get_question_answering_prompt,
29
- get_findings_summary_prompt
30
- )
31
-
32
-
33
- # Define the state for our agent
34
- class AgentState(TypedDict):
35
- """State for the due diligence agent"""
36
- messages: Sequence[BaseMessage]
37
- checklist: Optional[Dict]
38
- documents: Optional[List[Dict]]
39
- current_task: Optional[str]
40
- findings: Dict[str, List[str]]
41
- next_action: Optional[str]
42
-
43
-
44
- class TaskType(Enum):
45
- """Types of tasks the agent can perform"""
46
- PARSE_CHECKLIST = "parse_checklist"
47
- ANALYZE_DOCUMENT = "analyze_document"
48
- MATCH_CHECKLIST = "match_checklist"
49
- ANSWER_QUESTION = "answer_question"
50
- SUMMARIZE_FINDINGS = "summarize_findings"
51
-
52
-
53
- def route_task(state: AgentState) -> AgentState:
54
- """Route to appropriate task based on current state"""
55
- messages = state["messages"]
56
- if not messages:
57
- return state
58
-
59
- last_message = messages[-1].content if messages else ""
60
-
61
- # Determine next action based on message content
62
- if "parse" in last_message.lower() and "checklist" in last_message.lower():
63
- state["next_action"] = TaskType.PARSE_CHECKLIST.value
64
- elif "analyze" in last_message.lower() or "match" in last_message.lower():
65
- state["next_action"] = TaskType.MATCH_CHECKLIST.value
66
- elif "?" in last_message:
67
- state["next_action"] = TaskType.ANSWER_QUESTION.value
68
- else:
69
- state["next_action"] = TaskType.SUMMARIZE_FINDINGS.value
70
-
71
- return state
72
-
73
-
74
- def parse_checklist_node(state: AgentState, llm: ChatAnthropic) -> AgentState:
75
- """Parse checklist using Claude"""
76
- messages = state["messages"]
77
- checklist_text = messages[-1].content if messages else ""
78
-
79
- prompt = get_checklist_parsing_prompt(checklist_text)
80
- response = llm.invoke([HumanMessage(content=prompt)])
81
-
82
- try:
83
- # Parse JSON from response
84
- json_str = response.content
85
- if "```json" in json_str:
86
- json_str = json_str.split("```json")[1].split("```")[0]
87
- elif "```" in json_str:
88
- json_str = json_str.split("```")[1].split("```")[0]
89
-
90
- parsed = json.loads(json_str.strip())
91
- state["checklist"] = parsed
92
- state["messages"].append(AIMessage(content=f"Parsed {len(parsed)} categories"))
93
- except Exception as e:
94
- state["messages"].append(AIMessage(content=f"Parsing failed: {str(e)}"))
95
-
96
- return state
97
-
98
-
99
- def match_checklist_node(state: AgentState, llm: ChatAnthropic) -> AgentState:
100
- """Match documents to checklist items"""
101
- checklist = state.get("checklist", {})
102
- documents = state.get("documents", [])
103
-
104
- if not checklist or not documents:
105
- state["messages"].append(AIMessage(content="Need both checklist and documents to match"))
106
- return state
107
-
108
- # For each checklist item, find relevant documents
109
- findings = {}
110
- for cat_letter, category in checklist.items():
111
- cat_findings = []
112
- for item in category.get("items", []):
113
- # Use Claude to assess relevance
114
- document_names = [d.get('name', 'Unknown') for d in documents[:10]]
115
- prompt = get_document_relevance_prompt(item['text'], document_names)
116
-
117
- response = llm.invoke([HumanMessage(content=prompt)])
118
- cat_findings.append({
119
- "item": item['text'],
120
- "relevant_docs": response.content
121
- })
122
-
123
- findings[category['name']] = cat_findings
124
-
125
- state["findings"] = findings
126
- state["messages"].append(AIMessage(content=f"Matched checklist to {len(documents)} documents"))
127
-
128
- return state
129
-
130
-
131
- def answer_question_node(state: AgentState, llm: ChatAnthropic) -> AgentState:
132
- """Answer questions using document context"""
133
- messages = state["messages"]
134
- question = messages[-1].content if messages else ""
135
- documents = state.get("documents", [])
136
-
137
- # Create context from documents
138
- context = "\n".join([f"- {d.get('name', 'Unknown')}: {d.get('text', '')[:200]}"
139
- for d in documents[:5]])
140
-
141
- prompt = get_question_answering_prompt(question, context)
142
- response = llm.invoke([HumanMessage(content=prompt)])
143
- state["messages"].append(AIMessage(content=response.content))
144
-
145
- return state
146
-
147
-
148
- def summarize_node(state: AgentState, llm: ChatAnthropic) -> AgentState:
149
- """Summarize findings"""
150
- findings = state.get("findings", {})
151
-
152
- if not findings:
153
- state["messages"].append(AIMessage(content="No findings to summarize"))
154
- return state
155
-
156
- prompt = get_findings_summary_prompt(findings)
157
- response = llm.invoke([HumanMessage(content=prompt)])
158
- state["messages"].append(AIMessage(content=response.content))
159
-
160
- return state
161
-
162
-
163
- def route_condition(state: AgentState) -> str:
164
- """Conditional routing function based on next_action"""
165
- next_action = state.get("next_action")
166
- if next_action == TaskType.PARSE_CHECKLIST.value:
167
- return "parse_checklist"
168
- elif next_action == TaskType.MATCH_CHECKLIST.value:
169
- return "match_checklist"
170
- elif next_action == TaskType.ANSWER_QUESTION.value:
171
- return "answer_question"
172
- else:
173
- return "summarize"
 
 
 
 
 
 
 
 
src/ai/llm_utilities.py DELETED
@@ -1,432 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- LLM Utilities Module
4
-
5
- This module contains utility functions for batch processing, document
6
- summarization, embeddings, and checklist matching operations.
7
- """
8
-
9
- import time
10
- import random
11
- from typing import Dict, List, Any, Optional
12
-
13
- try:
14
- import streamlit as st
15
- from langchain_anthropic import ChatAnthropic
16
- from langchain_core.messages import HumanMessage
17
- import numpy as np
18
- import faiss
19
- DEPENDENCIES_AVAILABLE = True
20
- except ImportError:
21
- DEPENDENCIES_AVAILABLE = False
22
- st = None
23
- ChatAnthropic = object
24
- HumanMessage = object
25
-
26
- from ..config import get_config
27
- from .prompts import get_description_generation_prompt, get_document_summarization_prompt
28
-
29
-
30
- def exponential_backoff_retry(func, max_retries: Optional[int] = None, base_delay: Optional[float] = None):
31
- """
32
- Execute function with exponential backoff retry logic for rate limiting.
33
-
34
- Args:
35
- func: Function to execute
36
- max_retries: Maximum number of retries (uses config default if None)
37
- base_delay: Base delay in seconds (uses config default if None)
38
-
39
- Returns:
40
- Result of the function call
41
- """
42
- config = get_config()
43
- if max_retries is None:
44
- max_retries = config.api.max_retries
45
- if base_delay is None:
46
- base_delay = config.api.base_delay
47
-
48
- for attempt in range(max_retries):
49
- try:
50
- return func()
51
- except Exception as e:
52
- error_str = str(e).lower()
53
- # Check if it's a rate limiting error
54
- if any(keyword in error_str for keyword in ['rate', 'limit', 'quota', 'throttl', '429', 'too many']):
55
- if attempt < max_retries - 1:
56
- # Calculate exponential backoff with jitter
57
- delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
58
- print(f"Rate limit hit, retrying in {delay:.2f}s (attempt {attempt + 1}/{max_retries})")
59
- time.sleep(delay)
60
- continue
61
- else:
62
- print(f"Rate limit exceeded after {max_retries} attempts")
63
- raise e
64
- else:
65
- # Non-rate limit error, don't retry
66
- raise e
67
- return None
68
-
69
-
70
- def generate_checklist_descriptions(checklist: Dict, llm: ChatAnthropic, batch_size: Optional[int] = None) -> Dict:
71
- """
72
- Generate detailed descriptions for each checklist item explaining what documents should satisfy it.
73
- Returns checklist with added 'description' field for each item.
74
-
75
- Args:
76
- checklist: Checklist dictionary to enhance
77
- llm: ChatAnthropic instance for generating descriptions
78
- batch_size: Number of items to process in each batch (uses config default if None)
79
-
80
- Returns:
81
- Enhanced checklist with descriptions
82
- """
83
- if not DEPENDENCIES_AVAILABLE:
84
- return checklist
85
-
86
- config = get_config()
87
- if batch_size is None:
88
- batch_size = config.processing.description_batch_size
89
-
90
-
91
-
92
- # Process all checklist items
93
- enhanced_checklist = {}
94
- all_items_to_process = []
95
-
96
- # Collect all items with their context
97
- for cat_letter, category in checklist.items():
98
- cat_name = category.get('name', '')
99
- enhanced_checklist[cat_letter] = {
100
- 'name': cat_name,
101
- 'letter': cat_letter,
102
- 'items': []
103
- }
104
-
105
- for item in category.get('items', []):
106
- item_data = {
107
- 'category_letter': cat_letter,
108
- 'category_name': cat_name,
109
- 'item_text': item.get('text', ''),
110
- 'original_item': item,
111
- 'prompt': get_description_generation_prompt(cat_name, item.get('text', ''))
112
- }
113
- all_items_to_process.append(item_data)
114
-
115
- # Process items in batches
116
- total_items = len(all_items_to_process)
117
- total_batches = (total_items + batch_size - 1) // batch_size
118
-
119
- for batch_num, i in enumerate(range(0, total_items, batch_size), 1):
120
- batch = all_items_to_process[i:i + batch_size]
121
- batch_end = min(i + batch_size, total_items)
122
-
123
- # Update progress if available
124
- if st and hasattr(st, 'progress') and 'description_progress' in st.session_state:
125
- progress = i / total_items
126
- st.session_state.description_progress.progress(
127
- progress,
128
- text=f"📝 Generating descriptions batch {batch_num}/{total_batches} (items {i+1}-{batch_end} of {total_items})"
129
- )
130
-
131
- # Create prompts for batch processing
132
- prompts = [item_data['prompt'] for item_data in batch]
133
- messages_batch = [[HumanMessage(content=prompt)] for prompt in prompts]
134
-
135
- # Use exponential backoff for batch processing
136
- def process_descriptions_batch():
137
- # Use higher concurrency for descriptions since they're short
138
- max_concurrent = min(batch_size * 2, config.api.max_concurrent_requests)
139
- return llm.batch(
140
- messages_batch,
141
- config={"max_concurrency": max_concurrent}
142
- )
143
-
144
- try:
145
- responses = exponential_backoff_retry(
146
- process_descriptions_batch,
147
- max_retries=config.api.max_retries,
148
- base_delay=config.api.batch_base_delay
149
- )
150
-
151
- # Extract descriptions from responses
152
- batch_descriptions = [response.content.strip() if response else f"Documents related to {item_data['item_text']}"
153
- for response, item_data in zip(responses, batch)]
154
- except Exception as e:
155
- # Fallback to sequential processing with individual retries if batch fails
156
- print(f"Batch {batch_num} description generation failed: {e}. Falling back to sequential with retries.")
157
- batch_descriptions = []
158
- for item_data in batch:
159
- def single_description_process():
160
- return llm.invoke([HumanMessage(content=item_data['prompt'])])
161
-
162
- try:
163
- response = exponential_backoff_retry(
164
- single_description_process,
165
- max_retries=config.api.batch_retry_attempts,
166
- base_delay=config.api.single_retry_base_delay
167
- )
168
- batch_descriptions.append(response.content.strip())
169
- except Exception as inner_e:
170
- print(f"Failed to generate description for {item_data['item_text']}: {inner_e}")
171
- batch_descriptions.append(f"Documents related to {item_data['item_text']}")
172
-
173
- # Add descriptions to items
174
- for item_data, description in zip(batch, batch_descriptions):
175
- enhanced_item = item_data['original_item'].copy()
176
- enhanced_item['description'] = description
177
- enhanced_checklist[item_data['category_letter']]['items'].append(enhanced_item)
178
-
179
- # No delay between batches - using rate limiting with exponential backoff instead
180
-
181
- return enhanced_checklist
182
-
183
-
184
- def batch_summarize_documents(documents: List[Dict], llm: ChatAnthropic, batch_size: Optional[int] = None) -> List[Dict]:
185
- """
186
- Summarize documents using LangChain's built-in batch processing for true parallelization.
187
- Optimized with larger batches, higher concurrency, and exponential backoff rate limiting.
188
- Returns documents with added 'summary' field.
189
-
190
- Args:
191
- documents: List of document dictionaries to summarize
192
- llm: ChatAnthropic instance for generating summaries
193
- batch_size: Number of documents to process in each batch (uses config default if None)
194
-
195
- Returns:
196
- List of documents with added summary field
197
- """
198
- if not DEPENDENCIES_AVAILABLE:
199
- return documents
200
-
201
- config = get_config()
202
- if batch_size is None:
203
- batch_size = config.processing.batch_size
204
-
205
- # Process documents in batches
206
- summarized_docs = []
207
- total_docs = len(documents)
208
- total_batches = (total_docs + batch_size - 1) // batch_size
209
-
210
- for batch_num, i in enumerate(range(0, total_docs, batch_size), 1):
211
- batch = documents[i:i + batch_size]
212
- batch_end = min(i + batch_size, total_docs)
213
-
214
- # Update progress with batch info
215
- if st and hasattr(st, 'progress') and 'summary_progress' in st.session_state:
216
- progress = i / total_docs
217
- st.session_state.summary_progress.progress(
218
- progress,
219
- text=f"📝 Processing batch {batch_num}/{total_batches} (docs {i+1}-{batch_end} of {total_docs})"
220
- )
221
-
222
- # Create prompts for all documents in the batch
223
- prompts = [get_document_summarization_prompt(doc) for doc in batch]
224
-
225
- # Convert prompts to HumanMessage format for batch processing
226
- messages_batch = [[HumanMessage(content=prompt)] for prompt in prompts]
227
-
228
- # Use exponential backoff for batch processing
229
- def process_batch():
230
- max_concurrent = min(batch_size, config.api.max_concurrent_requests)
231
- return llm.batch(
232
- messages_batch,
233
- config={"max_concurrency": max_concurrent}
234
- )
235
-
236
- try:
237
- responses = exponential_backoff_retry(
238
- process_batch,
239
- max_retries=config.api.max_retries,
240
- base_delay=config.api.batch_base_delay
241
- )
242
-
243
- # Extract summaries from responses
244
- batch_summaries = [response.content.strip() if response else f"Document: {doc.get('name', 'Unknown')}"
245
- for response, doc in zip(responses, batch)]
246
- except Exception as e:
247
- # Fallback to sequential processing with individual retries if batch fails
248
- print(f"Batch {batch_num} processing failed: {e}. Falling back to sequential with retries.")
249
- batch_summaries = []
250
- for doc_idx, doc in enumerate(batch):
251
- prompt = get_document_summarization_prompt(doc)
252
-
253
- def single_doc_process():
254
- return llm.invoke([HumanMessage(content=prompt)])
255
-
256
- try:
257
- response = exponential_backoff_retry(
258
- single_doc_process,
259
- max_retries=config.api.batch_retry_attempts,
260
- base_delay=config.api.single_retry_base_delay
261
- )
262
- batch_summaries.append(response.content.strip())
263
- except Exception as inner_e:
264
- print(f"Failed to summarize {doc.get('name', 'Unknown')}: {inner_e}")
265
- batch_summaries.append(f"Document: {doc.get('name', 'Unknown')}")
266
-
267
- # Update progress within fallback
268
- if st and hasattr(st, 'progress') and 'summary_progress' in st.session_state:
269
- sub_progress = (i + doc_idx + 1) / total_docs
270
- st.session_state.summary_progress.progress(
271
- sub_progress,
272
- text=f"📝 Sequential fallback: {i + doc_idx + 1}/{total_docs}"
273
- )
274
-
275
- # Add summaries to documents
276
- for doc, summary in zip(batch, batch_summaries):
277
- doc['summary'] = summary
278
- summarized_docs.append(doc)
279
-
280
- # No delay between batches - using rate limiting with exponential backoff instead
281
-
282
- return summarized_docs
283
-
284
-
285
- def create_document_embeddings_with_summaries(documents: List[Dict], model) -> Dict[str, Any]:
286
- """
287
- Create embeddings for documents using their LLM-generated summaries.
288
-
289
- Args:
290
- documents: List of documents with summaries
291
- model: SentenceTransformer model for embeddings
292
-
293
- Returns:
294
- Dictionary with document info and embeddings
295
- """
296
- doc_embeddings = []
297
- doc_info = []
298
-
299
- for doc in documents:
300
- # Combine filename, path context, and LLM summary for rich embedding
301
- doc_name = doc.get('name', 'Unknown')
302
- doc_path = doc.get('path', '')
303
- summary = doc.get('summary', '')
304
-
305
- # Create rich text representation
306
- embedding_text = f"{doc_name}\n{doc_path}\n{summary}"
307
-
308
- # Generate embedding
309
- embedding = model.encode(embedding_text)
310
-
311
- doc_embeddings.append(embedding)
312
- doc_info.append({
313
- 'name': doc_name,
314
- 'path': doc_path,
315
- 'full_path': doc.get('full_path', doc_path),
316
- 'summary': summary,
317
- 'embedding_text': embedding_text,
318
- 'original_doc': doc
319
- })
320
-
321
- return {
322
- 'embeddings': doc_embeddings,
323
- 'documents': doc_info
324
- }
325
-
326
-
327
- def match_checklist_with_summaries(
328
- checklist: Dict,
329
- doc_embeddings_data: Dict,
330
- model,
331
- threshold: Optional[float] = None
332
- ) -> Dict:
333
- """
334
- Match checklist items against document summaries using FAISS for 10x faster similarity search.
335
- Enhanced to use LLM-generated descriptions for better semantic matching.
336
-
337
- Args:
338
- checklist: Checklist dictionary with items and descriptions
339
- doc_embeddings_data: Dictionary containing document embeddings and info
340
- model: SentenceTransformer model for embeddings
341
- threshold: Similarity threshold for matching (uses config default if None)
342
-
343
- Returns:
344
- Dictionary with matching results
345
- """
346
- if not DEPENDENCIES_AVAILABLE:
347
- return {}
348
-
349
- config = get_config()
350
- if threshold is None:
351
- threshold = config.processing.similarity_threshold
352
-
353
- doc_embeddings = np.array(doc_embeddings_data['embeddings'], dtype='float32')
354
- doc_info = doc_embeddings_data['documents']
355
-
356
- # Build FAISS index for fast similarity search
357
- faiss.normalize_L2(doc_embeddings) # Normalize for cosine similarity
358
- dimension = doc_embeddings.shape[1]
359
- faiss_index = faiss.IndexFlatIP(dimension)
360
- faiss_index.add(doc_embeddings)
361
-
362
- results = {}
363
-
364
- for cat_letter, category in checklist.items():
365
- cat_name = category.get('name', '')
366
- cat_results = {
367
- 'name': cat_name,
368
- 'letter': cat_letter,
369
- 'total_items': len(category.get('items', [])),
370
- 'matched_items': 0,
371
- 'items': []
372
- }
373
-
374
- for item in category.get('items', []):
375
- item_text = item.get('text', '')
376
- item_description = item.get('description', '')
377
-
378
- # Create enhanced embedding text using both item text and generated description
379
- if item_description:
380
- # Use the LLM-generated description for richer semantic matching
381
- checklist_embedding_text = f"{cat_name}: {item_text}\n{item_description}"
382
- else:
383
- # Fallback to original method if no description available
384
- checklist_embedding_text = f"{cat_name}: {item_text}"
385
-
386
- # Create and normalize item embedding
387
- item_embedding = model.encode(checklist_embedding_text).astype('float32').reshape(1, -1)
388
- faiss.normalize_L2(item_embedding)
389
-
390
- # Use FAISS for fast similarity search
391
- scores, indices = faiss_index.search(item_embedding, len(doc_info))
392
-
393
- # Find matching documents above threshold
394
- matches = []
395
- min_display_threshold = config.processing.min_display_threshold
396
-
397
- for score, idx in zip(scores[0], indices[0]):
398
- if idx == -1: # No more results
399
- break
400
- if score < min_display_threshold: # Skip very low scoring documents
401
- break # Scores are sorted, so we can stop here
402
-
403
- match_data = {
404
- 'name': doc_info[idx]['name'],
405
- 'path': doc_info[idx]['path'],
406
- 'full_path': doc_info[idx].get('full_path', doc_info[idx]['path']),
407
- 'summary': doc_info[idx]['summary'],
408
- 'score': float(score),
409
- 'metadata': doc_info[idx].get('original_doc', {}).get('metadata', {})
410
- }
411
-
412
- matches.append(match_data)
413
-
414
- # Keep top 5 matches for display
415
- display_matches = matches[:5]
416
-
417
- item_result = {
418
- 'text': item_text,
419
- 'original': item.get('original', item_text),
420
- 'description': item_description, # Include the generated description
421
- 'matches': display_matches
422
- }
423
-
424
- # Count items with ANY matches (both green and yellow) toward category total
425
- if display_matches:
426
- cat_results['matched_items'] += 1
427
-
428
- cat_results['items'].append(item_result)
429
-
430
- results[cat_letter] = cat_results
431
-
432
- return results
 
 
 
 
 
 
 
 
 
 
src/ai/prompts.py CHANGED
@@ -6,147 +6,92 @@ This module contains all prompt templates used for AI interactions
6
  in the DD-Checklist application.
7
  """
8
 
 
9
  from typing import Dict, List
 
 
10
 
11
 
12
- def get_checklist_parsing_prompt(checklist_text: str) -> str:
13
- """
14
- Generate prompt for parsing due diligence checklists
15
-
16
- Args:
17
- checklist_text: Raw checklist text to parse
18
-
19
- Returns:
20
- Formatted prompt string
21
- """
22
- return f"""Parse this due diligence checklist into a structured JSON format.
23
-
24
- Extract categories (A., B., C.) and numbered items.
25
-
26
- Return ONLY valid JSON:
27
- {{
28
- "A": {{
29
- "name": "Category Name",
30
- "items": [{{"text": "item", "number": 1}}]
31
- }}
32
- }}
33
-
34
- Checklist:
35
- {checklist_text[:3000]}
36
-
37
- JSON:"""
38
-
39
-
40
- def get_document_relevance_prompt(item_text: str, documents: List[str]) -> str:
41
- """
42
- Generate prompt for assessing document relevance to checklist items
43
-
44
- Args:
45
- item_text: Checklist item text
46
- documents: List of document names
47
-
48
- Returns:
49
- Formatted prompt string
50
- """
51
- return f"""Which of these documents is relevant to: {item_text}
52
-
53
- Documents: {documents}
54
-
55
- List the relevant document names only."""
56
-
57
-
58
- def get_question_answering_prompt(question: str, context: str) -> str:
59
- """
60
- Generate prompt for answering questions based on document context
61
-
62
- Args:
63
- question: User question
64
- context: Document context
65
-
66
- Returns:
67
- Formatted prompt string
68
- """
69
- return f"""Answer this question based on the documents:
70
-
71
- Question: {question}
72
-
73
- Document Context:
74
- {context}
75
 
76
- Provide a comprehensive answer with citations."""
 
 
77
 
 
78
 
79
- def get_findings_summary_prompt(findings: Dict, max_chars: int = 2000) -> str:
80
- """
81
- Generate prompt for summarizing due diligence findings
82
-
83
- Args:
84
- findings: Dictionary of findings to summarize
85
- max_chars: Maximum characters to include from findings
86
-
87
- Returns:
88
- Formatted prompt string
89
- """
90
- import json
91
- findings_text = json.dumps(findings, indent=2)[:max_chars]
92
-
93
- return f"""Provide an executive summary of the due diligence findings:
94
 
95
- {findings_text}
 
96
 
97
- Focus on:
98
- 1. Completeness of documentation
99
- 2. Key gaps or concerns
100
- 3. Overall assessment"""
101
 
 
 
 
 
102
 
103
- def get_description_generation_prompt(category_name: str, item_text: str) -> str:
104
- """
105
- Generate prompt for creating checklist item descriptions
106
-
107
- Args:
108
- category_name: Name of the checklist category
109
- item_text: Text of the checklist item
110
-
111
- Returns:
112
- Formatted prompt string
113
- """
114
- return f"""For this due diligence checklist item, provide a concise description (1-2 sentences) explaining what types of documents or information would satisfy this requirement. Focus on the specific document types and key information that would be relevant.
115
-
116
- Category: {category_name}
117
  Checklist Item: {item_text}
118
 
119
- Description (1-2 sentences explaining what documents/information satisfy this requirement):"""
120
-
121
 
122
- def get_document_summarization_prompt(doc: Dict) -> str:
123
- """
124
- Generate prompt for document type identification and summarization
125
-
126
- Args:
127
- doc: Dictionary containing document information
128
-
129
- Returns:
130
- Formatted prompt string
131
- """
132
- # Extract text preview (first 1000 chars)
133
- text_preview = doc.get('content', '')[:1000] if doc.get('content') else ''
134
- doc_name = doc.get('name', 'Unknown')
135
- doc_path = doc.get('path', '')
136
-
137
- return f"""Identify and describe what type of document this is in 1-2 sentences.
138
- Focus specifically on the document type, category, and what kind of information it contains.
139
 
140
- Examples of document types: financial statement, contract agreement, corporate governance document, employee handbook, technical specification, compliance report, audit report, etc.
 
141
 
142
- Document: {doc_name}
143
- Path: {doc_path}
144
- Content preview:
145
- {text_preview}
146
 
147
- Document type description (1-2 sentences only):"""
 
 
 
 
 
148
 
149
 
150
- # Template constants for common patterns
151
- DEFAULT_TEMPERATURE = 0.3
152
- DEFAULT_MAX_TOKENS = 2000
 
 
 
 
 
 
 
 
 
 
 
6
  in the DD-Checklist application.
7
  """
8
 
9
+ import json
10
  from typing import Dict, List
11
+ from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
12
+ from langchain_core.messages import SystemMessage, HumanMessage
13
 
14
 
15
+ def get_checklist_parsing_prompt(checklist_text: str) -> ChatPromptTemplate:
16
+ """Generate prompt for parsing due diligence checklists with structured output"""
17
+ return ChatPromptTemplate.from_messages([
18
+ SystemMessage(content="""
19
+ Parse this due diligence checklist into structured format. Extract:
20
+ - Categories (A., B., C., etc.) with their names
21
+ - Numbered items within each category (1., 2., 3., etc.)
22
+ - Total count of items
 
 
 
 
 
 
 
 
23
 
24
+ Follow the exact format specified in the format instructions.
25
+ """),
26
+ HumanMessage(content="""Parse this checklist:
27
 
28
+ {checklist_text}
29
 
30
+ {format_instructions}
 
 
 
 
 
 
31
 
32
+ Please provide the structured output:""")
33
+ ])
34
 
 
 
 
 
35
 
36
+ def get_document_relevance_prompt(item_text: str, documents: List[str]) -> PromptTemplate:
37
+ """Generate prompt for assessing document relevance to checklist items with structured output"""
38
+ return PromptTemplate.from_template(
39
+ """Analyze which documents are relevant to the following checklist item:
40
 
 
 
 
 
 
41
  Checklist Item: {item_text}
42
 
43
+ Available Documents:
44
+ {documents}
45
 
46
+ {format_instructions}
 
 
 
 
 
 
47
 
48
+ Please provide your analysis in the specified format:"""
49
+ )
50
 
 
 
 
 
51
 
52
+ def get_question_answering_prompt(question: str, context: str) -> ChatPromptTemplate:
53
+ """Generate prompt for answering questions based on document context"""
54
+ return ChatPromptTemplate.from_messages([
55
+ SystemMessage(content="Answer questions based on document context. Provide comprehensive answers with citations."),
56
+ HumanMessage(content=f"Question: {question}\n\nDocument Context:\n{context}\n\nAnswer:")
57
+ ])
58
 
59
 
60
+ def get_findings_summary_prompt(findings: Dict, max_chars: int = 2000) -> PromptTemplate:
61
+ """Generate prompt for summarizing due diligence findings"""
62
+ findings_text = json.dumps(findings, indent=2)[:max_chars]
63
+ return PromptTemplate.from_template(
64
+ "Provide an executive summary of these due diligence findings:\n\n"
65
+ "{findings_text}\n\n"
66
+ "Focus on:\n"
67
+ "1. Completeness of documentation\n"
68
+ "2. Key gaps or concerns\n"
69
+ "3. Overall assessment"
70
+ ).partial(findings_text=findings_text)
71
+
72
+
73
+ def get_description_generation_prompt(category_name: str, item_text: str) -> PromptTemplate:
74
+ """Generate prompt for creating checklist item descriptions"""
75
+ return PromptTemplate.from_template(
76
+ "For this due diligence checklist item, provide a concise description (1-2 sentences) "
77
+ "explaining what types of documents or information would satisfy this requirement.\n\n"
78
+ "Category: {category_name}\n"
79
+ "Checklist Item: {item_text}\n\n"
80
+ "Description:"
81
+ ).partial(category_name=category_name, item_text=item_text)
82
+
83
+
84
+ def get_document_summarization_prompt(doc: Dict) -> PromptTemplate:
85
+ """Generate prompt for document type identification and summarization"""
86
+ doc_name = doc.get('name', 'Unknown')
87
+ doc_path = doc.get('path', '')
88
+ text_preview = doc.get('content', '')[:1000] if doc.get('content') else ''
89
+
90
+ return PromptTemplate.from_template(
91
+ "Identify and describe what type of document this is in 1-2 sentences.\n\n"
92
+ "Examples: financial statement, contract agreement, corporate governance document, etc.\n\n"
93
+ "Document: {doc_name}\n"
94
+ "Path: {doc_path}\n"
95
+ "Content preview:\n{text_preview}\n\n"
96
+ "Document type description:"
97
+ ).partial(doc_name=doc_name, doc_path=doc_path, text_preview=text_preview)
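A quick sketch of how these template objects are consumed: .partial() pre-binds the known variables, so downstream code only calls .format() (or pipes the template into an LLM) without re-supplying them; the document dict below is a placeholder:

template = get_document_summarization_prompt({
    "name": "financials_q3.pdf",
    "path": "finance/financials_q3.pdf",
    "content": "Consolidated balance sheet as of ...",
})
prompt_text = template.format()  # all variables already bound via .partial()
# prompt_text can now be wrapped in a HumanMessage and sent to the model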
src/config.py CHANGED
@@ -1,463 +1,373 @@
1
  #!/usr/bin/env python3
2
  """
3
- Configuration Management Module
4
 
5
- This module centralizes all configuration settings for the DD-Checklist application.
6
- Handles environment variables, default settings, and configuration validation.
7
  """
8
 
9
  import os
 
 
 
10
  from pathlib import Path
11
- from typing import Dict, Any, Optional, List
12
- from dataclasses import dataclass, field
13
- from dotenv import load_dotenv
 
14
 
15
  # Fix tokenizers parallelism warning
16
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
17
 
 
 
 
 
 
 
 
18
 
19
- @dataclass
20
- class ModelConfig:
21
- """Configuration for AI models"""
22
  sentence_transformer_model: str = "all-MiniLM-L6-v2"
23
  claude_model: str = "claude-sonnet-4-20250514"
24
  temperature: float = 0.3
25
  max_tokens: int = 2000
26
- embedding_dimension: int = 384
27
-
28
- def __post_init__(self):
29
- """Load model configuration from environment variables"""
30
- self.sentence_transformer_model = os.getenv('SENTENCE_TRANSFORMER_MODEL', self.sentence_transformer_model)
31
- self.claude_model = os.getenv('CLAUDE_MODEL', self.claude_model)
32
- self.temperature = float(os.getenv('CLAUDE_TEMPERATURE', str(self.temperature)))
33
- self.max_tokens = int(os.getenv('CLAUDE_MAX_TOKENS', str(self.max_tokens)))
34
- self.embedding_dimension = int(os.getenv('EMBEDDING_DIMENSION', str(self.embedding_dimension)))
35
-
36
-
37
- @dataclass
38
- class ProcessingConfig:
39
- """Configuration for document processing"""
40
- chunk_size: int = 400
41
- chunk_overlap: int = 50
42
- max_text_length: int = 10000
43
- batch_size: int = 100
44
- description_batch_size: int = 100
45
  similarity_threshold: float = 0.35
46
  relevancy_threshold: float = 0.5
47
  primary_threshold: float = 0.6
48
  min_display_threshold: float = 0.15
49
- max_workers: int = 4
50
- file_timeout: int = 30
51
- skip_descriptions: bool = False
52
- supported_file_extensions: List[str] = field(
53
- default_factory=lambda: ['.pdf', '.docx', '.doc', '.txt', '.md']
54
- )
55
-
56
- def __post_init__(self):
57
- """Load processing configuration from environment variables"""
58
- self.chunk_size = int(os.getenv('CHUNK_SIZE', str(self.chunk_size)))
59
- self.chunk_overlap = int(os.getenv('CHUNK_OVERLAP', str(self.chunk_overlap)))
60
- self.max_text_length = int(os.getenv('MAX_TEXT_LENGTH', str(self.max_text_length)))
61
- self.batch_size = int(os.getenv('BATCH_SIZE', str(self.batch_size)))
62
- self.description_batch_size = int(os.getenv('DESCRIPTION_BATCH_SIZE', str(self.description_batch_size)))
63
- self.similarity_threshold = float(os.getenv('SIMILARITY_THRESHOLD', str(self.similarity_threshold)))
64
- self.relevancy_threshold = float(os.getenv('RELEVANCY_THRESHOLD', str(self.relevancy_threshold)))
65
- self.primary_threshold = float(os.getenv('PRIMARY_THRESHOLD', str(self.primary_threshold)))
66
- self.min_display_threshold = float(os.getenv('MIN_DISPLAY_THRESHOLD', str(self.min_display_threshold)))
67
- self.max_workers = int(os.getenv('MAX_WORKERS', str(self.max_workers)))
68
- self.file_timeout = int(os.getenv('FILE_TIMEOUT', str(self.file_timeout)))
69
- self.skip_descriptions = os.getenv('SKIP_DESCRIPTIONS', 'false').lower() == 'true'
70
-
71
- # Handle file extensions from environment (comma-separated)
72
- extensions_env = os.getenv('SUPPORTED_FILE_EXTENSIONS')
73
- if extensions_env:
74
- self.supported_file_extensions = [ext.strip() for ext in extensions_env.split(',')]
75
 
76
 
77
- @dataclass
78
- class UIConfig:
79
- """Configuration for UI settings"""
80
  page_title: str = "AI Due Diligence"
81
  page_icon: str = "🤖"
82
  layout: str = "wide"
83
  top_k_search_results: int = 5
84
- max_question_sources: int = 3
85
- max_checklist_matches: int = 5
86
 
87
 
88
- @dataclass
89
- class PathConfig:
90
- """Configuration for file paths"""
91
  data_dir: str = "data"
92
  checklist_dir: str = "data/checklist"
93
  questions_dir: str = "data/questions"
94
  strategy_dir: str = "data/strategy"
95
  vdrs_dir: str = "data/vdrs"
96
- cache_dir: str = ".cache"
97
-
98
- def __post_init__(self):
99
- """Convert string paths to Path objects and ensure they exist"""
100
- self.data_path = Path(self.data_dir)
101
- self.checklist_path = Path(self.checklist_dir)
102
- self.questions_path = Path(self.questions_dir)
103
- self.strategy_path = Path(self.strategy_dir)
104
- self.vdrs_path = Path(self.vdrs_dir)
105
- self.cache_path = Path(self.cache_dir)
106
-
107
-
108
- @dataclass
109
- class APIConfig:
110
- """Configuration for API settings"""
111
- anthropic_api_key: Optional[str] = None
112
- openai_api_key: Optional[str] = None
113
- max_concurrent_requests: int = 50
114
- request_timeout: int = 30
115
- retry_attempts: int = 3
116
- base_delay: float = 0.2
117
- max_retries: int = 2
118
- batch_retry_attempts: int = 1
119
- batch_base_delay: float = 0.1
120
- single_retry_base_delay: float = 0.05
121
-
122
- def __post_init__(self):
123
- """Load API configuration from environment variables"""
124
- if not self.anthropic_api_key:
125
- self.anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
126
- if not self.openai_api_key:
127
- self.openai_api_key = os.getenv('OPENAI_API_KEY')
128
-
129
- self.max_concurrent_requests = int(os.getenv('MAX_CONCURRENT_REQUESTS', str(self.max_concurrent_requests)))
130
- self.request_timeout = int(os.getenv('REQUEST_TIMEOUT', str(self.request_timeout)))
131
- self.retry_attempts = int(os.getenv('RETRY_ATTEMPTS', str(self.retry_attempts)))
132
- self.base_delay = float(os.getenv('BASE_DELAY', str(self.base_delay)))
133
- self.max_retries = int(os.getenv('MAX_RETRIES', str(self.max_retries)))
134
- self.batch_retry_attempts = int(os.getenv('BATCH_RETRY_ATTEMPTS', str(self.batch_retry_attempts)))
135
- self.batch_base_delay = float(os.getenv('BATCH_BASE_DELAY', str(self.batch_base_delay)))
136
- self.single_retry_base_delay = float(os.getenv('SINGLE_RETRY_BASE_DELAY', str(self.single_retry_base_delay)))
137
-
138
-
139
- @dataclass
140
- class AppConfig:
141
- """Main application configuration"""
142
- model: ModelConfig = field(default_factory=ModelConfig)
143
- processing: ProcessingConfig = field(default_factory=ProcessingConfig)
144
- ui: UIConfig = field(default_factory=UIConfig)
145
- paths: PathConfig = field(default_factory=PathConfig)
146
- api: APIConfig = field(default_factory=APIConfig)
147
 
148
- # Environment settings
149
- debug: bool = False
150
- environment: str = "development"
151
- log_level: str = "INFO"
152
 
153
- def __post_init__(self):
154
- """Load environment-specific settings"""
155
- self.debug = os.getenv('DEBUG', 'false').lower() == 'true'
156
- self.environment = os.getenv('ENVIRONMENT', 'development')
157
- self.log_level = os.getenv('LOG_LEVEL', 'INFO')
158
 
159
 
160
- class ConfigManager:
161
- """
162
- Configuration manager that handles loading and validating configuration
163
- """
 
 
 
 
164
 
165
- def __init__(self, config_file: Optional[str] = None):
166
- """
167
- Initialize configuration manager
168
-
169
- Args:
170
- config_file: Optional path to configuration file
171
- """
172
- # Load environment variables
173
- load_dotenv()
174
-
175
- # Initialize configuration
176
- self.config = AppConfig()
177
-
178
- # Load from file if provided
179
- if config_file and Path(config_file).exists():
180
- self._load_from_file(config_file)
181
-
182
- # Validate configuration
183
- self._validate_config()
184
 
185
- def _load_from_file(self, config_file: str) -> None:
186
- """
187
- Load configuration from file (JSON or YAML)
188
-
189
- Args:
190
- config_file: Path to configuration file
191
- """
192
- import json
193
-
194
- config_path = Path(config_file)
195
-
196
- try:
197
- if config_path.suffix.lower() == '.json':
198
- with open(config_path, 'r') as f:
199
- config_data = json.load(f)
200
- self._update_config_from_dict(config_data)
201
- elif config_path.suffix.lower() in ['.yml', '.yaml']:
202
- try:
203
- import yaml
204
- with open(config_path, 'r') as f:
205
- config_data = yaml.safe_load(f)
206
- self._update_config_from_dict(config_data)
207
- except ImportError:
208
- print("PyYAML not installed. Cannot load YAML configuration.")
209
- except Exception as e:
210
- print(f"Warning: Could not load configuration from {config_file}: {e}")
211
 
212
- def _update_config_from_dict(self, config_data: Dict[str, Any]) -> None:
213
- """
214
- Update configuration from dictionary
215
-
216
- Args:
217
- config_data: Configuration dictionary
218
- """
219
- for section, values in config_data.items():
220
- if hasattr(self.config, section) and isinstance(values, dict):
221
- config_section = getattr(self.config, section)
222
- for key, value in values.items():
223
- if hasattr(config_section, key):
224
- setattr(config_section, key, value)
 
 
 
225
 
226
- def _validate_config(self) -> None:
227
- """Validate configuration settings"""
228
- # Validate paths
229
- if not self.config.paths.data_path.exists():
230
- print(f"Warning: Data directory does not exist: {self.config.paths.data_path}")
231
-
232
- # Validate model settings
233
- if self.config.processing.chunk_size <= self.config.processing.chunk_overlap:
234
- print("Warning: Chunk size should be larger than chunk overlap")
235
-
236
- # Validate thresholds
237
- if not 0 <= self.config.processing.similarity_threshold <= 1:
238
- print("Warning: Similarity threshold should be between 0 and 1")
239
 
240
- def get_config(self) -> AppConfig:
241
- """Get the current configuration"""
242
- return self.config
 
 
243
 
244
- def update_config(self, **kwargs) -> None:
245
- """
246
- Update configuration settings
247
-
248
- Args:
249
- **kwargs: Configuration updates
250
- """
251
- for key, value in kwargs.items():
252
- if hasattr(self.config, key):
253
- setattr(self.config, key, value)
254
 
255
- def update_processing_config(self, **kwargs) -> None:
256
- """
257
- Update processing configuration dynamically
258
-
259
- Args:
260
- **kwargs: Processing configuration parameters to update
261
- """
262
- for key, value in kwargs.items():
263
- if hasattr(self.config.processing, key):
264
- setattr(self.config.processing, key, value)
265
- else:
266
- print(f"Warning: Unknown processing config key: {key}")
267
 
268
- def update_api_config(self, **kwargs) -> None:
269
- """
270
- Update API configuration dynamically
271
-
272
- Args:
273
- **kwargs: API configuration parameters to update
274
- """
275
- for key, value in kwargs.items():
276
- if hasattr(self.config, key):
277
- setattr(self.config.api, key, value)
278
- else:
279
- print(f"Warning: Unknown API config key: {key}")
280
 
281
- def save_config(self, config_file: str) -> None:
282
- """
283
- Save current configuration to file
284
-
285
- Args:
286
- config_file: Path to save configuration
287
- """
288
- import json
289
- from dataclasses import asdict
290
-
291
- config_dict = asdict(self.config)
292
-
293
- # Remove Path objects and other non-serializable items
294
- config_dict = self._make_serializable(config_dict)
295
-
296
- with open(config_file, 'w') as f:
297
- json.dump(config_dict, f, indent=2)
298
 
299
- def _make_serializable(self, obj: Any) -> Any:
300
- """Make configuration dictionary serializable"""
301
- if isinstance(obj, dict):
302
- return {k: self._make_serializable(v) for k, v in obj.items()
303
- if not k.endswith('_path')} # Skip Path objects
304
- elif isinstance(obj, list):
305
- return [self._make_serializable(item) for item in obj]
306
- elif isinstance(obj, Path):
307
- return str(obj)
308
- else:
309
- return obj
310
 
311
 
312
  # Global configuration instance
313
- _config_manager: Optional[ConfigManager] = None
314
 
315
 
316
- def get_config() -> AppConfig:
317
- """
318
- Get the global configuration instance
319
-
320
- Returns:
321
- Application configuration
322
- """
323
- global _config_manager
324
- if _config_manager is None:
325
- _config_manager = ConfigManager()
326
- return _config_manager.get_config()
327
 
328
 
329
- def init_config(config_file: Optional[str] = None) -> ConfigManager:
330
  """
331
- Initialize global configuration
332
 
333
  Args:
334
- config_file: Optional configuration file path
 
 
335
 
336
  Returns:
337
- Configuration manager instance
338
  """
339
- global _config_manager
340
- _config_manager = ConfigManager(config_file)
341
- return _config_manager
342
-
343
-
344
- def update_config(**kwargs) -> None:
345
- """
346
- Update global configuration
347
 
348
- Args:
349
- **kwargs: Configuration updates
350
- """
351
- global _config_manager
352
- if _config_manager is None:
353
- _config_manager = ConfigManager()
354
- _config_manager.update_config(**kwargs)
355
-
356
-
357
- # Environment-specific configurations
358
- DEVELOPMENT_CONFIG = {
359
- "processing": {
360
- "batch_size": 50,
361
- "similarity_threshold": 0.3
362
- },
363
- "ui": {
364
- "layout": "wide"
365
- }
366
- }
367
-
368
- PRODUCTION_CONFIG = {
369
- "processing": {
370
- "batch_size": 100,
371
- "similarity_threshold": 0.35
372
- },
373
- "api": {
374
- "max_concurrent_requests": 20,
375
- "request_timeout": 60
376
- }
377
- }
378
-
379
- STREAMLIT_CLOUD_CONFIG = {
380
- "processing": {
381
- "batch_size": 100, # Optimized for performance
382
- "description_batch_size": 100, # Match summary batch size
383
- "max_text_length": 8000, # Higher limit for better quality
384
- "max_workers": 2, # Moderate parallelism for cloud
385
- "file_timeout": 30 # Standard timeout
386
- },
387
- "api": {
388
- "max_concurrent_requests": 30, # Good concurrency for cloud
389
- "base_delay": 0.1, # Fast delays
390
- "batch_base_delay": 0.05, # Very fast batches
391
- "request_timeout": 30
392
- }
393
- }
394
-
395
-
396
- def get_environment_config() -> Dict[str, Any]:
397
- """
398
- Get environment-specific configuration
399
 
400
- Returns:
401
- Environment configuration dictionary
402
- """
403
- env = os.getenv('ENVIRONMENT', 'development').lower()
404
 
405
- if env == 'production':
406
- return PRODUCTION_CONFIG
407
- elif env == 'streamlit_cloud':
408
- return STREAMLIT_CLOUD_CONFIG
409
- else:
410
- return DEVELOPMENT_CONFIG
411
-
412
-
413
- # Utility functions for common configuration access
414
- def get_model_config() -> ModelConfig:
415
- """Get model configuration"""
416
- return get_config().model
417
 
418
 
419
- def get_processing_config() -> ProcessingConfig:
420
- """Get processing configuration"""
421
- return get_config().processing
422
 
423
 
424
- def get_ui_config() -> UIConfig:
425
- """Get UI configuration"""
426
- return get_config().ui
427
 
428
 
429
- def get_path_config() -> PathConfig:
430
- """Get path configuration"""
431
- return get_config().paths
432
 
433
 
434
- def get_api_config() -> APIConfig:
435
- """Get API configuration"""
436
- return get_config().api
437
 
438
 
439
- def is_ai_enabled() -> bool:
440
- """Check if AI features are enabled (API key available)"""
441
- api_config = get_api_config()
442
- return api_config.anthropic_api_key is not None
443
 
444
 
445
- def get_supported_extensions() -> List[str]:
446
- """Get list of supported file extensions"""
447
- return get_processing_config().supported_file_extensions
448
 
449
 
450
- def update_processing_config(**kwargs) -> None:
451
- """Update processing configuration dynamically"""
452
- global _config_manager
453
- if _config_manager is None:
454
- _config_manager = ConfigManager()
455
- _config_manager.update_processing_config(**kwargs)
 
456
 
457
 
458
- def update_api_config(**kwargs) -> None:
459
- """Update API configuration dynamically"""
460
- global _config_manager
461
- if _config_manager is None:
462
- _config_manager = ConfigManager()
463
- _config_manager.update_api_config(**kwargs)
 
 
1
  #!/usr/bin/env python3
2
  """
3
+ Configuration Module
4
 
5
+ Uses pydantic-settings for robust configuration management from environment variables.
 
6
  """
7
 
8
  import os
9
+ import sys
10
+ import logging
11
+ from datetime import datetime
12
  from pathlib import Path
13
+ from typing import List, Optional
14
+ from logging.handlers import RotatingFileHandler
15
+ from pydantic import BaseModel, Field
16
+ from pydantic_settings import BaseSettings, SettingsConfigDict
17
 
18
  # Fix tokenizers parallelism warning
19
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
20
 
21
+ # Streamlit import for utilities (conditional)
22
+ try:
23
+ import streamlit as st
24
+ STREAMLIT_AVAILABLE = True
25
+ except ImportError:
26
+ STREAMLIT_AVAILABLE = False
27
+ st = None
28
 
29
+
30
+ class ModelConfig(BaseModel):
31
+ """Model configuration settings"""
32
  sentence_transformer_model: str = "all-MiniLM-L6-v2"
33
  claude_model: str = "claude-sonnet-4-20250514"
34
  temperature: float = 0.3
35
  max_tokens: int = 2000
36
+
37
+
38
+ class ProcessingConfig(BaseModel):
39
+ """Processing configuration settings"""
40
+ batch_size: int = 20
41
+ description_batch_size: int = 25
42
+ max_workers: int = 4
43
+ chunk_size: int = 1000
44
+ chunk_overlap: int = 200
45
  similarity_threshold: float = 0.35
46
  relevancy_threshold: float = 0.5
47
  primary_threshold: float = 0.6
48
  min_display_threshold: float = 0.15
49
+ supported_file_extensions: List[str] = ['.pdf', '.docx', '.doc', '.txt', '.md']
50
+ faiss_store_name: str = "default"
51
+ skip_processed_files: bool = True
52
 
53
 
54
+ class UIConfig(BaseModel):
55
+ """UI configuration settings"""
 
56
  page_title: str = "AI Due Diligence"
57
  page_icon: str = "🤖"
58
  layout: str = "wide"
59
  top_k_search_results: int = 5
 
 
60
 
61
 
62
+ class PathsConfig(BaseModel):
63
+ """Paths configuration with computed properties"""
 
64
  data_dir: str = "data"
65
  checklist_dir: str = "data/checklist"
66
  questions_dir: str = "data/questions"
67
  strategy_dir: str = "data/strategy"
68
  vdrs_dir: str = "data/vdrs"
69
+ faiss_dir: str = "data/enhanced_faiss"
70
+
71
+ @property
72
+ def data_path(self) -> Path:
73
+ return Path(self.data_dir)
74
+
75
+ @property
76
+ def checklist_path(self) -> Path:
77
+ return Path(self.checklist_dir)
78
+
79
+ @property
80
+ def questions_path(self) -> Path:
81
+ return Path(self.questions_dir)
82
+
83
+ @property
84
+ def strategy_path(self) -> Path:
85
+ return Path(self.strategy_dir)
86
 
87
+ @property
88
+ def vdrs_path(self) -> Path:
89
+ return Path(self.vdrs_dir)
 
90
 
91
+ @property
92
+ def faiss_path(self) -> Path:
93
+ return Path(self.faiss_dir)
 
 
94
 
95
 
96
+ class APIConfig(BaseModel):
97
+ """API configuration settings"""
98
+ anthropic_api_key: Optional[str] = None
99
+ max_concurrent_requests: int = 10
100
+
101
+
102
+ class Config(BaseSettings):
103
+ """Main application configuration using pydantic-settings"""
104
 
105
+ model_config = SettingsConfigDict(
106
+ env_file=".env",
107
+ env_file_encoding="utf-8",
108
+ env_nested_delimiter="__",
109
+ case_sensitive=False,
110
+ extra="ignore" # Allow extra environment variables to be ignored
111
+ )
112
 
113
+ # Model settings
114
+ sentence_transformer_model: str = Field(default="all-MiniLM-L6-v2", env="SENTENCE_TRANSFORMER_MODEL")
115
+ claude_model: str = Field(default="claude-sonnet-4-20250514", env="CLAUDE_MODEL")
116
+ temperature: float = Field(default=0.3, env="CLAUDE_TEMPERATURE")
117
+ max_tokens: int = Field(default=2000, env="CLAUDE_MAX_TOKENS")
118
 
119
+ # Processing settings (optimized for large datasets)
120
+ batch_size: int = Field(default=20, env="BATCH_SIZE")
121
+ description_batch_size: int = Field(default=25, env="DESCRIPTION_BATCH_SIZE")
122
+ max_workers: int = Field(default=4, env="MAX_WORKERS")
123
+ chunk_size: int = Field(default=1000, env="CHUNK_SIZE")
124
+ chunk_overlap: int = Field(default=200, env="CHUNK_OVERLAP")
125
+ similarity_threshold: float = Field(default=0.35, env="SIMILARITY_THRESHOLD")
126
+ relevancy_threshold: float = Field(default=0.5, env="RELEVANCY_THRESHOLD")
127
+ primary_threshold: float = Field(default=0.6, env="PRIMARY_THRESHOLD")
128
+ min_display_threshold: float = Field(default=0.15, env="MIN_DISPLAY_THRESHOLD")
129
+ supported_file_extensions: List[str] = Field(
130
+ default=['.pdf', '.docx', '.doc', '.txt', '.md'],
131
+ env="SUPPORTED_FILE_EXTENSIONS"
132
+ )
133
+ faiss_store_name: str = Field(default="default", env="FAISS_STORE_NAME")
134
+ skip_processed_files: bool = Field(default=True, env="SKIP_PROCESSED_FILES")
135
 
136
+ # Logging settings
137
+ log_level: str = Field(default="INFO", env="LOG_LEVEL")
138
+ suppress_langchain_warnings: bool = Field(default=True, env="SUPPRESS_LANGCHAIN_WARNINGS")
139
 
140
+ # UI settings
141
+ page_title: str = Field(default="AI Due Diligence", env="PAGE_TITLE")
142
+ page_icon: str = Field(default="🤖", env="PAGE_ICON")
143
+ layout: str = Field(default="wide", env="LAYOUT")
144
+ top_k_search_results: int = Field(default=5, env="TOP_K_SEARCH_RESULTS")
145
 
146
+ # Path settings
147
+ data_dir: str = Field(default="data", env="DATA_DIR")
148
+ checklist_dir: str = Field(default="data/checklist", env="CHECKLIST_DIR")
149
+ questions_dir: str = Field(default="data/questions", env="QUESTIONS_DIR")
150
+ strategy_dir: str = Field(default="data/strategy", env="STRATEGY_DIR")
151
+ vdrs_dir: str = Field(default="data/vdrs", env="VDRS_DIR")
152
+ faiss_dir: str = Field(default="data/enhanced_faiss", env="FAISS_DIR")
153
 
154
+ # API settings
155
+ anthropic_api_key: Optional[str] = Field(default=None, env="ANTHROPIC_API_KEY")
156
+ max_concurrent_requests: int = Field(default=10, env="MAX_CONCURRENT_REQUESTS")
157
 
158
+ @property
159
+ def model(self) -> ModelConfig:
160
+ """Get model configuration"""
161
+ return ModelConfig(
162
+ sentence_transformer_model=self.sentence_transformer_model,
163
+ claude_model=self.claude_model,
164
+ temperature=self.temperature,
165
+ max_tokens=self.max_tokens
166
+ )
167
 
168
+ @property
169
+ def processing(self) -> ProcessingConfig:
170
+ """Get processing configuration"""
171
+ return ProcessingConfig(
172
+ batch_size=self.batch_size,
173
+ description_batch_size=self.description_batch_size,
174
+ max_workers=self.max_workers,
175
+ chunk_size=self.chunk_size,
176
+ chunk_overlap=self.chunk_overlap,
177
+ similarity_threshold=self.similarity_threshold,
178
+ relevancy_threshold=self.relevancy_threshold,
179
+ primary_threshold=self.primary_threshold,
180
+ min_display_threshold=self.min_display_threshold,
181
+ supported_file_extensions=self.supported_file_extensions,
182
+ faiss_store_name=self.faiss_store_name,
183
+ skip_processed_files=self.skip_processed_files
184
+ )
185
 
186
+ @property
187
+ def ui(self) -> UIConfig:
188
+ """Get UI configuration"""
189
+ return UIConfig(
190
+ page_title=self.page_title,
191
+ page_icon=self.page_icon,
192
+ layout=self.layout,
193
+ top_k_search_results=self.top_k_search_results
194
+ )
195
+
196
+ @property
197
+ def paths(self) -> PathsConfig:
198
+ """Get paths configuration"""
199
+ return PathsConfig(
200
+ data_dir=self.data_dir,
201
+ checklist_dir=self.checklist_dir,
202
+ questions_dir=self.questions_dir,
203
+ strategy_dir=self.strategy_dir,
204
+ vdrs_dir=self.vdrs_dir,
205
+ faiss_dir=self.faiss_dir
206
+ )
207
+
208
+ @property
209
+ def api(self) -> APIConfig:
210
+ """Get API configuration"""
211
+ return APIConfig(
212
+ anthropic_api_key=self.anthropic_api_key,
213
+ max_concurrent_requests=self.max_concurrent_requests
214
+ )
215
 
216
 
217
  # Global configuration instance
218
+ _config: Optional[Config] = None
219
+
220
+
221
+ def get_config() -> Config:
222
+ """Get the global configuration instance"""
223
+ global _config
224
+ if _config is None:
225
+ _config = Config()
226
+ return _config
227
+
228
+
229
+ def init_config(config_file: Optional[str] = None) -> Config:
230
+ """Initialize global configuration"""
231
+ global _config
232
+ _config = Config()
233
+ return _config
234
 
235
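For reference, a minimal usage sketch (not part of the diff; values are illustrative) of how the pydantic-settings Config above is consumed: environment variables override the flat field defaults, and the grouped properties expose the same values in the sectioned shape used elsewhere in the code. It assumes the package re-exports get_config from src, as the app module's imports suggest.

import os

# Hypothetical overrides -- set them before src is first imported, because the
# Config object reads the environment when it is instantiated.
os.environ["CHUNK_SIZE"] = "1500"            # default is 1000
os.environ["SIMILARITY_THRESHOLD"] = "0.40"  # default is 0.35

from src import get_config

config = get_config()
print(config.chunk_size)             # 1500 (flat field)
print(config.processing.chunk_size)  # 1500 (grouped ProcessingConfig view)
print(config.paths.faiss_path)       # Path("data/enhanced_faiss")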
 
236
 
237
 
238
+
239
+ # =============================================================================
240
+ # LOGGING UTILITIES - Merged from utils.py
241
+ # =============================================================================
242
+
243
+ def setup_logging(
244
+ name: str = "dd_checklist",
245
+ log_level: Optional[str] = None,
246
+ log_file: Optional[str] = None
247
+ ) -> logging.Logger:
248
  """
249
+ Set up standard Python logging with rotating file handler
250
 
251
  Args:
252
+ name: Logger name
253
+ log_level: Logging level
254
+ log_file: Optional log file path
255
 
256
  Returns:
257
+ Configured logger instance
258
  """
259
+ logger = logging.getLogger(name)
 
260
 
261
+ # Avoid duplicate setup if logger already has handlers
262
+ if logger.handlers:
263
+ return logger
264
 
265
+ # Use configured log level if not provided
266
+ if log_level is None:
267
+ try:
268
+ config = get_config()
269
+ log_level = config.log_level
270
+ except Exception:
271
+ log_level = "INFO" # fallback
272
+
273
+ logger.setLevel(getattr(logging, log_level.upper()))
274
 
275
+ # Console handler
276
+ console_handler = logging.StreamHandler(sys.stdout)
277
+ console_formatter = logging.Formatter(
278
+ '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
279
+ )
280
+ console_handler.setFormatter(console_formatter)
281
+ logger.addHandler(console_handler)
282
+
283
+ # Rotating file handler (if possible)
284
+ if log_file or True: # Always try to set up file logging
285
+ try:
286
+ log_dir = Path(".logs")
287
+ log_dir.mkdir(exist_ok=True)
288
+
289
+ if not log_file:
290
+ log_file = log_dir / f"dd_checklist_{datetime.now().strftime('%Y%m%d')}.log"
291
+
292
+ # Use RotatingFileHandler for better log management
293
+ file_handler = RotatingFileHandler(
294
+ log_file,
295
+ maxBytes=10 * 1024 * 1024, # 10MB
296
+ backupCount=5
297
+ )
298
+ file_formatter = logging.Formatter(
299
+ '%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s'
300
+ )
301
+ file_handler.setFormatter(file_formatter)
302
+ logger.addHandler(file_handler)
303
+ except Exception:
304
+ # File logging not available (e.g., on Streamlit Cloud)
305
+ pass
306
+
307
+ return logger
308
 
309
 
310
+ # Global logger instance
311
+ logger = setup_logging()
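Any module can then reuse this shared logger; messages go to stdout and, when the filesystem is writable, to a rotating file under .logs/. A tiny illustrative sketch (assumes logger is re-exported from the src package, as in the app module):

from src import logger

logger.info("Data room processing started")  # console + .logs/dd_checklist_<YYYYMMDD>.log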
 
312
 
313
 
314
+ # =============================================================================
315
+ # STREAMLIT UTILITIES - Merged from utils.py
316
+ # =============================================================================
317
 
318
+ def show_success(message: str):
319
+ """Show success message in Streamlit"""
320
+ if STREAMLIT_AVAILABLE and st:
321
+ st.success(message)
322
+ logger.info(message)
323
 
 
 
 
324
 
325
+ def show_info(message: str):
326
+ """Show info message in Streamlit"""
327
+ if STREAMLIT_AVAILABLE and st:
328
+ st.info(message)
329
+ logger.info(message)
330
 
 
 
 
331
 
332
+ def show_error(message: str):
333
+ """Show error message in Streamlit"""
334
+ if STREAMLIT_AVAILABLE and st:
335
+ st.error(message)
336
+ logger.error(message)
337
 
 
 
 
 
338
 
339
+ # =============================================================================
340
+ # FILE UTILITIES - Common patterns extracted for reuse
341
+ # =============================================================================
342
 
343
+ def get_mime_type(file_path: Path) -> str:
344
+ """Get MIME type based on file extension"""
345
+ file_extension = file_path.suffix.lower()
346
+ if file_extension == '.pdf':
347
+ return 'application/pdf'
348
+ elif file_extension in ['.doc', '.docx']:
349
+ return 'application/msword'
350
+ elif file_extension == '.txt':
351
+ return 'text/plain'
352
+ elif file_extension == '.md':
353
+ return 'text/markdown'
354
+ else:
355
+ return 'application/octet-stream'
356
 
357
 
358
+ def format_document_title(doc_name: str) -> str:
359
+ """Format document name into a readable title"""
360
+ if '.' in doc_name:
361
+ doc_title = doc_name.rsplit('.', 1)[0].replace('_', ' ').replace('-', ' ').title()
362
+ else:
363
+ doc_title = doc_name.replace('_', ' ').replace('-', ' ').title()
364
+ return doc_title
365
 
366
 
367
+ def count_documents_in_directory(directory: Path, supported_extensions: Optional[List[str]] = None) -> int:
368
+ """Count supported documents in a directory recursively"""
369
+ if supported_extensions is None:
370
+ supported_extensions = ['.pdf', '.docx', '.doc', '.txt', '.md']
371
+
372
+ return sum(1 for f in directory.rglob('*')
373
+ if f.is_file() and f.suffix.lower() in supported_extensions)
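To illustrate the merged file helpers above (the paths below are hypothetical, and the import path assumes these helpers live in src/config.py as this diff suggests):

from pathlib import Path
from src.config import get_mime_type, format_document_title, count_documents_in_directory

doc = Path("data/vdrs/acme/Share_Purchase-Agreement.pdf")   # hypothetical file
print(get_mime_type(doc))               # 'application/pdf'
print(format_document_title(doc.name))  # 'Share Purchase Agreement'
print(count_documents_in_directory(Path("data/vdrs")))  # recursive count of supported files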
src/document_processing.py CHANGED
@@ -1,52 +1,78 @@
1
  #!/usr/bin/env python3
2
  """
3
- Document Processing Module
4
 
5
- This module handles all document-related operations including:
6
- - File text extraction from various formats (PDF, DOCX, TXT, MD)
7
- - Document scanning and indexing
8
- - Semantic text chunking for RAG with better context preservation
9
- - Document metadata handling
10
  """
11
 
12
  import os
 
13
  # Fix tokenizers parallelism warning
14
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
15
 
16
- import fitz # PyMuPDF
17
- import docx
18
- import io
19
- import re
20
- from pathlib import Path
21
- from typing import Dict, List, Tuple, Optional
22
- import streamlit as st
23
- import numpy as np
24
- from sentence_transformers import SentenceTransformer
25
- import concurrent.futures
26
- import threading
27
  import logging
28
- from functools import wraps
29
- import joblib
30
- import hashlib
31
- import time
32
- import faiss
33
 
34
- # Semantic chunking
35
  from langchain_text_splitters import RecursiveCharacterTextSplitter
36
 
37
  # Import configuration
38
  from .config import get_config
39
 
40
- # Setup logging for thread-safe error handling
 
 
41
  logger = logging.getLogger(__name__)
42
 
43
- # Thread-safe context management for Streamlit
44
- try:
45
- from streamlit.runtime.scriptrunner import add_script_run_ctx, get_script_run_ctx
46
- STREAMLIT_CONTEXT_AVAILABLE = True
47
- except ImportError:
48
- STREAMLIT_CONTEXT_AVAILABLE = False
49
- logger.warning("Streamlit context management not available")
50
 
51
 
52
  def escape_markdown_math(text: str) -> str:
@@ -63,755 +89,331 @@ def escape_markdown_math(text: str) -> str:
63
  return text
64
 
65
 
66
- def extract_text_from_file(file_path: Path, progress_callback=None) -> Tuple[str, Dict]:
67
- """
68
- Extract text from file with metadata
69
-
70
- Args:
71
- file_path: Path to the file to extract text from
72
-
73
- Returns:
74
- Tuple of (text_content, metadata)
75
- """
76
- metadata = {'pages': [], 'type': 'unknown'}
77
- text_content = ""
78
-
79
- try:
80
- if file_path.suffix.lower() == '.pdf':
81
- # Use PyMuPDF (fitz) for faster and more robust PDF processing
82
- try:
83
- pdf_document = fitz.open(str(file_path))
84
- texts = []
85
-
86
- for page_num in range(pdf_document.page_count):
87
- try:
88
- page = pdf_document[page_num]
89
- page_text = page.get_text()
90
-
91
- if page_text.strip(): # Only add non-empty pages
92
- texts.append(page_text)
93
- metadata['pages'].append(page_num + 1) # 1-based page numbering
94
- except Exception as page_error:
95
- # Handle individual page errors gracefully
96
- logger.warning(f"Error reading page {page_num + 1} of {file_path.name}: {page_error}")
97
- if st and hasattr(st, 'session_state'):
98
- # Only use streamlit in main thread context
99
- try:
100
- st.warning(f"Error reading page {page_num + 1} of {file_path.name}: {page_error}")
101
- except Exception:
102
- pass
103
- continue
104
-
105
- pdf_document.close()
106
- text_content = '\n'.join(texts)[:10000]
107
- metadata['type'] = 'pdf'
108
-
109
- except Exception as pdf_error:
110
- # Handle corrupted or unsupported PDF files
111
- error_msg = f"Error processing PDF {file_path.name}: {pdf_error}"
112
- logger.error(error_msg)
113
- if st and hasattr(st, 'session_state'):
114
- # Only use streamlit in main thread context
115
- try:
116
- st.error(error_msg)
117
- except Exception:
118
- pass
119
- # Try to return partial content if available
120
- if 'pdf_document' in locals():
121
- try:
122
- pdf_document.close()
123
- except:
124
- pass
125
- return "", metadata
126
-
127
- elif file_path.suffix.lower() in ['.docx', '.doc']:
128
- doc = docx.Document(str(file_path))
129
- text_content = '\n'.join(p.text for p in doc.paragraphs)[:10000]
130
- metadata['type'] = 'docx'
131
-
132
- elif file_path.suffix.lower() in ['.txt', '.md']:
133
- text_content = file_path.read_text(encoding='utf-8', errors='ignore')[:10000]
134
- metadata['type'] = 'text'
135
-
136
- except Exception as e:
137
- error_msg = f"Could not read {file_path.name}: {e}"
138
- logger.warning(error_msg)
139
- if st and hasattr(st, 'session_state'): # Only use streamlit if available and in main thread
140
- try:
141
- st.warning(error_msg)
142
- except Exception:
143
- pass
144
-
145
- # Call progress callback if provided (for parallel processing tracking)
146
- if progress_callback:
147
- try:
148
- progress_callback(file_path.name)
149
- except Exception:
150
- pass # Don't let callback errors affect processing
151
-
152
- return text_content, metadata
153
-
154
-
155
- def _process_file_with_context(args):
156
- """
157
- Thread-safe file processing function with proper context management
158
-
159
- Args:
160
- args: Tuple of (file_path, base_path, progress_callback)
161
-
162
- Returns:
163
- Tuple of (file_path_str, document_info) or None if failed
164
- """
165
- file_path, base_path, progress_callback = args
166
-
167
- try:
168
- # Extract text from file
169
- text, metadata = extract_text_from_file(file_path, progress_callback)
170
-
171
- if text:
172
- # Store relative path for display
173
- rel_path = file_path.relative_to(base_path)
174
- document_info = {
175
- 'text': text,
176
- 'content': text, # Alias for backward compatibility
177
- 'name': file_path.name,
178
- 'rel_path': str(rel_path),
179
- 'metadata': metadata
180
- }
181
- return str(file_path), document_info
182
- except Exception as e:
183
- logger.error(f"Error processing file {file_path.name}: {e}")
184
-
185
- return None
186
-
187
-
188
- def scan_data_room(data_room_path: str, max_workers: Optional[int] = None, progress_callback=None) -> Dict[str, Dict]:
189
- """
190
- Scan entire data room directory for documents using parallel processing
191
-
192
- Args:
193
- data_room_path: Path to the data room directory
194
- max_workers: Maximum number of worker threads (uses config default if None)
195
- progress_callback: Optional callback function for progress updates
196
-
197
- Returns:
198
- Dictionary mapping file paths to document information
199
- """
200
- config = get_config()
201
- if max_workers is None:
202
- max_workers = config.processing.max_workers
203
-
204
- documents = {}
205
- path = Path(data_room_path)
206
-
207
- if not path.exists():
208
- return documents
209
-
210
- # Collect all document files first
211
- file_paths = []
212
- for file_path in path.rglob('*'):
213
- if file_path.is_file() and not file_path.name.startswith('.'):
214
- if file_path.suffix.lower() in config.processing.supported_file_extensions:
215
- file_paths.append(file_path)
216
-
217
- if not file_paths:
218
- return documents
219
-
220
- logger.info(f"Processing {len(file_paths)} files with {max_workers} workers")
221
-
222
- # Prepare arguments for parallel processing
223
- process_args = [(file_path, path, progress_callback) for file_path in file_paths]
224
-
225
- # Process files in parallel
226
- processed_count = 0
227
- failed_count = 0
228
-
229
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
230
- # Submit all tasks
231
- future_to_file = {}
232
-
233
- for args in process_args:
234
- future = executor.submit(_process_file_with_context, args)
235
-
236
- # Add Streamlit context if available
237
- if STREAMLIT_CONTEXT_AVAILABLE:
238
- try:
239
- script_ctx = get_script_run_ctx()
240
- if script_ctx:
241
- add_script_run_ctx(future)
242
- except Exception as e:
243
- logger.warning(f"Could not add script context: {e}")
244
-
245
- future_to_file[future] = args[0] # Store file_path for reference
246
-
247
- # Collect results as they complete
248
- for future in concurrent.futures.as_completed(future_to_file):
249
- try:
250
- result = future.result(timeout=config.processing.file_timeout)
251
- if result:
252
- file_path_str, document_info = result
253
- documents[file_path_str] = document_info
254
- processed_count += 1
255
- else:
256
- failed_count += 1
257
- except concurrent.futures.TimeoutError:
258
- file_path = future_to_file[future]
259
- logger.error(f"Timeout processing file: {file_path.name}")
260
- failed_count += 1
261
- except Exception as e:
262
- file_path = future_to_file[future]
263
- logger.error(f"Error processing file {file_path.name}: {e}")
264
- failed_count += 1
265
-
266
- logger.info(f"Completed processing: {processed_count} successful, {failed_count} failed")
267
- return documents
268
-
269
-
270
- def create_chunks_with_metadata(documents: Dict[str, Dict], chunk_size: int = 2000, overlap: int = 200) -> List[Dict]:
271
- """
272
- Create searchable chunks with semantic splitting and full metadata.
273
- Uses RecursiveCharacterTextSplitter for better context preservation.
274
-
275
- Args:
276
- documents: Dictionary of documents
277
- chunk_size: Size of each chunk in characters (default: 2000 for ~400 words)
278
- overlap: Overlap between chunks in characters (default: 200 for ~50 words)
279
-
280
- Returns:
281
- List of chunk dictionaries with metadata
282
  """
283
- chunks = []
284
-
285
- # Initialize semantic text splitter with hierarchical separators
286
- # This preserves document structure by prioritizing paragraph breaks,
287
- # then sentences, then words
288
- text_splitter = RecursiveCharacterTextSplitter(
289
- chunk_size=chunk_size,
290
- chunk_overlap=overlap,
291
- separators=["\n\n", "\n", ".", "!", "?", ",", " "],
292
- length_function=len,
293
- is_separator_regex=False,
294
- )
295
 
296
- for doc_path, doc_info in documents.items():
297
- text = doc_info['text']
298
-
299
- if not text.strip():
300
- continue
301
-
302
- # Split text using semantic boundaries
303
- semantic_chunks = text_splitter.split_text(text)
304
-
305
- # Create chunks with metadata
306
- for i, chunk_text in enumerate(semantic_chunks):
307
- if chunk_text.strip():
308
- chunks.append({
309
- 'text': chunk_text.strip(),
310
- 'source': doc_info['name'],
311
- 'path': doc_info['rel_path'],
312
- 'full_path': doc_path,
313
- 'chunk_id': f"semantic_chunk_{i}",
314
- 'metadata': doc_info['metadata']
315
- })
316
-
317
- return chunks
318
-
319
-
320
- def create_embeddings_batch(texts: List[str], model: SentenceTransformer, batch_size: Optional[int] = None) -> np.ndarray:
321
  """
322
- Create embeddings for texts in batches for better performance
323
 
324
- Args:
325
- texts: List of texts to embed
326
- model: SentenceTransformer model
327
- batch_size: Batch size for processing
328
 
329
- Returns:
330
- NumPy array of embeddings
331
- """
332
- # Set default batch_size from config if None
333
- if batch_size is None:
334
  config = get_config()
335
- batch_size = config.processing.batch_size
336
-
337
- embeddings_list = []
338
-
339
- for i in range(0, len(texts), batch_size):
340
- batch = texts[i:i + batch_size]
341
- batch_embeddings = model.encode(batch)
342
- embeddings_list.append(batch_embeddings)
343
-
344
- return np.vstack(embeddings_list) if embeddings_list else np.array([])
345
-
346
-
347
- def search_documents_with_faiss(
348
- query: str,
349
- chunks: List[Dict],
350
- faiss_index: faiss.IndexFlatIP,
351
- model: SentenceTransformer,
352
- top_k: int = 5,
353
- threshold: Optional[float] = None
354
- ) -> List[Dict]:
355
- """
356
- Search documents using FAISS IndexFlatIP for fast similarity search
357
-
358
- Args:
359
- query: Search query
360
- chunks: List of document chunks
361
- faiss_index: FAISS index with embeddings
362
- model: SentenceTransformer model
363
- top_k: Number of top results to return
364
- threshold: Minimum similarity threshold (uses config default if None)
365
-
366
- Returns:
367
- List of search results with citations
368
- """
369
- if not chunks or faiss_index is None:
370
- return []
371
-
372
- config = get_config()
373
- if threshold is None:
374
- threshold = config.processing.similarity_threshold
375
-
376
- # Encode query and normalize for inner product similarity
377
- query_embedding = model.encode(query).astype('float32')
378
- query_embedding = query_embedding.reshape(1, -1)
379
-
380
- # Normalize for cosine similarity using inner product
381
- faiss.normalize_L2(query_embedding)
382
-
383
- # Search using FAISS (much faster than numpy)
384
- scores, indices = faiss_index.search(query_embedding, min(top_k * 2, len(chunks)))
385
-
386
- results = []
387
- seen_texts = set()
388
-
389
- for score, idx in zip(scores[0], indices[0]):
390
- if idx == -1 or score < threshold: # -1 indicates no more results
391
- continue
392
-
393
- # Avoid duplicates
394
- text_preview = chunks[idx]['text'][:100]
395
- if text_preview not in seen_texts:
396
- seen_texts.add(text_preview)
397
-
398
- # Format citation based on file type
399
- metadata = chunks[idx]['metadata']
400
- if metadata['type'] == 'pdf' and metadata.get('pages'):
401
- citation = f"page {metadata['pages'][0]}"
402
- else:
403
- citation = "document"
404
-
405
- results.append({
406
- 'text': chunks[idx]['text'],
407
- 'source': chunks[idx]['source'],
408
- 'path': chunks[idx]['path'],
409
- 'full_path': chunks[idx].get('full_path', ''),
410
- 'citation': citation,
411
- 'score': float(score)
412
- })
413
-
414
- if len(results) >= top_k:
415
- break
416
-
417
- return results
418
-
419
-
420
- def search_documents_with_citations(
421
- query: str,
422
- chunks: List[Dict],
423
- embeddings: np.ndarray,
424
- model: SentenceTransformer,
425
- top_k: int = 5,
426
- threshold: Optional[float] = None
427
- ) -> List[Dict]:
428
- """
429
- Legacy search documents function - kept for backward compatibility
430
- Creates temporary FAISS index and uses FAISS search for better performance
431
-
432
- Args:
433
- query: Search query
434
- chunks: List of document chunks
435
- embeddings: Precomputed embeddings for chunks
436
- model: SentenceTransformer model
437
- top_k: Number of top results to return
438
- threshold: Minimum similarity threshold
439
 
440
- Returns:
441
- List of search results with citations
442
- """
443
- if not chunks:
444
- return []
445
-
446
- # Create temporary FAISS index for better performance
447
- embeddings_f32 = embeddings.astype('float32')
448
- faiss.normalize_L2(embeddings_f32) # Normalize for cosine similarity
449
-
450
- index = faiss.IndexFlatIP(embeddings_f32.shape[1])
451
- index.add(embeddings_f32)
452
 
453
- return search_documents_with_faiss(query, chunks, index, model, top_k, threshold)
454
-
455
-
456
- def create_progress_tracker(total_files: int = 0, streamlit_progress_bar=None):
457
- """
458
- Create a thread-safe progress tracking function
459
 
460
- Args:
461
- total_files: Total number of files to process
462
- streamlit_progress_bar: Optional Streamlit progress bar
 
463
 
464
- Returns:
465
- Progress callback function
466
- """
467
- processed_count = [0] # Use list for mutable counter in closure
468
- lock = threading.Lock()
469
-
470
- def progress_callback(filename: str = None):
471
- with lock:
472
- processed_count[0] += 1
473
- progress = processed_count[0] / max(total_files, 1)
474
-
475
- if streamlit_progress_bar and hasattr(st, 'session_state'):
476
- try:
477
- streamlit_progress_bar.progress(
478
- min(progress, 1.0),
479
- text=f"Processing {filename or 'documents'}... ({processed_count[0]}/{total_files})"
480
- )
481
- except Exception:
482
- pass # Don't let UI errors affect processing
483
-
484
- return progress_callback
485
-
486
-
487
- def _generate_cache_key(documents: Dict[str, Dict]) -> str:
488
- """
489
- Generate a cache key based on document paths and modification times
490
-
491
- Args:
492
- documents: Dictionary of documents with file paths
493
 
494
- Returns:
495
- Cache key string
496
- """
497
- # Create a hash based on file paths and their modification times
498
- cache_data = []
499
-
500
- for file_path, doc_info in documents.items():
501
  try:
502
- path_obj = Path(file_path)
503
- if path_obj.exists():
504
- mtime = path_obj.stat().st_mtime
505
- cache_data.append(f"{file_path}:{mtime}")
506
  except Exception as e:
507
- logger.warning(f"Could not get modification time for {file_path}: {e}")
508
- # Use current time as fallback
509
- cache_data.append(f"{file_path}:{time.time()}")
510
-
511
- # Sort to ensure consistent hashing regardless of document order
512
- cache_data.sort()
513
- cache_string = "|".join(cache_data)
514
-
515
- # Generate MD5 hash for the cache key
516
- return hashlib.md5(cache_string.encode('utf-8')).hexdigest()
517
-
518
-
519
- def _get_cache_dir() -> Path:
520
- """Get or create the cache directory"""
521
- cache_dir = Path(".cache")
522
- cache_dir.mkdir(exist_ok=True)
523
- return cache_dir
524
-
525
-
526
- def _save_embeddings_to_cache(cache_key: str, embeddings: np.ndarray, chunks: List[Dict]) -> bool:
527
- """
528
- Save embeddings and chunks to cache
529
-
530
- Args:
531
- cache_key: Cache key for the data
532
- embeddings: Embeddings array to cache
533
- chunks: Document chunks to cache
534
-
535
- Returns:
536
- True if successful, False otherwise
537
- """
538
- try:
539
- cache_dir = _get_cache_dir()
540
- cache_file = cache_dir / f"embeddings_{cache_key}.joblib"
541
-
542
- cache_data = {
543
- 'embeddings': embeddings,
544
- 'chunks': chunks,
545
- 'timestamp': time.time(),
546
- 'cache_key': cache_key
547
- }
548
-
549
- joblib.dump(cache_data, cache_file, compress=3)
550
- logger.info(f"Saved embeddings to cache: {cache_file}")
551
- return True
552
-
553
- except Exception as e:
554
- logger.error(f"Failed to save embeddings to cache: {e}")
555
- return False
556
-
557
-
558
- def _load_embeddings_from_cache(cache_key: str) -> Tuple[Optional[np.ndarray], Optional[List[Dict]]]:
559
- """
560
- Load embeddings and chunks from cache
561
 
562
- Args:
563
- cache_key: Cache key for the data
564
-
565
- Returns:
566
- Tuple of (embeddings, chunks) or (None, None) if not found
567
- """
568
- try:
569
- cache_dir = _get_cache_dir()
570
- cache_file = cache_dir / f"embeddings_{cache_key}.joblib"
571
-
572
- if not cache_file.exists():
573
- return None, None
574
-
575
- cache_data = joblib.load(cache_file)
576
 
577
- # Validate cache data structure
578
- if not all(key in cache_data for key in ['embeddings', 'chunks', 'timestamp', 'cache_key']):
579
- logger.warning(f"Invalid cache data structure in {cache_file}")
580
- return None, None
581
-
582
- # Check if cache key matches (additional validation)
583
- if cache_data['cache_key'] != cache_key:
584
- logger.warning(f"Cache key mismatch in {cache_file}")
585
- return None, None
586
 
587
- logger.info(f"Loaded embeddings from cache: {cache_file}")
588
- return cache_data['embeddings'], cache_data['chunks']
589
-
590
- except Exception as e:
591
- logger.error(f"Failed to load embeddings from cache: {e}")
592
- return None, None
593
-
594
-
595
- def _invalidate_old_cache_files(max_age_days: int = 7) -> None:
596
- """
597
- Remove old cache files to prevent cache directory from growing too large
598
-
599
- Args:
600
- max_age_days: Maximum age of cache files in days
601
- """
602
- try:
603
- cache_dir = _get_cache_dir()
604
- current_time = time.time()
605
- max_age_seconds = max_age_days * 24 * 60 * 60
606
-
607
- for cache_file in cache_dir.glob("embeddings_*.joblib"):
608
- try:
609
- file_age = current_time - cache_file.stat().st_mtime
610
- if file_age > max_age_seconds:
611
- cache_file.unlink()
612
- logger.info(f"Removed old cache file: {cache_file}")
613
- except Exception as e:
614
- logger.warning(f"Could not remove old cache file {cache_file}: {e}")
615
-
616
- except Exception as e:
617
- logger.error(f"Failed to invalidate old cache files: {e}")
618
-
619
-
620
- class DocumentProcessor:
621
- """
622
- Main document processing class that orchestrates document operations with parallel processing support
623
- Enhanced with FAISS for 10x faster similarity search
624
- """
625
-
626
- def __init__(self, model: Optional[SentenceTransformer] = None):
627
- """
628
- Initialize the document processor
629
-
630
- Args:
631
- model: SentenceTransformer model for embeddings (optional)
632
- """
633
- self.model = model
634
- self.documents = {}
635
- self.chunks = []
636
- self.embeddings = None
637
- self.faiss_index = None # FAISS index for fast similarity search
638
- self.performance_stats = {} # Track performance metrics
639
 
640
- def load_data_room(self, data_room_path: str, max_workers: Optional[int] = None, progress_callback=None) -> Dict[str, any]:
641
  """
642
- Load and process an entire data room with parallel processing
643
 
644
  Args:
645
  data_room_path: Path to the data room directory
646
- max_workers: Maximum number of worker threads (uses config default if None)
647
- progress_callback: Optional callback function for progress updates
648
 
649
  Returns:
650
  Dictionary with processing results including performance metrics
651
  """
652
  import time
 
 
653
  config = get_config()
654
- if max_workers is None:
655
- max_workers = config.processing.max_workers
656
 
657
- start_time = time.time()
 
 
658
 
659
- logger.info(f"Starting data room processing: {data_room_path}")
660
 
661
- # Scan documents with parallel processing
662
- self.documents = scan_data_room(
663
- data_room_path,
664
- max_workers=max_workers,
665
- progress_callback=progress_callback
666
- )
667
 
668
  scan_time = time.time() - start_time
669
- logger.info(f"Document scanning completed in {scan_time:.2f} seconds")
670
 
671
- # Create chunks
672
  chunk_start = time.time()
673
- self.chunks = create_chunks_with_metadata(self.documents)
674
  chunk_time = time.time() - chunk_start
 
675
 
676
- # Create embeddings if model is available
677
  embedding_time = 0
678
- cache_hit = False
679
-
680
- if self.model and self.chunks:
681
  embedding_start = time.time()
682
 
683
- # Try to load from cache first
684
- cache_key = _generate_cache_key(self.documents)
685
- cached_embeddings, cached_chunks = _load_embeddings_from_cache(cache_key)
686
-
687
- if cached_embeddings is not None and cached_chunks is not None:
688
- # Cache hit - use cached data
689
- self.embeddings = cached_embeddings
690
- # Verify chunks match (safety check)
691
- if len(cached_chunks) == len(self.chunks):
692
- self.chunks = cached_chunks
693
- cache_hit = True
694
- logger.info(f"Loaded embeddings from cache (key: {cache_key[:8]}...)")
695
- # Build FAISS index from cached embeddings
696
- self._build_faiss_index()
697
- else:
698
- logger.warning("Cached chunks length mismatch, regenerating embeddings")
699
 
700
- if not cache_hit:
701
- # Cache miss or invalid - generate new embeddings
702
- texts = [chunk['text'] for chunk in self.chunks]
703
- self.embeddings = create_embeddings_batch(texts, self.model)
704
-
705
- # Save to cache
706
- if _save_embeddings_to_cache(cache_key, self.embeddings, self.chunks):
707
- logger.info(f"Saved new embeddings to cache (key: {cache_key[:8]}...)")
708
-
709
- # Clean up old cache files
710
- _invalidate_old_cache_files()
711
 
712
- # Build FAISS index for fast similarity search
713
- self._build_faiss_index()
714
-
715
  embedding_time = time.time() - embedding_start
716
- cache_status = "from cache" if cache_hit else "generated"
717
- logger.info(f"Embeddings {cache_status} and FAISS index built in {embedding_time:.2f} seconds")
718
 
719
  total_time = time.time() - start_time
720
  logger.info(f"Total data room processing completed in {total_time:.2f} seconds")
721
 
722
  return {
723
- 'documents_count': len(self.documents),
724
- 'chunks_count': len(self.chunks),
725
- 'has_embeddings': self.embeddings is not None,
726
- 'performance': {
727
- 'total_time': total_time,
728
- 'scan_time': scan_time,
729
- 'chunk_time': chunk_time,
730
- 'embedding_time': embedding_time,
731
- 'documents_per_second': len(self.documents) / scan_time if scan_time > 0 else 0,
732
- 'cache_hit': cache_hit,
733
- 'cache_key': cache_key[:8] + "..." if 'cache_key' in locals() else None
734
- }
735
  }
736
 
737
- def _build_faiss_index(self) -> None:
738
- """
739
- Build FAISS IndexFlatIP for fast similarity search
740
- """
741
- if self.embeddings is None:
742
- logger.warning("No embeddings available to build FAISS index")
743
- return
744
-
745
- try:
746
- # Convert to float32 and normalize for cosine similarity via inner product
747
- embeddings_f32 = self.embeddings.astype('float32')
748
- faiss.normalize_L2(embeddings_f32)
749
-
750
- # Create FAISS index
751
- dimension = embeddings_f32.shape[1]
752
- self.faiss_index = faiss.IndexFlatIP(dimension)
753
- self.faiss_index.add(embeddings_f32)
754
-
755
- logger.info(f"Built FAISS index with {self.faiss_index.ntotal} vectors, dimension {dimension}")
756
-
757
- except Exception as e:
758
- logger.error(f"Failed to build FAISS index: {e}")
759
- self.faiss_index = None
760
-
761
- def faiss_search(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
762
  """
763
- Fast similarity search using FAISS IndexFlatIP
764
 
765
  Args:
766
  query: Search query
767
- top_k: Number of top results
768
  threshold: Minimum similarity threshold
769
 
770
  Returns:
771
- List of search results with citations
772
  """
773
- if not self.model or self.faiss_index is None:
 
774
  return []
775
 
776
- return search_documents_with_faiss(
777
- query, self.chunks, self.faiss_index, self.model, top_k, threshold
778
- )
779
-
780
- def search(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
781
- """
782
- Search documents using semantic similarity - uses FAISS if available, falls back to numpy
783
 
784
- Args:
785
- query: Search query
786
- top_k: Number of top results
787
- threshold: Minimum similarity threshold
788
 
789
- Returns:
790
- List of search results
791
- """
792
- if not self.model:
793
- return []
794
-
795
- # Use FAISS search if index is available (10x faster)
796
- if self.faiss_index is not None:
797
- return self.faiss_search(query, top_k, threshold)
798
- elif self.embeddings is not None:
799
- # Fallback to numpy-based search
800
- return search_documents_with_citations(
801
- query, self.chunks, self.embeddings, self.model, top_k, threshold
802
- )
803
- else:
804
  return []
805
 
806
- def get_statistics(self) -> Dict[str, any]:
807
- """Get processing statistics including performance metrics"""
808
  stats = {
809
  'total_documents': len(self.documents),
810
- 'total_chunks': len(self.chunks),
811
- 'has_embeddings': self.embeddings is not None,
812
- 'has_faiss_index': self.faiss_index is not None,
813
- 'faiss_index_size': self.faiss_index.ntotal if self.faiss_index is not None else 0,
814
- 'embedding_dimension': self.embeddings.shape[1] if self.embeddings is not None else 0
815
  }
816
 
817
  # Add performance metrics if available
@@ -820,33 +422,3 @@ class DocumentProcessor:
820
 
821
  return stats
822
 
823
- def load_data_room_with_progress(self, data_room_path: str, max_workers: Optional[int] = None,
824
- progress_bar=None) -> Dict[str, any]:
825
- """
826
- Load data room with Streamlit progress bar support
827
-
828
- Args:
829
- data_room_path: Path to the data room directory
830
- max_workers: Maximum number of worker threads
831
- progress_bar: Streamlit progress bar object
832
-
833
- Returns:
834
- Dictionary with processing results
835
- """
836
- # Count total files first for accurate progress tracking
837
- path = Path(data_room_path)
838
- if not path.exists():
839
- return {'documents_count': 0, 'chunks_count': 0, 'has_embeddings': False}
840
-
841
- total_files = sum(1 for file_path in path.rglob('*')
842
- if file_path.is_file() and not file_path.name.startswith('.')
843
- and file_path.suffix.lower() in ['.pdf', '.docx', '.doc', '.txt', '.md'])
844
-
845
- # Create progress tracker
846
- progress_callback = create_progress_tracker(total_files, progress_bar)
847
-
848
- # Load with progress tracking
849
- result = self.load_data_room(data_room_path, max_workers, progress_callback)
850
- self.performance_stats = result.get('performance', {})
851
-
852
- return result
 
1
  #!/usr/bin/env python3
2
  """
3
+ Streamlined Document Processing Module
4
 
5
+ This module provides a simplified document processing pipeline with:
6
+ - Direct LangChain loader integration with glob patterns
7
+ - Built-in FAISS vector storage without external file tracking
8
+ - Semantic text chunking using RecursiveCharacterTextSplitter
9
+ - Consolidated document metadata handling
10
  """
11
 
12
  import os
13
+ import warnings
14
  # Fix tokenizers parallelism warning
15
  os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
16
 
17
  import logging
18
 
19
+ # Suppress verbose LangChain warnings and output
20
+ warnings.filterwarnings("ignore", category=UserWarning, module="langchain")
21
+ warnings.filterwarnings("ignore", category=UserWarning, module="langchain_core")
22
+ warnings.filterwarnings("ignore", category=UserWarning, module="langchain_community")
23
+ warnings.filterwarnings("ignore", message=".*Relevance scores must be between.*")
24
+ warnings.filterwarnings("ignore", message=".*No relevant docs were retrieved.*")
25
+
26
+ # Set LangChain logging to WARNING level to reduce verbosity
27
+ logging.getLogger("langchain").setLevel(logging.WARNING)
28
+ logging.getLogger("langchain_core").setLevel(logging.WARNING)
29
+ logging.getLogger("langchain_community").setLevel(logging.WARNING)
30
+ logging.getLogger("langchain_huggingface").setLevel(logging.WARNING)
31
+ import re
32
+
33
+ from pathlib import Path
34
+ from typing import Dict, List, Optional, Any, Callable
35
+ from datetime import datetime
36
+
37
+ # LangChain imports
38
+ from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader, Docx2txtLoader, TextLoader
39
+ from langchain_community.vectorstores import FAISS
40
+ from langchain_huggingface import HuggingFaceEmbeddings
41
+ from langchain_core.documents import Document
42
  from langchain_text_splitters import RecursiveCharacterTextSplitter
43
 
44
  # Import configuration
45
  from .config import get_config
46
 
47
+ # Error handling helpers (merged from error_handlers.py) are defined below
48
+
49
+
50
  logger = logging.getLogger(__name__)
51
 
52
+
53
+ # =============================================================================
54
+ # ERROR HANDLING UTILITIES - Merged from error_handlers.py
55
+ # =============================================================================
56
+
57
+ def safe_execute(func: Callable, default: Any = None, context: str = "", log_errors: bool = True) -> Any:
58
+ """
59
+ Execute a function with basic error handling and logging
60
+
61
+ Args:
62
+ func: Function to execute
63
+ default: Value to return on error
64
+ context: Brief description for logs
65
+ log_errors: Whether to log errors
66
+
67
+ Returns:
68
+ Function result or default value on error
69
+ """
70
+ try:
71
+ return func()
72
+ except Exception as e:
73
+ if log_errors:
74
+ logger.error(f"{context or func.__name__}: {e}")
75
+ return default
76
 
77
 
78
  def escape_markdown_math(text: str) -> str:
 
89
  return text
90
 
91
 
92
+ class DocumentProcessor:
93
  """
94
+ Streamlined document processing class with integrated FAISS vector storage
95
 
96
+ This class consolidates all document processing functionality including:
97
+ - Document loading using LangChain's DirectoryLoader with glob patterns
98
+ - Semantic text chunking with RecursiveCharacterTextSplitter
99
+ - FAISS vector storage for similarity search
100
+ - Document metadata handling
101
  """
 
102
 
103
+ def __init__(self, model_name: Optional[str] = None, store_name: Optional[str] = None):
104
+ """
105
+ Initialize the document processor
 
106
 
107
+ Args:
108
+ model_name: Name of the sentence transformer model for embeddings (optional)
109
+ store_name: Name for the FAISS store (optional, uses config default)
110
+ """
 
111
  config = get_config()
112
+ self.model_name = model_name or config.model.sentence_transformer_model
113
+ self.store_name = store_name or config.processing.faiss_store_name
114
+
115
+ # Initialize components
116
+ self.documents: List[Document] = []
117
+ self.vector_store: Optional[FAISS] = None
118
+ self.embeddings: Optional[HuggingFaceEmbeddings] = None
119
+ self.text_splitter: Optional[RecursiveCharacterTextSplitter] = None
120
+ self.performance_stats = {}
121
+
122
+ # Convenience properties for backward compatibility
123
+ self.chunks = [] # Will be populated after processing
124
+
125
+ # Initialize text splitter with semantic boundaries
126
+ self._init_text_splitter()
127
+
128
+ # Initialize embeddings if model name provided
129
+ if self.model_name:
130
+ self.embeddings = HuggingFaceEmbeddings(model_name=self.model_name)
131
+ logger.info(f"Initialized embeddings with model: {self.model_name}")
132
+ else:
133
+ logger.warning("No model name provided - embeddings not initialized")
134
 
135
+ # Try to load existing FAISS store
136
+ self._load_existing_store()
 
138
+ def _init_text_splitter(self):
139
+ """Initialize the text splitter with optimal settings for semantic chunking"""
140
+ config = get_config()
141
+ self.text_splitter = RecursiveCharacterTextSplitter(
142
+ chunk_size=config.processing.chunk_size,
143
+ chunk_overlap=config.processing.chunk_overlap,
144
+ separators=["\\n\\n", "\\n", ".", "!", "?", ",", " "],
145
+ length_function=len,
146
+ is_separator_regex=False,
147
+ )
148
+ logger.info(f"Initialized text splitter: {config.processing.chunk_size} chars, {config.processing.chunk_overlap} overlap")
149
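The splitter configured above falls back through the separator list, so paragraph breaks are preferred over sentence and word breaks. A minimal standalone sketch with the default sizes (1000-character chunks, 200-character overlap; the sample text is invented):

from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ".", "!", "?", ",", " "],
)
sample = ("Overview of the target company.\n\n"
          "Key contracts and customer concentration details.\n\n") * 30
chunks = splitter.split_text(sample)
print(len(chunks), max(len(c) for c in chunks))  # a few chunks, each at most ~1000 characters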
 
150
+ def _load_existing_store(self):
151
+ """Load existing FAISS store if available"""
152
+ if not self.embeddings:
153
+ return
154
 
155
+ config = get_config()
156
+ faiss_dir = Path(config.paths.data_dir) / "enhanced_faiss"
157
+ faiss_index_path = faiss_dir / f"{self.store_name}.faiss"
158
+ faiss_pkl_path = faiss_dir / f"{self.store_name}.pkl"
159
 
 
 
 
 
 
 
 
160
  try:
161
+ if faiss_index_path.exists() and faiss_pkl_path.exists():
162
+ self.vector_store = FAISS.load_local(
163
+ str(faiss_dir),
164
+ self.embeddings,
165
+ index_name=self.store_name,
166
+ allow_dangerous_deserialization=True # Safe: we created these files ourselves
167
+ )
168
+ logger.info(f"Loaded existing FAISS store: {self.store_name} with {self.vector_store.index.ntotal} vectors")
169
+ else:
170
+ logger.info(f"No existing FAISS store found for: {self.store_name}")
171
  except Exception as e:
172
+ logger.error(f"Failed to load FAISS store: {e}")
173
+ self.vector_store = None
174
 
175
+ def _save_store(self):
176
+ """Save FAISS store to disk"""
177
+ if not self.vector_store:
178
+ return
179
 
180
+ try:
181
+ config = get_config()
182
+ faiss_dir = Path(config.paths.data_dir) / "enhanced_faiss"
183
+ faiss_dir.mkdir(parents=True, exist_ok=True)
 
185
+ self.vector_store.save_local(
186
+ str(faiss_dir),
187
+ index_name=self.store_name
188
+ )
189
+ logger.info(f"Saved FAISS store: {self.store_name} with {self.vector_store.index.ntotal} vectors")
190
+ except Exception as e:
191
+ logger.error(f"Failed to save FAISS store: {e}")
 
193
+ def load_data_room(self, data_room_path: str, progress_bar=None) -> Dict[str, Any]:
194
  """
195
+ Load and process an entire data room using DirectoryLoader with glob patterns
196
 
197
  Args:
198
  data_room_path: Path to the data room directory
199
+ progress_bar: Optional Streamlit progress bar object
 
200
 
201
  Returns:
202
  Dictionary with processing results including performance metrics
203
  """
204
  import time
205
+ start_time = time.time()
206
+
207
  config = get_config()
208
+ data_room_path = Path(data_room_path)
 
209
 
210
+ if not data_room_path.exists():
211
+ logger.error(f"Data room path does not exist: {data_room_path}")
212
+ return {'documents_count': 0, 'chunks_count': 0, 'has_embeddings': False}
213
 
214
+ logger.info(f"Starting streamlined data room processing: {data_room_path}")
215
 
216
+ # Clear existing documents
217
+ self.documents = []
218
+ documents_loaded = 0
219
+
220
+ # Load documents by file type using DirectoryLoader with glob patterns
221
+ supported_extensions = config.processing.supported_file_extensions
222
+
223
+ for ext in supported_extensions:
224
+ try:
225
+ # Create glob pattern for this extension
226
+ glob_pattern = f"**/*{ext}"
227
+
228
+ # Choose appropriate loader based on extension
229
+ if ext == '.pdf':
230
+ loader_cls = PyPDFLoader
231
+ elif ext in ['.docx', '.doc']:
232
+ loader_cls = Docx2txtLoader
233
+ elif ext in ['.txt', '.md']:
234
+ loader_cls = TextLoader
235
+ else:
236
+ continue
237
+
238
+ # Use DirectoryLoader with glob pattern
239
+ loader = DirectoryLoader(
240
+ str(data_room_path),
241
+ glob=glob_pattern,
242
+ loader_cls=loader_cls,
243
+ loader_kwargs={'encoding': 'utf-8'} if ext in ['.txt', '.md'] else {},
244
+ recursive=True,
245
+ show_progress=False, # Disable verbose progress output
246
+ use_multithreading=True
247
+ )
248
+
249
+ # Load documents for this extension
250
+ docs = safe_execute(
251
+ lambda: loader.load(),
252
+ default=[],
253
+ context=f"Loading {ext} files"
254
+ )
255
+
256
+ if docs:
257
+ # Add relative path information to metadata
258
+ for doc in docs:
259
+ if 'source' in doc.metadata:
260
+ source_path = Path(doc.metadata['source'])
261
+ if source_path.exists():
262
+ try:
263
+ rel_path = source_path.relative_to(data_room_path)
264
+ doc.metadata['path'] = str(rel_path)
265
+ doc.metadata['name'] = source_path.name
266
+ except ValueError:
267
+ # If relative path fails, use original source
268
+ doc.metadata['path'] = doc.metadata['source']
269
+ doc.metadata['name'] = source_path.name
270
+
271
+ self.documents.extend(docs)
272
+ documents_loaded += len(docs)
273
+ logger.info(f"Loaded {len(docs)} {ext} documents")
274
+
275
+ except Exception as e:
276
+ logger.error(f"Error loading {ext} files: {e}")
277
 
278
  scan_time = time.time() - start_time
279
+ logger.info(f"Document loading completed in {scan_time:.2f} seconds")
280
 
281
+ # Split documents into chunks using the text splitter
282
  chunk_start = time.time()
283
+ if self.documents and self.text_splitter:
284
+ self.documents = self.text_splitter.split_documents(self.documents)
285
+
286
+ # Add chunk metadata and populate chunks for backward compatibility
287
+ self.chunks = []
288
+ for i, doc in enumerate(self.documents):
289
+ doc.metadata['chunk_id'] = f"chunk_{i}"
290
+ doc.metadata['processed_at'] = datetime.now().isoformat()
291
+
292
+ # Add citation information if available
293
+ if 'page' in doc.metadata:
294
+ doc.metadata['citation'] = f"page {doc.metadata['page']}"
295
+ else:
296
+ doc.metadata['citation'] = doc.metadata.get('name', 'document')
297
+
298
+ # Create chunk dict for backward compatibility
299
+ chunk_dict = {
300
+ 'text': doc.page_content,
301
+ 'source': doc.metadata.get('name', ''),
302
+ 'path': doc.metadata.get('path', ''),
303
+ 'full_path': doc.metadata.get('source', ''),
304
+ 'metadata': doc.metadata
305
+ }
306
+ self.chunks.append(chunk_dict)
307
+
308
  chunk_time = time.time() - chunk_start
309
+ logger.info(f"Text splitting completed in {chunk_time:.2f} seconds")
310
 
311
+ # Create or update FAISS vector store
312
  embedding_time = 0
313
+ if self.embeddings and self.documents:
 
 
314
  embedding_start = time.time()
315
 
316
+ if self.vector_store is None:
317
+ # Create new FAISS store
318
+ self.vector_store = FAISS.from_documents(self.documents, self.embeddings)
319
+ logger.info(f"Created new FAISS store with {len(self.documents)} documents")
320
+ else:
321
+ # Add documents to existing store
322
+ self.vector_store.add_documents(self.documents)
323
+ logger.info(f"Added {len(self.documents)} documents to existing FAISS store")
 
 
 
 
 
 
 
 
324
 
325
+ # Save the updated store
326
+ self._save_store()
 
 
 
 
 
 
 
 
 
327
 
 
 
 
328
  embedding_time = time.time() - embedding_start
329
+ logger.info(f"FAISS processing completed in {embedding_time:.2f} seconds")
 
330
 
331
  total_time = time.time() - start_time
332
  logger.info(f"Total data room processing completed in {total_time:.2f} seconds")
333
 
334
+ # Store performance stats
335
+ self.performance_stats = {
336
+ 'total_time': total_time,
337
+ 'scan_time': scan_time,
338
+ 'chunk_time': chunk_time,
339
+ 'embedding_time': embedding_time,
340
+ 'documents_per_second': documents_loaded / scan_time if scan_time > 0 else 0
341
+ }
342
+
343
  return {
344
+ 'documents_count': documents_loaded,
345
+ 'chunks_count': len(self.documents),
346
+ 'total_chunks_in_store': self.vector_store.index.ntotal if self.vector_store else 0,
347
+ 'has_embeddings': self.vector_store is not None,
348
+ 'performance': self.performance_stats
 
 
 
 
 
 
 
349
  }
350
 
351
+ def search(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  """
353
+ Search documents using FAISS similarity search
354
 
355
  Args:
356
  query: Search query
357
+ top_k: Number of top results to return
358
  threshold: Minimum similarity threshold
359
 
360
  Returns:
361
+ List of search results with scores and metadata
362
  """
363
+ if not self.vector_store:
364
+ logger.warning("FAISS vector store not available for search")
365
  return []
366
 
367
+ config = get_config()
368
+ if threshold is None:
369
+ threshold = config.processing.similarity_threshold
 
 
 
 
370
 
371
+ try:
372
+ # Perform similarity search with scores
373
+ docs_and_scores = self.vector_store.similarity_search_with_score(query, k=top_k*2)
 
374
 
375
+ results = []
376
+ seen_texts = set()
377
+
378
+ for doc, score in docs_and_scores:
379
+ # Convert FAISS distance to similarity score (higher is better)
380
+ similarity_score = 1.0 / (1.0 + score) if score >= 0 else 1.0
381
+
382
+ if similarity_score < threshold:
383
+ continue
384
+
385
+ # Avoid duplicates based on text content
386
+ text_preview = doc.page_content[:100]
387
+ if text_preview not in seen_texts:
388
+ seen_texts.add(text_preview)
389
+
390
+ results.append({
391
+ 'text': doc.page_content,
392
+ 'source': doc.metadata.get('name', ''),
393
+ 'path': doc.metadata.get('path', ''),
394
+ 'full_path': doc.metadata.get('source', ''),
395
+ 'citation': doc.metadata.get('citation', 'document'),
396
+ 'score': float(similarity_score),
397
+ 'metadata': doc.metadata
398
+ })
399
+
400
+ if len(results) >= top_k:
401
+ break
402
+
403
+ return results
404
+
405
+ except Exception as e:
406
+ logger.error(f"Failed to search FAISS store: {e}")
407
  return []
408
 
409
+ def get_statistics(self) -> Dict[str, Any]:
410
+ """Get processing statistics"""
411
  stats = {
412
  'total_documents': len(self.documents),
413
+ 'total_vectors_in_store': self.vector_store.index.ntotal if self.vector_store else 0,
414
+ 'has_embeddings': self.vector_store is not None,
415
+ 'store_name': self.store_name,
416
+ 'model_name': self.model_name
 
417
  }
418
 
419
  # Add performance metrics if available
 
422
 
423
  return stats
424
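Note: the persistence round-trip these helpers rely on can be exercised directly with LangChain's FAISS wrapper. A minimal sketch, assuming HuggingFaceEmbeddings and an illustrative store_dir / store_name (not the app's actual config values):

from pathlib import Path
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")  # assumed model
store_dir = Path("data/enhanced_faiss")   # hypothetical folder
store_name = "example_store"              # hypothetical index name

# save_local writes <store_name>.faiss and <store_name>.pkl into store_dir
store = FAISS.from_documents([Document(page_content="Articles of Incorporation")], embeddings)
store_dir.mkdir(parents=True, exist_ok=True)
store.save_local(str(store_dir), index_name=store_name)

# On the next run, load_local restores the index; the pickle load must be opted into
restored = FAISS.load_local(
    str(store_dir),
    embeddings,
    index_name=store_name,
    allow_dangerous_deserialization=True,  # acceptable only for files we wrote ourselves
)
print(restored.index.ntotal)  # 1 vector persisted and reloaded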
src/services.py CHANGED
@@ -2,408 +2,346 @@
2
  """
3
  Business Logic Services Module
4
 
5
- This module contains the core business logic services for the DD-Checklist application.
6
- Services handle specific domain operations and coordinate between different components.
7
  """
8
 
9
  import re
10
- import json
 
11
  from pathlib import Path
12
- from typing import Dict, List, Optional, Any, Tuple
13
- import numpy as np
14
- from sentence_transformers import SentenceTransformer
15
- import faiss
16
 
17
  from .config import get_config
18
- from src.document_processing import DocumentProcessor, escape_markdown_math
19
 
 
 
 
20
 
21
- class ChecklistParser:
22
- """Service for parsing due diligence checklists"""
23
 
24
- @staticmethod
25
- def parse_checklist(checklist_text: str) -> Dict:
26
- """
27
- Parse markdown checklist into categories and items
28
-
29
- Args:
30
- checklist_text: Raw checklist text in markdown format
31
-
32
- Returns:
33
- Dictionary with parsed categories and items
34
- """
35
- categories = {}
36
- current_category = None
37
-
38
- for line in checklist_text.split('\n'):
39
- # Category header (e.g., "A. Corporate Organization" or "## A. Corporate Organization")
40
- # Try both formats
41
- match = None
42
- if line.startswith('## '):
43
- match = re.match(r'## ([A-Z])\. (.+)', line)
44
- elif line.strip() and not line.startswith('\t') and not line.startswith(' '):
45
- # Try plain format (no ##)
46
- match = re.match(r'^([A-Z])\. (.+)', line.strip())
47
-
48
  if match:
49
  letter, name = match.groups()
50
- current_category = letter
51
- categories[letter] = {
52
- 'name': name.strip(),
53
- 'items': []
54
- }
55
- # Numbered items (may be indented with tabs or spaces)
56
- elif current_category:
57
- # Check for numbered items with various indentation
58
- line_stripped = line.strip()
59
- if re.match(r'^\d+\.', line_stripped):
60
- item_text = re.sub(r'^\d+\.\s*', '', line_stripped)
61
- if item_text:
62
- # Clean up [bracketed] content but keep the text
63
- clean_text = re.sub(r'\[.*?\]', '', item_text).strip()
64
- if not clean_text:
65
- clean_text = item_text
66
- categories[current_category]['items'].append({
67
- 'text': clean_text,
68
- 'original': item_text
69
- })
70
 
71
- return categories
72
 
73
 
74
- class QuestionParser:
75
- """Service for parsing due diligence questions"""
 
 
 
 
 
76
 
77
- @staticmethod
78
- def parse_questions(questions_text: str) -> List[Dict]:
79
- """
80
- Parse markdown questions into a list of questions with categories
81
-
82
- Args:
83
- questions_text: Raw questions text in markdown format
84
-
85
- Returns:
86
- List of parsed questions with categories
87
- """
88
- questions = []
89
- current_category = None
90
-
91
- for line in questions_text.split('\n'):
92
- # Category header (e.g., "### A. Organizational and Corporate Documents")
93
- if line.startswith('### '):
94
- match = re.match(r'### ([A-Z])\. (.+)', line)
95
- if match:
96
- letter, name = match.groups()
97
- current_category = f"{letter}. {name.strip()}"
98
- # Question lines (numbered items)
99
- elif current_category and line.strip():
100
- match = re.match(r'^\d+\.\s+(.+)', line.strip())
101
- if match:
102
- question_text = match.group(1).strip()
103
- if question_text:
104
- questions.append({
105
- 'category': current_category,
106
- 'question': question_text,
107
- 'id': f"q_{len(questions)}"
108
- })
109
-
110
- return questions
 
111
 
112
 
113
- class ChecklistMatcher:
114
- """Service for matching checklists to documents"""
 
 
115
 
116
- def __init__(self, model: SentenceTransformer):
117
- """
118
- Initialize the matcher
119
-
120
- Args:
121
- model: SentenceTransformer model for embeddings
122
- """
123
- self.model = model
124
 
125
- def match_checklist_to_documents(
126
- self,
127
- checklist: Dict,
128
- chunks: List[Dict],
129
- embeddings: np.ndarray,
130
- threshold: Optional[float] = None
131
- ) -> Dict:
132
- """
133
- Match each checklist item to relevant documents using FAISS for 10x faster similarity search
134
-
135
- Args:
136
- checklist: Parsed checklist
137
- chunks: Document chunks
138
- embeddings: Precomputed embeddings
139
- threshold: Similarity threshold (uses config default if None)
140
 
141
- Returns:
142
- Matching results
143
- """
144
- config = get_config()
145
- if threshold is None:
146
- threshold = config.processing.similarity_threshold
147
-
148
- # Build FAISS index for fast similarity search
149
- embeddings_f32 = embeddings.astype('float32')
150
- faiss.normalize_L2(embeddings_f32) # Normalize for cosine similarity
151
- dimension = embeddings_f32.shape[1]
152
- faiss_index = faiss.IndexFlatIP(dimension)
153
- faiss_index.add(embeddings_f32)
154
-
155
- results = {}
156
-
157
- for cat_letter, category in checklist.items():
158
- cat_results = {
159
- 'name': category['name'],
160
- 'items': [],
161
- 'total_items': len(category['items']),
162
- 'matched_items': 0
163
- }
164
 
165
- for item_idx, item in enumerate(category['items']):
166
- # Encode checklist item with category context
167
- item_text = f"{category['name']} {item['text']}"
168
- item_embedding = self.model.encode(item_text).astype('float32').reshape(1, -1)
169
- faiss.normalize_L2(item_embedding)
170
-
171
- # Use FAISS for fast similarity search
172
- scores, indices = faiss_index.search(item_embedding, len(chunks))
173
-
174
- # Get unique documents that match
175
- doc_matches = {}
176
- for score, idx in zip(scores[0], indices[0]):
177
- if idx == -1 or score < threshold:
178
- continue
179
-
180
- doc_path = chunks[idx]['path']
181
- if doc_path not in doc_matches or score > doc_matches[doc_path]['score']:
182
- doc_matches[doc_path] = {
183
- 'name': chunks[idx]['source'],
184
- 'path': doc_path,
185
- 'full_path': chunks[idx].get('full_path', doc_path),
186
- 'score': float(score),
187
- 'metadata': chunks[idx]['metadata']
188
- }
189
-
190
- # Sort by score
191
- sorted_matches = sorted(doc_matches.values(), key=lambda x: x['score'], reverse=True)
192
-
193
- item_result = {
194
- 'text': item['text'],
195
- 'original': item['original'],
196
- 'matches': sorted_matches
197
- }
198
-
199
- if sorted_matches:
200
- cat_results['matched_items'] += 1
201
-
202
- cat_results['items'].append(item_result)
203
 
204
- results[cat_letter] = cat_results
205
-
206
- return results
 
 
 
 
 
207
 
208
- def match_checklist_with_summaries(
209
- self,
210
- checklist: Dict,
211
- doc_embeddings_data: Dict,
212
- threshold: Optional[float] = None
213
- ) -> Dict:
214
- """
215
- Match checklist items against document summaries using FAISS for 10x faster similarity search
216
-
217
- Args:
218
- checklist: Parsed checklist
219
- doc_embeddings_data: Document embeddings with summaries
220
- threshold: Similarity threshold
221
 
222
- Returns:
223
- Matching results using AI summaries
224
- """
225
- doc_embeddings = np.array(doc_embeddings_data['embeddings'], dtype='float32')
226
- doc_info = doc_embeddings_data['documents']
227
-
228
- # Build FAISS index for fast similarity search
229
- faiss.normalize_L2(doc_embeddings) # Normalize for cosine similarity
230
- dimension = doc_embeddings.shape[1]
231
- faiss_index = faiss.IndexFlatIP(dimension)
232
- faiss_index.add(doc_embeddings)
233
-
234
- results = {}
235
-
236
- for cat_letter, category in checklist.items():
237
- cat_name = category.get('name', '')
238
- cat_results = {
239
- 'name': cat_name,
240
- 'letter': cat_letter,
241
- 'total_items': len(category.get('items', [])),
242
- 'matched_items': 0,
243
- 'items': []
244
- }
245
 
246
- for item in category.get('items', []):
247
- item_text = item.get('text', '')
248
-
249
- # Create embedding for checklist item with category context
250
- checklist_embedding_text = f"{cat_name}: {item_text}"
251
- item_embedding = self.model.encode(checklist_embedding_text).astype('float32').reshape(1, -1)
252
- faiss.normalize_L2(item_embedding)
253
-
254
- # Use FAISS for fast similarity search
255
- scores, indices = faiss_index.search(item_embedding, len(doc_info))
256
-
257
- # Find matching documents above threshold
258
- matches = []
259
- for score, idx in zip(scores[0], indices[0]):
260
- if idx == -1: # No more results
261
- break
262
- if score > threshold:
263
- matches.append({
264
- 'name': doc_info[idx]['name'],
265
- 'path': doc_info[idx]['path'],
266
- 'summary': doc_info[idx]['summary'],
267
- 'score': float(score),
268
- 'metadata': doc_info[idx].get('original_doc', {}).get('metadata', {})
269
- })
270
- else:
271
- break # Scores are sorted, so we can stop here
272
-
273
- # Keep top 5 matches (already sorted by FAISS)
274
- matches = matches[:5]
275
-
276
- item_result = {
277
- 'text': item_text,
278
- 'original': item.get('original', item_text),
279
- 'matches': matches
280
- }
281
-
282
- if matches:
283
- cat_results['matched_items'] += 1
284
-
285
- cat_results['items'].append(item_result)
286
 
287
- results[cat_letter] = cat_results
 
 
 
 
288
 
289
- return results
 
 
290
 
291
 
292
- class QuestionAnswerer:
293
- """Service for answering questions using document chunks"""
294
-
295
- def __init__(self, model: SentenceTransformer):
296
- """
297
- Initialize the question answerer
298
-
299
- Args:
300
- model: SentenceTransformer model for embeddings
301
- """
302
- self.model = model
303
-
304
- def answer_questions_with_chunks(
305
- self,
306
- questions: List[Dict],
307
- chunks: List[Dict],
308
- embeddings: np.ndarray,
309
- threshold: Optional[float] = None
310
- ) -> Dict:
311
- """
312
- Answer questions using document chunks with FAISS for 10x faster similarity search
313
-
314
- Args:
315
- questions: List of parsed questions
316
- chunks: Document chunks
317
- embeddings: Precomputed embeddings
318
- threshold: Similarity threshold (uses config default if None)
319
-
320
- Returns:
321
- Dictionary of answers with citations
322
- """
323
- config = get_config()
324
- if threshold is None:
325
- threshold = config.processing.relevancy_threshold
326
-
327
- # Build FAISS index for fast similarity search
328
- embeddings_f32 = embeddings.astype('float32')
329
- faiss.normalize_L2(embeddings_f32) # Normalize for cosine similarity
330
- dimension = embeddings_f32.shape[1]
331
- faiss_index = faiss.IndexFlatIP(dimension)
332
- faiss_index.add(embeddings_f32)
333
-
334
- answers = {}
335
-
336
- for question in questions:
337
- # Encode question
338
- question_embedding = self.model.encode(question['question']).astype('float32').reshape(1, -1)
339
- faiss.normalize_L2(question_embedding)
340
-
341
- # Use FAISS for fast similarity search
342
- scores, indices = faiss_index.search(question_embedding, min(10, len(chunks))) # Get top 10 candidates
343
-
344
- # Get top matching chunks above threshold
345
- relevant_chunks = []
346
 
347
- for score, idx in zip(scores[0], indices[0]):
348
- if idx == -1 or score < threshold:
349
- continue
350
-
351
- chunk_info = chunks[idx]
352
- relevant_chunks.append({
353
- 'text': chunk_info['text'][:500], # Limit text length
354
- 'source': chunk_info['source'],
355
- 'path': chunk_info['path'],
356
- 'score': float(score),
357
- 'metadata': chunk_info.get('metadata', {})
358
- })
359
-
360
- # Limit to top 5 chunks
361
- if len(relevant_chunks) >= 5:
362
- break
363
 
364
  answers[question['id']] = {
365
  'question': question['question'],
366
  'category': question['category'],
367
- 'chunks': relevant_chunks,
368
- 'has_answer': len(relevant_chunks) > 0
 
369
  }
370
-
371
- return answers
 
 
 
 
 
 
 
 
 
372
 
 
 
 
373
 
374
- class ReportGenerator:
375
- """Service for generating reports and summaries"""
 
 
376
 
377
- def __init__(self, agent=None):
378
- """
379
- Initialize the report generator
380
-
381
- Args:
382
- agent: Optional AI agent for enhanced reporting
383
- """
384
- self.agent = agent
385
 
386
- def generate_company_summary(self, documents: Dict[str, Dict], data_room_name: str = "Unknown") -> str:
387
- """
388
- Generate company overview summary
389
-
390
- Args:
391
- documents: Dictionary of processed documents
392
- data_room_name: Name of the data room/company
393
-
394
- Returns:
395
- Company summary text
396
- """
397
- if not self.agent or not hasattr(self.agent, 'llm'):
398
- return self._generate_basic_summary(documents, data_room_name)
 
 
 
 
 
 
 
 
 
399
 
400
- # Gather key information from documents
401
  doc_summaries = []
402
- for path, doc_info in list(documents.items())[:10]: # Use top 10 docs
403
  if 'summary' in doc_info:
404
  doc_summaries.append(f"{doc_info['name']}: {doc_info['summary']}")
405
  else:
406
- # Use first 500 chars of content if no summary
407
  content_preview = doc_info.get('content', '')[:500]
408
  if content_preview:
409
  doc_summaries.append(f"{doc_info['name']}: {content_preview}")
@@ -411,34 +349,69 @@ class ReportGenerator:
411
  if not doc_summaries:
412
  return "No documents available for summary generation."
413
 
414
- # Create prompt for company summary
415
- from langchain_core.messages import HumanMessage
416
- prompt = f"""Based on the following document summaries from a due diligence data room, provide a comprehensive company overview.
417
-
418
- Company: {data_room_name}
419
 
420
- Document Summaries:
421
- {chr(10).join(doc_summaries[:10])}
422
-
423
- Please provide:
424
- 1. Company name and industry
425
- 2. Business model and key products/services
426
- 3. Market position and competitive advantages
427
- 4. Key financials (if available)
428
- 5. Organizational structure
429
- 6. Notable risks or concerns
430
- 7. Overall assessment for M&A consideration
 
 
 
 
 
 
 
 
 
431
 
432
- Format the response in clear sections with bullet points where appropriate."""
433
 
434
- try:
435
- response = self.agent.llm.invoke([HumanMessage(content=prompt)])
436
- return escape_markdown_math(response.content.strip())
437
- except Exception as e:
438
- return f"Failed to generate AI summary: {str(e)}"
439
 
440
- def _generate_basic_summary(self, documents: Dict[str, Dict], data_room_name: str) -> str:
441
- """Generate basic summary without AI"""
442
  doc_count = len(documents)
443
  file_types = {}
444
 
@@ -446,7 +419,7 @@ class ReportGenerator:
446
  doc_type = doc_info.get('metadata', {}).get('type', 'unknown')
447
  file_types[doc_type] = file_types.get(doc_type, 0) + 1
448
 
449
- summary = f"""# Company Overview: {data_room_name}
450
 
451
  ## Document Analysis
452
  - **Total Documents**: {doc_count}
@@ -457,70 +430,11 @@ Based on the document structure, this data room appears to cover standard due di
457
 
458
  *Note: Enable AI features for detailed company analysis and insights.*
459
  """
460
- return summary
461
 
462
- def generate_strategic_analysis(
463
- self,
464
- strategy_text: str,
465
- checklist_results: Dict,
466
- documents: Dict[str, Dict]
467
- ) -> str:
468
- """
469
- Generate strategic analysis based on strategy and checklist results
470
-
471
- Args:
472
- strategy_text: Strategic document content
473
- checklist_results: Results from checklist matching
474
- documents: Document dictionary
475
-
476
- Returns:
477
- Strategic analysis text
478
- """
479
- if not self.agent or not hasattr(self.agent, 'llm'):
480
- return self._generate_basic_strategic_analysis(checklist_results)
481
-
482
- # Build context from checklist results
483
- checklist_context = []
484
- for cat_id, cat_data in checklist_results.items():
485
- cat_name = cat_data['name']
486
- matched_items = sum(1 for item in cat_data['items'] if item['matches'])
487
- total_items = len(cat_data['items'])
488
- coverage = (matched_items / total_items * 100) if total_items > 0 else 0
489
-
490
- checklist_context.append(f"- {cat_name}: {coverage:.0f}% coverage ({matched_items}/{total_items} items)")
491
 
492
- # Add details about specific gaps
493
- missing_items = [item['text'] for item in cat_data['items'] if not item['matches']]
494
- if missing_items and len(missing_items) <= 3:
495
- checklist_context.append(f" Missing: {', '.join(missing_items[:3])}")
496
-
497
- # Build prompt
498
- prompt = f"""Based on the due diligence checklist results and the selected strategy, provide a strategic analysis.
499
-
500
- Strategy Document:
501
- {strategy_text}
502
-
503
- Checklist Coverage:
504
- {chr(10).join(checklist_context)}
505
-
506
- Please provide:
507
- 1. Strategic alignment assessment
508
- 2. Key risks and gaps identified
509
- 3. Opportunities and synergies
510
- 4. Recommended next steps
511
- 5. Overall recommendation
512
-
513
- Format the response with clear sections and bullet points."""
514
-
515
- try:
516
- from langchain_core.messages import HumanMessage
517
- response = self.agent.llm.invoke([HumanMessage(content=prompt)])
518
- return escape_markdown_math(response.content.strip())
519
- except Exception as e:
520
- return f"Failed to generate strategic analysis: {str(e)}"
521
-
522
- def _generate_basic_strategic_analysis(self, checklist_results: Dict) -> str:
523
- """Generate basic strategic analysis without AI"""
524
  total_items = sum(cat['total_items'] for cat in checklist_results.values())
525
  matched_items = sum(cat['matched_items'] for cat in checklist_results.values())
526
  coverage = (matched_items / total_items * 100) if total_items > 0 else 0
@@ -547,102 +461,30 @@ Based on the document structure, this data room appears to cover standard due di
547
  """
548
 
549
  return analysis
550
 
551
 
552
- class DDChecklistService:
553
- """
554
- Main service orchestrator for DD-Checklist operations
555
- Coordinates between different services and manages the overall workflow
556
- """
557
-
558
- def __init__(self, model: SentenceTransformer, agent=None):
559
- """
560
- Initialize the service
561
-
562
- Args:
563
- model: SentenceTransformer model
564
- agent: Optional AI agent
565
- """
566
- self.model = model
567
- self.agent = agent
568
- self.document_processor = DocumentProcessor(model)
569
- self.checklist_parser = ChecklistParser()
570
- self.question_parser = QuestionParser()
571
- self.checklist_matcher = ChecklistMatcher(model)
572
- self.question_answerer = QuestionAnswerer(model)
573
- self.report_generator = ReportGenerator(agent)
574
-
575
- def process_data_room(
576
- self,
577
- data_room_path: str,
578
- checklist_text: str = "",
579
- questions_text: str = ""
580
- ) -> Dict[str, Any]:
581
- """
582
- Process entire data room with checklist and questions
583
-
584
- Args:
585
- data_room_path: Path to data room
586
- checklist_text: Optional checklist text
587
- questions_text: Optional questions text
588
-
589
- Returns:
590
- Dictionary with all processing results
591
- """
592
- results = {}
593
-
594
- # Load data room
595
- load_results = self.document_processor.load_data_room(data_room_path)
596
- results['load_results'] = load_results
597
-
598
- # Parse checklist if provided
599
- checklist = {}
600
- if checklist_text:
601
- checklist = self.checklist_parser.parse_checklist(checklist_text)
602
- results['checklist'] = checklist
603
-
604
- # Parse questions if provided
605
- questions = []
606
- if questions_text:
607
- questions = self.question_parser.parse_questions(questions_text)
608
- results['questions'] = questions
609
-
610
- # Match checklist to documents
611
- checklist_results = {}
612
- if checklist and self.document_processor.chunks:
613
- checklist_results = self.checklist_matcher.match_checklist_to_documents(
614
- checklist,
615
- self.document_processor.chunks,
616
- self.document_processor.embeddings
617
- )
618
- results['checklist_results'] = checklist_results
619
-
620
- # Answer questions
621
- question_answers = {}
622
- if questions and self.document_processor.chunks and self.document_processor.embeddings is not None:
623
- question_answers = self.question_answerer.answer_questions_with_chunks(
624
- questions,
625
- self.document_processor.chunks,
626
- self.document_processor.embeddings
627
- )
628
- results['question_answers'] = question_answers
629
-
630
- return results
631
-
632
- def search_documents(self, query: str, top_k: int = 5, threshold: Optional[float] = None) -> List[Dict]:
633
- """
634
- Search documents using the document processor
635
-
636
- Args:
637
- query: Search query
638
- top_k: Number of results
639
- threshold: Similarity threshold
640
-
641
- Returns:
642
- Search results
643
- """
644
- return self.document_processor.search(query, top_k, threshold)
645
-
646
- def get_processing_statistics(self) -> Dict[str, Any]:
647
- """Get comprehensive processing statistics"""
648
- return self.document_processor.get_statistics()
 
 """
 Business Logic Services Module
 
+Simplified service layer with focused functions instead of over-abstracted classes.
 """
 
 import re
+import logging
+import warnings
 from pathlib import Path
+
+# Suppress verbose LangChain warnings in services
+warnings.filterwarnings("ignore", category=UserWarning, module="langchain")
+warnings.filterwarnings("ignore", category=UserWarning, module="langchain_core")
+warnings.filterwarnings("ignore", category=UserWarning, module="langchain_community")
+warnings.filterwarnings("ignore", message=".*Relevance scores must be between.*")
+warnings.filterwarnings("ignore", message=".*No relevant docs were retrieved.*")
+from typing import Dict, List, Optional, Any
+import markdown
+
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import PromptTemplate
+from langchain_community.vectorstores import FAISS
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_core.documents import Document
+from langchain_core.messages import HumanMessage
 
 from .config import get_config
+from .document_processing import DocumentProcessor, escape_markdown_math
+
+
+logger = logging.getLogger(__name__)
+
+
+# =============================================================================
+# PARSING FUNCTIONS - Simplified from ChecklistParser and QuestionParser classes
+# =============================================================================
+
+def parse_checklist(checklist_text: str) -> Dict:
+    """Parse markdown checklist into categories and items using standard markdown parser"""
+    categories = {}
+    current_category = None
+
+    # Parse line by line for reliable extraction
+    lines = checklist_text.split('\n')
+    for line_num, original_line in enumerate(lines):
+        line = original_line.strip()
+
+        # Skip empty lines and separator lines
+        if not line or line.startswith('⸻') or line.startswith('---'):
+            continue
+
+        # Skip title lines
+        if 'due diligence checklist' in line.lower() or line.startswith('#'):
+            continue
+
+        # Category headers - look for pattern "A. Category Name"
+        category_match = re.match(r'^([A-Z])\.\s+(.+)', line)
+        if category_match and not re.match(r'^\d+\.\s+', line):
+            letter, name = category_match.groups()
+            current_category = letter
+            categories[letter] = {
+                'name': name.strip(),
+                'items': []
+            }
+            continue
+
+        # Numbered items within categories - look for indented items
+        if current_category and line:
+            # Check if original line was indented (starts with tab or multiple spaces)
+            is_indented = original_line.startswith(('\t', '    ', '  '))
+            item_match = re.match(r'^\d+\.\s+(.+)', line)
+
+            if item_match and (is_indented or current_category):
+                item_text = item_match.group(1).strip()
+                if item_text and not item_text.lower().startswith('[other requests'):
+                    # Clean up markdown formatting but preserve content
+                    clean_text = re.sub(r'\[.*?\]', '', item_text).strip()
+                    if not clean_text:
+                        clean_text = item_text
+
+                    categories[current_category]['items'].append({
+                        'text': clean_text,
+                        'original': item_text
+                    })
 
+    return categories
 
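For reference, a sketch of the input shape parse_checklist is written for, with made-up checklist text (not from the repository):

sample = """A. Corporate Organization
    1. Articles of Incorporation [most recent version]
    2. Bylaws
B. Financial Information
    1. Audited financial statements
"""
parsed = parse_checklist(sample)
# parsed['A']['name'] == 'Corporate Organization'
# parsed['A']['items'][0]['text'] == 'Articles of Incorporation'  (bracketed note stripped)
# parsed['B']['items'] holds one item: 'Audited financial statements'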
+
+
+def parse_questions(questions_text: str) -> List[Dict]:
+    """Parse markdown questions into a list using standard markdown parser"""
+    # Convert markdown to understand structure
+    md = markdown.Markdown(extensions=['toc'])
+    html = md.convert(questions_text)
+
+    questions = []
+    current_category = None
+
+    # Parse line by line for reliable extraction
+    lines = questions_text.split('\n')
+    for line in lines:
+        line = line.strip()
+
+        # Category headers (### format)
+        if line.startswith('### '):
+            match = re.match(r'###\s+([A-Z])\.\s+(.+)', line)
             if match:
                 letter, name = match.groups()
+                current_category = f"{letter}. {name.strip()}"
 
+        # Question items (numbered lists)
+        elif current_category and line:
+            match = re.match(r'^\d+\.\s+(.+)', line)
+            if match:
+                question_text = match.group(1).strip()
+                if question_text:
+                    # Clean markdown formatting
+                    clean_question = re.sub(r'\*\*(.*?)\*\*', r'\1', question_text)  # Remove bold
+                    clean_question = re.sub(r'\*(.*?)\*', r'\1', clean_question)  # Remove italics
+
+                    questions.append({
+                        'category': current_category,
+                        'question': clean_question,
+                        'id': f"q_{len(questions)}"
+                    })
+
+    return questions
 
 
+# =============================================================================
+# SEARCH FUNCTIONS - Consolidated from ChecklistMatcher and QuestionAnswerer
+# =============================================================================
+
+def create_vector_store(source_data, model_name: str) -> FAISS:
+    """Unified vector store creation from various data sources"""
+    embeddings = HuggingFaceEmbeddings(model_name=model_name)
+
+    # Handle different input types
+    if isinstance(source_data, list):
+        if all(isinstance(item, Document) for item in source_data):
+            # Already LangChain documents
+            return FAISS.from_documents(source_data, embeddings)
+        elif all(isinstance(item, dict) for item in source_data):
+            # Document chunks
+            documents = [
+                Document(
+                    page_content=chunk['text'],
+                    metadata={
+                        'source': chunk.get('source', ''),
+                        'path': chunk.get('path', ''),
+                        'full_path': chunk.get('full_path', ''),
+                        **chunk.get('metadata', {})
+                    }
+                ) for chunk in source_data
+            ]
+            return FAISS.from_documents(documents, embeddings)
+    elif isinstance(source_data, dict) and 'documents' in source_data:
+        # Document embeddings data with summaries
+        documents = [
+            Document(
+                page_content=f"{doc['name']}\n{doc['path']}\n{doc['summary']}",
+                metadata={
+                    'name': doc['name'],
+                    'path': doc['path'],
+                    'summary': doc['summary'],
+                    **doc.get('original_doc', {}).get('metadata', {})
+                }
+            ) for doc in source_data['documents']
+        ]
+        return FAISS.from_documents(documents, embeddings)
+
+    raise ValueError("Unsupported data type for vector store creation")
 
 
+def search_and_analyze(queries: List[Dict], vector_store: FAISS, llm=None, threshold: float = 0.7, search_type: str = 'items') -> Dict:
+    """Unified search function for both checklist items and questions using LangChain RAG"""
+    from langchain.chains import RetrievalQA
+    from langchain.prompts import PromptTemplate
+
+    retriever = vector_store.as_retriever(
+        search_type="similarity_score_threshold",
+        search_kwargs={"score_threshold": threshold, "k": 5 if search_type == 'questions' else 10}
+    )
+
+    # Create RAG chain if LLM is provided
+    qa_chain = None
+    if llm:
+        prompt_template = PromptTemplate(
+            input_variables=["context", "question"],
+            template="""Use the provided context to answer the question. Be concise and factual.
+
+Context: {context}
+
+Question: {question}
+
+Answer:"""
+        )
+        qa_chain = RetrievalQA.from_chain_type(
+            llm=llm,
+            chain_type="stuff",
+            retriever=retriever,
+            chain_type_kwargs={"prompt": prompt_template}
+        )
+
+    if search_type == 'items':
+        return _process_checklist_items(queries, retriever, qa_chain)
+    else:
+        return _process_questions(queries, retriever, qa_chain)
+
+
+def _process_checklist_items(checklist: Dict, retriever, qa_chain=None) -> Dict:
+    """Process checklist items with unified search logic"""
+    results = {}
+    for cat_letter, category in checklist.items():
+        cat_results = {
+            'name': category['name'],
+            'items': [],
+            'total_items': len(category['items']),
+            'matched_items': 0
+        }
+
+        for item in category['items']:
+            query = f"{category['name']}: {item['text']}"
+            try:
+                docs = retriever.invoke(query)
+            except Exception as e:
+                logger.error(f"Error in document matching: {e}")
+                docs = []
+
+            matches = [{
+                'name': doc.metadata.get('source', ''),
+                'path': doc.metadata.get('path', ''),
+                'full_path': doc.metadata.get('full_path', ''),
+                'score': 0.8,  # LangChain similarity scores not directly accessible
+                'metadata': {k: v for k, v in doc.metadata.items()
+                             if k not in ['source', 'path', 'full_path']}
+            } for doc in docs[:5]]
+
+            if matches:
+                cat_results['matched_items'] += 1
+
+            cat_results['items'].append({
+                'text': item['text'],
+                'original': item['original'],
+                'matches': matches
+            })
+
+        results[cat_letter] = cat_results
+
+    return results
 
 
+def _process_questions(questions: List[Dict], retriever, qa_chain=None) -> Dict:
+    """Process questions with unified search logic"""
+    answers = {}
+    for question in questions:
+        try:
+            docs = retriever.invoke(question['question'])
+        except Exception as e:
+            logger.error(f"Error in question answering: {e}")
+            docs = []
+
+        if docs:
+            chunks_data = [{
+                'text': doc.page_content[:500],
+                'source': doc.metadata.get('source', ''),
+                'path': doc.metadata.get('path', ''),
+                'score': 0.8,
+                'metadata': {k: v for k, v in doc.metadata.items()
+                             if k not in ['source', 'path']}
+            } for doc in docs]
+
+            # Generate answer using RAG chain if available
+            answer_text = "Retrieved relevant document chunks."
+            if qa_chain:
+                try:
+                    answer_text = qa_chain.run(question['question'])
+                except Exception as e:
+                    logger.error(f"RAG chain failed: {e}")
+                    answer_text = "Retrieved relevant document chunks."
+
             answers[question['id']] = {
                 'question': question['question'],
                 'category': question['category'],
+                'answer': answer_text,
+                'chunks': chunks_data,
+                'has_answer': True
             }
+        else:
+            answers[question['id']] = {
+                'question': question['question'],
+                'category': question['category'],
+                'answer': "No relevant documents found",
+                'chunks': [],
+                'has_answer': False
+            }
+
+    return answers
 
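Taken together, a retrieval-only sketch of how create_vector_store and search_and_analyze compose; the chunk dict and checklist below are hypothetical, and passing llm=None skips the RetrievalQA chain:

chunks = [
    {'text': 'The company was incorporated in Delaware in 2015.',
     'source': 'articles.pdf', 'path': 'corporate/articles.pdf', 'metadata': {}},
]
store = create_vector_store(chunks, model_name='sentence-transformers/all-MiniLM-L6-v2')  # assumed model

checklist = {'A': {'name': 'Corporate Organization',
                   'items': [{'text': 'Articles of Incorporation',
                              'original': 'Articles of Incorporation'}]}}
results = search_and_analyze(checklist, store, llm=None, threshold=0.3, search_type='items')
print(results['A']['matched_items'])  # 1 if the retriever clears the score threshold, else 0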
 
+# =============================================================================
+# REPORT GENERATION FUNCTIONS - Simplified from ReportGenerator class
+# =============================================================================
+
+def generate_reports(documents: Dict[str, Dict], data_room_name: str = "Unknown",
+                     strategy_text: str = "", checklist_results: Dict = None,
+                     report_type: str = "overview", llm=None) -> str:
+    """Unified report generation using LangChain prompt templates"""
+
+    if not llm:
+        return _generate_basic_report(documents, data_room_name, checklist_results, report_type)
+
+    # Define prompt templates
+    if report_type == "overview":
+        template = PromptTemplate(
+            input_variables=["company_name", "document_summaries"],
+            template="""Based on the following document summaries from a due diligence data room, provide a comprehensive company overview.
+
+Company: {company_name}
+
+Document Summaries:
+{document_summaries}
+
+Please provide:
+1. Company name and industry
+2. Business model and key products/services
+3. Market position and competitive advantages
+4. Key financials (if available)
+5. Organizational structure
+6. Notable risks or concerns
+7. Overall assessment for M&A consideration
+
+Format the response in clear sections with bullet points where appropriate."""
+        )
+
+        # Prepare document summaries
         doc_summaries = []
+        for path, doc_info in list(documents.items())[:10]:
             if 'summary' in doc_info:
                 doc_summaries.append(f"{doc_info['name']}: {doc_info['summary']}")
             else:
                 content_preview = doc_info.get('content', '')[:500]
                 if content_preview:
                     doc_summaries.append(f"{doc_info['name']}: {content_preview}")
 
         if not doc_summaries:
             return "No documents available for summary generation."
 
+        inputs = {
+            "company_name": data_room_name,
+            "document_summaries": "\n".join(doc_summaries[:10])
+        }
 
+    elif report_type == "strategic":
+        template = PromptTemplate(
+            input_variables=["strategy_text", "checklist_context"],
+            template="""Based on the due diligence checklist results and the selected strategy, provide a strategic analysis.
+
+Strategy Document:
+{strategy_text}
+
+Checklist Coverage:
+{checklist_context}
+
+Please provide:
+1. Strategic alignment assessment
+2. Key risks and gaps identified
+3. Opportunities and synergies
+4. Recommended next steps
+5. Overall recommendation
+
+Format the response with clear sections and bullet points."""
+        )
+
+        # Build checklist context
+        if not checklist_results:
+            return "No checklist results available for strategic analysis."
+
+        checklist_context = []
+        for cat_id, cat_data in checklist_results.items():
+            cat_name = cat_data['name']
+            matched_items = cat_data['matched_items']
+            total_items = cat_data['total_items']
+            coverage = (matched_items / total_items * 100) if total_items > 0 else 0
+
+            checklist_context.append(f"- {cat_name}: {coverage:.0f}% coverage ({matched_items}/{total_items} items)")
+
+            # Add details about gaps
+            missing_items = [item['text'] for item in cat_data['items'] if not item['matches']]
+            if missing_items and len(missing_items) <= 3:
+                checklist_context.append(f"  Missing: {', '.join(missing_items[:3])}")
+
+        inputs = {
+            "strategy_text": strategy_text,
+            "checklist_context": "\n".join(checklist_context)
+        }
+
+    # Execute the chain
+    try:
+        chain = template | llm | StrOutputParser()
+        response = chain.invoke(inputs)
+        return escape_markdown_math(response.strip())
+    except Exception as e:
+        logger.error(f"LLM report generation failed: {e}")
+        return f"Failed to generate {report_type} report: {str(e)}"
+
+
+def _generate_basic_report(documents: Dict[str, Dict], data_room_name: str,
+                           checklist_results: Dict, report_type: str) -> str:
+    """Generate basic reports without AI"""
+    if report_type == "overview":
         doc_count = len(documents)
         file_types = {}
 
             doc_type = doc_info.get('metadata', {}).get('type', 'unknown')
             file_types[doc_type] = file_types.get(doc_type, 0) + 1
 
+        return f"""# Company Overview: {data_room_name}
 
 ## Document Analysis
 - **Total Documents**: {doc_count}
 
 *Note: Enable AI features for detailed company analysis and insights.*
 """
 
+    elif report_type == "strategic":
+        if not checklist_results:
+            return "No checklist results available for strategic analysis."
 
         total_items = sum(cat['total_items'] for cat in checklist_results.values())
         matched_items = sum(cat['matched_items'] for cat in checklist_results.values())
         coverage = (matched_items / total_items * 100) if total_items > 0 else 0
 
 """
 
         return analysis
+
+    return "Invalid report type specified."
+
+
+# =============================================================================
+# MAIN SERVICE FUNCTIONS - Simplified orchestration
+# =============================================================================
+
+
+def search_documents(doc_processor: DocumentProcessor, query: str, top_k: int = 5,
+                     threshold: Optional[float] = None) -> List[Dict]:
+    """Search documents using the document processor"""
+    return doc_processor.search(query, top_k, threshold)
+
+
+def load_default_file(directory: Path, pattern: str) -> str:
+    """Load the first file matching pattern from directory"""
+    try:
+        files = list(directory.glob(pattern))
+        return files[0].read_text(encoding='utf-8') if files else ""
+    except Exception as e:
+        logger.error(f"File loading failed: {e}")
+        return ""
 
src/ui_components.py CHANGED
@@ -9,10 +9,11 @@ Separates UI logic from business logic for better maintainability.
9
  import streamlit as st
10
  from pathlib import Path
11
  from typing import Dict, List, Optional, Tuple, Any
12
- import numpy as np
13
  import base64
14
 
15
- from .config import get_config
 
16
 
17
 
18
  def create_document_link(file_path: str, doc_name: str, doc_title: str) -> str:
@@ -89,8 +90,7 @@ def render_project_selector() -> Tuple[Optional[str], Optional[str]]:
89
  subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
90
  if subdirs:
91
  # Count total documents in all data rooms
92
- total_docs = sum(1 for f in project_dir.rglob('*')
93
- if f.is_file() and f.suffix.lower() in ['.pdf', '.docx', '.doc', '.txt', '.md'])
94
  if total_docs > 0:
95
  projects.append({
96
  'name': project_dir.name.replace('-', ' ').replace('_', ' ').title(),
@@ -106,8 +106,7 @@ def render_project_selector() -> Tuple[Optional[str], Optional[str]]:
106
  subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
107
  if subdirs:
108
  # Count total documents in all data rooms
109
- total_docs = sum(1 for f in project_dir.rglob('*')
110
- if f.is_file() and f.suffix.lower() in ['.pdf', '.docx', '.doc', '.txt', '.md'])
111
  if total_docs > 0:
112
  projects.append({
113
  'name': project_dir.name.replace('-', ' ').replace('_', ' ').title(),
@@ -169,8 +168,7 @@ def render_data_room_selector(project_path: str) -> Optional[str]:
169
  for data_room_dir in project_path_obj.iterdir():
170
  if data_room_dir.is_dir() and not data_room_dir.name.startswith('.'):
171
  # Count documents for display
172
- doc_count = sum(1 for f in data_room_dir.rglob('*')
173
- if f.is_file() and f.suffix.lower() in ['.pdf', '.docx', '.doc', '.txt', '.md'])
174
  if doc_count > 0: # Only show directories with documents
175
  data_rooms.append({
176
  'name': data_room_dir.name.replace('-', ' ').replace('_', ' ').title(),
@@ -221,12 +219,11 @@ def render_ai_settings() -> Tuple[bool, Optional[str], str]:
221
  model_choice = config.model.claude_model
222
 
223
  if use_ai_features:
224
- # Check if API key is in environment
225
- import os
226
- env_key = os.getenv('ANTHROPIC_API_KEY')
227
- if env_key:
228
  st.success("✅ API key loaded from .env file")
229
- api_key = env_key
230
  else:
231
  api_key = st.text_input(
232
  "Anthropic API Key",
@@ -276,11 +273,11 @@ def render_file_selector(directory: str, file_type: str, key_suffix: str) -> Tup
276
  if dir_path.exists():
277
  for file in dir_path.glob("*.md"):
278
  if not file.name.startswith('.'):
279
- files.append({
280
- 'name': file.stem.replace('_', ' ').replace('-', ' ').title(),
281
- 'path': str(file),
282
- 'filename': file.name
283
- })
284
 
285
  file_content = ""
286
  selected_file_path = None
@@ -483,10 +480,7 @@ def render_document_match(match: Dict, item_idx: int, primary_threshold: float)
483
  """
484
  # Get document title (use name without extension)
485
  doc_name = match.get('name', match.get('path', 'Unknown'))
486
- if '.' in doc_name:
487
- doc_title = doc_name.rsplit('.', 1)[0].replace('_', ' ').replace('-', ' ').title()
488
- else:
489
- doc_title = doc_name.replace('_', ' ').replace('-', ' ').title()
490
 
491
  # Compact display with columns
492
  col1, col2, col3 = st.columns([0.8, 3.5, 0.5])
@@ -535,17 +529,7 @@ def render_download_button(match: Dict, item_idx: int, doc_name: str, doc_title:
535
  file_bytes = f.read()
536
 
537
  # Determine MIME type based on file extension
538
- file_extension = file_path.suffix.lower()
539
- if file_extension == '.pdf':
540
- mime_type = 'application/pdf'
541
- elif file_extension in ['.doc', '.docx']:
542
- mime_type = 'application/msword'
543
- elif file_extension == '.txt':
544
- mime_type = 'text/plain'
545
- elif file_extension == '.md':
546
- mime_type = 'text/markdown'
547
- else:
548
- mime_type = 'application/octet-stream'
549
 
550
  button_key = f"dl_{item_idx}_{match['score']:.0f}_{doc_name[:20]}".replace(" ", "_").replace("/", "_").replace(".", "_")
551
 
@@ -648,10 +632,7 @@ def render_question_source(chunk: Dict, chunk_idx: int, question: str) -> None:
648
  with col2:
649
  # Get clean document title
650
  doc_name = chunk['source']
651
- if '.' in doc_name:
652
- doc_title = doc_name.rsplit('.', 1)[0].replace('_', ' ').replace('-', ' ').title()
653
- else:
654
- doc_title = doc_name.replace('_', ' ').replace('-', ' ').title()
655
 
656
  # Document title as clickable link
657
  doc_path = chunk.get('path', '')
@@ -675,17 +656,7 @@ def render_question_source(chunk: Dict, chunk_idx: int, question: str) -> None:
675
  file_bytes = f.read()
676
 
677
  # Determine MIME type based on file extension
678
- file_extension = file_path.suffix.lower()
679
- if file_extension == '.pdf':
680
- mime_type = 'application/pdf'
681
- elif file_extension in ['.doc', '.docx']:
682
- mime_type = 'application/msword'
683
- elif file_extension == '.txt':
684
- mime_type = 'text/plain'
685
- elif file_extension == '.md':
686
- mime_type = 'text/markdown'
687
- else:
688
- mime_type = 'application/octet-stream'
689
 
690
  button_key = f"qa_dl_{question[:20]}_{chunk_idx}".replace(" ", "_").replace("?", "").replace("/", "_")
691
 
@@ -718,7 +689,7 @@ def render_ai_answer_button(answer_data: Dict, chunks: List[Dict]) -> None:
718
  context = "\n\n".join([f"From {c['source']}: {c['text']}" for c in chunks[:3]])
719
  # Use LLM directly for more reliable answers
720
  from langchain_core.messages import HumanMessage
721
- from src.document_processing import escape_markdown_math
722
 
723
  prompt = f"Question: {answer_data['question']}\n\nContext from documents:\n{context}\n\nProvide a comprehensive answer based on the context."
724
  response = st.session_state.agent.llm.invoke([HumanMessage(content=prompt)])
 
9
  import streamlit as st
10
  from pathlib import Path
11
  from typing import Dict, List, Optional, Tuple, Any
12
+
13
  import base64
14
 
15
+ from .config import get_config, get_mime_type, format_document_title, count_documents_in_directory
16
+ from .document_processing import escape_markdown_math
17
 
18
 
19
  def create_document_link(file_path: str, doc_name: str, doc_title: str) -> str:
 
90
  subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
91
  if subdirs:
92
  # Count total documents in all data rooms
93
+ total_docs = count_documents_in_directory(project_dir)
 
94
  if total_docs > 0:
95
  projects.append({
96
  'name': project_dir.name.replace('-', ' ').replace('_', ' ').title(),
 
106
  subdirs = [d for d in project_dir.iterdir() if d.is_dir() and not d.name.startswith('.')]
107
  if subdirs:
108
  # Count total documents in all data rooms
109
+ total_docs = count_documents_in_directory(project_dir)
 
110
  if total_docs > 0:
111
  projects.append({
112
  'name': project_dir.name.replace('-', ' ').replace('_', ' ').title(),
 
168
  for data_room_dir in project_path_obj.iterdir():
169
  if data_room_dir.is_dir() and not data_room_dir.name.startswith('.'):
170
  # Count documents for display
171
+ doc_count = count_documents_in_directory(data_room_dir)
 
172
  if doc_count > 0: # Only show directories with documents
173
  data_rooms.append({
174
  'name': data_room_dir.name.replace('-', ' ').replace('_', ' ').title(),
 
219
  model_choice = config.model.claude_model
220
 
221
  if use_ai_features:
222
+ # Check if API key is available in config (which loads from .env)
223
+ config_api_key = config.anthropic_api_key
224
+ if config_api_key:
 
225
  st.success("✅ API key loaded from .env file")
226
+ api_key = config_api_key
227
  else:
228
  api_key = st.text_input(
229
  "Anthropic API Key",
 
273
  if dir_path.exists():
274
  for file in dir_path.glob("*.md"):
275
  if not file.name.startswith('.'):
276
+ files.append({
277
+ 'name': format_document_title(file.stem),
278
+ 'path': str(file),
279
+ 'filename': file.name
280
+ })
281
 
282
  file_content = ""
283
  selected_file_path = None
 
480
  """
481
  # Get document title (use name without extension)
482
  doc_name = match.get('name', match.get('path', 'Unknown'))
483
+ doc_title = format_document_title(doc_name)
 
 
 
484
 
485
  # Compact display with columns
486
  col1, col2, col3 = st.columns([0.8, 3.5, 0.5])
 
529
  file_bytes = f.read()
530
 
531
  # Determine MIME type based on file extension
532
+ mime_type = get_mime_type(file_path)
 
534
  button_key = f"dl_{item_idx}_{match['score']:.0f}_{doc_name[:20]}".replace(" ", "_").replace("/", "_").replace(".", "_")
535
 
 
632
  with col2:
633
  # Get clean document title
634
  doc_name = chunk['source']
635
+ doc_title = format_document_title(doc_name)
 
 
 
636
 
637
  # Document title as clickable link
638
  doc_path = chunk.get('path', '')
 
656
  file_bytes = f.read()
657
 
658
  # Determine MIME type based on file extension
659
+ mime_type = get_mime_type(file_path)
660
 
661
  button_key = f"qa_dl_{question[:20]}_{chunk_idx}".replace(" ", "_").replace("?", "").replace("/", "_")
662
 
 
689
  context = "\n\n".join([f"From {c['source']}: {c['text']}" for c in chunks[:3]])
690
  # Use LLM directly for more reliable answers
691
  from langchain_core.messages import HumanMessage
692
+
693
 
694
  prompt = f"Question: {answer_data['question']}\n\nContext from documents:\n{context}\n\nProvide a comprehensive answer based on the context."
695
  response = st.session_state.agent.llm.invoke([HumanMessage(content=prompt)])
src/utils.py DELETED
@@ -1,640 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Utilities Module
4
-
5
- This module contains error handling, logging, and other utility functions
6
- for the DD-Checklist application.
7
- """
8
-
9
- import logging
10
- import functools
11
- import traceback
12
- from pathlib import Path
13
- from typing import Any, Callable, Optional, Dict, List, Union
14
- import streamlit as st
15
- from datetime import datetime
16
- import sys
17
- import os
18
-
19
-
20
- class DDChecklistLogger:
21
- """
22
- Custom logger for DD-Checklist application
23
- Handles both file and console logging with Streamlit integration
24
- """
25
-
26
- def __init__(self, name: str = "dd_checklist", log_level: str = "INFO"):
27
- """
28
- Initialize logger
29
-
30
- Args:
31
- name: Logger name
32
- log_level: Logging level
33
- """
34
- self.logger = logging.getLogger(name)
35
- self.logger.setLevel(getattr(logging, log_level.upper()))
36
-
37
- # Prevent duplicate handlers
38
- if not self.logger.handlers:
39
- self._setup_handlers()
40
-
41
- def _setup_handlers(self):
42
- """Setup logging handlers"""
43
- # Console handler
44
- console_handler = logging.StreamHandler(sys.stdout)
45
- console_formatter = logging.Formatter(
46
- '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
47
- )
48
- console_handler.setFormatter(console_formatter)
49
- self.logger.addHandler(console_handler)
50
-
51
- # File handler (if possible)
52
- try:
53
- log_dir = Path(".logs")
54
- log_dir.mkdir(exist_ok=True)
55
-
56
- log_file = log_dir / f"dd_checklist_{datetime.now().strftime('%Y%m%d')}.log"
57
- file_handler = logging.FileHandler(log_file)
58
- file_formatter = logging.Formatter(
59
- '%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s'
60
- )
61
- file_handler.setFormatter(file_formatter)
62
- self.logger.addHandler(file_handler)
63
- except Exception:
64
- # File logging not available (e.g., on Streamlit Cloud)
65
- pass
66
-
67
- def info(self, message: str, **kwargs):
68
- """Log info message"""
69
- self.logger.info(message, **kwargs)
70
-
71
- def warning(self, message: str, **kwargs):
72
- """Log warning message"""
73
- self.logger.warning(message, **kwargs)
74
- # Also show in Streamlit if available
75
- if 'st' in globals() and st:
76
- st.warning(message)
77
-
78
- def error(self, message: str, **kwargs):
79
- """Log error message"""
80
- self.logger.error(message, **kwargs)
81
- # Also show in Streamlit if available
82
- if 'st' in globals() and st:
83
- st.error(message)
84
-
85
- def debug(self, message: str, **kwargs):
86
- """Log debug message"""
87
- self.logger.debug(message, **kwargs)
88
-
89
- def exception(self, message: str, **kwargs):
90
- """Log exception with traceback"""
91
- self.logger.exception(message, **kwargs)
92
- # Show error in Streamlit if available
93
- if 'st' in globals() and st:
94
- st.error(f"{message} - Check logs for details.")
95
-
96
-
97
- # Global logger instance
98
- logger = DDChecklistLogger()
99
-
100
-
101
- def handle_exceptions(
102
- return_value: Any = None,
103
- show_error: bool = True,
104
- log_error: bool = True
105
- ) -> Callable:
106
- """
107
- Decorator for handling exceptions in functions
108
-
109
- Args:
110
- return_value: Value to return on exception
111
- show_error: Whether to show error in UI
112
- log_error: Whether to log the error
113
-
114
- Returns:
115
- Decorated function
116
- """
117
- def decorator(func: Callable) -> Callable:
118
- @functools.wraps(func)
119
- def wrapper(*args, **kwargs):
120
- try:
121
- return func(*args, **kwargs)
122
- except Exception as e:
123
- error_msg = f"Error in {func.__name__}: {str(e)}"
124
-
125
- if log_error:
126
- logger.exception(error_msg)
127
-
128
- if show_error and 'st' in globals() and st:
129
- st.error(error_msg)
130
-
131
- return return_value
132
- return wrapper
133
- return decorator
134
-
135
-
136
- def safe_execute(
137
- func: Callable,
138
- *args,
139
- default_return: Any = None,
140
- error_message: Optional[str] = None,
141
- show_error: bool = True,
142
- **kwargs
143
- ) -> Any:
144
- """
145
- Safely execute a function with error handling
146
-
147
- Args:
148
- func: Function to execute
149
- *args: Function arguments
150
- default_return: Default return value on error
151
- error_message: Custom error message
152
- show_error: Whether to show error in UI
153
- **kwargs: Function keyword arguments
154
-
155
- Returns:
156
- Function result or default_return on error
157
- """
158
- try:
159
- return func(*args, **kwargs)
160
- except Exception as e:
161
- msg = error_message or f"Error executing {func.__name__}: {str(e)}"
162
- logger.exception(msg)
163
-
164
- if show_error and 'st' in globals() and st:
165
- st.error(msg)
166
-
167
- return default_return
168
-
169
-
170
- class ErrorHandler:
171
- """
172
- Context manager for error handling
173
- """
174
-
175
- def __init__(
176
- self,
177
- error_message: str = "An error occurred",
178
- show_error: bool = True,
179
- reraise: bool = False
180
- ):
181
- """
182
- Initialize error handler
183
-
184
- Args:
185
- error_message: Message to display on error
186
- show_error: Whether to show error in UI
187
- reraise: Whether to reraise the exception
188
- """
189
- self.error_message = error_message
190
- self.show_error = show_error
191
- self.reraise = reraise
192
-
193
- def __enter__(self):
194
- return self
195
-
196
- def __exit__(self, exc_type, exc_val, exc_tb):
197
- if exc_type is not None:
198
- error_msg = f"{self.error_message}: {str(exc_val)}"
199
- logger.exception(error_msg)
200
-
201
- if self.show_error and 'st' in globals() and st:
202
- st.error(error_msg)
203
-
204
- if self.reraise:
205
- return False # Reraise the exception
206
-
207
- return True # Suppress the exception
208
-
209
-
210
- def validate_file_path(file_path: Union[str, Path]) -> bool:
211
- """
212
- Validate that a file path exists and is readable
213
-
214
- Args:
215
- file_path: Path to validate
216
-
217
- Returns:
218
- True if valid, False otherwise
219
- """
220
- try:
221
- path = Path(file_path)
222
- return path.exists() and path.is_file()
223
- except Exception as e:
224
- logger.warning(f"Invalid file path {file_path}: {e}")
225
- return False
226
-
227
-
228
- def validate_directory_path(dir_path: Union[str, Path]) -> bool:
229
- """
230
- Validate that a directory path exists
231
-
232
- Args:
233
- dir_path: Directory path to validate
234
-
235
- Returns:
236
- True if valid, False otherwise
237
- """
238
- try:
239
- path = Path(dir_path)
240
- return path.exists() and path.is_dir()
241
- except Exception as e:
242
- logger.warning(f"Invalid directory path {dir_path}: {e}")
243
- return False
244
-
245
-
246
- def ensure_directory(dir_path: Union[str, Path]) -> bool:
247
- """
248
- Ensure directory exists, create if it doesn't
249
-
250
- Args:
251
- dir_path: Directory path
252
-
253
- Returns:
254
- True if directory exists or was created, False otherwise
255
- """
256
- try:
257
- path = Path(dir_path)
258
- path.mkdir(parents=True, exist_ok=True)
259
- return True
260
- except Exception as e:
261
- logger.error(f"Could not create directory {dir_path}: {e}")
262
- return False
263
-
264
-
265
- def get_file_size(file_path: Union[str, Path]) -> Optional[int]:
266
- """
267
- Get file size in bytes
268
-
269
- Args:
270
- file_path: Path to file
271
-
272
- Returns:
273
- File size in bytes or None if error
274
- """
275
- try:
276
- return Path(file_path).stat().st_size
277
- except Exception as e:
278
- logger.warning(f"Could not get size for {file_path}: {e}")
279
- return None
280
-
281
-
282
- def format_file_size(size_bytes: int) -> str:
283
- """
284
- Format file size in human-readable format
285
-
286
- Args:
287
- size_bytes: Size in bytes
288
-
289
- Returns:
290
- Formatted size string
291
- """
292
- if size_bytes == 0:
293
- return "0 B"
294
-
295
- size_names = ["B", "KB", "MB", "GB"]
296
- size = size_bytes
297
-
298
- for i, unit in enumerate(size_names):
299
- if size < 1024 or i == len(size_names) - 1:
300
- return f"{size:.1f} {unit}"
301
- size /= 1024
302
-
303
- return f"{size:.1f} GB"
304
-
305
-
306
- def sanitize_filename(filename: str) -> str:
307
- """
308
- Sanitize filename for safe file operations
309
-
310
- Args:
311
- filename: Original filename
312
-
313
- Returns:
314
- Sanitized filename
315
- """
316
- import re
317
-
318
- # Remove or replace invalid characters
319
- sanitized = re.sub(r'[<>:"/\\|?*]', '_', filename)
320
-
321
- # Remove multiple underscores
322
- sanitized = re.sub(r'_+', '_', sanitized)
323
-
324
- # Trim and ensure not empty
325
- sanitized = sanitized.strip('_. ')
326
-
327
- if not sanitized:
328
- sanitized = "untitled"
329
-
330
- return sanitized
331
-
332
-
333
- def get_memory_usage() -> Dict[str, float]:
334
- """
335
- Get current memory usage information
336
-
337
- Returns:
338
- Dictionary with memory usage stats
339
- """
340
- try:
341
- import psutil
342
- process = psutil.Process(os.getpid())
343
- memory_info = process.memory_info()
344
-
345
- return {
346
- 'rss_mb': memory_info.rss / 1024 / 1024, # Resident Set Size
347
- 'vms_mb': memory_info.vms / 1024 / 1024, # Virtual Memory Size
348
- 'percent': process.memory_percent()
349
- }
350
- except ImportError:
351
- logger.warning("psutil not available, cannot get memory usage")
352
- return {}
353
- except Exception as e:
354
- logger.warning(f"Could not get memory usage: {e}")
355
- return {}
356
-
357
-
358
- def timing_decorator(func: Callable) -> Callable:
359
- """
360
- Decorator to time function execution
361
-
362
- Args:
363
- func: Function to time
364
-
365
- Returns:
366
- Decorated function
367
- """
368
- @functools.wraps(func)
369
- def wrapper(*args, **kwargs):
370
- import time
371
- start_time = time.time()
372
- result = func(*args, **kwargs)
373
- end_time = time.time()
374
-
375
- execution_time = end_time - start_time
376
- logger.debug(f"{func.__name__} executed in {execution_time:.2f} seconds")
377
-
378
- return result
379
- return wrapper
380
-
381
-
382
- class ProgressTracker:
383
- """
384
- Progress tracking utility for long-running operations with weighted ETA calculation
385
- """
386
-
387
- def __init__(self, total_steps: int, description: str = "Processing", step_weights: Optional[Dict[int, float]] = None):
388
- """
389
- Initialize progress tracker
390
-
391
- Args:
392
- total_steps: Total number of steps
393
- description: Description of the operation
394
- step_weights: Optional dict mapping step numbers to relative weights (default: all steps equal weight)
395
- """
396
- self.total_steps = total_steps
397
- self.current_step = 0
398
- self.description = description
399
- self.start_time = datetime.now()
400
- self.step_start_times = {} # Track when each step started
401
- self.step_durations = {} # Track actual duration of completed steps
402
-
403
- # Set up step weights (default: equal weight for all steps)
404
- if step_weights:
405
- self.step_weights = step_weights
406
- else:
407
- self.step_weights = {i: 1.0 for i in range(1, total_steps + 1)}
408
-
409
- # Calculate total weight for progress calculation
410
- self.total_weight = sum(self.step_weights.values())
411
-
412
- # Initialize Streamlit progress bar if available
413
- if 'st' in globals() and st:
414
- self.progress_bar = st.progress(0, text=f"{description}...")
415
- self.status_text = st.empty()
416
- else:
417
- self.progress_bar = None
418
- self.status_text = None
419
-
420
- def update(self, step: int, message: str = ""):
421
- """
422
- Update progress with weighted ETA calculation
423
-
424
- Args:
425
- step: Current step number
426
- message: Optional status message
427
- """
428
- now = datetime.now()
429
-
430
- # Record step timing
431
- if self.current_step != step:
432
- # Mark completion of previous step
433
- if self.current_step > 0 and self.current_step in self.step_start_times:
434
- self.step_durations[self.current_step] = (now - self.step_start_times[self.current_step]).total_seconds()
435
-
436
- # Mark start of new step
437
- self.step_start_times[step] = now
438
- self.current_step = step
439
-
440
- # Calculate weighted progress
441
- completed_weight = sum(self.step_weights.get(i, 1.0) for i in range(1, step))
442
- current_step_weight = self.step_weights.get(step, 1.0)
443
-
444
- # For current step, assume 50% completion unless we have sub-progress info
445
- current_progress_weight = completed_weight + (current_step_weight * 0.5)
446
- progress = current_progress_weight / self.total_weight if self.total_weight > 0 else 0
447
- progress = min(progress, 1.0) # Cap at 100%
448
-
449
- # Calculate improved ETA using weighted approach
450
- elapsed = (now - self.start_time).total_seconds()
451
- eta_str = ""
452
-
453
- if step > 1 and completed_weight > 0:
454
- # Use actual timing data from completed steps
455
- avg_time_per_weight = elapsed / completed_weight
456
- remaining_weight = self.total_weight - current_progress_weight
457
- eta = avg_time_per_weight * remaining_weight
458
-
459
- if eta > 1:
460
- if eta < 60:
461
- eta_str = f" (ETA: {eta:.0f}s)"
462
- elif eta < 3600:
463
- eta_str = f" (ETA: {eta/60:.1f}m)"
464
- else:
465
- eta_str = f" (ETA: {eta/3600:.1f}h)"
466
- elif step == 1 and elapsed > 5: # Only show ETA after 5 seconds
467
- # For first step, make a rough estimate based on step weights
468
- estimated_time_per_weight = elapsed / self.step_weights.get(1, 1.0)
469
- remaining_weight = self.total_weight - current_progress_weight
470
- eta = estimated_time_per_weight * remaining_weight
471
-
472
- if eta > 10: # Only show if meaningful
473
- if eta < 60:
474
- eta_str = f" (ETA: ~{eta:.0f}s)"
475
- else:
476
- eta_str = f" (ETA: ~{eta/60:.1f}m)"
477
-
478
- status_msg = f"{self.description}: {step}/{self.total_steps}{eta_str}"
479
- if message:
480
- status_msg += f" - {message}"
481
-
482
- # Update Streamlit components
483
- if self.progress_bar:
484
- self.progress_bar.progress(progress, text=status_msg)
485
-
486
- # Log progress at key milestones
487
- if step == 1 or step % max(1, self.total_steps // 5) == 0: # Log every 20%
488
- logger.info(status_msg)
489
-
490
- def update_step_progress(self, step: int, sub_progress: float, message: str = ""):
491
- """
492
- Update progress within a specific step (for long-running operations)
493
-
494
- Args:
495
- step: Current step number
496
- sub_progress: Progress within the step (0.0 to 1.0)
497
- message: Optional status message
498
- """
499
- now = datetime.now()
500
-
501
- # Ensure we're tracking this step
502
- if step not in self.step_start_times:
503
- self.step_start_times[step] = now
504
- self.current_step = step
505
-
506
- # Calculate weighted progress with sub-progress
507
- completed_weight = sum(self.step_weights.get(i, 1.0) for i in range(1, step))
508
- current_step_weight = self.step_weights.get(step, 1.0)
509
-
510
- # Use actual sub-progress instead of assuming 50%
511
- current_progress_weight = completed_weight + (current_step_weight * sub_progress)
512
- progress = current_progress_weight / self.total_weight if self.total_weight > 0 else 0
513
- progress = min(progress, 1.0) # Cap at 100%
514
-
515
- # Calculate improved ETA
516
- elapsed = (now - self.start_time).total_seconds()
517
- eta_str = ""
518
-
519
- if step > 1 and completed_weight > 0:
520
- # Use actual timing data from completed steps
521
- avg_time_per_weight = elapsed / completed_weight
522
- remaining_weight = self.total_weight - current_progress_weight
523
- eta = avg_time_per_weight * remaining_weight
524
-
525
- if eta > 1:
526
- if eta < 60:
527
- eta_str = f" (ETA: {eta:.0f}s)"
528
- elif eta < 3600:
529
- eta_str = f" (ETA: {eta/60:.1f}m)"
530
- else:
531
- eta_str = f" (ETA: {eta/3600:.1f}h)"
532
- elif step == 1 and elapsed > 5:
533
- # For first step, estimate based on current progress
534
- if sub_progress > 0.1: # Only estimate if we have meaningful progress
535
- step_elapsed = (now - self.step_start_times[step]).total_seconds()
536
- estimated_step_time = step_elapsed / sub_progress
537
- remaining_step_time = estimated_step_time * (1 - sub_progress)
538
-
539
- # Add estimated time for remaining steps
540
- remaining_weight = self.total_weight - self.step_weights.get(step, 1.0)
541
- estimated_time_per_weight = estimated_step_time / self.step_weights.get(step, 1.0)
542
- eta = remaining_step_time + (estimated_time_per_weight * remaining_weight)
543
-
544
- if eta > 10:
545
- if eta < 60:
546
- eta_str = f" (ETA: ~{eta:.0f}s)"
547
- else:
548
- eta_str = f" (ETA: ~{eta/60:.1f}m)"
549
-
550
- status_msg = f"{self.description}: {step}/{self.total_steps}{eta_str}"
551
- if message:
552
- status_msg += f" - {message}"
553
-
554
- # Update Streamlit components
555
- if self.progress_bar:
556
- self.progress_bar.progress(progress, text=status_msg)
557
-
558
- def complete(self, message: str = "Complete"):
559
- """
560
- Mark progress as complete
561
-
562
- Args:
563
- message: Completion message
564
- """
565
- if self.progress_bar:
566
- self.progress_bar.progress(1.0, text=f"{self.description}: {message}")
567
-
568
- elapsed = (datetime.now() - self.start_time).total_seconds()
569
- logger.info(f"{self.description} completed in {elapsed:.1f} seconds")
570
-
571
-
572
- def batch_process(
573
- items: List[Any],
574
- process_func: Callable,
575
- batch_size: int = 10,
576
- description: str = "Processing"
577
- ) -> List[Any]:
578
- """
579
- Process items in batches with progress tracking
580
-
581
- Args:
582
- items: List of items to process
583
- process_func: Function to process each item
584
- batch_size: Size of each batch
585
- description: Description for progress tracking
586
-
587
- Returns:
588
- List of processed results
589
- """
590
- results = []
591
- total_batches = (len(items) + batch_size - 1) // batch_size
592
-
593
- tracker = ProgressTracker(total_batches, description)
594
-
595
- for i in range(0, len(items), batch_size):
596
- batch = items[i:i + batch_size]
597
- batch_num = i // batch_size + 1
598
-
599
- try:
600
- batch_results = [process_func(item) for item in batch]
601
- results.extend(batch_results)
602
-
603
- tracker.update(batch_num, f"Batch {batch_num}/{total_batches}")
604
-
605
- except Exception as e:
606
- logger.error(f"Error processing batch {batch_num}: {e}")
607
- # Continue with remaining batches
608
- continue
609
-
610
- tracker.complete()
611
- return results
612
-
613
-
614
- # Streamlit-specific utilities
615
- def show_success(message: str):
616
- """Show success message in Streamlit"""
617
- if 'st' in globals() and st:
618
- st.success(message)
619
- logger.info(message)
620
-
621
-
622
- def show_info(message: str):
623
- """Show info message in Streamlit"""
624
- if 'st' in globals() and st:
625
- st.info(message)
626
- logger.info(message)
627
-
628
-
629
- def show_warning(message: str):
630
- """Show warning message in Streamlit"""
631
- if 'st' in globals() and st:
632
- st.warning(message)
633
- logger.warning(message)
634
-
635
-
636
- def show_error(message: str):
637
- """Show error message in Streamlit"""
638
- if 'st' in globals() and st:
639
- st.error(message)
640
- logger.error(message)
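For reviewers of this removal: the module above bundled three reusable patterns, an exception-swallowing decorator (handle_exceptions), an ErrorHandler context manager, and a weighted-ETA ProgressTracker. A short sketch of how they were typically combined by calling code, using the signatures from the deleted file (the calling code itself is illustrative and assumed, not part of this commit):

# Illustrative caller of the deleted utilities; signatures taken from the removed src/utils.py.
@handle_exceptions(return_value=[], show_error=True)
def load_documents(paths):
    with ErrorHandler("Document loading failed", reraise=False):
        tracker = ProgressTracker(total_steps=len(paths), description="Loading documents")
        results = []
        for i, path in enumerate(paths, start=1):
            results.append(path.read_text())
            tracker.update(i, message=path.name)
        tracker.complete("All documents loaded")
        return results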
 
uv.lock CHANGED
The diff for this file is too large to render. See raw diff