Juan Salas committed
Commit: 2a6c9a4
Parent(s): 0f98252
Reverted embedding caching; added semantic chunking
Files changed:
- app.py +2 -0
- pyproject.toml +1 -0
- requirements.txt +19 -18
- src/document_processing.py +30 -11
- uv.lock +24 -0
app.py
CHANGED
@@ -47,6 +47,7 @@ except ImportError:
     DDChecklistAgent = None
 
 
+
 class DDChecklistApp:
     """
     Main application class that orchestrates all components
@@ -198,6 +199,7 @@ class DDChecklistApp:
 
         return selected_data_room_path, use_ai_features, process_button
 
+
     def render_summary_tab(self):
         """Render the summary and analysis tab"""
         # Strategy selector
pyproject.toml
CHANGED
@@ -21,6 +21,7 @@ dependencies = [
     "langchain-anthropic>=0.1.0",
     "langgraph>=0.0.20",
     "langchain-core>=0.1.0",
+    "langchain-text-splitters>=0.3.10",
 ]
 
 [build-system]
requirements.txt
CHANGED
@@ -1,22 +1,23 @@
-# Core dependencies
-streamlit
-sentence-transformers
-numpy
-pandas
-watchdog
+# Core dependencies - pinned for deployment
+streamlit==1.49.1
+sentence-transformers==5.1.0
+numpy==2.3.2
+pandas==2.3.2
+watchdog==6.0.0
 
-# Document processing
-pymupdf
-python-docx
-joblib
+# Document processing - pinned for deployment
+pymupdf==1.26.4
+python-docx==1.2.0
+joblib==1.5.2
 
-# Environment and configuration
-python-dotenv
+# Environment and configuration - pinned for deployment
+python-dotenv==1.1.1
 
-# Vector store
-faiss-cpu
+# Vector store - pinned for deployment
+faiss-cpu==1.12.0
 
-# AI Enhancement
-langchain-anthropic
-langgraph
-langchain-core
+# AI Enhancement - pinned for deployment
+langchain-anthropic==0.3.19
+langgraph==0.6.6
+langchain-core==0.3.75
+langchain-text-splitters==0.3.10
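Since these pins exist specifically for reproducible deployment, a minimal sanity-check sketch (not part of the commit) can confirm that an environment actually resolved them, using only the standard library and assuming the requirements were already installed:

# Minimal sketch (not part of the commit): confirm a deployment
# environment matches a few of the pins above. Assumes
# `pip install -r requirements.txt` has already run.
from importlib.metadata import version

pins = {
    "streamlit": "1.49.1",
    "langchain-core": "0.3.75",
    "langchain-text-splitters": "0.3.10",
}
for name, expected in pins.items():
    installed = version(name)
    flag = "ok" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{name}=={installed}  {flag}")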
src/document_processing.py
CHANGED
@@ -5,7 +5,7 @@ Document Processing Module
 This module handles all document-related operations including:
 - File text extraction from various formats (PDF, DOCX, TXT, MD)
 - Document scanning and indexing
-- …
+- Semantic text chunking for RAG with better context preservation
 - Document metadata handling
 """
 
@@ -30,6 +30,9 @@ import joblib
 import hashlib
 import time
 
+# Semantic chunking
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
 # Setup logging for thread-safe error handling
 logger = logging.getLogger(__name__)
 
@@ -256,34 +259,50 @@ def scan_data_room(data_room_path: str, max_workers: int = 4, progress_callback=
     return documents
 
 
-def create_chunks_with_metadata(documents: Dict[str, Dict], chunk_size: int = …
+def create_chunks_with_metadata(documents: Dict[str, Dict], chunk_size: int = 2000, overlap: int = 200) -> List[Dict]:
     """
-    Create searchable chunks with full metadata
+    Create searchable chunks with semantic splitting and full metadata.
+    Uses RecursiveCharacterTextSplitter for better context preservation.
 
     Args:
         documents: Dictionary of documents
-        chunk_size: Size of each chunk in words
-        overlap: Overlap between chunks in words
+        chunk_size: Size of each chunk in characters (default: 2000 for ~400 words)
+        overlap: Overlap between chunks in characters (default: 200 for ~50 words)
 
     Returns:
         List of chunk dictionaries with metadata
     """
     chunks = []
 
+    # Initialize semantic text splitter with hierarchical separators
+    # This preserves document structure by prioritizing paragraph breaks,
+    # then sentences, then words
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=overlap,
+        separators=["\n\n", "\n", ".", "!", "?", ",", " "],
+        length_function=len,
+        is_separator_regex=False,
+    )
+
     for doc_path, doc_info in documents.items():
         text = doc_info['text']
-        words = text.split()
 
-        …
-        …
-        …
+        if not text.strip():
+            continue
+
+        # Split text using semantic boundaries
+        semantic_chunks = text_splitter.split_text(text)
+
+        # Create chunks with metadata
+        for i, chunk_text in enumerate(semantic_chunks):
             if chunk_text.strip():
                 chunks.append({
-                    'text': chunk_text,
+                    'text': chunk_text.strip(),
                     'source': doc_info['name'],
                     'path': doc_info['rel_path'],
                     'full_path': doc_path,
-                    'chunk_id': f"…
+                    'chunk_id': f"semantic_chunk_{i}",
                     'metadata': doc_info['metadata']
                 })
 
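For reference, a minimal sketch (not part of the commit) of how the splitter configured above behaves; the sample text is invented, and the configuration is copied verbatim from the diff:

# Minimal sketch (not part of the commit): exercise the same splitter
# configuration the new create_chunks_with_metadata uses.
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ".", "!", "?", ",", " "],
    length_function=len,
    is_separator_regex=False,
)

# Two paragraphs repeated to exceed one chunk; the splitter should cut
# at the paragraph break ("\n\n") before falling back to sentences.
sample = ("Target company overview and corporate structure.\n\n"
          "Financial statements and audit history for the last three years. ") * 30

chunks = text_splitter.split_text(sample)
print(f"{len(chunks)} chunks, max length {max(len(c) for c in chunks)}")

Because consecutive chunks share up to 200 characters of overlap, context that straddles a chunk boundary still appears intact in at least one chunk, which is the "better context preservation" the docstring refers to.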
uv.lock
CHANGED
@@ -181,6 +181,7 @@ dependencies = [
     { name = "faiss-cpu" },
     { name = "langchain-anthropic" },
     { name = "langchain-core" },
+    { name = "langchain-text-splitters" },
     { name = "langgraph" },
     { name = "numpy" },
     { name = "pandas" },
@@ -197,6 +198,7 @@ requires-dist = [
     { name = "faiss-cpu", specifier = ">=1.7.4" },
     { name = "langchain-anthropic", specifier = ">=0.1.0" },
     { name = "langchain-core", specifier = ">=0.1.0" },
+    { name = "langchain-text-splitters", specifier = ">=0.3.10" },
     { name = "langgraph", specifier = ">=0.0.20" },
     { name = "numpy", specifier = ">=1.24.0" },
     { name = "pandas", specifier = ">=2.0.0" },
@@ -552,6 +554,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fb/42/0d0221cce6f168f644d7d96cb6c87c4e42fc55d2941da7a36e970e3ab8ab/langchain_core-0.3.75-py3-none-any.whl", hash = "sha256:03ca1fadf955ee3c7d5806a841f4b3a37b816acea5e61a7e6ba1298c05eea7f5", size = 443986, upload-time = "2025-08-26T15:24:10.883Z" },
 ]
 
+[[package]]
+name = "langchain-text-splitters"
+version = "0.3.10"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "langchain-core" },
+    { name = "pip" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/24/0b/a6ff43f23ff1ca0c6c3d7839eb5573f299eb96fc06d59c336488142fedf3/langchain_text_splitters-0.3.10.tar.gz", hash = "sha256:b00a82b92eb362a9842f7d7a16d6d223fc93a9be4c51c14109be7d15d120c67f", size = 46563, upload-time = "2025-08-28T17:17:44.783Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4c/dc/d64c9990f6aeb209e8d47b34ebaa2b787f3e4c10c99b8a5568a10beda449/langchain_text_splitters-0.3.10-py3-none-any.whl", hash = "sha256:4bc6ebef274d954e79321d0781dc5ab89b79f40c3cb8ba3310cc2d05ff73c945", size = 34040, upload-time = "2025-08-28T17:17:43.36Z" },
+]
+
 [[package]]
 name = "langgraph"
 version = "0.6.6"
@@ -1220,6 +1235,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/34/e7/ae39f538fd6844e982063c3a5e4598b8ced43b9633baa3a85ef33af8c05c/pillow-11.3.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c84d689db21a1c397d001aa08241044aa2069e7587b398c8cc63020390b1c1b8", size = 6984598, upload-time = "2025-07-01T09:16:27.732Z" },
 ]
 
+[[package]]
+name = "pip"
+version = "25.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/20/16/650289cd3f43d5a2fadfd98c68bd1e1e7f2550a1a5326768cddfbcedb2c5/pip-25.2.tar.gz", hash = "sha256:578283f006390f85bb6282dffb876454593d637f5d1be494b5202ce4877e71f2", size = 1840021, upload-time = "2025-07-30T21:50:15.401Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b7/3f/945ef7ab14dc4f9d7f40288d2df998d1837ee0888ec3659c813487572faa/pip-25.2-py3-none-any.whl", hash = "sha256:6d67a2b4e7f14d8b31b8b52648866fa717f45a1eb70e83002f4331d07e953717", size = 1752557, upload-time = "2025-07-30T21:50:13.323Z" },
+]
+
 [[package]]
 name = "protobuf"
 version = "6.32.0"