Juan Salas committed
Commit: 2a6c9a4
Parent(s): 0f98252
Reverted embedding caching; added semantic chunking
Files changed:
- app.py +2 -0
- pyproject.toml +1 -0
- requirements.txt +19 -18
- src/document_processing.py +30 -11
- uv.lock +24 -0
app.py
CHANGED
@@ -47,6 +47,7 @@ except ImportError:
     DDChecklistAgent = None
 
 
+
 class DDChecklistApp:
     """
     Main application class that orchestrates all components
@@ -198,6 +199,7 @@ class DDChecklistApp:
 
         return selected_data_room_path, use_ai_features, process_button
 
+
     def render_summary_tab(self):
         """Render the summary and analysis tab"""
         # Strategy selector
pyproject.toml
CHANGED
@@ -21,6 +21,7 @@ dependencies = [
     "langchain-anthropic>=0.1.0",
     "langgraph>=0.0.20",
     "langchain-core>=0.1.0",
+    "langchain-text-splitters>=0.3.10",
 ]
 
 [build-system]
requirements.txt
CHANGED
@@ -1,22 +1,23 @@
-# Core dependencies
-streamlit
-sentence-transformers
-numpy
-pandas
-watchdog
+# Core dependencies - pinned for deployment
+streamlit==1.49.1
+sentence-transformers==5.1.0
+numpy==2.3.2
+pandas==2.3.2
+watchdog==6.0.0
 
-# Document processing
-pymupdf
-python-docx
-joblib
+# Document processing - pinned for deployment
+pymupdf==1.26.4
+python-docx==1.2.0
+joblib==1.5.2
 
-# Environment and configuration
-python-dotenv
+# Environment and configuration - pinned for deployment
+python-dotenv==1.1.1
 
-# Vector store
-faiss-cpu
+# Vector store - pinned for deployment
+faiss-cpu==1.12.0
 
-# AI Enhancement
-langchain-anthropic
-langgraph
-langchain-core
+# AI Enhancement - pinned for deployment
+langchain-anthropic==0.3.19
+langgraph==0.6.6
+langchain-core==0.3.75
+langchain-text-splitters==0.3.10
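Since these pins exist specifically for reproducible deployment, a minimal sanity-check sketch (not part of the commit) can confirm that an environment actually resolved them, using only the standard library and assuming the requirements were already installed:

# Minimal sketch (not part of the commit): confirm a deployment
# environment matches a few of the pins above. Assumes
# `pip install -r requirements.txt` has already run.
from importlib.metadata import version

pins = {
    "streamlit": "1.49.1",
    "langchain-core": "0.3.75",
    "langchain-text-splitters": "0.3.10",
}
for name, expected in pins.items():
    installed = version(name)
    flag = "ok" if installed == expected else f"MISMATCH (expected {expected})"
    print(f"{name}=={installed}  {flag}")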
src/document_processing.py
CHANGED
@@ -5,7 +5,7 @@ Document Processing Module
 This module handles all document-related operations including:
 - File text extraction from various formats (PDF, DOCX, TXT, MD)
 - Document scanning and indexing
-- …
+- Semantic text chunking for RAG with better context preservation
 - Document metadata handling
 """
 
@@ -30,6 +30,9 @@ import joblib
 import hashlib
 import time
 
+# Semantic chunking
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
 # Setup logging for thread-safe error handling
 logger = logging.getLogger(__name__)
 
@@ -256,34 +259,50 @@ def scan_data_room(data_room_path: str, max_workers: int = 4, progress_callback=
     return documents
 
 
-def create_chunks_with_metadata(documents: Dict[str, Dict], chunk_size: int = …
+def create_chunks_with_metadata(documents: Dict[str, Dict], chunk_size: int = 2000, overlap: int = 200) -> List[Dict]:
     """
-    Create searchable chunks with full metadata
+    Create searchable chunks with semantic splitting and full metadata.
+    Uses RecursiveCharacterTextSplitter for better context preservation.
 
     Args:
         documents: Dictionary of documents
-        chunk_size: Size of each chunk in words
-        overlap: Overlap between chunks in words
+        chunk_size: Size of each chunk in characters (default: 2000 for ~400 words)
+        overlap: Overlap between chunks in characters (default: 200 for ~50 words)
 
     Returns:
         List of chunk dictionaries with metadata
     """
     chunks = []
 
+    # Initialize semantic text splitter with hierarchical separators
+    # This preserves document structure by prioritizing paragraph breaks,
+    # then sentences, then words
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=overlap,
+        separators=["\n\n", "\n", ".", "!", "?", ",", " "],
+        length_function=len,
+        is_separator_regex=False,
+    )
+
     for doc_path, doc_info in documents.items():
         text = doc_info['text']
-        words = text.split()
 
-        …
-        …
-        …
+        if not text.strip():
+            continue
+
+        # Split text using semantic boundaries
+        semantic_chunks = text_splitter.split_text(text)
+
+        # Create chunks with metadata
+        for i, chunk_text in enumerate(semantic_chunks):
             if chunk_text.strip():
                 chunks.append({
-                    'text': chunk_text,
+                    'text': chunk_text.strip(),
                     'source': doc_info['name'],
                     'path': doc_info['rel_path'],
                     'full_path': doc_path,
-                    'chunk_id': f"…
+                    'chunk_id': f"semantic_chunk_{i}",
                     'metadata': doc_info['metadata']
                 })
 
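For reference, a minimal sketch (not part of the commit) of how the splitter configured above behaves; the sample text is invented, and the configuration is copied verbatim from the diff:

# Minimal sketch (not part of the commit): exercise the same splitter
# configuration the new create_chunks_with_metadata uses.
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ".", "!", "?", ",", " "],
    length_function=len,
    is_separator_regex=False,
)

# Two paragraphs repeated to exceed one chunk; the splitter should cut
# at the paragraph break ("\n\n") before falling back to sentences.
sample = ("Target company overview and corporate structure.\n\n"
          "Financial statements and audit history for the last three years. ") * 30

chunks = text_splitter.split_text(sample)
print(f"{len(chunks)} chunks, max length {max(len(c) for c in chunks)}")

Because consecutive chunks share up to 200 characters of overlap, context that straddles a chunk boundary still appears intact in at least one chunk, which is the "better context preservation" the docstring refers to.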
uv.lock
CHANGED
@@ -181,6 +181,7 @@ dependencies = [
     { name = "faiss-cpu" },
     { name = "langchain-anthropic" },
     { name = "langchain-core" },
+    { name = "langchain-text-splitters" },
     { name = "langgraph" },
     { name = "numpy" },
     { name = "pandas" },
@@ -197,6 +198,7 @@ requires-dist = [
     { name = "faiss-cpu", specifier = ">=1.7.4" },
     { name = "langchain-anthropic", specifier = ">=0.1.0" },
     { name = "langchain-core", specifier = ">=0.1.0" },
+    { name = "langchain-text-splitters", specifier = ">=0.3.10" },
     { name = "langgraph", specifier = ">=0.0.20" },
     { name = "numpy", specifier = ">=1.24.0" },
     { name = "pandas", specifier = ">=2.0.0" },
@@ -552,6 +554,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fb/42/0d0221cce6f168f644d7d96cb6c87c4e42fc55d2941da7a36e970e3ab8ab/langchain_core-0.3.75-py3-none-any.whl", hash = "sha256:03ca1fadf955ee3c7d5806a841f4b3a37b816acea5e61a7e6ba1298c05eea7f5", size = 443986, upload-time = "2025-08-26T15:24:10.883Z" },
 ]
 
+[[package]]
+name = "langchain-text-splitters"
+version = "0.3.10"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "langchain-core" },
+    { name = "pip" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/24/0b/a6ff43f23ff1ca0c6c3d7839eb5573f299eb96fc06d59c336488142fedf3/langchain_text_splitters-0.3.10.tar.gz", hash = "sha256:b00a82b92eb362a9842f7d7a16d6d223fc93a9be4c51c14109be7d15d120c67f", size = 46563, upload-time = "2025-08-28T17:17:44.783Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4c/dc/d64c9990f6aeb209e8d47b34ebaa2b787f3e4c10c99b8a5568a10beda449/langchain_text_splitters-0.3.10-py3-none-any.whl", hash = "sha256:4bc6ebef274d954e79321d0781dc5ab89b79f40c3cb8ba3310cc2d05ff73c945", size = 34040, upload-time = "2025-08-28T17:17:43.36Z" },
+]
+
 [[package]]
 name = "langgraph"
 version = "0.6.6"
@@ -1220,6 +1235,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/34/e7/ae39f538fd6844e982063c3a5e4598b8ced43b9633baa3a85ef33af8c05c/pillow-11.3.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c84d689db21a1c397d001aa08241044aa2069e7587b398c8cc63020390b1c1b8", size = 6984598, upload-time = "2025-07-01T09:16:27.732Z" },
 ]
 
+[[package]]
+name = "pip"
+version = "25.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/20/16/650289cd3f43d5a2fadfd98c68bd1e1e7f2550a1a5326768cddfbcedb2c5/pip-25.2.tar.gz", hash = "sha256:578283f006390f85bb6282dffb876454593d637f5d1be494b5202ce4877e71f2", size = 1840021, upload-time = "2025-07-30T21:50:15.401Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b7/3f/945ef7ab14dc4f9d7f40288d2df998d1837ee0888ec3659c813487572faa/pip-25.2-py3-none-any.whl", hash = "sha256:6d67a2b4e7f14d8b31b8b52648866fa717f45a1eb70e83002f4331d07e953717", size = 1752557, upload-time = "2025-07-30T21:50:13.323Z" },
+]
+
 [[package]]
 name = "protobuf"
 version = "6.32.0"