Juan Salas committed
Commit 2a6c9a4 · 1 Parent(s): 0f98252

Reverted embedding caching; added semantic chunking

Files changed (5)
  1. app.py +2 -0
  2. pyproject.toml +1 -0
  3. requirements.txt +19 -18
  4. src/document_processing.py +30 -11
  5. uv.lock +24 -0
app.py CHANGED
@@ -47,6 +47,7 @@ except ImportError:
     DDChecklistAgent = None
 
 
+
 class DDChecklistApp:
     """
     Main application class that orchestrates all components
@@ -198,6 +199,7 @@ class DDChecklistApp:
 
         return selected_data_room_path, use_ai_features, process_button
 
+
     def render_summary_tab(self):
         """Render the summary and analysis tab"""
         # Strategy selector
pyproject.toml CHANGED
@@ -21,6 +21,7 @@ dependencies = [
     "langchain-anthropic>=0.1.0",
     "langgraph>=0.0.20",
     "langchain-core>=0.1.0",
+    "langchain-text-splitters>=0.3.10",
 ]
 
 [build-system]
requirements.txt CHANGED
@@ -1,22 +1,23 @@
-# Core dependencies
-streamlit>=1.28.0
-sentence-transformers>=2.2.0
-numpy>=1.24.0
-pandas>=2.0.0
-watchdog>=3.0.0  # For auto-reload during development
+# Core dependencies - pinned for deployment
+streamlit==1.49.1
+sentence-transformers==5.1.0
+numpy==2.3.2
+pandas==2.3.2
+watchdog==6.0.0
 
-# Document processing
-pymupdf>=1.23.0
-python-docx>=0.8.11
-joblib>=1.3.0
+# Document processing - pinned for deployment
+pymupdf==1.26.4
+python-docx==1.2.0
+joblib==1.5.2
 
-# Environment and configuration
-python-dotenv>=1.0.0
+# Environment and configuration - pinned for deployment
+python-dotenv==1.1.1
 
-# Vector store
-faiss-cpu>=1.7.4
+# Vector store - pinned for deployment
+faiss-cpu==1.12.0
 
-# AI Enhancement (always included for consistency)
-langchain-anthropic>=0.1.0
-langgraph>=0.0.20
-langchain-core>=0.1.0
+# AI Enhancement - pinned for deployment
+langchain-anthropic==0.3.19
+langgraph==0.6.6
+langchain-core==0.3.75
+langchain-text-splitters==0.3.10
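For deployments that rely on these pins, a quick runtime check can catch drift between the pinned requirements and the installed environment. A minimal sketch using the standard library's importlib.metadata; the subset of packages checked here is illustrative, and this script is not part of the commit:

    # Verify a few of the pinned versions at runtime (illustrative subset).
    from importlib.metadata import PackageNotFoundError, version

    PINS = {
        "streamlit": "1.49.1",
        "faiss-cpu": "1.12.0",
        "langchain-text-splitters": "0.3.10",
    }

    for name, expected in PINS.items():
        try:
            installed = version(name)
        except PackageNotFoundError:
            print(f"{name}: NOT INSTALLED (expected {expected})")
            continue
        status = "OK" if installed == expected else f"MISMATCH (expected {expected})"
        print(f"{name}=={installed}  {status}")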
src/document_processing.py CHANGED
@@ -5,7 +5,7 @@ Document Processing Module
 This module handles all document-related operations including:
 - File text extraction from various formats (PDF, DOCX, TXT, MD)
 - Document scanning and indexing
-- Text chunking for RAG
+- Semantic text chunking for RAG with better context preservation
 - Document metadata handling
 """
 
@@ -30,6 +30,9 @@ import joblib
 import hashlib
 import time
 
+# Semantic chunking
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+
 # Setup logging for thread-safe error handling
 logger = logging.getLogger(__name__)
 
@@ -256,34 +259,50 @@ def scan_data_room(data_room_path: str, max_workers: int = 4, progress_callback=
     return documents
 
 
-def create_chunks_with_metadata(documents: Dict[str, Dict], chunk_size: int = 400, overlap: int = 50) -> List[Dict]:
+def create_chunks_with_metadata(documents: Dict[str, Dict], chunk_size: int = 2000, overlap: int = 200) -> List[Dict]:
     """
-    Create searchable chunks with full metadata
+    Create searchable chunks with semantic splitting and full metadata.
+    Uses RecursiveCharacterTextSplitter for better context preservation.
 
     Args:
         documents: Dictionary of documents
-        chunk_size: Size of each chunk in words
-        overlap: Overlap between chunks in words
+        chunk_size: Size of each chunk in characters (default: 2000 for ~400 words)
+        overlap: Overlap between chunks in characters (default: 200 for ~50 words)
 
     Returns:
        List of chunk dictionaries with metadata
     """
     chunks = []
 
+    # Initialize semantic text splitter with hierarchical separators
+    # This preserves document structure by prioritizing paragraph breaks,
+    # then sentences, then words
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=overlap,
+        separators=["\n\n", "\n", ".", "!", "?", ",", " "],
+        length_function=len,
+        is_separator_regex=False,
+    )
+
     for doc_path, doc_info in documents.items():
         text = doc_info['text']
-        words = text.split()
 
-        # Create overlapping chunks
-        for i in range(0, len(words), chunk_size - overlap):
-            chunk_text = ' '.join(words[i:i + chunk_size])
+        if not text.strip():
+            continue
+
+        # Split text using semantic boundaries
+        semantic_chunks = text_splitter.split_text(text)
+
+        # Create chunks with metadata
+        for i, chunk_text in enumerate(semantic_chunks):
             if chunk_text.strip():
                 chunks.append({
-                    'text': chunk_text,
+                    'text': chunk_text.strip(),
                     'source': doc_info['name'],
                     'path': doc_info['rel_path'],
                     'full_path': doc_path,
-                    'chunk_id': f"chunk_{i}",
+                    'chunk_id': f"semantic_chunk_{i}",
                     'metadata': doc_info['metadata']
                 })
 
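The core of this change is the switch from a fixed 400-word sliding window to RecursiveCharacterTextSplitter, which tries each separator in order (paragraph breaks, then line breaks, then sentence punctuation, then spaces) and only falls back to a finer separator when a piece still exceeds chunk_size, so paragraphs and sentences tend to stay intact. A minimal sketch of the splitter with the commit's new defaults, runnable on its own; the sample text is made up for illustration:

    # Demonstrate the hierarchical splitting used by the new
    # create_chunks_with_metadata (sample text is illustrative).
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ".", "!", "?", ",", " "],
        length_function=len,
        is_separator_regex=False,
    )

    sample = "\n\n".join(
        f"Paragraph {n}. " + "Lorem ipsum dolor sit amet. " * 40
        for n in range(5)
    )

    for i, chunk in enumerate(splitter.split_text(sample)):
        # Splits land on paragraph boundaries first, so each chunk stays
        # under ~2000 characters while keeping whole paragraphs together.
        print(f"semantic_chunk_{i}: {len(chunk)} chars")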
uv.lock CHANGED
@@ -181,6 +181,7 @@ dependencies = [
     { name = "faiss-cpu" },
     { name = "langchain-anthropic" },
     { name = "langchain-core" },
+    { name = "langchain-text-splitters" },
     { name = "langgraph" },
     { name = "numpy" },
     { name = "pandas" },
@@ -197,6 +198,7 @@ requires-dist = [
     { name = "faiss-cpu", specifier = ">=1.7.4" },
     { name = "langchain-anthropic", specifier = ">=0.1.0" },
     { name = "langchain-core", specifier = ">=0.1.0" },
+    { name = "langchain-text-splitters", specifier = ">=0.3.10" },
     { name = "langgraph", specifier = ">=0.0.20" },
     { name = "numpy", specifier = ">=1.24.0" },
     { name = "pandas", specifier = ">=2.0.0" },
@@ -552,6 +554,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fb/42/0d0221cce6f168f644d7d96cb6c87c4e42fc55d2941da7a36e970e3ab8ab/langchain_core-0.3.75-py3-none-any.whl", hash = "sha256:03ca1fadf955ee3c7d5806a841f4b3a37b816acea5e61a7e6ba1298c05eea7f5", size = 443986, upload-time = "2025-08-26T15:24:10.883Z" },
 ]
 
+[[package]]
+name = "langchain-text-splitters"
+version = "0.3.10"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "langchain-core" },
+    { name = "pip" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/24/0b/a6ff43f23ff1ca0c6c3d7839eb5573f299eb96fc06d59c336488142fedf3/langchain_text_splitters-0.3.10.tar.gz", hash = "sha256:b00a82b92eb362a9842f7d7a16d6d223fc93a9be4c51c14109be7d15d120c67f", size = 46563, upload-time = "2025-08-28T17:17:44.783Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4c/dc/d64c9990f6aeb209e8d47b34ebaa2b787f3e4c10c99b8a5568a10beda449/langchain_text_splitters-0.3.10-py3-none-any.whl", hash = "sha256:4bc6ebef274d954e79321d0781dc5ab89b79f40c3cb8ba3310cc2d05ff73c945", size = 34040, upload-time = "2025-08-28T17:17:43.36Z" },
+]
+
 [[package]]
 name = "langgraph"
 version = "0.6.6"
@@ -1220,6 +1235,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/34/e7/ae39f538fd6844e982063c3a5e4598b8ced43b9633baa3a85ef33af8c05c/pillow-11.3.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c84d689db21a1c397d001aa08241044aa2069e7587b398c8cc63020390b1c1b8", size = 6984598, upload-time = "2025-07-01T09:16:27.732Z" },
 ]
 
+[[package]]
+name = "pip"
+version = "25.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/20/16/650289cd3f43d5a2fadfd98c68bd1e1e7f2550a1a5326768cddfbcedb2c5/pip-25.2.tar.gz", hash = "sha256:578283f006390f85bb6282dffb876454593d637f5d1be494b5202ce4877e71f2", size = 1840021, upload-time = "2025-07-30T21:50:15.401Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b7/3f/945ef7ab14dc4f9d7f40288d2df998d1837ee0888ec3659c813487572faa/pip-25.2-py3-none-any.whl", hash = "sha256:6d67a2b4e7f14d8b31b8b52648866fa717f45a1eb70e83002f4331d07e953717", size = 1752557, upload-time = "2025-07-30T21:50:13.323Z" },
+]
+
 [[package]]
 name = "protobuf"
 version = "6.32.0"