Francisco Zanartu committed on
Commit
c48f556
·
1 Parent(s): 2d7f9db

refactor: streamline chunk creation by removing total_chunks parameter and updating chunk handling

Browse files
Files changed (1) hide show
  1. src/utils/chunking.py +6 -8
src/utils/chunking.py CHANGED
@@ -24,7 +24,6 @@ def get_context_enriched_chunks(
24
  document_overview,
25
  chunk,
26
  i,
27
- len(base_chunks),
28
  llm,
29
  )
30
 
@@ -37,23 +36,22 @@ def create_enriched_document(
37
  document_overview,
38
  chunk,
39
  chunk_id,
40
- total_chunks,
41
  llm,
42
  ):
43
 
44
  metadata = {
45
  "chunk_id": chunk_id,
46
- "total_chunks": total_chunks,
47
- "chunk_size": len(chunk),
48
- "chunk": chunk,
49
  "document_summary": document_overview,
50
  }
51
 
52
- chunk_summary = summarize_context(document_overview, chunk, llm)
53
 
54
  metadata["chunk_summary"] = chunk_summary
55
 
56
- return Document(page_content=chunk, metadata=metadata)
57
 
58
 
59
  def document_summary(document_text, llm):
@@ -100,6 +98,6 @@ def get_base_chunks(document_text, chunk_size, chunk_overlap):
100
  strip_whitespace=False,
101
  )
102
 
103
- base_chunks = text_splitter.split_text(document_text)
104
 
105
  return base_chunks
 
24
  document_overview,
25
  chunk,
26
  i,
 
27
  llm,
28
  )
29
 
 
36
  document_overview,
37
  chunk,
38
  chunk_id,
 
39
  llm,
40
  ):
41
 
42
  metadata = {
43
  "chunk_id": chunk_id,
44
+ "chunk_length": len(chunk.page_content),
45
+ "start_index": chunk.metadata.get("start_index", 0),
46
+ "chunk": chunk.page_content,
47
  "document_summary": document_overview,
48
  }
49
 
50
+ chunk_summary = summarize_context(document_overview, chunk.page_content, llm)
51
 
52
  metadata["chunk_summary"] = chunk_summary
53
 
54
+ return Document(page_content=chunk.page_content, metadata=metadata)
55
 
56
 
57
  def document_summary(document_text, llm):
 
98
  strip_whitespace=False,
99
  )
100
 
101
+ base_chunks = text_splitter.create_documents([document_text])
102
 
103
  return base_chunks