Francisco Zanartu committed on
Commit ·
c48f556
1
Parent(s): 2d7f9db
refactor: streamline chunk creation by removing total_chunks parameter and updating chunk handling
Browse files
- src/utils/chunking.py +6 -8
src/utils/chunking.py
CHANGED
|
@@ -24,7 +24,6 @@ def get_context_enriched_chunks(
|
|
| 24 |
document_overview,
|
| 25 |
chunk,
|
| 26 |
i,
|
| 27 |
-
len(base_chunks),
|
| 28 |
llm,
|
| 29 |
)
|
| 30 |
|
|
@@ -37,23 +36,22 @@ def create_enriched_document(
|
|
| 37 |
document_overview,
|
| 38 |
chunk,
|
| 39 |
chunk_id,
|
| 40 |
-
total_chunks,
|
| 41 |
llm,
|
| 42 |
):
|
| 43 |
|
| 44 |
metadata = {
|
| 45 |
"chunk_id": chunk_id,
|
| 46 |
-
"
|
| 47 |
-
"
|
| 48 |
-
"chunk": chunk,
|
| 49 |
"document_summary": document_overview,
|
| 50 |
}
|
| 51 |
|
| 52 |
-
chunk_summary = summarize_context(document_overview, chunk, llm)
|
| 53 |
|
| 54 |
metadata["chunk_summary"] = chunk_summary
|
| 55 |
|
| 56 |
-
return Document(page_content=chunk, metadata=metadata)
|
| 57 |
|
| 58 |
|
| 59 |
def document_summary(document_text, llm):
|
|
@@ -100,6 +98,6 @@ def get_base_chunks(document_text, chunk_size, chunk_overlap):
|
|
| 100 |
strip_whitespace=False,
|
| 101 |
)
|
| 102 |
|
| 103 |
-
base_chunks = text_splitter.
|
| 104 |
|
| 105 |
return base_chunks
|
|
|
|
| 24 |
document_overview,
|
| 25 |
chunk,
|
| 26 |
i,
|
|
|
|
| 27 |
llm,
|
| 28 |
)
|
| 29 |
|
|
|
|
| 36 |
document_overview,
|
| 37 |
chunk,
|
| 38 |
chunk_id,
|
|
|
|
| 39 |
llm,
|
| 40 |
):
|
| 41 |
|
| 42 |
metadata = {
|
| 43 |
"chunk_id": chunk_id,
|
| 44 |
+
"chunk_length": len(chunk.page_content),
|
| 45 |
+
"start_index": chunk.metadata.get("start_index", 0),
|
| 46 |
+
"chunk": chunk.page_content,
|
| 47 |
"document_summary": document_overview,
|
| 48 |
}
|
| 49 |
|
| 50 |
+
chunk_summary = summarize_context(document_overview, chunk.page_content, llm)
|
| 51 |
|
| 52 |
metadata["chunk_summary"] = chunk_summary
|
| 53 |
|
| 54 |
+
return Document(page_content=chunk.page_content, metadata=metadata)
|
| 55 |
|
| 56 |
|
| 57 |
def document_summary(document_text, llm):
|
|
|
|
| 98 |
strip_whitespace=False,
|
| 99 |
)
|
| 100 |
|
| 101 |
+
base_chunks = text_splitter.create_documents([document_text])
|
| 102 |
|
| 103 |
return base_chunks
|