cryogenic22 committed on
Commit
354813e
·
verified ·
1 Parent(s): 09e8d84

Create document_chunker.py

Browse files
Files changed (1) hide show
  1. utils/document_chunker.py +96 -0
utils/document_chunker.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
from typing import Callable, List, Tuple

import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
4
+
5
class DocumentChunker:
    """Split document text into overlapping chunks for embedding/retrieval.

    Wraps LangChain's ``RecursiveCharacterTextSplitter`` and enforces a
    rough per-chunk token budget (estimated at ~4 characters per token).
    Errors during chunking are reported via ``st.error`` rather than
    raised, since this runs inside a Streamlit app.
    """

    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        length_function: Callable[[str], int] = len,
        max_tokens_per_chunk: int = 2000
    ):
        """Initialize the document chunker with configurable parameters.

        Args:
            chunk_size: The target size of each text chunk.
            chunk_overlap: The number of characters that overlap between chunks.
            length_function: Function to measure text length (default: character count).
            max_tokens_per_chunk: Maximum number of (estimated) tokens allowed per chunk.
        """
        separators = ["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
            separators=separators
        )
        self.max_tokens_per_chunk = max_tokens_per_chunk
        # BUG FIX: the original code re-split oversized chunks with the SAME
        # splitter that produced them, which simply returns the same oversized
        # chunk back — the token cap was never enforced. Use a dedicated
        # fallback splitter whose chunk_size is the token budget expressed in
        # characters (~4 chars per token).
        self._fallback_splitter = RecursiveCharacterTextSplitter(
            chunk_size=max_tokens_per_chunk * 4,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
            separators=separators
        )

    def chunk_document(self, content: str, metadata: dict) -> List[dict]:
        """Split document content into chunks with metadata.

        Args:
            content: The document text content.
            metadata: Dictionary containing document metadata (e.g., source,
                page numbers). Copied onto every chunk by the splitter.

        Returns:
            List of LangChain ``Document`` objects (each with
            ``page_content`` and ``metadata``). Returns an empty list on
            error (the error is surfaced via ``st.error``).
        """
        try:
            # First pass: split the full text into target-size chunks.
            chunks = self.text_splitter.create_documents(
                texts=[content],
                metadatas=[metadata]
            )

            # Second pass: enforce the token budget. Token count is a rough
            # approximation (4 characters ≈ 1 token).
            processed_chunks = []
            for i, chunk in enumerate(chunks):
                estimated_tokens = len(chunk.page_content) // 4

                if estimated_tokens > self.max_tokens_per_chunk:
                    # Re-split with the smaller fallback splitter so the
                    # pieces actually fit the budget; tag them with the
                    # index of the parent chunk they came from.
                    sub_chunks = self._fallback_splitter.create_documents(
                        texts=[chunk.page_content],
                        metadatas=[{**chunk.metadata, 'sub_chunk': i}]
                    )
                    processed_chunks.extend(sub_chunks)
                else:
                    processed_chunks.append(chunk)

            return processed_chunks

        except Exception as e:
            # Streamlit app boundary: report instead of raising so the UI
            # stays responsive; callers receive an empty list.
            st.error(f"Error chunking document: {str(e)}")
            return []

    def process_documents(
        self,
        documents: List[Tuple[str, str]]
    ) -> Tuple[List[str], List[dict]]:
        """Process multiple documents and their metadata.

        Args:
            documents: List of tuples containing (content, filename).

        Returns:
            Tuple containing (list of chunk contents, list of chunk
            metadata dicts), aligned index-for-index.
        """
        all_chunks = []
        all_metadatas = []

        for content, filename in documents:
            metadata = {
                'source': filename,
                'chunk_index': 0  # placeholder; overwritten per chunk below
            }

            chunks = self.chunk_document(content, metadata)

            # Flatten to parallel lists and number each chunk within its
            # source document.
            for i, chunk in enumerate(chunks):
                chunk.metadata['chunk_index'] = i
                all_chunks.append(chunk.page_content)
                all_metadatas.append(chunk.metadata)

        return all_chunks, all_metadatas