Krishna172912 committed on
Commit
c6868fa
·
unverified ·
1 Parent(s): 8725d0d

Create splitter.py

Browse files
Files changed (1) hide show
  1. back_end/core/splitter.py +196 -0
back_end/core/splitter.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ import json
3
+ from typing import List
4
+
5
+ from langchain_core.documents import Document
6
+ from langchain_text_splitters import (
7
+ RecursiveCharacterTextSplitter,
8
+ MarkdownHeaderTextSplitter,
9
+ RecursiveJsonSplitter,
10
+ )
11
+ from chonkie import CodeChunker
12
+
13
+ from config import CHUNK_OVERLAP,CHUNK_SIZE,AST_BASED_SPLITTING
14
+
15
def custom_splitter(docs: List[Document],current_dir: Path) -> List[Document]:
    """Split loaded documents into chunks using a strategy chosen by file extension.

    The strategy is selected from the suffix of ``doc.metadata["source"]``:

    * extensions listed in ``AST_BASED_SPLITTING`` -> ``CodeChunker``; any
      exception falls back to the plain recursive text splitter,
    * ``.md`` / ``.mdx`` -> split on H1-H3 headers, then size-limited,
    * ``.json`` -> ``RecursiveJsonSplitter``; invalid JSON falls back to text,
    * ``.jsonl`` -> each non-blank line is split as its own JSON record;
      invalid lines are skipped with a warning,
    * ``.csv`` / ``.tsv`` -> split on newlines, with the header row re-attached
      to every chunk after the first,
    * anything else -> recursive character splitter.

    Documents whose content is empty/whitespace-only, or that have no
    ``"source"`` metadata entry, are skipped.

    Args:
        docs: Documents to split. The input documents are only read here.
        current_dir: Directory used to compute the ``path_rel_repo`` metadata
            value. When the source path is not located under it, the path is
            stored unchanged.

    Returns:
        All generated chunks, in input order. Every chunk carries the source
        document's metadata plus ``file_name``, ``extension`` and
        ``path_rel_repo``, and its text is prefixed with
        ``"[FILE: <file name>]\\n\\n"``.
    """
    all_chunks: List[Document] = []

    md_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "H1"), ("##", "H2"), ("###", "H3")]
    )

    text_fallback_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )

    json_splitter = RecursiveJsonSplitter(
        max_chunk_size=CHUNK_SIZE,
    )

    # Rows must never be cut in half or duplicated, hence a newline-only
    # separator and no overlap.
    csv_splitter = RecursiveCharacterTextSplitter(
        separators=["\n"],
        chunk_size=CHUNK_SIZE,
        chunk_overlap=0,
    )

    # One CodeChunker per extension, built lazily on first use and reused for
    # every later file with the same extension instead of being rebuilt per file.
    ast_chunkers = {}

    for doc in docs:
        # Skip completely empty documents to save compute time.
        if not doc.page_content or not doc.page_content.strip():
            continue

        # Without a source path there is no extension to dispatch on.
        source_str = doc.metadata.get("source", "")
        if not source_str:
            continue

        path = Path(source_str)
        ext = path.suffix.lower()

        try:
            repo_path = str(path.relative_to(current_dir))
        except ValueError:
            # Source is not located under current_dir; keep the path as given.
            repo_path = str(path)

        base_metadata = {
            **doc.metadata,
            "file_name": path.name,
            "extension": ext,
            "path_rel_repo": repo_path,
        }

        doc_chunks: List[Document] = []

        # AST-based code chunking
        if ext in AST_BASED_SPLITTING:
            try:
                # The chunker is created inside the try block so that a failure
                # to set it up (not only a failure while chunking) degrades to
                # the text fallback instead of aborting the whole run.
                ast_chunker = ast_chunkers.get(ext)
                if ast_chunker is None:
                    ast_chunker = CodeChunker(
                        language=AST_BASED_SPLITTING.get(ext),
                        tokenizer="character",
                        chunk_size=CHUNK_SIZE,
                        include_nodes=False,
                    )
                    ast_chunkers[ext] = ast_chunker

                doc_chunks = [
                    Document(
                        page_content=chunk.text,
                        metadata=base_metadata.copy(),
                    )
                    for chunk in ast_chunker.chunk(doc.page_content)
                ]
            except Exception as e:
                # Deliberately broad: any chunker error is treated as
                # "cannot parse" and the file is split as plain text.
                print(
                    f"Warning: AST parsing failed for {path.name}. "
                    f"Falling back to text. Error: {e}"
                )
                doc_chunks = text_fallback_splitter.split_documents([doc])

        # Markdown
        elif ext in {".md", ".mdx"}:
            md_splits = md_splitter.split_text(doc.page_content)
            for split in md_splits:
                # Header metadata produced by the splitter wins on key clashes.
                split.metadata = {**base_metadata, **split.metadata}
            # Header sections can still be large; enforce the size limit.
            doc_chunks = text_fallback_splitter.split_documents(md_splits)

        # JSON
        elif ext == ".json":
            try:
                parsed_data = json.loads(doc.page_content)
            except json.JSONDecodeError as e:
                print(
                    f"Warning: Invalid JSON syntax in {path.name}. "
                    f"Falling back to text. Error: {e}"
                )
                doc_chunks = text_fallback_splitter.split_documents([doc])
            else:
                # Normalize the data: a JSON file may hold a single dictionary,
                # a list, or a bare scalar, while the splitter is fed
                # dictionaries only.
                if isinstance(parsed_data, list):
                    # Treat each list item as a separate record; non-dict
                    # items are wrapped so that every record is a dict.
                    texts_to_split = [
                        item if isinstance(item, dict) else {"value": item}
                        for item in parsed_data
                    ]
                elif isinstance(parsed_data, dict):
                    texts_to_split = [parsed_data]
                else:
                    # Raw string / number / bool / null.
                    texts_to_split = [{"value": parsed_data}]

                # One metadata dict per record, matching texts_to_split.
                metadatas = [base_metadata.copy() for _ in texts_to_split]

                json_docs = json_splitter.create_documents(
                    texts=texts_to_split,
                    metadatas=metadatas,
                )
                doc_chunks.extend(json_docs)

        # JSONL
        elif ext == ".jsonl":
            for line in doc.page_content.splitlines():
                line = line.strip()
                if not line:
                    continue

                try:
                    line_data = json.loads(line)
                except json.JSONDecodeError as e:
                    # One bad line must not discard the rest of the file.
                    print(
                        f"Warning: Invalid JSONL line in {path.name}. "
                        f"Skipping. Error: {e}"
                    )
                    continue

                # Normalize JSONL lines: anything that is not a dict
                # (including a list) is wrapped as a single record.
                if not isinstance(line_data, dict):
                    line_data = {"value": line_data}

                json_docs = json_splitter.create_documents(
                    texts=[line_data],
                    metadatas=[base_metadata.copy()],
                )
                doc_chunks.extend(json_docs)

        # CSV / TSV
        elif ext in {".csv", ".tsv"}:
            # The header is the first non-blank line. Taking the very first
            # line would yield an empty header for files that start with blank
            # lines, and an empty header is never re-attached because
            # str.startswith("") is always true.
            header = next(
                (row for row in doc.page_content.splitlines() if row.strip()),
                "",
            )
            doc_chunks = csv_splitter.split_documents([doc])

            # The first chunk is left untouched; every later chunk gets the
            # header prepended unless it already starts with it.
            for chunk in doc_chunks[1:]:
                # Strip leading whitespace/newlines to prevent broken or
                # malformed line boundaries after the header.
                if not chunk.page_content.startswith(header):
                    chunk.page_content = header + "\n" + chunk.page_content.lstrip()

                chunk.metadata = base_metadata.copy()

        # Fallback
        else:
            doc_chunks = text_fallback_splitter.split_documents([doc])

        # ── FILE NAME INJECTION ───────────────────────────────────────────────
        # Inject the file name into the text payload to give the LLM context.
        for chunk in doc_chunks:
            # Chunk-specific metadata (e.g. markdown headers) overrides the base.
            chunk.metadata = {**base_metadata, **chunk.metadata}
            chunk.page_content = f"[FILE: {path.name}]\n\n" + chunk.page_content
            all_chunks.append(chunk)

    # NOTE: the first figure is the number of input documents, including any
    # that were skipped above.
    print(f"Original Files Processed : {len(docs)}")
    print(f"Total Chunks Generated   : {len(all_chunks)}")

    return all_chunks