aman1762 committed on
Commit
089f9ae
·
verified ·
1 Parent(s): dd452e0

Update chunker.py

Browse files
Files changed (1) hide show
  1. chunker.py +64 -10
chunker.py CHANGED
@@ -1,18 +1,72 @@
 
1
  from langchain_core.documents import Document
2
 
3
- MAX_CHARS = 1200
4
-
5
  def chunk_code(file_path: str, code: str):
6
- chunks = []
 
 
 
 
7
 
8
- for i in range(0, len(code), MAX_CHARS):
9
- chunk = code[i:i + MAX_CHARS]
10
- if len(chunk.strip()) > 100:
11
- chunks.append(
 
 
12
  Document(
13
- page_content=chunk,
14
- metadata={"file": file_path}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  )
 
17
 
18
- return chunks
 
1
+ import ast
2
  from langchain_core.documents import Document
3
 
 
 
4
  def chunk_code(file_path: str, code: str):
5
+ """
6
+ Chunk Python code by functions and classes using AST.
7
+ Falls back to whole-file chunk if parsing fails.
8
+ """
9
+ documents = []
10
 
11
+ try:
12
+ tree = ast.parse(code)
13
+ except Exception:
14
+ # Fallback: whole file as one chunk
15
+ if len(code.strip()) > 100:
16
+ documents.append(
17
  Document(
18
+ page_content=code,
19
+ metadata={
20
+ "file": file_path,
21
+ "type": "file"
22
+ }
23
+ )
24
+ )
25
+ return documents
26
+
27
+ for node in ast.walk(tree):
28
+ # -------- FUNCTIONS --------
29
+ if isinstance(node, ast.FunctionDef):
30
+ source = ast.get_source_segment(code, node)
31
+ if source:
32
+ documents.append(
33
+ Document(
34
+ page_content=source,
35
+ metadata={
36
+ "file": file_path,
37
+ "type": "function",
38
+ "name": node.name,
39
+ "line_start": node.lineno
40
+ }
41
+ )
42
  )
43
+
44
+ # -------- CLASSES --------
45
+ elif isinstance(node, ast.ClassDef):
46
+ source = ast.get_source_segment(code, node)
47
+ if source:
48
+ documents.append(
49
+ Document(
50
+ page_content=source,
51
+ metadata={
52
+ "file": file_path,
53
+ "type": "class",
54
+ "name": node.name,
55
+ "line_start": node.lineno
56
+ }
57
+ )
58
+ )
59
+
60
+ # If no functions/classes found, keep whole file
61
+ if not documents and len(code.strip()) > 100:
62
+ documents.append(
63
+ Document(
64
+ page_content=code,
65
+ metadata={
66
+ "file": file_path,
67
+ "type": "file"
68
+ }
69
  )
70
+ )
71
 
72
+ return documents