Lorenzo Cian committed on
Commit
4ddb694
·
1 Parent(s): ec63959

Fix chunking for .ts and .tsx files (#73)

Browse files
Files changed (3) hide show
  1. pyproject.toml +1 -0
  2. sage/chunker.py +1 -10
  3. tests/test_chunker.py +5 -9
pyproject.toml CHANGED
@@ -56,6 +56,7 @@ dependencies = [
56
  "tokenizers==0.19.1",
57
  "transformers==4.44.2",
58
  "tree-sitter==0.22.3",
 
59
  "tree-sitter-language-pack==0.2.0",
60
  "voyageai==0.2.3",
61
  "setuptools" # Added from the setup.py install_requires
 
56
  "tokenizers==0.19.1",
57
  "transformers==4.44.2",
58
  "tree-sitter==0.22.3",
59
+ "tree-sitter-typescript==0.21.2",
60
  "tree-sitter-language-pack==0.2.0",
61
  "voyageai==0.2.3",
62
  "setuptools" # Added from the setup.py install_requires
sage/chunker.py CHANGED
@@ -170,10 +170,7 @@ class CodeFileChunker(Chunker):
170
  def is_code_file(filename: str) -> bool:
171
  """Checks whether pygment & tree_sitter can parse the file as code."""
172
  language = CodeFileChunker._get_language_from_filename(filename)
173
- # tree-sitter-language-pack crashes on TypeScript files. We'll wait for a bit to see if the issue gets
174
- # resolved, otherwise we'll have to clone and fix the library.
175
- # See https://github.com/Goldziher/tree-sitter-language-pack/issues/5
176
- return language and language not in ["text only", "None", "typescript", "tsx"]
177
 
178
  @staticmethod
179
  def parse_tree(filename: str, content: str) -> List[str]:
@@ -184,12 +181,6 @@ class CodeFileChunker(Chunker):
184
  logging.debug("%s doesn't seem to be a code file.", filename)
185
  return None
186
 
187
- if language in ["typescript", "tsx"]:
188
- # tree-sitter-language-pack crashes on TypeScript files. We'll wait for a bit to see if the issue gets
189
- # resolved, otherwise we'll have to clone and fix the library.
190
- # See https://github.com/Goldziher/tree-sitter-language-pack/issues/5
191
- return None
192
-
193
  try:
194
  parser = get_parser(language)
195
  except LookupError:
 
170
  def is_code_file(filename: str) -> bool:
171
  """Checks whether pygment & tree_sitter can parse the file as code."""
172
  language = CodeFileChunker._get_language_from_filename(filename)
173
+ return language and language not in ["text only", "None"]
 
 
 
174
 
175
  @staticmethod
176
  def parse_tree(filename: str, content: str) -> List[str]:
 
181
  logging.debug("%s doesn't seem to be a code file.", filename)
182
  return None
183
 
 
 
 
 
 
 
184
  try:
185
  parser = get_parser(language)
186
  except LookupError:
tests/test_chunker.py CHANGED
@@ -40,9 +40,11 @@ def test_code_chunker_happy_path():
40
  assert len(chunks) >= 1
41
 
42
 
43
- @mark.parametrize("filename", [param("assets/sample-script.ts"), param("assets/sample-script.tsx")])
44
- def test_code_chunker_typescript(filename):
45
- """Tests CodeFileChunker on .ts and .tsx files (tree_sitter_language_pack doesn't work out of the box)."""
 
 
46
  file_path = os.path.join(os.path.dirname(__file__), filename)
47
  with open(file_path, "r") as file:
48
  content = file.read()
@@ -50,13 +52,7 @@ def test_code_chunker_typescript(filename):
50
 
51
  chunker = sage.chunker.CodeFileChunker(max_tokens=100)
52
  chunks = chunker.chunk(content, metadata)
53
- # There's a bug in the tree-sitter-language-pack library for TypeScript. Before it gets fixed, we expect this to
54
- # return an empty list (instead of crashing).
55
- assert len(chunks) == 0
56
 
57
- # However, the UniversalFileChunker should fallback onto a regular text chunker, and return some chunks.
58
- chunker = sage.chunker.UniversalFileChunker(max_tokens=100)
59
- chunks = chunker.chunk(content, metadata)
60
  assert len(chunks) >= 1
61
 
62
 
 
40
  assert len(chunks) >= 1
41
 
42
 
43
+ @mark.parametrize(
44
+ "filename", [param("assets/sample-script.ts"), param("assets/sample-script.tsx")]
45
+ )
46
+ def test_code_chunker_typescript_happy_path(filename):
47
+ """Tests the happy path for the CodeFileChunker on .ts and .tsx files."""
48
  file_path = os.path.join(os.path.dirname(__file__), filename)
49
  with open(file_path, "r") as file:
50
  content = file.read()
 
52
 
53
  chunker = sage.chunker.CodeFileChunker(max_tokens=100)
54
  chunks = chunker.chunk(content, metadata)
 
 
 
55
 
 
 
 
56
  assert len(chunks) >= 1
57
 
58