Spaces:
Running
Running
Lorenzo Cian committed on
Commit ·
4ddb694
1
Parent(s): ec63959
Fix chunking for .ts and .tsx files (#73)
Browse files
- pyproject.toml +1 -0
- sage/chunker.py +1 -10
- tests/test_chunker.py +5 -9
pyproject.toml
CHANGED
|
@@ -56,6 +56,7 @@ dependencies = [
|
|
| 56 |
"tokenizers==0.19.1",
|
| 57 |
"transformers==4.44.2",
|
| 58 |
"tree-sitter==0.22.3",
|
|
|
|
| 59 |
"tree-sitter-language-pack==0.2.0",
|
| 60 |
"voyageai==0.2.3",
|
| 61 |
"setuptools" # Added from the setup.py install_requires
|
|
|
|
| 56 |
"tokenizers==0.19.1",
|
| 57 |
"transformers==4.44.2",
|
| 58 |
"tree-sitter==0.22.3",
|
| 59 |
+
"tree-sitter-typescript==0.21.2",
|
| 60 |
"tree-sitter-language-pack==0.2.0",
|
| 61 |
"voyageai==0.2.3",
|
| 62 |
"setuptools" # Added from the setup.py install_requires
|
sage/chunker.py
CHANGED
|
@@ -170,10 +170,7 @@ class CodeFileChunker(Chunker):
|
|
| 170 |
def is_code_file(filename: str) -> bool:
|
| 171 |
"""Checks whether pygment & tree_sitter can parse the file as code."""
|
| 172 |
language = CodeFileChunker._get_language_from_filename(filename)
|
| 173 |
-
|
| 174 |
-
# resolved, otherwise we'll have to clone and fix the library.
|
| 175 |
-
# See https://github.com/Goldziher/tree-sitter-language-pack/issues/5
|
| 176 |
-
return language and language not in ["text only", "None", "typescript", "tsx"]
|
| 177 |
|
| 178 |
@staticmethod
|
| 179 |
def parse_tree(filename: str, content: str) -> List[str]:
|
|
@@ -184,12 +181,6 @@ class CodeFileChunker(Chunker):
|
|
| 184 |
logging.debug("%s doesn't seem to be a code file.", filename)
|
| 185 |
return None
|
| 186 |
|
| 187 |
-
if language in ["typescript", "tsx"]:
|
| 188 |
-
# tree-sitter-language-pack crashes on TypeScript files. We'll wait for a bit to see if the issue gets
|
| 189 |
-
# resolved, otherwise we'll have to clone and fix the library.
|
| 190 |
-
# See https://github.com/Goldziher/tree-sitter-language-pack/issues/5
|
| 191 |
-
return None
|
| 192 |
-
|
| 193 |
try:
|
| 194 |
parser = get_parser(language)
|
| 195 |
except LookupError:
|
|
|
|
| 170 |
def is_code_file(filename: str) -> bool:
|
| 171 |
"""Checks whether pygment & tree_sitter can parse the file as code."""
|
| 172 |
language = CodeFileChunker._get_language_from_filename(filename)
|
| 173 |
+
return language and language not in ["text only", "None"]
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
@staticmethod
|
| 176 |
def parse_tree(filename: str, content: str) -> List[str]:
|
|
|
|
| 181 |
logging.debug("%s doesn't seem to be a code file.", filename)
|
| 182 |
return None
|
| 183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
try:
|
| 185 |
parser = get_parser(language)
|
| 186 |
except LookupError:
|
tests/test_chunker.py
CHANGED
|
@@ -40,9 +40,11 @@ def test_code_chunker_happy_path():
|
|
| 40 |
assert len(chunks) >= 1
|
| 41 |
|
| 42 |
|
| 43 |
-
@mark.parametrize(
|
| 44 |
-
|
| 45 |
-
|
|
|
|
|
|
|
| 46 |
file_path = os.path.join(os.path.dirname(__file__), filename)
|
| 47 |
with open(file_path, "r") as file:
|
| 48 |
content = file.read()
|
|
@@ -50,13 +52,7 @@ def test_code_chunker_typescript(filename):
|
|
| 50 |
|
| 51 |
chunker = sage.chunker.CodeFileChunker(max_tokens=100)
|
| 52 |
chunks = chunker.chunk(content, metadata)
|
| 53 |
-
# There's a bug in the tree-sitter-language-pack library for TypeScript. Before it gets fixed, we expect this to
|
| 54 |
-
# return an empty list (instead of crashing).
|
| 55 |
-
assert len(chunks) == 0
|
| 56 |
|
| 57 |
-
# However, the UniversalFileChunker should fallback onto a regular text chunker, and return some chunks.
|
| 58 |
-
chunker = sage.chunker.UniversalFileChunker(max_tokens=100)
|
| 59 |
-
chunks = chunker.chunk(content, metadata)
|
| 60 |
assert len(chunks) >= 1
|
| 61 |
|
| 62 |
|
|
|
|
| 40 |
assert len(chunks) >= 1
|
| 41 |
|
| 42 |
|
| 43 |
+
@mark.parametrize(
|
| 44 |
+
"filename", [param("assets/sample-script.ts"), param("assets/sample-script.tsx")]
|
| 45 |
+
)
|
| 46 |
+
def test_code_chunker_typescript_happy_path(filename):
|
| 47 |
+
"""Tests the happy path for the CodeFileChunker on .ts and .tsx files."""
|
| 48 |
file_path = os.path.join(os.path.dirname(__file__), filename)
|
| 49 |
with open(file_path, "r") as file:
|
| 50 |
content = file.read()
|
|
|
|
| 52 |
|
| 53 |
chunker = sage.chunker.CodeFileChunker(max_tokens=100)
|
| 54 |
chunks = chunker.chunk(content, metadata)
|
|
|
|
|
|
|
|
|
|
| 55 |
|
|
|
|
|
|
|
|
|
|
| 56 |
assert len(chunks) >= 1
|
| 57 |
|
| 58 |
|