juliaturc committed on
Commit
501b0f3
·
1 Parent(s): 72b8d27

Fix crashes on typescript parsing

Browse files
repo2vec/chunker.py CHANGED
@@ -1,6 +1,7 @@
1
  """Chunker abstraction and implementations."""
2
 
3
  import logging
 
4
  from abc import ABC, abstractmethod
5
  from dataclasses import dataclass
6
  from functools import cached_property
@@ -104,6 +105,11 @@ class CodeFileChunker(Chunker):
104
  """Returns a canonical name for the language of the file, based on its extension.
105
  Returns None if the language is unknown to the pygments lexer.
106
  """
 
 
 
 
 
107
  try:
108
  lexer = pygments.lexers.get_lexer_for_filename(filename)
109
  return lexer.name.lower()
@@ -162,7 +168,10 @@ class CodeFileChunker(Chunker):
162
  def is_code_file(filename: str) -> bool:
163
  """Checks whether pygments & tree_sitter can parse the file as code."""
164
  language = CodeFileChunker._get_language_from_filename(filename)
165
- return language and language not in ["text only", "None"]
 
 
 
166
 
167
  @staticmethod
168
  def parse_tree(filename: str, content: str) -> List[str]:
@@ -173,11 +182,21 @@ class CodeFileChunker(Chunker):
173
  logging.debug("%s doesn't seem to be a code file.", filename)
174
  return None
175
 
 
 
 
 
 
 
176
  try:
177
  parser = get_parser(language)
178
  except LookupError:
179
  logging.debug("%s doesn't seem to be a code file.", filename)
180
  return None
 
 
 
 
181
 
182
  tree = parser.parse(bytes(content, "utf8"))
183
 
 
1
  """Chunker abstraction and implementations."""
2
 
3
  import logging
4
+ import os
5
  from abc import ABC, abstractmethod
6
  from dataclasses import dataclass
7
  from functools import cached_property
 
105
  """Returns a canonical name for the language of the file, based on its extension.
106
  Returns None if the language is unknown to the pygments lexer.
107
  """
108
+ # pygments doesn't recognize .tsx files and returns None. So we need to special-case them.
109
+ extension = os.path.splitext(filename)[1]
110
+ if extension == ".tsx":
111
+ return "tsx"
112
+
113
  try:
114
  lexer = pygments.lexers.get_lexer_for_filename(filename)
115
  return lexer.name.lower()
 
168
  def is_code_file(filename: str) -> bool:
169
  """Checks whether pygments & tree_sitter can parse the file as code."""
170
  language = CodeFileChunker._get_language_from_filename(filename)
171
+ # tree-sitter-language-pack crashes on TypeScript files. We'll wait for a bit to see if the issue gets
172
+ # resolved, otherwise we'll have to clone and fix the library.
173
+ # See https://github.com/Goldziher/tree-sitter-language-pack/issues/5
174
+ return language and language not in ["text only", "None", "typescript", "tsx"]
175
 
176
  @staticmethod
177
  def parse_tree(filename: str, content: str) -> List[str]:
 
182
  logging.debug("%s doesn't seem to be a code file.", filename)
183
  return None
184
 
185
+ if language in ["typescript", "tsx"]:
186
+ # tree-sitter-language-pack crashes on TypeScript files. We'll wait for a bit to see if the issue gets
187
+ # resolved, otherwise we'll have to clone and fix the library.
188
+ # See https://github.com/Goldziher/tree-sitter-language-pack/issues/5
189
+ return None
190
+
191
  try:
192
  parser = get_parser(language)
193
  except LookupError:
194
  logging.debug("%s doesn't seem to be a code file.", filename)
195
  return None
196
+ # This should never happen unless there's a bug in the code, but we'd rather not crash.
197
+ except Exception as e:
198
+ logging.warn("Failed to get parser for %s: %s", filename, e)
199
+ return None
200
 
201
  tree = parser.parse(bytes(content, "utf8"))
202
 
setup.py CHANGED
@@ -8,7 +8,7 @@ def readfile(filename):
8
 
9
  setup(
10
  name="repo2vec",
11
- version="0.1.7",
12
  packages=find_packages(),
13
  include_package_data=True,
14
  package_data={
 
8
 
9
  setup(
10
  name="repo2vec",
11
+ version="0.1.8",
12
  packages=find_packages(),
13
  include_package_data=True,
14
  package_data={
tests/assets/sample-script.ts ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ function bubbleSort(arr: number[]): number[] {
2
+ let n = arr.length;
3
+ let swapped: boolean;
4
+
5
+ // Outer loop for traversing the array
6
+ for (let i = 0; i < n; i++) {
7
+ swapped = false;
8
+
9
+ // Inner loop for comparing adjacent elements
10
+ for (let j = 0; j < n - i - 1; j++) {
11
+ if (arr[j] > arr[j + 1]) {
12
+ // Swap the elements if they are in the wrong order
13
+ let temp = arr[j];
14
+ arr[j] = arr[j + 1];
15
+ arr[j + 1] = temp;
16
+ swapped = true;
17
+ }
18
+ }
19
+
20
+ // If no elements were swapped in the inner loop, break out of the loop
21
+ if (!swapped) {
22
+ break;
23
+ }
24
+ }
25
+
26
+ return arr;
27
+ }
28
+
29
+ // Example usage
30
+ let arr = [64, 34, 25, 12, 22, 11, 90];
31
+ console.log("Sorted array:", bubbleSort(arr));
32
+
33
+ function mergeSort(arr: number[]): number[] {
34
+ // Base case: if the array has only one element or is empty, return it
35
+ if (arr.length <= 1) {
36
+ return arr;
37
+ }
38
+
39
+ // Find the middle point of the array
40
+ const middle = Math.floor(arr.length / 2);
41
+
42
+ // Divide the array into left and right halves
43
+ const left = arr.slice(0, middle);
44
+ const right = arr.slice(middle);
45
+
46
+ // Recursively sort both halves and then merge them
47
+ return merge(mergeSort(left), mergeSort(right));
48
+ }
49
+
50
+ // Function to merge two sorted arrays
51
+ function merge(left: number[], right: number[]): number[] {
52
+ let resultArray: number[] = [];
53
+ let leftIndex = 0;
54
+ let rightIndex = 0;
55
+
56
+ // Compare the elements in the left and right arrays and merge them in sorted order
57
+ while (leftIndex < left.length && rightIndex < right.length) {
58
+ if (left[leftIndex] < right[rightIndex]) {
59
+ resultArray.push(left[leftIndex]);
60
+ leftIndex++;
61
+ } else {
62
+ resultArray.push(right[rightIndex]);
63
+ rightIndex++;
64
+ }
65
+ }
66
+
67
+ // Concatenate any remaining elements in the left or right arrays
68
+ return resultArray.concat(left.slice(leftIndex)).concat(right.slice(rightIndex));
69
+ }
70
+
71
+ // Example usage
72
+ let arr2 = [38, 27, 43, 3, 9, 82, 10];
73
+ console.log("Sorted array:", mergeSort(arr2));
74
+
tests/assets/sample-script.tsx ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import React, { useState } from 'react';
2
+
3
+ // Define the types for the props
4
+ interface MyComponentProps {
5
+ title: string;
6
+ subtitle?: string; // Optional prop
7
+ }
8
+
9
+ const MyComponent: React.FC<MyComponentProps> = ({ title, subtitle }) => {
10
+ // Define a state variable with an initial value
11
+ const [count, setCount] = useState<number>(0);
12
+
13
+ // Function to handle button click
14
+ const handleButtonClick = () => {
15
+ setCount(count + 1);
16
+ };
17
+
18
+ return (
19
+ <div style={{ padding: '20px', border: '1px solid #ccc', borderRadius: '8px' }}>
20
+ <h1>{title}</h1>
21
+ {subtitle && <h2>{subtitle}</h2>}
22
+ <p>Current count: {count}</p>
23
+ <button onClick={handleButtonClick}>Increase count</button>
24
+ </div>
25
+ );
26
+ };
27
+
28
+ export default MyComponent;
tests/test_chunker.py CHANGED
@@ -9,6 +9,8 @@ pip install pytest-mock
9
 
10
  import os
11
 
 
 
12
  import repo2vec.chunker
13
 
14
 
@@ -38,6 +40,26 @@ def test_code_chunker_happy_path():
38
  assert len(chunks) >= 1
39
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def test_ipynb_chunker_happy_path():
42
  """Tests the happy path for the IPynbChunker."""
43
  code_chunker = repo2vec.chunker.CodeFileChunker(max_tokens=100)
 
9
 
10
  import os
11
 
12
+ from pytest import mark, param
13
+
14
  import repo2vec.chunker
15
 
16
 
 
40
  assert len(chunks) >= 1
41
 
42
 
43
+ @mark.parametrize("filename", [param("assets/sample-script.ts"), param("assets/sample-script.tsx")])
44
+ def test_code_chunker_typescript(filename):
45
+ """Tests CodeFileChunker on .ts and .tsx files (tree_sitter_language_pack doesn't work out of the box)."""
46
+ file_path = os.path.join(os.path.dirname(__file__), filename)
47
+ with open(file_path, "r") as file:
48
+ content = file.read()
49
+ metadata = {"file_path": file_path}
50
+
51
+ chunker = repo2vec.chunker.CodeFileChunker(max_tokens=100)
52
+ chunks = chunker.chunk(content, metadata)
53
+ # There's a bug in the tree-sitter-language-pack library for TypeScript. Before it gets fixed, we expect this to
54
+ # return an empty list (instead of crashing).
55
+ assert len(chunks) == 0
56
+
57
+ # However, the UniversalFileChunker should fall back on a regular text chunker and return some chunks.
58
+ chunker = repo2vec.chunker.UniversalFileChunker(max_tokens=100)
59
+ chunks = chunker.chunk(content, metadata)
60
+ assert len(chunks) >= 1
61
+
62
+
63
  def test_ipynb_chunker_happy_path():
64
  """Tests the happy path for the IPynbChunker."""
65
  code_chunker = repo2vec.chunker.CodeFileChunker(max_tokens=100)