Spaces:

Asish22
/

code-crawler

Sleeping

App Files Files Community

juliaturc committed on Sep 10, 2024

Commit

501b0f3

1 Parent(s): 72b8d27

Fix crashes on typescript parsing

Browse files

Files changed (5) hide show

repo2vec/chunker.py +20 -1
setup.py +1 -1
tests/assets/sample-script.ts +74 -0
tests/assets/sample-script.tsx +28 -0
tests/test_chunker.py +22 -0

repo2vec/chunker.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """Chunker abstraction and implementations."""
 import logging
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from functools import cached_property
@@ -104,6 +105,11 @@ class CodeFileChunker(Chunker):
         """Returns a canonical name for the language of the file, based on its extension.
         Returns None if the language is unknown to the pygments lexer.
         """
         try:
             lexer = pygments.lexers.get_lexer_for_filename(filename)
             return lexer.name.lower()
@@ -162,7 +168,10 @@ class CodeFileChunker(Chunker):
     def is_code_file(filename: str) -> bool:
         """Checks whether pygment & tree_sitter can parse the file as code."""
         language = CodeFileChunker._get_language_from_filename(filename)
-        return language and language not in ["text only", "None"]
     @staticmethod
     def parse_tree(filename: str, content: str) -> List[str]:
@@ -173,11 +182,21 @@ class CodeFileChunker(Chunker):
             logging.debug("%s doesn't seem to be a code file.", filename)
             return None
         try:
             parser = get_parser(language)
         except LookupError:
             logging.debug("%s doesn't seem to be a code file.", filename)
             return None
         tree = parser.parse(bytes(content, "utf8"))

 """Chunker abstraction and implementations."""
 import logging
+import os
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from functools import cached_property
         """Returns a canonical name for the language of the file, based on its extension.
         Returns None if the language is unknown to the pygments lexer.
         """
+        # pygments doesn't recognize .tsx files and returns None. So we need to special-case them.
+        extension = os.path.splitext(filename)[1]
+        if extension == ".tsx":
+            return "tsx"
         try:
             lexer = pygments.lexers.get_lexer_for_filename(filename)
             return lexer.name.lower()
     def is_code_file(filename: str) -> bool:
         """Checks whether pygment & tree_sitter can parse the file as code."""
         language = CodeFileChunker._get_language_from_filename(filename)
+        # tree-sitter-language-pack crashes on TypeScript files. We'll wait for a bit to see if the issue gets
+        # resolved, otherwise we'll have to clone and fix the library.
+        # See https://github.com/Goldziher/tree-sitter-language-pack/issues/5
+        return language and language not in ["text only", "None", "typescript", "tsx"]
     @staticmethod
     def parse_tree(filename: str, content: str) -> List[str]:
             logging.debug("%s doesn't seem to be a code file.", filename)
             return None
+        if language in ["typescript", "tsx"]:
+            # tree-sitter-language-pack crashes on TypeScript files. We'll wait for a bit to see if the issue gets
+            # resolved, otherwise we'll have to clone and fix the library.
+            # See https://github.com/Goldziher/tree-sitter-language-pack/issues/5
+            return None
         try:
             parser = get_parser(language)
         except LookupError:
             logging.debug("%s doesn't seem to be a code file.", filename)
             return None
+        # This should never happen unless there's a bug in the code, but we'd rather not crash.
+        except Exception as e:
+            logging.warn("Failed to get parser for %s: %s", filename, e)
+            return None
         tree = parser.parse(bytes(content, "utf8"))

setup.py CHANGED Viewed

@@ -8,7 +8,7 @@ def readfile(filename):
 setup(
     name="repo2vec",
-    version="0.1.7",
     packages=find_packages(),
     include_package_data=True,
     package_data={

 setup(
     name="repo2vec",
+    version="0.1.8",
     packages=find_packages(),
     include_package_data=True,
     package_data={

tests/assets/sample-script.ts ADDED Viewed

	@@ -0,0 +1,74 @@

+function bubbleSort(arr: number[]): number[] {
+    let n = arr.length;
+    let swapped: boolean;
+    // Outer loop for traversing the array
+    for (let i = 0; i < n; i++) {
+        swapped = false;
+        // Inner loop for comparing adjacent elements
+        for (let j = 0; j < n - i - 1; j++) {
+            if (arr[j] > arr[j + 1]) {
+                // Swap the elements if they are in the wrong order
+                let temp = arr[j];
+                arr[j] = arr[j + 1];
+                arr[j + 1] = temp;
+                swapped = true;
+            }
+        }
+        // If no elements were swapped in the inner loop, break out of the loop
+        if (!swapped) {
+            break;
+        }
+    }
+    return arr;
+}
+// Example usage
+let arr = [64, 34, 25, 12, 22, 11, 90];
+console.log("Sorted array:", bubbleSort(arr));
+function mergeSort(arr: number[]): number[] {
+    // Base case: if the array has only one element or is empty, return it
+    if (arr.length <= 1) {
+        return arr;
+    }
+    // Find the middle point of the array
+    const middle = Math.floor(arr.length / 2);
+    // Divide the array into left and right halves
+    const left = arr.slice(0, middle);
+    const right = arr.slice(middle);
+    // Recursively sort both halves and then merge them
+    return merge(mergeSort(left), mergeSort(right));
+}
+// Function to merge two sorted arrays
+function merge(left: number[], right: number[]): number[] {
+    let resultArray: number[] = [];
+    let leftIndex = 0;
+    let rightIndex = 0;
+    // Compare the elements in the left and right arrays and merge them in sorted order
+    while (leftIndex < left.length && rightIndex < right.length) {
+        if (left[leftIndex] < right[rightIndex]) {
+            resultArray.push(left[leftIndex]);
+            leftIndex++;
+        } else {
+            resultArray.push(right[rightIndex]);
+            rightIndex++;
+        }
+    }
+    // Concatenate any remaining elements in the left or right arrays
+    return resultArray.concat(left.slice(leftIndex)).concat(right.slice(rightIndex));
+}
+// Example usage
+let arr2 = [38, 27, 43, 3, 9, 82, 10];
+console.log("Sorted array:", mergeSort(arr2));

tests/assets/sample-script.tsx ADDED Viewed

	@@ -0,0 +1,28 @@

+import React, { useState } from 'react';
+// Define the types for the props
+interface MyComponentProps {
+  title: string;
+  subtitle?: string; // Optional prop
+}
+const MyComponent: React.FC<MyComponentProps> = ({ title, subtitle }) => {
+  // Define a state variable with an initial value
+  const [count, setCount] = useState<number>(0);
+  // Function to handle button click
+  const handleButtonClick = () => {
+    setCount(count + 1);
+  };
+  return (
+    <div style={{ padding: '20px', border: '1px solid #ccc', borderRadius: '8px' }}>
+      <h1>{title}</h1>
+      {subtitle && <h2>{subtitle}</h2>}
+      <p>Current count: {count}</p>
+      <button onClick={handleButtonClick}>Increase count</button>
+    </div>
+  );
+};
+export default MyComponent;

tests/test_chunker.py CHANGED Viewed

@@ -9,6 +9,8 @@ pip install pytest-mock
 import os
 import repo2vec.chunker
@@ -38,6 +40,26 @@ def test_code_chunker_happy_path():
     assert len(chunks) >= 1
 def test_ipynb_chunker_happy_path():
     """Tests the happy path for the IPynbChunker."""
     code_chunker = repo2vec.chunker.CodeFileChunker(max_tokens=100)

 import os
+from pytest import mark, param
 import repo2vec.chunker
     assert len(chunks) >= 1
+@mark.parametrize("filename", [param("assets/sample-script.ts"), param("assets/sample-script.tsx")])
+def test_code_chunker_typescript(filename):
+    """Tests CodeFileChunker on .ts and .tsx files (tree_sitter_language_pack doesn't work out of the box)."""
+    file_path = os.path.join(os.path.dirname(__file__), filename)
+    with open(file_path, "r") as file:
+        content = file.read()
+    metadata = {"file_path": file_path}
+    chunker = repo2vec.chunker.CodeFileChunker(max_tokens=100)
+    chunks = chunker.chunk(content, metadata)
+    # There's a bug in the tree-sitter-language-pack library for TypeScript. Before it gets fixed, we expect this to
+    # return an empty list (instead of crashing).
+    assert len(chunks) == 0
+    # However, the UniversalFileChunker should fall back on a regular text chunker and return some chunks.
+    chunker = repo2vec.chunker.UniversalFileChunker(max_tokens=100)
+    chunks = chunker.chunk(content, metadata)
+    assert len(chunks) >= 1
 def test_ipynb_chunker_happy_path():
     """Tests the happy path for the IPynbChunker."""
     code_chunker = repo2vec.chunker.CodeFileChunker(max_tokens=100)