File size: 2,322 Bytes
3ec78dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import logging
import os
from dotenv import load_dotenv
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

from .utils.logger_utils import setup_logger
load_dotenv()


# Name under which the parser's logger is configured and looked up.
LOGGER_NAME = 'CODE_PARSER_LOGGER'

# Chunking parameters. Each one can be overridden through the environment
# variable of the same name; the second argument is the fallback used when
# the variable is not set.
CODE_CHUNK_OVERLAP = int(os.environ.get('CODE_CHUNK_OVERLAP', 0))
CODE_CHUNK_SIZE = int(os.environ.get('CODE_CHUNK_SIZE', 2000))


class CodeParser:
    """Split file contents into text chunks with LangChain text splitters.

    Files whose extension appears in ``extension_mapping`` are split with a
    language-aware ``RecursiveCharacterTextSplitter``; everything else falls
    back to the generic recursive character splitter. Chunk size and overlap
    come from the module-level ``CODE_CHUNK_SIZE`` / ``CODE_CHUNK_OVERLAP``.
    """

    def __init__(self):
        # NOTE(review): setup_logger is defined elsewhere; presumably it
        # attaches handlers/levels to the named logger -- confirm it is safe
        # to call once per CodeParser instance.
        setup_logger(LOGGER_NAME)
        self.logger = logging.getLogger(LOGGER_NAME)

        # File extension (lower-case, without the leading dot) -> splitter
        # language used by RecursiveCharacterTextSplitter.from_language.
        self.extension_mapping = {
            'c': Language.C,
            'h': Language.C,
            'cpp': Language.CPP,
            'cc': Language.CPP,
            'cxx': Language.CPP,
            'hpp': Language.CPP,
            'hh': Language.CPP,
            'hxx': Language.CPP,
            'go': Language.GO,
            'java': Language.JAVA,
            'py': Language.PYTHON,
            'pyw': Language.PYTHON,
            'js': Language.JS,
            'mjs': Language.JS,
            'cjs': Language.JS,
            'md': Language.MARKDOWN,
            'markdown': Language.MARKDOWN,
            'html': Language.HTML,
        }

    def parse(self, file_name: str, file_content: str) -> list:
        """Split ``file_content`` into chunks chosen by the file's extension.

        Args:
            file_name: Name (or path) of the file. The text after the last
                ``'.'`` is used as the extension; a name without any dot is
                used as the extension in its entirety.
            file_content: Full text of the file.

        Returns:
            The ``page_content`` string of every chunk, in splitter order.
            An empty list is returned when splitting fails; the failure is
            logged at ERROR level rather than raised.
        """
        # Lower-case so that e.g. 'Main.PY' or 'README.MD' still match the
        # lower-case keys of extension_mapping.
        file_extension = file_name.split('.')[-1].lower()

        try:
            self.logger.debug(f'Parsing file: {file_name}')
            language = self.extension_mapping.get(file_extension)
            if language is None:
                self.logger.debug(f'File extension not supported: {file_extension}')
                splitter = RecursiveCharacterTextSplitter(
                    chunk_size=CODE_CHUNK_SIZE,
                    chunk_overlap=CODE_CHUNK_OVERLAP,
                    length_function=len,
                    is_separator_regex=False,
                )
            else:
                self.logger.debug(f'File extension supported: {file_extension}')
                splitter = RecursiveCharacterTextSplitter.from_language(
                    language=language,
                    chunk_size=CODE_CHUNK_SIZE,
                    chunk_overlap=CODE_CHUNK_OVERLAP,
                )
            docs = splitter.create_documents([file_content])
        except Exception as e:
            # Best-effort: report the failure and hand back no chunks. Without
            # this early return, `docs` would be unbound below and the real
            # error would be masked by an UnboundLocalError.
            self.logger.error(f'Error when parsing code: {e}')
            return []
        return [doc.page_content for doc in docs]