| import logging | |
| import os | |
| from dotenv import load_dotenv | |
| from langchain_text_splitters import ( | |
| Language, | |
| RecursiveCharacterTextSplitter, | |
| ) | |
| from .utils.logger_utils import setup_logger | |
# Run python-dotenv's loader first so that the environment lookups below can
# see any values it provides (presumably from a local .env file).
load_dotenv()

# Name used both to configure and to retrieve this module's logger.
LOGGER_NAME = 'CODE_PARSER_LOGGER'

# Chunking parameters handed to the text splitter as chunk_size and
# chunk_overlap. Each can be overridden through the environment variable of
# the same name; a non-integer value raises ValueError at import time.
CODE_CHUNK_SIZE = int(os.environ.get('CODE_CHUNK_SIZE', '2000'))
CODE_CHUNK_OVERLAP = int(os.environ.get('CODE_CHUNK_OVERLAP', '0'))
class CodeParser:
    """Split file contents into text chunks, language-aware where possible.

    The file extension selects a ``Language`` for
    ``RecursiveCharacterTextSplitter.from_language``; files whose extension
    is not in ``extension_mapping`` are split with the generic recursive
    character splitter instead.
    """

    def __init__(self):
        """Set up the parser's logger and the extension-to-language table."""
        setup_logger(LOGGER_NAME)
        self.logger = logging.getLogger(LOGGER_NAME)
        # Keys are lower-case extensions without the leading dot; ``parse``
        # lower-cases the extension it extracts before looking it up here.
        self.extension_mapping = {
            'c': Language.C,
            'h': Language.C,
            'cpp': Language.CPP,
            'cc': Language.CPP,
            'cxx': Language.CPP,
            'hpp': Language.CPP,
            'hh': Language.CPP,
            'hxx': Language.CPP,
            'go': Language.GO,
            'java': Language.JAVA,
            'py': Language.PYTHON,
            'pyw': Language.PYTHON,
            'js': Language.JS,
            'mjs': Language.JS,
            'cjs': Language.JS,
            'md': Language.MARKDOWN,
            'markdown': Language.MARKDOWN,
            'html': Language.HTML,
        }

    def parse(self, file_name: str, file_content: str) -> list:
        """Split ``file_content`` into chunks and return them as strings.

        Args:
            file_name: Name of the file; only the text after its last ``.``
                is used, compared case-insensitively against
                ``extension_mapping``.
            file_content: Full text of the file to split.

        Returns:
            The ``page_content`` of every document produced by the splitter,
            in order. An empty list is returned when splitting fails; the
            failure is logged rather than raised.
        """
        # Lower-case so that names such as ``Main.PY`` or ``INDEX.HTML`` still
        # select the language-aware splitter.
        file_extension = file_name.split('.')[-1].lower()
        try:
            self.logger.debug(f'Parsing file: {file_name}')
            language = self.extension_mapping.get(file_extension)
            if language is None:
                self.logger.debug(f'File extension not supported: {file_extension}')
                splitter = RecursiveCharacterTextSplitter(
                    chunk_size=CODE_CHUNK_SIZE,
                    chunk_overlap=CODE_CHUNK_OVERLAP,
                    length_function=len,
                    is_separator_regex=False,
                )
            else:
                self.logger.debug(f'File extension supported: {file_extension}')
                splitter = RecursiveCharacterTextSplitter.from_language(
                    language=language,
                    chunk_size=CODE_CHUNK_SIZE,
                    chunk_overlap=CODE_CHUNK_OVERLAP,
                )
            docs = splitter.create_documents([file_content])
        except Exception as e:
            # Best-effort: log with traceback and hand back no chunks. Without
            # this early return the final comprehension would read an unbound
            # ``docs`` and mask the real error with an UnboundLocalError.
            self.logger.exception(f'Error when parsing code: {e}')
            return []
        return [doc.page_content for doc in docs]