| from langchain_text_splitters import CharacterTextSplitter | |
| import os | |
| from typing import List, Optional | |
def splitBioModels(directory: str, final_items: Optional[List[str]] = None) -> List[str]:
    """Split every BioModel file in *directory* into " // "-separated chunks.

    Each regular file in the directory is read as UTF-8 text and fed through
    a ``CharacterTextSplitter``; the resulting chunks are appended to (a copy
    of) ``final_items`` with each chunk's ``metadata`` set to its source
    filename.

    Args:
        directory (str): Relative or absolute path to the folder containing
            the BioModel files.
        final_items (Optional[List[str]]): Existing list to extend. If None,
            a fresh list is created. The caller's list is never mutated; a
            shallow copy is extended and returned.

    Returns:
        List[str]: The (copied) list extended with the chunks produced from
            every readable file. NOTE(review): the appended items are
            langchain ``Document`` objects, not plain strings — the declared
            ``List[str]`` annotation is kept only for interface
            compatibility; confirm against callers.
    """
    # Enormous chunk_size means splitting is driven by the " // " separator,
    # not by size, so each separated section becomes one chunk.
    text_splitter2 = CharacterTextSplitter(
        separator=" // ",
        chunk_size=1000000000,
        chunk_overlap=20,
        length_function=len,
        is_separator_regex=False,
    )

    # Avoid the mutable-default pitfall and never mutate the caller's list.
    final_items = [] if final_items is None else list(final_items)

    directory_path = os.path.abspath(directory)
    if not os.path.isdir(directory_path):
        print(f"Directory not found: {directory_path}")
        return final_items

    for file_name in os.listdir(directory_path):
        file_path = os.path.join(directory_path, file_name)
        # listdir also yields subdirectories; skip anything that is not a
        # regular file instead of letting open() fail on it.
        if not os.path.isfile(file_path):
            continue
        try:
            # Pin the encoding so parsing does not depend on the platform's
            # locale default.
            with open(file_path, 'r', encoding='utf-8') as f:
                file_content = f.read()
            items = text_splitter2.create_documents([file_content])
            for item in items:
                # NOTE(review): metadata is set to a plain filename string,
                # not the dict langchain Documents normally carry — kept
                # as-is for backward compatibility; verify downstream usage.
                item.metadata = file_name
            final_items.extend(items)
        except Exception as e:
            # Best-effort: report the bad file and keep processing the rest.
            print(f"Error reading file {file_path}: {e}")
    return final_items