| """Tabular parser. | |
| Contains parsers for tabular data files. | |
| """ | |
| from pathlib import Path | |
| from typing import Any, Dict, List, Union | |
| from application.parser.file.base_parser import BaseParser | |
| class CSVParser(BaseParser): | |
| """CSV parser. | |
| Args: | |
| concat_rows (bool): whether to concatenate all rows into one document. | |
| If set to False, a Document will be created for each row. | |
| True by default. | |
| """ | |
| def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None: | |
| """Init params.""" | |
| super().__init__(*args, **kwargs) | |
| self._concat_rows = concat_rows | |
| def _init_parser(self) -> Dict: | |
| """Init parser.""" | |
| return {} | |
| def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: | |
| """Parse file. | |
| Returns: | |
| Union[str, List[str]]: a string or a List of strings. | |
| """ | |
| try: | |
| import csv | |
| except ImportError: | |
| raise ValueError("csv module is required to read CSV files.") | |
| text_list = [] | |
| with open(file, "r") as fp: | |
| csv_reader = csv.reader(fp) | |
| for row in csv_reader: | |
| text_list.append(", ".join(row)) | |
| if self._concat_rows: | |
| return "\n".join(text_list) | |
| else: | |
| return text_list | |
| class PandasCSVParser(BaseParser): | |
| r"""Pandas-based CSV parser. | |
| Parses CSVs using the separator detection from Pandas `read_csv`function. | |
| If special parameters are required, use the `pandas_config` dict. | |
| Args: | |
| concat_rows (bool): whether to concatenate all rows into one document. | |
| If set to False, a Document will be created for each row. | |
| True by default. | |
| col_joiner (str): Separator to use for joining cols per row. | |
| Set to ", " by default. | |
| row_joiner (str): Separator to use for joining each row. | |
| Only used when `concat_rows=True`. | |
| Set to "\n" by default. | |
| pandas_config (dict): Options for the `pandas.read_csv` function call. | |
| Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html | |
| for more information. | |
| Set to empty dict by default, this means pandas will try to figure | |
| out the separators, table head, etc. on its own. | |
| """ | |
| def __init__( | |
| self, | |
| *args: Any, | |
| concat_rows: bool = True, | |
| col_joiner: str = ", ", | |
| row_joiner: str = "\n", | |
| pandas_config: dict = {}, | |
| **kwargs: Any | |
| ) -> None: | |
| """Init params.""" | |
| super().__init__(*args, **kwargs) | |
| self._concat_rows = concat_rows | |
| self._col_joiner = col_joiner | |
| self._row_joiner = row_joiner | |
| self._pandas_config = pandas_config | |
| def _init_parser(self) -> Dict: | |
| """Init parser.""" | |
| return {} | |
| def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: | |
| """Parse file.""" | |
| try: | |
| import pandas as pd | |
| except ImportError: | |
| raise ValueError("pandas module is required to read CSV files.") | |
| df = pd.read_csv(file, **self._pandas_config) | |
| text_list = df.apply( | |
| lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 | |
| ).tolist() | |
| if self._concat_rows: | |
| return (self._row_joiner).join(text_list) | |
| else: | |
| return text_list | |