Spaces:

VTechAI
/

Chat

Runtime error

App Files Files Community

Chat / parser /file /tabular_parser.py

VTechAI

init

8a41f4d almost 2 years ago

raw

history blame contribute delete

3.51 kB

	"""Tabular parser.

	Contains parsers for tabular data files.

	"""
	from pathlib import Path
	from typing import Any, Dict, List, Union

	from application.parser.file.base_parser import BaseParser


	class CSVParser(BaseParser):
	    """CSV parser built on the standard-library ``csv`` module.

	    Args:
	        concat_rows (bool): whether to concatenate all rows into one document.
	            If set to False, one string is returned for each row.
	            True by default.

	    """

	    def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None:
	        """Init params.

	        Every positional and keyword argument other than ``concat_rows`` is
	        forwarded unchanged to ``BaseParser.__init__``.
	        """
	        super().__init__(*args, **kwargs)
	        self._concat_rows = concat_rows

	    def _init_parser(self) -> Dict:
	        """Init parser.

	        Returns:
	            Dict: always an empty dict.

	        """
	        return {}

	    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
	        """Parse file.

	        Args:
	            file (Path): path of the CSV file to read.
	            errors (str): decoding error handler handed to ``open`` as its
	                ``errors`` argument. "ignore" by default.

	        Returns:
	            Union[str, List[str]]: the rows joined with "\\n" when
	            ``concat_rows`` is True, otherwise a list with one string per
	            row. The cells of each row are joined with ", ".

	        Raises:
	            ValueError: if the ``csv`` module cannot be imported.

	        """
	        try:
	            import csv
	        except ImportError as err:
	            raise ValueError("csv module is required to read CSV files.") from err

	        # newline="" is what the csv documentation asks for: the reader then
	        # handles line endings itself, so line breaks embedded in quoted
	        # fields are not mangled by universal-newline translation.
	        with open(file, "r", newline="", errors=errors) as fp:
	            text_list = [", ".join(row) for row in csv.reader(fp)]

	        if self._concat_rows:
	            return "\n".join(text_list)
	        return text_list


	class PandasCSVParser(BaseParser):
	    r"""Pandas-based CSV parser.

	    Parses CSVs using the separator detection from Pandas `read_csv` function.
	    If special parameters are required, use the `pandas_config` dict.

	    Args:
	        concat_rows (bool): whether to concatenate all rows into one document.
	            If set to False, one string is returned for each row.
	            True by default.

	        col_joiner (str): Separator to use for joining cols per row.
	            Set to ", " by default.

	        row_joiner (str): Separator to use for joining each row.
	            Only used when `concat_rows=True`.
	            Set to "\n" by default.

	        pandas_config (dict): Options for the `pandas.read_csv` function call.
	            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
	            for more information.
	            None by default, which is treated as an empty dict; this means
	            pandas will try to figure out the separators, table head, etc.
	            on its own.

	    """

	    def __init__(
	        self,
	        *args: Any,
	        concat_rows: bool = True,
	        col_joiner: str = ", ",
	        row_joiner: str = "\n",
	        pandas_config: Union[dict, None] = None,
	        **kwargs: Any
	    ) -> None:
	        """Init params.

	        Arguments not named in the signature are forwarded unchanged to
	        ``BaseParser.__init__``.
	        """
	        super().__init__(*args, **kwargs)
	        self._concat_rows = concat_rows
	        self._col_joiner = col_joiner
	        self._row_joiner = row_joiner
	        # A fresh dict per instance: a `{}` default in the signature would be
	        # one shared object reused by every parser built without a config.
	        self._pandas_config = {} if pandas_config is None else pandas_config

	    def _init_parser(self) -> Dict:
	        """Init parser.

	        Returns:
	            Dict: always an empty dict.

	        """
	        return {}

	    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
	        """Parse file.

	        Args:
	            file (Path): path of the CSV file, handed to `pandas.read_csv`.
	            errors (str): accepted for signature compatibility; not used by
	                this parser.

	        Returns:
	            Union[str, List[str]]: the rows joined with `row_joiner` when
	            `concat_rows` is True, otherwise a list with one string per row.

	        Raises:
	            ValueError: if pandas is not installed.

	        """
	        try:
	            import pandas as pd
	        except ImportError as err:
	            raise ValueError("pandas module is required to read CSV files.") from err

	        df = pd.read_csv(file, **self._pandas_config)

	        # Render each row as text: cast every cell to str, then join the cells.
	        text_list = df.apply(
	            lambda row: self._col_joiner.join(row.astype(str).tolist()), axis=1
	        ).tolist()

	        if self._concat_rows:
	            return self._row_joiner.join(text_list)
	        return text_list