Spaces:

Kevinshh
/

Preformu

Running

App Files Files Community

Preformu / utils /data_parser.py

Kevinshh

Upload data_parser.py

ce70732 verified 3 months ago

raw

history blame contribute delete

6.95 kB

	import csv
	import io
	import os
	from typing import Optional, List, Dict, Any

	class DataParser:
	    """
	    Utility class to parse stability data files into LLM-readable format.

	    CSV and Excel files are rendered as markdown tables. TXT/MD files are
	    returned verbatim, and PDF, Word and PowerPoint files are returned as
	    extracted text (Word tables are additionally rendered as markdown).
	    All methods are static; the class only serves as a namespace.
	    """

	    @staticmethod
	    def parse_file(file_path: str) -> str:
	        """
	        Parse a data file and return its content as a markdown table or text.

	        The parser is chosen from the (case-insensitive) file extension.

	        Args:
	            file_path: Path to the file to parse.

	        Returns:
	            String content suitable for LLM context. Failures are reported
	            as a human-readable message in the returned string (missing
	            file, unsupported extension, or any parsing error) rather than
	            raised.
	        """
	        if not os.path.exists(file_path):
	            return f"Error: File not found: {file_path}"

	        ext = os.path.splitext(file_path)[1].lower()

	        try:
	            if ext == '.csv':
	                return DataParser._parse_csv(file_path)
	            elif ext in ['.xlsx', '.xls']:
	                return DataParser._parse_excel(file_path)
	            elif ext in ['.txt', '.md']:
	                with open(file_path, 'r', encoding='utf-8') as f:
	                    return f.read()
	            elif ext == '.pdf':
	                return DataParser._parse_pdf(file_path)
	            elif ext in ['.docx', '.doc']:
	                return DataParser._parse_word(file_path)
	            elif ext in ['.pptx', '.ppt']:
	                return DataParser._parse_ppt(file_path)
	            else:
	                return f"Unsupported file format: {ext}. Please provide CSV, Excel, TXT, PDF, Word, or PPT files."
	        except Exception as e:
	            # Boundary handler: callers always get a string back.
	            return f"Error parsing file {os.path.basename(file_path)}: {str(e)}"

	    @staticmethod
	    def _parse_csv(file_path: str) -> str:
	        """
	        Parse CSV file to markdown table.

	        The file is decoded as UTF-8 (a leading BOM is tolerated) and, if
	        that fails, as GBK. Blank lines are skipped.

	        Args:
	            file_path: Path to the CSV file.

	        Returns:
	            The markdown table, or "Empty CSV file." when the file holds no
	            non-blank rows.

	        Raises:
	            UnicodeDecodeError: If neither encoding can decode the file; the
	                error from the first (UTF-8) attempt is raised.
	            OSError: If the file cannot be opened.
	        """
	        first_error = None
	        for encoding in ('utf-8-sig', 'gbk'):
	            try:
	                # newline='' lets the csv module do its own line-ending
	                # handling, as its documentation requires.
	                with open(file_path, 'r', encoding=encoding, newline='') as f:
	                    # The reader yields [] for blank lines; drop them so a
	                    # leading blank line is never used as a 0-column header.
	                    rows = [row for row in csv.reader(f) if row]
	            except UnicodeDecodeError as e:
	                # Only a decoding failure justifies trying the next encoding.
	                if first_error is None:
	                    first_error = e
	                continue

	            if not rows:
	                return "Empty CSV file."
	            return DataParser._rows_to_markdown(rows)

	        raise first_error

	    @staticmethod
	    def _parse_excel(file_path: str) -> str:
	        """
	        Parse Excel file to markdown table using pandas if available, else openpyxl.

	        Args:
	            file_path: Path to the Excel workbook.

	        Returns:
	            The markdown table, or an error message when neither pandas nor
	            openpyxl can be imported. Other parsing errors propagate to the
	            caller.
	        """
	        try:
	            import pandas as pd
	            # Use pandas for robust Excel handling and convert to markdown
	            # directly. pandas reports its own missing optional dependencies
	            # (Excel engine, tabulate for to_markdown) as ImportError, so
	            # those cases also fall through to the openpyxl path below.
	            return pd.read_excel(file_path).to_markdown(index=False)
	        except ImportError:
	            pass

	        try:
	            import openpyxl
	        except ImportError:
	            return "Error: Neither 'pandas' nor 'openpyxl' libraries are installed. Cannot parse Excel files."

	        wb = openpyxl.load_workbook(file_path, data_only=True)
	        sheet = wb.active
	        rows = []
	        for row in sheet.iter_rows(values_only=True):
	            # Filter out completely empty rows. Test for None/"" explicitly:
	            # a truthiness test would also drop rows made only of 0 / False.
	            if any(cell is not None and cell != "" for cell in row):
	                # Convert None to empty string and force string conversion
	                rows.append([str(cell) if cell is not None else "" for cell in row])
	        return DataParser._rows_to_markdown(rows)

	    @staticmethod
	    def _parse_pdf(file_path: str) -> str:
	        """
	        Parse PDF file to text.

	        Args:
	            file_path: Path to the PDF file.

	        Returns:
	            The text of all pages, each followed by a blank line; a
	            placeholder when no text could be extracted; or an error message
	            when pypdf is missing or parsing fails.
	        """
	        try:
	            import pypdf
	            with open(file_path, 'rb') as f:
	                reader = pypdf.PdfReader(f)
	                # Guard against a page yielding None instead of a string.
	                pages = [(page.extract_text() or "") + "\n\n" for page in reader.pages]
	            text = "".join(pages)
	            return text if text.strip() else "[PDF contains no extractable text]"
	        except ImportError:
	            return "Error: 'pypdf' library is not installed. Cannot parse PDF files."
	        except Exception as e:
	            return f"Error parsing PDF: {str(e)}"

	    @staticmethod
	    def _parse_word(file_path: str) -> str:
	        """
	        Parse Word file to text.

	        Paragraph text comes first (one paragraph per line), followed by
	        every table in the document rendered as a markdown table.

	        Args:
	            file_path: Path to the Word document.

	        Returns:
	            The extracted text, or an error message when python-docx is
	            missing or parsing fails.
	        """
	        try:
	            import docx
	            doc = docx.Document(file_path)
	            parts = ["\n".join([para.text for para in doc.paragraphs])]

	            # Also extract tables
	            for table in doc.tables:
	                rows = [[cell.text for cell in row.cells] for row in table.rows]
	                parts.append("\n[Table Extracted from Word]\n")
	                parts.append(DataParser._rows_to_markdown(rows) + "\n")

	            return "".join(parts)
	        except ImportError:
	            return "Error: 'python-docx' library is not installed. Cannot parse Word files."
	        except Exception as e:
	            return f"Error parsing Word file: {str(e)}"

	    @staticmethod
	    def _parse_ppt(file_path: str) -> str:
	        """
	        Parse PowerPoint file to text.

	        Each slide is introduced by a "--- Slide N ---" marker (1-based),
	        followed by the text of every shape that exposes a ``text`` attribute.

	        Args:
	            file_path: Path to the presentation.

	        Returns:
	            The extracted text, or an error message when python-pptx is
	            missing or parsing fails.
	        """
	        try:
	            from pptx import Presentation
	            prs = Presentation(file_path)
	            parts = []
	            for i, slide in enumerate(prs.slides):
	                parts.append(f"\n--- Slide {i+1} ---\n")
	                for shape in slide.shapes:
	                    if hasattr(shape, "text"):
	                        parts.append(shape.text + "\n")
	            return "".join(parts)
	        except ImportError:
	            return "Error: 'python-pptx' library is not installed. Cannot parse PPT files."
	        except Exception as e:
	            return f"Error parsing PPT file: {str(e)}"

	    @staticmethod
	    def _clean_cell(cell: object) -> str:
	        """Return *cell* as single-line text that is safe inside a markdown table cell."""
	        text = str(cell).replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
	        # An unescaped pipe would be read as a column delimiter.
	        return text.replace('|', '\\|')

	    @staticmethod
	    def _rows_to_markdown(rows: List[List[str]]) -> str:
	        """
	        Convert list of lists to markdown table.

	        The first row is used as the header and fixes the column count:
	        shorter rows are padded with empty cells, longer rows are truncated.
	        Line breaks inside cells become spaces and pipes are escaped.

	        Args:
	            rows: Table rows, header first. Cells are converted with str().

	        Returns:
	            The markdown table without a trailing newline, or "" when
	            ``rows`` is empty.
	        """
	        if not rows:
	            return ""

	        header = [DataParser._clean_cell(h) for h in rows[0]]
	        width = len(header)

	        md_lines = [
	            "| " + " | ".join(header) + " |",
	            "| " + " | ".join(["---"] * width) + " |",
	        ]

	        for row in rows[1:]:
	            # Handle row length mismatch: truncate to the header width, then pad.
	            clean_row = [DataParser._clean_cell(cell) for cell in row[:width]]
	            clean_row += [""] * (width - len(clean_row))
	            md_lines.append("| " + " | ".join(clean_row) + " |")

	        return "\n".join(md_lines)