Preformu / utils /data_parser.py
Kevinshh's picture
Upload data_parser.py
ce70732 verified
import csv
import io
import os
from typing import Optional, List, Dict, Any
class DataParser:
    """
    Utility class to parse stability data files into LLM-readable format.

    Supported inputs: CSV, Excel (.xlsx/.xls), plain text (.txt/.md), PDF,
    Word (.docx/.doc) and PowerPoint (.pptx/.ppt). Tabular content is rendered
    as a markdown table; everything else is returned as plain text.

    All methods are static. Failures are reported as human-readable strings
    by ``parse_file`` rather than raised, so the result can always be placed
    directly into an LLM prompt.
    """

    @staticmethod
    def parse_file(file_path: str) -> str:
        """
        Parse a data file and return its content as a markdown table or text.

        Args:
            file_path: Absolute path to the file

        Returns:
            String content suitable for LLM context. On failure (missing file,
            unsupported extension, parser error) an explanatory message is
            returned instead of raising.
        """
        if not os.path.exists(file_path):
            return f"Error: File not found: {file_path}"

        # Dispatch purely on the (case-insensitive) file extension.
        ext = os.path.splitext(file_path)[1].lower()
        try:
            if ext == '.csv':
                return DataParser._parse_csv(file_path)
            elif ext in ('.xlsx', '.xls'):
                return DataParser._parse_excel(file_path)
            elif ext in ('.txt', '.md'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    return f.read()
            elif ext == '.pdf':
                return DataParser._parse_pdf(file_path)
            elif ext in ('.docx', '.doc'):
                return DataParser._parse_word(file_path)
            elif ext in ('.pptx', '.ppt'):
                return DataParser._parse_ppt(file_path)
            else:
                return f"Unsupported file format: {ext}. Please provide CSV, Excel, TXT, PDF, Word, or PPT files."
        except Exception as e:
            # Top-level boundary: callers expect a string, never an exception.
            return f"Error parsing file {os.path.basename(file_path)}: {str(e)}"

    @staticmethod
    def _read_csv_rows(file_path: str, encoding: str) -> List[List[str]]:
        """Read every row of a CSV file decoded with the given encoding."""
        # newline='' lets the csv module handle embedded newlines in quoted
        # fields itself, as its documentation requires.
        with open(file_path, 'r', encoding=encoding, newline='') as f:
            return list(csv.reader(f))

    @staticmethod
    def _parse_csv(file_path: str) -> str:
        """
        Parse CSV file to markdown table.

        The file is decoded as UTF-8 (a BOM is tolerated); if that fails the
        file is retried as GBK. When both decodings fail the original UTF-8
        error is raised.

        Returns:
            A markdown table, or "Empty CSV file." when there are no rows.
        """
        try:
            rows = DataParser._read_csv_rows(file_path, 'utf-8-sig')
        except UnicodeDecodeError as utf8_error:
            # Only a decoding failure justifies trying another encoding.
            try:
                rows = DataParser._read_csv_rows(file_path, 'gbk')
            except UnicodeDecodeError:
                raise utf8_error

        # Same empty-file handling whichever encoding succeeded.
        if not rows:
            return "Empty CSV file."
        return DataParser._rows_to_markdown(rows)

    @staticmethod
    def _parse_excel(file_path: str) -> str:
        """
        Parse Excel file to markdown table using pandas if available, else openpyxl.

        Returns:
            A markdown table of the first (pandas) or active (openpyxl) sheet,
            or an error message when neither library can be imported.
        """
        try:
            import pandas as pd

            # Use pandas for robust Excel handling. An ImportError raised here
            # (pandas itself, or an optional dependency it needs for reading
            # or for to_markdown) falls through to the openpyxl path below.
            df = pd.read_excel(file_path)
            return df.to_markdown(index=False)
        except ImportError:
            pass

        try:
            import openpyxl
        except ImportError:
            return "Error: Neither 'pandas' nor 'openpyxl' libraries are installed. Cannot parse Excel files."

        wb = openpyxl.load_workbook(file_path, data_only=True)
        try:
            sheet = wb.active
            rows = []
            for row in sheet.iter_rows(values_only=True):
                # Skip rows that are completely empty. Test for None/""
                # explicitly so that legitimate falsy values (0, 0.0, False)
                # do not cause a data row to be dropped.
                if any(cell is not None and cell != "" for cell in row):
                    # Convert None to empty string and force string conversion
                    rows.append([str(cell) if cell is not None else "" for cell in row])
        finally:
            wb.close()
        return DataParser._rows_to_markdown(rows)

    @staticmethod
    def _parse_pdf(file_path: str) -> str:
        """
        Parse PDF file to text.

        Returns:
            The text of every page, each followed by a blank line, a
            placeholder when no text could be extracted, or an error message.
        """
        try:
            import pypdf

            with open(file_path, 'rb') as f:
                reader = pypdf.PdfReader(f)
                # Guard against extract_text() yielding None for a page.
                pages = [(page.extract_text() or "") + "\n\n" for page in reader.pages]
            text = "".join(pages)
            return text if text.strip() else "[PDF contains no extractable text]"
        except ImportError:
            return "Error: 'pypdf' library is not installed. Cannot parse PDF files."
        except Exception as e:
            return f"Error parsing PDF: {str(e)}"

    @staticmethod
    def _parse_word(file_path: str) -> str:
        """
        Parse Word file to text.

        Returns:
            All paragraph text joined by newlines, followed by every table
            rendered as a markdown table, or an error message.
        """
        try:
            import docx

            doc = docx.Document(file_path)
            parts = ["\n".join(para.text for para in doc.paragraphs)]
            # Also extract tables
            for table in doc.tables:
                rows = [[cell.text for cell in row.cells] for row in table.rows]
                parts.append("\n[Table Extracted from Word]\n")
                parts.append(DataParser._rows_to_markdown(rows) + "\n")
            return "".join(parts)
        except ImportError:
            return "Error: 'python-docx' library is not installed. Cannot parse Word files."
        except Exception as e:
            return f"Error parsing Word file: {str(e)}"

    @staticmethod
    def _parse_ppt(file_path: str) -> str:
        """
        Parse PowerPoint file to text.

        Returns:
            For each slide a "--- Slide N ---" marker followed by the text of
            every shape that exposes a ``text`` attribute, or an error message.
        """
        try:
            from pptx import Presentation

            prs = Presentation(file_path)
            parts = []
            for number, slide in enumerate(prs.slides, start=1):
                parts.append(f"\n--- Slide {number} ---\n")
                for shape in slide.shapes:
                    # Pictures, connectors etc. have no text to extract.
                    if hasattr(shape, "text"):
                        parts.append(shape.text + "\n")
            return "".join(parts)
        except ImportError:
            return "Error: 'python-pptx' library is not installed. Cannot parse PPT files."
        except Exception as e:
            return f"Error parsing PPT file: {str(e)}"

    @staticmethod
    def _rows_to_markdown(rows: List[List[str]]) -> str:
        """
        Convert list of lists to markdown table.

        The first row is used as the header. Every other row is padded with
        empty cells or truncated so that it has exactly as many columns as
        the header.

        Args:
            rows: Table rows; cells are converted with ``str``.

        Returns:
            The markdown table, or an empty string when ``rows`` is empty.
        """
        if not rows:
            return ""

        def clean(cell) -> str:
            # A table row must stay on one line, and a literal pipe would be
            # read as a column separator unless it is escaped.
            text = str(cell).replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
            return text.replace('|', '\\|')

        def render(cells) -> str:
            return "| " + " | ".join(cells) + " |"

        # Header cells get the same cleaning as body cells.
        header = [clean(h) for h in rows[0]]
        width = len(header)

        md_lines = [render(header), render(["---"] * width)]
        for row in rows[1:]:
            cells = [clean(cell) for cell in row][:width]
            # Pad short rows so every line has the same number of columns.
            cells += [""] * (width - len(cells))
            md_lines.append(render(cells))
        return "\n".join(md_lines)