| import csv
|
| import io
|
| import os
|
| from typing import Optional, List, Dict, Any
|
|
|
class DataParser:
    """Parse stability data files into LLM-readable text.

    Dispatches on the file extension: CSV and Excel files are rendered as
    markdown tables, TXT/MD files are returned verbatim, and PDF, Word and
    PowerPoint files are reduced to their extractable text.

    Third-party readers (pandas, openpyxl, pypdf, python-docx, python-pptx)
    are imported lazily inside the method that needs them, so only the
    formats actually used require their library to be installed.
    """

    # Encodings tried, in order, when reading CSV files. "utf-8-sig" also
    # accepts plain UTF-8 (it merely strips a leading BOM when present).
    _CSV_ENCODINGS = ("utf-8-sig", "gbk")

    @staticmethod
    def parse_file(file_path: str) -> str:
        """Parse a data file and return its content as a markdown table or text.

        Args:
            file_path: Absolute path to the file.

        Returns:
            String content suitable for LLM context. Failures are reported
            as a human-readable error string rather than raised, so the
            result can always be placed into a prompt.
        """
        if not os.path.exists(file_path):
            return f"Error: File not found: {file_path}"

        ext = os.path.splitext(file_path)[1].lower()

        try:
            if ext == '.csv':
                return DataParser._parse_csv(file_path)
            elif ext in ('.xlsx', '.xls'):
                return DataParser._parse_excel(file_path)
            elif ext in ('.txt', '.md'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    return f.read()
            elif ext == '.pdf':
                return DataParser._parse_pdf(file_path)
            elif ext in ('.docx', '.doc'):
                return DataParser._parse_word(file_path)
            elif ext in ('.pptx', '.ppt'):
                return DataParser._parse_ppt(file_path)
            else:
                return f"Unsupported file format: {ext}. Please provide CSV, Excel, TXT, PDF, Word, or PPT files."
        except Exception as e:
            # Top-level boundary: convert any parser failure into a message.
            return f"Error parsing file {os.path.basename(file_path)}: {str(e)}"

    @staticmethod
    def _parse_csv(file_path: str) -> str:
        """Parse a CSV file into a markdown table.

        Each encoding in ``_CSV_ENCODINGS`` is tried in turn; only a decode
        failure triggers the next attempt.

        Args:
            file_path: Path to the CSV file.

        Returns:
            The markdown table, or ``"Empty CSV file."`` when the file
            contains no rows.

        Raises:
            UnicodeDecodeError: If no candidate encoding can decode the
                file; the error from the first encoding is raised.
        """
        first_error = None
        for encoding in DataParser._CSV_ENCODINGS:
            try:
                # newline='' lets the csv module handle line endings itself,
                # which keeps newlines embedded in quoted fields intact.
                with open(file_path, 'r', encoding=encoding, newline='') as f:
                    rows = list(csv.reader(f))
            except UnicodeDecodeError as exc:
                if first_error is None:
                    first_error = exc
                continue

            if not rows:
                return "Empty CSV file."
            return DataParser._rows_to_markdown(rows)

        # Reached only when every encoding failed, so first_error is set.
        raise first_error

    @staticmethod
    def _parse_excel(file_path: str) -> str:
        """Parse an Excel file into a markdown table.

        Uses pandas when it can be imported; if pandas (or an optional
        dependency it needs for this call) raises ImportError, falls back
        to reading the active sheet with openpyxl.

        Args:
            file_path: Path to the Excel workbook.

        Returns:
            The markdown table, or an error message when neither library
            is installed.
        """
        try:
            import pandas as pd

            df = pd.read_excel(file_path)
            return df.to_markdown(index=False)
        except ImportError:
            try:
                import openpyxl
            except ImportError:
                return "Error: Neither 'pandas' nor 'openpyxl' libraries are installed. Cannot parse Excel files."

            wb = openpyxl.load_workbook(file_path, data_only=True)
            try:
                sheet = wb.active
                rows = []
                for row in sheet.iter_rows(values_only=True):
                    # Skip rows that are entirely blank. Compare against
                    # None/"" explicitly so rows holding only 0 or False
                    # are kept as real data.
                    if any(cell is not None and cell != "" for cell in row):
                        rows.append(
                            [str(cell) if cell is not None else "" for cell in row]
                        )
            finally:
                wb.close()
            return DataParser._rows_to_markdown(rows)

    @staticmethod
    def _parse_pdf(file_path: str) -> str:
        """Extract the text of every page of a PDF file.

        Args:
            file_path: Path to the PDF file.

        Returns:
            Page texts, each followed by a blank line; a placeholder when
            nothing could be extracted; or an error message.
        """
        try:
            import pypdf

            with open(file_path, 'rb') as f:
                reader = pypdf.PdfReader(f)
                # "or ''" guards against extractors that yield None for
                # pages without a text layer.
                pages = [(page.extract_text() or "") + "\n\n" for page in reader.pages]
            text = "".join(pages)
            return text if text.strip() else "[PDF contains no extractable text]"
        except ImportError:
            return "Error: 'pypdf' library is not installed. Cannot parse PDF files."
        except Exception as e:
            return f"Error parsing PDF: {str(e)}"

    @staticmethod
    def _parse_word(file_path: str) -> str:
        """Extract paragraphs and tables from a Word document.

        Paragraph text comes first (one paragraph per line), followed by
        each table rendered as a markdown table under a marker line.

        Args:
            file_path: Path to the Word document.

        Returns:
            The extracted text, or an error message.
        """
        try:
            import docx

            doc = docx.Document(file_path)
            parts = ["\n".join(para.text for para in doc.paragraphs)]

            for table in doc.tables:
                rows = [[cell.text for cell in row.cells] for row in table.rows]
                parts.append("\n[Table Extracted from Word]\n")
                parts.append(DataParser._rows_to_markdown(rows) + "\n")

            return "".join(parts)
        except ImportError:
            return "Error: 'python-docx' library is not installed. Cannot parse Word files."
        except Exception as e:
            return f"Error parsing Word file: {str(e)}"

    @staticmethod
    def _parse_ppt(file_path: str) -> str:
        """Extract the text of every slide of a PowerPoint file.

        Args:
            file_path: Path to the presentation.

        Returns:
            Slide-by-slide text, each slide introduced by a
            ``--- Slide N ---`` marker, or an error message.
        """
        try:
            from pptx import Presentation

            prs = Presentation(file_path)
            parts = []
            for number, slide in enumerate(prs.slides, start=1):
                parts.append(f"\n--- Slide {number} ---\n")
                for shape in slide.shapes:
                    # Only shapes exposing a ``text`` attribute contribute.
                    if hasattr(shape, "text"):
                        parts.append(shape.text + "\n")
            return "".join(parts)
        except ImportError:
            return "Error: 'python-pptx' library is not installed. Cannot parse PPT files."
        except Exception as e:
            return f"Error parsing PPT file: {str(e)}"

    @staticmethod
    def _clean_cell(cell) -> str:
        """Return *cell* as text that is safe inside one markdown table cell."""
        text = str(cell)
        # A table row must stay on a single line.
        text = text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
        # An unescaped pipe would be read as a column boundary.
        return text.replace('|', '\\|')

    @staticmethod
    def _rows_to_markdown(rows: List[List[str]]) -> str:
        """Convert a list of rows into a markdown table.

        The first row is used as the header. Later rows are padded with
        empty cells or truncated so that every row has exactly as many
        cells as the header. Line breaks inside cells become spaces and
        pipe characters are escaped.

        Args:
            rows: Table rows; cells are converted with ``str``.

        Returns:
            The markdown table, or an empty string when *rows* is empty.
        """
        if not rows:
            return ""

        header = [DataParser._clean_cell(h) for h in rows[0]]
        width = len(header)

        md_lines = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(["---"] * width) + " |",
        ]

        for row in rows[1:]:
            cells = [DataParser._clean_cell(cell) for cell in row[:width]]
            cells += [""] * (width - len(cells))
            md_lines.append("| " + " | ".join(cells) + " |")

        return "\n".join(md_lines)
|
|
|