import csv
import io
import os
from typing import Optional, List, Dict, Any


class DataParser:
    """
    Utility class to parse data files into LLM-readable text.

    Tabular sources (CSV, Excel, tables inside Word documents) are rendered
    as markdown tables; TXT/MD, PDF, Word and PowerPoint content is returned
    as plain text. All methods are static, so no instance is required.
    """

    @staticmethod
    def parse_file(file_path: str) -> str:
        """
        Parse a data file and return its content as a markdown table or text.

        The parser is selected from the lower-cased file extension. This
        method does not raise for parsing problems: a missing file, an
        unsupported extension, or any exception raised by a parser is
        reported as a human-readable string instead.

        Args:
            file_path: Path to the file to parse.

        Returns:
            The extracted content, or a message starting with "Error" /
            "Unsupported file format" when nothing could be extracted.
        """
        if not os.path.exists(file_path):
            return f"Error: File not found: {file_path}"

        ext = os.path.splitext(file_path)[1].lower()

        try:
            if ext == '.csv':
                return DataParser._parse_csv(file_path)
            if ext in ('.xlsx', '.xls'):
                return DataParser._parse_excel(file_path)
            if ext in ('.txt', '.md'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    return f.read()
            if ext == '.pdf':
                return DataParser._parse_pdf(file_path)
            # NOTE(review): legacy binary .doc / .ppt files are routed to the
            # same readers as .docx / .pptx - confirm those libraries can
            # actually open them; otherwise the caller just gets the
            # library's error text back.
            if ext in ('.docx', '.doc'):
                return DataParser._parse_word(file_path)
            if ext in ('.pptx', '.ppt'):
                return DataParser._parse_ppt(file_path)
            return f"Unsupported file format: {ext}. Please provide CSV, Excel, TXT, PDF, Word, or PPT files."
        except Exception as e:
            # Top-level boundary: callers expect a string, never an exception.
            return f"Error parsing file {os.path.basename(file_path)}: {str(e)}"

    @staticmethod
    def _read_csv_rows(file_path: str, encoding: str) -> List[List[str]]:
        """Read every row of a CSV file using the given text encoding."""
        # newline='' lets the csv module handle line endings itself, which is
        # required for quoted fields that contain embedded line breaks.
        with open(file_path, 'r', encoding=encoding, newline='') as f:
            return list(csv.reader(f))

    @staticmethod
    def _parse_csv(file_path: str) -> str:
        """
        Parse CSV file to markdown table.

        The file is decoded as UTF-8 (a leading BOM is tolerated); if that
        fails to decode, GBK is tried. When both fail, the original UTF-8
        decoding error is raised.

        Returns:
            The markdown table, or "Empty CSV file." when there are no rows.
        """
        try:
            rows = DataParser._read_csv_rows(file_path, 'utf-8-sig')
        except UnicodeDecodeError as utf8_error:
            # Only a decoding failure justifies retrying with another
            # encoding; I/O errors would fail identically the second time.
            try:
                rows = DataParser._read_csv_rows(file_path, 'gbk')
            except (UnicodeDecodeError, csv.Error):
                raise utf8_error from None

        if not rows:
            return "Empty CSV file."
        return DataParser._rows_to_markdown(rows)

    @staticmethod
    def _parse_excel(file_path: str) -> str:
        """
        Parse Excel file to markdown table using pandas if available, else openpyxl.

        The pandas path calls ``read_excel`` with default arguments. When
        pandas (or something it needs) cannot be imported, the active sheet
        is read with openpyxl instead. If openpyxl is missing as well, an
        error string is returned.
        """
        try:
            import pandas as pd
            df = pd.read_excel(file_path)
            return df.to_markdown(index=False)
        except ImportError:
            # Fall through to the openpyxl implementation below.
            pass

        try:
            import openpyxl
        except ImportError:
            return "Error: Neither 'pandas' nor 'openpyxl' libraries are installed. Cannot parse Excel files."

        wb = openpyxl.load_workbook(file_path, data_only=True)
        try:
            sheet = wb.active
            rows = []
            for row in sheet.iter_rows(values_only=True):
                # Skip rows with no content at all. Test for None / ""
                # explicitly: a plain truthiness test would also discard
                # rows whose only values are 0 or False, which is real data.
                if any(cell is not None and cell != "" for cell in row):
                    rows.append([str(cell) if cell is not None else "" for cell in row])
        finally:
            wb.close()
        return DataParser._rows_to_markdown(rows)

    @staticmethod
    def _parse_pdf(file_path: str) -> str:
        """
        Parse PDF file to text.

        The text of each page is followed by a blank line. Failures are
        reported as strings rather than raised.
        """
        try:
            import pypdf
            with open(file_path, 'rb') as f:
                reader = pypdf.PdfReader(f)
                # "or ''" guards against a page yielding None instead of a
                # string, which would otherwise break the concatenation.
                text = "".join(
                    (page.extract_text() or "") + "\n\n" for page in reader.pages
                )
            return text if text.strip() else "[PDF contains no extractable text]"
        except ImportError:
            return "Error: 'pypdf' library is not installed. Cannot parse PDF files."
        except Exception as e:
            return f"Error parsing PDF: {str(e)}"

    @staticmethod
    def _parse_word(file_path: str) -> str:
        """
        Parse Word file to text.

        Paragraph text comes first, one paragraph per line, followed by each
        table rendered as markdown under a "[Table Extracted from Word]"
        marker. Failures are reported as strings rather than raised.
        """
        try:
            import docx
            doc = docx.Document(file_path)
            parts = ["\n".join(para.text for para in doc.paragraphs)]

            for table in doc.tables:
                table_rows = [[cell.text for cell in row.cells] for row in table.rows]
                parts.append("\n[Table Extracted from Word]\n")
                parts.append(DataParser._rows_to_markdown(table_rows) + "\n")

            return "".join(parts)
        except ImportError:
            return "Error: 'python-docx' library is not installed. Cannot parse Word files."
        except Exception as e:
            return f"Error parsing Word file: {str(e)}"

    @staticmethod
    def _parse_ppt(file_path: str) -> str:
        """
        Parse PowerPoint file to text.

        Each slide is introduced by a "--- Slide N ---" marker (1-based) and
        followed by the text of every shape that exposes a ``text``
        attribute. Failures are reported as strings rather than raised.
        """
        try:
            from pptx import Presentation
            prs = Presentation(file_path)
            parts = []

            for number, slide in enumerate(prs.slides, start=1):
                parts.append(f"\n--- Slide {number} ---\n")
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        parts.append(shape.text + "\n")

            return "".join(parts)
        except ImportError:
            return "Error: 'python-pptx' library is not installed. Cannot parse PPT files."
        except Exception as e:
            return f"Error parsing PPT file: {str(e)}"

    @staticmethod
    def _clean_cell(cell: Any) -> str:
        """Render one value so that it fits inside a single markdown table cell."""
        text = str(cell)
        # A markdown table row must stay on one physical line.
        text = text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
        # An unescaped pipe would be read as a column boundary.
        return text.replace('|', '\\|')

    @staticmethod
    def _rows_to_markdown(rows: List[List[str]]) -> str:
        """
        Convert list of lists to markdown table.

        The first row is used as the header and fixes the column count:
        shorter rows are padded with empty cells and longer rows are
        truncated. Cells are stringified, line breaks inside a cell become
        spaces, and literal pipes are escaped.

        Args:
            rows: Table rows, header first.

        Returns:
            The markdown table, or an empty string when ``rows`` is empty.
        """
        if not rows:
            return ""

        header = [DataParser._clean_cell(h) for h in rows[0]]
        width = len(header)

        md_lines = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(["---"] * width) + " |",
        ]

        for row in rows[1:]:
            # Slice first so over-long rows are truncated, then pad short ones.
            cells = [DataParser._clean_cell(cell) for cell in row[:width]]
            cells += [""] * (width - len(cells))
            md_lines.append("| " + " | ".join(cells) + " |")

        return "\n".join(md_lines)