Spaces:

Kevinshh
/

Preformu

Running

File size: 6,946 Bytes

ce70732

import csv
import io
import os
from typing import Optional, List, Dict, Any

class DataParser:
    """
    Utility class to parse stability data files into LLM-readable format.

    Supports CSV, Excel, plain text / Markdown, PDF, Word and PowerPoint
    files. Tabular content is rendered as a Markdown table, everything else
    as plain text.

    All methods are static; ``parse_file`` is the entry point and reports
    failures as human-readable strings instead of raising.
    """

    @staticmethod
    def parse_file(file_path: str) -> str:
        """
        Parse a data file and return its content as a markdown table or text.

        The parser is chosen from the (case-insensitive) file extension.

        Args:
            file_path: Path to the file.

        Returns:
            String content suitable for LLM context. If the file does not
            exist, has an unsupported extension, or a parser raises, an
            explanatory message is returned instead.
        """
        if not os.path.exists(file_path):
            return f"Error: File not found: {file_path}"

        ext = os.path.splitext(file_path)[1].lower()

        try:
            if ext == '.csv':
                return DataParser._parse_csv(file_path)
            elif ext in ('.xlsx', '.xls'):
                return DataParser._parse_excel(file_path)
            elif ext in ('.txt', '.md'):
                # Same encoding handling as CSV: BOM stripped, GBK fallback.
                return DataParser._read_text(file_path)
            elif ext == '.pdf':
                return DataParser._parse_pdf(file_path)
            elif ext in ('.docx', '.doc'):
                return DataParser._parse_word(file_path)
            elif ext in ('.pptx', '.ppt'):
                return DataParser._parse_ppt(file_path)
            else:
                return f"Unsupported file format: {ext}. Please provide CSV, Excel, TXT, PDF, Word, or PPT files."
        except Exception as e:
            # Top-level boundary: callers expect a string, never an exception.
            return f"Error parsing file {os.path.basename(file_path)}: {str(e)}"

    @staticmethod
    def _read_text(file_path: str, newline: Optional[str] = None) -> str:
        """
        Read a whole text file, trying UTF-8 (BOM-aware) first and GBK second.

        Args:
            file_path: Path to the file.
            newline: Passed through to ``open``; use ``''`` when the text is
                going to be handed to the ``csv`` module.

        Returns:
            The decoded file content.

        Raises:
            UnicodeDecodeError: The UTF-8 error, if neither encoding can
                decode the file.
        """
        try:
            with open(file_path, 'r', encoding='utf-8-sig', newline=newline) as f:
                return f.read()
        except UnicodeDecodeError as utf8_error:
            try:
                with open(file_path, 'r', encoding='gbk', newline=newline) as f:
                    return f.read()
            except UnicodeDecodeError:
                # Report the primary (UTF-8) failure, not the fallback's.
                raise utf8_error from None

    @staticmethod
    def _parse_csv(file_path: str) -> str:
        """
        Parse CSV file to markdown table.

        Rows that are blank (no cells, or only whitespace cells) are skipped
        so that a leading blank line cannot become a zero-column header.

        Returns:
            The markdown table, or "Empty CSV file." when no non-blank row
            is present.
        """
        # newline='' keeps line endings untouched, as the csv module requires
        # for correct handling of newlines embedded in quoted fields.
        text = DataParser._read_text(file_path, newline='')
        reader = csv.reader(io.StringIO(text, newline=''))
        rows = [row for row in reader if any(cell.strip() for cell in row)]

        if not rows:
            return "Empty CSV file."

        return DataParser._rows_to_markdown(rows)

    @staticmethod
    def _parse_excel(file_path: str) -> str:
        """
        Parse Excel file to markdown table using pandas if available, else openpyxl.

        Only the first sheet (pandas) / the active sheet (openpyxl) is read.
        Any ImportError raised on the pandas path, including one raised for a
        missing optional dependency, triggers the openpyxl fallback.
        """
        try:
            import pandas as pd
            # Use pandas for robust Excel handling
            df = pd.read_excel(file_path)
            # Convert to markdown directly
            return df.to_markdown(index=False)
        except ImportError:
            try:
                import openpyxl
                wb = openpyxl.load_workbook(file_path, data_only=True)
                sheet = wb.active
                rows = []
                for row in sheet.iter_rows(values_only=True):
                    # Skip rows with no content. Test for None / "" explicitly:
                    # a plain truthiness check would also drop rows whose only
                    # values are 0 or False, which are real data.
                    if any(cell is not None and cell != "" for cell in row):
                        # Convert None to empty string and force string conversion
                        rows.append([str(cell) if cell is not None else "" for cell in row])
                return DataParser._rows_to_markdown(rows)
            except ImportError:
                return "Error: Neither 'pandas' nor 'openpyxl' libraries are installed. Cannot parse Excel files."

    @staticmethod
    def _parse_pdf(file_path: str) -> str:
        """
        Parse PDF file to text.

        Each page's extracted text is followed by a blank line. Errors are
        returned as messages rather than raised.
        """
        try:
            import pypdf
            with open(file_path, 'rb') as f:
                reader = pypdf.PdfReader(f)
                # `or ""` guards against an extractor returning None for a page.
                pages = [(page.extract_text() or "") + "\n\n" for page in reader.pages]
            text = "".join(pages)
            return text if text.strip() else "[PDF contains no extractable text]"
        except ImportError:
            return "Error: 'pypdf' library is not installed. Cannot parse PDF files."
        except Exception as e:
            return f"Error parsing PDF: {str(e)}"

    @staticmethod
    def _parse_word(file_path: str) -> str:
        """
        Parse Word file to text.

        Paragraph text comes first (one paragraph per line), followed by each
        table rendered as a markdown table under a marker line. Errors are
        returned as messages rather than raised.
        """
        try:
            import docx
            doc = docx.Document(file_path)
            parts = ["\n".join(para.text for para in doc.paragraphs)]

            # Also extract tables
            for table in doc.tables:
                rows = [[cell.text for cell in row.cells] for row in table.rows]
                parts.append("\n[Table Extracted from Word]\n")
                parts.append(DataParser._rows_to_markdown(rows) + "\n")

            return "".join(parts)
        except ImportError:
            return "Error: 'python-docx' library is not installed. Cannot parse Word files."
        except Exception as e:
            return f"Error parsing Word file: {str(e)}"

    @staticmethod
    def _parse_ppt(file_path: str) -> str:
        """
        Parse PowerPoint file to text.

        Emits a "--- Slide N ---" marker per slide followed by the text of
        every shape that exposes a ``text`` attribute. Errors are returned as
        messages rather than raised.
        """
        try:
            from pptx import Presentation
            prs = Presentation(file_path)
            parts = []
            for number, slide in enumerate(prs.slides, start=1):
                parts.append(f"\n--- Slide {number} ---\n")
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        parts.append(shape.text + "\n")
            return "".join(parts)
        except ImportError:
            return "Error: 'python-pptx' library is not installed. Cannot parse PPT files."
        except Exception as e:
            return f"Error parsing PPT file: {str(e)}"

    @staticmethod
    def _clean_cell(cell: Any) -> str:
        """
        Render one cell value as text that is safe inside a markdown table row.

        Line breaks become single spaces (a table row must stay on one line)
        and literal pipes are backslash-escaped so they are not read as
        column delimiters.
        """
        text = str(cell)
        for line_break in ("\r\n", "\r", "\n"):
            text = text.replace(line_break, " ")
        return text.replace("|", "\\|")

    @staticmethod
    def _rows_to_markdown(rows: List[List[str]]) -> str:
        """
        Convert list of lists to markdown table.

        The first row is the header and fixes the column count: shorter rows
        are padded with empty cells, longer rows are truncated.

        Args:
            rows: Table rows; cell values are converted with ``str``.

        Returns:
            The markdown table, or "" when ``rows`` is empty.
        """
        if not rows:
            return ""

        header = [DataParser._clean_cell(h) for h in rows[0]]
        width = len(header)

        md_lines = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(["---"] * width) + " |",
        ]

        for row in rows[1:]:
            # Truncate to the header width, then pad if the row is short.
            clean_row = [DataParser._clean_cell(cell) for cell in row[:width]]
            clean_row += [""] * (width - len(clean_row))
            md_lines.append("| " + " | ".join(clean_row) + " |")

        return "\n".join(md_lines)