File size: 6,946 Bytes
ce70732
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import csv
import io
import os
from typing import Optional, List, Dict, Any

class DataParser:
    """Utility class to parse stability data files into LLM-readable text.

    Supports CSV, Excel, plain-text/Markdown, PDF, Word and PowerPoint
    files. Every method is static; tabular content is rendered as a
    Markdown table and everything else as plain text.
    """

    @staticmethod
    def parse_file(file_path: str) -> str:
        """Parse a data file and return its content as a markdown table or text.

        The parser is chosen from the (case-insensitive) file extension.
        This method never raises for parsing problems: any exception from
        the underlying parser is converted into an error string.

        Args:
            file_path: Path to the file to parse.

        Returns:
            String content suitable for LLM context, or a human-readable
            error message when the file is missing, unsupported, or cannot
            be parsed.
        """
        if not os.path.exists(file_path):
            return f"Error: File not found: {file_path}"

        ext = os.path.splitext(file_path)[1].lower()

        try:
            if ext == '.csv':
                return DataParser._parse_csv(file_path)
            elif ext in ('.xlsx', '.xls'):
                return DataParser._parse_excel(file_path)
            elif ext in ('.txt', '.md'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    return f.read()
            elif ext == '.pdf':
                return DataParser._parse_pdf(file_path)
            elif ext in ('.docx', '.doc'):
                return DataParser._parse_word(file_path)
            elif ext in ('.pptx', '.ppt'):
                return DataParser._parse_ppt(file_path)
            else:
                return f"Unsupported file format: {ext}. Please provide CSV, Excel, TXT, PDF, Word, or PPT files."
        except Exception as e:
            # Top-level boundary: callers expect a string, not an exception.
            return f"Error parsing file {os.path.basename(file_path)}: {str(e)}"

    @staticmethod
    def _parse_csv(file_path: str) -> str:
        """Parse a CSV file into a markdown table.

        The file is decoded as UTF-8 (a leading BOM is stripped) and, if
        that fails with a decoding error, as GBK.

        Args:
            file_path: Path to the CSV file.

        Returns:
            The markdown table, or "Empty CSV file." when there are no rows.

        Raises:
            UnicodeDecodeError: If the file decodes with neither encoding;
                the error from the first (UTF-8) attempt is raised.
            OSError: If the file cannot be opened.
        """
        first_error = None
        for encoding in ('utf-8-sig', 'gbk'):
            try:
                # newline='' lets the csv module handle line endings itself,
                # so newlines embedded in quoted fields are parsed correctly.
                with open(file_path, 'r', encoding=encoding, newline='') as f:
                    rows = list(csv.reader(f))
            except UnicodeDecodeError as exc:
                # Only a decoding failure justifies trying another encoding.
                if first_error is None:
                    first_error = exc
                continue

            if not rows:
                return "Empty CSV file."
            return DataParser._rows_to_markdown(rows)

        raise first_error

    @staticmethod
    def _parse_excel(file_path: str) -> str:
        """Parse an Excel file into a markdown table.

        pandas is tried first (``read_excel`` followed by ``to_markdown``).
        If that raises ImportError - pandas itself or one of the optional
        packages it needs is missing - the active sheet is read with
        openpyxl instead.

        Args:
            file_path: Path to the Excel workbook.

        Returns:
            The markdown table, or an error message when neither pandas nor
            openpyxl can be imported.
        """
        try:
            import pandas as pd
            # Use pandas for robust Excel handling
            df = pd.read_excel(file_path)
            # Convert to markdown directly
            return df.to_markdown(index=False)
        except ImportError:
            try:
                import openpyxl
            except ImportError:
                return "Error: Neither 'pandas' nor 'openpyxl' libraries are installed. Cannot parse Excel files."

            wb = openpyxl.load_workbook(file_path, data_only=True)
            sheet = wb.active
            rows = []
            for row in sheet.iter_rows(values_only=True):
                # Skip rows that are completely empty. Only None and "" count
                # as empty: a plain truthiness test would also discard rows
                # made of legitimate 0 / False values.
                if any(cell is not None and cell != "" for cell in row):
                    rows.append([str(cell) if cell is not None else "" for cell in row])
            return DataParser._rows_to_markdown(rows)

    @staticmethod
    def _parse_pdf(file_path: str) -> str:
        """Parse a PDF file into plain text using pypdf.

        Args:
            file_path: Path to the PDF file.

        Returns:
            The text of every page, each followed by a blank line; a
            placeholder when no text could be extracted; or an error
            message when pypdf is missing or parsing fails.
        """
        try:
            import pypdf
            with open(file_path, 'rb') as f:
                reader = pypdf.PdfReader(f)
                # Extract while the file is still open; `or ""` guards
                # against a page yielding no text object at all.
                text = "".join(
                    (page.extract_text() or "") + "\n\n" for page in reader.pages
                )
            return text if text.strip() else "[PDF contains no extractable text]"
        except ImportError:
            return "Error: 'pypdf' library is not installed. Cannot parse PDF files."
        except Exception as e:
            return f"Error parsing PDF: {str(e)}"

    @staticmethod
    def _parse_word(file_path: str) -> str:
        """Parse a Word file into text using python-docx.

        Paragraph text comes first (one paragraph per line), followed by
        every table rendered as a markdown table under a marker line.

        Args:
            file_path: Path to the Word document.

        Returns:
            The extracted text, or an error message when python-docx is
            missing or parsing fails.
        """
        try:
            import docx
            doc = docx.Document(file_path)
            parts = ["\n".join(para.text for para in doc.paragraphs)]

            # Also extract tables
            for table in doc.tables:
                rows = [[cell.text for cell in row.cells] for row in table.rows]
                parts.append("\n[Table Extracted from Word]\n")
                parts.append(DataParser._rows_to_markdown(rows) + "\n")

            return "".join(parts)
        except ImportError:
            return "Error: 'python-docx' library is not installed. Cannot parse Word files."
        except Exception as e:
            return f"Error parsing Word file: {str(e)}"

    @staticmethod
    def _parse_ppt(file_path: str) -> str:
        """Parse a PowerPoint file into text using python-pptx.

        Each slide is introduced by a "--- Slide N ---" marker (1-based)
        followed by the text of every shape that exposes a ``text``
        attribute.

        Args:
            file_path: Path to the presentation.

        Returns:
            The extracted text, or an error message when python-pptx is
            missing or parsing fails.
        """
        try:
            from pptx import Presentation
            prs = Presentation(file_path)
            parts = []
            for number, slide in enumerate(prs.slides, start=1):
                parts.append(f"\n--- Slide {number} ---\n")
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        parts.append(shape.text + "\n")
            return "".join(parts)
        except ImportError:
            return "Error: 'python-pptx' library is not installed. Cannot parse PPT files."
        except Exception as e:
            return f"Error parsing PPT file: {str(e)}"

    @staticmethod
    def _markdown_cell(value: Any) -> str:
        """Render one value as single-line text that is safe inside a table cell."""
        text = str(value).replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')
        # An unescaped pipe would be read as a column separator.
        return text.replace('|', '\\|')

    @staticmethod
    def _rows_to_markdown(rows: List[List[Any]]) -> str:
        """Convert a list of rows into a markdown table.

        The first row is the header. Later rows are padded with empty cells
        or truncated so that every row has as many cells as the header.
        Line breaks inside cells become spaces and pipes are escaped so the
        table structure stays intact.

        Args:
            rows: Table rows; cells are converted with ``str``.

        Returns:
            The markdown table, or an empty string when ``rows`` is empty.
        """
        if not rows:
            return ""

        header = [DataParser._markdown_cell(h) for h in rows[0]]
        width = len(header)

        md_lines = [
            "| " + " | ".join(header) + " |",
            "| " + " | ".join(["---"] * width) + " |",
        ]

        for row in rows[1:]:
            # Truncate over-long rows, then pad short ones to the header width.
            clean_row = [DataParser._markdown_cell(cell) for cell in row[:width]]
            clean_row += [""] * (width - len(clean_row))
            md_lines.append("| " + " | ".join(clean_row) + " |")

        return "\n".join(md_lines)