File size: 8,168 Bytes
96ad218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
"""

File processor for attachment feature

Supports plain-text and source-code files (txt, md, json, py, js, ...), plus csv, pdf, docx/doc and xlsx/xls

"""

import os
from typing import Dict, Optional
import datetime


def process_file(file_path: str, max_chars: int = 20000) -> Optional[Dict]:
    """
    Extract text content and basic metadata from an uploaded file.

    Args:
        file_path: Path to the uploaded file.
        max_chars: Maximum number of characters of extracted text to keep.
            Longer content is cut to this length and a truncation notice is
            appended. Defaults to 20,000.

    Returns:
        Dictionary with the keys ``filename``, ``content``, ``size_bytes``,
        ``file_type`` (lower-cased extension without the dot),
        ``uploaded_at`` (ISO-8601 local timestamp) and ``char_count``,
        or None if processing failed.

        ``char_count`` is the length of the returned ``content``, i.e. it
        includes the truncation notice when one was appended. Bracketed
        placeholder strings produced by ``extract_content`` are returned as
        ordinary content.
    """
    try:
        filename = os.path.basename(file_path)
        file_size = os.path.getsize(file_path)
        file_ext = os.path.splitext(filename)[1].lower()

        # Dispatch on the lower-cased extension
        content = extract_content(file_path, file_ext)

        if content is None:
            return None

        # Keep only the first max_chars characters and tell the reader so
        original_length = len(content)

        if original_length > max_chars:
            content = content[:max_chars]
            content += f"\n\n[πŸ“ Content truncated - original file was {original_length:,} characters, showing first {max_chars:,}]"

        return {
            "filename": filename,
            "content": content,
            "size_bytes": file_size,
            "file_type": file_ext[1:],  # Remove the dot
            "uploaded_at": datetime.datetime.now().isoformat(),
            "char_count": len(content)
        }

    except Exception as e:
        # Top-level boundary: any failure (missing file, reader error, ...)
        # is reported on stdout and signalled to the caller as None.
        print(f"Error processing file {file_path}: {str(e)}")
        return None


def extract_content(file_path: str, file_ext: str) -> Optional[str]:
    """Route a file to the matching reader based on its extension.

    ``file_ext`` is compared as given (leading dot included, no case
    folding). Unknown extensions yield a bracketed "unsupported" message
    rather than None.
    """
    plain_text_exts = ('.txt', '.md', '.log', '.json', '.html', '.xml', '.css', '.sql')
    source_code_exts = (
        '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.cpp', '.c', '.h',
        '.cs', '.php', '.rb', '.go', '.rs', '.swift', '.kt', '.sh', '.yml', '.yaml',
    )

    # Plain text and source code share the same reader
    if file_ext in plain_text_exts or file_ext in source_code_exts:
        return read_text_file(file_path)

    if file_ext == '.csv':
        return read_csv_file(file_path)

    if file_ext == '.pdf':
        return read_pdf_file(file_path)

    # Word documents
    if file_ext in ('.docx', '.doc'):
        return read_docx_file(file_path)

    # Excel workbooks
    if file_ext in ('.xlsx', '.xls'):
        return read_excel_file(file_path)

    return f"[❌ Unsupported file type: {file_ext}]"


def read_text_file(file_path: str) -> Optional[str]:
    """Read a text file, choosing the encoding from its BOM or by fallback.

    Decoding order:
      1. UTF-32 / UTF-16, but only when the file starts with the matching
         byte-order mark. Without a BOM almost any even-length byte string
         "successfully" decodes as UTF-16, which silently turns legacy
         8-bit files into garbage.
      2. UTF-8 (``utf-8-sig``, so a leading UTF-8 BOM is stripped).
      3. cp1252, then latin-1. latin-1 accepts every byte sequence, so it
         has to come last.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text, or a bracketed error message if the file cannot
        be read.
    """
    import codecs

    try:
        with open(file_path, 'rb') as f:
            head = f.read(4)
    except OSError as e:
        return f"[❌ Error reading text file: {e}]"

    encodings = []
    # Check UTF-32 first: its little-endian BOM begins with the UTF-16 one.
    if head.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
        encodings.append('utf-32')
    elif head.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
        encodings.append('utf-16')
    encodings += ['utf-8-sig', 'cp1252', 'latin-1']

    for encoding in encodings:
        try:
            # Text mode keeps universal-newline translation
            with open(file_path, 'r', encoding=encoding) as f:
                return f.read()
        except UnicodeError:
            continue
        except OSError as e:
            return f"[❌ Error reading text file: {e}]"

    # Not expected to be reached (latin-1 never fails); kept as a safety net.
    return "[❌ Could not decode text file - unsupported encoding]"


def read_csv_file(file_path: str) -> Optional[str]:
    """Read a CSV file and render it as pipe-separated text.

    The first row is treated as the header. At most the first 100 data
    rows are rendered; remaining rows are only counted and summarised in
    a trailing note. The row total in the title line includes the header.

    Args:
        file_path: Path of the CSV file (UTF-8, with or without BOM).

    Returns:
        The formatted text, "[Empty CSV file]" for a file with no rows,
        or a bracketed error message if reading fails.
    """
    max_data_rows = 100

    try:
        import csv
        from itertools import islice

        # utf-8-sig strips the BOM written by some spreadsheet exports;
        # plain utf-8 would leave U+FEFF glued to the first header cell.
        with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
            reader = csv.reader(f)

            # Header + the data rows that will be shown
            shown = list(islice(reader, max_data_rows + 1))

            if not shown:
                return "[Empty CSV file]"

            # Count the rest without keeping it in memory
            hidden = sum(1 for _ in reader)

        total_rows = len(shown) + hidden

        output = [
            f"CSV Data ({total_rows} rows):\n",
            "=" * 50,
            " | ".join(shown[0]),
            "-" * 50,
        ]
        output.extend(" | ".join(str(cell) for cell in row) for row in shown[1:])

        if hidden:
            output.append(f"\n[... {hidden} more rows]")

        return "\n".join(output)

    except Exception as e:
        return f"[❌ Error reading CSV: {str(e)}]"


def read_pdf_file(file_path: str) -> Optional[str]:
    """Pull the text out of a PDF via pdfplumber.

    Only the first 50 pages are read; pages with no extractable text are
    skipped, and a trailing note reports how many pages were left out.
    Any failure (including a missing pdfplumber install) is returned as a
    bracketed message instead of being raised.
    """
    page_limit = 50

    try:
        import pdfplumber

        with pdfplumber.open(file_path) as document:
            sections = []

            for number, page in enumerate(document.pages[:page_limit], start=1):
                page_text = page.extract_text()
                if not page_text:
                    continue
                sections.append(f"--- Page {number} ---\n{page_text}")

            total_pages = len(document.pages)
            if total_pages > page_limit:
                sections.append(f"\n[... {total_pages - page_limit} more pages not shown]")

            combined = "\n\n".join(sections)

            if combined.strip():
                return combined

            return "[❌ PDF appears to be empty or contains only images]"

    except ImportError:
        return "[❌ pdfplumber not installed - run: pip install pdfplumber]"
    except Exception as e:
        return f"[❌ Error reading PDF: {str(e)}]"


def read_docx_file(file_path: str) -> Optional[str]:
    """Pull the text out of a Word document via python-docx.

    Non-blank body paragraphs come first, followed by every table row
    rendered as pipe-separated cell text. Any failure (including a missing
    python-docx install) is returned as a bracketed message instead of
    being raised.
    """
    try:
        from docx import Document

        document = Document(file_path)

        # Body paragraphs, skipping whitespace-only ones
        chunks = [para.text for para in document.paragraphs if para.text.strip()]

        # Table rows are appended after the paragraphs
        for table in document.tables:
            for row in table.rows:
                joined_cells = " | ".join(cell.text.strip() for cell in row.cells)
                if joined_cells.strip():
                    chunks.append(joined_cells)

        combined = "\n\n".join(chunks)

        if combined.strip():
            return combined

        return "[❌ Word document appears to be empty]"

    except ImportError:
        return "[❌ python-docx not installed - run: pip install python-docx]"
    except Exception as e:
        return f"[❌ Error reading Word document: {str(e)}]"


def read_excel_file(file_path: str) -> Optional[str]:
    """Render every sheet of an Excel workbook as plain text.

    For each sheet the output contains its name, its dimensions and the
    first 50 rows as a table, with a note when further rows were omitted.

    Args:
        file_path: Path of the workbook.

    Returns:
        The formatted text, or a bracketed error message if pandas (or the
        engine it needs) is missing or the workbook cannot be read.
    """
    max_rows = 50

    try:
        import pandas as pd

        output = []

        # Open the workbook once and close it deterministically; every
        # sheet is parsed from this handle rather than re-opening the path.
        with pd.ExcelFile(file_path) as excel_file:
            sheet_names = excel_file.sheet_names

            output.append(f"Excel File - {len(sheet_names)} sheet(s)\n")
            output.append("=" * 50)

            for sheet_name in sheet_names:
                df = excel_file.parse(sheet_name=sheet_name)

                output.append(f"\nπŸ“Š Sheet: {sheet_name}")
                output.append(f"Dimensions: {df.shape[0]} rows Γ— {df.shape[1]} columns")
                output.append("-" * 50)

                # Convert to string representation (limit rows)
                if len(df) > max_rows:
                    output.append(df.head(max_rows).to_string(index=False))
                    output.append(f"\n[... {len(df) - max_rows} more rows]")
                else:
                    output.append(df.to_string(index=False))

                output.append("\n")

        return "\n".join(output)

    except ImportError:
        return "[❌ pandas/openpyxl not installed - run: pip install pandas openpyxl]"
    except Exception as e:
        return f"[❌ Error reading Excel file: {str(e)}]"


def get_file_icon(file_type: str) -> str:
    """Return the icon string for a file type.

    Args:
        file_type: File extension such as ``"pdf"``. Matching ignores case
            and any leading dots, so ``"PDF"`` and ``".pdf"`` work too.

    Returns:
        The icon mapped to the type, or the generic attachment icon when
        the type is unknown.
    """
    icons = {
        'txt': 'πŸ“„', 'md': 'πŸ“', 'pdf': 'πŸ“•', 'doc': 'πŸ“˜', 'docx': 'πŸ“˜',
        'xls': 'πŸ“Š', 'xlsx': 'πŸ“Š', 'csv': 'πŸ“Š',
        'json': 'πŸ“‹', 'xml': 'πŸ“‹', 'html': '🌐',
        'py': '🐍', 'js': 'πŸ“œ', 'ts': 'πŸ“œ', 'java': 'β˜•', 'cpp': 'βš™οΈ',
        'log': 'πŸ“‹', 'sql': 'πŸ—„οΈ', 'yml': 'βš™οΈ', 'yaml': 'βš™οΈ'
    }

    # Normalize so callers may pass "PDF" or ".pdf"; non-string values fall
    # through to the default icon exactly as before.
    if isinstance(file_type, str):
        file_type = file_type.lower().lstrip('.')

    return icons.get(file_type, 'πŸ“Ž')


def format_file_size(size_bytes: int) -> str:
    """Format a byte count in human-readable form using 1024-based units.

    Values below 1024 are shown as whole bytes ("512 B"); larger values
    are shown with one decimal in KB, MB, GB or TB ("1.5 KB", "5.0 GB").

    Args:
        size_bytes: Size in bytes.

    Returns:
        The formatted size string.
    """
    if size_bytes < 1024:
        return f"{size_bytes} B"

    value = size_bytes / 1024
    for unit in ("KB", "MB", "GB"):
        if value < 1024:
            return f"{value:.1f} {unit}"
        value /= 1024

    # Anything larger is reported in terabytes
    return f"{value:.1f} TB"