File size: 16,506 Bytes
05760a8
 
 
 
 
 
 
 
 
 
918120f
05760a8
07b7d87
 
 
 
 
 
 
 
05760a8
 
 
 
 
 
 
fe8dabf
 
 
07b7d87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe8dabf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05760a8
 
918120f
05760a8
fa3f320
05760a8
 
2fa2126
4467901
2fa2126
 
 
fe8dabf
2fa2126
 
 
 
 
 
 
 
fe14ce8
2fa2126
 
 
 
 
 
 
 
 
 
 
 
 
4467901
2fa2126
 
 
 
 
fa3f320
fe14ce8
fa3f320
4467901
fe14ce8
2fa2126
05760a8
 
2fa2126
 
05760a8
 
07b7d87
 
 
 
 
 
 
 
 
 
05760a8
2fa2126
05760a8
2fa2126
05760a8
2fa2126
05760a8
2fa2126
05760a8
918120f
 
 
 
fa3f320
 
918120f
fa3f320
 
 
 
 
 
 
 
 
918120f
fa3f320
2fa2126
 
fa3f320
05760a8
2fa2126
 
 
 
918120f
 
2fa2126
918120f
 
fe14ce8
05760a8
2fa2126
fa3f320
05760a8
 
918120f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa3f320
918120f
 
 
 
 
 
 
 
 
 
 
 
 
fa3f320
918120f
 
 
 
 
 
fa3f320
918120f
05760a8
2fa2126
05760a8
 
 
918120f
05760a8
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
import gradio as gr
import openpyxl
import os
from datetime import datetime
from pptx import Presentation
import PyPDF2
from docx import Document
import io
import tempfile
import logging
import base64

# Configure logging BEFORE anything logs: a module-level logging call on an
# unconfigured root logger installs the default handler, after which
# basicConfig() silently does nothing and this level/format would be lost.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Optional dependency used to read legacy Excel (.xls) workbooks.
# XLRD_AVAILABLE records whether the import succeeded.
try:
    import xlrd
    XLRD_AVAILABLE = True
except ImportError:
    XLRD_AVAILABLE = False
    logging.warning("xlrd not available, .xls files may not be supported")

class MultiConverter:
    """Convert Excel, PowerPoint, PDF and Word files to Markdown-style text.

    The converter holds no state. Every public method hands the given file to
    the matching third-party reader and returns the extracted text as one
    string.
    """

    @staticmethod
    def _cell_text(value):
        """Return the display text for a cell value.

        Only ``None`` (an empty cell) maps to the empty string, so falsy but
        meaningful values such as ``0`` or ``False`` stay visible.
        """
        return "" if value is None else str(value)

    @staticmethod
    def _render_markdown_table(rows):
        """Render rows of strings as a padded Markdown table.

        Args:
            rows: List of rows, each a list of cell strings. The first row is
                used as the table header.

        Returns:
            The table text with one line per row (each terminated by a
            newline) and a separator line after the header, or ``""`` when
            ``rows`` is empty.
        """
        if not rows:
            return ""

        column_count = max(len(row) for row in rows)
        # Pad short rows so every line has the same number of columns.
        padded_rows = [list(row) + [""] * (column_count - len(row)) for row in rows]
        widths = [max(len(row[i]) for row in padded_rows) for i in range(column_count)]

        def format_row(row):
            return "| " + " | ".join(cell.ljust(width) for cell, width in zip(row, widths)) + " |"

        lines = [format_row(padded_rows[0])]
        lines.append("|" + "|".join("-" * (width + 2) for width in widths) + "|")
        # The header row is emitted exactly once; the remaining rows are data.
        lines.extend(format_row(row) for row in padded_rows[1:])
        return "\n".join(lines) + "\n"

    @classmethod
    def _read_openpyxl_rows(cls, sheet):
        """Return the used range of an openpyxl sheet as rows of strings.

        The sheet is read once; the result is cropped to the bounding box of
        the cells whose value is not ``None``. An empty list means the sheet
        holds no data at all.
        """
        grid = []
        top = bottom = left = right = None

        for row_idx in range(1, sheet.max_row + 1):
            values = []
            for col_idx in range(1, sheet.max_column + 1):
                try:
                    value = sheet.cell(row=row_idx, column=col_idx).value
                except Exception as e:
                    # Best effort: an unreadable cell is treated as empty.
                    logging.warning(f"Error reading cell at row {row_idx}, col {col_idx}: {str(e)}")
                    value = None
                values.append(value)

                if value is not None:
                    if top is None:
                        top = row_idx
                    bottom = row_idx
                    left = col_idx if left is None else min(left, col_idx)
                    right = col_idx if right is None else max(right, col_idx)
            grid.append(values)

        if top is None:
            return []

        return [
            [cls._cell_text(value) for value in row[left - 1:right]]
            for row in grid[top - 1:bottom]
        ]

    @staticmethod
    def _read_xls_cell(sheet, row_idx, col_idx, datemode):
        """Return the text of one xlrd cell; date cells are formatted.

        A date cell that cannot be converted to a calendar date falls back to
        its raw value. Any other read failure yields an empty string.
        """
        try:
            cell = sheet.cell(row_idx, col_idx)
            if cell.ctype == xlrd.XL_CELL_DATE:
                try:
                    date_tuple = xlrd.xldate_as_tuple(cell.value, datemode)
                    return datetime(*date_tuple).strftime("%Y-%m-%d %H:%M:%S")
                except ValueError:
                    # Not representable as a datetime: show the raw value
                    # instead of dropping the cell.
                    pass
            return str(cell.value).strip()
        except Exception as e:
            logging.warning(f"Error reading cell at row {row_idx}, col {col_idx}: {str(e)}")
            return ""

    def convert_excel_to_formatted_text(self, excel_file):
        """Convert an Excel workbook to Markdown-style text, one table per sheet.

        Args:
            excel_file: Path of the workbook. Its extension selects the
                reader: ``.xls`` goes to xlrd, everything else to openpyxl.

        Returns:
            The formatted text. Failures do not raise; they are reported as
            an error section inside the returned text.
        """
        output = io.StringIO()
        file_ext = os.path.splitext(excel_file)[1].lower()

        try:
            if file_ext == '.xls':
                # The xlrd path reports a missing xlrd installation itself,
                # which is more precise than letting openpyxl reject the file.
                logging.info("Processing old Excel format (.xls) with xlrd")
                return self._convert_xls_with_xlrd(excel_file, output)

            logging.info("Processing Excel format with openpyxl")
            try:
                workbook = openpyxl.load_workbook(excel_file, data_only=True)
            except Exception as e:
                logging.error(f"Error opening Excel file with openpyxl: {str(e)}")
                output.write("# Error opening Excel file\n\n")
                output.write(f"Details: {str(e)}\n\n")
                output.write("Possible reasons:\n")
                output.write("- The file may be in an older Excel format (.xls). Try saving it as .xlsx\n")
                output.write("- The file may be corrupted or password-protected\n")
                output.write("- The file may contain unsupported features\n\n")
                return output.getvalue()

            for idx, sheet_name in enumerate(workbook.sheetnames):
                if idx > 0:
                    output.write("\n" + "-" * 70 + "\n\n")
                output.write(f"### {sheet_name}:\n")

                rows = self._read_openpyxl_rows(workbook[sheet_name])
                if not rows:
                    output.write("# No data in sheet\n\n")
                    continue

                output.write(self._render_markdown_table(rows))
                output.write("\n")
        except Exception as e:
            logging.exception(f"Error processing Excel file: {str(e)}")
            output.write("# Error processing Excel file\n\n")
            output.write(f"Details: {str(e)}\n\n")

        return output.getvalue()

    def _convert_xls_with_xlrd(self, excel_file, output):
        """Convert a legacy Excel (.xls) workbook using xlrd.

        Args:
            excel_file: Path of the ``.xls`` workbook.
            output: ``io.StringIO`` buffer the text is appended to.

        Returns:
            The full content of ``output`` after conversion. Failures are
            written into the buffer instead of being raised.
        """
        if not XLRD_AVAILABLE:
            output.write("# Error: xlrd library not available to process .xls files\n\n")
            output.write("Please install xlrd with 'pip install xlrd' to process .xls files\n")
            return output.getvalue()

        try:
            workbook = xlrd.open_workbook(excel_file)

            for idx, sheet in enumerate(workbook.sheets()):
                if idx > 0:
                    output.write("\n" + "-" * 70 + "\n\n")
                output.write(f"### {sheet.name}:\n")

                if sheet.nrows <= 0 or sheet.ncols <= 0:
                    output.write("# No data in sheet\n\n")
                    continue

                rows = [
                    [
                        self._read_xls_cell(sheet, row_idx, col_idx, workbook.datemode)
                        for col_idx in range(sheet.ncols)
                    ]
                    for row_idx in range(sheet.nrows)
                ]
                output.write(self._render_markdown_table(rows))
                output.write("\n")

        except Exception as e:
            logging.exception(f"Error processing .xls file with xlrd: {str(e)}")
            output.write("# Error processing .xls file\n\n")
            output.write(f"Details: {str(e)}\n\n")

        return output.getvalue()

    def convert_pptx_to_text(self, pptx_file, filename):
        """Convert a PowerPoint presentation to plain text.

        Args:
            pptx_file: Presentation to open (passed to ``Presentation``).
            filename: Name shown in the heading of the output.

        Returns:
            A heading followed by one section per slide containing the text
            of every shape that exposes a ``text`` attribute.
        """
        output = io.StringIO()
        prs = Presentation(pptx_file)
        output.write(f"# PowerPoint Presentation: {filename}\n\n")
        for slide_num, slide in enumerate(prs.slides, 1):
            output.write(f"## Slide {slide_num}\n")
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    output.write(f"{shape.text}\n\n")
        return output.getvalue()

    def convert_pdf_to_text(self, pdf_file, filename):
        """Convert a PDF document to plain text.

        Args:
            pdf_file: Document to open (passed to ``PyPDF2.PdfReader``).
            filename: Name shown in the heading of the output.

        Returns:
            A heading followed by one section per page with its extracted
            text.
        """
        output = io.StringIO()
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        output.write(f"# PDF Document: {filename}\n\n")
        for page_num, page in enumerate(pdf_reader.pages, 1):
            output.write(f"## Page {page_num}\n")
            # Guard against a page for which no text could be extracted.
            output.write((page.extract_text() or "") + "\n\n")
        return output.getvalue()

    def convert_docx_to_text(self, docx_file, filename):
        """Convert a Word document to plain text.

        Args:
            docx_file: Document to open (passed to ``Document``).
            filename: Name shown in the heading of the output.

        Returns:
            A heading followed by the text of every paragraph, separated by
            blank lines.
        """
        output = io.StringIO()
        doc = Document(docx_file)
        output.write(f"# Word Document: {filename}\n\n")
        for para in doc.paragraphs:
            output.write(para.text + "\n\n")
        return output.getvalue()


def convert_file(file):
    """Convert an uploaded file to text and build an HTML download link.

    Args:
        file: The upload as handed over by the UI. Supported shapes are an
            object with a ``read()`` method, a path string, or a
            ``(name, path)`` tuple. ``None`` means nothing was uploaded.

    Returns:
        A ``(text, download_html)`` tuple. ``text`` is the converted content
        (or an error / "unsupported format" message) and ``download_html`` is
        an anchor tag embedding the text as a base64 data URI. When the
        upload cannot be read or the conversion fails, ``download_html`` is
        an empty string.
    """
    # Local imports keep this function self-contained.
    import html
    import shutil

    if file is None:
        return "No file uploaded. Please select a file first.", ""

    try:
        # Work out the original file name for every supported upload shape.
        if hasattr(file, 'name'):
            file_name = file.name
        elif isinstance(file, str):
            file_name = file
        elif isinstance(file, tuple) and len(file) > 1:
            file_name = file[0]
        else:
            file_name = "unknown_file"
        # Keep only the base name so server-side paths do not leak into the
        # output headings or the download file name.
        file_name = os.path.basename(str(file_name))
        file_ext = os.path.splitext(file_name)[1].lower()

        logging.info(f"Starting conversion for file: {file_name}")

        temp_dir = tempfile.mkdtemp()
        try:
            # The temporary copy keeps the original extension: the Excel
            # converter picks its reader from the extension of this path.
            temp_file_path = os.path.join(temp_dir, "uploaded_file" + file_ext)

            # Copy the upload into the temporary file.
            try:
                if hasattr(file, 'read'):
                    with open(temp_file_path, 'wb') as dst:
                        dst.write(file.read())
                elif isinstance(file, str) and os.path.exists(file):
                    shutil.copyfile(file, temp_file_path)
                elif isinstance(file, tuple) and len(file) > 1 and os.path.exists(file[1]):
                    shutil.copyfile(file[1], temp_file_path)
                else:
                    # Last resort: treat whatever we got as a path.
                    try:
                        shutil.copyfile(str(file), temp_file_path)
                    except Exception:
                        return f"Could not read file. Type: {type(file)}", ""
            except Exception as e:
                return f"Error reading file: {str(e)}", ""

            # Convert according to the file extension.
            try:
                if file_ext in (".xlsx", ".xls"):
                    try:
                        result = MultiConverter().convert_excel_to_formatted_text(temp_file_path)
                    except Exception as e:
                        logging.exception(f"Error during Excel conversion: {str(e)}")
                        result = f"Error converting Excel file: {str(e)}\n\n"
                        result += "This may be due to:\n"
                        result += "- Unsupported Excel format (some .xls files require xlrd library)\n"
                        result += "- Corrupted or password-protected file\n"
                        result += "- Excel file with complex formatting or macros\n\n"
                        result += "Try saving your Excel file as a simple .xlsx file before uploading."
                elif file_ext in (".pptx", ".ppt"):
                    result = MultiConverter().convert_pptx_to_text(temp_file_path, file_name)
                elif file_ext == ".pdf":
                    result = MultiConverter().convert_pdf_to_text(temp_file_path, file_name)
                elif file_ext in (".docx", ".doc"):
                    result = MultiConverter().convert_docx_to_text(temp_file_path, file_name)
                else:
                    result = f"Unsupported file format: {file_ext}"

                output_filename = os.path.splitext(file_name)[0] + ".txt"
                # The name comes from the upload, so escape it before placing
                # it into HTML attribute and text positions.
                safe_filename = html.escape(output_filename, quote=True)

                # Embed the text as a base64 data URI for the download link.
                b64 = base64.b64encode(result.encode('utf-8')).decode()

                download_link = f"""
                <a href="data:text/plain;base64,{b64}" download="{safe_filename}" 
                  style="display: inline-block; padding: 0.6em 1.2em; margin: 0.5em 0; 
                  background-color: #4CAF50; color: white; border: none; border-radius: 4px; 
                  cursor: pointer; text-decoration: none; font-weight: bold;">
                  ⬇️ Download {safe_filename}
                </a>
                """

                return result, download_link
            except Exception as e:
                logging.exception(f"Error converting file: {str(e)}")
                return f"Error converting file: {str(e)}", ""
        finally:
            # Always remove the temporary directory, including on the early
            # "could not read" returns above.
            shutil.rmtree(temp_dir, ignore_errors=True)

    except Exception as e:
        logging.exception(f"Unexpected error: {str(e)}")
        return f"Unexpected error: {str(e)}", ""


# Build the Gradio interface: one component per row, top to bottom.
with gr.Blocks(title="Multi-Format to TXT Converter") as app:
    # Page heading and short description
    gr.Markdown(value="# Multi-Format to TXT Converter by Heuristica.pl")
    gr.Markdown(value="Convert Excel, PowerPoint, PDF, and Word files to text format.")

    # Upload widget
    with gr.Row():
        file_input = gr.File(label="Upload a file (Excel, PowerPoint, PDF, or Word)")

    # Button that triggers the conversion
    with gr.Row():
        convert_button = gr.Button(value="Convert to TXT", variant="primary")

    # Converted text
    with gr.Row():
        text_output = gr.Textbox(lines=15, label="Converted Text")

    # HTML slot that receives the download link
    with gr.Row():
        download_html = gr.HTML(label="Download")

    # Supported formats and usage instructions
    gr.Markdown(value="""
    ## Supported file formats:
    - **Excel**: .xlsx, .xls
    - **PowerPoint**: .pptx, .ppt
    - **PDF**: .pdf
    - **Word**: .docx, .doc
    
    ## How to use:
    1. Upload a file using the file upload button
    2. Click "Convert to TXT"
    3. View the converted text
    4. Click the download button to save the converted text file
    """)

    # Wire the button: the uploaded file goes in, text and download link come out.
    convert_button.click(
        convert_file,
        inputs=[file_input],
        outputs=[text_output, download_html],
    )

# Start the application when the file is executed as a script.
if __name__ == "__main__":
    try:
        logging.info("Starting the application")
        app.launch(debug=True)
        logging.info("Application stopped")
    except Exception as launch_error:
        # Log the failure together with its traceback.
        logging.exception(f"Error launching application: {launch_error}")