import base64
import html
import io
import logging
import os
import shutil
import tempfile
from datetime import datetime

import gradio as gr
import openpyxl
import PyPDF2
from docx import Document
from pptx import Presentation

# Configure logging before anything emits a record: a logging call on the
# unconfigured root logger installs a default handler, after which
# basicConfig() is a no-op and the level/format below would be ignored.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Optional dependency used for legacy Excel (.xls) workbooks.
try:
    import xlrd
    XLRD_AVAILABLE = True
except ImportError:
    XLRD_AVAILABLE = False
    logging.warning("xlrd not available, .xls files may not be supported")

# Legacy binary Office extensions mapped to the format the user should
# re-save in when conversion fails.
_LEGACY_FORMAT_HINTS = {".ppt": ".pptx", ".doc": ".docx"}


class MultiConverter:
    """Convert Excel, PowerPoint, PDF and Word files to text."""

    @staticmethod
    def _cell_to_text(value):
        """Return *value* as single-line text for a table cell.

        ``None`` becomes an empty string; every other value (including
        ``0`` and ``False``) is kept. Line breaks are replaced by spaces
        because a table row has to stay on one line.
        """
        if value is None:
            return ""
        return " ".join(str(value).splitlines())

    @staticmethod
    def _trim_to_used_range(grid):
        """Return the sub-grid between the first and last non-empty row/column.

        Args:
            grid: Rectangular list of rows holding raw cell values.

        Returns:
            The trimmed list of rows, or ``[]`` when every value is ``None``.
        """
        used_rows = [
            r for r, row in enumerate(grid)
            if any(value is not None for value in row)
        ]
        if not used_rows:
            return []
        used_cols = [
            c for c in range(len(grid[0]))
            if any(row[c] is not None for row in grid)
        ]
        top, bottom = used_rows[0], used_rows[-1]
        left, right = used_cols[0], used_cols[-1]
        return [row[left:right + 1] for row in grid[top:bottom + 1]]

    @staticmethod
    def _write_markdown_table(output, rows):
        """Write *rows* (lists of strings) to *output* as a padded Markdown table.

        The first row is the header and is followed by a separator line.
        Each row is written exactly once.
        """
        if not rows:
            return
        widths = [
            max(len(row[i]) for row in rows) for i in range(len(rows[0]))
        ]
        for row_number, row in enumerate(rows):
            padded = (cell.ljust(widths[i]) for i, cell in enumerate(row))
            output.write("| " + " | ".join(padded) + " |\n")
            if row_number == 0:
                separator = "|".join("-" * (width + 2) for width in widths)
                output.write("|" + separator + "|\n")
        output.write("\n")

    @staticmethod
    def _read_cell(sheet, row_idx, col_idx):
        """Return the value of an openpyxl cell, or ``None`` if it cannot be read."""
        try:
            return sheet.cell(row=row_idx, column=col_idx).value
        except Exception as e:  # best effort: one bad cell must not abort the sheet
            logging.warning(
                f"Error reading cell at row {row_idx}, col {col_idx}: {str(e)}"
            )
            return None

    @staticmethod
    def _xls_cell_to_text(sheet, row_idx, col_idx, datemode):
        """Return the text of one xlrd cell.

        Dates are formatted as ``YYYY-MM-DD HH:MM:SS``; whole numbers are
        written without a trailing ``.0``. A cell that cannot be read
        yields an empty string.
        """
        try:
            cell = sheet.cell(row_idx, col_idx)
        except Exception as e:  # best effort, mirrors the openpyxl path
            logging.warning(
                f"Error reading cell at row {row_idx}, col {col_idx}: {str(e)}"
            )
            return ""

        value = cell.value
        if cell.ctype == xlrd.XL_CELL_DATE:
            try:
                date_tuple = xlrd.xldate_as_tuple(value, datemode)
                return datetime(*date_tuple).strftime("%Y-%m-%d %H:%M:%S")
            except ValueError:
                # Not representable as a datetime (e.g. a time-only value):
                # fall through and keep the raw value instead of dropping it.
                pass
        elif cell.ctype == xlrd.XL_CELL_NUMBER and float(value).is_integer():
            value = int(value)
        return MultiConverter._cell_to_text(value).strip()

    def convert_excel_to_formatted_text(self, excel_file):
        """Convert an Excel workbook to Markdown-style tables.

        Args:
            excel_file: Path of the workbook. The extension selects the
                reader: ``.xls`` uses xlrd when it is installed, anything
                else is opened with openpyxl.

        Returns:
            The converted text. Errors are reported inside the returned
            text rather than raised.
        """
        output = io.StringIO()
        file_ext = os.path.splitext(excel_file)[1].lower()

        try:
            if file_ext == '.xls' and XLRD_AVAILABLE:
                logging.info("Processing old Excel format (.xls) with xlrd")
                return self._convert_xls_with_xlrd(excel_file, output)

            logging.info("Processing Excel format with openpyxl")
            try:
                workbook = openpyxl.load_workbook(excel_file, data_only=True)
            except Exception as e:
                logging.error(f"Error opening Excel file with openpyxl: {str(e)}")
                output.write("# Error opening Excel file\n\n")
                output.write(f"Details: {str(e)}\n\n")
                output.write("Possible reasons:\n")
                output.write("- The file may be in an older Excel format (.xls). Try saving it as .xlsx\n")
                output.write("- The file may be corrupted or password-protected\n")
                output.write("- The file may contain unsupported features\n\n")
                return output.getvalue()

            for idx, sheet_name in enumerate(workbook.sheetnames):
                if idx > 0:
                    output.write("\n" + "-" * 70 + "\n\n")
                output.write(f"### {sheet_name}:\n")
                sheet = workbook[sheet_name]

                # Read the sheet once, then cut it down to the used range.
                grid = [
                    [
                        self._read_cell(sheet, row_idx, col_idx)
                        for col_idx in range(1, sheet.max_column + 1)
                    ]
                    for row_idx in range(1, sheet.max_row + 1)
                ]
                used = self._trim_to_used_range(grid)
                if not used:
                    output.write("# No data in sheet\n\n")
                    continue

                rows = [[self._cell_to_text(value) for value in row] for row in used]
                self._write_markdown_table(output, rows)
        except Exception as e:
            logging.exception(f"Error processing Excel file: {str(e)}")
            output.write("# Error processing Excel file\n\n")
            output.write(f"Details: {str(e)}\n\n")

        return output.getvalue()

    def _convert_xls_with_xlrd(self, excel_file, output):
        """Convert a legacy Excel (.xls) workbook using xlrd.

        Args:
            excel_file: Path of the ``.xls`` workbook.
            output: ``io.StringIO`` the converted text is appended to.

        Returns:
            The full contents of *output* after conversion. Errors are
            written into the text rather than raised.
        """
        if not XLRD_AVAILABLE:
            output.write("# Error: xlrd library not available to process .xls files\n\n")
            output.write("Please install xlrd with 'pip install xlrd' to process .xls files\n")
            return output.getvalue()

        try:
            workbook = xlrd.open_workbook(excel_file)

            for idx, sheet in enumerate(workbook.sheets()):
                if idx > 0:
                    output.write("\n" + "-" * 70 + "\n\n")
                output.write(f"### {sheet.name}:\n")

                if sheet.nrows <= 0 or sheet.ncols <= 0:
                    output.write("# No data in sheet\n\n")
                    continue

                rows = [
                    [
                        self._xls_cell_to_text(sheet, row_idx, col_idx, workbook.datemode)
                        for col_idx in range(sheet.ncols)
                    ]
                    for row_idx in range(sheet.nrows)
                ]
                self._write_markdown_table(output, rows)
        except Exception as e:
            logging.exception(f"Error processing .xls file with xlrd: {str(e)}")
            output.write("# Error processing .xls file\n\n")
            output.write(f"Details: {str(e)}\n\n")

        return output.getvalue()

    def convert_pptx_to_text(self, pptx_file, filename):
        """Convert a PowerPoint presentation to plain text.

        Writes a title line with *filename*, then one ``## Slide N``
        section per slide containing the text of every shape that exposes
        a ``text`` attribute.
        """
        output = io.StringIO()
        prs = Presentation(pptx_file)

        output.write(f"# PowerPoint Presentation: {filename}\n\n")
        for slide_num, slide in enumerate(prs.slides, 1):
            output.write(f"## Slide {slide_num}\n")
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    output.write(f"{shape.text}\n\n")

        return output.getvalue()

    def convert_pdf_to_text(self, pdf_file, filename):
        """Convert a PDF document to plain text, one ``## Page N`` section per page."""
        output = io.StringIO()
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        output.write(f"# PDF Document: {filename}\n\n")
        for page_num, page in enumerate(pdf_reader.pages, 1):
            output.write(f"## Page {page_num}\n")
            # Guard against a page that yields no text object at all.
            output.write((page.extract_text() or "") + "\n\n")

        return output.getvalue()

    def convert_docx_to_text(self, docx_file, filename):
        """Convert a Word document to plain text, one paragraph per block."""
        output = io.StringIO()
        doc = Document(docx_file)

        output.write(f"# Word Document: {filename}\n\n")
        for para in doc.paragraphs:
            output.write(para.text + "\n\n")

        return output.getvalue()


def _uploaded_file_name(file):
    """Return the best available name or path for an uploaded object.

    Handles a ``(name, path)`` tuple, an object with a string ``name``
    attribute, and a plain string path. Anything else is reported as
    ``"unknown_file"``.
    """
    if isinstance(file, tuple) and len(file) > 1 and os.path.exists(file[1]):
        return str(file[0])
    name = getattr(file, "name", None)
    if isinstance(name, str) and name:
        return name
    if isinstance(file, str):
        return file
    return "unknown_file"


def _copy_upload(file, destination):
    """Copy the uploaded content to *destination*.

    Returns:
        ``True`` when the content was written, ``False`` when *file* is of
        a shape that cannot be read.

    Raises:
        OSError: If reading the source or writing the destination fails.
    """
    if isinstance(file, tuple) and len(file) > 1 and os.path.exists(file[1]):
        shutil.copyfile(file[1], destination)
        return True

    # Prefer the on-disk path over read(): an already-open handle may not be
    # positioned at the start of the uploaded bytes.
    named_path = getattr(file, "name", None)
    if isinstance(named_path, str) and os.path.isfile(named_path):
        shutil.copyfile(named_path, destination)
        return True

    if hasattr(file, "read"):
        with open(destination, "wb") as dst:
            dst.write(file.read())
        return True

    # Last chance: treat the object itself as a path.
    candidate = file if isinstance(file, str) else str(file)
    if os.path.isfile(candidate):
        shutil.copyfile(candidate, destination)
        return True
    return False


def convert_file(file):
    """Convert an uploaded file to text.

    Args:
        file: The uploaded object. Supported shapes are an object with a
            ``name`` path and/or a ``read()`` method, a string path, or a
            ``(name, path)`` tuple. ``None`` means nothing was uploaded.

    Returns:
        A ``(text, download_html)`` tuple. ``text`` is the converted
        content or an error message; ``download_html`` is an HTML link
        embedding the text as a downloadable ``.txt`` file, or an empty
        string when the upload could not be read or converted.
    """
    if file is None:
        return "No file uploaded. Please select a file first.", ""

    temp_dir = None
    try:
        source_name = _uploaded_file_name(file)
        logging.info(f"Starting conversion for file: {source_name}")

        # Only the base name is shown to the user; the upload path may be a
        # server-side temporary location.
        file_name = os.path.basename(source_name) or "unknown_file"
        file_ext = os.path.splitext(file_name)[1].lower()

        # Keep the original extension on the working copy: the Excel readers
        # pick their format from it.
        temp_dir = tempfile.mkdtemp()
        temp_file_path = os.path.join(temp_dir, "uploaded_file" + file_ext)

        try:
            copied = _copy_upload(file, temp_file_path)
        except (OSError, TypeError, ValueError) as e:
            return f"Error reading file: {str(e)}", ""
        if not copied:
            return f"Could not read file. Type: {type(file)}", ""

        converter = MultiConverter()
        try:
            if file_ext in [".xlsx", ".xls"]:
                try:
                    result = converter.convert_excel_to_formatted_text(temp_file_path)
                except Exception as e:
                    logging.exception(f"Error during Excel conversion: {str(e)}")
                    result = f"Error converting Excel file: {str(e)}\n\n"
                    result += "This may be due to:\n"
                    result += "- Unsupported Excel format (some .xls files require xlrd library)\n"
                    result += "- Corrupted or password-protected file\n"
                    result += "- Excel file with complex formatting or macros\n\n"
                    result += "Try saving your Excel file as a simple .xlsx file before uploading."
            elif file_ext in [".pptx", ".ppt"]:
                result = converter.convert_pptx_to_text(temp_file_path, file_name)
            elif file_ext == ".pdf":
                result = converter.convert_pdf_to_text(temp_file_path, file_name)
            elif file_ext in [".docx", ".doc"]:
                result = converter.convert_docx_to_text(temp_file_path, file_name)
            else:
                result = f"Unsupported file format: {file_ext}"

            output_filename = os.path.splitext(file_name)[0] + ".txt"

            # Embed the text as a base64 data URI so the browser can save it
            # without a server-side file. The file name comes from the
            # upload, so it is escaped before being placed into HTML.
            b64 = base64.b64encode(result.encode('utf-8')).decode()
            safe_name = html.escape(output_filename, quote=True)
            download_link = (
                f'<a href="data:text/plain;charset=utf-8;base64,{b64}" '
                f'download="{safe_name}">⬇️ Download {safe_name}</a>'
            )

            return result, download_link
        except Exception as e:
            logging.exception(f"Error converting file: {str(e)}")
            message = f"Error converting file: {str(e)}"
            modern_ext = _LEGACY_FORMAT_HINTS.get(file_ext)
            if modern_ext:
                message += (
                    f"\n\nLegacy {file_ext} files cannot be read. "
                    f"Save the file as {modern_ext} and try again."
                )
            return message, ""
    except Exception as e:
        logging.exception(f"Unexpected error: {str(e)}")
        return f"Unexpected error: {str(e)}", ""
    finally:
        # Remove the working copy on every exit path, including early returns.
        if temp_dir is not None:
            try:
                shutil.rmtree(temp_dir)
            except OSError as e:
                logging.warning(f"Could not clean up temporary files: {str(e)}")
# Build the Gradio interface.
with gr.Blocks(title="Multi-Format to TXT Converter") as app:
    gr.Markdown("# Multi-Format to TXT Converter by Heuristica.pl")
    gr.Markdown("Convert Excel, PowerPoint, PDF, and Word files to text format.")

    with gr.Row():
        file_input = gr.File(label="Upload a file (Excel, PowerPoint, PDF, or Word)")

    with gr.Row():
        convert_button = gr.Button("Convert to TXT", variant="primary")

    with gr.Row():
        text_output = gr.Textbox(label="Converted Text", lines=15)

    with gr.Row():
        download_html = gr.HTML(label="Download")

    # Help text. Built from explicit lines so the Markdown headings and list
    # items keep their line breaks regardless of source indentation.
    gr.Markdown(
        "## Supported file formats:\n"
        "- **Excel**: .xlsx, .xls\n"
        "- **PowerPoint**: .pptx (legacy .ppt files must first be saved as .pptx)\n"
        "- **PDF**: .pdf\n"
        "- **Word**: .docx (legacy .doc files must first be saved as .docx)\n"
        "\n"
        "## How to use:\n"
        "1. Upload a file using the file upload button\n"
        "2. Click \"Convert to TXT\"\n"
        "3. View the converted text\n"
        "4. Click the download button to save the converted text file\n"
    )

    # Wire the button to the conversion callback: the uploaded file goes in,
    # the converted text and the download link come out.
    convert_button.click(
        fn=convert_file,
        inputs=[file_input],
        outputs=[text_output, download_html]
    )

# Run the application when executed as a script.
if __name__ == "__main__":
    try:
        logging.info("Starting the application")
        app.launch(debug=True)
        logging.info("Application stopped")
    except Exception as e:
        logging.exception(f"Error launching application: {str(e)}")