Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import openpyxl | |
| import os | |
| from datetime import datetime | |
| from pptx import Presentation | |
| import PyPDF2 | |
| from docx import Document | |
| import io | |
| import tempfile | |
| import logging | |
| import base64 | |
# Configure logging before anything logs. The module-level logging.warning()
# below would otherwise install a default root handler first, which turns this
# basicConfig() call into a no-op and silently drops the level/format settings.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# xlrd is optional: it is only used to read legacy .xls workbooks.
try:
    import xlrd
    XLRD_AVAILABLE = True
except ImportError:
    XLRD_AVAILABLE = False
    logging.warning("xlrd not available, .xls files may not be supported")
class MultiConverter:
    """Convert Excel, PowerPoint, PDF and Word files to text.

    Every public ``convert_*`` method returns the converted content as a
    single string. The Excel converters log processing failures and describe
    them inside the returned text instead of raising.
    """

    @staticmethod
    def _cell_to_text(value):
        """Return the text for one cell value; only ``None`` becomes empty.

        Falsy but meaningful values such as ``0`` or ``False`` are kept.
        """
        return "" if value is None else str(value)

    @staticmethod
    def _format_xls_date(date_tuple):
        """Format an ``xlrd.xldate_as_tuple`` result as text.

        A tuple whose date part is ``(0, 0, 0)`` is rendered as ``HH:MM:SS``
        because ``datetime`` cannot represent year 0; any other tuple is
        rendered as ``YYYY-MM-DD HH:MM:SS``.
        """
        if tuple(date_tuple[:3]) == (0, 0, 0):
            return "{:02d}:{:02d}:{:02d}".format(*date_tuple[3:6])
        return datetime(*date_tuple).strftime("%Y-%m-%d %H:%M:%S")

    @staticmethod
    def _render_markdown_table(rows):
        """Render rows of strings as a column-aligned Markdown table.

        Args:
            rows: List of equally long lists of strings. The first row is
                the header.

        Returns:
            The header line, a dashed separator line and one line per
            remaining row, each terminated by a newline. Returns ``""``
            when ``rows`` is empty.
        """
        if not rows:
            return ""
        widths = [max(len(cell) for cell in column) for column in zip(*rows)]

        def render(row):
            padded = (cell.ljust(width) for cell, width in zip(row, widths))
            return "| " + " | ".join(padded) + " |"

        header, *body = rows
        lines = [
            render(header),
            "|" + "|".join("-" * (width + 2) for width in widths) + "|",
        ]
        # The header row is written once only; it is not repeated as data.
        lines.extend(render(row) for row in body)
        return "\n".join(lines) + "\n"

    def convert_excel_to_formatted_text(self, excel_file):
        """Convert an Excel workbook to Markdown-style text.

        Args:
            excel_file: Path of the workbook. A ``.xls`` extension is routed
                to xlrd when that library is available; everything else is
                opened with openpyxl.

        Returns:
            One ``### <sheet name>:`` section per sheet, each followed by a
            Markdown table of the populated cell range, or an error
            description when the workbook cannot be opened or processed.
        """
        output = io.StringIO()
        file_ext = os.path.splitext(excel_file)[1].lower()
        try:
            if file_ext == '.xls' and XLRD_AVAILABLE:
                # Legacy binary format: delegate to xlrd.
                logging.info("Processing old Excel format (.xls) with xlrd")
                return self._convert_xls_with_xlrd(excel_file, output)

            logging.info("Processing Excel format with openpyxl")
            try:
                workbook = openpyxl.load_workbook(excel_file, data_only=True)
            except Exception as e:
                logging.error(f"Error opening Excel file with openpyxl: {str(e)}")
                output.write(f"# Error opening Excel file\n\n")
                output.write(f"Details: {str(e)}\n\n")
                output.write("Possible reasons:\n")
                output.write("- The file may be in an older Excel format (.xls). Try saving it as .xlsx\n")
                output.write("- The file may be corrupted or password-protected\n")
                output.write("- The file may contain unsupported features\n\n")
                return output.getvalue()

            for idx, sheet_name in enumerate(workbook.sheetnames):
                if idx > 0:
                    output.write("\n" + "-" * 70 + "\n\n")
                output.write(f"### {sheet_name}:\n")
                sheet = workbook[sheet_name]

                # Collect every populated cell in one pass. Emptiness is
                # decided from the values themselves, so a sheet holding a
                # single cell is still converted.
                values = {}
                for row_idx in range(1, sheet.max_row + 1):
                    for col_idx in range(1, sheet.max_column + 1):
                        try:
                            cell_value = sheet.cell(row=row_idx, column=col_idx).value
                        except Exception as e:
                            logging.warning(f"Error reading cell at row {row_idx}, col {col_idx}: {str(e)}")
                            continue
                        if cell_value is not None:
                            values[(row_idx, col_idx)] = cell_value

                if not values:
                    output.write("# No data in sheet\n\n")
                    continue

                # Bounding box of the populated cells.
                used_rows = [row_idx for row_idx, _ in values]
                used_cols = [col_idx for _, col_idx in values]
                min_row, max_row = min(used_rows), max(used_rows)
                min_col, max_col = min(used_cols), max(used_cols)

                rows = [
                    [
                        self._cell_to_text(values.get((row_idx, col_idx)))
                        for col_idx in range(min_col, max_col + 1)
                    ]
                    for row_idx in range(min_row, max_row + 1)
                ]
                output.write(self._render_markdown_table(rows))
                output.write("\n")
        except Exception as e:
            logging.exception(f"Error processing Excel file: {str(e)}")
            output.write(f"# Error processing Excel file\n\n")
            output.write(f"Details: {str(e)}\n\n")
        return output.getvalue()

    def _xls_cell_to_text(self, sheet, row_idx, col_idx, datemode):
        """Return the text of one xlrd cell, or ``""`` if it cannot be read."""
        try:
            cell = sheet.cell(row_idx, col_idx)
            if cell.ctype == xlrd.XL_CELL_DATE:
                # Dates are stored as serial numbers; make them readable.
                date_tuple = xlrd.xldate_as_tuple(cell.value, datemode)
                return self._format_xls_date(date_tuple)
            return str(cell.value).strip()
        except Exception as e:
            logging.warning(f"Error reading cell at row {row_idx}, col {col_idx}: {str(e)}")
            return ""

    def _convert_xls_with_xlrd(self, excel_file, output):
        """Convert an old-format (.xls) workbook using xlrd.

        Args:
            excel_file: Path of the ``.xls`` workbook.
            output: ``io.StringIO`` buffer that receives the text.

        Returns:
            The full contents of ``output`` after conversion, including an
            error description when xlrd is missing or processing fails.
        """
        if not XLRD_AVAILABLE:
            output.write("# Error: xlrd library not available to process .xls files\n\n")
            output.write("Please install xlrd with 'pip install xlrd' to process .xls files\n")
            return output.getvalue()
        try:
            workbook = xlrd.open_workbook(excel_file)
            for idx, sheet in enumerate(workbook.sheets()):
                if idx > 0:
                    output.write("\n" + "-" * 70 + "\n\n")
                output.write(f"### {sheet.name}:\n")
                if sheet.nrows <= 0 or sheet.ncols <= 0:
                    output.write("# No data in sheet\n\n")
                    continue
                rows = [
                    [
                        self._xls_cell_to_text(sheet, row_idx, col_idx, workbook.datemode)
                        for col_idx in range(sheet.ncols)
                    ]
                    for row_idx in range(sheet.nrows)
                ]
                output.write(self._render_markdown_table(rows))
                output.write("\n")
        except Exception as e:
            logging.exception(f"Error processing .xls file with xlrd: {str(e)}")
            output.write(f"# Error processing .xls file\n\n")
            output.write(f"Details: {str(e)}\n\n")
        return output.getvalue()

    def convert_pptx_to_text(self, pptx_file, filename):
        """Convert a PowerPoint presentation to plain text.

        Args:
            pptx_file: Presentation to open (passed to ``Presentation``).
            filename: Name shown in the heading of the output.

        Returns:
            A heading, then one ``## Slide N`` section per slide containing
            the text of every shape that exposes a ``text`` attribute.
        """
        output = io.StringIO()
        prs = Presentation(pptx_file)
        output.write(f"# PowerPoint Presentation: {filename}\n\n")
        for slide_num, slide in enumerate(prs.slides, 1):
            output.write(f"## Slide {slide_num}\n")
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    output.write(f"{shape.text}\n\n")
        return output.getvalue()

    def convert_pdf_to_text(self, pdf_file, filename):
        """Convert a PDF document to plain text.

        Args:
            pdf_file: PDF to open (passed to ``PyPDF2.PdfReader``).
            filename: Name shown in the heading of the output.

        Returns:
            A heading, then one ``## Page N`` section per page with the
            text extracted from that page.
        """
        output = io.StringIO()
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        output.write(f"# PDF Document: {filename}\n\n")
        for page_num, page in enumerate(pdf_reader.pages, 1):
            output.write(f"## Page {page_num}\n")
            # Guard against a page for which no text is returned.
            output.write((page.extract_text() or "") + "\n\n")
        return output.getvalue()

    def convert_docx_to_text(self, docx_file, filename):
        """Convert a Word document to plain text.

        Args:
            docx_file: Document to open (passed to ``Document``).
            filename: Name shown in the heading of the output.

        Returns:
            A heading followed by the text of every paragraph, each
            separated by a blank line.
        """
        output = io.StringIO()
        doc = Document(docx_file)
        output.write(f"# Word Document: {filename}\n\n")
        for para in doc.paragraphs:
            output.write(para.text + "\n\n")
        return output.getvalue()
def _resolve_upload(file):
    """Return ``(source, name, is_guess)`` describing an uploaded object.

    ``source`` is either an object with ``read()`` or a path string,
    ``name`` is the best available file name (or ``None``), and
    ``is_guess`` is true when ``file`` matched no known form and its string
    representation is merely being tried as a path.
    """
    name = getattr(file, "name", None)
    if name is None and isinstance(file, str):
        # A plain path string carries its own name.
        name = file
    if hasattr(file, "read"):
        return file, name, False
    if isinstance(file, str) and os.path.exists(file):
        return file, name, False
    if isinstance(file, tuple) and len(file) > 1 and os.path.exists(file[1]):
        # (name, path) tuple.
        return file[1], file[0], False
    return str(file), name, True


def convert_file(file):
    """Convert an uploaded document to text and build an HTML download link.

    Args:
        file: The upload to convert. Accepted forms are an object with a
            ``read()`` method, a string path, or a ``(name, path)`` tuple;
            anything else is passed through ``str()`` and tried as a path.
            A ``name`` attribute, when present, supplies the file name.

    Returns:
        A ``(text, download_html)`` tuple. ``text`` is the converted content
        or an error message. ``download_html`` is an ``<a>`` element that
        embeds the text as a base64 ``data:`` URI, or ``""`` when no file
        was given or reading/converting it failed.
    """
    import html  # function-scope import: used only to escape the file name

    if file is None:
        return "No file uploaded. Please select a file first.", ""

    try:
        logging.info(f"Starting conversion for file: {file.name if hasattr(file, 'name') else 'unknown'}")

        try:
            source, raw_name, is_guess = _resolve_upload(file)
        except Exception as e:
            return f"Error reading file: {str(e)}", ""

        # Use only the base name: `name` is often a full temporary path.
        file_name = os.path.basename(str(raw_name)) if raw_name else "unknown_file"
        file_ext = os.path.splitext(file_name)[1].lower()

        # Keep the extension on the temporary copy: the Excel converter picks
        # its reader from the path's extension and openpyxl refuses paths
        # without a supported one. Odd extensions are dropped so the temporary
        # path stays valid on every platform.
        temp_ext = file_ext if file_ext[1:].isalnum() else ""
        temp_dir = tempfile.mkdtemp()
        temp_file_path = os.path.join(temp_dir, "uploaded_file" + temp_ext)

        # Everything after mkdtemp() runs under this try so that every exit
        # path, including early error returns, removes the temporary files.
        try:
            try:
                if hasattr(source, "read"):
                    with open(temp_file_path, 'wb') as dst:
                        dst.write(source.read())
                else:
                    with open(source, 'rb') as src, open(temp_file_path, 'wb') as dst:
                        dst.write(src.read())
            except Exception as e:
                if is_guess:
                    return f"Could not read file. Type: {type(file)}", ""
                return f"Error reading file: {str(e)}", ""

            try:
                if file_ext in (".xlsx", ".xls"):
                    try:
                        result = MultiConverter().convert_excel_to_formatted_text(temp_file_path)
                    except Exception as e:
                        logging.exception(f"Error during Excel conversion: {str(e)}")
                        result = f"Error converting Excel file: {str(e)}\n\n"
                        result += "This may be due to:\n"
                        result += "- Unsupported Excel format (some .xls files require xlrd library)\n"
                        result += "- Corrupted or password-protected file\n"
                        result += "- Excel file with complex formatting or macros\n\n"
                        result += "Try saving your Excel file as a simple .xlsx file before uploading."
                elif file_ext in (".pptx", ".ppt"):
                    result = MultiConverter().convert_pptx_to_text(temp_file_path, file_name)
                elif file_ext == ".pdf":
                    result = MultiConverter().convert_pdf_to_text(temp_file_path, file_name)
                elif file_ext in (".docx", ".doc"):
                    result = MultiConverter().convert_docx_to_text(temp_file_path, file_name)
                else:
                    result = f"Unsupported file format: {file_ext}"

                # Name of the downloadable text file.
                output_filename = os.path.splitext(file_name)[0] + ".txt"
                # The name comes from the upload, so escape it before placing
                # it inside HTML attributes and element text.
                safe_name = html.escape(output_filename, quote=True)

                # Embed the text in the link itself as a base64 data URI.
                b64 = base64.b64encode(result.encode('utf-8')).decode()
                download_link = f"""
<a href="data:text/plain;base64,{b64}" download="{safe_name}"
style="display: inline-block; padding: 0.6em 1.2em; margin: 0.5em 0;
background-color: #4CAF50; color: white; border: none; border-radius: 4px;
cursor: pointer; text-decoration: none; font-weight: bold;">
⬇️ Download {safe_name}
</a>
"""
                return result, download_link
            except Exception as e:
                logging.exception(f"Error converting file: {str(e)}")
                return f"Error converting file: {str(e)}", ""
        finally:
            # Best-effort cleanup: a failure here must not mask the result.
            try:
                if os.path.exists(temp_file_path):
                    os.unlink(temp_file_path)
                os.rmdir(temp_dir)
            except Exception as e:
                logging.warning(f"Could not clean up temporary files: {str(e)}")
    except Exception as e:
        logging.exception(f"Unexpected error: {str(e)}")
        return f"Unexpected error: {str(e)}", ""
# Build the Gradio interface: upload -> convert button -> text + download link.
with gr.Blocks(title="Multi-Format to TXT Converter") as app:
    # Page heading and a one-line description.
    gr.Markdown("# Multi-Format to TXT Converter by Heuristica.pl")
    gr.Markdown("Convert Excel, PowerPoint, PDF, and Word files to text format.")

    # One component per row, in top-to-bottom order.
    with gr.Row():
        file_input = gr.File(
            label="Upload a file (Excel, PowerPoint, PDF, or Word)",
        )
    with gr.Row():
        convert_button = gr.Button(
            "Convert to TXT",
            variant="primary",
        )
    with gr.Row():
        text_output = gr.Textbox(
            label="Converted Text",
            lines=15,
        )
    with gr.Row():
        download_html = gr.HTML(
            label="Download",
        )

    # Static help text: supported formats and usage steps.
    gr.Markdown("""
## Supported file formats:
- **Excel**: .xlsx, .xls
- **PowerPoint**: .pptx, .ppt
- **PDF**: .pdf
- **Word**: .docx, .doc
## How to use:
1. Upload a file using the file upload button
2. Click "Convert to TXT"
3. View the converted text
4. Click the download button to save the converted text file
""")

    # Clicking the button runs convert_file on the upload and fills both the
    # text box and the download-link area with its two return values.
    convert_button.click(
        fn=convert_file,
        inputs=[file_input],
        outputs=[text_output, download_html],
    )
# Script entry point: start the app and log, rather than propagate, any
# failure raised while launching or running it.
if __name__ == "__main__":
    try:
        logging.info("Starting the application")
        app.launch(debug=True)
        logging.info("Application stopped")
    except Exception as launch_error:
        logging.exception(f"Error launching application: {launch_error}")