# Source: Hugging Face Space "Marek4321/Converter_Multi" (status: Sleeping).
# File size: 16,506 bytes.

import base64
import html
import io
import logging
import os
import shutil
import tempfile
from datetime import datetime

import gradio as gr
import openpyxl
import PyPDF2
from docx import Document
from pptx import Presentation

# Configure logging before anything logs: a module-level logging.warning()
# issued on an unconfigured root logger installs a default handler, after
# which basicConfig() silently does nothing and this format/level is lost.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Optional dependency used for legacy Excel (.xls) workbooks.
try:
    import xlrd
    XLRD_AVAILABLE = True
except ImportError:
    XLRD_AVAILABLE = False
    logging.warning("xlrd not available, .xls files may not be supported")

class MultiConverter:
    """Convert Excel, PowerPoint, PDF and Word files to Markdown-style text.

    Every ``convert_*`` method takes the path of the file to read and returns
    the extracted text as one string.
    """

    @staticmethod
    def _cell_to_text(value):
        """Return the text of one cell value.

        Only ``None`` (an empty cell) becomes an empty string, so falsy but
        meaningful values such as ``0`` or ``False`` are preserved.
        """
        return "" if value is None else str(value)

    @staticmethod
    def _format_markdown_table(rows):
        """Render rows of strings as a padded Markdown table.

        Args:
            rows: List of rows, each a list of cell strings. The first row is
                treated as the header.

        Returns:
            The table text, one line per row plus a dashed separator after the
            header, each line terminated by a newline. Empty input gives "".
        """
        if not rows:
            return ""

        widths = [0] * max(len(row) for row in rows)
        for row in rows:
            for i, cell in enumerate(row):
                widths[i] = max(widths[i], len(cell))

        separator = "|" + "|".join("-" * (width + 2) for width in widths) + "|"
        lines = []
        for row_idx, row in enumerate(rows):
            padded = (cell.ljust(widths[i]) for i, cell in enumerate(row))
            lines.append("| " + " | ".join(padded) + " |")
            if row_idx == 0:
                # The header is emitted exactly once, followed by the separator.
                lines.append(separator)
        return "\n".join(lines) + "\n"

    @staticmethod
    def _read_openpyxl_rows(sheet):
        """Return the bounding box of non-empty cells of *sheet* as text rows.

        Args:
            sheet: Object exposing ``max_row``, ``max_column`` and
                ``cell(row=..., column=...)`` whose result has ``.value``.

        Returns:
            A list of rows (lists of strings) covering the smallest rectangle
            that contains every non-``None`` cell, or ``[]`` when all cells
            are empty.
        """
        first_row = first_col = None
        last_row = last_col = -1
        grid = []

        for row_idx in range(1, sheet.max_row + 1):
            values = []
            for col_idx in range(1, sheet.max_column + 1):
                try:
                    value = sheet.cell(row=row_idx, column=col_idx).value
                except Exception as e:
                    logging.warning(f"Error reading cell at row {row_idx}, col {col_idx}: {str(e)}")
                    value = None
                values.append(value)
                if value is None:
                    continue
                # Track the data rectangle using 0-based indexes into ``grid``.
                if first_row is None:
                    first_row = row_idx - 1
                last_row = row_idx - 1
                position = col_idx - 1
                first_col = position if first_col is None else min(first_col, position)
                last_col = max(last_col, position)
            grid.append(values)

        if first_row is None:
            return []

        return [
            [MultiConverter._cell_to_text(v) for v in values[first_col:last_col + 1]]
            for values in grid[first_row:last_row + 1]
        ]

    @staticmethod
    def _xls_cell_to_text(sheet, row_idx, col_idx, datemode):
        """Return the text of one xlrd cell, formatting date/time cells.

        Cells that cannot be read or converted are logged and rendered as "".
        """
        try:
            cell = sheet.cell(row_idx, col_idx)
            if cell.ctype == xlrd.XL_CELL_DATE:
                year, month, day, hour, minute, second = xlrd.xldate_as_tuple(cell.value, datemode)
                if (year, month, day) == (0, 0, 0):
                    # xlrd reports time-only values with a zero date, which
                    # datetime() cannot represent.
                    return f"{hour:02d}:{minute:02d}:{second:02d}"
                return datetime(year, month, day, hour, minute, second).strftime("%Y-%m-%d %H:%M:%S")
            return str(cell.value).strip()
        except Exception as e:
            logging.warning(f"Error reading cell at row {row_idx}, col {col_idx}: {str(e)}")
            return ""

    def convert_excel_to_formatted_text(self, excel_file):
        """Convert an Excel workbook to Markdown-style tables, one per sheet.

        Args:
            excel_file: Path of the workbook. A ``.xls`` extension selects
                xlrd when it is installed; everything else goes to openpyxl.

        Returns:
            The formatted text. Failures while opening or processing the
            workbook are reported inside the returned text instead of raising.
        """
        output = io.StringIO()
        file_ext = os.path.splitext(excel_file)[1].lower()

        try:
            if file_ext == '.xls' and XLRD_AVAILABLE:
                # Legacy binary workbooks are handled by xlrd.
                logging.info("Processing old Excel format (.xls) with xlrd")
                return self._convert_xls_with_xlrd(excel_file, output)

            logging.info("Processing Excel format with openpyxl")
            try:
                workbook = openpyxl.load_workbook(excel_file, data_only=True)
            except Exception as e:
                logging.error(f"Error opening Excel file with openpyxl: {str(e)}")
                output.write("# Error opening Excel file\n\n")
                output.write(f"Details: {str(e)}\n\n")
                output.write("Possible reasons:\n")
                output.write("- The file may be in an older Excel format (.xls). Try saving it as .xlsx\n")
                output.write("- The file may be corrupted or password-protected\n")
                output.write("- The file may contain unsupported features\n\n")
                return output.getvalue()

            for idx, sheet_name in enumerate(workbook.sheetnames):
                if idx > 0:
                    output.write("\n" + "-" * 70 + "\n\n")
                output.write(f"### {sheet_name}:\n")
                sheet = workbook[sheet_name]

                # Sheets without a cell grid (presumably chart sheets) have
                # nothing to tabulate and must not abort the other sheets.
                rows = self._read_openpyxl_rows(sheet) if hasattr(sheet, "cell") else []
                if not rows:
                    output.write("# No data in sheet\n\n")
                    continue

                output.write(self._format_markdown_table(rows))
                output.write("\n")
        except Exception as e:
            logging.exception(f"Error processing Excel file: {str(e)}")
            output.write("# Error processing Excel file\n\n")
            output.write(f"Details: {str(e)}\n\n")

        return output.getvalue()

    def _convert_xls_with_xlrd(self, excel_file, output):
        """Convert a legacy Excel (.xls) workbook using xlrd.

        Args:
            excel_file: Path of the workbook.
            output: ``io.StringIO`` the text is appended to.

        Returns:
            The full contents of *output* after conversion. Errors are written
            into the text instead of being raised.
        """
        if not XLRD_AVAILABLE:
            output.write("# Error: xlrd library not available to process .xls files\n\n")
            output.write("Please install xlrd with 'pip install xlrd' to process .xls files\n")
            return output.getvalue()

        try:
            workbook = xlrd.open_workbook(excel_file)

            for idx, sheet in enumerate(workbook.sheets()):
                if idx > 0:
                    output.write("\n" + "-" * 70 + "\n\n")

                output.write(f"### {sheet.name}:\n")

                if sheet.nrows <= 0 or sheet.ncols <= 0:
                    output.write("# No data in sheet\n\n")
                    continue

                rows = [
                    [
                        self._xls_cell_to_text(sheet, row_idx, col_idx, workbook.datemode)
                        for col_idx in range(sheet.ncols)
                    ]
                    for row_idx in range(sheet.nrows)
                ]
                output.write(self._format_markdown_table(rows))
                output.write("\n")

        except Exception as e:
            logging.exception(f"Error processing .xls file with xlrd: {str(e)}")
            output.write("# Error processing .xls file\n\n")
            output.write(f"Details: {str(e)}\n\n")

        return output.getvalue()

    def convert_pptx_to_text(self, pptx_file, filename):
        """Convert a PowerPoint presentation to plain text.

        Args:
            pptx_file: Path of the presentation, passed to ``Presentation``.
            filename: Name shown in the document heading.

        Returns:
            A heading, then one "## Slide N" section per slide containing the
            text of every shape that exposes a ``text`` attribute.
        """
        output = io.StringIO()
        prs = Presentation(pptx_file)
        output.write(f"# PowerPoint Presentation: {filename}\n\n")
        for slide_num, slide in enumerate(prs.slides, 1):
            output.write(f"## Slide {slide_num}\n")
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    output.write(f"{shape.text}\n\n")
        return output.getvalue()

    def convert_pdf_to_text(self, pdf_file, filename):
        """Convert a PDF document to plain text.

        Args:
            pdf_file: Path of the PDF, passed to ``PyPDF2.PdfReader``.
            filename: Name shown in the document heading.

        Returns:
            A heading, then one "## Page N" section per page with its
            extracted text.
        """
        output = io.StringIO()
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        output.write(f"# PDF Document: {filename}\n\n")
        for page_num, page in enumerate(pdf_reader.pages, 1):
            output.write(f"## Page {page_num}\n")
            # Guard against a page yielding no text object at all.
            output.write((page.extract_text() or "") + "\n\n")
        return output.getvalue()

    def convert_docx_to_text(self, docx_file, filename):
        """Convert a Word document to plain text.

        Args:
            docx_file: Path of the document, passed to ``Document``.
            filename: Name shown in the document heading.

        Returns:
            A heading followed by the text of each paragraph, separated by
            blank lines.
        """
        output = io.StringIO()
        doc = Document(docx_file)
        output.write(f"# Word Document: {filename}\n\n")
        for para in doc.paragraphs:
            output.write(para.text + "\n\n")
        return output.getvalue()


def _describe_upload(file):
    """Return ``(source_path, display_name)`` for an uploaded object.

    ``source_path`` is ``None`` when the upload exposes no path and can only
    be consumed through ``read()``. ``display_name`` is always a bare file
    name without directory components.
    """
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
        return path, os.path.basename(path)
    if isinstance(file, tuple) and len(file) > 1:
        # (name, path) pair
        return str(file[1]), os.path.basename(str(file[0]))
    name = getattr(file, "name", None)
    if isinstance(name, str) and name:
        return (name if os.path.isfile(name) else None), os.path.basename(name)
    return None, "unknown_file"


def _copy_upload(file, source_path, destination):
    """Copy the uploaded bytes to *destination*.

    Returns ``True`` on success and ``False`` when the upload offers neither
    an existing path nor a ``read()`` method. I/O errors propagate.
    """
    if source_path is not None and os.path.isfile(source_path):
        # Prefer the on-disk file: it is complete regardless of the read
        # position or open/closed state of any wrapper object.
        shutil.copyfile(source_path, destination)
        return True
    if hasattr(file, "read"):
        data = file.read()
        if isinstance(data, str):
            data = data.encode("utf-8")
        with open(destination, "wb") as dst:
            dst.write(data)
        return True
    return False


def _convert_by_extension(path, file_name, file_ext):
    """Dispatch *path* to the converter matching *file_ext* and return the text."""
    if file_ext in (".xlsx", ".xls"):
        try:
            return MultiConverter().convert_excel_to_formatted_text(path)
        except Exception as e:
            logging.exception(f"Error during Excel conversion: {str(e)}")
            result = f"Error converting Excel file: {str(e)}\n\n"
            result += "This may be due to:\n"
            result += "- Unsupported Excel format (some .xls files require xlrd library)\n"
            result += "- Corrupted or password-protected file\n"
            result += "- Excel file with complex formatting or macros\n\n"
            result += "Try saving your Excel file as a simple .xlsx file before uploading."
            return result
    if file_ext in (".pptx", ".ppt"):
        return MultiConverter().convert_pptx_to_text(path, file_name)
    if file_ext == ".pdf":
        return MultiConverter().convert_pdf_to_text(path, file_name)
    if file_ext in (".docx", ".doc"):
        return MultiConverter().convert_docx_to_text(path, file_name)
    return f"Unsupported file format: {file_ext}"


def _build_download_link(text, output_filename):
    """Return an HTML anchor that downloads *text* as *output_filename* via a data URI."""
    b64 = base64.b64encode(text.encode('utf-8')).decode()
    # The name originates from the upload, so escape it before embedding in HTML.
    safe_name = html.escape(output_filename, quote=True)
    return f"""
            <a href="data:text/plain;base64,{b64}" download="{safe_name}" 
              style="display: inline-block; padding: 0.6em 1.2em; margin: 0.5em 0; 
              background-color: #4CAF50; color: white; border: none; border-radius: 4px; 
              cursor: pointer; text-decoration: none; font-weight: bold;">
              ⬇️ Download {safe_name}
            </a>
            """


def convert_file(file):
    """Process an uploaded file and convert it to text.

    Args:
        file: The upload. Accepted forms are a path (``str`` or path-like), a
            ``(name, path)`` tuple, or an object exposing ``name`` and/or
            ``read()``. ``None`` means nothing was uploaded.

    Returns:
        A ``(text, download_html)`` tuple. On any failure ``text`` carries the
        error message and ``download_html`` is an empty string; this function
        does not raise.
    """
    if file is None:
        return "No file uploaded. Please select a file first.", ""

    temp_dir = None
    try:
        source_path, file_name = _describe_upload(file)
        logging.info(f"Starting conversion for file: {file_name}")

        file_ext = os.path.splitext(file_name)[1].lower()

        # Keep the original extension on the temporary copy: the Excel
        # converter picks its backend (xlrd vs. openpyxl) from the path.
        temp_suffix = file_ext if file_ext[1:].isalnum() else ""
        temp_dir = tempfile.mkdtemp()
        temp_file_path = os.path.join(temp_dir, "uploaded_file" + temp_suffix)

        try:
            if not _copy_upload(file, source_path, temp_file_path):
                return f"Could not read file. Type: {type(file)}", ""
        except Exception as e:
            return f"Error reading file: {str(e)}", ""

        try:
            result = _convert_by_extension(temp_file_path, file_name, file_ext)
        except Exception as e:
            logging.exception(f"Error converting file: {str(e)}")
            message = f"Error converting file: {str(e)}"
            if file_ext in (".ppt", ".doc"):
                message += (
                    f"\n\nNote: legacy {file_ext} files are usually not readable by this converter. "
                    f"Try saving the file as {file_ext}x and uploading again."
                )
            return message, ""

        output_filename = os.path.splitext(file_name)[0] + ".txt"
        return result, _build_download_link(result, output_filename)

    except Exception as e:
        logging.exception(f"Unexpected error: {str(e)}")
        return f"Unexpected error: {str(e)}", ""
    finally:
        # Runs on every exit path, including the early returns above, so the
        # temporary directory is never left behind.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)


# Build the Gradio interface: a file upload, a convert button, a text box
# showing the converted text, and an HTML slot for the download link.
with gr.Blocks(title="Multi-Format to TXT Converter") as app:
    gr.Markdown("# Multi-Format to TXT Converter by Heuristica.pl")
    gr.Markdown("Convert Excel, PowerPoint, PDF, and Word files to text format.")
    
    with gr.Row():
        file_input = gr.File(label="Upload a file (Excel, PowerPoint, PDF, or Word)")
    
    with gr.Row():
        convert_button = gr.Button("Convert to TXT", variant="primary")
    
    with gr.Row():
        text_output = gr.Textbox(label="Converted Text", lines=15)
    
    with gr.Row():
        download_html = gr.HTML(label="Download")
    
    # Info about supported formats
    # NOTE(review): .ppt and .doc are routed to the same converters as
    # .pptx/.docx in convert_file; confirm that the pptx/docx libraries can
    # actually open the legacy formats before advertising them here.
    gr.Markdown("""
    ## Supported file formats:
    - **Excel**: .xlsx, .xls
    - **PowerPoint**: .pptx, .ppt
    - **PDF**: .pdf
    - **Word**: .docx, .doc
    
    ## How to use:
    1. Upload a file using the file upload button
    2. Click "Convert to TXT"
    3. View the converted text
    4. Click the download button to save the converted text file
    """)
    
    # Wire the button to convert_file: the uploaded file is the only input,
    # and the returned (text, html) pair fills the text box and download slot.
    convert_button.click(
        fn=convert_file,
        inputs=[file_input],
        outputs=[text_output, download_html]
    )

# Start the app when this file is executed as a script; launch failures are
# logged with a traceback instead of propagating.
if __name__ == "__main__":
    try:
        logging.info("Starting the application")
        app.launch(debug=True)
    except Exception as launch_error:
        logging.exception(f"Error launching application: {launch_error}")
    else:
        logging.info("Application stopped")