# Converter_Multi / app.py
# Hosting-page header (not part of the program): uploaded by Marek4321,
# commit "Update app.py" (07b7d87, verified).
import base64
import html
import io
import logging
import os
import shutil
import tempfile
from datetime import datetime

import gradio as gr
import openpyxl
import PyPDF2
from docx import Document
from pptx import Presentation
# Configure logging BEFORE anything logs. The module-level logging.warning()
# below would otherwise install a default root handler on first use, after
# which this basicConfig() call would be silently ignored (no INFO level,
# no custom format).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Optional dependency used for legacy Excel (.xls) files
try:
    import xlrd
    XLRD_AVAILABLE = True
except ImportError:
    XLRD_AVAILABLE = False
    logging.warning("xlrd not available, .xls files may not be supported")
class MultiConverter:
    """Convert Excel, PowerPoint, PDF and Word files to text.

    Spreadsheets are rendered as Markdown-style tables (one per sheet);
    the other formats are dumped as plain text under Markdown headings.
    """

    @staticmethod
    def _cell_text(value):
        """Return the display text for a cell value.

        Only ``None`` maps to an empty string, so legitimate falsy values
        such as ``0`` or ``False`` are preserved.
        """
        return "" if value is None else str(value)

    @staticmethod
    def _format_markdown_table(rows):
        """Render rows of strings as a column-aligned Markdown table.

        The first row is the header and is emitted exactly once, followed
        by the separator line and then the remaining rows.

        Args:
            rows: List of rows, each a list of cell strings.

        Returns:
            The table text with one newline-terminated line per row, or an
            empty string when ``rows`` is empty.
        """
        if not rows:
            return ""

        column_count = max(len(row) for row in rows)
        widths = [0] * column_count
        for row in rows:
            for position, cell in enumerate(row):
                widths[position] = max(widths[position], len(cell))

        def render(row):
            # Pad short rows so every line has the same number of columns.
            cells = list(row) + [""] * (column_count - len(row))
            padded = (cell.ljust(widths[i]) for i, cell in enumerate(cells))
            return "| " + " | ".join(padded) + " |"

        lines = [
            render(rows[0]),
            "|" + "|".join("-" * (width + 2) for width in widths) + "|",
        ]
        lines.extend(render(row) for row in rows[1:])
        return "\n".join(lines) + "\n"

    @classmethod
    def _extract_sheet_rows(cls, sheet):
        """Return the bounding box of non-empty cells of an openpyxl sheet.

        Args:
            sheet: An openpyxl worksheet.

        Returns:
            A rectangular list of rows of strings covering the smallest
            range that contains every non-``None`` cell, or an empty list
            when the sheet holds no data.
        """
        grid = list(sheet.iter_rows(values_only=True))

        top = bottom = left = right = None
        for row_pos, row in enumerate(grid):
            filled = [col for col, value in enumerate(row) if value is not None]
            if not filled:
                continue
            if top is None:
                top = row_pos
                left, right = filled[0], filled[-1]
            else:
                left = min(left, filled[0])
                right = max(right, filled[-1])
            bottom = row_pos

        if top is None:
            return []
        return [
            [cls._cell_text(value) for value in row[left:right + 1]]
            for row in grid[top:bottom + 1]
        ]

    @staticmethod
    def _xls_cell_text(cell, datemode):
        """Return the display text for one xlrd cell.

        Date cells are formatted as ``YYYY-MM-DD HH:MM:SS``. Time-only
        cells (for which xlrd reports year, month and day as 0) are
        formatted as ``HH:MM:SS`` because they cannot be represented as a
        ``datetime``. Any other cell is ``str(value)`` with surrounding
        whitespace stripped. A cell that cannot be converted yields an
        empty string and a logged warning.
        """
        try:
            if cell.ctype == xlrd.XL_CELL_DATE:
                parts = xlrd.xldate_as_tuple(cell.value, datemode)
                if tuple(parts[:3]) == (0, 0, 0):
                    return "{:02d}:{:02d}:{:02d}".format(*parts[3:])
                return datetime(*parts).strftime("%Y-%m-%d %H:%M:%S")
            return str(cell.value).strip()
        except Exception as e:
            logging.warning(f"Error converting .xls cell value: {str(e)}")
            return ""

    def convert_excel_to_formatted_text(self, excel_file):
        """Convert an Excel workbook to Markdown-style text.

        Files whose path ends in ``.xls`` are handled by xlrd when it is
        available; everything else is opened with openpyxl.

        Args:
            excel_file: Path of the workbook on disk.

        Returns:
            The converted text. Failures are reported inside the returned
            text (under an ``# Error ...`` heading) rather than raised.
        """
        output = io.StringIO()
        file_ext = os.path.splitext(excel_file)[1].lower()

        try:
            if file_ext == '.xls' and XLRD_AVAILABLE:
                # Legacy binary format: delegate to xlrd
                logging.info("Processing old Excel format (.xls) with xlrd")
                return self._convert_xls_with_xlrd(excel_file, output)

            # Modern format: use openpyxl
            logging.info("Processing Excel format with openpyxl")
            try:
                workbook = openpyxl.load_workbook(excel_file, data_only=True)
            except Exception as e:
                logging.error(f"Error opening Excel file with openpyxl: {str(e)}")
                output.write("# Error opening Excel file\n\n")
                output.write(f"Details: {str(e)}\n\n")
                output.write("Possible reasons:\n")
                output.write("- The file may be in an older Excel format (.xls). Try saving it as .xlsx\n")
                output.write("- The file may be corrupted or password-protected\n")
                output.write("- The file may contain unsupported features\n\n")
                return output.getvalue()

            for idx, sheet_name in enumerate(workbook.sheetnames):
                if idx > 0:
                    output.write("\n" + "-" * 70 + "\n\n")
                output.write(f"### {sheet_name}:\n")

                sheet = workbook[sheet_name]
                # Chart sheets appear in sheetnames but have no cell grid.
                if hasattr(sheet, "iter_rows"):
                    table = self._extract_sheet_rows(sheet)
                else:
                    table = []

                if not table:
                    output.write("# No data in sheet\n\n")
                    continue

                output.write(self._format_markdown_table(table))
                output.write("\n")
        except Exception as e:
            logging.exception(f"Error processing Excel file: {str(e)}")
            output.write("# Error processing Excel file\n\n")
            output.write(f"Details: {str(e)}\n\n")

        return output.getvalue()

    def _convert_xls_with_xlrd(self, excel_file, output):
        """Convert a legacy Excel (.xls) workbook using xlrd.

        Args:
            excel_file: Path of the workbook on disk.
            output: ``io.StringIO`` buffer the text is appended to.

        Returns:
            The full contents of ``output`` after conversion. Failures are
            reported inside the text rather than raised.
        """
        if not XLRD_AVAILABLE:
            output.write("# Error: xlrd library not available to process .xls files\n\n")
            output.write("Please install xlrd with 'pip install xlrd' to process .xls files\n")
            return output.getvalue()

        try:
            workbook = xlrd.open_workbook(excel_file)

            for idx, sheet in enumerate(workbook.sheets()):
                if idx > 0:
                    output.write("\n" + "-" * 70 + "\n\n")
                output.write(f"### {sheet.name}:\n")

                if sheet.nrows <= 0 or sheet.ncols <= 0:
                    output.write("# No data in sheet\n\n")
                    continue

                table = [
                    [
                        self._xls_cell_text(sheet.cell(row_idx, col_idx), workbook.datemode)
                        for col_idx in range(sheet.ncols)
                    ]
                    for row_idx in range(sheet.nrows)
                ]
                output.write(self._format_markdown_table(table))
                output.write("\n")
        except Exception as e:
            logging.exception(f"Error processing .xls file with xlrd: {str(e)}")
            output.write("# Error processing .xls file\n\n")
            output.write(f"Details: {str(e)}\n\n")

        return output.getvalue()

    def convert_pptx_to_text(self, pptx_file, filename):
        """Convert a PowerPoint presentation to plain text.

        Args:
            pptx_file: Path (or file-like object) of the presentation.
            filename: Name shown in the document heading.

        Returns:
            Text with one ``## Slide N`` section per slide, containing the
            text of every shape that exposes a ``text`` attribute.
        """
        output = io.StringIO()
        prs = Presentation(pptx_file)
        output.write(f"# PowerPoint Presentation: {filename}\n\n")
        for slide_num, slide in enumerate(prs.slides, 1):
            output.write(f"## Slide {slide_num}\n")
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    output.write(f"{shape.text}\n\n")
        return output.getvalue()

    def convert_pdf_to_text(self, pdf_file, filename):
        """Convert a PDF document to plain text.

        Args:
            pdf_file: Path (or file-like object) of the PDF.
            filename: Name shown in the document heading.

        Returns:
            Text with one ``## Page N`` section per page.
        """
        output = io.StringIO()
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        output.write(f"# PDF Document: {filename}\n\n")
        for page_num, page in enumerate(pdf_reader.pages, 1):
            output.write(f"## Page {page_num}\n")
            # Guard against a page for which no text could be extracted.
            output.write((page.extract_text() or "") + "\n\n")
        return output.getvalue()

    def convert_docx_to_text(self, docx_file, filename):
        """Convert a Word document to plain text.

        Args:
            docx_file: Path (or file-like object) of the document.
            filename: Name shown in the document heading.

        Returns:
            Text containing every paragraph, separated by blank lines.
        """
        output = io.StringIO()
        doc = Document(docx_file)
        output.write(f"# Word Document: {filename}\n\n")
        for para in doc.paragraphs:
            output.write(para.text + "\n\n")
        return output.getvalue()
def _upload_name(file):
    """Return the best available name (possibly a full path) for an upload."""
    if isinstance(file, tuple) and len(file) > 1:
        return str(file[0])
    name = getattr(file, "name", None)
    if name:
        return str(name)
    if isinstance(file, str):
        return file
    return "unknown_file"


def _save_upload(file, destination):
    """Copy the uploaded content to ``destination``.

    Accepts an object with ``read()``, a path string, or a
    ``(name, path)`` tuple; as a last resort ``str(file)`` is tried as a
    path.

    Returns:
        True on success, False when the object could not be interpreted
        as readable content. Other I/O errors propagate to the caller.
    """
    if hasattr(file, 'read'):
        with open(destination, 'wb') as dst:
            dst.write(file.read())
        return True
    if isinstance(file, str) and os.path.exists(file):
        shutil.copyfile(file, destination)
        return True
    if isinstance(file, tuple) and len(file) > 1 and os.path.exists(file[1]):
        shutil.copyfile(file[1], destination)
        return True
    # Last chance: treat the object's string form as a path
    try:
        shutil.copyfile(str(file), destination)
    except (OSError, ValueError):
        return False
    return True


def _convert_by_extension(path, display_name, file_ext):
    """Dispatch to the converter matching ``file_ext`` and return its text."""
    if file_ext in (".xlsx", ".xls"):
        try:
            return MultiConverter().convert_excel_to_formatted_text(path)
        except Exception as e:
            logging.exception(f"Error during Excel conversion: {str(e)}")
            return (
                f"Error converting Excel file: {str(e)}\n\n"
                "This may be due to:\n"
                "- Unsupported Excel format (some .xls files require xlrd library)\n"
                "- Corrupted or password-protected file\n"
                "- Excel file with complex formatting or macros\n\n"
                "Try saving your Excel file as a simple .xlsx file before uploading."
            )
    if file_ext in (".pptx", ".ppt"):
        return MultiConverter().convert_pptx_to_text(path, display_name)
    if file_ext == ".pdf":
        return MultiConverter().convert_pdf_to_text(path, display_name)
    if file_ext in (".docx", ".doc"):
        return MultiConverter().convert_docx_to_text(path, display_name)
    return f"Unsupported file format: {file_ext}"


def _build_download_link(text, download_name):
    """Return an HTML anchor that downloads ``text`` as ``download_name``.

    The content is embedded as a base64 ``data:`` URI. The file name comes
    from the upload and is therefore HTML-escaped before interpolation.
    """
    encoded = base64.b64encode(text.encode('utf-8')).decode()
    safe_name = html.escape(download_name, quote=True)
    return (
        "\n"
        f'<a href="data:text/plain;base64,{encoded}" download="{safe_name}"\n'
        'style="display: inline-block; padding: 0.6em 1.2em; margin: 0.5em 0;\n'
        'background-color: #4CAF50; color: white; border: none; border-radius: 4px;\n'
        'cursor: pointer; text-decoration: none; font-weight: bold;">\n'
        # U+2B07 U+FE0F is the down-arrow emoji.
        f"\u2b07\ufe0f Download {safe_name}\n"
        "</a>\n"
    )


def convert_file(file):
    """Process an uploaded file and convert it to text.

    Args:
        file: The upload as delivered by the UI: an object with ``read()``
            and/or ``name``, a path string, or a ``(name, path)`` tuple.

    Returns:
        A ``(text, download_html)`` tuple. On any failure ``text`` holds an
        error message and ``download_html`` is an empty string.
    """
    if file is None:
        return "No file uploaded. Please select a file first.", ""

    temp_dir = None
    try:
        # Only the base name is shown to the user; the upload's full
        # server-side path must not leak into headings or the download name.
        file_name = os.path.basename(_upload_name(file)) or "unknown_file"
        logging.info(f"Starting conversion for file: {file_name}")
        file_ext = os.path.splitext(file_name)[1].lower()

        # Keep the extension on the temporary copy: the Excel converter
        # picks its backend (.xls -> xlrd) from the path it is given.
        temp_dir = tempfile.mkdtemp()
        temp_file_path = os.path.join(temp_dir, "uploaded_file" + file_ext)

        try:
            if not _save_upload(file, temp_file_path):
                return f"Could not read file. Type: {type(file)}", ""
        except Exception as e:
            return f"Error reading file: {str(e)}", ""

        try:
            result = _convert_by_extension(temp_file_path, file_name, file_ext)
            output_filename = os.path.splitext(file_name)[0] + ".txt"
            return result, _build_download_link(result, output_filename)
        except Exception as e:
            logging.exception(f"Error converting file: {str(e)}")
            return f"Error converting file: {str(e)}", ""
    except Exception as e:
        logging.exception(f"Unexpected error: {str(e)}")
        return f"Unexpected error: {str(e)}", ""
    finally:
        # Remove the temporary copy on every exit path, including the
        # early error returns above.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
# Build the Gradio interface: upload -> convert button -> text + download link
with gr.Blocks(title="Multi-Format to TXT Converter") as app:
    gr.Markdown("# Multi-Format to TXT Converter by Heuristica.pl")
    gr.Markdown("Convert Excel, PowerPoint, PDF, and Word files to text format.")

    with gr.Row():
        file_input = gr.File(label="Upload a file (Excel, PowerPoint, PDF, or Word)")

    with gr.Row():
        convert_button = gr.Button("Convert to TXT", variant="primary")

    with gr.Row():
        text_output = gr.Textbox(label="Converted Text", lines=15)

    with gr.Row():
        download_html = gr.HTML(label="Download")

    # Static help text: supported formats and usage steps
    gr.Markdown("\n".join([
        "",
        "## Supported file formats:",
        "- **Excel**: .xlsx, .xls",
        "- **PowerPoint**: .pptx, .ppt",
        "- **PDF**: .pdf",
        "- **Word**: .docx, .doc",
        "## How to use:",
        "1. Upload a file using the file upload button",
        "2. Click \"Convert to TXT\"",
        "3. View the converted text",
        "4. Click the download button to save the converted text file",
        "",
    ]))

    # Wire the button to the converter: one input, two outputs
    convert_button.click(
        convert_file,
        [file_input],
        [text_output, download_html],
    )
# Launch the application
def _launch_app():
    """Start the Gradio app, logging start, stop and any launch failure."""
    try:
        logging.info("Starting the application")
        app.launch(debug=True)
        logging.info("Application stopped")
    except Exception as e:
        logging.exception(f"Error launching application: {str(e)}")


if __name__ == "__main__":
    _launch_app()