Spaces:

Marek4321
/

Converter_Multi

Sleeping

App Files Files Community

Converter_Multi / app.py

Marek4321

Update app.py

07b7d87 verified 11 months ago

raw

history blame contribute delete

16.5 kB

	import base64
	import html
	import io
	import logging
	import os
	import shutil
	import tempfile
	from datetime import datetime

	import gradio as gr
	import openpyxl
	import PyPDF2
	from docx import Document
	from pptx import Presentation

	# Configure logging BEFORE anything logs: the module-level logging.warning()
	# below would otherwise install a default root handler, after which
	# basicConfig() silently does nothing and the level/format here are lost.
	logging.basicConfig(
	    level=logging.INFO,
	    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
	)

	# Optional dependency used to read legacy Excel files (.xls).
	try:
	    import xlrd
	    XLRD_AVAILABLE = True
	except ImportError:
	    XLRD_AVAILABLE = False
	    logging.warning("xlrd not available, .xls files may not be supported")

	class MultiConverter:
	    """Convert Excel, PowerPoint, PDF and Word files to Markdown-style text."""

	    @staticmethod
	    def _write_markdown_table(output, rows):
	        """Write *rows* (lists of strings) to *output* as a padded Markdown table.

	        The first row is emitted once as the header, followed by a dashed
	        separator line; every further row is emitted as a data row. Cells are
	        right-padded with spaces to the widest cell of their column. A blank
	        line terminates the table.
	        """
	        n_cols = max((len(row) for row in rows), default=0)
	        widths = [0] * n_cols
	        for row in rows:
	            for i, cell in enumerate(row):
	                widths[i] = max(widths[i], len(cell))

	        def render(row):
	            # Pad short rows so every line has the same number of columns.
	            cells = list(row) + [""] * (n_cols - len(row))
	            return "| " + " | ".join(
	                cell.ljust(width) for cell, width in zip(cells, widths)
	            ) + " |"

	        for row_idx, row in enumerate(rows):
	            output.write(render(row) + "\n")
	            if row_idx == 0:
	                separator = "|" + "|".join("-" * (width + 2) for width in widths) + "|"
	                output.write(separator + "\n")
	        output.write("\n")

	    @staticmethod
	    def _collect_sheet_rows(sheet):
	        """Return the used area of an openpyxl worksheet as rows of strings.

	        The used area is the bounding box of all cells whose value is not
	        ``None``. Empty cells inside the box become ``""``; values such as
	        ``0`` or ``False`` are kept (they are data, not blanks). Returns an
	        empty list when the sheet holds no values at all.
	        """
	        grid = [list(row) for row in sheet.iter_rows(values_only=True)]
	        filled_rows = [
	            r for r, row in enumerate(grid)
	            if any(value is not None for value in row)
	        ]
	        if not filled_rows:
	            return []
	        filled_cols = [
	            c for row in grid for c, value in enumerate(row) if value is not None
	        ]
	        top, bottom = filled_rows[0], filled_rows[-1]
	        left, right = min(filled_cols), max(filled_cols)
	        return [
	            ["" if value is None else str(value) for value in row[left:right + 1]]
	            for row in grid[top:bottom + 1]
	        ]

	    def convert_excel_to_formatted_text(self, excel_file):
	        """Convert an Excel workbook to Markdown-style text.

	        Args:
	            excel_file: Path of the workbook. Files ending in ``.xls`` are read
	                with xlrd when it is available; everything else is read with
	                openpyxl.

	        Returns:
	            The rendered text: one ``### <sheet name>:`` section per sheet,
	            separated by a dashed rule. Errors are not raised; they are
	            reported inside the returned text.
	        """
	        output = io.StringIO()
	        file_ext = os.path.splitext(excel_file)[1].lower()

	        try:
	            if file_ext == '.xls' and XLRD_AVAILABLE:
	                # Legacy binary format: delegate to xlrd.
	                logging.info("Processing old Excel format (.xls) with xlrd")
	                return self._convert_xls_with_xlrd(excel_file, output)

	            logging.info("Processing Excel format with openpyxl")
	            try:
	                workbook = openpyxl.load_workbook(excel_file, data_only=True)
	            except Exception as e:
	                logging.error(f"Error opening Excel file with openpyxl: {str(e)}")
	                output.write("# Error opening Excel file\n\n")
	                output.write(f"Details: {str(e)}\n\n")
	                output.write("Possible reasons:\n")
	                output.write("- The file may be in an older Excel format (.xls). Try saving it as .xlsx\n")
	                output.write("- The file may be corrupted or password-protected\n")
	                output.write("- The file may contain unsupported features\n\n")
	                return output.getvalue()

	            for idx, sheet_name in enumerate(workbook.sheetnames):
	                if idx > 0:
	                    output.write("\n" + "-" * 70 + "\n\n")
	                output.write(f"### {sheet_name}:\n")

	                data = self._collect_sheet_rows(workbook[sheet_name])
	                if not data:
	                    output.write("# No data in sheet\n\n")
	                    continue
	                self._write_markdown_table(output, data)
	        except Exception as e:
	            logging.exception(f"Error processing Excel file: {str(e)}")
	            output.write("# Error processing Excel file\n\n")
	            output.write(f"Details: {str(e)}\n\n")

	        return output.getvalue()

	    def _convert_xls_with_xlrd(self, excel_file, output):
	        """Convert a legacy ``.xls`` workbook using xlrd.

	        Args:
	            excel_file: Path of the ``.xls`` file.
	            output: ``io.StringIO`` buffer the text is appended to.

	        Returns:
	            The full content of *output* after conversion. Errors are written
	            into the text instead of being raised.
	        """
	        if not XLRD_AVAILABLE:
	            output.write("# Error: xlrd library not available to process .xls files\n\n")
	            output.write("Please install xlrd with 'pip install xlrd' to process .xls files\n")
	            return output.getvalue()

	        try:
	            workbook = xlrd.open_workbook(excel_file)

	            for idx, sheet in enumerate(workbook.sheets()):
	                if idx > 0:
	                    output.write("\n" + "-" * 70 + "\n\n")

	                output.write(f"### {sheet.name}:\n")

	                if sheet.nrows <= 0 or sheet.ncols <= 0:
	                    output.write("# No data in sheet\n\n")
	                    continue

	                data = []
	                for row_idx in range(sheet.nrows):
	                    row_data = []
	                    for col_idx in range(sheet.ncols):
	                        try:
	                            cell = sheet.cell(row_idx, col_idx)
	                            if cell.ctype == xlrd.XL_CELL_DATE:
	                                # Render Excel serial dates in a readable form.
	                                date_tuple = xlrd.xldate_as_tuple(cell.value, workbook.datemode)
	                                value = datetime(*date_tuple).strftime("%Y-%m-%d %H:%M:%S")
	                            else:
	                                value = str(cell.value).strip()
	                        except Exception:
	                            # Best effort: an unreadable cell becomes blank.
	                            value = ""
	                        row_data.append(value)
	                    data.append(row_data)

	                self._write_markdown_table(output, data)

	        except Exception as e:
	            logging.exception(f"Error processing .xls file with xlrd: {str(e)}")
	            output.write("# Error processing .xls file\n\n")
	            output.write(f"Details: {str(e)}\n\n")

	        return output.getvalue()

	    def convert_pptx_to_text(self, pptx_file, filename):
	        """Convert a PowerPoint presentation to plain text.

	        Args:
	            pptx_file: Path or file-like object handed to ``Presentation``.
	            filename: Name shown in the document heading.

	        Returns:
	            Text with one ``## Slide N`` section per slide, containing the
	            text of every shape that exposes a ``text`` attribute.
	        """
	        output = io.StringIO()
	        prs = Presentation(pptx_file)
	        output.write(f"# PowerPoint Presentation: {filename}\n\n")
	        for slide_num, slide in enumerate(prs.slides, 1):
	            output.write(f"## Slide {slide_num}\n")
	            for shape in slide.shapes:
	                if hasattr(shape, "text"):
	                    output.write(f"{shape.text}\n\n")
	        return output.getvalue()

	    def convert_pdf_to_text(self, pdf_file, filename):
	        """Convert a PDF document to plain text.

	        Args:
	            pdf_file: Path or file-like object handed to ``PyPDF2.PdfReader``.
	            filename: Name shown in the document heading.

	        Returns:
	            Text with one ``## Page N`` section per page.
	        """
	        output = io.StringIO()
	        pdf_reader = PyPDF2.PdfReader(pdf_file)
	        output.write(f"# PDF Document: {filename}\n\n")
	        for page_num, page in enumerate(pdf_reader.pages, 1):
	            output.write(f"## Page {page_num}\n")
	            # Guard against a page that yields no text object at all.
	            output.write((page.extract_text() or "") + "\n\n")
	        return output.getvalue()

	    def convert_docx_to_text(self, docx_file, filename):
	        """Convert a Word document to plain text.

	        Args:
	            docx_file: Path or file-like object handed to ``Document``.
	            filename: Name shown in the document heading.

	        Returns:
	            Text with every paragraph followed by a blank line.
	        """
	        output = io.StringIO()
	        doc = Document(docx_file)
	        output.write(f"# Word Document: {filename}\n\n")
	        for para in doc.paragraphs:
	            output.write(para.text + "\n\n")
	        return output.getvalue()


	def convert_file(file):
	    """Convert an uploaded document to text and build an HTML download link.

	    Args:
	        file: The uploaded file. Accepted shapes are a file-like object with
	            ``read()`` (its ``name`` attribute, if any, supplies the file
	            name), a filesystem path string, or a ``(name, path)`` tuple.

	    Returns:
	        A ``(text, download_html)`` tuple. ``text`` is the converted content
	        or an error message; ``download_html`` is an ``<a>`` element embedding
	        the text as a base64 data URI, or ``""`` when the upload could not be
	        read or converted.
	    """
	    if file is None:
	        return "No file uploaded. Please select a file first.", ""

	    try:
	        logging.info(f"Starting conversion for file: {file.name if hasattr(file, 'name') else 'unknown'}")

	        # Work out the original file name; it decides which converter runs.
	        if isinstance(file, tuple) and len(file) > 1:
	            file_name = str(file[0])
	        elif hasattr(file, 'name'):
	            file_name = str(file.name)
	        elif isinstance(file, str):
	            file_name = file
	        else:
	            file_name = "unknown_file"

	        # Only the base name is shown to the user or used for the download,
	        # never the server-side upload path.
	        display_name = os.path.basename(file_name) or "unknown_file"
	        file_ext = os.path.splitext(display_name)[1].lower()

	        temp_dir = tempfile.mkdtemp()
	        try:
	            # Keep the extension on the working copy: the Excel readers pick
	            # their code path (and validate the file) by extension.
	            temp_file_path = os.path.join(temp_dir, "uploaded_file" + file_ext)

	            try:
	                if hasattr(file, 'read'):
	                    with open(temp_file_path, 'wb') as dst:
	                        shutil.copyfileobj(file, dst)
	                elif isinstance(file, str) and os.path.exists(file):
	                    shutil.copyfile(file, temp_file_path)
	                elif isinstance(file, tuple) and len(file) > 1 and os.path.exists(file[1]):
	                    shutil.copyfile(file[1], temp_file_path)
	                else:
	                    # Last resort: treat the value as a path.
	                    try:
	                        shutil.copyfile(str(file), temp_file_path)
	                    except OSError:
	                        return f"Could not read file. Type: {type(file)}", ""
	            except Exception as e:
	                return f"Error reading file: {str(e)}", ""

	            try:
	                if file_ext in (".xlsx", ".xls"):
	                    try:
	                        result = MultiConverter().convert_excel_to_formatted_text(temp_file_path)
	                    except Exception as e:
	                        logging.exception(f"Error during Excel conversion: {str(e)}")
	                        result = f"Error converting Excel file: {str(e)}\n\n"
	                        result += "This may be due to:\n"
	                        result += "- Unsupported Excel format (some .xls files require xlrd library)\n"
	                        result += "- Corrupted or password-protected file\n"
	                        result += "- Excel file with complex formatting or macros\n\n"
	                        result += "Try saving your Excel file as a simple .xlsx file before uploading."
	                elif file_ext in (".pptx", ".ppt"):
	                    result = MultiConverter().convert_pptx_to_text(temp_file_path, display_name)
	                elif file_ext == ".pdf":
	                    result = MultiConverter().convert_pdf_to_text(temp_file_path, display_name)
	                elif file_ext in (".docx", ".doc"):
	                    result = MultiConverter().convert_docx_to_text(temp_file_path, display_name)
	                else:
	                    result = f"Unsupported file format: {file_ext}"

	                output_filename = os.path.splitext(display_name)[0] + ".txt"
	                # The name comes from the uploader: escape it before it is
	                # placed inside an HTML attribute and element body.
	                safe_name = html.escape(output_filename, quote=True)

	                content_bytes = result.encode('utf-8')
	                b64 = base64.b64encode(content_bytes).decode()

	                download_link = f"""
	<a href="data:text/plain;base64,{b64}" download="{safe_name}"
	style="display: inline-block; padding: 0.6em 1.2em; margin: 0.5em 0;
	background-color: #4CAF50; color: white; border: none; border-radius: 4px;
	cursor: pointer; text-decoration: none; font-weight: bold;">
	⬇️ Download {safe_name}
	</a>
	"""

	                return result, download_link
	            except Exception as e:
	                logging.exception(f"Error converting file: {str(e)}")
	                return f"Error converting file: {str(e)}", ""
	        finally:
	            # Best-effort cleanup on every exit path, including early returns.
	            shutil.rmtree(temp_dir, ignore_errors=True)

	    except Exception as e:
	        logging.exception(f"Unexpected error: {str(e)}")
	        return f"Unexpected error: {str(e)}", ""


	# Build the Gradio interface. Components are laid out in the order they are
	# created inside the Blocks context, so statement order here is the layout.
	with gr.Blocks(title="Multi-Format to TXT Converter") as app:
	    gr.Markdown("# Multi-Format to TXT Converter by Heuristica.pl")
	    gr.Markdown("Convert Excel, PowerPoint, PDF, and Word files to text format.")

	    # Upload widget; its value is the single input of convert_file.
	    with gr.Row():
	        file_input = gr.File(label="Upload a file (Excel, PowerPoint, PDF, or Word)")

	    with gr.Row():
	        convert_button = gr.Button("Convert to TXT", variant="primary")

	    # First output of convert_file: the converted text.
	    with gr.Row():
	        text_output = gr.Textbox(label="Converted Text", lines=15)

	    # Second output of convert_file: the HTML download link.
	    with gr.Row():
	        download_html = gr.HTML(label="Download")

	    # Info about supported formats
	    gr.Markdown("""
	## Supported file formats:
	- Excel: .xlsx, .xls
	- PowerPoint: .pptx, .ppt
	- PDF: .pdf
	- Word: .docx, .doc

	## How to use:
	1. Upload a file using the file upload button
	2. Click "Convert to TXT"
	3. View the converted text
	4. Click the download button to save the converted text file
	""")

	    # Wire the button: run convert_file on the uploaded file and route its
	    # two return values to the text box and the download area.
	    convert_button.click(
	        fn=convert_file,
	        inputs=[file_input],
	        outputs=[text_output, download_html]
	    )

	# Launch the application when the module is run as a script.
	if __name__ == "__main__":
	    try:
	        logging.info("Starting the application")
	        app.launch(debug=True)
	        logging.info("Application stopped")
	    except Exception as e:
	        # Log launch failures with the traceback instead of crashing silently.
	        logging.exception(f"Error launching application: {str(e)}")