Spaces:
Sleeping
Sleeping
| """ | |
| File Converter Tool - Convert between different file formats | |
| """ | |
| import logging | |
| from typing import Dict, Any | |
| from pathlib import Path | |
| import sys | |
| import os | |
| # Add parent directory to path for imports | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| logger = logging.getLogger(__name__) | |
| def convert_file(input_path: str, output_format: str, output_path: str = None) -> Dict[str, Any]: | |
| """ | |
| Convert a file from one format to another. | |
| Supported conversions: | |
| - PDF to TXT | |
| - TXT to CSV (assumes structured text) | |
| - CSV to TXT | |
| - Any text-based format conversions | |
| Args: | |
| input_path: Path to input file | |
| output_format: Desired output format ('txt', 'csv', 'pdf') | |
| output_path: Optional output path; auto-generated if not provided | |
| Returns: | |
| Dictionary with conversion results | |
| """ | |
| try: | |
| input_file = Path(input_path) | |
| if not input_file.exists(): | |
| raise FileNotFoundError(f"Input file not found: {input_path}") | |
| # Determine input format | |
| input_format = input_file.suffix.lower().replace('.', '') | |
| # Generate output path if not provided | |
| if output_path is None: | |
| output_path = str(input_file.parent / f"{input_file.stem}.{output_format}") | |
| output_file = Path(output_path) | |
| # Perform conversion based on formats | |
| if input_format == 'pdf' and output_format == 'txt': | |
| success, message = _pdf_to_txt(input_path, output_path) | |
| elif input_format == 'txt' and output_format == 'csv': | |
| success, message = _txt_to_csv(input_path, output_path) | |
| elif input_format == 'csv' and output_format == 'txt': | |
| success, message = _csv_to_txt(input_path, output_path) | |
| elif input_format in ['txt', 'md', 'log'] and output_format in ['txt', 'md', 'log']: | |
| success, message = _text_to_text(input_path, output_path) | |
| else: | |
| raise ValueError(f"Conversion from {input_format} to {output_format} not supported") | |
| return { | |
| "output_path": str(output_file), | |
| "success": success, | |
| "message": message, | |
| "input_format": input_format, | |
| "output_format": output_format, | |
| "file_size_bytes": output_file.stat().st_size if output_file.exists() else 0 | |
| } | |
| except Exception as e: | |
| logger.error(f"Error converting file: {e}") | |
| raise | |
| def _pdf_to_txt(input_path: str, output_path: str) -> tuple: | |
| """Convert PDF to TXT""" | |
| try: | |
| from PyPDF2 import PdfReader | |
| reader = PdfReader(input_path) | |
| text_parts = [] | |
| for page in reader.pages: | |
| text = page.extract_text() | |
| if text: | |
| text_parts.append(text) | |
| full_text = "\n\n".join(text_parts) | |
| with open(output_path, 'w', encoding='utf-8') as f: | |
| f.write(full_text) | |
| return True, f"Successfully converted PDF to TXT ({len(reader.pages)} pages)" | |
| except Exception as e: | |
| logger.error(f"PDF to TXT conversion error: {e}") | |
| return False, str(e) | |
| def _txt_to_csv(input_path: str, output_path: str) -> tuple: | |
| """Convert TXT to CSV (assumes tab or comma separated values)""" | |
| try: | |
| import pandas as pd | |
| # Try to read as CSV with different delimiters | |
| try: | |
| df = pd.read_csv(input_path, sep='\t') | |
| except: | |
| try: | |
| df = pd.read_csv(input_path, sep=',') | |
| except: | |
| # If not structured, create simple CSV with one column | |
| with open(input_path, 'r', encoding='utf-8') as f: | |
| lines = f.readlines() | |
| df = pd.DataFrame({'text': [line.strip() for line in lines if line.strip()]}) | |
| df.to_csv(output_path, index=False) | |
| return True, f"Successfully converted TXT to CSV ({len(df)} rows)" | |
| except Exception as e: | |
| logger.error(f"TXT to CSV conversion error: {e}") | |
| return False, str(e) | |
| def _csv_to_txt(input_path: str, output_path: str) -> tuple: | |
| """Convert CSV to TXT""" | |
| try: | |
| import pandas as pd | |
| df = pd.read_csv(input_path) | |
| # Convert to formatted text | |
| text = df.to_string(index=False) | |
| with open(output_path, 'w', encoding='utf-8') as f: | |
| f.write(text) | |
| return True, f"Successfully converted CSV to TXT ({len(df)} rows)" | |
| except Exception as e: | |
| logger.error(f"CSV to TXT conversion error: {e}") | |
| return False, str(e) | |
| def _text_to_text(input_path: str, output_path: str) -> tuple: | |
| """Convert between text-based formats""" | |
| try: | |
| with open(input_path, 'r', encoding='utf-8') as f: | |
| content = f.read() | |
| with open(output_path, 'w', encoding='utf-8') as f: | |
| f.write(content) | |
| return True, "Successfully converted text file" | |
| except Exception as e: | |
| logger.error(f"Text to text conversion error: {e}") | |
| return False, str(e) | |
| def batch_convert(input_files: list, output_format: str) -> Dict[str, Any]: | |
| """ | |
| Convert multiple files to the same output format. | |
| Args: | |
| input_files: List of input file paths | |
| output_format: Desired output format for all files | |
| Returns: | |
| Dictionary with batch conversion results | |
| """ | |
| results = [] | |
| for input_file in input_files: | |
| try: | |
| result = convert_file(input_file, output_format) | |
| result["input_file"] = input_file | |
| results.append(result) | |
| except Exception as e: | |
| logger.error(f"Error converting {input_file}: {e}") | |
| results.append({ | |
| "input_file": input_file, | |
| "success": False, | |
| "message": str(e) | |
| }) | |
| successful = sum(1 for r in results if r.get("success", False)) | |
| return { | |
| "total_files": len(input_files), | |
| "successful": successful, | |
| "failed": len(input_files) - successful, | |
| "results": results | |
| } | |