File size: 6,574 Bytes
f1b19d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
"""

File Converter Tool - Convert between different file formats

"""
import logging
from typing import Dict, Any
from pathlib import Path
import sys
import os

# Put the directory one level above this file's own directory at the front of
# sys.path so that modules located there are found first on import.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


def convert_file(input_path: str, output_format: str, output_path: str = None) -> Dict[str, Any]:
    """
    Convert a file from one format to another.

    Supported conversions:
    - PDF to TXT
    - TXT to CSV (assumes structured text)
    - CSV to TXT
    - Between the plain-text formats TXT, MD and LOG

    Args:
        input_path: Path to input file. Its extension (case-insensitive)
            determines the input format.
        output_format: Desired output format ('txt', 'csv', 'md', 'log').
            Case, surrounding whitespace and a leading dot are ignored, so
            'TXT' and '.txt' both mean 'txt'.
        output_path: Optional output path; when omitted the output is written
            next to the input file as "<input stem>.<output_format>".

    Returns:
        Dictionary with the keys "output_path", "success", "message",
        "input_format", "output_format" (normalized) and "file_size_bytes"
        (0 when the output file does not exist after the conversion).

    Raises:
        FileNotFoundError: If input_path does not exist.
        ValueError: If the input/output format pair is not supported.
    """
    try:
        input_file = Path(input_path)

        if not input_file.exists():
            raise FileNotFoundError(f"Input file not found: {input_path}")

        # Determine input format from the extension
        input_format = input_file.suffix.lower().lstrip('.')

        # Normalize the requested format so 'TXT', '.txt' and ' txt ' all
        # select the same conversion and produce the same file extension.
        output_format = output_format.strip().lower().lstrip('.')

        # Generate output path if not provided
        if output_path is None:
            output_path = str(input_file.parent / f"{input_file.stem}.{output_format}")

        output_file = Path(output_path)
        text_formats = ('txt', 'md', 'log')

        # Perform conversion based on formats. The specific pairs are checked
        # before the generic text-to-text copy.
        if input_format == 'pdf' and output_format == 'txt':
            success, message = _pdf_to_txt(input_path, output_path)

        elif input_format == 'txt' and output_format == 'csv':
            success, message = _txt_to_csv(input_path, output_path)

        elif input_format == 'csv' and output_format == 'txt':
            success, message = _csv_to_txt(input_path, output_path)

        elif input_format in text_formats and output_format in text_formats:
            success, message = _text_to_text(input_path, output_path)

        else:
            raise ValueError(f"Conversion from {input_format} to {output_format} not supported")

        return {
            "output_path": str(output_file),
            "success": success,
            "message": message,
            "input_format": input_format,
            "output_format": output_format,
            # A failed conversion may leave no output file behind.
            "file_size_bytes": output_file.stat().st_size if output_file.exists() else 0
        }

    except Exception as e:
        # Log for diagnostics, then let the caller decide how to handle it.
        logger.error(f"Error converting file: {e}")
        raise


def _pdf_to_txt(input_path: str, output_path: str) -> tuple:
    """Extract the text of a PDF page by page and save it as a UTF-8 text file.

    Pages for which extract_text() returns nothing are left out; the remaining
    page texts are joined with one blank line between them.

    Returns:
        (True, message) on success, where the message reports the total number
        of pages in the PDF; (False, error text) when any exception occurred
        (the error is also logged).
    """
    try:
        from PyPDF2 import PdfReader

        pdf = PdfReader(input_path)

        # Pull the text of every page first, then drop the empty results.
        page_texts = [page.extract_text() for page in pdf.pages]
        document_text = "\n\n".join(filter(None, page_texts))

        Path(output_path).write_text(document_text, encoding='utf-8')

        page_count = len(pdf.pages)
        return True, f"Successfully converted PDF to TXT ({page_count} pages)"

    except Exception as err:
        logger.error(f"PDF to TXT conversion error: {err}")
        return False, str(err)


def _txt_to_csv(input_path: str, output_path: str) -> tuple:
    """Convert TXT to CSV (assumes tab or comma separated values)"""
    try:
        import pandas as pd
        
        # Try to read as CSV with different delimiters
        try:
            df = pd.read_csv(input_path, sep='\t')
        except:
            try:
                df = pd.read_csv(input_path, sep=',')
            except:
                # If not structured, create simple CSV with one column
                with open(input_path, 'r', encoding='utf-8') as f:
                    lines = f.readlines()
                
                df = pd.DataFrame({'text': [line.strip() for line in lines if line.strip()]})
        
        df.to_csv(output_path, index=False)
        
        return True, f"Successfully converted TXT to CSV ({len(df)} rows)"
        
    except Exception as e:
        logger.error(f"TXT to CSV conversion error: {e}")
        return False, str(e)


def _csv_to_txt(input_path: str, output_path: str) -> tuple:
    """Convert CSV to TXT"""
    try:
        import pandas as pd
        
        df = pd.read_csv(input_path)
        
        # Convert to formatted text
        text = df.to_string(index=False)
        
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)
        
        return True, f"Successfully converted CSV to TXT ({len(df)} rows)"
        
    except Exception as e:
        logger.error(f"CSV to TXT conversion error: {e}")
        return False, str(e)


def _text_to_text(input_path: str, output_path: str) -> tuple:
    """Convert between text-based formats"""
    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            content = f.read()
        
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(content)
        
        return True, "Successfully converted text file"
        
    except Exception as e:
        logger.error(f"Text to text conversion error: {e}")
        return False, str(e)


def batch_convert(input_files: list, output_format: str) -> Dict[str, Any]:
    """

    Convert multiple files to the same output format.

    

    Args:

        input_files: List of input file paths

        output_format: Desired output format for all files

        

    Returns:

        Dictionary with batch conversion results

    """
    results = []
    
    for input_file in input_files:
        try:
            result = convert_file(input_file, output_format)
            result["input_file"] = input_file
            results.append(result)
        except Exception as e:
            logger.error(f"Error converting {input_file}: {e}")
            results.append({
                "input_file": input_file,
                "success": False,
                "message": str(e)
            })
    
    successful = sum(1 for r in results if r.get("success", False))
    
    return {
        "total_files": len(input_files),
        "successful": successful,
        "failed": len(input_files) - successful,
        "results": results
    }