File size: 18,297 Bytes
a469ee1
 
 
 
 
 
 
 
de80732
a469ee1
 
 
 
 
 
 
de80732
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a469ee1
 
 
 
 
 
 
de80732
 
 
 
 
 
 
a469ee1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de80732
a469ee1
 
de80732
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a469ee1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de80732
 
 
 
 
 
 
 
 
 
 
 
a469ee1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
de80732
 
 
 
 
 
 
 
 
 
 
 
 
a469ee1
 
 
 
 
 
de80732
a469ee1
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
import os
import tempfile
import uuid
from werkzeug.utils import secure_filename
from converter import convert_docx_to_latex
import shutil
import stat

# Flask application object; every route in this module is registered on it.
app = Flask(__name__)
CORS(app)  # Enable CORS for all routes

# Configuration: upper bound on the size of an incoming request body.
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024  # 16MB max file size

# Use system temp directory for better compatibility with Hugging Face Spaces.
# A fresh, uniquely named base directory is created each time this module is
# loaded; uploads and conversion outputs live in separate subdirectories of it.
TEMP_BASE_DIR = tempfile.mkdtemp(prefix='docx_converter_')
UPLOAD_FOLDER = os.path.join(TEMP_BASE_DIR, 'uploads')
OUTPUT_FOLDER = os.path.join(TEMP_BASE_DIR, 'outputs')

# Ensure directories exist with proper permissions
def create_temp_dirs():
    """Make sure the upload and output folders exist and are fully accessible.

    Each folder is created when missing and then given read/write/execute
    permission for user, group and others. A failing ``chmod`` is ignored on
    purpose because some systems do not allow changing permissions.
    """
    open_mode = stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO
    for folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
        os.makedirs(folder, exist_ok=True)
        try:
            os.chmod(folder, open_mode)
        except OSError:
            # Best effort only: carry on with the next folder.
            continue

# Create the upload/output directories on startup (runs when the module loads)
create_temp_dirs()

# In-memory registry of conversion tasks, keyed by task ID (a UUID string).
# Each value is a dict with the task's status, filenames and file paths.
# The registry lives only in this process and is not persisted anywhere.
conversion_tasks = {}

@app.route('/api/health', methods=['GET'])
def health_check():
    """Report that the API is running, along with the working directories in use."""
    payload = dict(
        status='healthy',
        message='DOCX to LaTeX API is running',
        temp_dir=TEMP_BASE_DIR,
        upload_dir=UPLOAD_FOLDER,
        output_dir=OUTPUT_FOLDER,
    )
    return jsonify(payload)

@app.route('/api/upload', methods=['POST'])
def upload_file():
    """Accept a DOCX upload and register a new conversion task.

    Expects a multipart form with the document in the ``file`` field. The file
    is stored under ``UPLOAD_FOLDER`` with a unique, task-specific name and a
    task entry is added to ``conversion_tasks`` with status ``'uploaded'``.

    Returns:
        JSON with ``task_id``, ``filename``, ``status`` and ``message`` on
        success. On failure, JSON with an ``error`` key and status 400
        (missing or non-DOCX file), 413 (request body over the configured
        size limit) or 500 (any other error).
    """
    # Local import: keeps this handler self-contained without touching the
    # module-level import block (werkzeug is already a dependency of the file).
    from werkzeug.exceptions import RequestEntityTooLarge

    try:
        if 'file' not in request.files:
            return jsonify({'error': 'No file provided'}), 400

        file = request.files['file']
        # ``filename`` may be empty or None when the part carries no name.
        if not file.filename:
            return jsonify({'error': 'No file selected'}), 400

        if not file.filename.lower().endswith('.docx'):
            return jsonify({'error': 'Only DOCX files are allowed'}), 400

        # Generate unique task ID
        task_id = str(uuid.uuid4())

        filename = secure_filename(file.filename)
        # secure_filename() drops non-ASCII characters and leading dots, so a
        # name such as "日本語.docx" collapses to "docx". Fall back to a
        # generic name whenever the sanitised result lost its extension.
        if not filename.lower().endswith('.docx'):
            filename = 'document.docx'

        # Create a uniquely named temporary file inside the upload folder
        temp_fd, temp_path = tempfile.mkstemp(
            suffix=f'_{filename}',
            prefix=f'{task_id}_',
            dir=UPLOAD_FOLDER
        )

        try:
            # mkstemp() returns an open descriptor; close it and let
            # FileStorage.save() write to the path instead.
            os.close(temp_fd)
            file.save(temp_path)

            try:
                os.chmod(temp_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)
            except OSError:
                # If chmod fails, continue anyway
                pass

            conversion_tasks[task_id] = {
                'status': 'uploaded',
                'original_filename': filename,
                'file_path': temp_path,
                # Swap only the real extension, whatever its letter case;
                # str.replace() was case-sensitive and hit every occurrence.
                'output_filename': os.path.splitext(filename)[0] + '.tex',
                'created_at': os.path.getctime(temp_path)
            }
        except Exception:
            # Clean up the temp file if something goes wrong, then let the
            # outer handler report the original error.
            try:
                os.unlink(temp_path)
            except OSError:
                pass
            raise

        return jsonify({
            'task_id': task_id,
            'filename': filename,
            'status': 'uploaded',
            'message': 'File uploaded successfully'
        })

    except RequestEntityTooLarge:
        # Raised while parsing the form when the body exceeds
        # MAX_CONTENT_LENGTH; report it as a client error, not a 500.
        return jsonify({'error': 'File is too large'}), 413
    except Exception as e:
        return jsonify({'error': f'Upload failed: {str(e)}'}), 500

def _discard_conversion_outputs(output_path, media_path):
    """Best-effort removal of the output file and media directory of a failed run."""
    if output_path:
        try:
            os.remove(output_path)
        except OSError:
            pass
    if media_path:
        shutil.rmtree(media_path, ignore_errors=True)


@app.route('/api/convert', methods=['POST'])
def convert_document():
    """Convert a previously uploaded DOCX file to LaTeX.

    Expects a JSON object with:
        task_id: ID of a task whose status is ``'uploaded'`` (required).
        options: optional dict of converter flags (``generateToc``,
            ``extractMedia``, ``overleafCompatible``, ``preserveStyles``,
            ``preserveLineBreaks``).
        output_filename: optional name for the generated file; it is
            sanitised before use and defaults to the name chosen at upload.

    Returns:
        JSON describing the finished task (``has_media`` is true only when
        media files were actually extracted), or JSON with an ``error`` key
        and status 400 (bad request / wrong task state), 404 (unknown task)
        or 500 (conversion failed).
    """
    task_id = None
    output_path = None
    media_path = None
    try:
        # silent=True: a missing or malformed JSON body becomes None instead
        # of raising, so the client gets the 400 below rather than a 500.
        data = request.get_json(silent=True)

        if not isinstance(data, dict) or 'task_id' not in data:
            return jsonify({'error': 'Task ID is required'}), 400

        task_id = data['task_id']

        # Task IDs are UUID strings; anything else (including unhashable
        # JSON values such as lists) cannot be a valid key.
        if not isinstance(task_id, str) or task_id not in conversion_tasks:
            return jsonify({'error': 'Invalid task ID'}), 404

        task = conversion_tasks[task_id]

        if task['status'] != 'uploaded':
            return jsonify({'error': 'Task is not in uploadable state'}), 400

        # Get conversion options
        options = data.get('options')
        if not isinstance(options, dict):
            options = {}

        # The requested name comes from the client and ends up in file names
        # and download headers, so strip path separators and odd characters.
        requested_name = data.get('output_filename')
        output_filename = secure_filename(str(requested_name)) if requested_name else ''
        if not output_filename:
            output_filename = task['output_filename']

        # Update task status
        task['status'] = 'converting'
        task['output_filename'] = output_filename

        output_fd, output_path = tempfile.mkstemp(
            suffix=f'_{output_filename}',
            prefix=f'{task_id}_',
            dir=OUTPUT_FOLDER
        )
        os.close(output_fd)  # The converter writes to the path directly

        # Only create a media directory when extraction was requested.
        if options.get('extractMedia', True):
            media_path = tempfile.mkdtemp(
                prefix=f'{task_id}_media_',
                dir=OUTPUT_FOLDER
            )

        # Perform conversion
        success, message = convert_docx_to_latex(
            docx_path=task['file_path'],
            latex_path=output_path,
            generate_toc=options.get('generateToc', False),
            extract_media_to_path=media_path,
            latex_template_path=None,  # Could be added later for custom templates
            overleaf_compatible=options.get('overleafCompatible', True),
            preserve_styles=options.get('preserveStyles', True),
            preserve_linebreaks=options.get('preserveLineBreaks', True)
        )

        if not success:
            # Nothing downloadable was produced; do not leave files behind.
            _discard_conversion_outputs(output_path, media_path)
            task['status'] = 'failed'
            task['error_message'] = message

            return jsonify({
                'task_id': task_id,
                'status': 'failed',
                'error': message
            }), 500

        # The media directory always exists once created, so its mere
        # existence says nothing; look for at least one extracted file.
        has_media = bool(media_path) and any(
            files for _, _, files in os.walk(media_path)
        )
        if has_media:
            task['media_path'] = media_path
        else:
            if media_path:
                shutil.rmtree(media_path, ignore_errors=True)
            # Leave the key absent (rather than None) when there is no media.
            task.pop('media_path', None)

        task['status'] = 'completed'
        task['output_path'] = output_path
        task['conversion_message'] = message

        return jsonify({
            'task_id': task_id,
            'status': 'completed',
            'message': message,
            'output_filename': output_filename,
            'has_media': has_media
        })

    except Exception as e:
        _discard_conversion_outputs(output_path, media_path)

        # Update task status if possible
        if isinstance(task_id, str) and task_id in conversion_tasks:
            conversion_tasks[task_id]['status'] = 'failed'
            conversion_tasks[task_id]['error_message'] = str(e)

        return jsonify({'error': f'Conversion failed: {str(e)}'}), 500

@app.route('/api/download/<task_id>', methods=['GET'])
def download_file(task_id):
    """Send the generated LaTeX file of a completed task as an attachment.

    Responds with 404 for an unknown task or a missing output file, 400 when
    the task has not completed, and 500 on any unexpected error.
    """
    try:
        task = conversion_tasks.get(task_id)
        if task is None:
            return jsonify({'error': 'Invalid task ID'}), 404

        if task['status'] != 'completed':
            return jsonify({'error': 'Conversion not completed'}), 400

        latex_path = task['output_path']
        if not os.path.exists(latex_path):
            return jsonify({'error': 'Output file not found'}), 404

        send_options = {
            'as_attachment': True,
            'download_name': task['output_filename'],
            'mimetype': 'text/plain',
        }
        return send_file(latex_path, **send_options)

    except Exception as e:
        return jsonify({'error': f'Download failed: {str(e)}'}), 500

@app.route('/api/download-media/<task_id>', methods=['GET'])
def download_media(task_id):
    """Download the extracted media files of a completed task as a ZIP archive.

    The archive is (re)built next to the media directory on every request and
    named after the output file, e.g. ``report.tex`` -> ``report_media.zip``.

    Responds with 404 for an unknown task or when no media directory exists,
    400 when the task has not completed, and 500 on any unexpected error.
    """
    try:
        if task_id not in conversion_tasks:
            return jsonify({'error': 'Invalid task ID'}), 404

        task = conversion_tasks[task_id]

        if task['status'] != 'completed':
            return jsonify({'error': 'Conversion not completed'}), 400

        media_dir = task.get('media_path')
        if not media_dir or not os.path.exists(media_dir):
            return jsonify({'error': 'No media files found'}), 404

        # make_archive() returns the path of the file it wrote
        # (``media_dir + '.zip'``), so use that instead of rebuilding it.
        zip_path = shutil.make_archive(media_dir, 'zip', media_dir)

        # Strip only a trailing ".tex"; str.replace() would also remove the
        # substring from the middle of names such as "my.textbook.tex".
        output_name = task['output_filename']
        if output_name.endswith('.tex'):
            stem = output_name[:-len('.tex')]
        else:
            stem = output_name

        return send_file(
            zip_path,
            as_attachment=True,
            download_name=f"{stem}_media.zip",
            mimetype='application/zip'
        )

    except Exception as e:
        return jsonify({'error': f'Media download failed: {str(e)}'}), 500

def _package_base_name(tex_name):
    """Return *tex_name* without a trailing ``.tex`` (``'document'`` if nothing is left)."""
    stem = tex_name[:-len('.tex')] if tex_name.endswith('.tex') else tex_name
    return stem or 'document'


def _package_fix_image_paths(latex_content):
    """Rewrite ``\\includegraphics`` targets to package-relative ``media/...`` paths."""
    import re

    # Paths that contain the task-specific media directory, e.g.
    # <task_id>_media_xxx/media/image.png -> media/image.png
    latex_content = re.sub(
        r'\\includegraphics(\[[^\]]*\])?\{[^{}]*[a-f0-9\-]+_media[/\\]media[/\\]([^{}]+)\}',
        r'\\includegraphics\1{media/\2}',
        latex_content
    )

    # Any remaining path that still has a directory prefix before media/
    return re.sub(
        r'\\includegraphics(\[[^\]]*\])?\{[^{}]*[/\\]media[/\\]([^{}]+)\}',
        r'\\includegraphics\1{media/\2}',
        latex_content
    )


def _package_readme(base_name, tex_name):
    """Build the README text that is shipped inside the complete package."""
    return f"""# {base_name} - DOCX to LaTeX Conversion

## Package Contents:

1. **{tex_name}** - Main LaTeX file
2. **media/** - Images and media files (if any)

## How to Use:

### Compiling LaTeX:
```bash
pdflatex {tex_name}
```

### For Overleaf:
1. Upload all files to a new Overleaf project
2. Set main file: {tex_name}
3. Compile the project

### Local Compilation:
```bash
# Basic compilation
pdflatex {tex_name}

# For bibliography and cross-references
pdflatex {tex_name}
bibtex {base_name}
pdflatex {tex_name}
pdflatex {tex_name}
```

## Features:
- Enhanced formatting preservation
- Overleaf compatibility
- Automatic image path fixing
- Unicode character conversion
- Mathematical expression optimization

## Generated by:
DOCX to LaTeX Web Converter
https://github.com/your-username/docx-to-latex
"""


@app.route('/api/download-complete/<task_id>', methods=['GET'])
def download_complete_package(task_id):
    """Download the complete package (LaTeX + media + README) as a ZIP archive.

    The package is assembled in a throwaway directory: the LaTeX file with its
    image paths rewritten to ``media/...``, a copy of the extracted media (if
    any) and a README. The finished archive is read into memory before the
    directory is removed, so the response never refers to a deleted file.

    Responds with 404 for an unknown task or a missing output file, 400 when
    the task has not completed, and 500 on any unexpected error.
    """
    try:
        if task_id not in conversion_tasks:
            return jsonify({'error': 'Invalid task ID'}), 404

        task = conversion_tasks[task_id]

        if task['status'] != 'completed':
            return jsonify({'error': 'Conversion not completed'}), 400

        if not os.path.exists(task['output_path']):
            return jsonify({'error': 'Output file not found'}), 404

        import io

        tex_name = task['output_filename']
        base_name = _package_base_name(tex_name)

        with tempfile.TemporaryDirectory() as temp_dir:
            # Fixed directory name: it never appears inside the archive and
            # cannot collide with the archive file created beside it.
            package_dir = os.path.join(temp_dir, 'package')
            os.makedirs(package_dir, exist_ok=True)

            # Copy the LaTeX file with image paths fixed for Overleaf
            with open(task['output_path'], 'r', encoding='utf-8') as f:
                latex_content = f.read()
            with open(os.path.join(package_dir, tex_name), 'w', encoding='utf-8') as f:
                f.write(_package_fix_image_paths(latex_content))

            # Copy media files if they exist
            media_path = task.get('media_path')
            if media_path and os.path.exists(media_path):
                # Prefer the inner "media" folder to avoid media/media/ nesting
                inner_media = os.path.join(media_path, 'media')
                media_source = inner_media if os.path.exists(inner_media) else media_path
                shutil.copytree(media_source, os.path.join(package_dir, 'media'))

            readme_path = os.path.join(package_dir, 'README.txt')
            with open(readme_path, 'w', encoding='utf-8') as f:
                f.write(_package_readme(base_name, tex_name))

            # Use the path make_archive() reports instead of deriving it with
            # string replacement, which broke for names containing ".zip".
            archive_path = shutil.make_archive(
                os.path.join(temp_dir, f"{base_name}_complete"), 'zip', package_dir
            )
            with open(archive_path, 'rb') as f:
                archive = io.BytesIO(f.read())

        return send_file(
            archive,
            as_attachment=True,
            download_name=f"{base_name}_complete.zip",
            mimetype='application/zip'
        )

    except Exception as e:
        return jsonify({'error': f'Complete package download failed: {str(e)}'}), 500

@app.route('/api/status/<task_id>', methods=['GET'])
def get_task_status(task_id):
    """Return the current state of a conversion task.

    The response always contains ``task_id``, ``status``,
    ``original_filename`` and ``output_filename``. Completed tasks add
    ``message`` and a boolean ``has_media``; failed tasks add ``error``.

    Responds with 404 for an unknown task and 500 on any unexpected error.
    """
    try:
        if task_id not in conversion_tasks:
            return jsonify({'error': 'Invalid task ID'}), 404

        task = conversion_tasks[task_id]

        response_data = {
            'task_id': task_id,
            'status': task['status'],
            'original_filename': task['original_filename'],
            'output_filename': task.get('output_filename', ''),
        }

        if task['status'] == 'completed':
            response_data['message'] = task.get('conversion_message', 'Conversion completed successfully')
            # Coerce to a real boolean: ``x and y`` yields None (JSON null)
            # when the task has no media path.
            media_path = task.get('media_path')
            response_data['has_media'] = bool(media_path) and os.path.exists(media_path)
        elif task['status'] == 'failed':
            response_data['error'] = task.get('error_message', 'Conversion failed')

        return jsonify(response_data)

    except Exception as e:
        return jsonify({'error': f'Status check failed: {str(e)}'}), 500

@app.route('/api/cleanup/<task_id>', methods=['DELETE'])
def cleanup_task(task_id):
    """Delete every file that belongs to a task and forget the task.

    Removes the uploaded file, the generated output file, the media directory
    and the media ZIP (whichever of them exist), then drops the task from
    ``conversion_tasks``.

    Responds with 404 for an unknown task and 500 when a removal fails.
    """
    try:
        if task_id not in conversion_tasks:
            return jsonify({'error': 'Invalid task ID'}), 404

        task = conversion_tasks[task_id]
        media_path = task.get('media_path')

        # Remove uploaded file
        if os.path.exists(task['file_path']):
            os.remove(task['file_path'])

        # Remove output file
        output_path = task.get('output_path')
        if output_path and os.path.exists(output_path):
            os.remove(output_path)

        # Only derive media locations from a real path: the stored value can
        # be None (None + '.zip' raises TypeError) or absent (which used to
        # resolve to a stray "./.zip" in the working directory).
        if media_path:
            # Remove media directory
            if os.path.exists(media_path):
                shutil.rmtree(media_path)

            # Remove media ZIP if it exists
            media_zip = media_path + '.zip'
            if os.path.exists(media_zip):
                os.remove(media_zip)

        # Remove task from memory
        del conversion_tasks[task_id]

        return jsonify({'message': 'Task cleaned up successfully'})

    except Exception as e:
        return jsonify({'error': f'Cleanup failed: {str(e)}'}), 500

@app.route('/api/tasks', methods=['GET'])
def list_tasks():
    """Return a summary of every known conversion task (debugging aid).

    The response maps each task ID to its status, original and output
    filenames and creation timestamp.
    """
    try:
        summary = {
            tid: {
                'status': info['status'],
                'original_filename': info['original_filename'],
                'output_filename': info.get('output_filename', ''),
                'created_at': info.get('created_at', 0),
            }
            for tid, info in conversion_tasks.items()
        }
        return jsonify(summary)

    except Exception as e:
        return jsonify({'error': f'Failed to list tasks: {str(e)}'}), 500

# Cleanup old files on startup
def cleanup_old_files():
    """Delete upload/output entries whose ctime is more than 24 hours old.

    Regular files are removed with ``os.remove`` and directories with
    ``shutil.rmtree``. Any error aborts the sweep and is reported as a
    printed warning instead of being raised.
    """
    try:
        import time

        max_age_seconds = 24 * 60 * 60
        threshold = time.time() - max_age_seconds

        for root in (UPLOAD_FOLDER, OUTPUT_FOLDER):
            if not os.path.exists(root):
                continue

            for entry in os.listdir(root):
                entry_path = os.path.join(root, entry)

                # Pick the removal routine that matches the entry type;
                # anything that is neither a file nor a directory is skipped.
                if os.path.isfile(entry_path):
                    remover = os.remove
                elif os.path.isdir(entry_path):
                    remover = shutil.rmtree
                else:
                    continue

                if os.path.getctime(entry_path) < threshold:
                    remover(entry_path)
    except Exception as e:
        print(f"Warning: Failed to cleanup old files: {e}")

# Add cleanup on application exit
import atexit

def cleanup_on_exit():
    """Delete the temporary base directory and everything beneath it.

    Registered with ``atexit`` below. An ``OSError`` during removal is
    ignored, in which case no confirmation is printed.
    """
    base_dir = TEMP_BASE_DIR
    try:
        shutil.rmtree(base_dir)
        print(f"Cleaned up temporary directory: {base_dir}")
    except OSError:
        # Nothing useful can be done about it at interpreter shutdown.
        return


atexit.register(cleanup_on_exit)

if __name__ == '__main__':
    # Cleanup old files on startup
    cleanup_old_files()

    # Run the Flask app
    print("Starting DOCX to LaTeX API server...")
    print(f"Using temporary directory: {TEMP_BASE_DIR}")
    print("API endpoints:")
    print("  POST /api/upload - Upload DOCX file")
    print("  POST /api/convert - Convert to LaTeX")
    print("  GET /api/download/<task_id> - Download LaTeX file")
    print("  GET /api/download-media/<task_id> - Download media files")
    print("  GET /api/download-complete/<task_id> - Download LaTeX + media package")
    print("  GET /api/status/<task_id> - Get conversion status")
    print("  DELETE /api/cleanup/<task_id> - Cleanup task files")
    print("  GET /api/tasks - List conversion tasks")
    print("  GET /api/health - Health check")

    # The server listens on all interfaces, and the Werkzeug debugger lets
    # anyone who can reach it execute arbitrary code. Debug mode is therefore
    # off unless explicitly requested (e.g. FLASK_DEBUG=1) for local work.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')

    app.run(debug=debug_enabled, host='0.0.0.0', port=5000)