Spaces:
Runtime error
Runtime error
File size: 5,166 Bytes
85c59d5 128728a 4e3d16d 128728a 4e3d16d 684ccd8 4e3d16d 85c59d5 4e3d16d 128728a f32ce28 684ccd8 4e3d16d 128728a 8005bb9 128728a 4e3d16d 85c59d5 8005bb9 85c59d5 128728a 4e3d16d 85c59d5 4e3d16d 128728a 85c59d5 128728a 4e3d16d 85c59d5 4e3d16d 8005bb9 128728a 8005bb9 4e3d16d 128728a 8005bb9 4e3d16d 85c59d5 4e3d16d 85c59d5 128728a 4e3d16d 85c59d5 4e3d16d 85c59d5 4e3d16d 8005bb9 85c59d5 8005bb9 85c59d5 8005bb9 85c59d5 8005bb9 85c59d5 4e3d16d 128728a 85c59d5 f32ce28 85c59d5 128728a 85c59d5 4e3d16d 85c59d5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
import os
import uuid
import logging
import subprocess
import json
import sys
# Create the Flask application object. The @app.route decorators further down
# in this module register their view functions on this instance.
app = Flask(__name__)
# Apply the flask_cors extension to the app. No options are passed, so the
# extension's default settings are used for every route.
CORS(app)
# Logging setup: request INFO-level output on both stdout and a log file
# under /tmp, using one shared record format.
logging.basicConfig(
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/tmp/mineruapi.log'),
    ],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# getLogger() without a name returns the root logger configured above.
logger = logging.getLogger()
# Working directories and the set of upload extensions the API accepts.
UPLOAD_FOLDER = '/tmp/pdf_uploads'
OUTPUT_FOLDER = '/tmp/pdf_output'
ALLOWED_EXTENSIONS = {'pdf'}
# Make sure both working directories exist at import time; exist_ok keeps
# repeated start-ups from failing when they are already present.
for _folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
    os.makedirs(_folder, exist_ok=True)
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is the text after the last dot, compared case-insensitively.
    A name that contains no dot at all is rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
@app.route('/')
def index():
    """Serve the static HTML landing page that explains how to call the API."""
    landing_page = """
<html>
<head>
<title>MinerU PDF API</title>
<style>
body { font-family: Arial; max-width: 800px; margin: 0 auto; padding: 20px; }
h1 { color: #2c3e50; }
.info { background: #f8f9fa; padding: 15px; border-radius: 5px; }
</style>
</head>
<body>
<h1>MinerU PDF API</h1>
<div class="info">
<p>This API service converts PDF files to Markdown and JSON.</p>
<p>Usage: POST a PDF file to /api/convert endpoint.</p>
<p>Example: <code>curl -X POST -F "file=@your_file.pdf" https://marcosremar2-mineruapi.hf.space/api/convert</code></p>
</div>
</body>
</html>
"""
    return landing_page
@app.route('/health')
def health():
    """Liveness probe: always answers with a fixed "healthy" JSON payload."""
    payload = {"status": "healthy", "message": "API is running"}
    return jsonify(payload)
def _download_safe_name(name, fallback='document'):
    """Rewrite *name* using only characters the /download route accepts.

    Every character that is not alphanumeric, '-', '_' or '.' becomes '_'.
    Leading dots are dropped so the result is never a hidden file or one of
    the special names "." / "..". *fallback* is returned if nothing is left.
    """
    cleaned = ''.join(
        ch if ch.isalnum() or ch in ('-', '_', '.') else '_'
        for ch in name
    )
    return cleaned.lstrip('.') or fallback


@app.route('/api/convert', methods=['POST'])
def convert_pdf():
    """Accept an uploaded PDF and write placeholder Markdown/JSON outputs.

    Expects a multipart form upload under the field name ``file``. The upload
    is stored in a fresh per-request directory below OUTPUT_FOLDER, next to a
    generated ``.md`` and ``.json`` file. No PDF parsing happens here: both
    outputs only echo the upload's name plus fixed status text.

    Returns:
        200 with a JSON body containing ``session_id``, the generated
        contents, and download URLs for both output files;
        400 with ``success: False`` when the file part is missing, has an
        empty filename, or is not a ``.pdf``;
        500 with ``success: False`` and the exception text on any other error.
    """
    try:
        # --- Request validation -------------------------------------------
        if 'file' not in request.files:
            return jsonify({
                "success": False,
                "error": "No file part in the request"
            }), 400

        upload = request.files['file']

        # A form submitted without choosing a file has an empty filename.
        if upload.filename == '':
            return jsonify({
                "success": False,
                "error": "No file selected"
            }), 400

        if not allowed_file(upload.filename):
            return jsonify({
                "success": False,
                "error": "File must be a PDF"
            }), 400

        # --- Per-request working directory --------------------------------
        session_id = str(uuid.uuid4())
        session_dir = os.path.join(OUTPUT_FOLDER, session_id)
        os.makedirs(session_dir, exist_ok=True)

        # Names reported back to the client keep the uploaded spelling.
        filename = os.path.basename(upload.filename)
        base_filename = os.path.splitext(filename)[0]

        # Names used on disk and in URLs are restricted to the characters the
        # /download route's validation accepts. Without this, an upload such
        # as "my report.pdf" would yield download URLs that route rejects.
        stored_filename = _download_safe_name(filename)
        stored_base = os.path.splitext(stored_filename)[0]

        input_path = os.path.join(session_dir, stored_filename)
        upload.save(input_path)

        # --- Placeholder outputs ------------------------------------------
        markdown_content = f"# {base_filename}\n\n*PDF file: {filename}*\n\nMinerU PDF processing service is running.\n"
        json_content = {
            "title": base_filename,
            "filename": filename,
            "status": "processed",
            "service": "MinerU PDF API"
        }

        markdown_path = os.path.join(session_dir, f"{stored_base}.md")
        json_path = os.path.join(session_dir, f"{stored_base}.json")
        with open(markdown_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_content, f, indent=2)

        return jsonify({
            "success": True,
            "session_id": session_id,
            "message": "PDF file processed successfully",
            "markdown_content": markdown_content,
            "json_content": json_content,
            "markdown_url": f"/download/{session_id}/{stored_base}.md",
            "json_url": f"/download/{session_id}/{stored_base}.json"
        })
    except Exception as e:
        # Top-level boundary for this endpoint: log the traceback and report
        # the failure as JSON instead of letting Flask render an HTML error.
        logger.exception("Error in convert_pdf endpoint")
        return jsonify({
            "success": False,
            "error": str(e)
        }), 500
@app.route('/download/<session_id>/<filename>')
def download_file(session_id, filename):
    """Send a file from a session's output directory as an attachment.

    Args:
        session_id: Name of the session directory below OUTPUT_FOLDER.
        filename: Name of the file inside that session directory.

    Returns:
        The file as an attachment when it exists;
        400 with a JSON error when either component contains a character
        other than alphanumerics, '-', '_' and '.', is "." or "..", or
        resolves outside OUTPUT_FOLDER;
        404 with a JSON error when no regular file exists at the path.
    """
    invalid_response = (jsonify({"error": "Invalid filename or session ID"}), 400)

    for component in (session_id, filename):
        # "." and ".." are made up solely of permitted characters, but they
        # would let the joined path climb out of the session directory.
        if component in ('.', '..'):
            return invalid_response
        if not all(ch.isalnum() or ch in ('-', '_', '.') for ch in component):
            return invalid_response

    output_root = os.path.realpath(OUTPUT_FOLDER)
    file_path = os.path.realpath(os.path.join(output_root, session_id, filename))
    # Defence in depth: after resolving symlinks, the target must still live
    # below the output root.
    if not file_path.startswith(output_root + os.sep):
        return invalid_response

    # isfile (not exists) so that a directory is reported as missing instead
    # of making send_file raise.
    if os.path.isfile(file_path):
        return send_file(file_path, as_attachment=True)
    return jsonify({"error": "File not found"}), 404
# Start Flask's built-in server only when this file is executed as a script,
# not when it is imported.
if __name__ == '__main__':
    # Listen on every interface (0.0.0.0), port 7860, with debug mode off.
    app.run(host='0.0.0.0', port=7860, debug=False)