# MinerUapi / app.py
# marcosremar2's picture
# Create minimal version of API for basic functionality
# 85c59d5
from flask import Flask, request, jsonify, send_file
from flask_cors import CORS
import os
import uuid
import logging
import subprocess
import json
import sys
# Application object, with CORS enabled through flask_cors using its defaults.
app = Flask(__name__)
CORS(app)

# Root logging: INFO and above is written both to stdout and to a file in /tmp.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/tmp/mineruapi.log'),
    ],
)
logger = logging.getLogger()

# Working directories and the set of upload extensions the API accepts.
UPLOAD_FOLDER = '/tmp/pdf_uploads'
OUTPUT_FOLDER = '/tmp/pdf_output'
ALLOWED_EXTENSIONS = {'pdf'}

# Both directories must exist before the first request is handled.
for _directory in (UPLOAD_FOLDER, OUTPUT_FOLDER):
    os.makedirs(_directory, exist_ok=True)
# Function to check file extensions
def allowed_file(filename):
    """Return True when *filename* ends in an extension from ALLOWED_EXTENSIONS.

    The extension is the text after the last '.', compared case-insensitively.
    Names without any '.' are rejected.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS
@app.route('/')
def index():
    """Serve the static HTML landing page that explains how to call the API."""
    landing_page = """
<html>
<head>
<title>MinerU PDF API</title>
<style>
body { font-family: Arial; max-width: 800px; margin: 0 auto; padding: 20px; }
h1 { color: #2c3e50; }
.info { background: #f8f9fa; padding: 15px; border-radius: 5px; }
</style>
</head>
<body>
<h1>MinerU PDF API</h1>
<div class="info">
<p>This API service converts PDF files to Markdown and JSON.</p>
<p>Usage: POST a PDF file to /api/convert endpoint.</p>
<p>Example: <code>curl -X POST -F "file=@your_file.pdf" https://marcosremar2-mineruapi.hf.space/api/convert</code></p>
</div>
</body>
</html>
"""
    return landing_page
@app.route('/health')
def health():
    """Report liveness: always answers with a fixed 'healthy' JSON document."""
    payload = {
        "status": "healthy",
        "message": "API is running",
    }
    return jsonify(payload)
@app.route('/api/convert', methods=['POST'])
def convert_pdf():
    """Store an uploaded PDF and generate Markdown/JSON companion files for it.

    Expects a multipart upload in the ``file`` form field. The PDF is saved in a
    fresh per-request session directory under OUTPUT_FOLDER; it is not parsed.
    The Markdown and JSON documents are built from the file name only.

    Returns:
        A JSON response. On success it contains ``session_id``, the generated
        ``markdown_content`` and ``json_content``, and ``/download/...`` URLs
        for both files. A missing, unnamed or non-PDF upload yields HTTP 400;
        any unexpected exception is logged and yields HTTP 500. Error responses
        carry ``success: False`` and an ``error`` message.
    """
    try:
        # Check if file is in the request
        if 'file' not in request.files:
            return jsonify({
                "success": False,
                "error": "No file part in the request"
            }), 400

        upload = request.files['file']

        # A missing filename (None) is treated like an empty one, so the client
        # gets a 400 instead of a TypeError surfacing as a 500.
        if not upload.filename:
            return jsonify({
                "success": False,
                "error": "No file selected"
            }), 400

        # Check if file is a PDF
        if not allowed_file(upload.filename):
            return jsonify({
                "success": False,
                "error": "File must be a PDF"
            }), 400

        # Every request gets its own output directory, keyed by a random UUID.
        session_id = str(uuid.uuid4())
        session_dir = os.path.join(OUTPUT_FOLDER, session_id)
        os.makedirs(session_dir, exist_ok=True)

        # Display names: what the client uploaded, minus any directory part.
        filename = os.path.basename(upload.filename)
        base_filename = os.path.splitext(filename)[0]

        # On-disk names and URLs are limited to the characters the
        # /download/<session_id>/<filename> route accepts; otherwise a name
        # such as "my report.pdf" would produce links that route rejects.
        safe_filename = ''.join(
            ch if ch.isalnum() or ch in '-_.' else '_' for ch in filename
        )
        safe_base = os.path.splitext(safe_filename)[0]

        # Save the uploaded file
        input_path = os.path.join(session_dir, safe_filename)
        upload.save(input_path)

        # Create a simple Markdown and JSON from the PDF
        # This is a minimal version that should always work
        markdown_content = f"# {base_filename}\n\n*PDF file: {filename}*\n\nMinerU PDF processing service is running.\n"
        json_content = {
            "title": base_filename,
            "filename": filename,
            "status": "processed",
            "service": "MinerU PDF API"
        }

        # Save the files
        markdown_path = os.path.join(session_dir, f"{safe_base}.md")
        json_path = os.path.join(session_dir, f"{safe_base}.json")
        with open(markdown_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_content, f, indent=2)

        # Return the response
        return jsonify({
            "success": True,
            "session_id": session_id,
            "message": "PDF file processed successfully",
            "markdown_content": markdown_content,
            "json_content": json_content,
            "markdown_url": f"/download/{session_id}/{safe_base}.md",
            "json_url": f"/download/{session_id}/{safe_base}.json"
        })
    except Exception as e:
        # Top-level boundary for the endpoint: log the traceback and report
        # the failure to the client as JSON instead of an HTML error page.
        logger.exception("Error in convert_pdf endpoint")
        return jsonify({
            "success": False,
            "error": str(e)
        }), 500
@app.route('/download/<session_id>/<filename>')
def download_file(session_id, filename):
    """Send a file from a conversion session directory as an attachment.

    Args:
        session_id: Name of the session directory under OUTPUT_FOLDER.
        filename: Name of the file inside that directory.

    Returns:
        The file as an attachment, or a JSON error with HTTP 400 when either
        path component is invalid, or HTTP 404 when no such file exists.
    """
    # Allow-list of characters for both path components.
    if not all(c.isalnum() or c in ['-', '_', '.'] for c in session_id + filename):
        return jsonify({"error": "Invalid filename or session ID"}), 400

    # The allow-list permits '.', so "." and ".." pass it; reject them
    # explicitly because they would address a parent directory.
    if session_id in ('.', '..') or filename in ('.', '..'):
        return jsonify({"error": "Invalid filename or session ID"}), 400

    # Second line of defence: the resolved path (symlinks included) must stay
    # inside OUTPUT_FOLDER.
    output_root = os.path.realpath(OUTPUT_FOLDER)
    file_path = os.path.realpath(os.path.join(output_root, session_id, filename))
    if os.path.commonpath([output_root, file_path]) != output_root:
        return jsonify({"error": "Invalid filename or session ID"}), 400

    # isfile rather than exists: a directory is reported as not found instead
    # of raising inside send_file.
    if os.path.isfile(file_path):
        return send_file(file_path, as_attachment=True)
    return jsonify({"error": "File not found"}), 404
if __name__ == '__main__':
    # Script entry point: listen on all interfaces, port 7860, debugger off.
    app.run(
        host='0.0.0.0',
        port=7860,
        debug=False,
    )