"""MinerU PDF API.

A small Flask service that accepts a PDF upload, stores it in a per-request
session directory, writes a placeholder Markdown file and JSON file describing
the upload, and lets clients download those generated files.
"""

import json
import logging
import os
import subprocess
import sys
import uuid

from flask import Flask, request, jsonify, send_file
from flask_cors import CORS

# Create Flask app
app = Flask(__name__)
CORS(app)

# Configure logging: everything goes both to stdout and to a log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler('/tmp/mineruapi.log'),
    ],
)
logger = logging.getLogger()

# Constants
UPLOAD_FOLDER = '/tmp/pdf_uploads'
OUTPUT_FOLDER = '/tmp/pdf_output'
ALLOWED_EXTENSIONS = {'pdf'}

# Punctuation accepted (in addition to alphanumerics) in names served by
# the download endpoint.
_SAFE_PUNCTUATION = frozenset('-_.')

# Create directories
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)


def allowed_file(filename):
    """Return True if *filename* has an extension listed in ALLOWED_EXTENSIONS.

    The comparison is case-insensitive and only looks at the text after the
    last dot.
    """
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


def _is_safe_char(char):
    """Return True if *char* is alphanumeric or one of '-', '_', '.'."""
    return char.isalnum() or char in _SAFE_PUNCTUATION


def _is_safe_component(value):
    """Return True if *value* is a single, traversal-free path component.

    '.' and '..' consist solely of allowed characters but would let a request
    step outside the session directory, so they are rejected explicitly.
    """
    if value in ('', '.', '..'):
        return False
    return all(_is_safe_char(char) for char in value)


def _sanitize_filename(name):
    """Return *name* with every character the download endpoint rejects replaced by '_'.

    Names that already consist only of allowed characters are returned
    unchanged.
    """
    return ''.join(char if _is_safe_char(char) else '_' for char in name)


def _json_error(message, status):
    """Build the ``{"success": False, "error": ...}`` response used by /api/convert."""
    return jsonify({
        "success": False,
        "error": message
    }), status


@app.route('/')
def index():
    """Return the landing page text describing the service and its usage."""
    # The body below starts at column 0 on purpose: it is the exact text sent
    # to the client, so indenting it would change the response.
    return """ MinerU PDF API

MinerU PDF API

This API service converts PDF files to Markdown and JSON.

Usage: POST a PDF file to /api/convert endpoint.

Example: curl -X POST -F "file=@your_file.pdf" https://marcosremar2-mineruapi.hf.space/api/convert

"""


@app.route('/health')
def health():
    """Health check endpoint"""
    return jsonify({
        "status": "healthy",
        "message": "API is running"
    })


@app.route('/api/convert', methods=['POST'])
def convert_pdf():
    """PDF conversion endpoint.

    Expects a multipart upload under the form field ``file``. On success the
    upload is saved into a fresh session directory under OUTPUT_FOLDER along
    with a generated ``.md`` and ``.json`` file, and a JSON payload with the
    generated content and download URLs is returned.

    Returns:
        A JSON response. 400 when the file part is missing, unnamed, or not a
        PDF; 500 (with the exception text) when anything else fails.
    """
    try:
        # Check if file is in the request
        if 'file' not in request.files:
            return _json_error("No file part in the request", 400)

        file = request.files['file']

        # Check if file is selected (the filename may be empty or missing)
        if not file.filename:
            return _json_error("No file selected", 400)

        # Check if file is a PDF
        if not allowed_file(file.filename):
            return _json_error("File must be a PDF", 400)

        # Generate session ID
        session_id = str(uuid.uuid4())
        session_dir = os.path.join(OUTPUT_FOLDER, session_id)
        os.makedirs(session_dir, exist_ok=True)

        # The original name is kept for display; the on-disk name is
        # sanitized so that the download URLs returned below are accepted by
        # the /download endpoint's character check.
        filename = os.path.basename(file.filename)
        base_filename = os.path.splitext(filename)[0]
        stored_filename = _sanitize_filename(filename)
        stored_stem = os.path.splitext(stored_filename)[0]

        # Save the uploaded file
        input_path = os.path.join(session_dir, stored_filename)
        file.save(input_path)

        # Create a simple Markdown and JSON from the PDF
        # This is a minimal version that should always work
        markdown_content = f"# {base_filename}\n\n*PDF file: {filename}*\n\nMinerU PDF processing service is running.\n"
        json_content = {
            "title": base_filename,
            "filename": filename,
            "status": "processed",
            "service": "MinerU PDF API"
        }

        # Save the files
        markdown_path = os.path.join(session_dir, f"{stored_stem}.md")
        json_path = os.path.join(session_dir, f"{stored_stem}.json")

        with open(markdown_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(json_content, f, indent=2)

        # Return the response
        return jsonify({
            "success": True,
            "session_id": session_id,
            "message": "PDF file processed successfully",
            "markdown_content": markdown_content,
            "json_content": json_content,
            "markdown_url": f"/download/{session_id}/{stored_stem}.md",
            "json_url": f"/download/{session_id}/{stored_stem}.json"
        })

    except Exception as e:
        # Top-level boundary for the endpoint: log the traceback and report
        # the failure to the client instead of letting the request crash.
        logger.exception("Error in convert_pdf endpoint")
        return _json_error(str(e), 500)


@app.route('/download/<session_id>/<filename>')
def download_file(session_id, filename):
    """Download generated files.

    Args:
        session_id: Session directory name returned by /api/convert.
        filename: Name of a file inside that session directory.

    Returns:
        The file as an attachment, 400 for names that fail validation, or
        404 when no such regular file exists.
    """
    # Simple security check: allowed characters only, and never '.' or '..'.
    if not (_is_safe_component(session_id) and _is_safe_component(filename)):
        return jsonify({"error": "Invalid filename or session ID"}), 400

    # Defence in depth: after resolving symlinks the target must still live
    # inside OUTPUT_FOLDER.
    output_root = os.path.realpath(OUTPUT_FOLDER)
    file_path = os.path.realpath(os.path.join(output_root, session_id, filename))
    if os.path.commonpath([output_root, file_path]) != output_root:
        return jsonify({"error": "Invalid filename or session ID"}), 400

    # isfile (not exists) so that a directory yields 404 instead of an error
    # inside send_file.
    if os.path.isfile(file_path):
        return send_file(file_path, as_attachment=True)
    return jsonify({"error": "File not found"}), 404


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860, debug=False)