# PDF-Parser / app.py
# Origin: Hugging Face Space (user saifisvibin, commit 62fe271).
# The web-viewer header lines were converted to comments so the module parses.
import json
import os
import shutil
import threading
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional

import torch
from flask import Flask, render_template, request, jsonify, send_file, send_from_directory
from flask_cors import CORS
from loguru import logger
from werkzeug.utils import secure_filename

import main as extractor
# Flask application and configuration.
app = Flask(__name__)

# Enable CORS for all /api/* routes so browser clients hosted on other
# origins (e.g. the Hugging Face Spaces iframe) can call the JSON API.
CORS(app, resources={r"/api/*": {"origins": "*"}})

app.config['MAX_CONTENT_LENGTH'] = 500 * 1024 * 1024 # 500MB max file size
app.config['UPLOAD_FOLDER'] = './uploads'  # transient storage for incoming PDFs
app.config['OUTPUT_FOLDER'] = './output'   # persistent extraction results served via /output

# Ensure directories exist before the first request arrives.
os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)

# Global model instance, loaded lazily by load_model_once().
_model = None

# Progress tracking shared between request handlers and worker threads:
# {task_id: {'status': 'processing'|'completed'|'error', 'progress': 0-100,
#            'message': str, 'results': [], 'file_progress': {filename: progress}}}
_progress_tracker: Dict[str, Dict] = {}
# RLock because helpers that take the lock may run while it is already held.
_progress_lock = threading.RLock() # Use RLock for reentrant locking
def get_device_info() -> Dict[str, Any]:
    """Report which compute device torch will use.

    Returns:
        Dict with keys:
            device: "cuda" if a GPU is visible, otherwise "cpu".
            cuda_available: result of torch.cuda.is_available().
            device_name: name of GPU 0, or None on CPU-only hosts.
            device_count: number of visible CUDA devices (0 on CPU).
    """
    cuda_available = torch.cuda.is_available()
    # Fix: the annotation previously used the builtin `any` (a function),
    # not `typing.Any`; also annotate the dict for clarity.
    info: Dict[str, Any] = {
        "device": "cuda" if cuda_available else "cpu",
        "cuda_available": cuda_available,
        "device_name": None,
        "device_count": 0,
    }
    if cuda_available:
        info["device_name"] = torch.cuda.get_device_name(0)
        info["device_count"] = torch.cuda.device_count()
    return info
def load_model_once():
    """Return the cached DocLayout-YOLO model, loading it on first use."""
    global _model
    if _model is not None:
        return _model
    logger.info("Loading DocLayout-YOLO model...")
    _model = extractor.get_model()
    logger.info("Model loaded successfully")
    return _model
@app.route('/')
def index():
    """Render the main page, passing current GPU/CPU availability."""
    return render_template('index.html', device_info=get_device_info())
@app.route('/api/docs')
def api_docs():
    """API documentation page showing all available endpoints."""
    routes = []
    for rule in app.url_map.iter_rules():
        # Only document the public API and file-serving routes.
        if not (rule.rule.startswith('/api') or rule.rule.startswith('/output')):
            continue
        view = app.view_functions.get(rule.endpoint)
        doc = view.__doc__ if view and hasattr(view, '__doc__') else 'No description'
        routes.append({
            'endpoint': rule.rule,
            'methods': ','.join(sorted(rule.methods - {'OPTIONS', 'HEAD'})),
            'description': doc.strip() if doc else 'No description',
        })
    # Hugging Face Spaces sits behind a TLS proxy; always advertise HTTPS URLs.
    base_url = request.host_url.rstrip('/')
    if base_url.startswith('http://'):
        base_url = base_url.replace('http://', 'https://')
    return render_template('api_docs.html', routes=routes, base_url=base_url)
@app.route('/api/predict', methods=['POST', 'GET'])
def predict():
    """
    Clean REST API endpoint for PDF extraction.
    Accepts a PDF file and returns extracted text, tables, and figures.

    Request:
        - Method: POST
        - Content-Type: multipart/form-data
        - Body: file (PDF file)

    Response:
        {
            "status": "success",
            "filename": "document.pdf",
            "text": "extracted markdown text...",
            "tables": [...],
            "figures": [...],
            "summary": {...}
        }
    """
    # Fix: this docstring previously appeared AFTER the GET handler below, so
    # it was a dead string expression and func.__doc__ was None (and /api/docs
    # showed "No description" for this endpoint).
    # Handle GET requests with an informational 405 + usage hint.
    if request.method == 'GET':
        return jsonify({
            'status': 'info',
            'message': 'This endpoint accepts POST requests only. Please use POST method with a PDF file in the "file" field.',
            'usage': {
                'method': 'POST',
                'content_type': 'multipart/form-data',
                'body': {
                    'file': 'PDF file to process'
                },
                'example_curl': 'curl -X POST https://saifisvibin-volaris-pdf-tool.hf.space/api/predict -F "file=@document.pdf"'
            }
        }), 405
    try:
        # Validate the upload: field present, non-empty name, .pdf extension.
        if 'file' not in request.files:
            return jsonify({
                'status': 'error',
                'error': 'No file provided. Please upload a PDF file using the "file" field.'
            }), 400
        file = request.files['file']
        if file.filename == '':
            return jsonify({
                'status': 'error',
                'error': 'No file selected'
            }), 400
        if not file.filename.lower().endswith('.pdf'):
            return jsonify({
                'status': 'error',
                'error': 'Invalid file type. Please upload a PDF file.'
            }), 400
        filename = secure_filename(file.filename)
        stem = Path(filename).stem
        # Create a permanent output directory for this request; stem + unix
        # timestamp keeps concurrent requests for the same file from colliding
        # (NOTE(review): same-second collisions are still possible).
        import time
        unique_id = f"{stem}_{int(time.time())}"
        output_dir = Path(app.config['OUTPUT_FOLDER']) / unique_id
        output_dir.mkdir(parents=True, exist_ok=True)
        # Temporary upload directory, removed in the finally block below.
        temp_upload = Path(app.config['UPLOAD_FOLDER']) / f"temp_{uuid.uuid4().hex}"
        temp_upload.mkdir(parents=True, exist_ok=True)
        try:
            # Save uploaded file
            pdf_path = temp_upload / filename
            file_data = file.read()
            pdf_path.write_bytes(file_data)
            # Load model if needed (cached after the first call)
            load_model_once()
            # Process PDF (extract both images and markdown)
            extractor.USE_MULTIPROCESSING = False
            extractor.process_pdf_with_pool(
                pdf_path,
                output_dir,
                pool=None,
                extract_images=True,
                extract_markdown=True,
            )
            # Response skeleton; filled in from the extractor's outputs below.
            result = {
                'status': 'success',
                'filename': filename,
                'text': '',
                'tables': [],
                'figures': [],
                'summary': {
                    'total_pages': 0,
                    'figures_count': 0,
                    'tables_count': 0,
                    'elements_count': 0
                }
            }
            # Extract markdown text
            markdown_path = output_dir / f"{stem}.md"
            if markdown_path.exists():
                result['text'] = markdown_path.read_text(encoding='utf-8')
            # Base URL for constructing full image URLs
            base_url = request.host_url.rstrip('/')
            if 'hf.space' in base_url:
                # Force HTTPS for Hugging Face Spaces
                base_url = base_url.replace('http://', 'https://')
            # Figures and tables come from the extractor's JSON sidecar
            json_path = output_dir / f"{stem}_content_list.json"
            if json_path.exists():
                elements = json.loads(json_path.read_text(encoding='utf-8'))
                figures = [e for e in elements if e.get('type') == 'figure']
                tables = [e for e in elements if e.get('type') == 'table']
                # Page count is best-effort; the response is valid without it.
                try:
                    import pypdfium2 as pdfium
                    pdf_bytes = pdf_path.read_bytes()
                    doc = pdfium.PdfDocument(pdf_bytes)
                    result['summary']['total_pages'] = len(doc)
                    doc.close()
                except Exception:
                    # Fix: was a bare `except:`; keep best-effort semantics but
                    # don't swallow SystemExit/KeyboardInterrupt.
                    pass
                # Format figures
                for fig in figures:
                    figure_data = {
                        'page': fig.get('page', 0),
                        'bbox': fig.get('bbox_pixels', []),
                        'confidence': fig.get('conf', 0.0),
                        'width': fig.get('width', 0),
                        'height': fig.get('height', 0),
                    }
                    # Include image URL if available
                    if fig.get('image_path'):
                        img_path = output_dir / fig['image_path']
                        if img_path.exists():
                            relative_path = str(img_path.relative_to(app.config['OUTPUT_FOLDER']))
                            figure_data['image_url'] = f"{base_url}/output/{relative_path}"
                            figure_data['image_path'] = relative_path
                    result['figures'].append(figure_data)
                # Format tables
                for tab in tables:
                    table_data = {
                        'page': tab.get('page', 0),
                        'bbox': tab.get('bbox_pixels', []),
                        'confidence': tab.get('conf', 0.0),
                        'width': tab.get('width', 0),
                        'height': tab.get('height', 0),
                    }
                    # Include image URL if available
                    if tab.get('image_path'):
                        img_path = output_dir / tab['image_path']
                        if img_path.exists():
                            relative_path = str(img_path.relative_to(app.config['OUTPUT_FOLDER']))
                            table_data['image_url'] = f"{base_url}/output/{relative_path}"
                            table_data['image_path'] = relative_path
                    result['tables'].append(table_data)
                result['summary']['figures_count'] = len(figures)
                result['summary']['tables_count'] = len(tables)
                result['summary']['elements_count'] = len(elements)
            return jsonify(result)
        finally:
            # Clean up temporary upload directory only (keep output_dir so
            # image URLs in the response remain fetchable).
            try:
                if temp_upload.exists():
                    if temp_upload.is_file():
                        temp_upload.unlink()
                    else:
                        shutil.rmtree(temp_upload, ignore_errors=True)
            except Exception as e:
                logger.warning(f"Error cleaning up temp upload files: {e}")
    except Exception as e:
        logger.error(f"Error in /api/predict: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return jsonify({
            'status': 'error',
            'error': str(e)
        }), 500
@app.route('/api/device-info')
def device_info():
    """Return GPU/CPU availability info as JSON (see get_device_info)."""
    info = get_device_info()
    return jsonify(info)
def _update_task_progress(task_id: str, filename: str, file_progress: int, message: str):
    """Record one file's progress and refresh the task's overall percentage.

    Overall progress is the mean of every per-file percentage seen so far.
    Unknown task ids are ignored silently (the task may have been dropped).
    """
    with _progress_lock:
        task = _progress_tracker.get(task_id)
        if task is None:
            return
        per_file = task.setdefault('file_progress', {})
        per_file[filename] = file_progress
        if per_file:
            task['progress'] = int(sum(per_file.values()) / len(per_file))
        task['message'] = message
def process_file_background(task_id: str, file_data: bytes, filename: str, extraction_mode: str):
    """Process a single PDF in the background and update task progress.

    Args:
        task_id: Key into the module-level _progress_tracker.
        file_data: Raw PDF bytes (read eagerly before the request closed).
        filename: Original upload name; sanitized before filesystem use.
        extraction_mode: 'images', 'markdown', or anything else for both.

    On success appends a result dict to the tracker's 'results'; on failure
    appends an entry with an 'error' key instead.

    Fix: progress/log messages previously contained the literal text
    "(unknown)" instead of interpolating the filename, making multi-file
    batch progress unreadable.
    """
    filename = secure_filename(filename)
    try:
        _update_task_progress(task_id, filename, 5, f'Processing {filename}...')
        stem = Path(filename).stem
        # 'markdown' mode skips layout images; 'images' mode skips markdown.
        include_images = extraction_mode != 'markdown'
        include_markdown = extraction_mode != 'images'
        # Ensure upload directory exists
        upload_dir = Path(app.config['UPLOAD_FOLDER'])
        upload_dir.mkdir(parents=True, exist_ok=True)
        # Persist the uploaded bytes to disk
        upload_path = upload_dir / filename
        upload_path.write_bytes(file_data)
        _update_task_progress(task_id, filename, 15, f'Saved {filename}, preparing output...')
        # Prepare output directory (keyed by stem, so re-uploads overwrite)
        output_dir = Path(app.config['OUTPUT_FOLDER']) / stem
        output_dir.mkdir(parents=True, exist_ok=True)
        # Move the PDF into the output directory; replace() overwrites.
        pdf_path = output_dir / filename
        upload_path.replace(pdf_path)
        _update_task_progress(task_id, filename, 25, f'Loading model and processing {filename}...')
        # Process PDF single-process: we're already in a worker thread.
        extractor.USE_MULTIPROCESSING = False
        logger.info(f"Processing {filename} (images={include_images}, markdown={include_markdown})")
        if include_images:
            try:
                load_model_once()
                logger.info(f"Model loaded successfully for {filename}")
            except Exception as model_error:
                logger.error(f"Failed to load model for {filename}: {model_error}")
                import traceback
                logger.error(traceback.format_exc())
                raise Exception(f"Model loading failed: {str(model_error)}. The processing service may be unavailable.")
        _update_task_progress(task_id, filename, 30, f'Extracting content from {filename}...')
        extractor.process_pdf_with_pool(
            pdf_path,
            output_dir,
            pool=None,
            extract_images=include_images,
            extract_markdown=include_markdown,
        )
        _update_task_progress(task_id, filename, 85, f'Collecting results for {filename}...')
        # Collect results emitted by the extractor
        json_path = output_dir / f"{stem}_content_list.json"
        elements = []
        if include_images and json_path.exists():
            elements = json.loads(json_path.read_text(encoding='utf-8'))
        annotated_pdf = None
        if include_images:
            candidate_pdf = output_dir / f"{stem}_layout.pdf"
            if candidate_pdf.exists():
                annotated_pdf = str(candidate_pdf.relative_to(app.config['OUTPUT_FOLDER']))
        markdown_path = None
        if include_markdown:
            candidate_md = output_dir / f"{stem}.md"
            if candidate_md.exists():
                markdown_path = str(candidate_md.relative_to(app.config['OUTPUT_FOLDER']))
        # Tally figures and tables
        figures = [e for e in elements if e.get('type') == 'figure']
        tables = [e for e in elements if e.get('type') == 'table']
        # NOTE: request.host_url is unavailable in a background thread, so
        # results carry relative paths; full URLs are built in /api/progress.
        result = {
            'filename': filename,
            'stem': stem,
            'output_dir': str(output_dir.relative_to(app.config['OUTPUT_FOLDER'])),
            'figures_count': len(figures),
            'tables_count': len(tables),
            'elements_count': len(elements),
            'annotated_pdf': annotated_pdf,
            'markdown_path': markdown_path,
            'include_images': include_images,
            'include_markdown': include_markdown,
        }
        with _progress_lock:
            # Mark this file 100% done and refresh the batch average
            if 'file_progress' not in _progress_tracker[task_id]:
                _progress_tracker[task_id]['file_progress'] = {}
            _progress_tracker[task_id]['file_progress'][filename] = 100
            file_progresses = _progress_tracker[task_id]['file_progress']
            if file_progresses:
                total_progress = sum(file_progresses.values()) / len(file_progresses)
                _progress_tracker[task_id]['progress'] = int(total_progress)
            # Record this file's result
            _progress_tracker[task_id]['results'].append(result)
            _progress_tracker[task_id]['message'] = f'Completed processing {filename}'
            # If every file has reported in, finalize the batch status
            total_files = _progress_tracker[task_id].get('total_files', 1)
            completed_count = len([r for r in _progress_tracker[task_id]['results'] if 'error' not in r])
            error_count = len([r for r in _progress_tracker[task_id]['results'] if 'error' in r])
            if completed_count + error_count >= total_files:
                if error_count == 0:
                    _progress_tracker[task_id]['status'] = 'completed'
                    _progress_tracker[task_id]['progress'] = 100
                    _progress_tracker[task_id]['message'] = f'All {total_files} file(s) processed successfully'
                else:
                    # Partial failure still counts as 'completed' so clients stop polling
                    _progress_tracker[task_id]['status'] = 'completed'
                    _progress_tracker[task_id]['message'] = f'Processing complete: {completed_count} succeeded, {error_count} failed'
    except Exception as e:
        logger.error(f"Error processing {filename}: {e}")
        import traceback
        logger.error(traceback.format_exc())
        with _progress_lock:
            _progress_tracker[task_id]['results'].append({
                'filename': filename,
                'error': str(e)
            })
            # If this was the last outstanding file, surface the error state
            total_files = _progress_tracker[task_id].get('total_files', 1)
            if len(_progress_tracker[task_id]['results']) >= total_files:
                _progress_tracker[task_id]['status'] = 'error'
                _progress_tracker[task_id]['message'] = f'Error processing {filename}: {str(e)}'
@app.route('/api/upload', methods=['POST'])
def upload_files():
    """Handle multiple PDF file uploads with sequential background processing.

    Expects multipart form data with files under 'files[]' and an optional
    'extraction_mode' field ('images', 'markdown', or 'both'). Returns a
    task_id that clients poll via /api/progress/<task_id>.

    Fixes: the extension check is now case-insensitive (matching
    /api/predict — '.PDF' uploads were silently dropped before), and queue
    messages now interpolate the filename instead of printing "(unknown)".
    """
    if 'files[]' not in request.files:
        return jsonify({'error': 'No files provided'}), 400
    files = request.files.getlist('files[]')
    extraction_mode = request.form.get('extraction_mode', 'both')
    if not files or all(f.filename == '' for f in files):
        return jsonify({'error': 'No files selected'}), 400
    # Read all file data eagerly before threads start (request context will close)
    file_payloads = []
    for file in files:
        if file and file.filename.lower().endswith('.pdf'):
            data = file.read()
            if data:
                file_payloads.append((file.filename, data))
            else:
                logger.warning(f"Empty file skipped: {file.filename}")
    if not file_payloads:
        return jsonify({'error': 'No valid PDF files could be read'}), 400
    # Create a task ID for this upload batch
    task_id = str(uuid.uuid4())
    with _progress_lock:
        _progress_tracker[task_id] = {
            'status': 'processing',
            'progress': 0,
            'message': f'Queued {len(file_payloads)} file(s) for processing...',
            'results': [],
            'total_files': len(file_payloads),
        }

    def process_queue():
        """Process all files sequentially in a single background thread."""
        total = len(file_payloads)
        for idx, (filename, file_data) in enumerate(file_payloads, start=1):
            with _progress_lock:
                _progress_tracker[task_id]['message'] = f'Processing file {idx} of {total}: {filename}'
            try:
                process_file_background(task_id, file_data, filename, extraction_mode)
            except Exception as e:
                logger.error(f"Unhandled error processing {filename}: {e}")
                import traceback
                logger.error(traceback.format_exc())
                with _progress_lock:
                    _progress_tracker[task_id]['results'].append({
                        'filename': filename,
                        'error': str(e)
                    })
        # Final status update after all files are done
        with _progress_lock:
            tracker = _progress_tracker[task_id]
            good = [r for r in tracker['results'] if 'error' not in r]
            bad = [r for r in tracker['results'] if 'error' in r]
            tracker['status'] = 'completed'
            tracker['progress'] = 100
            if bad:
                tracker['message'] = f'{len(good)} succeeded, {len(bad)} failed.'
            else:
                tracker['message'] = f'All {total} file(s) processed successfully.'

    thread = threading.Thread(target=process_queue)
    thread.daemon = True
    thread.start()
    logger.info(f"Started sequential processing queue for {len(file_payloads)} file(s), task={task_id}")
    return jsonify({
        'task_id': task_id,
        'message': 'Processing started',
        'total_files': len(file_payloads)
    })
@app.route('/api/progress/<task_id>')
def get_progress(task_id):
    """Get progress for a processing task.

    Returns the tracker dict for task_id, enriching each completed result
    in place with absolute URLs for the annotated PDF, markdown file, and
    any extracted figure/table images found on disk.
    """
    # NOTE(review): the lock is held across the filesystem reads below, so a
    # slow disk blocks concurrent progress polls and worker updates — confirm
    # this is acceptable for expected load.
    with _progress_lock:
        progress = _progress_tracker.get(task_id)
        if not progress:
            return jsonify({'error': 'Task not found'}), 404
        # Get base URL for constructing full URLs
        base_url = request.host_url.rstrip('/')
        if 'hf.space' in base_url:
            # Force HTTPS for Hugging Face Spaces (TLS-terminating proxy)
            base_url = base_url.replace('http://', 'https://')
        # Add full URLs to results if they exist (mutates the shared dicts;
        # safe because we hold the lock and URLs are idempotent to rewrite)
        if 'results' in progress:
            for result in progress['results']:
                # Add full URL for annotated PDF
                if result.get('annotated_pdf'):
                    result['annotated_pdf_url'] = f"{base_url}/output/{result['annotated_pdf']}"
                # Add full URL for markdown
                if result.get('markdown_path'):
                    result['markdown_url'] = f"{base_url}/output/{result['markdown_path']}"
                # Add image URLs for figures and tables if available
                output_dir = Path(app.config['OUTPUT_FOLDER']) / result.get('stem', '')
                if output_dir.exists():
                    # Load content list to get figure and table image paths
                    json_files = list(output_dir.glob('*_content_list.json'))
                    if json_files:
                        try:
                            elements = json.loads(json_files[0].read_text(encoding='utf-8'))
                            figures = [e for e in elements if e.get('type') == 'figure']
                            tables = [e for e in elements if e.get('type') == 'table']
                            # Add figure URLs (only for crops that exist on disk)
                            figure_urls = []
                            for fig in figures:
                                if fig.get('image_path'):
                                    img_path = output_dir / fig['image_path']
                                    if img_path.exists():
                                        relative_path = str(img_path.relative_to(app.config['OUTPUT_FOLDER']))
                                        figure_urls.append({
                                            'page': fig.get('page', 0),
                                            'url': f"{base_url}/output/{relative_path}",
                                            'path': relative_path
                                        })
                            # Add table URLs (same filtering as figures)
                            table_urls = []
                            for tab in tables:
                                if tab.get('image_path'):
                                    img_path = output_dir / tab['image_path']
                                    if img_path.exists():
                                        relative_path = str(img_path.relative_to(app.config['OUTPUT_FOLDER']))
                                        table_urls.append({
                                            'page': tab.get('page', 0),
                                            'url': f"{base_url}/output/{relative_path}",
                                            'path': relative_path
                                        })
                            if figure_urls:
                                result['figure_urls'] = figure_urls
                            if table_urls:
                                result['table_urls'] = table_urls
                        except Exception as e:
                            # URL enrichment is best-effort; progress is still returned
                            logger.warning(f"Error loading image URLs for {result.get('stem')}: {e}")
        return jsonify(progress)
@app.route('/api/pdf-list')
def pdf_list():
    """Get list of processed PDFs."""
    root = Path(app.config['OUTPUT_FOLDER'])
    processed = []
    for entry in root.iterdir():
        if not entry.is_dir():
            continue
        # A directory counts as processed if it holds any recognizable output.
        has_content = (
            list(entry.glob('*_content_list.json'))
            or list(entry.glob('*.md'))
            or list(entry.glob('*.pdf'))
        )
        if has_content:
            processed.append({
                'stem': entry.name,
                'output_dir': str(entry.relative_to(app.config['OUTPUT_FOLDER'])),
            })
    return jsonify({'pdfs': processed})
@app.route('/api/pdf-details/<path:pdf_stem>')
def pdf_details(pdf_stem):
    """Get detailed information about a processed PDF.

    Returns figure/table metadata, extracted-image paths, and both absolute
    and relative download URLs for the annotated PDF and markdown output.
    """
    output_root = Path(app.config['OUTPUT_FOLDER'])
    output_dir = output_root / pdf_stem
    # Security fix: the <path:> converter accepts '..' segments, so verify the
    # resolved target stays inside OUTPUT_FOLDER (same guard as _delete_by_stem).
    resolved_root = output_root.resolve()
    resolved_dir = output_dir.resolve()
    if resolved_root not in resolved_dir.parents and resolved_dir != resolved_root:
        return jsonify({'error': 'Invalid stem path'}), 400
    if not output_dir.exists():
        return jsonify({'error': 'PDF not found'}), 404
    # Get base URL for constructing full URLs
    base_url = request.host_url.rstrip('/')
    if 'hf.space' in base_url:
        # Force HTTPS for Hugging Face Spaces
        base_url = base_url.replace('http://', 'https://')
    # Load the content list produced by the extractor
    json_files = list(output_dir.glob('*_content_list.json'))
    elements = []
    if json_files:
        elements = json.loads(json_files[0].read_text(encoding='utf-8'))
    # Partition elements into figures and tables
    figures = [e for e in elements if e.get('type') == 'figure']
    tables = [e for e in elements if e.get('type') == 'table']
    # Locate the annotated layout PDF, if produced
    annotated_pdf = None
    pdf_files = list(output_dir.glob('*_layout.pdf'))
    if pdf_files:
        annotated_pdf = str(pdf_files[0].relative_to(app.config['OUTPUT_FOLDER']))
    # Locate the markdown output, if produced
    markdown_path = None
    md_files = list(output_dir.glob('*.md'))
    if md_files:
        markdown_path = str(md_files[0].relative_to(app.config['OUTPUT_FOLDER']))
    # Collect extracted figure and table image crops
    figure_dir = output_dir / 'figures'
    table_dir = output_dir / 'tables'
    figure_images = []
    if figure_dir.exists():
        figure_images = [str(f.relative_to(app.config['OUTPUT_FOLDER']))
                         for f in sorted(figure_dir.glob('*.png'))]
    table_images = []
    if table_dir.exists():
        table_images = [str(t.relative_to(app.config['OUTPUT_FOLDER']))
                        for t in sorted(table_dir.glob('*.png'))]
    return jsonify({
        'stem': pdf_stem,
        'figures': figures,
        'tables': tables,
        'figures_count': len(figures),
        'tables_count': len(tables),
        'elements_count': len(elements),
        'annotated_pdf': annotated_pdf,
        'markdown_path': markdown_path,
        'figure_images': figure_images,
        'table_images': table_images,
        # Add full URLs for direct access
        'urls': {
            'annotated_pdf': f"{base_url}/output/{annotated_pdf}" if annotated_pdf else None,
            'markdown': f"{base_url}/output/{markdown_path}" if markdown_path else None,
            'figures': [f"{base_url}/output/{img}" for img in figure_images] if figure_images else [],
            'tables': [f"{base_url}/output/{img}" for img in table_images] if table_images else [],
        },
        # Keep relative paths for backward compatibility
        'download_urls': {
            'annotated_pdf': f"/output/{annotated_pdf}" if annotated_pdf else None,
            'markdown': f"/output/{markdown_path}" if markdown_path else None,
            'figures': [f"/output/{img}" for img in figure_images] if figure_images else [],
            'tables': [f"/output/{img}" for img in table_images] if table_images else [],
        }
    })
@app.route('/output/<path:filename>')
def output_file(filename):
    """Serve output files (PDFs, images, markdown).

    Resolves the requested path under OUTPUT_FOLDER, rejects anything that
    escapes it, and serves the file inline with a best-guess MIME type.
    """
    try:
        output_folder = Path(app.config['OUTPUT_FOLDER']).resolve()
        file_path = (output_folder / filename).resolve()
        # Security fix: the old str(...).startswith(...) check also accepted
        # sibling directories like 'output_evil'; compare path components.
        if file_path != output_folder and output_folder not in file_path.parents:
            return jsonify({'error': 'Invalid file path'}), 400
        # Check if file exists
        if not file_path.exists():
            return jsonify({
                'error': 'File not found',
                'requested_path': filename,
                'hint': 'Use /api/pdf-details/<stem> to get correct file paths'
            }), 404
        if not file_path.is_file():
            return jsonify({'error': 'Path is not a file'}), 400
        # Determine MIME type based on extension
        mime_types = {
            '.pdf': 'application/pdf',
            '.png': 'image/png',
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.md': 'text/markdown',
            '.json': 'application/json',
            '.txt': 'text/plain'
        }
        ext = file_path.suffix.lower()
        mimetype = mime_types.get(ext, 'application/octet-stream')
        return send_file(str(file_path), mimetype=mimetype, as_attachment=False)
    except Exception as e:
        # Fix: log message previously printed literal "(unknown)" instead of
        # the requested filename.
        logger.error(f"Error serving file {filename}: {e}")
        return jsonify({
            'error': 'Failed to serve file',
            'message': str(e)
        }), 500
def _delete_by_stem(stem_raw: str):
    """Delete the output directory for a stem, guarding against traversal.

    Returns a (json, status) Flask response tuple: 400 for a missing or
    escaping stem, 404 when no such directory exists, 200 on success.
    """
    stem = (stem_raw or "").strip()
    if not stem:
        return jsonify({'error': 'Missing stem'}), 400
    # Resolve both paths and require the target to live under the output root.
    output_root = Path(app.config['OUTPUT_FOLDER']).resolve()
    target_dir = (output_root / stem).resolve()
    inside_root = output_root in target_dir.parents or target_dir == output_root
    if not inside_root:
        return jsonify({'error': 'Invalid stem path'}), 400
    if not (target_dir.exists() and target_dir.is_dir()):
        return jsonify({'error': 'Not found'}), 404
    # Remove the whole directory tree; errors propagate to the caller.
    shutil.rmtree(target_dir, ignore_errors=False)
    logger.info(f"Deleted processed output: {target_dir}")
    return jsonify({'ok': True, 'deleted': stem})
@app.route('/api/delete', methods=['POST'])
def delete_pdf():
    """Delete a processed PDF directory by stem (JSON or form body)."""
    try:
        # Accept the stem from a JSON body first, then fall back to form data.
        payload = request.get_json(silent=True) or {}
        stem_value = (payload.get('stem') or request.form.get('stem') or '').strip()
        return _delete_by_stem(stem_value)
    except Exception as e:
        logger.error(f"Delete failed: {e}")
        return jsonify({'error': str(e)}), 500
@app.route('/api/delete/<path:stem>', methods=['POST', 'GET'])
def delete_pdf_by_path(stem: str):
    """Alternate endpoint to delete using URL path, for clients avoiding bodies."""
    try:
        return _delete_by_stem(stem)
    except Exception as exc:
        logger.error(f"Delete failed: {exc}")
        return jsonify({'error': str(exc)}), 500
@app.route('/api/download-zip/<path:stem>', methods=['GET'])
def download_zip(stem: str):
    """Download the processed output as a zip archive.

    Builds the archive entirely in memory and streams it as an attachment.
    """
    import io
    import zipfile
    stem = stem.strip()
    if not stem:
        return jsonify({'error': 'Missing stem'}), 400
    output_root = Path(app.config['OUTPUT_FOLDER']).resolve()
    target_dir = (output_root / stem).resolve()
    # Reject anything that resolves outside the output root (path traversal).
    if not (target_dir == output_root or output_root in target_dir.parents):
        return jsonify({'error': 'Invalid stem path'}), 400
    if not (target_dir.exists() and target_dir.is_dir()):
        return jsonify({'error': 'PDF not found or not processed completely'}), 404
    try:
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, 'w', zipfile.ZIP_DEFLATED) as archive:
            # Archive every file under the directory, keyed by relative path.
            for entry in sorted(target_dir.rglob('*')):
                if entry.is_file():
                    archive.write(entry, entry.relative_to(target_dir))
        buffer.seek(0)
        return send_file(
            buffer,
            mimetype='application/zip',
            as_attachment=True,
            download_name=f"{stem}_extracted.zip"
        )
    except Exception as e:
        logger.error(f"Zip creation failed: {e}")
        return jsonify({'error': str(e)}), 500
@app.route('/api/download-all/<task_id>', methods=['GET'])
def download_all(task_id: str):
    """Download all output directories for a task as a single ZIP archive.

    Looks up the task's result stems under the lock, then zips each stem's
    directory (namespaced as stem/filename inside the archive) in memory.
    """
    import io, zipfile
    # Snapshot the stems while holding the lock; file I/O happens afterwards.
    with _progress_lock:
        tracker = _progress_tracker.get(task_id)
        if not tracker:
            return jsonify({'error': 'Task not found'}), 404
        stems = [r.get('stem') for r in tracker.get('results', []) if r.get('stem')]
    if not stems:
        return jsonify({'error': 'No processed files found for this task'}), 404
    output_root = Path(app.config['OUTPUT_FOLDER']).resolve()
    memory_file = io.BytesIO()
    try:
        with zipfile.ZipFile(memory_file, 'w', zipfile.ZIP_DEFLATED) as zf:
            for stem in stems:
                target_dir = (output_root / stem).resolve()
                # Prevent path traversal: skip stems resolving outside the root
                if output_root not in target_dir.parents and target_dir != output_root:
                    continue
                if not target_dir.exists():
                    continue
                for root, _, files in os.walk(target_dir):
                    for file in files:
                        file_path = Path(root) / file
                        # Archive under stem/filename so files don't collide
                        arcname = Path(stem) / file_path.relative_to(target_dir)
                        zf.write(file_path, arcname)
        memory_file.seek(0)
        return send_file(
            memory_file,
            mimetype='application/zip',
            as_attachment=True,
            download_name='all_extracted.zip'
        )
    except Exception as e:
        logger.error(f"Download-all zip creation failed: {e}")
        return jsonify({'error': str(e)}), 500
if __name__ == '__main__':
    # Hugging Face Spaces expects port 7860; PORT env var overrides locally.
    listen_port = int(os.environ.get('PORT', 7860))
    app.run(debug=False, host='0.0.0.0', port=listen_port)