Spaces:

aletrn
/

nougat-transformers

Paused

nougat-transformers / scripts /batch_extract.sh

alessandro trinca tornidor

fix(batch_extract): quote curl file path to handle special characters

60df088 about 1 month ago

8.71 kB

	#!/usr/bin/env bash
	# Batch-extract PDFs to markdown via local Nougat Gradio API.
	#
	# Usage:
	# batch_extract.sh --input ~/papers/ --output ~/formulas/
	# batch_extract.sh --input ~/papers/ --output ~/formulas/ --port 7860,7861,7862
	# batch_extract.sh --input ~/papers/ --output ~/formulas/ --host 10.0.0.1 --force
	#
	# Requires: bash, curl, python3
	set -u -o pipefail
# nounset + pipefail only; errexit (-e) is not enabled in this script.

# --- Defaults (each one is overwritten by the matching command-line option) ---
INPUT_DIR=''     # --input
OUTPUT_DIR=''    # --output
PORTS='7860'     # --port, comma-separated list
HOST='localhost' # --host
FORCE=false      # --force
TIMEOUT=600      # --timeout, seconds

# --- Counters (filled while PDFs are processed, printed in the summary) ---
TOTAL=0 EXTRACTED=0 SKIPPED=0 FAILED=0
TOTAL_CHARS=0 TOTAL_DISPLAY_EQ=0 TOTAL_INLINE_EQ=0
FAILED_FILES=()

	# Print the command-line help on stdout and exit with status 0.
usage() {
  # The terminator of a plain <<'USAGE' heredoc must start in column 0,
  # otherwise the heredoc runs on to the end of the file.
  cat <<'USAGE'
Usage: batch_extract.sh [OPTIONS]

Required:
  --input DIR      Directory containing PDF files
  --output DIR     Directory for markdown output files

Optional:
  --port PORTS     Comma-separated ports (default: 7860)
                   Multiple ports enable parallel extraction
  --host HOST      Server hostname/IP (default: localhost)
  --force          Re-extract even if output .md exists
  --timeout SECS   Max seconds per PDF (default: 600)
  --help           Show this help
USAGE
  exit 0
}

	# --- Argument parsing ---
# Fills INPUT_DIR, OUTPUT_DIR, PORTS, HOST, FORCE and TIMEOUT from "$@".
while [[ $# -gt 0 ]]; do
  # Options that take a value: make sure a second word exists before "$2" is
  # read, so a trailing "--input" gives a clear message instead of an
  # "unbound variable" abort from `set -u`.
  case "$1" in
    --input | --output | --port | --host | --timeout)
      if [[ $# -lt 2 ]]; then
        echo "Error: $1 requires a value" >&2
        exit 1
      fi
      ;;
  esac

  case "$1" in
    --input) INPUT_DIR="$2"; shift 2 ;;
    --output) OUTPUT_DIR="$2"; shift 2 ;;
    --port) PORTS="$2"; shift 2 ;;
    --host) HOST="$2"; shift 2 ;;
    --force) FORCE=true; shift ;;
    --timeout) TIMEOUT="$2"; shift 2 ;;
    --help) usage ;;
    *) echo "Unknown argument: $1" >&2; exit 1 ;;
  esac
done

if [[ -z "$INPUT_DIR" || -z "$OUTPUT_DIR" ]]; then
  echo "Error: --input and --output are required" >&2
  # usage ends with `exit 0`; running it in a subshell sends the help text to
  # stderr while this error path still exits non-zero.
  (usage) >&2
  exit 1
fi

if [[ ! -d "$INPUT_DIR" ]]; then
  echo "Error: input directory does not exist: $INPUT_DIR" >&2
  exit 1
fi

if ! mkdir -p -- "$OUTPUT_DIR"; then
  echo "Error: cannot create output directory: $OUTPUT_DIR" >&2
  exit 1
fi

	# --- Split ports ---
# Turn the comma-separated PORTS value into PORT_ARRAY and NUM_PORTS.
IFS=',' read -ra PORT_ARRAY <<< "$PORTS"
NUM_PORTS=${#PORT_ARRAY[@]}

# An empty list would leave no server to send work to, so stop here.
if [[ $NUM_PORTS -eq 0 ]]; then
  echo "Error: --port needs at least one port" >&2
  exit 1
fi

# Reject empty entries ("7860,,7861") and anything that is not a number.
for _port in "${PORT_ARRAY[@]}"; do
  if [[ ! "$_port" =~ ^[0-9]+$ ]]; then
    echo "Error: invalid port in --port: '$_port'" >&2
    exit 1
  fi
done
unset _port

	# --- Health check ---
#######################################
# Probe the root URL of one server instance.
# Globals:   HOST (read)
# Arguments: $1 - port to probe
# Outputs:   an error line on stderr when the request fails
# Returns:   0 when curl succeeds, 1 otherwise
#######################################
check_server() {
  local target
  target="http://${HOST}:${1}/"

  curl -sf --connect-timeout 5 -o /dev/null "$target" && return 0

  echo "Error: Nougat not reachable at $target" >&2
  return 1
}

	echo "Checking $NUM_PORTS server(s)..."
# Probe every configured port in order; the first unreachable one ends the run.
for port in "${PORT_ARRAY[@]}"; do
  check_server "$port" || exit 1
  echo " Port $port: OK"
done

	# --- Core extraction ---
#######################################
# Send one PDF through the Gradio API (upload, start inference, read the
# event stream) and save the returned text as markdown.
# Globals:   HOST (read), TIMEOUT (read)
# Arguments: $1 - path of the PDF to upload
#            $2 - path of the markdown file to write
#            $3 - port of the server instance to use
# Outputs:   stdout: one line "chars,display,inline" on success
#            stderr: "FAIL <stage>: <name>" plus any Python diagnostics
# Returns:   0 on success, 1 on failure. The markdown is written to
#            "<output>.part" first and renamed only after parsing succeeded.
#######################################
extract_pdf() {
  local pdf_path="$1"
  local output_path="$2"
  local port="$3"
  local base_url="http://${HOST}:${port}/gradio_api"
  local tmp_out="${output_path}.part"
  local pdf_name
  pdf_name="$(basename "$pdf_path")"

  # Inside a double-quoted -F file name curl requires backslashes and double
  # quotes to be escaped with a backslash.
  local bs='\' dq='"'
  local form_path
  form_path=${pdf_path//"$bs"/"$bs$bs"}
  form_path=${form_path//"$dq"/"$bs$dq"}

  # Upload
  local upload
  upload=$(curl -sf --max-time 30 -X POST "$base_url/upload" \
    -F "files=@\"${form_path}\"") \
    || { echo "FAIL upload: $pdf_name" >&2; return 1; }

  # Read the server-side path from the upload reply and build the inference
  # request with json.dumps, so quotes or backslashes in names stay valid JSON.
  local payload
  payload=$(python3 -c '
import json
import sys

server_path = json.load(sys.stdin)[0]
print(json.dumps({
    "data": [
        {"path": server_path, "orig_name": sys.argv[1], "meta": {"_type": "gradio.FileData"}},
        "",
    ]
}))
' "$pdf_name" <<< "$upload") \
    || { echo "FAIL parse upload: $pdf_name" >&2; return 1; }

  # Call inference
  local event
  event=$(curl -sf --max-time 30 -X POST "$base_url/call/inference" \
    -H "Content-Type: application/json" \
    -d "$payload") \
    || { echo "FAIL inference call: $pdf_name" >&2; return 1; }

  local event_id
  event_id=$(python3 -c 'import sys, json; print(json.load(sys.stdin)["event_id"])' <<< "$event") \
    || { echo "FAIL parse event_id: $pdf_name" >&2; return 1; }

  # Poll result
  local result
  result=$(curl -sf --max-time "$TIMEOUT" -N "$base_url/call/inference/$event_id") \
    || { echo "FAIL polling: $pdf_name (timeout=${TIMEOUT}s)" >&2; return 1; }

  # Parse the event stream: the last "data:" line carries the result. The text
  # goes to the file named in argv[1]; the stats line is the only stdout output,
  # so warnings or tracebacks on stderr can no longer end up in the stats.
  local stats
  stats=$(python3 -c '
import json
import re
import sys

data_lines = [ln for ln in sys.stdin.read().strip().split("\n") if ln.startswith("data:")]
if not data_lines:
    print("ERROR: no data lines", file=sys.stderr)
    sys.exit(1)
payload = json.loads(data_lines[-1][len("data:"):].strip(), strict=False)
if not isinstance(payload, list) or not payload or not isinstance(payload[0], str):
    print("ERROR: final event carries no text", file=sys.stderr)
    sys.exit(1)
text = payload[0]

display = len(re.findall(r"\$\$(.+?)\$\$", text, re.DOTALL))
inline = len(re.findall(r"(?<!\$)\$(?!\$)(.+?)(?<!\$)\$(?!\$)", text))

with open(sys.argv[1], "w", encoding="utf-8") as out:
    out.write(text + "\n")
print(f"{len(text)},{display},{inline}")
' "$tmp_out" <<< "$result") || {
    echo "FAIL parse: $pdf_name" >&2
    rm -f -- "$tmp_out"
    return 1
  }

  mv -f -- "$tmp_out" "$output_path" || {
    echo "FAIL write: $pdf_name" >&2
    rm -f -- "$tmp_out"
    return 1
  }

  printf '%s\n' "$stats"
  return 0
}

	# --- Build file list ---
# Collect the *.pdf files directly inside INPUT_DIR (no recursion), sorted.
# find and sort exchange NUL-terminated names, so a file name containing a
# newline stays one entry.
PDF_FILES=()
while IFS= read -r -d '' _pdf; do
  PDF_FILES+=("$_pdf")
done < <(find "$INPUT_DIR" -maxdepth 1 -name '*.pdf' -type f -print0 | sort -z)
unset _pdf
TOTAL=${#PDF_FILES[@]}

if [[ $TOTAL -eq 0 ]]; then
  echo "No PDF files found in $INPUT_DIR"
  exit 0
fi

echo ""
echo "Found $TOTAL PDF(s) in $INPUT_DIR"
echo "Output: $OUTPUT_DIR"
echo "Ports: ${PORTS} (${NUM_PORTS} instance(s))"
echo ""

	# --- Process PDFs ---
#######################################
# Extract one PDF unless its markdown already exists, and report the outcome.
# Globals:   OUTPUT_DIR, FORCE, TOTAL (read)
# Arguments: $1 - 1-based position of the PDF, shown in progress messages
#            $2 - path of the PDF
#            $3 - port handed on to extract_pdf
# Outputs:   stdout: exactly one status line, one of
#                      SKIPPED
#                      FAILED:<pdf name>
#                      OK:<chars>:<display>:<inline>
#            stderr: progress messages. Callers capture stdout to read the
#                    status, so progress printed there would never be shown.
# Returns:   0 when skipped or extracted, 1 when extraction failed
#######################################
process_pdf() {
  local idx="$1"
  local pdf="$2"
  local port="$3"
  local pdf_name
  pdf_name="$(basename "$pdf")"
  local md_name="${pdf_name%.pdf}.md"
  local output_path="$OUTPUT_DIR/$md_name"

  # Skip check
  if [[ "$FORCE" != true && -f "$output_path" ]]; then
    echo "[$idx/$TOTAL] SKIP $pdf_name (exists)" >&2
    echo "SKIPPED"
    return 0
  fi

  echo "[$idx/$TOTAL] Extracting $pdf_name (port $port)..." >&2
  local stats
  if ! stats=$(extract_pdf "$pdf" "$output_path" "$port"); then
    echo "[$idx/$TOTAL] FAILED $pdf_name" >&2
    echo "FAILED:$pdf_name"
    return 1
  fi

  # extract_pdf reports "chars,display,inline"
  local chars display inline
  IFS=',' read -r chars display inline <<< "$stats"
  echo "[$idx/$TOTAL] OK $pdf_name — ${chars} chars, ${display} display + ${inline} inline eqs" >&2
  echo "OK:${chars}:${display}:${inline}"
  return 0
}

	START_TIME=$SECONDS

#######################################
# Fold one status line from process_pdf into the run counters.
# Globals:   SKIPPED, EXTRACTED, FAILED, TOTAL_CHARS, TOTAL_DISPLAY_EQ,
#            TOTAL_INLINE_EQ, FAILED_FILES (updated)
# Arguments: $1 - status line: SKIPPED, FAILED:<name> or
#                 OK:<chars>:<display>:<inline>
#            $2 - PDF file name; recorded as failed when $1 is anything else
#                 (for example empty because no result was produced)
#######################################
record_status() {
  local status="$1"
  local name="$2"
  local chars display inline

  case "$status" in
    SKIPPED)
      SKIPPED=$((SKIPPED + 1))
      ;;
    OK:*)
      IFS=':' read -r _ chars display inline <<< "$status"
      EXTRACTED=$((EXTRACTED + 1))
      TOTAL_CHARS=$((TOTAL_CHARS + ${chars:-0}))
      TOTAL_DISPLAY_EQ=$((TOTAL_DISPLAY_EQ + ${display:-0}))
      TOTAL_INLINE_EQ=$((TOTAL_INLINE_EQ + ${inline:-0}))
      ;;
    FAILED:*)
      FAILED=$((FAILED + 1))
      FAILED_FILES+=("${status#FAILED:}")
      ;;
    *)
      FAILED=$((FAILED + 1))
      FAILED_FILES+=("$name")
      ;;
  esac
}

if ((NUM_PORTS == 1)); then
  # Sequential mode
  for i in "${!PDF_FILES[@]}"; do
    result=$(process_pdf "$((i + 1))" "${PDF_FILES[$i]}" "${PORT_ARRAY[0]}")
    record_status "$(tail -n 1 <<< "$result")" "${PDF_FILES[$i]##*/}"
  done
else
  # Parallel mode: one background worker per port. Worker w handles files
  # w, w+NUM_PORTS, w+2*NUM_PORTS, ... one after another, so file i still goes
  # to port i % NUM_PORTS but a port never has two requests in flight.
  TMPDIR_RESULTS=$(mktemp -d) || {
    echo "Error: cannot create temporary directory" >&2
    exit 1
  }
  trap 'rm -rf -- "$TMPDIR_RESULTS"' EXIT

  for ((w = 0; w < NUM_PORTS; w++)); do
    (
      for ((j = w; j < ${#PDF_FILES[@]}; j += NUM_PORTS)); do
        result=$(process_pdf "$((j + 1))" "${PDF_FILES[$j]}" "${PORT_ARRAY[$w]}")
        tail -n 1 <<< "$result" > "$TMPDIR_RESULTS/$((j + 1)).result"
      done
    ) &
  done
  wait

  # Collect results; a missing result file is counted as a failure of that PDF.
  for i in "${!PDF_FILES[@]}"; do
    result_file="$TMPDIR_RESULTS/$((i + 1)).result"
    status=""
    if [[ -f "$result_file" ]]; then
      status=$(< "$result_file")
    fi
    record_status "$status" "${PDF_FILES[$i]##*/}"
  done
fi

ELAPSED=$((SECONDS - START_TIME))

	# --- Summary ---
#######################################
# Print the end-of-run report.
# Globals:   TOTAL, EXTRACTED, SKIPPED, FAILED, TOTAL_CHARS, TOTAL_DISPLAY_EQ,
#            TOTAL_INLINE_EQ, ELAPSED, FAILED_FILES (read)
# Outputs:   the report on stdout
#######################################
print_summary() {
  echo ""
  echo "=============================="
  echo "BATCH EXTRACTION COMPLETE"
  echo "=============================="
  echo "Total PDFs: $TOTAL"
  echo "Extracted: $EXTRACTED"
  echo "Skipped: $SKIPPED"
  echo "Failed: $FAILED"
  echo "Total chars: $TOTAL_CHARS"
  echo "Display equations: $TOTAL_DISPLAY_EQ"
  echo "Inline equations: $TOTAL_INLINE_EQ"
  echo "Elapsed: ${ELAPSED}s"

  # Gate on the list itself rather than on FAILED: expanding an empty array
  # under `set -u` aborts on bash older than 4.4.
  if [[ ${#FAILED_FILES[@]} -gt 0 ]]; then
    local f
    echo ""
    echo "Failed files:"
    for f in "${FAILED_FILES[@]}"; do
      echo " - $f"
    done
  fi
}

print_summary