# nougat-transformers / scripts /batch_extract.sh
# alessandro trinca tornidor
# fix(batch_extract): quote curl file path to handle special characters
# 60df088
#!/usr/bin/env bash
# Batch-extract PDFs to markdown via local Nougat Gradio API.
#
# Usage:
# batch_extract.sh --input ~/papers/ --output ~/formulas/
# batch_extract.sh --input ~/papers/ --output ~/formulas/ --port 7860,7861,7862
# batch_extract.sh --input ~/papers/ --output ~/formulas/ --host 10.0.0.1 --force
#
# Requires: bash, curl, python3
# Shell options: expanding an unset variable is an error, and a pipeline
# reports failure if any of its stages fails. errexit is not enabled.
set -u
set -o pipefail

# --- Option defaults (replaced by the command-line parser below) ---
INPUT_DIR=''
OUTPUT_DIR=''
HOST='localhost'
PORTS='7860'
TIMEOUT=600
FORCE=false

# --- Run tallies, updated while the batch is processed ---
TOTAL=0 EXTRACTED=0 SKIPPED=0 FAILED=0
TOTAL_CHARS=0 TOTAL_DISPLAY_EQ=0 TOTAL_INLINE_EQ=0
FAILED_FILES=()
# Print the command-line help text on stdout, then terminate the script
# with exit status 0.
usage() {
  printf '%s\n' \
    'Usage: batch_extract.sh [OPTIONS]' \
    'Required:' \
    '--input DIR Directory containing PDF files' \
    '--output DIR Directory for markdown output files' \
    'Optional:' \
    '--port PORTS Comma-separated ports (default: 7860)' \
    'Multiple ports enable parallel extraction' \
    '--host HOST Server hostname/IP (default: localhost)' \
    '--force Re-extract even if output .md exists' \
    '--timeout SECS Max seconds per PDF (default: 600)' \
    '--help Show this help'
  exit 0
}
# --- Argument parsing ---
# Walk the command line left to right. Options that take a value consume two
# words; a missing value is reported explicitly instead of tripping `set -u`.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --input | --output | --port | --host | --timeout)
      if [[ $# -lt 2 ]]; then
        echo "Error: $1 requires a value" >&2
        exit 1
      fi
      case "$1" in
        --input) INPUT_DIR="$2" ;;
        --output) OUTPUT_DIR="$2" ;;
        --port) PORTS="$2" ;;
        --host) HOST="$2" ;;
        --timeout) TIMEOUT="$2" ;;
      esac
      shift 2
      ;;
    --force)
      FORCE=true
      shift
      ;;
    --help) usage ;;
    *)
      echo "Unknown argument: $1" >&2
      exit 1
      ;;
  esac
done

# --- Validation ---
if [[ -z "$INPUT_DIR" || -z "$OUTPUT_DIR" ]]; then
  echo "Error: --input and --output are required" >&2
  # usage() always terminates with status 0. Run it in a subshell so the help
  # text is still shown (on stderr) while this error path exits non-zero.
  (usage) >&2
  exit 1
fi
if [[ ! -d "$INPUT_DIR" ]]; then
  echo "Error: input directory does not exist: $INPUT_DIR" >&2
  exit 1
fi
# The value is handed to curl's --max-time, so accept integers and decimals.
timeout_re='^[0-9]+([.][0-9]+)?$'
if [[ ! "$TIMEOUT" =~ $timeout_re ]]; then
  echo "Error: --timeout must be a number of seconds: $TIMEOUT" >&2
  exit 1
fi
if ! mkdir -p -- "$OUTPUT_DIR"; then
  echo "Error: cannot create output directory: $OUTPUT_DIR" >&2
  exit 1
fi

# --- Split ports ---
IFS=',' read -ra PORT_ARRAY <<< "$PORTS"
NUM_PORTS=${#PORT_ARRAY[@]}
# An empty list would later be used as a modulus; reject it up front.
if [[ $NUM_PORTS -eq 0 ]]; then
  echo "Error: --port needs at least one port" >&2
  exit 1
fi
for port in "${PORT_ARRAY[@]}"; do
  # Each entry is interpolated into a URL, so it must be a bare number.
  if [[ ! "$port" =~ ^[0-9]+$ ]]; then
    echo "Error: invalid port in --port: '$port'" >&2
    exit 1
  fi
done
# --- Health check ---
# Probe the root URL of the server on the given port of $HOST.
#   $1 - TCP port to probe
# Returns 0 when curl succeeds; otherwise prints an error on stderr and
# returns 1.
check_server() {
  local endpoint
  endpoint="http://${HOST}:$1/"
  curl -sf --connect-timeout 5 -o /dev/null "$endpoint" && return 0
  echo "Error: Nougat not reachable at $endpoint" >&2
  return 1
}
# Probe every configured port before doing any work; abort the whole run on
# the first server that does not answer.
echo "Checking $NUM_PORTS server(s)..."
for port in "${PORT_ARRAY[@]}"; do
  check_server "$port" || exit 1
  echo " Port $port: OK"
done
# --- Core extraction ---
#######################################
# Upload one PDF to the Gradio API, run the "inference" endpoint on it and
# save the returned text as markdown.
# Globals:
#   HOST    (read) server host name
#   TIMEOUT (read) max seconds to wait for the result stream
# Arguments:
#   $1 - path of the PDF to upload
#   $2 - path of the markdown file to create
#   $3 - server port
# Outputs:
#   stdout: one line "chars,display,inline" (text length, number of $$..$$
#           blocks, number of $..$ spans) on success
#   stderr: a "FAIL ..." line naming the step that failed
# Returns:
#   0 on success, 1 on any failure. On failure an already existing output
#   file is left untouched.
#######################################
extract_pdf() {
  local pdf_path="$1"
  local output_path="$2"
  local port="$3"
  local base_url="http://${HOST}:${port}/gradio_api"
  local pdf_name
  pdf_name="$(basename "$pdf_path")"

  # curl -F: inside a double-quoted file name, every backslash and double
  # quote must itself be escaped with a backslash.
  local form_path="" ch i
  for ((i = 0; i < ${#pdf_path}; i++)); do
    ch="${pdf_path:i:1}"
    case "$ch" in
      '\' | '"') form_path+='\' ;;
    esac
    form_path+="$ch"
  done

  # Upload
  local upload
  upload=$(curl -sf --max-time 30 -X POST "$base_url/upload" \
    -F "files=@\"${form_path}\"") \
    || { echo "FAIL upload: $pdf_name" >&2; return 1; }

  # Build the request body with a JSON encoder rather than by string
  # interpolation, so quotes/backslashes in names cannot corrupt it.
  local payload
  payload=$(python3 -c '
import json, sys
server_path = json.load(sys.stdin)[0]
file_data = {
    "path": server_path,
    "orig_name": sys.argv[1],
    "meta": {"_type": "gradio.FileData"},
}
print(json.dumps({"data": [file_data, ""]}))
' "$pdf_name" <<< "$upload") \
    || { echo "FAIL parse upload: $pdf_name" >&2; return 1; }

  # Call inference
  local event
  event=$(curl -sf --max-time 30 -X POST "$base_url/call/inference" \
    -H "Content-Type: application/json" \
    -d "$payload") \
    || { echo "FAIL inference call: $pdf_name" >&2; return 1; }
  local event_id
  event_id=$(python3 -c \
    'import json, sys; print(json.load(sys.stdin)["event_id"])' <<< "$event") \
    || { echo "FAIL parse event_id: $pdf_name" >&2; return 1; }

  # Poll result
  local result
  result=$(curl -sf --max-time "$TIMEOUT" -N "$base_url/call/inference/$event_id") \
    || { echo "FAIL polling: $pdf_name (timeout=${TIMEOUT}s)" >&2; return 1; }

  # Parse the SSE stream. The text is written to a sibling ".part" file and
  # only renamed over the real output once parsing succeeded; the stats line
  # is the script's sole stdout, diagnostics stay on stderr.
  local partial="${output_path}.part"
  local stats
  stats=$(python3 -c '
import json, re, sys

stream = sys.stdin.buffer.read().decode("utf-8", "replace")
current_event = None
last_event = None
last_data = None
for line in stream.split("\n"):
    line = line.rstrip("\r")
    if line.startswith("event:"):
        current_event = line[len("event:"):].strip()
    elif line.startswith("data:"):
        last_event = current_event
        last_data = line[len("data:"):].strip()
if last_data is None:
    sys.exit("ERROR: no data lines")
if last_event == "error":
    sys.exit("ERROR: server reported an error: " + last_data)
data = json.loads(last_data, strict=False)
if not isinstance(data, list) or not data or not isinstance(data[0], str):
    sys.exit("ERROR: unexpected result payload")
text = data[0]
display = len(re.findall(r"\$\$(.+?)\$\$", text, re.DOTALL))
inline = len(re.findall(r"(?<!\$)\$(?!\$)(.+?)(?<!\$)\$(?!\$)", text))
with open(sys.argv[1], "w", encoding="utf-8") as out:
    out.write(text + "\n")
print(f"{len(text)},{display},{inline}")
' "$partial" <<< "$result") \
    || { echo "FAIL parse: $pdf_name" >&2; rm -f -- "$partial"; return 1; }

  mv -f -- "$partial" "$output_path" \
    || { echo "FAIL save: $pdf_name" >&2; rm -f -- "$partial"; return 1; }

  printf '%s\n' "$stats"
  return 0
}
# --- Build file list ---
# Collect the regular *.pdf files sitting directly in INPUT_DIR (no
# recursion), in sorted order, one array element per line of find output.
readarray -t PDF_FILES < <(
  find "$INPUT_DIR" -maxdepth 1 -type f -name '*.pdf' | sort
)
TOTAL=${#PDF_FILES[@]}

# Nothing to do is not an error.
if ((TOTAL == 0)); then
  printf '%s\n' "No PDF files found in $INPUT_DIR"
  exit 0
fi

printf '\n%s\n%s\n%s\n\n' \
  "Found $TOTAL PDF(s) in $INPUT_DIR" \
  "Output: $OUTPUT_DIR" \
  "Ports: ${PORTS} (${NUM_PORTS} instance(s))"
# --- Process PDFs ---
#######################################
# Extract a single PDF unless its markdown output already exists.
# Globals:
#   OUTPUT_DIR (read), FORCE (read), TOTAL (read, for progress display)
# Arguments:
#   $1 - 1-based position of the file in the batch
#   $2 - path of the PDF
#   $3 - server port to use
# Outputs:
#   stdout: exactly one machine-readable status line, one of
#             SKIPPED
#             FAILED:<pdf name>
#             OK:<chars>:<display eqs>:<inline eqs>
#   stderr: human-readable progress. Callers capture stdout, so progress
#           must not share that stream or it would never reach the user.
# Returns:
#   0 when skipped or extracted, 1 when extraction failed.
#######################################
process_pdf() {
  local idx="$1"
  local pdf="$2"
  local port="$3"
  local pdf_name
  pdf_name="$(basename "$pdf")"
  local output_path="$OUTPUT_DIR/${pdf_name%.pdf}.md"

  # Skip check
  if [[ "$FORCE" != true && -f "$output_path" ]]; then
    echo "[$idx/$TOTAL] SKIP $pdf_name (exists)" >&2
    echo "SKIPPED"
    return 0
  fi

  echo "[$idx/$TOTAL] Extracting $pdf_name (port $port)..." >&2
  local stats
  if ! stats=$(extract_pdf "$pdf" "$output_path" "$port"); then
    echo "[$idx/$TOTAL] FAILED $pdf_name" >&2
    echo "FAILED:$pdf_name"
    return 1
  fi

  # Only the last line is the "chars,display,inline" record. Anything that
  # is not a plain number is replaced by 0 because callers feed these fields
  # into shell arithmetic.
  local chars display inline
  IFS=',' read -r chars display inline <<< "${stats##*$'\n'}"
  local numeric='^[0-9]+$'
  [[ "$chars" =~ $numeric ]] || chars=0
  [[ "$display" =~ $numeric ]] || display=0
  [[ "$inline" =~ $numeric ]] || inline=0

  echo "[$idx/$TOTAL] OK $pdf_name: ${chars} chars, ${display} display + ${inline} inline eqs" >&2
  echo "OK:${chars}:${display}:${inline}"
  return 0
}
START_TIME=$SECONDS

#######################################
# Fold one status line produced by process_pdf into the run counters.
# Globals:
#   SKIPPED, EXTRACTED, FAILED, TOTAL_CHARS, TOTAL_DISPLAY_EQ,
#   TOTAL_INLINE_EQ, FAILED_FILES (all written)
# Arguments:
#   $1 - status line: SKIPPED | FAILED:<name> | OK:<chars>:<display>:<inline>
#   $2 - PDF file name, recorded as failed when $1 is empty or unrecognised
#######################################
tally_status() {
  local status="$1"
  local fallback_name="$2"
  local chars display inline
  case "$status" in
    SKIPPED)
      SKIPPED=$((SKIPPED + 1))
      ;;
    OK:*)
      IFS=':' read -r _ chars display inline <<< "$status"
      EXTRACTED=$((EXTRACTED + 1))
      TOTAL_CHARS=$((TOTAL_CHARS + ${chars:-0}))
      TOTAL_DISPLAY_EQ=$((TOTAL_DISPLAY_EQ + ${display:-0}))
      TOTAL_INLINE_EQ=$((TOTAL_INLINE_EQ + ${inline:-0}))
      ;;
    FAILED:*)
      FAILED=$((FAILED + 1))
      FAILED_FILES+=("${status#FAILED:}")
      ;;
    *)
      # No usable status (worker died, result file missing, ...): the file
      # still has to show up in the totals and in the failure list.
      FAILED=$((FAILED + 1))
      FAILED_FILES+=("$fallback_name")
      ;;
  esac
}

if [[ $NUM_PORTS -eq 1 ]]; then
  # Sequential mode
  for i in "${!PDF_FILES[@]}"; do
    idx=$((i + 1))
    result=$(process_pdf "$idx" "${PDF_FILES[$i]}" "${PORT_ARRAY[0]}")
    # The status record is the last line of process_pdf's stdout.
    tally_status "${result##*$'\n'}" "$(basename "${PDF_FILES[$i]}")"
  done
else
  # Parallel mode: file i is handled by port (i % NUM_PORTS). One background
  # worker per port walks its own share of the list sequentially, so a port
  # never receives a second request while it is still busy with the first.
  TMPDIR_RESULTS=$(mktemp -d) || {
    echo "Error: cannot create temporary directory" >&2
    exit 1
  }
  trap 'rm -rf "$TMPDIR_RESULTS"' EXIT
  for port_idx in "${!PORT_ARRAY[@]}"; do
    (
      for ((i = port_idx; i < TOTAL; i += NUM_PORTS)); do
        idx=$((i + 1))
        result=$(process_pdf "$idx" "${PDF_FILES[$i]}" "${PORT_ARRAY[$port_idx]}")
        printf '%s\n' "${result##*$'\n'}" > "$TMPDIR_RESULTS/$idx.result"
      done
    ) &
  done
  wait

  # Collect results (counters cannot be updated from the subshells).
  for i in "${!PDF_FILES[@]}"; do
    idx=$((i + 1))
    status=""
    if [[ -f "$TMPDIR_RESULTS/$idx.result" ]]; then
      status=$(< "$TMPDIR_RESULTS/$idx.result")
    fi
    tally_status "$status" "$(basename "${PDF_FILES[$i]}")"
  done
fi
ELAPSED=$((SECONDS - START_TIME))
# --- Summary ---
# Final report: totals for the run, followed by the failed files if any.
echo ""
echo "=============================="
echo "BATCH EXTRACTION COMPLETE"
echo "=============================="
echo "Total PDFs: $TOTAL"
echo "Extracted: $EXTRACTED"
echo "Skipped: $SKIPPED"
echo "Failed: $FAILED"
echo "Total chars: $TOTAL_CHARS"
echo "Display equations: $TOTAL_DISPLAY_EQ"
echo "Inline equations: $TOTAL_INLINE_EQ"
echo "Elapsed: ${ELAPSED}s"
if [[ $FAILED -gt 0 ]]; then
  echo ""
  echo "Failed files:"
  # FAILED can be non-zero while FAILED_FILES is still empty. The guarded
  # expansion keeps `set -u` from aborting on an empty array (bash < 4.4).
  for f in ${FAILED_FILES[@]+"${FAILED_FILES[@]}"}; do
    echo " - $f"
  done
  # Failures that were counted without a recorded name are reported too,
  # so the list always accounts for the "Failed:" total above.
  unnamed=$((FAILED - ${#FAILED_FILES[@]}))
  if ((unnamed > 0)); then
    echo " - ($unnamed file(s) produced no result)"
  fi
fi