#!/usr/bin/env bash
#
# Batch-extract PDFs to markdown via a local Nougat Gradio API.
#
# Usage:
#   batch_extract.sh --input ~/papers/ --output ~/formulas/
#   batch_extract.sh --input ~/papers/ --output ~/formulas/ --port 7860,7861,7862
#   batch_extract.sh --input ~/papers/ --output ~/formulas/ --host 10.0.0.1 --force
#
# Requires: bash, curl, python3
#
# NOTE: -e is deliberately omitted — one failed PDF must not abort the
# batch; every critical command is checked explicitly instead.
set -uo pipefail

# --- Defaults ---
INPUT_DIR=""
OUTPUT_DIR=""
PORTS="7860"
HOST="localhost"
FORCE=false
TIMEOUT=600

# --- Counters (aggregated for the final summary) ---
TOTAL=0
EXTRACTED=0
SKIPPED=0
FAILED=0
TOTAL_CHARS=0
TOTAL_DISPLAY_EQ=0
TOTAL_INLINE_EQ=0
FAILED_FILES=()

# Print usage and exit with the given status (default 0).
# Bug fix: previously always exited 0, even when invoked on the
# missing-required-arguments error path.
usage() {
  cat <<'USAGE'
Usage: batch_extract.sh [OPTIONS]

Required:
  --input DIR      Directory containing PDF files
  --output DIR     Directory for markdown output files

Optional:
  --port PORTS     Comma-separated ports (default: 7860)
                   Multiple ports enable parallel extraction
  --host HOST      Server hostname/IP (default: localhost)
  --force          Re-extract even if output .md exists
  --timeout SECS   Max seconds per PDF (default: 600)
  --help           Show this help
USAGE
  exit "${1:-0}"
}

# --- Argument parsing ---
while [[ $# -gt 0 ]]; do
  case "$1" in
    --input)   INPUT_DIR="$2";  shift 2 ;;
    --output)  OUTPUT_DIR="$2"; shift 2 ;;
    --port)    PORTS="$2";      shift 2 ;;
    --host)    HOST="$2";       shift 2 ;;
    --force)   FORCE=true;      shift ;;
    --timeout) TIMEOUT="$2";    shift 2 ;;
    --help)    usage 0 ;;
    *) echo "Unknown argument: $1" >&2; exit 1 ;;
  esac
done

if [[ -z "$INPUT_DIR" || -z "$OUTPUT_DIR" ]]; then
  echo "Error: --input and --output are required" >&2
  usage 1 >&2
fi

if [[ ! -d "$INPUT_DIR" ]]; then
  echo "Error: input directory does not exist: $INPUT_DIR" >&2
  exit 1
fi

mkdir -p -- "$OUTPUT_DIR" || exit 1

# --- Split ports ---
IFS=',' read -ra PORT_ARRAY <<< "$PORTS"
NUM_PORTS=${#PORT_ARRAY[@]}

#######################################
# Health check: verify the Gradio root URL answers.
# Arguments: $1 - port number
# Returns:   0 if reachable, 1 otherwise (message on stderr)
#######################################
check_server() {
  local port="$1"
  local url="http://${HOST}:${port}/"
  if ! curl -sf --connect-timeout 5 -o /dev/null "$url"; then
    echo "Error: Nougat not reachable at $url" >&2
    return 1
  fi
}

echo "Checking $NUM_PORTS server(s)..."
for port in "${PORT_ARRAY[@]}"; do
  check_server "$port" || exit 1
  echo "  Port $port: OK"
done

#######################################
# Upload one PDF, run Nougat inference, save the markdown result.
# Arguments: $1 - path to the PDF
#            $2 - destination .md path
#            $3 - server port
# Outputs:   "chars,display_eq,inline_eq" on stdout; errors on stderr
# Returns:   0 on success, 1 on any failure (partial output removed)
#######################################
extract_pdf() {
  local pdf_path="$1"
  local output_path="$2"
  local port="$3"
  local base_url="http://${HOST}:${port}/gradio_api"
  local pdf_name
  pdf_name="$(basename "$pdf_path")"

  # Upload. The quoted @"..." form is curl's syntax for filenames that
  # contain commas/semicolons.
  local upload
  upload=$(curl -sf --max-time 30 -X POST "$base_url/upload" \
    -F "files=@\"$pdf_path\"") \
    || { echo "FAIL upload: $pdf_name" >&2; return 1; }

  local fpath
  fpath=$(printf '%s' "$upload" \
    | python3 -c 'import sys, json; print(json.load(sys.stdin)[0])') \
    || { echo "FAIL parse upload: $pdf_name" >&2; return 1; }

  # Build the request body with json.dumps so special characters in the
  # server-side path or the filename cannot break the JSON (the previous
  # version interpolated shell variables straight into the body).
  local payload
  payload=$(python3 -c '
import json, sys
print(json.dumps({"data": [
    {"path": sys.argv[1], "orig_name": sys.argv[2],
     "meta": {"_type": "gradio.FileData"}},
    "",
]}))' "$fpath" "$pdf_name") \
    || { echo "FAIL build payload: $pdf_name" >&2; return 1; }

  # Call inference; the response carries an event id for the SSE stream.
  local event
  event=$(curl -sf --max-time 30 -X POST "$base_url/call/inference" \
    -H "Content-Type: application/json" \
    -d "$payload") \
    || { echo "FAIL inference call: $pdf_name" >&2; return 1; }

  local event_id
  event_id=$(printf '%s' "$event" \
    | python3 -c 'import sys, json; print(json.load(sys.stdin)["event_id"])') \
    || { echo "FAIL parse event_id: $pdf_name" >&2; return 1; }

  # Poll the SSE result stream (blocks until done or timeout).
  local result
  result=$(curl -sf --max-time "$TIMEOUT" -N "$base_url/call/inference/$event_id") \
    || { echo "FAIL polling: $pdf_name (timeout=${TIMEOUT}s)" >&2; return 1; }

  # Parse SSE: markdown text -> stdout -> $output_path,
  # "chars,display,inline" stats line -> stderr -> .stats sidecar file.
  printf '%s' "$result" | python3 -c '
import sys, json, re
lines = sys.stdin.read().strip().split("\n")
data_lines = [l for l in lines if l.startswith("data:")]
if not data_lines:
    print("ERROR: no data lines", file=sys.stderr)
    sys.exit(1)
last = data_lines[-1].replace("data: ", "", 1)
data = json.loads(last, strict=False)
text = data[0]
# Count $$...$$ display equations first, then strip them so the inline
# pass cannot double-count their delimiters as $...$ pairs.
display = len(re.findall(r"\$\$.+?\$\$", text, re.DOTALL))
stripped = re.sub(r"\$\$.+?\$\$", "", text, flags=re.DOTALL)
inline = len(re.findall(r"\$[^$\n]+?\$", stripped))
sys.stdout.write(text)
print(f"{len(text)},{display},{inline}", file=sys.stderr)
' > "$output_path" 2>"${output_path}.stats"
  local exit_code=$?

  if [[ $exit_code -ne 0 ]]; then
    echo "FAIL parse: $pdf_name" >&2
    rm -f -- "$output_path" "${output_path}.stats"
    return 1
  fi

  # Hand the stats CSV line to the caller via stdout.
  local stats
  stats=$(cat "${output_path}.stats")
  rm -f -- "${output_path}.stats"
  echo "$stats"
  return 0
}

# --- Build file list (top level of input dir only) ---
mapfile -t PDF_FILES < <(find "$INPUT_DIR" -maxdepth 1 -type f -name '*.pdf' | sort)
TOTAL=${#PDF_FILES[@]}

if [[ $TOTAL -eq 0 ]]; then
  echo "No PDF files found in $INPUT_DIR"
  exit 0
fi

echo ""
echo "Found $TOTAL PDF(s) in $INPUT_DIR"
echo "Output: $OUTPUT_DIR"
echo "Ports:  ${PORTS} (${NUM_PORTS} instance(s))"
echo ""

#######################################
# Process one PDF.
# Human-readable progress goes to stderr so it stays visible even when
# the caller command-substitutes this function (previously the progress
# lines were captured and silently discarded). stdout carries exactly
# one machine-readable status token:
#   SKIPPED | FAILED:<name> | OK:<chars>:<display>:<inline>
# Arguments: $1 - 1-based index, $2 - pdf path, $3 - port
#######################################
process_pdf() {
  local idx="$1"
  local pdf="$2"
  local port="$3"
  local pdf_name
  pdf_name="$(basename "$pdf")"
  local md_name="${pdf_name%.pdf}.md"
  local output_path="$OUTPUT_DIR/$md_name"

  # Skip already-extracted files unless --force was given.
  if [[ "$FORCE" != true && -f "$output_path" ]]; then
    echo "[$idx/$TOTAL] SKIP $pdf_name (exists)" >&2
    echo "SKIPPED"
    return 0
  fi

  echo "[$idx/$TOTAL] Extracting $pdf_name (port $port)..." >&2

  local stats
  stats=$(extract_pdf "$pdf" "$output_path" "$port")
  local rc=$?

  if [[ $rc -ne 0 ]]; then
    echo "[$idx/$TOTAL] FAILED $pdf_name" >&2
    echo "FAILED:$pdf_name"
    return 1
  fi

  local chars display inline
  IFS=',' read -r chars display inline <<< "$stats"
  echo "[$idx/$TOTAL] OK $pdf_name — ${chars} chars, ${display} display + ${inline} inline eqs" >&2
  echo "OK:${chars}:${display}:${inline}"
  return 0
}

#######################################
# Fold one status token into the global counters.
# (Deduplicates the tally logic that was copy-pasted in both the
# sequential and parallel collection paths.)
# Globals:   EXTRACTED, SKIPPED, FAILED, FAILED_FILES,
#            TOTAL_CHARS, TOTAL_DISPLAY_EQ, TOTAL_INLINE_EQ (written)
# Arguments: $1 - status token from process_pdf
#######################################
tally_status() {
  local status="$1"
  case "$status" in
    SKIPPED)
      SKIPPED=$((SKIPPED + 1))
      ;;
    FAILED:*)
      FAILED=$((FAILED + 1))
      FAILED_FILES+=("${status#FAILED:}")
      ;;
    OK:*)
      EXTRACTED=$((EXTRACTED + 1))
      local chars display inline
      IFS=':' read -r _ chars display inline <<< "$status"
      TOTAL_CHARS=$((TOTAL_CHARS + chars))
      TOTAL_DISPLAY_EQ=$((TOTAL_DISPLAY_EQ + display))
      TOTAL_INLINE_EQ=$((TOTAL_INLINE_EQ + inline))
      ;;
  esac
}

START_TIME=$SECONDS

if [[ $NUM_PORTS -eq 1 ]]; then
  # Sequential mode: process_pdf's stdout is the single status token.
  for i in "${!PDF_FILES[@]}"; do
    idx=$((i + 1))
    status=$(process_pdf "$idx" "${PDF_FILES[$i]}" "${PORT_ARRAY[0]}")
    tally_status "$status"
  done
else
  # Parallel mode: round-robin across ports, at most NUM_PORTS jobs in
  # flight. Workers write their status token to a per-index result file.
  TMPDIR_RESULTS=$(mktemp -d)
  trap 'rm -rf "$TMPDIR_RESULTS"' EXIT

  running=0
  for i in "${!PDF_FILES[@]}"; do
    idx=$((i + 1))
    port="${PORT_ARRAY[$((i % NUM_PORTS))]}"
    (
      status=$(process_pdf "$idx" "${PDF_FILES[$i]}" "$port")
      echo "$status" > "$TMPDIR_RESULTS/$idx.result"
    ) &
    running=$((running + 1))

    if [[ $running -ge $NUM_PORTS ]]; then
      if wait -n 2>/dev/null; then
        running=$((running - 1))
      else
        # wait -n unsupported (bash < 4.3): plain wait reaps EVERY job,
        # so the in-flight counter must reset to zero — decrementing by
        # one here (the old behavior) let the counter drift and broke
        # the throttle.
        wait
        running=0
      fi
    fi
  done
  wait

  # Collect results in input order.
  for i in "${!PDF_FILES[@]}"; do
    idx=$((i + 1))
    if [[ -f "$TMPDIR_RESULTS/$idx.result" ]]; then
      tally_status "$(cat "$TMPDIR_RESULTS/$idx.result")"
    else
      # Worker died before reporting; record the filename so the summary
      # can still name it.
      FAILED=$((FAILED + 1))
      FAILED_FILES+=("$(basename "${PDF_FILES[$i]}")")
    fi
  done
fi

ELAPSED=$((SECONDS - START_TIME))

# --- Summary ---
echo ""
echo "=============================="
echo "BATCH EXTRACTION COMPLETE"
echo "=============================="
echo "Total PDFs:         $TOTAL"
echo "Extracted:          $EXTRACTED"
echo "Skipped:            $SKIPPED"
echo "Failed:             $FAILED"
echo "Total chars:        $TOTAL_CHARS"
echo "Display equations:  $TOTAL_DISPLAY_EQ"
echo "Inline equations:   $TOTAL_INLINE_EQ"
echo "Elapsed:            ${ELAPSED}s"

if [[ $FAILED -gt 0 ]]; then
  echo ""
  echo "Failed files:"
  for f in "${FAILED_FILES[@]}"; do
    echo "  - $f"
  done
  # Signal partial failure to callers (cron, CI) instead of exiting 0.
  exit 1
fi