#!/usr/bin/env bash
# Batch-extract PDFs to markdown via local Nougat Gradio API.
#
# Usage:
#   batch_extract.sh --input ~/papers/ --output ~/formulas/
#   batch_extract.sh --input ~/papers/ --output ~/formulas/ --port 7860,7861,7862
#   batch_extract.sh --input ~/papers/ --output ~/formulas/ --host 10.0.0.1 --force
#
# Requires: bash, curl, python3
# Abort on unset variables; let pipelines report the first failing stage.
# NOTE(review): `set -e` is deliberately not used — per-file failures are
# counted and reported rather than aborting the whole batch.
set -uo pipefail

# --- Defaults (all overridable via CLI flags) ---
INPUT_DIR=""          # --input  (required)
OUTPUT_DIR=""         # --output (required)
PORTS="7860"          # --port, comma-separated
HOST="localhost"      # --host
FORCE=false           # --force re-extracts even if the .md already exists
TIMEOUT=600           # --timeout, max seconds per PDF

# --- Run counters (filled in by the processing loops) ---
TOTAL=0
EXTRACTED=0
SKIPPED=0
FAILED=0
TOTAL_CHARS=0
TOTAL_DISPLAY_EQ=0
TOTAL_INLINE_EQ=0
FAILED_FILES=()
# Print CLI help, then exit.
# $1 (optional): exit status to terminate with (default 0).
# This lets error paths show the help text while still signalling failure
# (the original unconditionally exited 0, so "missing required args"
# terminated the script with a success status).
usage() {
  cat <<'USAGE'
Usage: batch_extract.sh [OPTIONS]
Required:
  --input DIR      Directory containing PDF files
  --output DIR     Directory for markdown output files
Optional:
  --port PORTS     Comma-separated ports (default: 7860)
                   Multiple ports enable parallel extraction
  --host HOST      Server hostname/IP (default: localhost)
  --force          Re-extract even if output .md exists
  --timeout SECS   Max seconds per PDF (default: 600)
  --help           Show this help
USAGE
  exit "${1:-0}"
}
# --- Argument parsing ---
# Sets the option globals (INPUT_DIR, OUTPUT_DIR, PORTS, HOST, FORCE,
# TIMEOUT). Exits non-zero on an unknown flag or a flag missing its value
# (previously a trailing `--input` died with a cryptic `set -u`
# unbound-variable error instead of a clear message).
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --input|--output|--port|--host|--timeout)
        # Each of these flags takes exactly one value.
        if [[ $# -lt 2 ]]; then
          echo "Error: $1 requires a value" >&2
          exit 1
        fi
        case "$1" in
          --input)   INPUT_DIR="$2" ;;
          --output)  OUTPUT_DIR="$2" ;;
          --port)    PORTS="$2" ;;
          --host)    HOST="$2" ;;
          --timeout) TIMEOUT="$2" ;;
        esac
        shift 2
        ;;
      --force) FORCE=true; shift ;;
      --help)  usage ;;
      *) echo "Unknown argument: $1" >&2; exit 1 ;;
    esac
  done
}
parse_args "$@"
# --- Validate required arguments ---
if [[ -z "$INPUT_DIR" || -z "$OUTPUT_DIR" ]]; then
  echo "Error: --input and --output are required" >&2
  # Show the help text, but exit non-zero: a missing required flag is an
  # error. usage terminates only its own subshell here, so the parent can
  # fail explicitly (previously this path exited 0 via usage).
  (usage)
  exit 1
fi

if [[ ! -d "$INPUT_DIR" ]]; then
  echo "Error: input directory does not exist: $INPUT_DIR" >&2
  exit 1
fi

# Fail fast if the output directory cannot be created (e.g. permissions).
if ! mkdir -p "$OUTPUT_DIR"; then
  echo "Error: cannot create output directory: $OUTPUT_DIR" >&2
  exit 1
fi

# --- Split the comma-separated --port list into an array ---
IFS=',' read -ra PORT_ARRAY <<< "$PORTS"
NUM_PORTS=${#PORT_ARRAY[@]}
# --- Health check ---
# check_server PORT — probe the Gradio root URL on HOST:PORT.
# Returns 0 when reachable; prints an error to stderr and returns 1 otherwise.
check_server() {
  local port="$1"
  local url="http://${HOST}:${port}/"
  curl -sf --connect-timeout 5 -o /dev/null "$url" && return 0
  echo "Error: Nougat not reachable at $url" >&2
  return 1
}

echo "Checking $NUM_PORTS server(s)..."
for port in "${PORT_ARRAY[@]}"; do
  check_server "$port" || exit 1
  echo " Port $port: OK"
done
# --- Core extraction ---
# extract_pdf PDF_PATH OUTPUT_PATH PORT
# Uploads one PDF to the Nougat Gradio server, triggers inference, polls the
# SSE result stream, and writes the extracted markdown to OUTPUT_PATH.
# On success prints a stats line "chars,display_eqs,inline_eqs" to stdout and
# returns 0; on any failure prints a FAIL message to stderr and returns 1.
extract_pdf() {
  local pdf_path="$1"
  local output_path="$2"
  local port="$3"
  local base_url="http://${HOST}:${port}/gradio_api"
  local pdf_name
  pdf_name="$(basename "$pdf_path")"

  # Upload. The quoted @"..." form lets curl handle commas/semicolons in the
  # file path (see curl -F filename quoting).
  local upload
  upload=$(curl -sf --max-time 30 -X POST "$base_url/upload" \
    -F "files=@\"$pdf_path\"") || { echo "FAIL upload: $pdf_name" >&2; return 1; }

  local fpath
  fpath=$(printf '%s' "$upload" | python3 -c "import sys,json; print(json.load(sys.stdin)[0])") \
    || { echo "FAIL parse upload: $pdf_name" >&2; return 1; }

  # Build the JSON payload with json.dumps via argv instead of interpolating
  # shell variables into a JSON literal — file names containing quotes or
  # backslashes would otherwise break (or inject into) the request body.
  local payload
  payload=$(python3 -c 'import json,sys; print(json.dumps({"data": [{"path": sys.argv[1], "orig_name": sys.argv[2], "meta": {"_type": "gradio.FileData"}}, ""]}))' \
    "$fpath" "$pdf_name") || { echo "FAIL build payload: $pdf_name" >&2; return 1; }

  # Queue the inference call; the server answers with an event id.
  local event
  event=$(curl -sf --max-time 30 -X POST "$base_url/call/inference" \
    -H "Content-Type: application/json" \
    -d "$payload") || { echo "FAIL inference call: $pdf_name" >&2; return 1; }

  local event_id
  event_id=$(printf '%s' "$event" | python3 -c "import sys,json; print(json.load(sys.stdin)['event_id'])") \
    || { echo "FAIL parse event_id: $pdf_name" >&2; return 1; }

  # Poll the SSE result stream until done or TIMEOUT.
  local result
  result=$(curl -sf --max-time "$TIMEOUT" -N "$base_url/call/inference/$event_id") \
    || { echo "FAIL polling: $pdf_name (timeout=${TIMEOUT}s)" >&2; return 1; }

  # Parse the SSE payload: markdown text to stdout (-> output file), a
  # "chars,display,inline" stats line to stderr (-> .stats side file).
  printf '%s\n' "$result" | python3 -c "
import sys, json, re

lines = sys.stdin.read().strip().split('\n')
data_lines = [l for l in lines if l.startswith('data:')]
if not data_lines:
    print('ERROR: no data lines', file=sys.stderr)
    sys.exit(1)

# Strip only the 'data:' field name — some servers emit 'data:' without a
# trailing space; json.loads tolerates leading whitespace either way.
last = data_lines[-1][len('data:'):]
data = json.loads(last, strict=False)
text = data[0]

# Count LaTeX equations: display = \$\$...\$\$ blocks, inline = single-\$ pairs.
display = len(re.findall(r'\\\$\\\$(.+?)\\\$\\\$', text, re.DOTALL))
inline = len(re.findall(r'(?<!\\\$)\\\$(?!\\\$)(.+?)(?<!\\\$)\\\$(?!\\\$)', text))

print(f'{len(text)},{display},{inline}', file=sys.stderr)
print(text)
" > "$output_path" 2>"${output_path}.stats"
  local exit_code=$?
  if [[ $exit_code -ne 0 ]]; then
    echo "FAIL parse: $pdf_name" >&2
    rm -f "$output_path" "${output_path}.stats"
    return 1
  fi

  # Relay the stats line to the caller via stdout, then drop the side file.
  local stats
  stats=$(cat "${output_path}.stats")
  rm -f "${output_path}.stats"
  echo "$stats"
  return 0
}
# --- Build file list ---
# Only the top level of INPUT_DIR is scanned; sorted for stable ordering.
mapfile -t PDF_FILES < <(find "$INPUT_DIR" -maxdepth 1 -type f -name '*.pdf' | sort)
TOTAL=${#PDF_FILES[@]}

if (( TOTAL == 0 )); then
  echo "No PDF files found in $INPUT_DIR"
  exit 0
fi

echo ""
echo "Found $TOTAL PDF(s) in $INPUT_DIR"
echo "Output: $OUTPUT_DIR"
echo "Ports: ${PORTS} (${NUM_PORTS} instance(s))"
echo ""
# --- Process PDFs ---
# process_pdf IDX PDF_PATH PORT
# Wraps extract_pdf with skip/force logic. Human-readable progress goes to
# stderr (callers capture stdout, so stdout progress was previously swallowed
# and never shown). The last stdout line is the machine-readable status:
#   "SKIPPED" | "FAILED:<name>" | "OK:<chars>:<display>:<inline>"
# Reads globals: FORCE, TOTAL, OUTPUT_DIR.
process_pdf() {
  local idx="$1"
  local pdf="$2"
  local port="$3"
  local pdf_name
  pdf_name="$(basename "$pdf")"
  local md_name="${pdf_name%.pdf}.md"
  local output_path="$OUTPUT_DIR/$md_name"

  # Skip already-extracted files unless --force was given.
  if [[ "$FORCE" != true && -f "$output_path" ]]; then
    echo "[$idx/$TOTAL] SKIP $pdf_name (exists)" >&2
    echo "SKIPPED"
    return 0
  fi

  echo "[$idx/$TOTAL] Extracting $pdf_name (port $port)..." >&2

  local stats
  stats=$(extract_pdf "$pdf" "$output_path" "$port")
  local rc=$?
  if [[ $rc -ne 0 ]]; then
    echo "[$idx/$TOTAL] FAILED $pdf_name" >&2
    echo "FAILED:$pdf_name"
    return 1
  fi

  local chars display inline
  IFS=',' read -r chars display inline <<< "$stats"
  echo "[$idx/$TOTAL] OK $pdf_name — ${chars} chars, ${display} display + ${inline} inline eqs" >&2
  echo "OK:${chars}:${display}:${inline}"
  return 0
}
START_TIME=$SECONDS

# tally_status STATUS — fold one process_pdf status line into the global
# counters (shared by the sequential and parallel collection paths).
tally_status() {
  local status="$1"
  case "$status" in
    SKIPPED) SKIPPED=$((SKIPPED + 1)) ;;
    FAILED:*)
      FAILED=$((FAILED + 1))
      FAILED_FILES+=("${status#FAILED:}")
      ;;
    OK:*)
      local _ chars display inline
      IFS=':' read -r _ chars display inline <<< "$status"
      EXTRACTED=$((EXTRACTED + 1))
      TOTAL_CHARS=$((TOTAL_CHARS + chars))
      TOTAL_DISPLAY_EQ=$((TOTAL_DISPLAY_EQ + display))
      TOTAL_INLINE_EQ=$((TOTAL_INLINE_EQ + inline))
      ;;
  esac
}

if [[ $NUM_PORTS -eq 1 ]]; then
  # --- Sequential mode ---
  for i in "${!PDF_FILES[@]}"; do
    idx=$((i + 1))
    result=$(process_pdf "$idx" "${PDF_FILES[$i]}" "${PORT_ARRAY[0]}")
    tally_status "$(echo "$result" | tail -n 1)"
  done
else
  # --- Parallel mode: round-robin PDFs across the configured ports ---
  TMPDIR_RESULTS=$(mktemp -d)
  trap 'rm -rf "$TMPDIR_RESULTS"' EXIT
  running=0
  for i in "${!PDF_FILES[@]}"; do
    idx=$((i + 1))
    port="${PORT_ARRAY[$((i % NUM_PORTS))]}"
    (
      result=$(process_pdf "$idx" "${PDF_FILES[$i]}" "$port")
      echo "$result" | tail -n 1 > "$TMPDIR_RESULTS/$idx.result"
    ) &
    running=$((running + 1))
    if (( running >= NUM_PORTS )); then
      if (( BASH_VERSINFO[0] > 4 || (BASH_VERSINFO[0] == 4 && BASH_VERSINFO[1] >= 3) )); then
        # Reap exactly one finished job. Ignore its exit status here:
        # previously `wait -n || wait` degraded to a full barrier whenever a
        # job FAILED (wait -n propagates the job's status), serializing the
        # batch; failures are recorded via the per-job result files instead.
        wait -n || true
        running=$((running - 1))
      else
        # Old bash without wait -n: wait for the whole in-flight batch, and
        # reset the counter accordingly (it was only decremented by one).
        wait
        running=0
      fi
    fi
  done
  wait

  # --- Collect per-job results ---
  for i in "${!PDF_FILES[@]}"; do
    idx=$((i + 1))
    if [[ -f "$TMPDIR_RESULTS/$idx.result" ]]; then
      tally_status "$(cat "$TMPDIR_RESULTS/$idx.result")"
    else
      # Job died before writing a result; record the file name too.
      FAILED=$((FAILED + 1))
      FAILED_FILES+=("$(basename "${PDF_FILES[$i]}")")
    fi
  done
fi

ELAPSED=$((SECONDS - START_TIME))
# --- Summary report ---
printf '\n==============================\n'
printf 'BATCH EXTRACTION COMPLETE\n'
printf '==============================\n'
printf 'Total PDFs: %s\n' "$TOTAL"
printf 'Extracted: %s\n' "$EXTRACTED"
printf 'Skipped: %s\n' "$SKIPPED"
printf 'Failed: %s\n' "$FAILED"
printf 'Total chars: %s\n' "$TOTAL_CHARS"
printf 'Display equations: %s\n' "$TOTAL_DISPLAY_EQ"
printf 'Inline equations: %s\n' "$TOTAL_INLINE_EQ"
printf 'Elapsed: %ss\n' "$ELAPSED"

# List failed inputs only when there were failures (also avoids expanding an
# empty array under set -u on older bash).
if [[ $FAILED -gt 0 ]]; then
  printf '\nFailed files:\n'
  for f in "${FAILED_FILES[@]}"; do
    printf ' - %s\n' "$f"
  done
fi