Spaces:

aletrn
/

nougat-transformers

Paused

File size: 8,714 Bytes

#!/usr/bin/env bash
# Batch-extract PDFs to markdown via local Nougat Gradio API.
#
# Usage:
#   batch_extract.sh --input ~/papers/ --output ~/formulas/
#   batch_extract.sh --input ~/papers/ --output ~/formulas/ --port 7860,7861,7862
#   batch_extract.sh --input ~/papers/ --output ~/formulas/ --host 10.0.0.1 --force
#
# Requires: bash 4.0+ (uses mapfile; the stock macOS bash 3.2 is too old), curl, python3
# Treat unset variables as errors and make a pipeline fail if any stage fails.
set -u -o pipefail

# --- Defaults (each can be overridden by a command-line option) ---
INPUT_DIR=''
OUTPUT_DIR=''
PORTS='7860'
HOST='localhost'
FORCE=false
TIMEOUT=600

# --- Counters (accumulated while processing, reported in the summary) ---
TOTAL=0 EXTRACTED=0 SKIPPED=0 FAILED=0
TOTAL_CHARS=0 TOTAL_DISPLAY_EQ=0 TOTAL_INLINE_EQ=0
declare -a FAILED_FILES=()

# usage
#   Print the option summary to stdout and terminate the script with status 0.
usage() {
    printf '%s\n' \
        'Usage: batch_extract.sh [OPTIONS]' \
        '' \
        'Required:' \
        '  --input  DIR    Directory containing PDF files' \
        '  --output DIR    Directory for markdown output files' \
        '' \
        'Optional:' \
        '  --port   PORTS  Comma-separated ports (default: 7860)' \
        '                  Multiple ports enable parallel extraction' \
        '  --host   HOST   Server hostname/IP (default: localhost)' \
        '  --force         Re-extract even if output .md exists' \
        '  --timeout SECS  Max seconds per PDF (default: 600)' \
        '  --help          Show this help'
    exit 0
}

# --- Argument parsing ---
# parse_args ARG...
#   Consume the command-line options and store them in the global settings
#   INPUT_DIR, OUTPUT_DIR, PORTS, HOST, FORCE and TIMEOUT.
#   Exits 1 with a message on an unknown option, on a value-taking option
#   given without a value, or on a non-numeric --timeout.
#   --help prints the usage text via usage().
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --input|--output|--port|--host|--timeout)
                # Check for the value explicitly: with `set -u` a trailing
                # flag would otherwise abort with a bare "unbound variable".
                if [[ $# -lt 2 ]]; then
                    echo "Error: $1 requires a value" >&2
                    exit 1
                fi
                case "$1" in
                    --input)   INPUT_DIR="$2" ;;
                    --output)  OUTPUT_DIR="$2" ;;
                    --port)    PORTS="$2" ;;
                    --host)    HOST="$2" ;;
                    --timeout)
                        # The value is handed to `curl --max-time`; reject
                        # anything that is not a plain (optionally decimal)
                        # number now instead of failing on every PDF later.
                        if [[ ! "$2" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
                            echo "Error: --timeout must be a number of seconds: $2" >&2
                            exit 1
                        fi
                        TIMEOUT="$2"
                        ;;
                esac
                shift 2
                ;;
            --force)   FORCE=true; shift ;;
            --help)    usage ;;
            *) echo "Unknown argument: $1" >&2; exit 1 ;;
        esac
    done
}

parse_args "$@"

# Both directories are mandatory.
if [[ -z "$INPUT_DIR" || -z "$OUTPUT_DIR" ]]; then
    echo "Error: --input and --output are required" >&2
    # usage() always exits 0, so run it in a subshell (help text to stderr)
    # and exit non-zero here: a usage error must not report success.
    ( usage ) >&2
    exit 1
fi

if [[ ! -d "$INPUT_DIR" ]]; then
    echo "Error: input directory does not exist: $INPUT_DIR" >&2
    exit 1
fi

# Stop now if the output directory cannot be created; otherwise every
# extraction would fail later, one file at a time.
if ! mkdir -p -- "$OUTPUT_DIR"; then
    echo "Error: cannot create output directory: $OUTPUT_DIR" >&2
    exit 1
fi

# --- Split ports ---
# Break the comma-separated PORTS string into an array and record how many
# server instances it names.
IFS=, read -r -a PORT_ARRAY <<< "${PORTS}"
NUM_PORTS="${#PORT_ARRAY[@]}"

# --- Health check ---
# check_server PORT
#   Probe http://$HOST:PORT/ with curl (5 s connect timeout).
#   Returns 0 when the request succeeds; otherwise prints an error to stderr
#   and returns 1.
check_server() {
    local port="$1"
    local endpoint="http://${HOST}:${port}/"

    curl -sf --connect-timeout 5 -o /dev/null "$endpoint" && return 0

    echo "Error: Nougat not reachable at $endpoint" >&2
    return 1
}

echo "Checking $NUM_PORTS server(s)..."
for port in "${PORT_ARRAY[@]}"; do
    # Stop the whole run at the first instance that does not answer.
    check_server "$port" || exit 1
    echo "  Port $port: OK"
done

# --- Core extraction ---
# extract_pdf PDF_PATH OUTPUT_PATH PORT
#   Send one PDF through the Nougat Gradio API at http://$HOST:PORT and save
#   the returned markdown to OUTPUT_PATH.
#   Steps: upload the file, start an inference call, read the event stream
#   until curl returns, then parse the last "data:" line as a JSON array whose
#   first element is the markdown text.
#   Globals:  HOST, TIMEOUT (read)
#   Outputs:  "chars,display,inline" statistics on stdout; FAIL diagnostics
#             on stderr
#   Returns:  0 on success, 1 if any step fails. On failure OUTPUT_PATH is
#             left exactly as it was before the call.
extract_pdf() {
    local pdf_path="$1"
    local output_path="$2"
    local port="$3"
    local base_url="http://${HOST}:${port}/gradio_api"
    local pdf_name
    pdf_name="$(basename "$pdf_path")"

    # Inside a double-quoted -F filename, curl requires backslashes and
    # double quotes to be backslash-escaped.
    local bs='\' dq='"'
    local form_path
    form_path=${pdf_path//"$bs"/"$bs$bs"}
    form_path=${form_path//"$dq"/"$bs$dq"}

    # Upload
    local upload
    upload=$(curl -sf --max-time 30 -X POST "$base_url/upload" \
        -F "files=@\"$form_path\"") || { echo "FAIL upload: $pdf_name" >&2; return 1; }

    # Build the inference request with json.dumps so that quotes, backslashes
    # or non-ASCII characters in the server path or the file name cannot
    # produce invalid JSON.
    local payload
    payload=$(printf '%s\n' "$upload" | python3 -c '
import json, sys

remote_path = json.load(sys.stdin)[0]
request = {
    "data": [
        {
            "path": remote_path,
            "orig_name": sys.argv[1],
            "meta": {"_type": "gradio.FileData"},
        },
        "",
    ]
}
print(json.dumps(request))
' "$pdf_name") || { echo "FAIL parse upload: $pdf_name" >&2; return 1; }

    # Call inference
    local event
    event=$(curl -sf --max-time 30 -X POST "$base_url/call/inference" \
        -H "Content-Type: application/json" \
        -d "$payload") \
        || { echo "FAIL inference call: $pdf_name" >&2; return 1; }

    local event_id
    event_id=$(printf '%s\n' "$event" \
        | python3 -c 'import sys, json; print(json.load(sys.stdin)["event_id"])') \
        || { echo "FAIL parse event_id: $pdf_name" >&2; return 1; }

    # Poll result
    local result
    result=$(curl -sf --max-time "$TIMEOUT" -N "$base_url/call/inference/$event_id") \
        || { echo "FAIL polling: $pdf_name (timeout=${TIMEOUT}s)" >&2; return 1; }

    # Parse SSE and save. The markdown goes to a sibling ".part" file that is
    # renamed only after parsing succeeded, so a failed or interrupted run
    # never leaves a truncated OUTPUT_PATH and never destroys an earlier
    # result. Python writes the text to stdout (-> .part file) and the stats
    # line to stderr (-> captured in $stats).
    local tmp_out="${output_path}.part"
    local stats
    if ! stats=$(printf '%s\n' "$result" | python3 -c '
import sys, json, re

lines = sys.stdin.read().strip().split("\n")
data_lines = [l for l in lines if l.startswith("data:")]
if not data_lines:
    print("ERROR: no data lines", file=sys.stderr)
    sys.exit(1)
# Strip the field name; the space after the colon is optional in SSE.
last = data_lines[-1][len("data:"):].strip()
data = json.loads(last, strict=False)
text = data[0]

display = len(re.findall(r"\$\$(.+?)\$\$", text, re.DOTALL))
inline = len(re.findall(r"(?<!\$)\$(?!\$)(.+?)(?<!\$)\$(?!\$)", text))

# Stats line to stderr: chars,display,inline
print(f"{len(text)},{display},{inline}", file=sys.stderr)
print(text)
' 2>&1 >"$tmp_out"); then
        echo "FAIL parse: $pdf_name" >&2
        # Surface the parser diagnostics instead of discarding them.
        if [[ -n "$stats" ]]; then
            printf '%s\n' "$stats" >&2
        fi
        rm -f -- "$tmp_out"
        return 1
    fi

    if ! mv -f -- "$tmp_out" "$output_path"; then
        echo "FAIL write: $pdf_name" >&2
        rm -f -- "$tmp_out"
        return 1
    fi

    # The stats line is the last thing Python prints to stderr; take only
    # that line in case interpreter warnings were emitted before it.
    printf '%s\n' "${stats##*$'\n'}"
    return 0
}

# --- Build file list ---
# Collect the *.pdf regular files directly inside INPUT_DIR (no recursion),
# sorted, one array element per line of find output.
mapfile -t PDF_FILES < <(
    find "$INPUT_DIR" -maxdepth 1 -type f -name '*.pdf' | sort
)
TOTAL="${#PDF_FILES[@]}"

# Nothing to do is not an error.
if (( TOTAL == 0 )); then
    echo "No PDF files found in $INPUT_DIR"
    exit 0
fi

printf '%s\n' \
    "" \
    "Found $TOTAL PDF(s) in $INPUT_DIR" \
    "Output: $OUTPUT_DIR" \
    "Ports: ${PORTS} (${NUM_PORTS} instance(s))" \
    ""

# --- Process PDFs ---
# process_pdf IDX PDF PORT
#   Extract one PDF (or skip it when its .md already exists and FORCE is not
#   true) and report the outcome.
#   Globals:  OUTPUT_DIR, FORCE, TOTAL (read)
#   Outputs:  stdout carries exactly one machine-readable status line:
#               SKIPPED | FAILED:<pdf name> | OK:<chars>:<display>:<inline>
#             Human-readable progress goes to stderr. Callers capture stdout
#             to obtain the status, so progress printed there would never
#             reach the terminal.
#   Returns:  0 on success or skip, 1 when extraction failed.
process_pdf() {
    local idx="$1"
    local pdf="$2"
    local port="$3"
    local pdf_name
    pdf_name="$(basename "$pdf")"
    local md_name="${pdf_name%.pdf}.md"
    local output_path="$OUTPUT_DIR/$md_name"

    # Skip check
    if [[ "$FORCE" != true && -f "$output_path" ]]; then
        echo "[$idx/$TOTAL] SKIP $pdf_name (exists)" >&2
        echo "SKIPPED"
        return 0
    fi

    echo "[$idx/$TOTAL] Extracting $pdf_name (port $port)..." >&2
    local stats
    if ! stats=$(extract_pdf "$pdf" "$output_path" "$port"); then
        echo "[$idx/$TOTAL] FAILED $pdf_name" >&2
        echo "FAILED:$pdf_name"
        return 1
    fi

    # extract_pdf reports "chars,display,inline".
    local chars display inline
    IFS=',' read -r chars display inline <<< "$stats"
    echo "[$idx/$TOTAL] OK $pdf_name — ${chars} chars, ${display} display + ${inline} inline eqs" >&2
    echo "OK:${chars}:${display}:${inline}"
    return 0
}

START_TIME=$SECONDS

# record_status STATUS [FALLBACK_NAME]
#   Fold one process_pdf status line into the global counters.
#     SKIPPED                          -> SKIPPED
#     OK:<chars>:<display>:<inline>    -> EXTRACTED and the three totals
#     FAILED:<name>                    -> FAILED, <name> added to FAILED_FILES
#     anything else (including empty)  -> FAILED, FALLBACK_NAME added to
#                                         FAILED_FILES, so a job that died
#                                         without reporting is still listed
record_status() {
    local status="$1"
    local fallback_name="${2:-unknown}"
    local chars display inline
    case "$status" in
        SKIPPED)
            SKIPPED=$((SKIPPED + 1))
            ;;
        OK:*)
            EXTRACTED=$((EXTRACTED + 1))
            IFS=':' read -r _ chars display inline <<< "$status"
            TOTAL_CHARS=$((TOTAL_CHARS + chars))
            TOTAL_DISPLAY_EQ=$((TOTAL_DISPLAY_EQ + display))
            TOTAL_INLINE_EQ=$((TOTAL_INLINE_EQ + inline))
            ;;
        FAILED:*)
            FAILED=$((FAILED + 1))
            FAILED_FILES+=("${status#FAILED:}")
            ;;
        *)
            FAILED=$((FAILED + 1))
            FAILED_FILES+=("$fallback_name")
            ;;
    esac
}

if [[ $NUM_PORTS -eq 1 ]]; then
    # Sequential mode
    for i in "${!PDF_FILES[@]}"; do
        idx=$((i + 1))
        result=$(process_pdf "$idx" "${PDF_FILES[$i]}" "${PORT_ARRAY[0]}")
        # The status is the last line process_pdf wrote to stdout.
        record_status "${result##*$'\n'}" "$(basename "${PDF_FILES[$i]}")"
    done
else
    # Parallel mode: one worker per port. Worker w handles files w, w+N,
    # w+2N, ... one after another, so the file-to-port mapping is still
    # round-robin but a port never receives two requests at the same time.
    TMPDIR_RESULTS=$(mktemp -d) || {
        echo "Error: could not create temporary directory" >&2
        exit 1
    }
    trap 'rm -rf "$TMPDIR_RESULTS"' EXIT

    for worker in "${!PORT_ARRAY[@]}"; do
        (
            for ((i = worker; i < ${#PDF_FILES[@]}; i += NUM_PORTS)); do
                idx=$((i + 1))
                result=$(process_pdf "$idx" "${PDF_FILES[$i]}" "${PORT_ARRAY[$worker]}")
                printf '%s\n' "${result##*$'\n'}" > "$TMPDIR_RESULTS/$idx.result"
            done
        ) &
    done
    wait

    # Collect results in input order. A missing result file leaves the status
    # empty, which record_status counts as a failure for that file.
    for i in "${!PDF_FILES[@]}"; do
        idx=$((i + 1))
        status=""
        if [[ -f "$TMPDIR_RESULTS/$idx.result" ]]; then
            status=$(< "$TMPDIR_RESULTS/$idx.result")
        fi
        record_status "$status" "$(basename "${PDF_FILES[$i]}")"
    done
fi

ELAPSED=$((SECONDS - START_TIME))

# --- Summary ---
# Print the final report in one go, then list the failed file names if any.
cat <<SUMMARY

==============================
BATCH EXTRACTION COMPLETE
==============================
Total PDFs:       $TOTAL
Extracted:        $EXTRACTED
Skipped:          $SKIPPED
Failed:           $FAILED
Total chars:      $TOTAL_CHARS
Display equations: $TOTAL_DISPLAY_EQ
Inline equations: $TOTAL_INLINE_EQ
Elapsed:          ${ELAPSED}s
SUMMARY

if (( FAILED > 0 )); then
    echo ""
    echo "Failed files:"
    for f in "${FAILED_FILES[@]}"; do
        echo "  - $f"
    done
fi