Spaces:

kodetr
/

scriptai-backend

Sleeping

App Files Files Community

scriptai-backend / api_server.py

kodetr

update

519d951 verified 7 days ago

raw

history blame contribute delete

4.01 kB

	#!/usr/bin/env python3
	"""
	FastAPI server for hybrid PDF extraction.

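	Endpoints:
	- GET / and GET /health (health probes)
	- POST /extract-pdf-text (multipart: file, max_pages, ocr_lang; send
	  "Authorization: Bearer <token>" when PYTHON_EXTRACTOR_TOKEN is set)
	- POST /, POST /api/extract-pdf-text, POST /extract/pdf-text
	  (compatibility aliases of POST /extract-pdf-text)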
	"""

	from __future__ import annotations

	import os
	import tempfile
	from pathlib import Path
	from typing import Optional

	from fastapi import FastAPI, File, Form, Header, HTTPException, UploadFile
	from fastapi.responses import JSONResponse

	try:
	from .extract_pdf_text import run as extract_run
	except ImportError:
	# Fallback when running as a plain script from this folder.
	from extract_pdf_text import run as extract_run


	# Application instance; every route in this module is registered on it.
	app = FastAPI(
	    title="ScriptAI PDF Extractor API",
	    version="1.0.0",
	)


	@app.get("/")
	def root_health() -> dict:
	    """Answer GET / with a health payload that also names the extraction route.

	    Many platforms probe GET / for health checks, so the root path responds
	    like /health and additionally advertises the main endpoint.
	    """
	    status = dict(ok=True, service="pdf-extractor")
	    status["endpoint"] = "/extract-pdf-text"
	    return status


	@app.get("/health")
	def health() -> dict:
	    """Answer GET /health with a fixed payload identifying the service."""
	    status = dict(ok=True, service="pdf-extractor")
	    return status


	def ensure_authorized(authorization: Optional[str]) -> None:
	    """Validate the bearer token carried in an ``Authorization`` header value.

	    The expected token is read from the ``PYTHON_EXTRACTOR_TOKEN`` environment
	    variable on every call. When that variable is unset or blank, the check is
	    disabled and every request is accepted.

	    Args:
	        authorization: Raw ``Authorization`` header value, or ``None`` when the
	            request did not send one.

	    Raises:
	        HTTPException: 401 when a token is configured and the header is
	            missing, does not use the ``Bearer`` scheme, or carries a
	            different token.
	    """
	    import hmac  # Local import keeps this block self-contained.

	    expected_token = (os.getenv("PYTHON_EXTRACTOR_TOKEN") or "").strip()
	    if not expected_token:
	        # No token configured: authentication is intentionally disabled.
	        return

	    scheme, _, credentials = (authorization or "").strip().partition(" ")
	    # HTTP auth scheme names are case-insensitive, so "bearer" is accepted too.
	    if scheme.lower() != "bearer":
	        raise HTTPException(status_code=401, detail="Unauthorized")

	    received = credentials.strip()
	    # Constant-time comparison so response timing does not reveal how much of
	    # the token matched. Bytes are compared because compare_digest rejects
	    # non-ASCII str arguments.
	    tokens_match = hmac.compare_digest(
	        received.encode("utf-8", "surrogatepass"),
	        expected_token.encode("utf-8", "surrogatepass"),
	    )
	    if not tokens_match:
	        raise HTTPException(status_code=401, detail="Unauthorized")


	@app.post("/extract-pdf-text")
	async def extract_pdf_text(
	    file: UploadFile = File(...),
	    max_pages: int = Form(20),
	    ocr_lang: str = Form("ind+eng"),
	    authorization: Optional[str] = Header(default=None),
	) -> JSONResponse:
	    """Extract text from an uploaded PDF and return the extractor payload as JSON.

	    The upload is copied to a temporary ``.pdf`` file, that path is handed to
	    ``extract_run``, and the temporary file is removed before returning.

	    Args:
	        file: Uploaded PDF (multipart field ``file``).
	        max_pages: Requested page limit; clamped to the range 1..80.
	        ocr_lang: Language string forwarded unchanged to ``extract_run``.
	        authorization: ``Authorization`` header value, checked by
	            ``ensure_authorized``.

	    Returns:
	        The ``extract_run`` payload with status 200 when its ``success`` entry
	        is truthy, otherwise 422. Any other exception raised while handling
	        the upload produces a 500 response with an error payload.

	    Raises:
	        HTTPException: Propagated unchanged (e.g. 401 from
	            ``ensure_authorized``).
	    """
	    ensure_authorized(authorization)

	    # Keep the requested page count within 1..80.
	    max_pages = max(1, min(max_pages, 80))
	    temp_path: Optional[Path] = None

	    try:
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
	            temp_path = Path(tmp.name)
	            # Copy the upload in 1 MiB chunks instead of reading it in one go.
	            while chunk := await file.read(1024 * 1024):
	                tmp.write(chunk)

	        # The temp file is closed at this point, so its contents are flushed
	        # before the extractor opens the path.
	        # NOTE(review): extract_run is called synchronously inside this
	        # coroutine; if it is slow it will hold up the event loop. Confirm
	        # whether it is safe to move to a worker thread.
	        payload = extract_run(str(temp_path), max_pages=max_pages, ocr_lang=ocr_lang)

	        status = 200 if payload.get("success") else 422
	        return JSONResponse(payload, status_code=status)
	    except HTTPException:
	        raise
	    except Exception as exc:
	        # NOTE(review): the raw exception text is sent to the client; confirm
	        # that callers rely on it before tightening this.
	        return JSONResponse(
	            {
	                "success": False,
	                "mode": "error",
	                "engine": "none",
	                "text": "",
	                "error": str(exc),
	            },
	            status_code=500,
	        )
	    finally:
	        # Nested so the temp file is removed even if closing the upload fails.
	        try:
	            await file.close()
	        finally:
	            if temp_path is not None:
	                temp_path.unlink(missing_ok=True)


	@app.post("/")
	async def extract_pdf_text_root(
	    file: UploadFile = File(...),
	    max_pages: int = Form(20),
	    ocr_lang: str = Form("ind+eng"),
	    authorization: Optional[str] = Header(default=None),
	) -> JSONResponse:
	    """Alias of ``extract_pdf_text`` served at POST /.

	    Keeps compatibility with clients that post straight to the base URL; all
	    arguments are forwarded unchanged.
	    """
	    response = await extract_pdf_text(
	        file=file,
	        max_pages=max_pages,
	        ocr_lang=ocr_lang,
	        authorization=authorization,
	    )
	    return response


	@app.post("/api/extract-pdf-text")
	async def extract_pdf_text_api_alias(
	    file: UploadFile = File(...),
	    max_pages: int = Form(20),
	    ocr_lang: str = Form("ind+eng"),
	    authorization: Optional[str] = Header(default=None),
	) -> JSONResponse:
	    """Alias of ``extract_pdf_text`` served at POST /api/extract-pdf-text.

	    Compatibility route used by the Laravel fallback endpoint list; all
	    arguments are forwarded unchanged.
	    """
	    response = await extract_pdf_text(
	        file=file,
	        max_pages=max_pages,
	        ocr_lang=ocr_lang,
	        authorization=authorization,
	    )
	    return response


	@app.post("/extract/pdf-text")
	async def extract_pdf_text_legacy_alias(
	    file: UploadFile = File(...),
	    max_pages: int = Form(20),
	    ocr_lang: str = Form("ind+eng"),
	    authorization: Optional[str] = Header(default=None),
	) -> JSONResponse:
	    """Alias of ``extract_pdf_text`` served at POST /extract/pdf-text.

	    Legacy compatibility route used by older clients; all arguments are
	    forwarded unchanged.
	    """
	    response = await extract_pdf_text(
	        file=file,
	        max_pages=max_pages,
	        ocr_lang=ocr_lang,
	        authorization=authorization,
	    )
	    return response