Spaces:

kodetr
/

scriptai-backend

Sleeping

App Files Files Community

kodetr committed 29 days ago

Commit

0861826

verified ·

1 Parent(s): 4936063

update

Browse files

Files changed (5) hide show

__pycache__/api_server.cpython-310.pyc +0 -0
__pycache__/extract_pdf_text.cpython-310.pyc +0 -0
api_server.py +109 -0
extract_pdf_text.py +194 -0
requirements.txt +14 -0

__pycache__/api_server.cpython-310.pyc ADDED Viewed

Binary file (2.96 kB). View file

__pycache__/extract_pdf_text.cpython-310.pyc ADDED Viewed

Binary file (4.89 kB). View file

api_server.py ADDED Viewed

	@@ -0,0 +1,109 @@

+#!/usr/bin/env python3
+"""
+FastAPI server for hybrid PDF extraction.
+Endpoints:
+- GET /health
+- POST /extract-pdf-text (multipart: file, max_pages, ocr_lang)
+"""
+from __future__ import annotations
+import os
+import tempfile
+from pathlib import Path
+from typing import Optional
+from fastapi import FastAPI, File, Form, Header, HTTPException, UploadFile
+from fastapi.responses import JSONResponse
+try:
+    from .extract_pdf_text import run as extract_run
+except ImportError:
+    # Fallback when running as a plain script from this folder.
+    from extract_pdf_text import run as extract_run
+# FastAPI application instance; the route decorators below register handlers on it.
+app = FastAPI(title="ScriptAI PDF Extractor API", version="1.0.0")
+@app.get("/health")
+def health() -> dict:
+    return {"ok": True, "service": "pdf-extractor"}
+def ensure_authorized(authorization: Optional[str]) -> None:
+    expected_token = (os.getenv("PYTHON_EXTRACTOR_TOKEN") or "").strip()
+    if expected_token == "":
+        return
+    bearer = (authorization or "").strip()
+    if not bearer.startswith("Bearer "):
+        raise HTTPException(status_code=401, detail="Unauthorized")
+    received = bearer[7:].strip()
+    if received != expected_token:
+        raise HTTPException(status_code=401, detail="Unauthorized")
+@app.post("/extract-pdf-text")
+async def extract_pdf_text(
+    file: UploadFile = File(...),
+    max_pages: int = Form(20),
+    ocr_lang: str = Form("ind+eng"),
+    authorization: Optional[str] = Header(default=None),
+) -> JSONResponse:
+    ensure_authorized(authorization)
+    filename = (file.filename or "uploaded.pdf").lower()
+    content_type = (file.content_type or "").lower()
+    if not filename.endswith(".pdf") and "pdf" not in content_type:
+        raise HTTPException(status_code=422, detail="File harus berformat PDF.")
+    max_pages = max(1, min(max_pages, 80))
+    suffix = ".pdf"
+    temp_path: Optional[Path] = None
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+            temp_path = Path(tmp.name)
+            while True:
+                chunk = await file.read(1024 * 1024)
+                if not chunk:
+                    break
+                tmp.write(chunk)
+        payload = extract_run(str(temp_path), max_pages=max_pages, ocr_lang=ocr_lang)
+        status = 200 if payload.get("success") else 422
+        return JSONResponse(payload, status_code=status)
+    except HTTPException:
+        raise
+    except Exception as exc:
+        return JSONResponse(
+            {
+                "success": False,
+                "mode": "error",
+                "engine": "none",
+                "text": "",
+                "error": str(exc),
+            },
+            status_code=500,
+        )
+    finally:
+        await file.close()
+        if temp_path and temp_path.exists():
+            temp_path.unlink(missing_ok=True)
+@app.post("/")
+async def extract_pdf_text_root(
+    file: UploadFile = File(...),
+    max_pages: int = Form(20),
+    ocr_lang: str = Form("ind+eng"),
+    authorization: Optional[str] = Header(default=None),
+) -> JSONResponse:
+    # Alias endpoint to keep compatibility with simple base URL posting.
+    return await extract_pdf_text(file=file, max_pages=max_pages, ocr_lang=ocr_lang, authorization=authorization)

extract_pdf_text.py ADDED Viewed

	@@ -0,0 +1,194 @@

+#!/usr/bin/env python3
+"""
+Hybrid PDF extractor:
+1) Text-based PDF via PyMuPDF/pdfplumber
+2) Scan PDF via OCR (Tesseract first, PaddleOCR fallback)
+Output JSON to stdout.
+"""
+from __future__ import annotations
+import argparse
+import json
+import re
+import sys
+from typing import Optional
+def clean_text(text: str) -> str:
+    text = text or ""
+    text = re.sub(r"\r\n?", "\n", text)
+    text = re.sub(r"[ \t]{2,}", " ", text)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    return text.strip()
+def extract_with_pymupdf(path: str, max_pages: int) -> str:
+    try:
+        import fitz  # PyMuPDF
+    except Exception:
+        return ""
+    texts = []
+    try:
+        doc = fitz.open(path)
+        total = min(len(doc), max_pages)
+        for i in range(total):
+            page = doc.load_page(i)
+            texts.append(page.get_text("text") or "")
+        doc.close()
+    except Exception:
+        return ""
+    return clean_text("\n".join(texts))
+def extract_with_pdfplumber(path: str, max_pages: int) -> str:
+    try:
+        import pdfplumber
+    except Exception:
+        return ""
+    texts = []
+    try:
+        with pdfplumber.open(path) as pdf:
+            for page in pdf.pages[:max_pages]:
+                texts.append(page.extract_text() or "")
+    except Exception:
+        return ""
+    return clean_text("\n".join(texts))
+def ocr_with_tesseract(path: str, max_pages: int, lang: str) -> str:
+    try:
+        from pdf2image import convert_from_path
+        import pytesseract
+    except Exception:
+        return ""
+    texts = []
+    try:
+        images = convert_from_path(path, dpi=250, first_page=1, last_page=max_pages)
+        for image in images:
+            texts.append(pytesseract.image_to_string(image, lang=lang) or "")
+    except Exception:
+        return ""
+    return clean_text("\n".join(texts))
+def ocr_with_paddle(path: str, max_pages: int) -> str:
+    try:
+        from pdf2image import convert_from_path
+        from paddleocr import PaddleOCR
+    except Exception:
+        return ""
+    texts = []
+    try:
+        images = convert_from_path(path, dpi=220, first_page=1, last_page=max_pages)
+        ocr = PaddleOCR(use_angle_cls=True, lang="en", show_log=False)
+        for image in images:
+            result = ocr.ocr(image)
+            if not result:
+                continue
+            page_lines = []
+            for item in result[0] or []:
+                if isinstance(item, (list, tuple)) and len(item) >= 2:
+                    text_info = item[1]
+                    if isinstance(text_info, (list, tuple)) and text_info:
+                        page_lines.append(str(text_info[0]))
+            if page_lines:
+                texts.append("\n".join(page_lines))
+    except Exception:
+        return ""
+    return clean_text("\n".join(texts))
+def looks_like_text_based(text: str) -> bool:
+    text = clean_text(text)
+    if len(text) < 40:
+        return False
+    alnum_count = sum(1 for c in text if c.isalnum())
+    return alnum_count >= 24
+def run(path: str, max_pages: int, ocr_lang: str) -> dict:
+    text = extract_with_pymupdf(path, max_pages)
+    if looks_like_text_based(text):
+        return {
+            "success": True,
+            "mode": "text-based",
+            "engine": "pymupdf",
+            "text": text,
+        }
+    text_pdfplumber = extract_with_pdfplumber(path, max_pages)
+    if looks_like_text_based(text_pdfplumber):
+        return {
+            "success": True,
+            "mode": "text-based",
+            "engine": "pdfplumber",
+            "text": text_pdfplumber,
+        }
+    text_ocr_tesseract = ocr_with_tesseract(path, max_pages, ocr_lang)
+    if looks_like_text_based(text_ocr_tesseract):
+        return {
+            "success": True,
+            "mode": "scan-ocr",
+            "engine": "tesseract",
+            "text": text_ocr_tesseract,
+        }
+    text_ocr_paddle = ocr_with_paddle(path, max_pages)
+    if looks_like_text_based(text_ocr_paddle):
+        return {
+            "success": True,
+            "mode": "scan-ocr",
+            "engine": "paddleocr",
+            "text": text_ocr_paddle,
+        }
+    merged = clean_text("\n\n".join([text, text_pdfplumber, text_ocr_tesseract, text_ocr_paddle]))
+    return {
+        "success": merged != "",
+        "mode": "mixed-fallback" if merged else "none",
+        "engine": "combined",
+        "text": merged,
+        "error": "Tidak ada teks yang dapat diekstrak dari PDF." if merged == "" else None,
+    }
+def parse_args(argv: Optional[list] = None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description="Extract text from PDF (text-based + OCR)")
+    parser.add_argument("pdf_path", help="Path to PDF file")
+    parser.add_argument("--max-pages", type=int, default=20)
+    parser.add_argument("--ocr-lang", default="ind+eng")
+    return parser.parse_args(argv)
+def main(argv: Optional[list] = None) -> int:
+    args = parse_args(argv)
+    try:
+        payload = run(args.pdf_path, max(1, args.max_pages), args.ocr_lang)
+    except Exception as exc:
+        payload = {
+            "success": False,
+            "mode": "error",
+            "engine": "none",
+            "text": "",
+            "error": str(exc),
+        }
+    sys.stdout.write(json.dumps(payload, ensure_ascii=False))
+    return 0
+# Script entry point: exit the process with main()'s return value as its status.
+if __name__ == "__main__":
+    raise SystemExit(main())

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+# Text-based PDF extraction
+PyMuPDF>=1.24.0
+pdfplumber>=0.11.0
+# OCR pipeline
+pytesseract>=0.3.10
+pdf2image>=1.17.0
+paddleocr>=2.8.0
+paddlepaddle>=2.6.0
+# API server
+fastapi>=0.116.0
+uvicorn>=0.35.0
+python-multipart>=0.0.20