kodetr committed on
Commit
0861826
·
verified ·
1 Parent(s): 4936063
__pycache__/api_server.cpython-310.pyc ADDED
Binary file (2.96 kB). View file
 
__pycache__/extract_pdf_text.cpython-310.pyc ADDED
Binary file (4.89 kB). View file
 
api_server.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ FastAPI server for hybrid PDF extraction.
4
+
5
+ Endpoints:
6
+ - GET /health
7
+ - POST /extract-pdf-text (multipart: file, max_pages, ocr_lang)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ import tempfile
14
+ from pathlib import Path
15
+ from typing import Optional
16
+
17
+ from fastapi import FastAPI, File, Form, Header, HTTPException, UploadFile
18
+ from fastapi.responses import JSONResponse
19
+
20
+ try:
21
+ from .extract_pdf_text import run as extract_run
22
+ except ImportError:
23
+ # Fallback when running as a plain script from this folder.
24
+ from extract_pdf_text import run as extract_run
25
+
26
+
27
+ app = FastAPI(title="ScriptAI PDF Extractor API", version="1.0.0")
28
+
29
+
30
@app.get("/health")
def health() -> dict:
    """Liveness probe: report that the PDF extractor service is running."""
    status = {"ok": True}
    status["service"] = "pdf-extractor"
    return status
33
+
34
+
35
def ensure_authorized(authorization: Optional[str]) -> None:
    """Validate the ``Authorization`` header against the configured token.

    The expected token is read from the ``PYTHON_EXTRACTOR_TOKEN`` environment
    variable on every call. When the variable is unset or blank, authentication
    is disabled and every request is accepted.

    Args:
        authorization: Raw ``Authorization`` header value, or ``None`` when the
            header was not sent.

    Raises:
        HTTPException: 401 when a token is configured and the header is
            missing, is not a bearer credential, or carries a different token.
    """
    import hmac  # local import so this block does not depend on file-level imports

    expected_token = (os.getenv("PYTHON_EXTRACTOR_TOKEN") or "").strip()
    if not expected_token:
        return

    header = (authorization or "").strip()
    scheme, separator, received = header.partition(" ")
    # The auth scheme name is case-insensitive, so accept "bearer"/"BEARER" too.
    if not separator or scheme.lower() != "bearer":
        raise HTTPException(status_code=401, detail="Unauthorized")

    # Constant-time comparison: a plain != short-circuits on the first
    # differing character and leaks how much of the token was guessed.
    # Compare bytes because compare_digest rejects non-ASCII str arguments.
    tokens_match = hmac.compare_digest(
        received.strip().encode("utf-8"),
        expected_token.encode("utf-8"),
    )
    if not tokens_match:
        raise HTTPException(status_code=401, detail="Unauthorized")
47
+
48
+
49
@app.post("/extract-pdf-text")
async def extract_pdf_text(
    file: UploadFile = File(...),
    max_pages: int = Form(20),
    ocr_lang: str = Form("ind+eng"),
    authorization: Optional[str] = Header(default=None),
) -> JSONResponse:
    """Extract text from an uploaded PDF and return the extractor payload.

    The upload is streamed to a temporary file in 1 MiB chunks, handed to the
    extraction pipeline, and the temporary file is removed afterwards.

    Args:
        file: The uploaded document (multipart field ``file``).
        max_pages: Requested page limit; clamped to the range 1..80.
        ocr_lang: Language string forwarded to the extraction pipeline.
        authorization: Raw ``Authorization`` header, checked by
            ``ensure_authorized``.

    Returns:
        The extractor payload with status 200 when its ``success`` flag is
        truthy and 422 otherwise, or an error payload with status 500 when an
        unexpected exception occurs.

    Raises:
        HTTPException: 401 from the authorization check, or 422 when neither
            the file name nor the content type indicates a PDF.
    """
    # Imported locally so the block is self-contained; ships with FastAPI.
    from fastapi.concurrency import run_in_threadpool

    ensure_authorized(authorization)

    filename = (file.filename or "uploaded.pdf").lower()
    content_type = (file.content_type or "").lower()

    # Accept the upload when either the extension or the MIME type says PDF.
    if not filename.endswith(".pdf") and "pdf" not in content_type:
        raise HTTPException(status_code=422, detail="File harus berformat PDF.")

    max_pages = max(1, min(max_pages, 80))

    temp_path: Optional[Path] = None

    try:
        # delete=False: the extractor reopens the file by path after we close it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            temp_path = Path(tmp.name)
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                tmp.write(chunk)

        # The extraction pipeline is synchronous and can run OCR for a long
        # time; run it in the worker thread pool so the event loop keeps
        # serving other requests (including /health) meanwhile.
        payload = await run_in_threadpool(
            extract_run, str(temp_path), max_pages=max_pages, ocr_lang=ocr_lang
        )

        status = 200 if payload.get("success") else 422
        return JSONResponse(payload, status_code=status)
    except HTTPException:
        raise
    except Exception as exc:
        return JSONResponse(
            {
                "success": False,
                "mode": "error",
                "engine": "none",
                "text": "",
                "error": str(exc),
            },
            status_code=500,
        )
    finally:
        try:
            await file.close()
        finally:
            # Remove the temp file even if closing the upload fails.
            if temp_path is not None:
                temp_path.unlink(missing_ok=True)
99
+
100
+
101
@app.post("/")
async def extract_pdf_text_root(
    file: UploadFile = File(...),
    max_pages: int = Form(20),
    ocr_lang: str = Form("ind+eng"),
    authorization: Optional[str] = Header(default=None),
) -> JSONResponse:
    """Alias of the extraction endpoint for clients that post to the base URL.

    Accepts the same multipart fields and header as ``extract_pdf_text`` and
    returns its response unchanged.
    """
    forwarded = {
        "file": file,
        "max_pages": max_pages,
        "ocr_lang": ocr_lang,
        "authorization": authorization,
    }
    response = await extract_pdf_text(**forwarded)
    return response
extract_pdf_text.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Hybrid PDF extractor:
4
+ 1) Text-based PDF via PyMuPDF/pdfplumber
5
+ 2) Scan PDF via OCR (Tesseract first, PaddleOCR fallback)
6
+
7
+ Output JSON to stdout.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import json
14
+ import re
15
+ import sys
16
+ from typing import Optional
17
+
18
+
19
def clean_text(text: str) -> str:
    """Normalize whitespace in extracted text.

    Steps, in order: convert ``\\r\\n`` / ``\\r`` line endings to ``\\n``, drop
    spaces and tabs at the end of each line, squeeze runs of two or more
    spaces/tabs into one space, collapse three or more consecutive newlines
    into a single blank line, and strip the result.

    Args:
        text: Raw text; ``None`` or an empty string is treated as empty.

    Returns:
        The normalized text, possibly empty.
    """
    text = text or ""
    text = re.sub(r"\r\n?", "\n", text)
    # Strip trailing blanks first: otherwise whitespace-only lines
    # ("\n \n \n") hide consecutive newlines from the collapse below.
    text = re.sub(r"[ \t]+\n", "\n", text)
    text = re.sub(r"[ \t]{2,}", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
25
+
26
+
27
def extract_with_pymupdf(path: str, max_pages: int) -> str:
    """Extract the text layer of the first pages of a PDF with PyMuPDF.

    This is a best-effort step: a missing PyMuPDF installation or any error
    while opening or reading the document yields an empty string so the caller
    can fall through to the next extractor.

    Args:
        path: Filesystem path of the PDF.
        max_pages: Maximum number of leading pages to read.

    Returns:
        The cleaned text of the pages read, or ``""`` on any failure.
    """
    try:
        import fitz  # PyMuPDF
    except Exception:
        return ""

    texts = []
    try:
        doc = fitz.open(path)
        try:
            for index in range(min(len(doc), max_pages)):
                page = doc.load_page(index)
                texts.append(page.get_text("text") or "")
        finally:
            # Always release the document handle, even when a page fails.
            doc.close()
    except Exception:
        return ""

    return clean_text("\n".join(texts))
45
+
46
+
47
def extract_with_pdfplumber(path: str, max_pages: int) -> str:
    """Extract the text layer of the first pages of a PDF with pdfplumber.

    This is a best-effort step: a missing pdfplumber installation or any error
    while opening or reading the document yields an empty string so the caller
    can fall through to the next extractor.

    Args:
        path: Filesystem path of the PDF.
        max_pages: Maximum number of leading pages to read; zero or negative
            values read no pages.

    Returns:
        The cleaned text of the pages read, or ``""`` on any failure.
    """
    try:
        import pdfplumber
    except Exception:
        return ""

    # A negative slice bound would mean "all but the last N pages"; clamp so
    # the limit is always an upper bound on the number of pages read.
    page_limit = max(0, max_pages)

    texts = []
    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages[:page_limit]:
                texts.append(page.extract_text() or "")
    except Exception:
        return ""

    return clean_text("\n".join(texts))
62
+
63
+
64
def ocr_with_tesseract(path: str, max_pages: int, lang: str) -> str:
    """OCR the first pages of a PDF with Tesseract.

    Pages are rasterized at 250 dpi via pdf2image and passed to pytesseract.
    This is a best-effort step: missing dependencies or any rendering/OCR
    error yields an empty string so the caller can try the next engine.

    Args:
        path: Filesystem path of the PDF.
        max_pages: Maximum number of leading pages to OCR.
        lang: Tesseract language string (the file's default is ``"ind+eng"``).

    Returns:
        The cleaned OCR text, or ``""`` on any failure.
    """
    try:
        from pdf2image import convert_from_path
        import pytesseract
    except Exception:
        return ""

    texts = []
    try:
        # Render one page at a time: rasterizing every requested page up front
        # keeps all of the high-resolution bitmaps in memory at once.
        for page_number in range(1, max_pages + 1):
            images = convert_from_path(
                path, dpi=250, first_page=page_number, last_page=page_number
            )
            if not images:
                # No image came back, i.e. we ran past the last page.
                break
            for image in images:
                texts.append(pytesseract.image_to_string(image, lang=lang) or "")
    except Exception:
        return ""

    return clean_text("\n".join(texts))
80
+
81
+
82
def ocr_with_paddle(path: str, max_pages: int) -> str:
    """OCR the first pages of a PDF with PaddleOCR.

    Pages are rasterized at 220 dpi via pdf2image and recognized with an
    English PaddleOCR model. This is a best-effort step: missing dependencies
    or any rendering/OCR error yields an empty string.

    Args:
        path: Filesystem path of the PDF.
        max_pages: Maximum number of leading pages to OCR.

    Returns:
        The cleaned OCR text, or ``""`` on any failure.
    """
    try:
        import numpy as np  # already required by paddleocr itself
        from pdf2image import convert_from_path
        from paddleocr import PaddleOCR
    except Exception:
        return ""

    texts = []
    try:
        ocr = None
        # Render one page at a time so only a single bitmap is held in memory.
        for page_number in range(1, max_pages + 1):
            images = convert_from_path(
                path, dpi=220, first_page=page_number, last_page=page_number
            )
            if not images:
                # No image came back, i.e. we ran past the last page.
                break
            if ocr is None:
                # Build the model lazily, only once a page actually rendered.
                ocr = PaddleOCR(use_angle_cls=True, lang="en", show_log=False)
            for image in images:
                # pdf2image yields PIL images, but PaddleOCR's ocr() works on
                # ndarray/path/bytes input; hand it an OpenCV-style BGR array.
                # NOTE(review): BGR channel order assumed from the cv2
                # convention PaddleOCR uses for file input - confirm.
                rgb = np.array(image.convert("RGB"))
                pixels = np.ascontiguousarray(rgb[:, :, ::-1])
                result = ocr.ocr(pixels)
                if not result:
                    continue
                page_lines = []
                # Each entry is expected to look like [box, (text, score)].
                for item in result[0] or []:
                    if isinstance(item, (list, tuple)) and len(item) >= 2:
                        text_info = item[1]
                        if isinstance(text_info, (list, tuple)) and text_info:
                            page_lines.append(str(text_info[0]))
                if page_lines:
                    texts.append("\n".join(page_lines))
    except Exception:
        return ""

    return clean_text("\n".join(texts))
109
+
110
+
111
def looks_like_text_based(text: str) -> bool:
    """Decide whether extracted text is substantial enough to be accepted.

    The text is normalized with ``clean_text`` first; it qualifies when the
    result is at least 40 characters long and contains at least 24
    alphanumeric characters.
    """
    normalized = clean_text(text)
    long_enough = len(normalized) >= 40
    if not long_enough:
        return False

    alnum_chars = [ch for ch in normalized if ch.isalnum()]
    return len(alnum_chars) >= 24
118
+
119
+
120
def run(path: str, max_pages: int, ocr_lang: str) -> dict:
    """Run the extraction cascade and return a JSON-serializable payload.

    Engines are tried in order - PyMuPDF, pdfplumber, Tesseract OCR, PaddleOCR -
    and each one runs only if every earlier engine produced text that
    ``looks_like_text_based`` rejected. The first accepted result is returned
    with ``success=True``. If none is accepted, whatever fragments were
    collected are merged into a ``"mixed-fallback"`` payload, or a failure
    payload with an ``error`` message when nothing at all was extracted.
    """
    # (mode, engine, thunk) - thunks keep the later engines lazy.
    stages = (
        ("text-based", "pymupdf", lambda: extract_with_pymupdf(path, max_pages)),
        ("text-based", "pdfplumber", lambda: extract_with_pdfplumber(path, max_pages)),
        ("scan-ocr", "tesseract", lambda: ocr_with_tesseract(path, max_pages, ocr_lang)),
        ("scan-ocr", "paddleocr", lambda: ocr_with_paddle(path, max_pages)),
    )

    fragments = []
    for mode, engine, extract in stages:
        candidate = extract()
        if looks_like_text_based(candidate):
            return {
                "success": True,
                "mode": mode,
                "engine": engine,
                "text": candidate,
            }
        fragments.append(candidate)

    merged = clean_text("\n\n".join(fragments))
    has_text = merged != ""
    return {
        "success": has_text,
        "mode": "mixed-fallback" if has_text else "none",
        "engine": "combined",
        "text": merged,
        "error": None if has_text else "Tidak ada teks yang dapat diekstrak dari PDF.",
    }
165
+
166
+
167
def parse_args(argv: Optional[list] = None) -> argparse.Namespace:
    """Parse the command-line arguments of the extractor script.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Namespace with ``pdf_path``, ``max_pages`` (default 20) and
        ``ocr_lang`` (default ``"ind+eng"``).
    """
    cli = argparse.ArgumentParser(description="Extract text from PDF (text-based + OCR)")
    cli.add_argument("pdf_path", help="Path to PDF file")
    optional_flags = (
        ("--max-pages", {"type": int, "default": 20}),
        ("--ocr-lang", {"default": "ind+eng"}),
    )
    for flag, settings in optional_flags:
        cli.add_argument(flag, **settings)
    return cli.parse_args(argv)
173
+
174
+
175
def main(argv: Optional[list] = None) -> int:
    """Command-line entry point: extract text and print the payload as JSON.

    The page limit is raised to at least 1. Any exception from the extraction
    is converted into an error payload rather than propagated, so exactly one
    JSON document is written to stdout.

    Returns:
        Always 0; failures are reported through the payload's ``success`` flag.
    """
    options = parse_args(argv)

    try:
        payload = run(options.pdf_path, max(1, options.max_pages), options.ocr_lang)
    except Exception as exc:
        payload = dict(
            success=False,
            mode="error",
            engine="none",
            text="",
            error=str(exc),
        )

    serialized = json.dumps(payload, ensure_ascii=False)
    sys.stdout.write(serialized)
    return 0
191
+
192
+
193
+ if __name__ == "__main__":
194
+ raise SystemExit(main())
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text-based PDF extraction
2
+ PyMuPDF>=1.24.0
3
+ pdfplumber>=0.11.0
4
+
5
+ # OCR pipeline
6
+ pytesseract>=0.3.10
7
+ pdf2image>=1.17.0
8
+ paddleocr>=2.8.0
9
+ paddlepaddle>=2.6.0
10
+
11
+ # API server
12
+ fastapi>=0.116.0
13
+ uvicorn>=0.35.0
14
+ python-multipart>=0.0.20