kodetr committed on
Commit
519d951
·
verified ·
1 Parent(s): 40345a5
Files changed (2) hide show
  1. api_server.py +0 -6
  2. extract_pdf_text.py +22 -4
api_server.py CHANGED
@@ -61,12 +61,6 @@ async def extract_pdf_text(
61
  ) -> JSONResponse:
62
  ensure_authorized(authorization)
63
 
64
- filename = (file.filename or "uploaded.pdf").lower()
65
- content_type = (file.content_type or "").lower()
66
-
67
- if not filename.endswith(".pdf") and "pdf" not in content_type:
68
- raise HTTPException(status_code=422, detail="File harus berformat PDF.")
69
-
70
  max_pages = max(1, min(max_pages, 80))
71
 
72
  suffix = ".pdf"
 
61
  ) -> JSONResponse:
62
  ensure_authorized(authorization)
63
 
 
 
 
 
 
 
64
  max_pages = max(1, min(max_pages, 80))
65
 
66
  suffix = ".pdf"
extract_pdf_text.py CHANGED
@@ -154,11 +154,11 @@ def ocr_with_paddle(path: str, max_pages: int) -> str:
154
 
155
  def looks_like_text_based(text: str) -> bool:
156
  text = clean_text(text)
157
- if len(text) < 40:
158
  return False
159
 
160
  alnum_count = sum(1 for c in text if c.isalnum())
161
- return alnum_count >= 24
162
 
163
 
164
  def run(path: str, max_pages: int, ocr_lang: str) -> dict:
@@ -187,6 +187,11 @@ def run(path: str, max_pages: int, ocr_lang: str) -> dict:
187
  "mode": "scan-ocr",
188
  "engine": "tesseract",
189
  "text": text_ocr_tesseract,
 
 
 
 
 
190
  }
191
 
192
  text_ocr_paddle = ocr_with_paddle(path, max_pages)
@@ -196,15 +201,28 @@ def run(path: str, max_pages: int, ocr_lang: str) -> dict:
196
  "mode": "scan-ocr",
197
  "engine": "paddleocr",
198
  "text": text_ocr_paddle,
 
 
 
 
 
 
199
  }
200
 
201
  merged = clean_text("\n\n".join([text, text_pdfplumber, text_ocr_tesseract, text_ocr_paddle]))
202
  return {
203
- "success": merged != "",
204
  "mode": "mixed-fallback" if merged else "none",
205
  "engine": "combined",
206
  "text": merged,
207
- "error": "Tidak ada teks yang dapat diekstrak dari PDF." if merged == "" else None,
 
 
 
 
 
 
 
208
  }
209
 
210
 
 
154
 
155
  def looks_like_text_based(text: str) -> bool:
156
  text = clean_text(text)
157
+ if len(text) < 10:
158
  return False
159
 
160
  alnum_count = sum(1 for c in text if c.isalnum())
161
+ return alnum_count >= 6
162
 
163
 
164
  def run(path: str, max_pages: int, ocr_lang: str) -> dict:
 
187
  "mode": "scan-ocr",
188
  "engine": "tesseract",
189
  "text": text_ocr_tesseract,
190
+ "debug": {
191
+ "len_pymupdf": len(clean_text(text)),
192
+ "len_pdfplumber": len(clean_text(text_pdfplumber)),
193
+ "len_tesseract": len(clean_text(text_ocr_tesseract)),
194
+ },
195
  }
196
 
197
  text_ocr_paddle = ocr_with_paddle(path, max_pages)
 
201
  "mode": "scan-ocr",
202
  "engine": "paddleocr",
203
  "text": text_ocr_paddle,
204
+ "debug": {
205
+ "len_pymupdf": len(clean_text(text)),
206
+ "len_pdfplumber": len(clean_text(text_pdfplumber)),
207
+ "len_tesseract": len(clean_text(text_ocr_tesseract)),
208
+ "len_paddleocr": len(clean_text(text_ocr_paddle)),
209
+ },
210
  }
211
 
212
  merged = clean_text("\n\n".join([text, text_pdfplumber, text_ocr_tesseract, text_ocr_paddle]))
213
  return {
214
+ "success": len(merged) >= 10,
215
  "mode": "mixed-fallback" if merged else "none",
216
  "engine": "combined",
217
  "text": merged,
218
+ "error": "Tidak ada teks yang dapat diekstrak dari PDF." if len(merged) < 10 else None,
219
+ "debug": {
220
+ "len_pymupdf": len(clean_text(text)),
221
+ "len_pdfplumber": len(clean_text(text_pdfplumber)),
222
+ "len_tesseract": len(clean_text(text_ocr_tesseract)),
223
+ "len_paddleocr": len(clean_text(text_ocr_paddle)),
224
+ "len_merged": len(merged),
225
+ },
226
  }
227
 
228