# File size: 3,732 Bytes
# 632c507
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from PIL import Image
import pytesseract
import fitz
from concurrent.futures import ThreadPoolExecutor
import asyncio
import cv2
import numpy as np
import io

# FastAPI application instance; the route decorators below register on it.
app = FastAPI(title="Fast Parallel Text Extract API")
# Shared worker pool: the endpoints hand the blocking OCR / PDF work to these
# threads through run_in_executor.
executor = ThreadPoolExecutor(max_workers=8)

# Wide-open CORS policy: any origin, method and header, with credentials.
# NOTE(review): wildcard origins combined with allow_credentials=True is
# discouraged by the FastAPI CORS documentation -- confirm and restrict
# allow_origins to the real front-end origin(s) before production use.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # allow all origins for testing
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)



# ---------- Utils ----------
def read_image_from_bytes(file_bytes: bytes):
    """Decode an encoded image file (PNG, JPEG, ...) into a pixel array.

    Args:
        file_bytes: Raw contents of the image file.

    Returns:
        The array produced by ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``,
        or ``None`` when the input is empty or cannot be decoded.
    """
    # cv2.imdecode raises on an empty buffer instead of returning None, so
    # treat "no data" like any other undecodable input; callers already
    # check for None.
    if not file_bytes:
        return None
    buffer = np.frombuffer(file_bytes, np.uint8)
    return cv2.imdecode(buffer, cv2.IMREAD_COLOR)

def resize_if_large(img, max_dim=2000):
    """Downscale *img* so that its longer side is at most *max_dim* pixels.

    The aspect ratio is preserved. Images whose longer side already fits
    are returned unchanged (the same object, no copy).

    Args:
        img: Image array whose first two dimensions are (height, width).
        max_dim: Maximum allowed size, in pixels, of the longer side.

    Returns:
        The original array, or a resized copy produced by ``cv2.resize``.
    """
    h, w = img.shape[:2]
    longest = max(h, w)
    if longest <= max_dim:
        return img

    scale = max_dim / longest
    # int() truncation can produce 0 for very elongated images (e.g. 1 px
    # wide), and cv2.resize rejects an empty target size, so clamp to 1 px.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    # INTER_AREA is the interpolation the original code chose for shrinking.
    return cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

# ---------- Fast OCR ----------
def fast_ocr(file_bytes: bytes, lang: str = "eng"):
    """Run the quick OCR pass over an encoded image.

    Pipeline: decode -> cap the size -> grayscale -> Otsu binarisation ->
    Tesseract.

    Args:
        file_bytes: Raw contents of the image file.
        lang: Tesseract language code passed through to pytesseract.

    Returns:
        The recognised text with surrounding whitespace stripped, or ""
        when the image cannot be decoded.
    """
    image = read_image_from_bytes(file_bytes)
    if image is None:
        return ""

    # Light preprocessing only: shrink oversized input, then binarise.
    grayscale = cv2.cvtColor(resize_if_large(image), cv2.COLOR_BGR2GRAY)
    binary = cv2.threshold(
        grayscale, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]

    recognised = pytesseract.image_to_string(
        Image.fromarray(binary),
        config="--oem 3 --psm 6",  # balanced speed + accuracy
        lang=lang,
    )
    return recognised.strip()

# ---------- Heavy OCR (fallback only) ----------
def heavy_ocr(file_bytes: bytes, lang: str = "eng"):
    """Run the slower OCR pass, which denoises before binarising.

    Unlike the fast pass, the image is not resized first.

    Args:
        file_bytes: Raw contents of the image file.
        lang: Tesseract language code passed through to pytesseract.

    Returns:
        The recognised text with surrounding whitespace stripped, or ""
        when the image cannot be decoded.
    """
    image = read_image_from_bytes(file_bytes)
    if image is None:
        return ""

    # Non-local-means denoising followed by Otsu thresholding
    # (slower but more robust).
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    denoised = cv2.fastNlMeansDenoising(grayscale, None, h=10)
    binary = cv2.threshold(
        denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )[1]

    recognised = pytesseract.image_to_string(
        Image.fromarray(binary),
        config="--oem 3 --psm 6",
        lang=lang,
    )
    return recognised.strip()

# ---------- Image extraction ----------
def extract_text_from_image_bytes(file_bytes: bytes, lang: str = "eng"):
    """Extract text from an image, escalating to the heavy pass if needed.

    The fast OCR pass runs first; when it yields fewer than 20 characters
    the heavy pass is run and its result is returned instead.

    Args:
        file_bytes: Raw contents of the image file.
        lang: Tesseract language code forwarded to both OCR passes.

    Returns:
        The text produced by whichever pass was used.
    """
    quick_result = fast_ocr(file_bytes, lang)
    if len(quick_result) >= 20:
        return quick_result
    # Too little text from the fast pass: fall back to the heavy pass.
    return heavy_ocr(file_bytes, lang)

# ---------- PDF extraction ----------
def extract_text_from_pdf_bytes(file_bytes: bytes):
    """Extract the text layer of every page of an in-memory PDF.

    Args:
        file_bytes: Raw contents of the PDF file.

    Returns:
        The per-page texts joined with newlines. A page whose extraction
        fails contributes an empty string, so one bad page does not abort
        the whole document.

    Raises:
        Whatever ``fitz.open`` raises when the bytes cannot be opened as a
        PDF; that error is not caught here.
    """
    texts = []
    # Use the document as a context manager so it is closed even if
    # iteration fails; the original never closed it.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        for page in doc:
            try:
                texts.append(page.get_text("text"))
            except Exception:
                # Deliberate best-effort: keep the page slot, skip its text.
                texts.append("")
    return "\n".join(texts)

# ---------- Endpoints ----------

@app.post("/extract-image")
async def extract_image(file: UploadFile = File(...), lang: str = Form("eng")):
    """
    Extract text from image.
    lang: Tesseract language code, e.g. 'eng', 'hin', 'tam', or 'eng+hin'

    Responds with {"text": ...} on success, or {"error": ...} with HTTP 500
    when reading the upload or the extraction raises.
    """
    try:
        raw = await file.read()
        # get_running_loop() is the preferred way to obtain the loop from
        # inside a coroutine (get_event_loop() has policy-dependent behavior).
        loop = asyncio.get_running_loop()
        # OCR is blocking work, so it runs on the shared thread pool.
        text = await loop.run_in_executor(
            executor, extract_text_from_image_bytes, raw, lang
        )
        return JSONResponse({"text": text})
    except Exception as e:
        # Top-level boundary: report the failure to the client as a 500.
        return JSONResponse({"error": str(e)}, status_code=500)

@app.post("/extract-pdf")
async def extract_pdf(file: UploadFile = File(...)):
    """
    Extract text from PDF.

    Responds with {"text": ...} on success, or {"error": ...} with HTTP 500
    when reading the upload or the extraction raises.
    """
    try:
        raw = await file.read()
        # get_running_loop() is the preferred way to obtain the loop from
        # inside a coroutine (get_event_loop() has policy-dependent behavior).
        loop = asyncio.get_running_loop()
        # PDF parsing is blocking work, so it runs on the shared thread pool.
        text = await loop.run_in_executor(
            executor, extract_text_from_pdf_bytes, raw
        )
        return JSONResponse({"text": text})
    except Exception as e:
        # Top-level boundary: report the failure to the client as a 500.
        return JSONResponse({"error": str(e)}, status_code=500)