badman99dev committed on
Commit
6298ba6
·
1 Parent(s): 1db1a4f

🚀 Final OCR API with format filter and multi-page PDF

Browse files
Files changed (5) hide show
  1. Dockerfile +4 -11
  2. README.md +5 -4
  3. app.py +29 -10
  4. packages.txt +1 -5
  5. requirements.txt +1 -1
Dockerfile CHANGED
@@ -1,17 +1,10 @@
1
  FROM python:3.10
2
 
3
- RUN apt-get update && apt-get install -y \
4
- tesseract-ocr \
5
- tesseract-ocr-hin \
6
- poppler-utils \
7
- libglib2.0-0 \
8
- libsm6 \
9
- libxrender1 \
10
- libxext6 \
11
- && apt-get clean
12
 
13
- WORKDIR /app
14
  COPY . /app
15
- RUN pip install -r requirements.txt
16
 
17
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  FROM python:3.10
2
 
3
+ RUN apt-get update && \
4
+ apt-get install -y poppler-utils tesseract-ocr tesseract-ocr-hin && \
5
+ pip install --no-cache-dir fastapi uvicorn pytesseract pillow pdf2image python-multipart
 
 
 
 
 
 
6
 
 
7
  COPY . /app
8
+ WORKDIR /app
9
 
10
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md CHANGED
@@ -1,9 +1,10 @@
1
  ---
2
  title: Hindi OCR API
3
- emoji: 📖
4
- colorFrom: blue
5
- colorTo: indigo
6
  sdk: docker
 
7
  app_file: app.py
8
- pinned: false
9
  ---
 
1
  ---
2
  title: Hindi OCR API
3
+ emoji: 🧾
4
+ colorFrom: indigo
5
+ colorTo: pink
6
  sdk: docker
7
+ sdk_version: "1.0"
8
  app_file: app.py
9
+ pinned: true
10
  ---
app.py CHANGED
@@ -7,16 +7,35 @@ import io
7
 
8
  app = FastAPI()
9
 
10
- def ocr_image(image: Image.Image) -> str:
11
- return pytesseract.image_to_string(image, lang='hin+eng')
12
-
13
  @app.post("/ocr")
14
  async def extract_text(file: UploadFile = File(...)):
 
 
 
 
 
 
 
 
 
15
  contents = await file.read()
16
- if file.filename.lower().endswith(".pdf"):
17
- images = convert_from_bytes(contents)
18
- text = "\n".join([ocr_image(img) for img in images])
19
- else:
20
- image = Image.open(io.BytesIO(contents))
21
- text = ocr_image(image)
22
- return JSONResponse(content={"text": text})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  app = FastAPI()
9
 
 
 
 
10
  @app.post("/ocr")
11
  async def extract_text(file: UploadFile = File(...)):
12
+ filename = file.filename.lower()
13
+ allowed_ext = (".jpg", ".jpeg", ".png", ".pdf")
14
+
15
+ if not filename.endswith(allowed_ext):
16
+ return JSONResponse(
17
+ content={"error": "❌ Unsupported file format! Please upload JPG, PNG, or PDF."},
18
+ status_code=400
19
+ )
20
+
21
  contents = await file.read()
22
+ extracted_text = ""
23
+
24
+ try:
25
+ if filename.endswith(".pdf"):
26
+ images = convert_from_bytes(contents)
27
+ for page in images:
28
+ text = pytesseract.image_to_string(page, lang="hin+eng")
29
+ extracted_text += text + "\n\n"
30
+ else:
31
+ image = Image.open(io.BytesIO(contents))
32
+ text = pytesseract.image_to_string(image, lang="hin+eng")
33
+ extracted_text = text
34
+
35
+ return {"text": extracted_text.strip() or "⚠️ No text found."}
36
+
37
+ except Exception as e:
38
+ return JSONResponse(
39
+ content={"error": "🚫 Failed to process file", "details": str(e)},
40
+ status_code=500
41
+ )
packages.txt CHANGED
@@ -1,7 +1,3 @@
 
1
  tesseract-ocr
2
  tesseract-ocr-hin
3
- poppler-utils
4
- libglib2.0-0
5
- libsm6
6
- libxrender1
7
- libxext6
 
1
+ poppler-utils
2
  tesseract-ocr
3
  tesseract-ocr-hin
 
 
 
 
 
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  fastapi
2
  uvicorn
3
  pytesseract
 
4
  pdf2image
5
  python-multipart
6
- Pillow
 
1
  fastapi
2
  uvicorn
3
  pytesseract
4
+ pillow
5
  pdf2image
6
  python-multipart