shubhjo committed on
Commit
0dee6b5
·
verified ·
1 Parent(s): d4dd73a

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +43 -0
  2. app.py +173 -0
  3. requirements.txt +10 -0
Dockerfile ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# Install system dependencies: Tesseract plus the language packs requested by
# the OCR config (eng, hin, ara, spa, ita, fra, rus), Poppler for rendering PDF
# pages to images, and the OpenCV development libraries.
# The apt package lists are removed in the same layer to keep the image small.
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    tesseract-ocr-eng tesseract-ocr-hin tesseract-ocr-ara tesseract-ocr-spa \
    tesseract-ocr-ita tesseract-ocr-fra tesseract-ocr-rus \
    poppler-utils \
    libopencv-dev \
    && rm -rf /var/lib/apt/lists/*

# Create the working directory and make it the default for later instructions
WORKDIR /app

# Copy files
COPY . /app/

# Debug: List files
RUN ls -la /app

# Verify files: fail the build early with a clear message if either is missing.
# NOTE(review): the entry-point module is named ocr_api here and in CMD below.
# Confirm that ocr_api.py is really the file that defines the FastAPI `app`
# object in this repository; if the module is app.py, both places must change.
RUN if [ ! -f /app/requirements.txt ]; then echo "requirements.txt not found" && exit 1; fi
RUN if [ ! -f /app/ocr_api.py ]; then echo "ocr_api.py not found" && exit 1; fi

# Install Python dependencies
RUN pip install --no-cache-dir -r /app/requirements.txt

# Create non-root user and drop privileges for the running container
RUN useradd -m appuser
USER appuser

# Expose port
EXPOSE 8000

# Run the app (exec form, so uvicorn is PID 1 and receives signals directly)
CMD ["uvicorn", "ocr_api:app", "--host", "0.0.0.0", "--port", "8000"]
app.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException
2
+ import pytesseract
3
+ import cv2
4
+ import os
5
+ from PIL import Image
6
+ import json
7
+ import unicodedata
8
+ from pdf2image import convert_from_bytes
9
+ from pypdf import PdfReader
10
+ import numpy as np
11
+ from typing import List
12
+ import io
13
+ import logging
14
+ import time
15
+ import asyncio
16
+ import psutil
17
+ import cachetools
18
+ import hashlib
19
+
20
# ASGI application object; route handlers register themselves on it.
app = FastAPI()

# Configure logging: timestamp, severity and message, INFO and above.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Absolute path of the Tesseract executable that pytesseract should invoke.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

# In-memory cache for OCR results: at most 100 entries, each kept for one hour.
cache = cachetools.TTLCache(ttl=3600, maxsize=100)
31
+
32
def log_memory_usage():
    """Return a short description of this process's resident memory in MB.

    Despite the name, nothing is written to the log here; callers embed the
    returned string in their own log messages.
    """
    rss_bytes = psutil.Process().memory_info().rss
    rss_megabytes = rss_bytes / 1024 / 1024
    return f"Memory usage: {rss_megabytes:.2f} MB"
37
+
38
def get_file_hash(pdf_bytes):
    """Return the hexadecimal MD5 digest of the given PDF bytes.

    The digest is used only as a cache key for identical uploads.
    """
    digest = hashlib.md5()
    digest.update(pdf_bytes)
    return digest.hexdigest()
41
+
42
def _ocr_page_image(img):
    """Run Tesseract on one rendered page image and return the recognised text.

    This is deliberately a plain (blocking) function so that the async caller
    can hand it to a worker thread.
    """
    # One-step RGB -> grayscale; equivalent to going through BGR first.
    gray = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)
    # Expand back to three identical channels so Tesseract receives an RGB image.
    img_pil = Image.fromarray(cv2.cvtColor(gray, cv2.COLOR_GRAY2RGB))
    custom_config = r'--oem 1 --psm 6 -l eng+hin+ara+spa+ita+fra+rus'
    return pytesseract.image_to_string(img_pil, config=custom_config)


async def process_page(img, page_idx):
    """Process a single PDF page with OCR.

    The blocking Tesseract call runs in a worker thread, so awaiting several
    pages with ``asyncio.gather`` overlaps their OCR and the event loop stays
    responsive.

    Args:
        img: PIL image of the rendered page (RGB).
        page_idx: Page index, used only in log messages.

    Returns:
        The recognised text followed by a newline, or an empty string if OCR
        failed for this page (the failure is logged, not raised).
    """
    start_time = time.time()
    logger.info(f"Starting OCR for page {page_idx}, {log_memory_usage()}")
    try:
        page_text = await asyncio.to_thread(_ocr_page_image, img)
    except Exception as e:
        # Best effort: one bad page must not abort the whole document.
        logger.error(f"OCR failed for page {page_idx}: {str(e)}, {log_memory_usage()}")
        return ""
    logger.info(f"Completed OCR for page {page_idx}, took {time.time() - start_time:.2f} seconds, {log_memory_usage()}")
    return page_text + "\n"
57
+
58
def _result_entry(filename, raw_text="", error=""):
    """Build one per-file item for the "data" list of the /ocr response."""
    return {
        "filename": filename,
        "raw_text": raw_text,
        "error": error,
    }


def _extract_embedded_text(pdf_bytes):
    """Return the PDF's embedded text, one newline-terminated chunk per page.

    Pages without extractable text contribute nothing, so an empty result
    means the document has no usable text layer.
    """
    reader = PdfReader(io.BytesIO(pdf_bytes))
    extracted = (page.extract_text() for page in reader.pages)
    return "".join(text + "\n" for text in extracted if text)


@app.post("/ocr")
async def extract_text(files: List[UploadFile] = File(...)):
    """Extract text from one or more uploaded PDFs.

    For each file the embedded text layer is tried first; if it is empty or
    cannot be read, the pages are rendered to images and OCR'd. Results are
    cached by content hash so identical uploads are not reprocessed.

    Args:
        files: Uploaded files; anything whose name does not end in ".pdf"
            is reported as a per-file error.

    Returns:
        A dict with "success" (False only when every file failed),
        "message" (summary counts) and "data" (one entry per file with
        "filename", "raw_text" and "error").
    """
    output_json = {
        "success": True,
        "message": "",
        "data": []
    }
    success_count = 0
    fail_count = 0

    logger.info(f"Starting OCR for {len(files)} files, {log_memory_usage()}")

    for file in files:
        total_start_time = time.time()
        # Uploads may arrive without a filename; treat that as an empty name
        # so the extension check rejects it instead of raising.
        filename = file.filename or ""
        logger.info(f"Processing file: {filename}, {log_memory_usage()}")

        if not filename.lower().endswith('.pdf'):
            fail_count += 1
            output_json["data"].append(_result_entry(filename, error="File is not a PDF"))
            logger.error(f"File {filename} is not a PDF")
            continue

        # Read PDF into memory
        try:
            pdf_start_time = time.time()
            pdf_bytes = await file.read()
            file_hash = get_file_hash(pdf_bytes)
            logger.info(f"Read PDF {filename}, took {time.time() - pdf_start_time:.2f} seconds, size: {len(pdf_bytes)/1024:.2f} KB, {log_memory_usage()}")
        except Exception as e:
            fail_count += 1
            output_json["data"].append(_result_entry(filename, error=f"Failed to read PDF: {str(e)}"))
            logger.error(f"Failed to read PDF {filename}: {str(e)}, {log_memory_usage()}")
            continue

        # Check cache
        if file_hash in cache:
            success_count += 1
            output_json["data"].append(_result_entry(filename, raw_text=cache[file_hash]))
            logger.info(f"Cache hit for {filename}, {log_memory_usage()}")
            continue

        # Try extracting embedded text. Parsing runs in a worker thread so a
        # large PDF does not stall the event loop. If extraction fails, any
        # partial text is discarded and the file falls through to OCR.
        raw_text = ""
        try:
            extract_start_time = time.time()
            raw_text = await asyncio.to_thread(_extract_embedded_text, pdf_bytes)
            logger.info(f"Embedded text extraction for {filename}, took {time.time() - extract_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
        except Exception as e:
            logger.warning(f"Embedded text extraction failed for {filename}: {str(e)}, {log_memory_usage()}")

        # If no embedded text, perform OCR
        if not raw_text.strip():
            try:
                convert_start_time = time.time()
                # No poppler_path: the Poppler tools are resolved through PATH,
                # which works wherever the package manager installed them.
                images = await asyncio.to_thread(convert_from_bytes, pdf_bytes, dpi=100)
                logger.info(f"PDF to images conversion for {filename}, {len(images)} pages, took {time.time() - convert_start_time:.2f} seconds, {log_memory_usage()}")

                ocr_start_time = time.time()
                tasks = [process_page(img, i) for i, img in enumerate(images)]
                page_texts = await asyncio.gather(*tasks)
                raw_text = "".join(page_texts)
                logger.info(f"Total OCR for {filename}, took {time.time() - ocr_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
            except Exception as e:
                fail_count += 1
                output_json["data"].append(_result_entry(filename, error=f"OCR failed: {str(e)}"))
                logger.error(f"OCR failed for {filename}: {str(e)}, {log_memory_usage()}")
                continue

        # Normalize text
        try:
            normalize_start_time = time.time()
            raw_text = unicodedata.normalize('NFKC', raw_text)
            # Replace anything that cannot be encoded as UTF-8 (e.g. lone
            # surrogates) so the JSON response can always be serialised.
            raw_text = raw_text.encode('utf-8', errors='replace').decode('utf-8')
            logger.info(f"Text normalization for {filename}, took {time.time() - normalize_start_time:.2f} seconds, text length: {len(raw_text)}, {log_memory_usage()}")
        except Exception as e:
            logger.warning(f"Text normalization failed for {filename}: {str(e)}, {log_memory_usage()}")

        # Cache and store result
        cache[file_hash] = raw_text
        success_count += 1
        output_json["data"].append(_result_entry(filename, raw_text=raw_text))

        logger.info(f"Total processing for {filename}, took {time.time() - total_start_time:.2f} seconds, {log_memory_usage()}")

    output_json["message"] = f"Processed {len(files)} PDFs. {success_count} succeeded, {fail_count} failed."
    # The request as a whole fails only when no file succeeded.
    if fail_count > 0 and success_count == 0:
        output_json["success"] = False

    logger.info(f"Completed OCR for {len(files)} files, {success_count} succeeded, {fail_count} failed, {log_memory_usage()}")
    return output_json
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
fastapi==0.115.0
uvicorn==0.30.6
# Required by FastAPI to parse multipart/form-data uploads (File / UploadFile).
python-multipart==0.0.9
pytesseract==0.3.13
opencv-python==4.10.0.84
pillow==10.4.0
pdf2image==1.17.0
pypdf==5.0.1
numpy==1.26.4
psutil==6.0.0
cachetools==5.5.0