Soumik Bose committed on
Commit
360683b
·
1 Parent(s): 3a57d82

Initial commit

Browse files
Files changed (1) hide show
  1. main.py +419 -98
main.py CHANGED
@@ -1,3 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import sys
3
  import uuid
@@ -33,17 +354,12 @@ load_dotenv()
33
  class Config:
34
  APP_NAME = os.getenv("APP_NAME", "OCR API")
35
  API_TOKEN = os.getenv("API_BEARER_TOKEN")
36
- MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800))
37
 
38
- # Parse allowed origins from comma-separated string
39
  allowed_origins_raw = os.getenv("ALLOWED_ORIGINS")
40
  ALLOWED_ORIGINS = [origin.strip() for origin in allowed_origins_raw.split(",") if origin.strip()] if allowed_origins_raw else []
41
  ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
42
 
43
- # Validation check on startup
44
- if not Config.API_TOKEN:
45
- print("CRITICAL WARNING: API_BEARER_TOKEN is not set in .env")
46
-
47
  # ==========================================
48
  # 2. LOGGING SETUP
49
  # ==========================================
@@ -74,11 +390,17 @@ class BaseResponse(BaseModel):
74
  status: StatusEnum
75
  message: Optional[str] = None
76
 
 
 
 
 
 
77
  class OCRResult(BaseModel):
78
  filename: str
79
  content_type: str
80
- pages: int
81
- text: str
 
82
 
83
  class APIResponse(BaseResponse):
84
  data: Optional[OCRResult] = None
@@ -93,9 +415,6 @@ class SecurityService:
93
 
94
  @staticmethod
95
  async def validate_token(credentials: HTTPAuthorizationCredentials = Depends(security_scheme)):
96
- """
97
- Validates the Bearer token.
98
- """
99
  if credentials.credentials != Config.API_TOKEN:
100
  logger.warning(f"Auth Failed. Token used: {credentials.credentials[:5]}...")
101
  raise HTTPException(
@@ -116,27 +435,26 @@ class FileValidator:
116
 
117
  @staticmethod
118
  def check_size_and_save(file: UploadFile) -> str:
119
- """
120
- Stream file to disk to check size without loading entire file into RAM.
121
- Returns path to temp file.
122
- """
123
  try:
124
  suffix = Path(file.filename).suffix
125
- # Create a named temp file that persists so Tesseract can read it
126
- with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as buffer:
127
  shutil.copyfileobj(file.file, buffer)
128
  tmp_path = buffer.name
129
 
 
 
 
130
  # Check size
131
- file_size = os.path.getsize(tmp_path)
132
  if file_size > Config.MAX_SIZE:
133
- os.remove(tmp_path)
134
  raise HTTPException(
135
  status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
136
  detail=f"File size exceeds limit of {Config.MAX_SIZE / (1024*1024)}MB"
137
  )
138
 
139
- return tmp_path
140
  except HTTPException:
141
  raise
142
  except Exception as e:
@@ -145,63 +463,83 @@ class FileValidator:
145
 
146
  class OCRProcessor:
147
  @classmethod
148
- def process_file(cls, file_path: str, content_type: str) -> dict:
149
- """
150
- Heavy CPU Logic.
151
- """
152
  start = time.perf_counter()
153
- text = ""
154
- pages = 1
 
 
 
155
 
156
  try:
 
 
157
  if content_type == "application/pdf":
 
 
158
  # Convert PDF to images
159
  images = convert_from_path(file_path)
160
- pages = len(images)
161
- # Extract text from each page
162
- page_texts = []
 
163
  for idx, img in enumerate(images):
164
- page_texts.append(f"--- Page {idx+1} ---\n{pytesseract.image_to_string(img)}")
165
- text = "\n\n".join(page_texts)
 
 
 
 
 
 
 
 
 
 
166
  else:
167
  # Standard Image
168
- text = pytesseract.image_to_string(Image.open(file_path))
169
-
 
 
 
 
 
 
 
 
 
170
  duration = (time.perf_counter() - start) * 1000
171
- logger.info(f"OCR CPU Engine finished in {duration:.2f}ms")
172
 
173
- return {"pages": pages, "text": text}
 
 
 
 
174
  except Exception as e:
175
- logger.error(f"OCR Extraction Error: {str(e)}")
176
- raise ValueError("Failed to extract text from document")
177
 
178
  # ==========================================
179
  # 5. FASTAPI APP INIT
180
  # ==========================================
181
 
182
- app = FastAPI(
183
- title=Config.APP_NAME,
184
- version="1.0.0",
185
- docs_url="/docs", # You can disable this in prod by setting to None
186
- redoc_url=None
187
- )
188
 
189
- # STRICT CORS CONFIGURATION
190
  app.add_middleware(
191
  CORSMiddleware,
192
- allow_origins=Config.ALLOWED_ORIGINS, # Loaded from .env
193
  allow_credentials=True,
194
- allow_methods=["GET", "POST"], # STRICT: Only GET and POST allowed
195
- allow_headers=["Authorization", "Content-Type", "X-Request-ID"], # Allowed headers
196
  )
197
 
198
- # Middleware: Request ID & Logging
199
  @app.middleware("http")
200
  async def request_context_middleware(request: Request, call_next):
201
  req_id = str(uuid.uuid4())
202
  request.state.request_id = req_id
203
 
204
- # Inject ID into logger
205
  old_factory = logging.getLogRecordFactory()
206
  def record_factory(*args, **kwargs):
207
  record = old_factory(*args, **kwargs)
@@ -210,14 +548,13 @@ async def request_context_middleware(request: Request, call_next):
210
  logging.setLogRecordFactory(record_factory)
211
 
212
  start_time = time.perf_counter()
213
- logger.info(f"Incoming Request: {request.method} {request.url.path} | Origin: {request.headers.get('origin', 'unknown')}")
214
 
215
  try:
216
  response = await call_next(request)
217
  process_time = (time.perf_counter() - start_time) * 1000
218
  response.headers["X-Request-ID"] = req_id
219
  response.headers["X-Process-Time"] = f"{process_time:.2f}ms"
220
- logger.info(f"Response: {response.status_code} | Time: {process_time:.2f}ms")
221
  return response
222
  except Exception as e:
223
  logger.exception("Unhandled Exception in Middleware")
@@ -232,7 +569,6 @@ async def request_context_middleware(request: Request, call_next):
232
 
233
  @app.get("/", response_model=BaseResponse)
234
  async def root(request: Request):
235
- """Simple connectivity check."""
236
  return {
237
  "request_id": request.state.request_id,
238
  "process_time_ms": 0,
@@ -240,79 +576,64 @@ async def root(request: Request):
240
  "message": "OCR API is running."
241
  }
242
 
243
- @app.get("/api/v1/ping", response_model=BaseResponse)
244
- async def health_check(request: Request):
245
- """Docker Healthcheck Endpoint."""
246
- return {
247
- "request_id": request.state.request_id,
248
- "process_time_ms": 0,
249
- "status": StatusEnum.SUCCESS,
250
- "message": "OCR API is healthy."
251
- }
252
-
253
  @app.post("/api/v1/get_data", response_model=APIResponse)
254
  async def extract_data(
255
  request: Request,
256
  file: UploadFile = File(...),
257
  token: str = Depends(SecurityService.validate_token)
258
  ):
259
- """
260
- Main OCR Endpoint.
261
- Non-blocking: Offloads OCR to threadpool.
262
- """
263
  start_ts = time.perf_counter()
264
  tmp_file_path = None
 
265
 
266
  try:
267
- # 1. Validate File Type
 
 
268
  FileValidator.validate(file)
269
-
270
- # 2. Save File (IO Bound)
271
  tmp_file_path = FileValidator.check_size_and_save(file)
 
272
 
273
- # 3. Process (CPU Bound) - Run in ThreadPool for Non-Blocking
274
  result = await run_in_threadpool(
275
  OCRProcessor.process_file,
276
  tmp_file_path,
277
- file.content_type
 
278
  )
279
 
 
280
  return {
281
- "request_id": request.state.request_id,
282
  "process_time_ms": (time.perf_counter() - start_ts) * 1000,
283
  "status": StatusEnum.SUCCESS,
284
  "message": "OCR Extraction Successful",
285
  "data": {
286
  "filename": file.filename,
287
  "content_type": file.content_type,
288
- "pages": result["pages"],
289
- "text": result["text"]
 
290
  }
291
  }
292
 
293
- except HTTPException as he:
294
- raise he
295
- except ValueError as ve:
296
- # OCR logic errors
297
- return {
298
- "request_id": request.state.request_id,
299
- "process_time_ms": (time.perf_counter() - start_ts) * 1000,
300
- "status": StatusEnum.ERROR,
301
- "error_message": str(ve)
302
- }
303
  except Exception as e:
304
- logger.error(f"Unexpected API Error: {e}")
305
- return {
306
- "request_id": request.state.request_id,
307
- "process_time_ms": (time.perf_counter() - start_ts) * 1000,
308
- "status": StatusEnum.ERROR,
309
- "error_message": "An unexpected error occurred."
310
- }
 
 
 
 
 
311
  finally:
312
- # Cleanup temp file
313
- if tmp_file_path and os.path.exists(tmp_file_path):
314
- try:
315
- os.remove(tmp_file_path)
316
- logger.info("Temp file deleted.")
317
- except OSError:
318
- pass
 
1
+ # import os
2
+ # import sys
3
+ # import uuid
4
+ # import time
5
+ # import logging
6
+ # import shutil
7
+ # import tempfile
8
+ # from typing import Optional, List
9
+ # from enum import Enum
10
+ # from pathlib import Path
11
+
12
+ # # Third-party imports
13
+ # import uvicorn
14
+ # import pytesseract
15
+ # from fastapi import (
16
+ # FastAPI, File, UploadFile, Depends,
17
+ # HTTPException, Request, status
18
+ # )
19
+ # from fastapi.middleware.cors import CORSMiddleware
20
+ # from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
21
+ # from fastapi.responses import JSONResponse
22
+ # from fastapi.concurrency import run_in_threadpool
23
+ # from pydantic import BaseModel
24
+ # from dotenv import load_dotenv
25
+ # from PIL import Image
26
+ # from pdf2image import convert_from_path
27
+
28
+ # # ==========================================
29
+ # # 1. CONFIGURATION & ENV LOADING
30
+ # # ==========================================
31
+ # load_dotenv()
32
+
33
+ # class Config:
34
+ # APP_NAME = os.getenv("APP_NAME", "OCR API")
35
+ # API_TOKEN = os.getenv("API_BEARER_TOKEN")
36
+ # MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800))
37
+
38
+ # allowed_origins_raw = os.getenv("ALLOWED_ORIGINS")
39
+ # ALLOWED_ORIGINS = [origin.strip() for origin in allowed_origins_raw.split(",") if origin.strip()] if allowed_origins_raw else []
40
+ # ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
41
+
42
+ # if not Config.API_TOKEN:
43
+ # print("CRITICAL WARNING: API_BEARER_TOKEN is not set in .env")
44
+
45
+ # # ==========================================
46
+ # # 2. LOGGING SETUP
47
+ # # ==========================================
48
+ # class RequestIdFilter(logging.Filter):
49
+ # def filter(self, record):
50
+ # if not hasattr(record, 'request_id'):
51
+ # record.request_id = 'system'
52
+ # return True
53
+
54
+ # logging.basicConfig(
55
+ # level=logging.INFO,
56
+ # format='%(asctime)s | %(levelname)s | ReqID:%(request_id)s | %(message)s',
57
+ # datefmt='%Y-%m-%d %H:%M:%S'
58
+ # )
59
+ # logger = logging.getLogger("ocr_api")
60
+ # logger.addFilter(RequestIdFilter())
61
+
62
+ # # ==========================================
63
+ # # 3. PYDANTIC MODELS (UPDATED)
64
+ # # ==========================================
65
+ # class StatusEnum(str, Enum):
66
+ # SUCCESS = "success"
67
+ # ERROR = "error"
68
+
69
+ # class BaseResponse(BaseModel):
70
+ # request_id: str
71
+ # process_time_ms: float
72
+ # status: StatusEnum
73
+ # message: Optional[str] = None
74
+
75
+ # # New model for individual page details
76
+ # class PageResult(BaseModel):
77
+ # index: int
78
+ # page_number: int
79
+ # text: str
80
+
81
+ # # Updated Result model to return a list of pages
82
+ # class OCRResult(BaseModel):
83
+ # filename: str
84
+ # content_type: str
85
+ # total_pages: int
86
+ # pages_content: List[PageResult]
87
+
88
+ # class APIResponse(BaseResponse):
89
+ # data: Optional[OCRResult] = None
90
+ # error_message: Optional[str] = None
91
+
92
+ # # ==========================================
93
+ # # 4. BUSINESS LOGIC SERVICES
94
+ # # ==========================================
95
+
96
+ # class SecurityService:
97
+ # security_scheme = HTTPBearer()
98
+
99
+ # @staticmethod
100
+ # async def validate_token(credentials: HTTPAuthorizationCredentials = Depends(security_scheme)):
101
+ # if credentials.credentials != Config.API_TOKEN:
102
+ # logger.warning(f"Auth Failed. Token used: {credentials.credentials[:5]}...")
103
+ # raise HTTPException(
104
+ # status_code=status.HTTP_401_UNAUTHORIZED,
105
+ # detail="Invalid Bearer Token",
106
+ # headers={"WWW-Authenticate": "Bearer"},
107
+ # )
108
+ # return credentials.credentials
109
+
110
+ # class FileValidator:
111
+ # @staticmethod
112
+ # def validate(file: UploadFile):
113
+ # if file.content_type not in Config.ALLOWED_TYPES:
114
+ # raise HTTPException(
115
+ # status_code=status.HTTP_400_BAD_REQUEST,
116
+ # detail=f"Invalid file type. Allowed: {Config.ALLOWED_TYPES}"
117
+ # )
118
+
119
+ # @staticmethod
120
+ # def check_size_and_save(file: UploadFile) -> str:
121
+ # try:
122
+ # suffix = Path(file.filename).suffix
123
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as buffer:
124
+ # shutil.copyfileobj(file.file, buffer)
125
+ # tmp_path = buffer.name
126
+
127
+ # file_size = os.path.getsize(tmp_path)
128
+ # if file_size > Config.MAX_SIZE:
129
+ # os.remove(tmp_path)
130
+ # raise HTTPException(
131
+ # status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
132
+ # detail=f"File size exceeds limit of {Config.MAX_SIZE / (1024*1024)}MB"
133
+ # )
134
+
135
+ # return tmp_path
136
+ # except HTTPException:
137
+ # raise
138
+ # except Exception as e:
139
+ # logger.error(f"File save error: {e}")
140
+ # raise HTTPException(status.HTTP_500_INTERNAL_SERVER_ERROR, "File upload failed")
141
+
142
+ # class OCRProcessor:
143
+ # @classmethod
144
+ # def process_file(cls, file_path: str, content_type: str, request_id: str) -> dict:
145
+ # """
146
+ # Extracts text per page and logs every step.
147
+ # """
148
+ # start = time.perf_counter()
149
+ # pages_content = []
150
+
151
+ # # Helper to inject request_id into logs inside the thread
152
+ # def log_op(msg):
153
+ # logger.info(f"[Thread-Op] {msg}", extra={'request_id': request_id})
154
+
155
+ # try:
156
+ # log_op(f"Starting processing for file type: {content_type}")
157
+
158
+ # if content_type == "application/pdf":
159
+ # log_op("Initiating PDF to Image conversion...")
160
+
161
+ # # Convert PDF to images
162
+ # images = convert_from_path(file_path)
163
+ # total_pages = len(images)
164
+
165
+ # log_op(f"PDF converted successfully. Total pages found: {total_pages}")
166
+
167
+ # # Iterate through pages
168
+ # for idx, img in enumerate(images):
169
+ # page_num = idx + 1
170
+ # log_op(f"Processing Page {page_num}/{total_pages} - extracting text...")
171
+
172
+ # extracted_text = pytesseract.image_to_string(img).strip()
173
+
174
+ # log_op(f"Completed Page {page_num}/{total_pages} - Text length: {len(extracted_text)} chars")
175
+
176
+ # pages_content.append({
177
+ # "index": idx,
178
+ # "page_number": page_num,
179
+ # "text": extracted_text
180
+ # })
181
+ # else:
182
+ # # Standard Image
183
+ # log_op("Processing single image file...")
184
+ # img = Image.open(file_path)
185
+ # text = pytesseract.image_to_string(img).strip()
186
+ # log_op("Image text extraction complete.")
187
+
188
+ # pages_content.append({
189
+ # "index": 0,
190
+ # "page_number": 1,
191
+ # "text": text
192
+ # })
193
+
194
+ # duration = (time.perf_counter() - start) * 1000
195
+ # log_op(f"OCR Engine finished all tasks in {duration:.2f}ms")
196
+
197
+ # return {
198
+ # "total_pages": len(pages_content),
199
+ # "pages_content": pages_content
200
+ # }
201
+
202
+ # except Exception as e:
203
+ # log_op(f"OCR Extraction Error: {str(e)}")
204
+ # raise ValueError("Failed to extract text from document")
205
+
206
+ # # ==========================================
207
+ # # 5. FASTAPI APP INIT
208
+ # # ==========================================
209
+
210
+ # app = FastAPI(title=Config.APP_NAME)
211
+
212
+ # app.add_middleware(
213
+ # CORSMiddleware,
214
+ # allow_origins=Config.ALLOWED_ORIGINS,
215
+ # allow_credentials=True,
216
+ # allow_methods=["GET", "POST"],
217
+ # allow_headers=["Authorization", "Content-Type", "X-Request-ID"],
218
+ # )
219
+
220
+ # @app.middleware("http")
221
+ # async def request_context_middleware(request: Request, call_next):
222
+ # req_id = str(uuid.uuid4())
223
+ # request.state.request_id = req_id
224
+
225
+ # old_factory = logging.getLogRecordFactory()
226
+ # def record_factory(*args, **kwargs):
227
+ # record = old_factory(*args, **kwargs)
228
+ # record.request_id = req_id
229
+ # return record
230
+ # logging.setLogRecordFactory(record_factory)
231
+
232
+ # start_time = time.perf_counter()
233
+ # logger.info(f"Incoming Request: {request.method} {request.url.path}")
234
+
235
+ # try:
236
+ # response = await call_next(request)
237
+ # process_time = (time.perf_counter() - start_time) * 1000
238
+ # response.headers["X-Request-ID"] = req_id
239
+ # response.headers["X-Process-Time"] = f"{process_time:.2f}ms"
240
+ # return response
241
+ # except Exception as e:
242
+ # logger.exception("Unhandled Exception in Middleware")
243
+ # return JSONResponse(
244
+ # status_code=500,
245
+ # content={"status": "error", "message": "Internal Server Error", "request_id": req_id}
246
+ # )
247
+
248
+ # # ==========================================
249
+ # # 6. ENDPOINTS
250
+ # # ==========================================
251
+
252
+ # @app.get("/", response_model=BaseResponse)
253
+ # async def root(request: Request):
254
+ # return {
255
+ # "request_id": request.state.request_id,
256
+ # "process_time_ms": 0,
257
+ # "status": StatusEnum.SUCCESS,
258
+ # "message": "OCR API is running."
259
+ # }
260
+
261
+ # @app.post("/api/v1/get_data", response_model=APIResponse)
262
+ # async def extract_data(
263
+ # request: Request,
264
+ # file: UploadFile = File(...),
265
+ # token: str = Depends(SecurityService.validate_token)
266
+ # ):
267
+ # start_ts = time.perf_counter()
268
+ # tmp_file_path = None
269
+ # req_id = request.state.request_id
270
+
271
+ # try:
272
+ # # 1. Log Upload
273
+ # logger.info(f"Received file: {file.filename}, Content-Type: {file.content_type}")
274
+
275
+ # FileValidator.validate(file)
276
+
277
+ # # 2. Save File
278
+ # logger.info("Saving temporary file to disk...")
279
+ # tmp_file_path = FileValidator.check_size_and_save(file)
280
+ # logger.info(f"File saved at {tmp_file_path}. Sending to OCR thread.")
281
+
282
+ # # 3. Process
283
+ # result = await run_in_threadpool(
284
+ # OCRProcessor.process_file,
285
+ # tmp_file_path,
286
+ # file.content_type,
287
+ # req_id # Pass ID explicitly for thread logging
288
+ # )
289
+
290
+ # return {
291
+ # "request_id": req_id,
292
+ # "process_time_ms": (time.perf_counter() - start_ts) * 1000,
293
+ # "status": StatusEnum.SUCCESS,
294
+ # "message": "OCR Extraction Successful",
295
+ # "data": {
296
+ # "filename": file.filename,
297
+ # "content_type": file.content_type,
298
+ # "total_pages": result["total_pages"],
299
+ # "pages_content": result["pages_content"]
300
+ # }
301
+ # }
302
+
303
+ # except Exception as e:
304
+ # logger.error(f"Processing failed: {str(e)}")
305
+ # status_code = getattr(e, "status_code", 500)
306
+ # return JSONResponse(
307
+ # status_code=status_code,
308
+ # content={
309
+ # "request_id": req_id,
310
+ # "process_time_ms": (time.perf_counter() - start_ts) * 1000,
311
+ # "status": StatusEnum.ERROR,
312
+ # "error_message": getattr(e, "detail", str(e))
313
+ # }
314
+ # )
315
+ # finally:
316
+ # if tmp_file_path and os.path.exists(tmp_file_path):
317
+ # os.remove(tmp_file_path)
318
+ # logger.info("Temporary file cleaned up.")
319
+
320
+
321
+
322
  import os
323
  import sys
324
  import uuid
 
354
  class Config:
355
  APP_NAME = os.getenv("APP_NAME", "OCR API")
356
  API_TOKEN = os.getenv("API_BEARER_TOKEN")
357
+ MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800)) # 50MB
358
 
 
359
  allowed_origins_raw = os.getenv("ALLOWED_ORIGINS")
360
  ALLOWED_ORIGINS = [origin.strip() for origin in allowed_origins_raw.split(",") if origin.strip()] if allowed_origins_raw else []
361
  ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
362
 
 
 
 
 
363
  # ==========================================
364
  # 2. LOGGING SETUP
365
  # ==========================================
 
390
  status: StatusEnum
391
  message: Optional[str] = None
392
 
393
+ class PageResult(BaseModel):
394
+ index: int
395
+ page_number: int
396
+ text: str
397
+
398
  class OCRResult(BaseModel):
399
  filename: str
400
  content_type: str
401
+ saved_file_path: str # <--- NEW FIELD
402
+ total_pages: int
403
+ pages_content: List[PageResult]
404
 
405
  class APIResponse(BaseResponse):
406
  data: Optional[OCRResult] = None
 
415
 
416
  @staticmethod
417
  async def validate_token(credentials: HTTPAuthorizationCredentials = Depends(security_scheme)):
 
 
 
418
  if credentials.credentials != Config.API_TOKEN:
419
  logger.warning(f"Auth Failed. Token used: {credentials.credentials[:5]}...")
420
  raise HTTPException(
 
435
 
436
  @staticmethod
437
  def check_size_and_save(file: UploadFile) -> str:
 
 
 
 
438
  try:
439
  suffix = Path(file.filename).suffix
440
+ # Create a named temp file that persists (delete=False)
441
+ with tempfile.NamedTemporaryFile(delete=False, suffix=suffix, dir=tempfile.gettempdir()) as buffer:
442
  shutil.copyfileobj(file.file, buffer)
443
  tmp_path = buffer.name
444
 
445
+ # Return absolute path
446
+ abs_path = os.path.abspath(tmp_path)
447
+
448
  # Check size
449
+ file_size = os.path.getsize(abs_path)
450
  if file_size > Config.MAX_SIZE:
451
+ os.remove(abs_path)
452
  raise HTTPException(
453
  status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
454
  detail=f"File size exceeds limit of {Config.MAX_SIZE / (1024*1024)}MB"
455
  )
456
 
457
+ return abs_path
458
  except HTTPException:
459
  raise
460
  except Exception as e:
 
463
 
464
  class OCRProcessor:
465
  @classmethod
466
+ def process_file(cls, file_path: str, content_type: str, request_id: str) -> dict:
 
 
 
467
  start = time.perf_counter()
468
+ pages_content = []
469
+
470
+ # Helper for threaded logging
471
+ def log_op(msg):
472
+ logger.info(f"[Thread-Op] {msg}", extra={'request_id': request_id})
473
 
474
  try:
475
+ log_op(f"Starting OCR processing. File located at: {file_path}")
476
+
477
  if content_type == "application/pdf":
478
+ log_op("Reading PDF file and converting to images...")
479
+
480
  # Convert PDF to images
481
  images = convert_from_path(file_path)
482
+ total_pages = len(images)
483
+
484
+ log_op(f"PDF Conversion Success. Total Pages: {total_pages}")
485
+
486
  for idx, img in enumerate(images):
487
+ page_num = idx + 1
488
+ log_op(f"Processing Page {page_num}/{total_pages}...")
489
+
490
+ extracted_text = pytesseract.image_to_string(img).strip()
491
+
492
+ log_op(f"Page {page_num} Done. Extracted {len(extracted_text)} characters.")
493
+
494
+ pages_content.append({
495
+ "index": idx,
496
+ "page_number": page_num,
497
+ "text": extracted_text
498
+ })
499
  else:
500
  # Standard Image
501
+ log_op("Processing single image file...")
502
+ img = Image.open(file_path)
503
+ text = pytesseract.image_to_string(img).strip()
504
+ log_op(f"Image processing complete. Extracted {len(text)} characters.")
505
+
506
+ pages_content.append({
507
+ "index": 0,
508
+ "page_number": 1,
509
+ "text": text
510
+ })
511
+
512
  duration = (time.perf_counter() - start) * 1000
513
+ log_op(f"OCR CPU Task finished in {duration:.2f}ms")
514
 
515
+ return {
516
+ "total_pages": len(pages_content),
517
+ "pages_content": pages_content
518
+ }
519
+
520
  except Exception as e:
521
+ log_op(f"OCR Extraction Logic Error: {str(e)}")
522
+ raise ValueError(f"Failed to process document: {str(e)}")
523
 
524
  # ==========================================
525
  # 5. FASTAPI APP INIT
526
  # ==========================================
527
 
528
+ app = FastAPI(title=Config.APP_NAME)
 
 
 
 
 
529
 
 
530
  app.add_middleware(
531
  CORSMiddleware,
532
+ allow_origins=Config.ALLOWED_ORIGINS,
533
  allow_credentials=True,
534
+ allow_methods=["GET", "POST"],
535
+ allow_headers=["Authorization", "Content-Type", "X-Request-ID"],
536
  )
537
 
 
538
  @app.middleware("http")
539
  async def request_context_middleware(request: Request, call_next):
540
  req_id = str(uuid.uuid4())
541
  request.state.request_id = req_id
542
 
 
543
  old_factory = logging.getLogRecordFactory()
544
  def record_factory(*args, **kwargs):
545
  record = old_factory(*args, **kwargs)
 
548
  logging.setLogRecordFactory(record_factory)
549
 
550
  start_time = time.perf_counter()
551
+ logger.info(f"Incoming Request: {request.method} {request.url.path}")
552
 
553
  try:
554
  response = await call_next(request)
555
  process_time = (time.perf_counter() - start_time) * 1000
556
  response.headers["X-Request-ID"] = req_id
557
  response.headers["X-Process-Time"] = f"{process_time:.2f}ms"
 
558
  return response
559
  except Exception as e:
560
  logger.exception("Unhandled Exception in Middleware")
 
569
 
570
  @app.get("/", response_model=BaseResponse)
571
  async def root(request: Request):
 
572
  return {
573
  "request_id": request.state.request_id,
574
  "process_time_ms": 0,
 
576
  "message": "OCR API is running."
577
  }
578
 
 
 
 
 
 
 
 
 
 
 
579
  @app.post("/api/v1/get_data", response_model=APIResponse)
580
  async def extract_data(
581
  request: Request,
582
  file: UploadFile = File(...),
583
  token: str = Depends(SecurityService.validate_token)
584
  ):
 
 
 
 
585
  start_ts = time.perf_counter()
586
  tmp_file_path = None
587
+ req_id = request.state.request_id
588
 
589
  try:
590
+ logger.info(f"Received Upload: {file.filename} ({file.content_type})")
591
+
592
+ # 1. Validate
593
  FileValidator.validate(file)
594
+
595
+ # 2. Save (Disk IO)
596
  tmp_file_path = FileValidator.check_size_and_save(file)
597
+ logger.info(f"File successfully saved to disk at: {tmp_file_path}")
598
 
599
+ # 3. Process (CPU Bound) - Run in ThreadPool
600
  result = await run_in_threadpool(
601
  OCRProcessor.process_file,
602
  tmp_file_path,
603
+ file.content_type,
604
+ req_id
605
  )
606
 
607
+ # 4. Return Response (File is NOT deleted)
608
  return {
609
+ "request_id": req_id,
610
  "process_time_ms": (time.perf_counter() - start_ts) * 1000,
611
  "status": StatusEnum.SUCCESS,
612
  "message": "OCR Extraction Successful",
613
  "data": {
614
  "filename": file.filename,
615
  "content_type": file.content_type,
616
+ "saved_file_path": tmp_file_path, # Returning the full path
617
+ "total_pages": result["total_pages"],
618
+ "pages_content": result["pages_content"]
619
  }
620
  }
621
 
 
 
 
 
 
 
 
 
 
 
622
  except Exception as e:
623
+ logger.error(f"Request failed: {str(e)}")
624
+ # Even on error, we might want to keep the file for debugging if it was saved
625
+ status_code = getattr(e, "status_code", 500)
626
+ return JSONResponse(
627
+ status_code=status_code,
628
+ content={
629
+ "request_id": req_id,
630
+ "process_time_ms": (time.perf_counter() - start_ts) * 1000,
631
+ "status": StatusEnum.ERROR,
632
+ "error_message": getattr(e, "detail", str(e))
633
+ }
634
+ )
635
  finally:
636
+ # User requested NOT to remove the temp file.
637
+ # Logging that file persists for clarity.
638
+ if tmp_file_path:
639
+ logger.info(f"Request complete. File preserved at: {tmp_file_path}")