Soumik Bose committed on
Commit
b267a19
·
1 Parent(s): 608b1fc
Files changed (2) hide show
  1. Dockerfile +7 -18
  2. main.py +66 -454
Dockerfile CHANGED
@@ -1,11 +1,8 @@
1
- # UPDATE: Changed from 3.9 to 3.11 to support newer NumPy/Pandas versions
2
  FROM python:3.11-slim
3
 
4
- # Set the working directory in the container
5
  WORKDIR /app
6
 
7
  # Install system dependencies
8
- # Added 'curl' explicitly to the list below
9
  RUN apt-get update && apt-get install -y \
10
  curl \
11
  tesseract-ocr \
@@ -24,33 +21,25 @@ RUN apt-get update && apt-get install -y \
24
  libglib2.0-0 \
25
  && rm -rf /var/lib/apt/lists/*
26
 
27
- # Upgrade pip, setuptools, and wheel before installing deps
 
 
 
 
 
28
  COPY requirements.txt .
29
  RUN pip install --upgrade pip setuptools wheel \
30
  && pip install --default-timeout=100 --retries=10 --no-cache-dir -r requirements.txt
31
 
32
- # Copy the current directory contents into the container
33
  COPY . .
34
 
35
- # Create a non-root user for security (Production Best Practice)
36
  RUN useradd -m appuser && chown -R appuser /app
37
  USER appuser
38
 
39
- # Set environment variables
40
- ENV HF_HOME=/tmp/cache
41
- ENV PORT=7860
42
-
43
- # Create cache directory (if still needed)
44
  RUN mkdir -p ${HF_HOME} && chmod 777 ${HF_HOME}
45
 
46
- # Expose port
47
  EXPOSE $PORT
48
 
49
- # Run FastAPI with Uvicorn
50
- # The curl command will now work because the package is installed
51
  CMD bash -c "\
52
- while true; do \
53
- curl -s https://xce009-ocr-api.hf.space >/dev/null; \
54
- sleep 300; \
55
- done & \
56
  uvicorn main:app --host 0.0.0.0 --port ${PORT} --workers 4"
 
 
1
  FROM python:3.11-slim
2
 
 
3
  WORKDIR /app
4
 
5
  # Install system dependencies
 
6
  RUN apt-get update && apt-get install -y \
7
  curl \
8
  tesseract-ocr \
 
21
  libglib2.0-0 \
22
  && rm -rf /var/lib/apt/lists/*
23
 
24
+ # Fix: Ensure logs appear immediately in the console
25
+ ENV PYTHONUNBUFFERED=1
26
+ ENV PYTHONIOENCODING=UTF-8
27
+ ENV HF_HOME=/tmp/cache
28
+ ENV PORT=7860
29
+
30
  COPY requirements.txt .
31
  RUN pip install --upgrade pip setuptools wheel \
32
  && pip install --default-timeout=100 --retries=10 --no-cache-dir -r requirements.txt
33
 
 
34
  COPY . .
35
 
 
36
  RUN useradd -m appuser && chown -R appuser /app
37
  USER appuser
38
 
 
 
 
 
 
39
  RUN mkdir -p ${HF_HOME} && chmod 777 ${HF_HOME}
40
 
 
41
  EXPOSE $PORT
42
 
 
 
43
  CMD bash -c "\
44
+ (while true; do curl -s https://xce009-ocr-api.hf.space >/dev/null; sleep 300; done) & \
 
 
 
45
  uvicorn main:app --host 0.0.0.0 --port ${PORT} --workers 4"
main.py CHANGED
@@ -1,326 +1,4 @@
1
- # import os
2
- # import sys
3
- # import uuid
4
- # import time
5
- # import logging
6
- # import shutil
7
- # import tempfile
8
- # from typing import Optional, List
9
- # from enum import Enum
10
- # from pathlib import Path
11
-
12
- # # Third-party imports
13
- # import uvicorn
14
- # import pytesseract
15
- # from fastapi import (
16
- # FastAPI, File, UploadFile, Depends,
17
- # HTTPException, Request, status
18
- # )
19
- # from fastapi.middleware.cors import CORSMiddleware
20
- # from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
21
- # from fastapi.responses import JSONResponse
22
- # from fastapi.concurrency import run_in_threadpool
23
- # from pydantic import BaseModel
24
- # from dotenv import load_dotenv
25
- # from PIL import Image
26
- # from pdf2image import convert_from_path
27
-
28
- # # ==========================================
29
- # # 1. CONFIGURATION & ENV LOADING
30
- # # ==========================================
31
- # load_dotenv()
32
-
33
- # class Config:
34
- # APP_NAME = os.getenv("APP_NAME", "OCR API")
35
- # API_TOKEN = os.getenv("API_BEARER_TOKEN")
36
- # MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800))
37
-
38
- # allowed_origins_raw = os.getenv("ALLOWED_ORIGINS")
39
- # ALLOWED_ORIGINS = [origin.strip() for origin in allowed_origins_raw.split(",") if origin.strip()] if allowed_origins_raw else []
40
- # ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
41
-
42
- # if not Config.API_TOKEN:
43
- # print("CRITICAL WARNING: API_BEARER_TOKEN is not set in .env")
44
-
45
- # # ==========================================
46
- # # 2. LOGGING SETUP
47
- # # ==========================================
48
- # class RequestIdFilter(logging.Filter):
49
- # def filter(self, record):
50
- # if not hasattr(record, 'request_id'):
51
- # record.request_id = 'system'
52
- # return True
53
-
54
- # logging.basicConfig(
55
- # level=logging.INFO,
56
- # format='%(asctime)s | %(levelname)s | ReqID:%(request_id)s | %(message)s',
57
- # datefmt='%Y-%m-%d %H:%M:%S'
58
- # )
59
- # logger = logging.getLogger("ocr_api")
60
- # logger.addFilter(RequestIdFilter())
61
-
62
- # # ==========================================
63
- # # 3. PYDANTIC MODELS (UPDATED)
64
- # # ==========================================
65
- # class StatusEnum(str, Enum):
66
- # SUCCESS = "success"
67
- # ERROR = "error"
68
-
69
- # class BaseResponse(BaseModel):
70
- # request_id: str
71
- # process_time_ms: float
72
- # status: StatusEnum
73
- # message: Optional[str] = None
74
-
75
- # # New model for individual page details
76
- # class PageResult(BaseModel):
77
- # index: int
78
- # page_number: int
79
- # text: str
80
-
81
- # # Updated Result model to return a list of pages
82
- # class OCRResult(BaseModel):
83
- # filename: str
84
- # content_type: str
85
- # total_pages: int
86
- # pages_content: List[PageResult]
87
-
88
- # class APIResponse(BaseResponse):
89
- # data: Optional[OCRResult] = None
90
- # error_message: Optional[str] = None
91
-
92
- # # ==========================================
93
- # # 4. BUSINESS LOGIC SERVICES
94
- # # ==========================================
95
-
96
- # class SecurityService:
97
- # security_scheme = HTTPBearer()
98
-
99
- # @staticmethod
100
- # async def validate_token(credentials: HTTPAuthorizationCredentials = Depends(security_scheme)):
101
- # if credentials.credentials != Config.API_TOKEN:
102
- # logger.warning(f"Auth Failed. Token used: {credentials.credentials[:5]}...")
103
- # raise HTTPException(
104
- # status_code=status.HTTP_401_UNAUTHORIZED,
105
- # detail="Invalid Bearer Token",
106
- # headers={"WWW-Authenticate": "Bearer"},
107
- # )
108
- # return credentials.credentials
109
-
110
- # class FileValidator:
111
- # @staticmethod
112
- # def validate(file: UploadFile):
113
- # if file.content_type not in Config.ALLOWED_TYPES:
114
- # raise HTTPException(
115
- # status_code=status.HTTP_400_BAD_REQUEST,
116
- # detail=f"Invalid file type. Allowed: {Config.ALLOWED_TYPES}"
117
- # )
118
-
119
- # @staticmethod
120
- # def check_size_and_save(file: UploadFile) -> str:
121
- # try:
122
- # suffix = Path(file.filename).suffix
123
- # with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as buffer:
124
- # shutil.copyfileobj(file.file, buffer)
125
- # tmp_path = buffer.name
126
-
127
- # file_size = os.path.getsize(tmp_path)
128
- # if file_size > Config.MAX_SIZE:
129
- # os.remove(tmp_path)
130
- # raise HTTPException(
131
- # status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
132
- # detail=f"File size exceeds limit of {Config.MAX_SIZE / (1024*1024)}MB"
133
- # )
134
-
135
- # return tmp_path
136
- # except HTTPException:
137
- # raise
138
- # except Exception as e:
139
- # logger.error(f"File save error: {e}")
140
- # raise HTTPException(status.HTTP_500_INTERNAL_SERVER_ERROR, "File upload failed")
141
-
142
- # class OCRProcessor:
143
- # @classmethod
144
- # def process_file(cls, file_path: str, content_type: str, request_id: str) -> dict:
145
- # """
146
- # Extracts text per page and logs every step.
147
- # """
148
- # start = time.perf_counter()
149
- # pages_content = []
150
-
151
- # # Helper to inject request_id into logs inside the thread
152
- # def log_op(msg):
153
- # logger.info(f"[Thread-Op] {msg}", extra={'request_id': request_id})
154
-
155
- # try:
156
- # log_op(f"Starting processing for file type: {content_type}")
157
-
158
- # if content_type == "application/pdf":
159
- # log_op("Initiating PDF to Image conversion...")
160
-
161
- # # Convert PDF to images
162
- # images = convert_from_path(file_path)
163
- # total_pages = len(images)
164
-
165
- # log_op(f"PDF converted successfully. Total pages found: {total_pages}")
166
-
167
- # # Iterate through pages
168
- # for idx, img in enumerate(images):
169
- # page_num = idx + 1
170
- # log_op(f"Processing Page {page_num}/{total_pages} - extracting text...")
171
-
172
- # extracted_text = pytesseract.image_to_string(img).strip()
173
-
174
- # log_op(f"Completed Page {page_num}/{total_pages} - Text length: {len(extracted_text)} chars")
175
-
176
- # pages_content.append({
177
- # "index": idx,
178
- # "page_number": page_num,
179
- # "text": extracted_text
180
- # })
181
- # else:
182
- # # Standard Image
183
- # log_op("Processing single image file...")
184
- # img = Image.open(file_path)
185
- # text = pytesseract.image_to_string(img).strip()
186
- # log_op("Image text extraction complete.")
187
-
188
- # pages_content.append({
189
- # "index": 0,
190
- # "page_number": 1,
191
- # "text": text
192
- # })
193
-
194
- # duration = (time.perf_counter() - start) * 1000
195
- # log_op(f"OCR Engine finished all tasks in {duration:.2f}ms")
196
-
197
- # return {
198
- # "total_pages": len(pages_content),
199
- # "pages_content": pages_content
200
- # }
201
-
202
- # except Exception as e:
203
- # log_op(f"OCR Extraction Error: {str(e)}")
204
- # raise ValueError("Failed to extract text from document")
205
-
206
- # # ==========================================
207
- # # 5. FASTAPI APP INIT
208
- # # ==========================================
209
-
210
- # app = FastAPI(title=Config.APP_NAME)
211
-
212
- # app.add_middleware(
213
- # CORSMiddleware,
214
- # allow_origins=Config.ALLOWED_ORIGINS,
215
- # allow_credentials=True,
216
- # allow_methods=["GET", "POST"],
217
- # allow_headers=["Authorization", "Content-Type", "X-Request-ID"],
218
- # )
219
-
220
- # @app.middleware("http")
221
- # async def request_context_middleware(request: Request, call_next):
222
- # req_id = str(uuid.uuid4())
223
- # request.state.request_id = req_id
224
-
225
- # old_factory = logging.getLogRecordFactory()
226
- # def record_factory(*args, **kwargs):
227
- # record = old_factory(*args, **kwargs)
228
- # record.request_id = req_id
229
- # return record
230
- # logging.setLogRecordFactory(record_factory)
231
-
232
- # start_time = time.perf_counter()
233
- # logger.info(f"Incoming Request: {request.method} {request.url.path}")
234
-
235
- # try:
236
- # response = await call_next(request)
237
- # process_time = (time.perf_counter() - start_time) * 1000
238
- # response.headers["X-Request-ID"] = req_id
239
- # response.headers["X-Process-Time"] = f"{process_time:.2f}ms"
240
- # return response
241
- # except Exception as e:
242
- # logger.exception("Unhandled Exception in Middleware")
243
- # return JSONResponse(
244
- # status_code=500,
245
- # content={"status": "error", "message": "Internal Server Error", "request_id": req_id}
246
- # )
247
-
248
- # # ==========================================
249
- # # 6. ENDPOINTS
250
- # # ==========================================
251
-
252
- # @app.get("/", response_model=BaseResponse)
253
- # async def root(request: Request):
254
- # return {
255
- # "request_id": request.state.request_id,
256
- # "process_time_ms": 0,
257
- # "status": StatusEnum.SUCCESS,
258
- # "message": "OCR API is running."
259
- # }
260
-
261
- # @app.post("/api/v1/get_data", response_model=APIResponse)
262
- # async def extract_data(
263
- # request: Request,
264
- # file: UploadFile = File(...),
265
- # token: str = Depends(SecurityService.validate_token)
266
- # ):
267
- # start_ts = time.perf_counter()
268
- # tmp_file_path = None
269
- # req_id = request.state.request_id
270
-
271
- # try:
272
- # # 1. Log Upload
273
- # logger.info(f"Received file: {file.filename}, Content-Type: {file.content_type}")
274
-
275
- # FileValidator.validate(file)
276
-
277
- # # 2. Save File
278
- # logger.info("Saving temporary file to disk...")
279
- # tmp_file_path = FileValidator.check_size_and_save(file)
280
- # logger.info(f"File saved at {tmp_file_path}. Sending to OCR thread.")
281
-
282
- # # 3. Process
283
- # result = await run_in_threadpool(
284
- # OCRProcessor.process_file,
285
- # tmp_file_path,
286
- # file.content_type,
287
- # req_id # Pass ID explicitly for thread logging
288
- # )
289
-
290
- # return {
291
- # "request_id": req_id,
292
- # "process_time_ms": (time.perf_counter() - start_ts) * 1000,
293
- # "status": StatusEnum.SUCCESS,
294
- # "message": "OCR Extraction Successful",
295
- # "data": {
296
- # "filename": file.filename,
297
- # "content_type": file.content_type,
298
- # "total_pages": result["total_pages"],
299
- # "pages_content": result["pages_content"]
300
- # }
301
- # }
302
-
303
- # except Exception as e:
304
- # logger.error(f"Processing failed: {str(e)}")
305
- # status_code = getattr(e, "status_code", 500)
306
- # return JSONResponse(
307
- # status_code=status_code,
308
- # content={
309
- # "request_id": req_id,
310
- # "process_time_ms": (time.perf_counter() - start_ts) * 1000,
311
- # "status": StatusEnum.ERROR,
312
- # "error_message": getattr(e, "detail", str(e))
313
- # }
314
- # )
315
- # finally:
316
- # if tmp_file_path and os.path.exists(tmp_file_path):
317
- # os.remove(tmp_file_path)
318
- # logger.info("Temporary file cleaned up.")
319
-
320
-
321
-
322
  import os
323
- import sys
324
  import uuid
325
  import time
326
  import logging
@@ -329,6 +7,7 @@ import tempfile
329
  from typing import Optional, List
330
  from enum import Enum
331
  from pathlib import Path
 
332
 
333
  # Third-party imports
334
  import uvicorn
@@ -347,38 +26,37 @@ from PIL import Image
347
  from pdf2image import convert_from_path
348
 
349
  # ==========================================
350
- # 1. CONFIGURATION & ENV LOADING
351
  # ==========================================
352
  load_dotenv()
353
 
 
 
 
354
  class Config:
355
  APP_NAME = os.getenv("APP_NAME", "OCR API")
356
  API_TOKEN = os.getenv("API_BEARER_TOKEN")
357
  MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800)) # 50MB
358
-
359
- allowed_origins_raw = os.getenv("ALLOWED_ORIGINS")
360
- ALLOWED_ORIGINS = [origin.strip() for origin in allowed_origins_raw.split(",") if origin.strip()] if allowed_origins_raw else []
361
  ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
362
 
363
- # ==========================================
364
- # 2. LOGGING SETUP
365
- # ==========================================
366
  class RequestIdFilter(logging.Filter):
367
  def filter(self, record):
368
- if not hasattr(record, 'request_id'):
369
- record.request_id = 'system'
370
  return True
371
 
372
  logging.basicConfig(
373
  level=logging.INFO,
374
  format='%(asctime)s | %(levelname)s | ReqID:%(request_id)s | %(message)s',
375
- datefmt='%Y-%m-%d %H:%M:%S'
 
376
  )
377
  logger = logging.getLogger("ocr_api")
378
  logger.addFilter(RequestIdFilter())
379
 
380
  # ==========================================
381
- # 3. PYDANTIC MODELS
382
  # ==========================================
383
  class StatusEnum(str, Enum):
384
  SUCCESS = "success"
@@ -398,7 +76,7 @@ class PageResult(BaseModel):
398
  class OCRResult(BaseModel):
399
  filename: str
400
  content_type: str
401
- saved_file_path: str # <--- NEW FIELD
402
  total_pages: int
403
  pages_content: List[PageResult]
404
 
@@ -407,7 +85,7 @@ class APIResponse(BaseResponse):
407
  error_message: Optional[str] = None
408
 
409
  # ==========================================
410
- # 4. BUSINESS LOGIC SERVICES
411
  # ==========================================
412
 
413
  class SecurityService:
@@ -416,164 +94,108 @@ class SecurityService:
416
  @staticmethod
417
  async def validate_token(credentials: HTTPAuthorizationCredentials = Depends(security_scheme)):
418
  if credentials.credentials != Config.API_TOKEN:
419
- logger.warning(f"Auth Failed. Token used: {credentials.credentials[:5]}...")
420
- raise HTTPException(
421
- status_code=status.HTTP_401_UNAUTHORIZED,
422
- detail="Invalid Bearer Token",
423
- headers={"WWW-Authenticate": "Bearer"},
424
- )
425
  return credentials.credentials
426
 
427
  class FileValidator:
428
  @staticmethod
429
  def validate(file: UploadFile):
430
  if file.content_type not in Config.ALLOWED_TYPES:
431
- raise HTTPException(
432
- status_code=status.HTTP_400_BAD_REQUEST,
433
- detail=f"Invalid file type. Allowed: {Config.ALLOWED_TYPES}"
434
- )
435
 
436
  @staticmethod
437
  def check_size_and_save(file: UploadFile) -> str:
438
- try:
439
- suffix = Path(file.filename).suffix
440
- # Create a named temp file that persists (delete=False)
441
- with tempfile.NamedTemporaryFile(delete=False, suffix=suffix, dir=tempfile.gettempdir()) as buffer:
442
- shutil.copyfileobj(file.file, buffer)
443
- tmp_path = buffer.name
444
-
445
- # Return absolute path
446
- abs_path = os.path.abspath(tmp_path)
447
-
448
- # Check size
449
- file_size = os.path.getsize(abs_path)
450
- if file_size > Config.MAX_SIZE:
451
- os.remove(abs_path)
452
- raise HTTPException(
453
- status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
454
- detail=f"File size exceeds limit of {Config.MAX_SIZE / (1024*1024)}MB"
455
- )
456
-
457
- return abs_path
458
- except HTTPException:
459
- raise
460
- except Exception as e:
461
- logger.error(f"File save error: {e}")
462
- raise HTTPException(status.HTTP_500_INTERNAL_SERVER_ERROR, "File upload failed")
463
 
464
  class OCRProcessor:
465
  @classmethod
466
- def process_file(cls, file_path: str, content_type: str, request_id: str) -> dict:
 
467
  start = time.perf_counter()
468
  pages_content = []
469
 
470
- # Helper for threaded logging
471
- def log_op(msg):
472
- logger.info(f"[Thread-Op] {msg}", extra={'request_id': request_id})
473
-
474
  try:
475
- log_op(f"Starting OCR processing. File located at: {file_path}")
476
 
477
  if content_type == "application/pdf":
478
- log_op("Reading PDF file and converting to images...")
479
-
480
- # Convert PDF to images
481
  images = convert_from_path(file_path)
482
- total_pages = len(images)
483
 
484
- log_op(f"PDF Conversion Success. Total Pages: {total_pages}")
485
-
486
  for idx, img in enumerate(images):
487
  page_num = idx + 1
488
- log_op(f"Processing Page {page_num}/{total_pages}...")
489
-
490
- extracted_text = pytesseract.image_to_string(img).strip()
491
-
492
- log_op(f"Page {page_num} Done. Extracted {len(extracted_text)} characters.")
493
-
494
- pages_content.append({
495
- "index": idx,
496
- "page_number": page_num,
497
- "text": extracted_text
498
- })
499
  else:
500
- # Standard Image
501
- log_op("Processing single image file...")
502
  img = Image.open(file_path)
503
  text = pytesseract.image_to_string(img).strip()
504
- log_op(f"Image processing complete. Extracted {len(text)} characters.")
505
-
506
- pages_content.append({
507
- "index": 0,
508
- "page_number": 1,
509
- "text": text
510
- })
511
 
512
- duration = (time.perf_counter() - start) * 1000
513
- log_op(f"OCR CPU Task finished in {duration:.2f}ms")
514
-
515
- return {
516
- "total_pages": len(pages_content),
517
- "pages_content": pages_content
518
- }
519
 
520
  except Exception as e:
521
- log_op(f"OCR Extraction Logic Error: {str(e)}")
522
- raise ValueError(f"Failed to process document: {str(e)}")
523
 
524
  # ==========================================
525
- # 5. FASTAPI APP INIT
526
  # ==========================================
527
-
528
  app = FastAPI(title=Config.APP_NAME)
529
 
530
  app.add_middleware(
531
  CORSMiddleware,
532
- allow_origins=Config.ALLOWED_ORIGINS,
533
- allow_credentials=True,
534
- allow_methods=["GET", "POST"],
535
- allow_headers=["Authorization", "Content-Type", "X-Request-ID"],
536
  )
537
 
538
  @app.middleware("http")
539
  async def request_context_middleware(request: Request, call_next):
 
540
  req_id = str(uuid.uuid4())
 
 
541
  request.state.request_id = req_id
542
-
543
- old_factory = logging.getLogRecordFactory()
544
- def record_factory(*args, **kwargs):
545
- record = old_factory(*args, **kwargs)
546
- record.request_id = req_id
547
- return record
548
- logging.setLogRecordFactory(record_factory)
549
 
550
  start_time = time.perf_counter()
551
- logger.info(f"Incoming Request: {request.method} {request.url.path}")
552
 
553
  try:
554
  response = await call_next(request)
555
- process_time = (time.perf_counter() - start_time) * 1000
556
  response.headers["X-Request-ID"] = req_id
557
- response.headers["X-Process-Time"] = f"{process_time:.2f}ms"
558
  return response
559
  except Exception as e:
560
- logger.exception("Unhandled Exception in Middleware")
561
- return JSONResponse(
562
- status_code=500,
563
- content={"status": "error", "message": "Internal Server Error", "request_id": req_id}
564
- )
565
 
566
  # ==========================================
567
- # 6. ENDPOINTS
568
  # ==========================================
569
 
570
- @app.get("/", response_model=BaseResponse)
571
  async def root(request: Request):
572
  return {
573
  "request_id": request.state.request_id,
574
  "process_time_ms": 0,
575
  "status": StatusEnum.SUCCESS,
576
- "message": "OCR API is running."
577
  }
578
 
579
  @app.post("/api/v1/get_data", response_model=APIResponse)
@@ -583,28 +205,21 @@ async def extract_data(
583
  token: str = Depends(SecurityService.validate_token)
584
  ):
585
  start_ts = time.perf_counter()
586
- tmp_file_path = None
587
  req_id = request.state.request_id
588
 
589
  try:
590
- logger.info(f"Received Upload: {file.filename} ({file.content_type})")
591
-
592
- # 1. Validate
593
  FileValidator.validate(file)
 
594
 
595
- # 2. Save (Disk IO)
596
- tmp_file_path = FileValidator.check_size_and_save(file)
597
- logger.info(f"File successfully saved to disk at: {tmp_file_path}")
598
-
599
- # 3. Process (CPU Bound) - Run in ThreadPool
600
  result = await run_in_threadpool(
601
  OCRProcessor.process_file,
602
- tmp_file_path,
603
- file.content_type,
604
- req_id
605
  )
606
 
607
- # 4. Return Response (File is NOT deleted)
608
  return {
609
  "request_id": req_id,
610
  "process_time_ms": (time.perf_counter() - start_ts) * 1000,
@@ -613,7 +228,7 @@ async def extract_data(
613
  "data": {
614
  "filename": file.filename,
615
  "content_type": file.content_type,
616
- "saved_file_path": tmp_file_path, # Returning the full path
617
  "total_pages": result["total_pages"],
618
  "pages_content": result["pages_content"]
619
  }
@@ -621,7 +236,6 @@ async def extract_data(
621
 
622
  except Exception as e:
623
  logger.error(f"Request failed: {str(e)}")
624
- # Even on error, we might want to keep the file for debugging if it was saved
625
  status_code = getattr(e, "status_code", 500)
626
  return JSONResponse(
627
  status_code=status_code,
@@ -633,7 +247,5 @@ async def extract_data(
633
  }
634
  )
635
  finally:
636
- # User requested NOT to remove the temp file.
637
- # Logging that file persists for clarity.
638
- if tmp_file_path:
639
- logger.info(f"Request complete. File preserved at: {tmp_file_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
2
  import uuid
3
  import time
4
  import logging
 
7
  from typing import Optional, List
8
  from enum import Enum
9
  from pathlib import Path
10
+ from contextvars import ContextVar
11
 
12
  # Third-party imports
13
  import uvicorn
 
26
  from pdf2image import convert_from_path
27
 
28
  # ==========================================
29
+ # 1. CONFIGURATION & LOGGING SETUP
30
  # ==========================================
31
  load_dotenv()
32
 
33
+ # ContextVar for thread-safe Request ID tracking
34
+ request_id_ctx: ContextVar[str] = ContextVar("request_id", default="system")
35
+
36
  class Config:
37
  APP_NAME = os.getenv("APP_NAME", "OCR API")
38
  API_TOKEN = os.getenv("API_BEARER_TOKEN")
39
  MAX_SIZE = int(os.getenv("MAX_FILE_SIZE", 52428800)) # 50MB
40
+ ALLOWED_ORIGINS = [o.strip() for o in os.getenv("ALLOWED_ORIGINS", "").split(",") if o.strip()]
 
 
41
  ALLOWED_TYPES = ["image/jpeg", "image/png", "image/bmp", "image/webp", "application/pdf"]
42
 
 
 
 
43
  class RequestIdFilter(logging.Filter):
44
  def filter(self, record):
45
+ # Automatically pull request_id from the context variable
46
+ record.request_id = request_id_ctx.get()
47
  return True
48
 
49
  logging.basicConfig(
50
  level=logging.INFO,
51
  format='%(asctime)s | %(levelname)s | ReqID:%(request_id)s | %(message)s',
52
+ datefmt='%Y-%m-%d %H:%M:%S',
53
+ force=True # Ensures our config is applied
54
  )
55
  logger = logging.getLogger("ocr_api")
56
  logger.addFilter(RequestIdFilter())
57
 
58
  # ==========================================
59
+ # 2. MODELS
60
  # ==========================================
61
  class StatusEnum(str, Enum):
62
  SUCCESS = "success"
 
76
  class OCRResult(BaseModel):
77
  filename: str
78
  content_type: str
79
+ saved_file_path: str
80
  total_pages: int
81
  pages_content: List[PageResult]
82
 
 
85
  error_message: Optional[str] = None
86
 
87
  # ==========================================
88
+ # 3. SERVICES
89
  # ==========================================
90
 
91
  class SecurityService:
 
94
  @staticmethod
95
  async def validate_token(credentials: HTTPAuthorizationCredentials = Depends(security_scheme)):
96
  if credentials.credentials != Config.API_TOKEN:
97
+ logger.warning("Auth Failed: Invalid Token")
98
+ raise HTTPException(status_code=401, detail="Invalid Bearer Token")
 
 
 
 
99
  return credentials.credentials
100
 
101
  class FileValidator:
102
  @staticmethod
103
  def validate(file: UploadFile):
104
  if file.content_type not in Config.ALLOWED_TYPES:
105
+ raise HTTPException(400, f"Invalid file type: {file.content_type}")
 
 
 
106
 
107
  @staticmethod
108
  def check_size_and_save(file: UploadFile) -> str:
109
+ suffix = Path(file.filename).suffix
110
+ with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as buffer:
111
+ shutil.copyfileobj(file.file, buffer)
112
+ tmp_path = os.path.abspath(buffer.name)
113
+
114
+ if os.path.getsize(tmp_path) > Config.MAX_SIZE:
115
+ os.remove(tmp_path)
116
+ raise HTTPException(413, "File too large")
117
+ return tmp_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
  class OCRProcessor:
120
  @classmethod
121
+ def process_file(cls, file_path: str, content_type: str) -> dict:
122
+ """Note: No longer passing request_id; logger picks it up from contextvars automatically."""
123
  start = time.perf_counter()
124
  pages_content = []
125
 
 
 
 
 
126
  try:
127
+ logger.info(f"Processing File: {file_path}")
128
 
129
  if content_type == "application/pdf":
130
+ logger.info("Converting PDF to Images...")
 
 
131
  images = convert_from_path(file_path)
132
+ total = len(images)
133
 
 
 
134
  for idx, img in enumerate(images):
135
  page_num = idx + 1
136
+ logger.info(f"Scanning Page {page_num}/{total}")
137
+ text = pytesseract.image_to_string(img).strip()
138
+ pages_content.append({"index": idx, "page_number": page_num, "text": text})
 
 
 
 
 
 
 
 
139
  else:
140
+ logger.info("Scanning Single Image...")
 
141
  img = Image.open(file_path)
142
  text = pytesseract.image_to_string(img).strip()
143
+ pages_content.append({"index": 0, "page_number": 1, "text": text})
 
 
 
 
 
 
144
 
145
+ logger.info(f"OCR Complete in {(time.perf_counter()-start)*1000:.2f}ms")
146
+ return {"total_pages": len(pages_content), "pages_content": pages_content}
 
 
 
 
 
147
 
148
  except Exception as e:
149
+ logger.error(f"OCR Logic Failure: {str(e)}")
150
+ raise ValueError(str(e))
151
 
152
  # ==========================================
153
+ # 4. APP & MIDDLEWARE
154
  # ==========================================
 
155
  app = FastAPI(title=Config.APP_NAME)
156
 
157
  app.add_middleware(
158
  CORSMiddleware,
159
+ allow_origins=Config.ALLOWED_ORIGINS if Config.ALLOWED_ORIGINS else ["*"],
160
+ allow_methods=["*"],
161
+ allow_headers=["*"],
 
162
  )
163
 
164
  @app.middleware("http")
165
  async def request_context_middleware(request: Request, call_next):
166
+ # 1. Generate ID
167
  req_id = str(uuid.uuid4())
168
+ # 2. Set Context (Crucial for thread logging)
169
+ token = request_id_ctx.set(req_id)
170
  request.state.request_id = req_id
 
 
 
 
 
 
 
171
 
172
  start_time = time.perf_counter()
173
+ logger.info(f"Start: {request.method} {request.url.path}")
174
 
175
  try:
176
  response = await call_next(request)
177
+ duration = (time.perf_counter() - start_time) * 1000
178
  response.headers["X-Request-ID"] = req_id
179
+ logger.info(f"Finish: {response.status_code} in {duration:.2f}ms")
180
  return response
181
  except Exception as e:
182
+ logger.exception("Middleware caught crash")
183
+ return JSONResponse(status_code=500, content={"status":"error","message":"Internal Server Error","request_id":req_id})
184
+ finally:
185
+ # 3. Clean up Context
186
+ request_id_ctx.reset(token)
187
 
188
  # ==========================================
189
+ # 5. ENDPOINTS
190
  # ==========================================
191
 
192
+ @app.get("/")
193
  async def root(request: Request):
194
  return {
195
  "request_id": request.state.request_id,
196
  "process_time_ms": 0,
197
  "status": StatusEnum.SUCCESS,
198
+ "message": "OCR API Active"
199
  }
200
 
201
  @app.post("/api/v1/get_data", response_model=APIResponse)
 
205
  token: str = Depends(SecurityService.validate_token)
206
  ):
207
  start_ts = time.perf_counter()
208
+ tmp_path = None
209
  req_id = request.state.request_id
210
 
211
  try:
 
 
 
212
  FileValidator.validate(file)
213
+ tmp_path = FileValidator.check_size_and_save(file)
214
 
215
+ # CPU heavy task run in thread pool.
216
+ # ContextVars are automatically copied to the thread.
 
 
 
217
  result = await run_in_threadpool(
218
  OCRProcessor.process_file,
219
+ tmp_path,
220
+ file.content_type
 
221
  )
222
 
 
223
  return {
224
  "request_id": req_id,
225
  "process_time_ms": (time.perf_counter() - start_ts) * 1000,
 
228
  "data": {
229
  "filename": file.filename,
230
  "content_type": file.content_type,
231
+ "saved_file_path": tmp_path,
232
  "total_pages": result["total_pages"],
233
  "pages_content": result["pages_content"]
234
  }
 
236
 
237
  except Exception as e:
238
  logger.error(f"Request failed: {str(e)}")
 
239
  status_code = getattr(e, "status_code", 500)
240
  return JSONResponse(
241
  status_code=status_code,
 
247
  }
248
  )
249
  finally:
250
+ if tmp_path:
251
+ logger.info(f"File preserved at: {tmp_path}")