"""Profile ONNX engine performance to find bottlenecks."""
import sys, os
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import time
import cv2
import numpy as np

from ocr.engine_onnx import OcrEngineOnnx

# Engine instance shared by every measurement in this script.
engine = OcrEngineOnnx()

# Small sample image. The path is relative to the current working directory,
# so the script has to be launched from the directory containing working_space/.
small_image_path = 'working_space/input/ocr_test (2).png'
img = cv2.imread(small_image_path)
if img is None:
    # cv2.imread reports failure by returning None rather than raising; without
    # this check the script would die inside cvtColor with an opaque assertion.
    raise FileNotFoundError(
        f'cv2.imread could not load {small_image_path!r} '
        f'(missing or undecodable; cwd is {os.getcwd()!r})'
    )
# OpenCV decodes to BGR channel order; convert to RGB before handing to the engine.
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Cold run: time the very first recognize_numpy call on its own. Per the
# original note, this call is expected to include model loading.
cold_start = time.perf_counter()
r = engine.recognize_numpy(img_rgb)
cold_ms = (time.perf_counter() - cold_start) * 1000
print(f'First call (cold): {cold_ms:.0f}ms')

# Stage-by-stage profiling begins here; report the input dimensions first.
height, width = img_rgb.shape[:2]
print(f'Image: {width}x{height}')

# Detection stage, timed in isolation through the engine's private _detect method.
# It yields the detected quads plus a scale value (only the quads are used later).
detect_start = time.perf_counter()
quads, scale = engine._detect(img_rgb)
detect_ms = (time.perf_counter() - detect_start) * 1000
print(f'Detection: {detect_ms:.1f}ms ({len(quads)} quads)')

# Per-quad timing of script identification and text recognition.
# Only the first three detected quads are sampled; quads whose crop comes
# back as None are skipped without printing anything.
# NOTE(review): the script id returned by _identify_script is not used;
# _recognize always receives the literal 2 — confirm this is intentional.
for idx, quad in enumerate(quads[:3]):
    patch = engine._crop_quad(img_rgb, quad)
    if patch is not None:
        script_start = time.perf_counter()
        sid = engine._identify_script(patch)
        script_end = time.perf_counter()
        text, conf, _ = engine._recognize(patch, 2)
        recognize_end = time.perf_counter()
        # Unlike the other reports in this script, these two figures are in seconds.
        script_s = script_end - script_start
        recognize_s = recognize_end - script_end
        print(f'  Quad {idx}: ScriptID={script_s:.3f}s, Recognize={recognize_s:.3f}s -> "{text}"')

# Two more full-pipeline calls on the same image after the first call has
# already run, each timed and reported separately in milliseconds.
for label in ('Second call (warm)', 'Third call (warm)'):
    warm_start = time.perf_counter()
    r = engine.recognize_numpy(img_rgb)
    warm_ms = (time.perf_counter() - warm_start) * 1000
    print(f'{label}: {warm_ms:.0f}ms')

# Report how many sessions the engine currently holds in its _sessions cache.
session_cache = engine._sessions
print(f'\nCached sessions: {len(session_cache)}')

# Inspect the first cached session, if there is one: execution providers,
# thread counts and graph optimization level from its session options.
first_session = next(iter(session_cache.values()), None) if session_cache else None
if first_session:
    print(f'Session providers: {first_session.get_providers()}')
    options = first_session.get_session_options()
    print(f'Inter-op threads: {options.inter_op_num_threads}')
    print(f'Intra-op threads: {options.intra_op_num_threads}')
    print(f'Optimization level: {options.graph_optimization_level}')

# Repeat the measurement on a second image (labelled "large" in the output).
print('\n--- Large image (test.png) ---')
# The path is relative to the current working directory.
large_image_path = 'working_space/input/test.png'
img2 = cv2.imread(large_image_path)
if img2 is None:
    # cv2.imread reports failure by returning None rather than raising; without
    # this check the script would die inside cvtColor with an opaque assertion.
    raise FileNotFoundError(
        f'cv2.imread could not load {large_image_path!r} '
        f'(missing or undecodable; cwd is {os.getcwd()!r})'
    )
# OpenCV decodes to BGR channel order; convert to RGB before handing to the engine.
img2_rgb = cv2.cvtColor(img2, cv2.COLOR_BGR2RGB)
h2, w2 = img2_rgb.shape[:2]
print(f'Image: {w2}x{h2}')

# Full pipeline on the second image; the result exposes the recognized lines.
t0 = time.perf_counter()
r2 = engine.recognize_numpy(img2_rgb)
t1 = time.perf_counter()
print(f'Total: {(t1-t0)*1000:.0f}ms ({len(r2.lines)} lines)')

# Detection stage alone on the same image, to compare against the total above.
t0 = time.perf_counter()
quads2, scale2 = engine._detect(img2_rgb)
t1 = time.perf_counter()
print(f'Detection only: {(t1-t0)*1000:.1f}ms ({len(quads2)} quads, scale={scale2:.2f})')