| import os |
| import uuid |
| import json |
| import ast |
| from pdf2image import convert_from_path |
| from PIL import Image, ImageDraw |
|
|
# Absolute path of the directory that contains this module; used as the
# anchor for default output locations so they do not depend on the CWD.
_MODULE_PATH = os.path.abspath(__file__)
BASE_DIR = os.path.dirname(_MODULE_PATH)
|
|
|
|
def load_calibration(config_path="highlight_calibration.json"):
    """Load highlight calibration values from a JSON file.

    Args:
        config_path: Path to the calibration JSON file. A relative path is
            resolved against the current working directory.

    Returns:
        The decoded JSON object (a ``dict``) when the file can be read and
        contains a JSON object. Otherwise the identity calibration
        ``{"x_offset": 0, "x_scale": 1.0, "y_offset": 0, "y_scale": 1.0}``.
        This function never raises for a missing, unreadable, or malformed
        calibration file.
    """
    defaults = {"x_offset": 0, "x_scale": 1.0, "y_offset": 0, "y_scale": 1.0}

    # EAFP: open directly instead of exists()+open() so there is no window
    # in which the file can disappear between the check and the read.
    try:
        with open(config_path, "r", encoding="utf-8") as f:
            calib = json.load(f)
    except FileNotFoundError:
        print("⚠️ No calibration file found. Using defaults.")
        return defaults
    except (OSError, ValueError) as exc:
        # OSError: unreadable path (directory, permissions, ...).
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"⚠️ Could not read calibration file {config_path!r}: {exc}. Using defaults.")
        return defaults

    # Callers use ``calib.get(...)``, so anything but a JSON object is unusable.
    if not isinstance(calib, dict):
        print(f"⚠️ Calibration file {config_path!r} does not contain a JSON object. Using defaults.")
        return defaults

    print(f"✅ Loaded calibration: {calib}")
    return calib
|
|
|
|
def _parse_bbox(raw_bbox):
    """Return ``raw_bbox`` as a tuple of four floats, or ``None`` if it is not a 4-item box.

    A string value is first decoded with ``ast.literal_eval``. May raise
    ``ValueError``, ``TypeError``, ``SyntaxError``, ``MemoryError`` or
    ``RecursionError`` when the content cannot be parsed or converted.
    """
    if isinstance(raw_bbox, str):
        raw_bbox = ast.literal_eval(raw_bbox)
    if not raw_bbox or not isinstance(raw_bbox, (list, tuple)) or len(raw_bbox) != 4:
        return None
    return tuple(float(v) for v in raw_bbox)


def render_highlighted_pages(pdf_path, hits, output_dir=None, dpi=150, pad=100):
    """Render the page of the first hit as an image with its bbox highlighted.

    The page is rasterised at ``dpi``, a semi-transparent red rectangle is
    drawn over each valid bounding box, and the result is cropped to the
    union of the boxes plus ``pad`` pixels on every side (clamped to the
    image). If no valid box is found on a page, the full page is saved.

    Bounding-box values are used directly as pixel coordinates of the
    rendered image after applying the calibration
    (``value * scale + offset`` per axis); no DPI conversion is applied here.

    Side effects: creates ``output_dir`` if needed and removes every existing
    file in it before rendering.

    Args:
        pdf_path: Path to the PDF file.
        hits: Sequence of dicts shaped like
            ``{"metadata": {"page": <1-based int>, "bbox": [x0, y0, x1, y1]}}``.
            ``bbox`` may also be the string form of such a list. Only the
            first hit is used; the rest are ignored.
        output_dir: Directory for output PNGs. Defaults to
            ``<BASE_DIR>/highlighted``.
        dpi: Rasterisation resolution passed to ``convert_from_path``.
        pad: Non-negative padding, in pixels, added around the highlighted
            area when cropping.

    Returns:
        List of paths of the PNG files written (possibly empty).
    """
    if output_dir is None:
        output_dir = os.path.join(BASE_DIR, "highlighted")
    os.makedirs(output_dir, exist_ok=True)

    calib = load_calibration()
    x_offset = calib.get("x_offset", 0)
    x_scale = calib.get("x_scale", 1.0)
    y_offset = calib.get("y_offset", 0)
    y_scale = calib.get("y_scale", 1.0)

    # Clear output from previous runs. Best-effort: a file that cannot be
    # removed is reported but does not abort rendering.
    for old_name in os.listdir(output_dir):
        old_path = os.path.join(output_dir, old_name)
        if os.path.isdir(old_path) and not os.path.islink(old_path):
            continue  # os.remove cannot delete directories; leave them alone
        try:
            os.remove(old_path)
        except OSError as exc:
            print(f"[WARN] Could not remove stale output {old_path}: {exc}")

    # Only the first hit is highlighted.
    hits = hits[:1] if hits else []

    # Group hit metadata by page, dropping hits without a usable page number.
    hits_by_page = {}
    for hit in hits:
        meta = hit.get("metadata") or {}
        page = meta.get("page")
        if not isinstance(page, int) or isinstance(page, bool) or page < 1:
            print(f"[WARN] Skipping hit with invalid page number: {page!r}")
            continue
        hits_by_page.setdefault(page, []).append(meta)

    result_paths = []

    for page_num in sorted(hits_by_page):
        # Rasterise only the page that is needed instead of the whole PDF.
        rendered = convert_from_path(
            pdf_path, dpi=dpi, first_page=page_num, last_page=page_num
        )
        if not rendered:
            print(f"[WARN] Page {page_num} is outside the PDF's page range; skipping.")
            continue

        img = rendered[0].convert("RGBA")
        w_img, h_img = img.size
        # Draw on a transparent overlay so the fill can be alpha-blended.
        overlay = Image.new("RGBA", img.size, (255, 255, 255, 0))
        draw = ImageDraw.Draw(overlay)
        page_bboxes = []

        for meta in hits_by_page[page_num]:
            bbox = meta.get("bbox")

            print(f"[DEBUG] page {page_num} raw bbox type: {type(bbox)} value: {bbox}")

            try:
                parsed = _parse_bbox(bbox)
                if parsed is None:
                    print(f"[WARN] Invalid bbox for page {page_num}: {bbox}")
                    continue

                x0, y0, x1, y1 = parsed
                x0 = x0 * x_scale + x_offset
                x1 = x1 * x_scale + x_offset
                y0 = y0 * y_scale + y_offset
                y1 = y1 * y_scale + y_offset
            except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
                print(f"[ERROR] Failed to parse bbox for page {page_num}: {bbox} -> {e}")
                continue

            # Normalise corner order and clamp to the image bounds.
            left, top = max(0, min(x0, x1)), max(0, min(y0, y1))
            right, bottom = min(w_img, max(x0, x1)), min(h_img, max(y0, y1))

            # Nothing left to draw once clamped (box lies outside the page).
            if right <= left or bottom <= top:
                continue

            page_bboxes.append((left, top, right, bottom))
            draw.rectangle(
                [left, top, right, bottom],
                outline=(255, 0, 0),
                width=4,
                fill=(255, 0, 0, 100),
            )

        highlighted = Image.alpha_composite(img, overlay)

        if page_bboxes:
            min_x = min(b[0] for b in page_bboxes)
            min_y = min(b[1] for b in page_bboxes)
            max_x = max(b[2] for b in page_bboxes)
            max_y = max(b[3] for b in page_bboxes)

            crop_box = (
                max(0, int(min_x - pad)),
                max(0, int(min_y - pad)),
                int(min(max_x + pad, w_img)),
                int(min(max_y + pad, h_img)),
            )
            cropped = highlighted.crop(crop_box)
        else:
            cropped = highlighted

        print(f"✅ Drew {len(page_bboxes)} boxes on page {page_num}")

        # A random suffix keeps file names unique across runs.
        out_path = os.path.join(
            output_dir, f"highlight_page{page_num}_{uuid.uuid4().hex}.png"
        )
        cropped.convert("RGB").save(out_path)
        result_paths.append(out_path)

        print(f"✅ Highlighted and cropped page {page_num}: {out_path}")

    return result_paths
|
|
|
|
| |
if __name__ == "__main__":
    # Manual smoke test: highlight a single text region on page 2 of the
    # sample PDF using the default output directory and DPI.
    sample_metadata = {"page": 2, "bbox": [87, 222, 592, 250], "type": "text"}
    hits = [{"metadata": sample_metadata}]
    render_highlighted_pages("samples/vdoc_rag_test.pdf", hits)
|
|