| | import gradio as gr |
| | import torch |
| | import cv2 |
| | import os |
| | import numpy as np |
| | from PIL import Image, ImageEnhance |
| | from ultralytics import YOLO |
| | from decord import VideoReader, cpu |
| | from torchvision.transforms.functional import InterpolationMode |
| | from transformers import AutoModel, AutoTokenizer |
| | from backPrompt import main as main_b |
| | from frontPrompt import main as main_f |
| |
|
# Weights file for the YOLO document detector; the model is loaded once at
# import time and shared by every request.
model_path = "best.pt"
modelY = YOLO(model_path)
| |
|
# Hugging Face model id of the vision-language model that reads the card fields.
path = "OpenGVLab/InternVL2_5-4B"

# Weights are loaded in bfloat16 when CUDA is visible and float32 otherwise,
# then switched to inference mode.
# NOTE(review): the model is not explicitly moved to a device here — confirm
# that placement is handled where the model is used.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=(
        torch.float32 if not torch.cuda.is_available() else torch.bfloat16
    ),
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
).eval()

# Tokenizer paired with the model above (slow tokenizer requested explicitly).
tokenizer = AutoTokenizer.from_pretrained(
    path,
    trust_remote_code=True,
    use_fast=False,
)
| |
|
def preprocessing(image, *, target_width=800):
    """Enhance the input image and resize it to a fixed width.

    Three enhancement filters are applied in order: sharpness x2.0,
    contrast x1.5 and brightness x0.8 (i.e. a brightness reduction). The
    result is then resized to ``target_width`` pixels wide, keeping the
    aspect ratio.

    Args:
        image: Anything ``np.array`` can turn into an array that
            ``Image.fromarray`` accepts (e.g. a PIL image or an ndarray).
        target_width: Output width in pixels. Defaults to 800.

    Returns:
        The enhanced, resized PIL image.
    """
    image = Image.fromarray(np.array(image))

    image = ImageEnhance.Sharpness(image).enhance(2.0)
    image = ImageEnhance.Contrast(image).enhance(1.5)
    image = ImageEnhance.Brightness(image).enhance(0.8)

    aspect_ratio = image.height / image.width
    # Truncation can yield 0 for extremely wide inputs; keep at least one row
    # so the resize target is always a valid size.
    height = max(1, int(target_width * aspect_ratio))
    return image.resize((target_width, height))
| |
|
| |
|
def imageRotation(image):
    """Return the image in landscape orientation.

    An image that is taller than it is wide is rotated via
    ``image.rotate(90, expand=True)``; any other image is returned unchanged.
    """
    is_portrait = image.height > image.width
    return image.rotate(90, expand=True) if is_portrait else image
| |
|
| |
|
def detect_document(image):
    """Detect the front and back of the document using YOLO.

    Runs the module-level ``modelY`` detector with a confidence threshold of
    0.85 and draws every detection (green box plus label) onto a copy of the
    image.

    Args:
        image: Input image; converted to an array with ``np.array``.

    Returns:
        A tuple ``(annotated_image, labels, bounding_boxes)`` where
        ``annotated_image`` is a PIL image with the detections drawn,
        ``labels`` is a list of ``"<class> <conf>"`` strings (plus a
        ``"Missing: ..."`` entry when "front" and/or "back" was not
        detected), and ``bounding_boxes`` is a list of
        ``(x1, y1, x2, y2, class_name, conf)`` tuples.
    """
    image = np.array(image)
    results = modelY(image, conf=0.85)

    detected_classes = set()
    labels = []
    bounding_boxes = []

    for result in results:
        for box in result.boxes:
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            # Store a plain float so the returned tuples do not carry
            # tensor elements around.
            conf = float(box.conf[0])
            class_name = modelY.names[int(box.cls[0])]

            detected_classes.add(class_name)
            label = f"{class_name} {conf:.2f}"
            labels.append(label)
            bounding_boxes.append((x1, y1, x2, y2, class_name, conf))

            cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
            # Keep the label on the canvas when the box touches the top edge.
            text_origin = (x1, max(y1 - 10, 12))
            cv2.putText(
                image,
                label,
                text_origin,
                cv2.FONT_HERSHEY_SIMPLEX,
                0.5,
                (0, 255, 0),
                2,
            )

    possible_classes = {"front", "back"}
    missing_classes = possible_classes - detected_classes
    if missing_classes:
        # Sort so the message is deterministic (set order is not).
        labels.append(f"Missing: {', '.join(sorted(missing_classes))}")

    return Image.fromarray(image), labels, bounding_boxes
| |
|
| |
|
def crop_image(image, bounding_boxes):
    """Crop the detected bounding boxes out of the image.

    When a class is detected more than once, the crop of the detection with
    the highest confidence is kept (the first one wins on ties). Box
    coordinates are clamped to the image bounds and boxes with no area are
    skipped.

    Args:
        image: Source image; converted to an array with ``np.array``.
        bounding_boxes: Iterable of ``(x1, y1, x2, y2, class_name, conf)``
            tuples.

    Returns:
        A dict mapping each class name to its cropped PIL image.
    """
    image = np.array(image)
    img_h, img_w = image.shape[:2]

    cropped_images = {}
    best_conf = {}

    for x1, y1, x2, y2, class_name, conf in bounding_boxes:
        # Negative indices would wrap around in NumPy slicing, so clamp the
        # box to the image before cropping.
        left, top = max(0, x1), max(0, y1)
        right, bottom = min(img_w, x2), min(img_h, y2)
        if right <= left or bottom <= top:
            continue

        score = float(conf)
        if class_name in best_conf and score <= best_conf[class_name]:
            continue

        best_conf[class_name] = score
        cropped_images[class_name] = Image.fromarray(
            image[top:bottom, left:right]
        )

    return cropped_images
| |
|
| |
|
def vision_ai_api(image, doc_type):
    """Run the field-extraction prompt for one side of the document.

    Args:
        image: Cropped image of the card side to read.
        doc_type: ``"front"`` or ``"back"``; selects ``main_f`` or
            ``main_b`` respectively.

    Returns:
        Whatever the selected prompt function returns when called with the
        image and the module-level ``model`` and ``tokenizer``.

    Raises:
        ValueError: If ``doc_type`` is neither ``"front"`` nor ``"back"``.
    """
    if doc_type == "front":
        return main_f(image, model, tokenizer)
    if doc_type == "back":
        return main_b(image, model, tokenizer)
    # Previously an unknown type fell through to an unbound local variable.
    raise ValueError(
        f"Unsupported doc_type {doc_type!r}; expected 'front' or 'back'"
    )
| |
|
| |
|
def predict(image):
    """Pipeline: Preprocess -> Rotate -> Detect -> Crop -> Vision AI API.

    Args:
        image: The uploaded image.

    Returns:
        A tuple ``(single_image, labels, api_results)``: the front crop if
        present, else the back crop, else the annotated detection image; the
        detection labels; and a dict with the ``"front"`` and ``"back"``
        extraction results (``None`` for a side that was not detected).

    Raises:
        ValueError: If no image was provided.
    """
    # Fail with a clear message instead of a cryptic error deep inside the
    # image conversion when the function is invoked without an image.
    if image is None:
        raise ValueError("No image provided. Please upload an image.")

    processed_image = preprocessing(image)
    rotated_image = imageRotation(processed_image)
    detected_image, labels, bounding_boxes = detect_document(rotated_image)

    # Crop from the un-annotated image so drawn boxes/labels are not read
    # as part of the document.
    cropped_images = crop_image(rotated_image, bounding_boxes)

    sides = ("front", "back")
    api_results = {
        side: (
            vision_ai_api(cropped_images[side], side)
            if side in cropped_images
            else None
        )
        for side in sides
    }

    # Prefer the front crop, then the back crop, then the annotated image.
    single_image = next(
        (cropped_images[side] for side in sides if side in cropped_images),
        detected_image,
    )
    return single_image, labels, api_results
| |
|
| |
|
# Gradio UI: one image in; the selected crop (or annotated image), the
# detection labels and the extracted fields out.
iface = gr.Interface(
    title="License Field Detection (Front & Back Card)",
    fn=predict,
    inputs="image",
    outputs=["image", "text", "json"],
)

# Start the web server as soon as the script is executed.
iface.launch()
| |
|