import streamlit as st
from PIL import Image, ImageDraw, ImageFont
import io
from io import BytesIO
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from rembg import remove
import mediapipe as mp
import torch
from transformers import AutoProcessor, AutoModelForCausalLM
from transformers.dynamic_module_utils import get_imports
from unittest.mock import patch
from scipy.spatial import distance as dist

st.set_page_config(layout="wide", page_title="Ring Size Measurement")
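# Lookup table mapping inner ring diameter (in millimetres) to ring size.
# The values appear to follow the US sizing chart in half-size steps
# (e.g. 16.45 mm -> size 6); get_ring_size() below snaps a measured diameter
# to the closest key in this table.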
ring_size_dict = {
    14.0: 3,
    14.4: 3.5,
    14.8: 4,
    15.2: 4.5,
    15.6: 5,
    16.0: 5.5,
    16.45: 6,
    16.9: 6.5,
    17.3: 7,
    17.7: 7.5,
    18.2: 8,
    18.6: 8.5,
    19.0: 9,
    19.4: 9.5,
    19.8: 10,
    20.2: 10.5,
    20.6: 11,
    21.0: 11.5,
    21.4: 12,
    21.8: 12.5,
    22.2: 13,
    22.6: 13.5
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

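# Florence-2's remote modeling code declares flash_attn as an import even when
# it is not actually needed; stripping it here lets the model load on machines
# without flash-attn installed (the "sdpa" attention implementation is used
# instead when the model is created in load_model).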
def fixed_get_imports(filename: str | os.PathLike) -> list[str]:
    if not str(filename).endswith("modeling_florence2.py"):
        return get_imports(filename)
    imports = get_imports(filename)
    if "flash_attn" in imports:
        imports.remove("flash_attn")
    return imports

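# Load the Florence-2 processor and model. The model's custom imports are
# patched via fixed_get_imports, and on CPU the Linear layers are dynamically
# quantized to int8 to keep memory use and latency manageable.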
def load_model():
    model_id = "microsoft/Florence-2-base-ft"
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

    os.makedirs("temp", exist_ok=True)

    with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
        model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="sdpa", trust_remote_code=True)

    if device.type == "cpu":
        # Dynamic int8 quantization of the Linear layers keeps CPU inference lightweight;
        # dynamically quantized modules are CPU-only, so skip this step on CUDA.
        model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
    return model.to(device), processor

if 'model_loaded' not in st.session_state:
    st.session_state.model_loaded = False

if not st.session_state.model_loaded:
    with st.spinner('Loading model...'):
        st.session_state.model, st.session_state.processor = load_model()
        st.session_state.model_loaded = True
        st.write("Model loaded complete")

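# Estimate how many pixels correspond to one millimetre. Florence-2 is prompted
# with the "<CAPTION_TO_PHRASE_GROUNDING>" task and the text "ruler" to locate
# the ruler's bounding box; the box's longest side is divided by the assumed
# physical length of the ruler (160 mm, hard-coded in plot_bbox) to obtain the
# pixels-per-mm ratio.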
def calculate_pixel_per_metric(image):
    def generate_labels(model, processor, task_prompt, image, text_input=None):
        if text_input is None:
            prompt = task_prompt
        else:
            prompt = task_prompt + " " + text_input
        
        inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)

        generated_ids = model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            early_stopping=False,
            do_sample=False,
            num_beams=3,
        )

        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]

        output = processor.post_process_generation(
            generated_text,
            task=task_prompt,
            image_size=(image.width, image.height)
        )
        
        return output

    def plot_bbox(original_image, data):
        # Create a copy of the original image to draw on
        image_with_bboxes = original_image.copy()

        # Use Pillow to draw bounding boxes and labels
        draw = ImageDraw.Draw(image_with_bboxes)
        def calculate_bbox_dimensions(bbox):
            x1, y1, x2, y2 = bbox
            width = x2 - x1
            height = y2 - y1
            return width, height

        # Use a TrueType font if available; fall back to Pillow's bundled default font.
        try:
            font = ImageFont.truetype("arial.ttf", 28)
        except OSError:
            font = ImageFont.load_default()
        for bbox, label in zip(data['bboxes'], data['labels']):
            x1, y1, x2, y2 = bbox
            draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
            draw.text((x1, y1), label, fill="red", font=font)

        # Dimensions of the last detected box (expected to be the ruler)
        width, height = calculate_bbox_dimensions(bbox)
        print(f"Label: {label}, Width: {width}, Height: {height}")
        dimension_text = f"W: {width}, H: {height}"
        draw.text((x1, y1 + 20), dimension_text, fill="red", font=font)

        # Assumed real-world length of the detected ruler, in millimetres
        real_world_dimension_mm = 160
        largest_dimension = max(width, height)
        pixels_per_mm = largest_dimension / real_world_dimension_mm
        ratio_text = f"Pixels/mm: {pixels_per_mm:.2f}"
        draw.text((x1, y1 + 40), ratio_text, fill="red", font=font)

        return image_with_bboxes, pixels_per_mm, 1.0 / pixels_per_mm
    
    def detecting_ruler(model, processor, image, task_prompt, text_input=None):
        results = generate_labels(model, processor, task_prompt, image, text_input=text_input)
        image_with_bboxes, value_1, value_2 = plot_bbox(image, results['<CAPTION_TO_PHRASE_GROUNDING>'])
        return value_1, value_2, image_with_bboxes
    
    image_for_model = image.copy()

    image_for_model = cv2.cvtColor(image_for_model, cv2.COLOR_BGR2RGB)
    image_for_model = Image.fromarray(image_for_model)
    # if image_for_model.mode != 'RGB':
    #     image_for_model = image_for_model.convert('RGB')

    # Process the image
    text_input = "ruler"
    task_prompt = "<CAPTION_TO_PHRASE_GROUNDING>"
    pixel_per_metric, mm_per_pixel, marked_image_buf = detecting_ruler(st.session_state.model, st.session_state.processor, image_for_model, task_prompt, text_input)
            
    return pixel_per_metric, mm_per_pixel, marked_image_buf

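# Remove the background with rembg so only the hand remains; the result feeds
# the thresholding and contour steps in calculate_pip_width.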
def process_image(image):
    return remove(image)

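# Measure the ring-finger width in millimetres. MediaPipe Hands provides the PIP
# (landmark 14) and MCP (landmark 13) joints of the ring finger; lines
# perpendicular to the PIP-MCP segment, through the PIP point and through the
# PIP-MCP midpoint, are intersected with the hand contour, the pixel widths are
# converted with pixel_per_metric, and 1.5 mm is subtracted in calSize as an
# apparently empirical correction.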
def calculate_pip_width(image, original_img, pixel_per_metric):
    def calSize(xA, yA, xB, yB, color_circle, color_line, img):
        d = dist.euclidean((xA, yA), (xB, yB))
        cv2.circle(img, (int(xA), int(yA)), 5, color_circle, -1)
        cv2.circle(img, (int(xB), int(yB)), 5, color_circle, -1)
        cv2.line(img, (int(xA), int(yA)), (int(xB), int(yB)), color_line, 2)
        d_mm = d / pixel_per_metric
        d_mm = d_mm - 1.5
        cv2.putText(img, "{:.1f}".format(d_mm), (int(xA - 15), int(yA - 10)), cv2.FONT_HERSHEY_SIMPLEX, 0.65, (255, 255, 255), 2)
        print(d_mm)
        return d_mm
    
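    # Walk along the perpendicular line y = m1*x + b in both directions from the
    # given point until cv2.pointPolygonTest reports that the point has left the
    # hand contour; the two exit points are the finger edges on that line.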
    def process_point(point, cnt, m1, b):
        x1, x2 = point[0], point[0]
        y1 = m1 * x1 + b
        y2 = m1 * x2 + b

        result = 1.0
        while result > 0:
            result = cv2.pointPolygonTest(cnt, (x1, y1), False)
            x1 += 1
            y1 = m1 * x1 + b
        x1 -= 1

        result = 1.0
        while result > 0:
            result = cv2.pointPolygonTest(cnt, (x2, y2), False)
            x2 -= 1
            y2 = m1 * x2 + b
        x2 += 1

        return x1, y1, x2, y2
    
    og_img = original_img.copy()
    imgH, imgW, _ = image.shape
    imgcpy = image.copy()
    image_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary_image = cv2.threshold(image_gray, 1, 255, cv2.THRESH_BINARY)
    contours, _ = cv2.findContours(binary_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contour_image = np.zeros_like(image_gray)
    cv2.drawContours(contour_image, contours, -1, (255), thickness=cv2.FILLED)
    cv2.drawContours(imgcpy, contours, -1, (0, 255, 0), 2)
    # print("length : ",len(contours))
    
    marked_img = image.copy()
    # Fall back to 0 mm if no contour or hand landmarks are detected.
    d_mm_pip = d_mm_mid = 0.0

    if len(contours) > 0:
        cnt = max(contours, key=cv2.contourArea)
        frame2 = cv2.cvtColor(og_img, cv2.COLOR_BGR2RGB)
        handsLM = mp.solutions.hands.Hands(max_num_hands=1, min_detection_confidence=0.8, min_tracking_confidence=0.8)
        pr = handsLM.process(frame2)
        print(pr.multi_hand_landmarks)
        if pr.multi_hand_landmarks:
            for hand_landmarks in pr.multi_hand_landmarks:
                lmlist = []
                for id, landMark in enumerate(hand_landmarks.landmark):
                    xPos, yPos = int(landMark.x * imgW), int(landMark.y * imgH)
                    lmlist.append([id, xPos, yPos])

                if len(lmlist) != 0:
                    pip_joint = [lmlist[14][1], lmlist[14][2]]
                    mcp_joint = [lmlist[13][1], lmlist[13][2]]

                    midpoint_x = (pip_joint[0] + mcp_joint[0]) / 2
                    midpoint_y = (pip_joint[1] + mcp_joint[1]) / 2
                    midpoint = [midpoint_x, midpoint_y]

                    m2 = (pip_joint[1] - mcp_joint[1]) / (pip_joint[0] - mcp_joint[0])
                    m1 = -1 / m2
                    b = pip_joint[1] - m1 * pip_joint[0]

                    #pip_joint
                    x1_pip, y1_pip, x2_pip, y2_pip = process_point(pip_joint, cnt, m1, b)

                    m2 = (midpoint_y - mcp_joint[1]) / (midpoint_x - mcp_joint[0])
                    m1 = -1 / m2
                    b = midpoint_y - m1 * midpoint_x

                    #midpoint
                    x1_mid, y1_mid, x2_mid, y2_mid = process_point(midpoint, cnt, m1, b)

                    d_mm_pip = calSize(x1_pip, y1_pip, x2_pip, y2_pip, (255, 0, 0), (255, 0, 255), original_img)
                    d_mm_mid = calSize(x1_mid, y1_mid, x2_mid, y2_mid, (0, 255, 0), (0, 0, 255), original_img)

    largest_d_mm = max(d_mm_mid, d_mm_pip)
    return original_img, largest_d_mm, imgcpy, marked_img

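# Draw MediaPipe hand landmarks on the given image and highlight the ring
# finger's MCP (landmark 13) and PIP (landmark 14) joints, the two points used
# for the width measurement.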
def mark_hand_landmarks(image):

    mp_hands = mp.solutions.hands
    hands = mp_hands.Hands()
    mp_draw = mp.solutions.drawing_utils

    img = image
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    results = hands.process(img_rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_draw.draw_landmarks(img, hand_landmarks, mp_hands.HAND_CONNECTIONS)
            
            mcp = hand_landmarks.landmark[13]
            pip = hand_landmarks.landmark[14]
            
            img_height, img_width, _ = img.shape
            
            mcp_x, mcp_y = int(mcp.x * img_width), int(mcp.y * img_height)
            pip_x, pip_y = int(pip.x * img_width), int(pip.y * img_height)
            
            cv2.circle(img, (mcp_x, mcp_y), 10, (255, 0, 0), -1)
            cv2.circle(img, (pip_x, pip_y), 10, (255, 0, 0), -1)

    return img

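# Arrange the intermediate images on a single matplotlib grid and return the
# rendered figure as an in-memory PNG stream for st.image.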
def show_resized_image(images, titles, scale=0.5):
    num_images = len(images)
    
    fig, axes = plt.subplots(2, 3, figsize=(17, 13))  
    axes = axes.flatten()  

    for ax in axes[num_images:]:
        ax.axis('off')
    for ax, img, title in zip(axes, images, titles):
        resized_image = cv2.resize(img, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
        ax.imshow(cv2.cvtColor(resized_image, cv2.COLOR_BGR2RGB))
        ax.set_title(title)
        ax.axis('off')

    plt.tight_layout()
    img_stream = BytesIO()
    plt.savefig(img_stream, format='png')
    img_stream.seek(0)
    plt.close(fig)
    return img_stream

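# Snap a measured diameter (in mm) to the closest key in ring_size_dict.
# For example, a hypothetical 16.3 mm measurement is closest to the 16.45 mm
# key and therefore maps to size 6.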
def get_ring_size(mm_value):
    if mm_value in ring_size_dict:
        return ring_size_dict[mm_value]
    else:
        closest_mm = min(ring_size_dict.keys(), key=lambda x: abs(x - mm_value))
        return ring_size_dict[closest_mm]

st.write("## Determine Your Ring Size")
st.write(
    "📏 Upload an image of your hand to measure the finger width and determine your ring size. The measurement will be displayed along with a visual breakdown of the image processing flow."
)
st.sidebar.write("## Upload :gear:")
#~~
st.write("### Workflow Overview")
st.image("FlowChart.png", caption="Workflow Overview", use_column_width=True)

st.write("### Detailed Workflow")
st.write("1. **Florence-2 Model:** Florence-2 is an advanced vision foundation model that uses a prompt-based approach to handle a wide range of vision and vision-language tasks.We utilize this model to detect the scale within the image and mark a bounding box which we can use to find the approximate full measurement of scale.")
st.write("2. **Pixel Per Metric Ratio:** The Pixel Per Metric Ratio is used to convert pixel measurements into real-world units. By comparing the pixel length obtained from image analysis (i.e., Hough Circle) with the known real-world measurement of the reference object (coin), we get the ratio. This ratio then allows us to accurately scale and size estimation of objects within the image.")
st.write("3. **Background Removal:** Removing the background first ensures that only the relevant subject is highlighted. We start by converting the image to grayscale and applying thresholding to distinguish the subject from the background. Erosion and dilation then clean up the image, improving the detection of specific features like individual fingers.")
st.write("4. **Contour Detection:** We use Contour Detection to find the largest contour, which allows us to outline or draw a boundary around the subject (i.e., hand). This highlights the object's shape and edges, improving the precision of the subject.")
st.write("5. **Finding Hand Landmarks:** This involves using the MediaPipe library to identify key points on the hand, such as the PIP (Proximal Interphalangeal) and MCP (Metacarpophalangeal) joints of the ring finger. This enables precise tracking and analysis of finger positions and movements.")
st.write("6. **Determining Finger Width:** Here we use the slope formula `[y = mx + b]` with PIP and MCP points to measure the finger's width. We project outward perpendicularly from the PIP point towards the MCP point, then apply a point polygon test to accurately determine the pixel width of the finger.")
st.write("7. **Predicting Ring Size:** Predicting Ring Size involves calculating the finger’s diameter using the Pixel Per Metric Ratio and the largest width measurement at the PIP or MCP joint. This diameter is then used to predict the appropriate ring size.")
#~~

MAX_FILE_SIZE = 5 * 1024 * 1024  # 5MB

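# End-to-end pipeline for one upload: read the image, mark the hand landmarks,
# derive the pixels-per-mm ratio from the detected ruler, remove the background,
# measure the finger width around the PIP joint, and map that width to a ring size.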
def process_image_and_get_results(upload):
    image = Image.open(upload).convert("RGB")
    image_np = np.array(image)
    image_np = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
    original_img = image_np.copy()
    og_img1 = image_np.copy()
    og_img2 = image_np.copy()
    img_1 = image_np.copy()
    hand_lms = mark_hand_landmarks(img_1)

    pixel_per_metric, mm_per_pixel, image_with_coin_info = calculate_pixel_per_metric(image_np)
    processed_image = process_image(og_img1)
    image_with_pip_width, width_mm, contour_image, pip_mark_img = calculate_pip_width(processed_image, original_img, pixel_per_metric)
    # Convert the annotated PIL image back to a numpy array for display/resizing.
    image_with_coin_info = np.array(image_with_coin_info)
    if not isinstance(image_with_coin_info, (np.ndarray, cv2.UMat)):
        raise TypeError(f"Invalid image type: {type(image_with_coin_info)}. Expected numpy array or cv2.UMat.")
    ring_size = get_ring_size(width_mm)
    return {
        "processed_image": image_with_pip_width,
        "original_image": og_img2,
        "hand_lm_marked_image": hand_lms,
        "image_with_coin_info": image_with_coin_info,
        "contour_image": contour_image,
        "width_mm": width_mm,
        "ring_size": ring_size
    }

def show_how_it_works(processed_image):
    st.write("## How It Works")
    st.write("Here's a step-by-step breakdown of how your image is processed to determine your ring size:")
    st.image(processed_image, caption="Image Processing Flow", use_column_width=True)

col1, col2 = st.columns(2)
my_upload = st.sidebar.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])

if my_upload is not None:
    if my_upload.size > MAX_FILE_SIZE:
        st.error("The uploaded file is too large. Please upload an image smaller than 5MB.")
    else:
        st.write("## Image Processing Flow")
        results = process_image_and_get_results(my_upload)

        col1.write("Uploaded Image :camera:")
        col1.image(cv2.cvtColor(results["original_image"], cv2.COLOR_BGR2RGB), caption="Uploaded Image")

        col2.write("Processed Image :wrench:")
        col2.image(cv2.cvtColor(results["processed_image"], cv2.COLOR_BGR2RGB), caption="Processed Image with PIP Width")

        st.write(f"📏 The width of your finger is {results['width_mm']:.2f} mm, and the estimated ring size is {results['ring_size']:.1f}.")

        if st.button("How it Works"):
            st.write("## How It Works")
            st.write("Here's a step-by-step breakdown of how your image is processed to determine your ring size:")
            print("here")
            img_stream = show_resized_image(
                [results["original_image"], results["image_with_coin_info"], results["contour_image"], results["hand_lm_marked_image"], results["processed_image"]],
                ['Original Image', 'Image with Scale Info', 'Contour Boundary Image', 'Hand Landmarks', 'Ring Finger Width'],
                scale=0.5
            )
            st.image(img_stream, caption="Processing Flow", use_column_width=True)
else:
    st.info("Please upload an image to get started.")