File size: 7,346 Bytes
3f42a6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
from dotenv import load_dotenv
import cv2, os, ast, io, base64
from PIL import Image

def call_VL(model, processor, device, messages):
    """
    Processes vision-language (VL) data using a model and processor.

    Args:
        model: The vision-language model used for generating outputs.
        processor: Preprocessing utility to format inputs and decode outputs.
        device: The computational device (e.g., 'cuda' or 'cpu') where processing occurs.
        messages: A list of messages containing text and visual information.

    Returns:
        dict: A dictionary representation of the model's processed output.

    Raises:
        ValueError, SyntaxError: If the model reply is not a valid Python literal.
    """
    import re
    from qwen_vl_utils import process_vision_info

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(device)
    generated_ids = model.generate(**inputs, max_new_tokens=512)
    # Drop the prompt tokens so only the newly generated continuation is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    # BUG FIX: the previous `.strip('```python\n```')` stripped any of the
    # CHARACTERS ` p y t h o n \n from both ends (str.strip takes a char set,
    # not a prefix/suffix), which could eat legitimate payload characters and
    # did not reliably remove a fenced code block. Extract the fenced body
    # explicitly instead; fall back to the raw text when no fence is present.
    raw = output_text[0].strip()
    fence = re.search(r"```(?:python)?\s*(.*?)\s*```", raw, re.DOTALL)
    cleaned_output = fence.group(1) if fence else raw
    # Convert the cleaned string into a Python object (expected: dict or list).
    return ast.literal_eval(cleaned_output)

def load_VL(model_name = "Qwen/Qwen2-VL-7B-Instruct"):
    """
    Load a Qwen2-VL vision-language model together with its processor.

    Args:
        model_name (str): Hugging Face model id or local checkpoint path.
                          Defaults to "Qwen/Qwen2-VL-7B-Instruct".

    Returns:
        tuple: (model, processor) ready for inference.
    """
    from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

    # device_map="auto" lets transformers place the weights across available
    # devices; torch_dtype="auto" picks the checkpoint's native precision.
    vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
    )
    vl_processor = AutoProcessor.from_pretrained(model_name)
    return vl_model, vl_processor

def llm_dim(llm, img, device, scale = 1):
    """
    Run the VL model as an OCR engine over a mechanical drawing to extract
    dimension strings (measurements with tolerances, angles).

    Args:
        llm (tuple): A tuple containing the loaded VL model and its processor.
        img (numpy.ndarray): The input image from which dimensions need to be extracted.
        device (torch.device): The device (e.g., CPU or GPU) to run the model on.
        scale (float, optional): Scaling factor for resizing the image. Defaults to 1.

    Returns:
        list: A list of strings representing the extracted dimensions from the image.
    """
    # Resize first (INTER_AREA suits downscaling), then hand PIL a copy the
    # chat template can embed.
    target_size = (int(img.shape[1] * scale), int(img.shape[0] * scale))
    pil_img = Image.fromarray(cv2.resize(img, target_size, interpolation=cv2.INTER_AREA))

    system_text = '''You are a specialized OCR system capable of reading mechanical drawings. You read:
                        Measurements, usually scattered and oriented text in the image and with arrows in the surroundings. If tolerances are present, read them as "nominal" "upper" "lower". e.g: "10 +0.1 0"
                        Angles, usually oriented text with arrows in the surroundings
                        Do not include surface finishes'''
    user_text = "Based on the image, return ONLY A PYTHON LIST OF STRINGS extracting dimensions"

    messages = [
        {"role": "system", "content": [{"type": "text", "text": system_text}]},
        {"role": "user", "content": [{"type": "image", "image": pil_img},
                                     {"type": "text", "text": user_text}]},
    ]
    output_text = call_VL(model=llm[0], processor=llm[1], device=device, messages=messages)
    print(output_text)
    return output_text

def llm_table(tables, llm, img, device, query):
    """
    Extracts specific information from tables in a mechanical drawing image using a vision-language (VL) model.

    Args:
        tables (list): A list of table bounding boxes to process.
        llm (tuple): A tuple containing the loaded VL model and its processor.
        img (numpy.ndarray): The input image containing the tables.
        device (torch.device): The device (e.g., CPU or GPU) to run the model on.
        query (list): A list of strings specifying the information to extract.

    Returns:
        dict: A Python dictionary containing the extracted information based on the query.
    """
    # NOTE(review): tab_img is overwritten on every iteration, so only the LAST
    # bounding box in tables[0] is actually cropped and sent to the model --
    # confirm whether all tables should be processed. Also raises NameError on
    # the next line if tables[0] is empty.
    for b in tables[0]:
        # Crop the table region; boxes appear to expose x/y/w/h attributes.
        tab_img = img[b.y : b.y + b.h, b.x : b.x + b.w][:]
    tab_img = Image.fromarray(tab_img)

    # Fold the requested fields into a single comma-separated prompt string.
    query_string = ', '.join(query)
    messages = [
        {"role": "user",
            "content": [{"type": "image","image": tab_img,},
                        {"type": "text", "text": f"Based on the image, return only a python dictionary extracting this information: {query_string}"},],
        }]
    
    # call_VL parses the model reply with ast.literal_eval into a dict.
    llm_dict = call_VL(model=llm[0], processor=llm[1], device = device, messages = messages)
    return llm_dict

def convert_img(img):
    """
    Encode a numpy image array as a base64 JPEG string.

    Args:
        img (numpy.ndarray): Image array convertible by PIL.Image.fromarray.

    Returns:
        str: Base64-encoded JPEG bytes, decoded as UTF-8 text.
    """
    buffer = io.BytesIO()
    # Serialize through an in-memory buffer; no temp file touches disk.
    Image.fromarray(img).save(buffer, format='JPEG')
    return base64.b64encode(buffer.getvalue()).decode('utf-8')

def gpt4_dim(img):
    """
    This function uses GPT-4 to extract dimensions from a mechanical drawing image using OCR.

    Args:
        img (numpy.ndarray): The input image of a mechanical drawing.

    Returns:
        list: A Python list of strings containing extracted dimensions from the image.

    Raises:
        ValueError, SyntaxError: If the model reply is not a valid Python literal.
    """
    import re
    from openai import OpenAI

    # API key comes from .env / environment; OPENAI_API_KEY must be set.
    load_dotenv()
    API_KEY = os.getenv("OPENAI_API_KEY")
    client = OpenAI(api_key=API_KEY)
    img_ = convert_img(img)

    messages = [
        {"role": "system",
            "content": [{"type": "text", "text": '''You are a specialized OCR system capable of reading mechanical drawings. You read:
                        Measurements, usually scattered and oriented text in the image and with arrows in the surroundings. If tolerances are present, read them as "nominal" "upper" "lower". e.g: "10 +0.1 0"
                        Angles, usually oriented text with arrows in the surroundings
                        Feature Control Frames, usually in boxes, return either the symbol or its description, then the rest of the text'''},],
        },
        {"role": "user",
            "content": [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_}", "detail": "high"}},
                        {"type": "text", "text": "Based on the image, return ONLY A PYTHON LIST OF STRINGS extracting dimensions"},],
        }]

    response = client.chat.completions.create(model="gpt-4o", messages=messages, max_tokens=3000)
    assistant_response = response.choices[0].message.content
    # BUG FIX: the previous `.strip('```python\n```')` stripped any of the
    # CHARACTERS ` p y t h o n \n from both ends (str.strip takes a char set,
    # not a prefix/suffix), which could eat legitimate payload characters and
    # did not reliably remove a fenced code block. Extract the fenced body
    # explicitly instead; fall back to the raw text when no fence is present.
    raw = assistant_response.strip()
    fence = re.search(r"```(?:python)?\s*(.*?)\s*```", raw, re.DOTALL)
    cleaned_output = fence.group(1) if fence else raw
    # Convert the cleaned string into a Python object (expected: list of str).
    return ast.literal_eval(cleaned_output)

def ask_gpt(messages):
    """
    Send a chat-completion request to GPT-4o and return the assistant's reply.

    Args:
        messages (list): OpenAI chat-format message dicts.

    Returns:
        str: Content of the assistant's response message.
    """
    from openai import OpenAI

    # API key comes from .env / environment; OPENAI_API_KEY must be set.
    load_dotenv()
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    completion = client.chat.completions.create(
        model="gpt-4o", messages=messages, max_tokens=3000)
    return completion.choices[0].message.content