Spaces:
Runtime error
Runtime error
File size: 7,346 Bytes
3f42a6f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
from dotenv import load_dotenv
import cv2, os, ast, io, base64
from PIL import Image
def call_VL(model, processor, device, messages):
    """
    Runs one chat turn through a vision-language (VL) model and parses the reply.

    Args:
        model: The vision-language model; its generate() method produces the output tokens.
        processor: Builds the chat prompt, tensorizes the inputs and decodes the outputs.
        device: The computational device (e.g., 'cuda' or 'cpu') the inputs are moved to.
        messages: A list of chat messages containing text and visual information.

    Returns:
        The Python literal found in the model's reply, as parsed by
        ast.literal_eval (a list or a dict, depending on what the prompt asked for).

    Raises:
        ValueError, SyntaxError: If the reply does not contain a valid Python literal.
    """
    import re

    from qwen_vl_utils import process_vision_info

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(device)
    generated_ids = model.generate(**inputs, max_new_tokens=512)
    # Each generated sequence starts with its prompt tokens; slice them off so
    # that only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    reply = output_text[0]
    # Match the Markdown code fence explicitly. str.strip('```python\n```') treats
    # its argument as a set of characters, so it fails on e.g. a ```json tag,
    # prose before the fence, or whitespace after the closing fence.
    fence = re.search(r"```(?:[A-Za-z][\w+-]*)?\s*(.*?)\s*(?:```|\Z)", reply, re.DOTALL)
    payload = fence.group(1) if fence else reply
    # Convert the cleaned string into a Python object (list or dict)
    return ast.literal_eval(payload.strip())
def load_VL(model_name="Qwen/Qwen2-VL-7B-Instruct"):
    """
    Instantiates a Qwen2-VL model together with its matching processor.

    Args:
        model_name (str): Name or path of the pre-trained checkpoint, handed
            unchanged to both from_pretrained() calls.
            Defaults to "Qwen/Qwen2-VL-7B-Instruct".

    Returns:
        tuple: (model, processor), in that order.
    """
    # Imported lazily so that importing this module does not require transformers.
    from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

    vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
    )
    vl_processor = AutoProcessor.from_pretrained(model_name)
    return vl_model, vl_processor
def llm_dim(llm, img, device, scale=1):
    """
    Asks a vision-language (VL) model to read the dimensions off a mechanical drawing.

    Args:
        llm (tuple): (model, processor) pair; llm[0] and llm[1] are forwarded to call_VL.
        img (numpy.ndarray): The drawing image to read.
        device (torch.device): The device (e.g., CPU or GPU) to run the model on.
        scale (float, optional): Factor applied to both image sides before the
            model sees the image. Defaults to 1.

    Returns:
        The parsed model reply as returned by call_VL; the prompt asks for a
        Python list of strings.
    """
    rows, cols = img.shape[:2]
    # cv2.resize expects the target size as (width, height)
    target_size = (int(cols * scale), int(rows * scale))
    pil_img = Image.fromarray(cv2.resize(img, target_size, interpolation=cv2.INTER_AREA))

    system_prompt = (
        'You are a specialized OCR system capable of reading mechanical drawings. You read:\n'
        'Measurements, usually scattered and oriented text in the image and with arrows in the surroundings. If tolerances are present, read them as "nominal" "upper" "lower". e.g: "10 +0.1 0"\n'
        'Angles, usually oriented text with arrows in the surroundings\n'
        'Do not include surface finishes'
    )
    user_prompt = "Based on the image, return ONLY A PYTHON LIST OF STRINGS extracting dimensions"

    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_prompt}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": pil_img},
            {"type": "text", "text": user_prompt},
        ],
    }

    dimensions = call_VL(
        model=llm[0],
        processor=llm[1],
        device=device,
        messages=[system_turn, user_turn],
    )
    print(dimensions)
    return dimensions
def llm_table(tables, llm, img, device, query):
    """
    Queries a vision-language (VL) model for named fields in a table of a mechanical drawing.

    Args:
        tables (list): tables[0] is iterated; each item must expose x, y, w and h
            attributes describing a bounding box inside img.
        llm (tuple): (model, processor) pair; llm[0] and llm[1] are forwarded to call_VL.
        img (numpy.ndarray): The input image containing the tables.
        device (torch.device): The device (e.g., CPU or GPU) to run the model on.
        query (list): Strings naming the pieces of information to extract.

    Returns:
        The parsed model reply as returned by call_VL; the prompt asks for a
        Python dictionary.
    """
    # NOTE(review): every box is cropped, but only the crop of the LAST box is
    # sent to the model, and an empty tables[0] leaves tab_img unbound.
    # Confirm whether one query per table was intended.
    for box in tables[0]:
        top, left = box.y, box.x
        crop = img[top : top + box.h, left : left + box.w][:]
        tab_img = Image.fromarray(crop)

    query_string = ", ".join(query)
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": tab_img},
            {
                "type": "text",
                "text": f"Based on the image, return only a python dictionary extracting this information: {query_string}",
            },
        ],
    }
    return call_VL(model=llm[0], processor=llm[1], device=device, messages=[user_turn])
def convert_img(img):
    """
    Encodes an image array as a base64 JPEG string.

    Args:
        img (numpy.ndarray): Image array accepted by PIL.Image.fromarray.

    Returns:
        str: Base64 text of the JPEG-encoded image (without any data-URL prefix).
    """
    pil_img = Image.fromarray(img)
    # JPEG cannot store an alpha channel and Pillow refuses to save such images,
    # so drop alpha from 4-channel (RGBA) and 2-channel (LA) inputs first.
    if pil_img.mode == "RGBA":
        pil_img = pil_img.convert("RGB")
    elif pil_img.mode == "LA":
        pil_img = pil_img.convert("L")
    buf = io.BytesIO()
    pil_img.save(buf, format="JPEG")
    return base64.b64encode(buf.getvalue()).decode("utf-8")
def gpt4_dim(img):
    """
    Uses the OpenAI "gpt-4o" model to extract dimensions from a mechanical drawing image.

    The API key is read from the OPENAI_API_KEY environment variable after
    loading a .env file with load_dotenv().

    Args:
        img (numpy.ndarray): The input image of a mechanical drawing.

    Returns:
        The Python literal found in the model's reply, as parsed by
        ast.literal_eval; the prompt asks for a list of strings.

    Raises:
        ValueError: If the model returns no text content, or the reply is not
            a valid Python literal.
        SyntaxError: If the reply cannot be parsed as Python at all.
    """
    import re

    from openai import OpenAI

    load_dotenv()
    API_KEY = os.getenv("OPENAI_API_KEY")
    client = OpenAI(api_key=API_KEY)
    img_ = convert_img(img)
    system_prompt = (
        'You are a specialized OCR system capable of reading mechanical drawings. You read:\n'
        'Measurements, usually scattered and oriented text in the image and with arrows in the surroundings. If tolerances are present, read them as "nominal" "upper" "lower". e.g: "10 +0.1 0"\n'
        'Angles, usually oriented text with arrows in the surroundings\n'
        'Feature Control Frames, usually in boxes, return either the symbol or its description, then the rest of the text'
    )
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}],
        },
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_}", "detail": "high"}},
                {"type": "text", "text": "Based on the image, return ONLY A PYTHON LIST OF STRINGS extracting dimensions"},
            ],
        },
    ]
    response = client.chat.completions.create(model="gpt-4o", messages=messages, max_tokens=3000)
    assistant_response = response.choices[0].message.content
    if assistant_response is None:
        # The API may return a message without text; fail with a clear error
        # instead of an AttributeError further down.
        raise ValueError("gpt-4o returned no text content")
    # Match the Markdown code fence explicitly. str.strip('```python\n```') treats
    # its argument as a set of characters, so it fails on e.g. a ```json tag,
    # prose before the fence, or whitespace after the closing fence.
    fence = re.search(r"```(?:[A-Za-z][\w+-]*)?\s*(.*?)\s*(?:```|\Z)", assistant_response, re.DOTALL)
    payload = fence.group(1) if fence else assistant_response
    # Convert the cleaned string into a Python list
    return ast.literal_eval(payload.strip())
def ask_gpt(messages):
    """
    Sends a chat conversation to the OpenAI "gpt-4o" model and returns the reply text.

    The API key is read from the OPENAI_API_KEY environment variable after
    loading a .env file with load_dotenv().

    Args:
        messages (list): Chat messages, passed unchanged to the chat completions API.

    Returns:
        The content of the first choice's message.
    """
    from openai import OpenAI

    load_dotenv()
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=3000,
    )
    return completion.choices[0].message.content