Spaces:
Runtime error
Runtime error
| from dotenv import load_dotenv | |
| import cv2, os, ast, io, base64 | |
| from PIL import Image | |
| def call_VL(model, processor, device, messages): | |
| """ | |
| Processes vision-language (VL) data using a model and processor. | |
| Args: | |
| model: The vision-language model used for generating outputs. | |
| processor: Preprocessing utility to format inputs and decode outputs. | |
| device: The computational device (e.g., 'cuda' or 'cpu') where processing occurs. | |
| messages: A list of messages containing text and visual information. | |
| Returns: | |
| dict: A dictionary representation of the model's processed output. | |
| """ | |
| from qwen_vl_utils import process_vision_info | |
| text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) | |
| image_inputs, video_inputs = process_vision_info(messages) | |
| inputs = processor( | |
| text=[text], | |
| images=image_inputs, | |
| videos=video_inputs, | |
| padding=True, | |
| return_tensors="pt", | |
| ) | |
| inputs = inputs.to(device) | |
| generated_ids = model.generate(**inputs, max_new_tokens=512) | |
| generated_ids_trimmed = [ | |
| out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) | |
| ] | |
| output_text = processor.batch_decode( | |
| generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False | |
| ) | |
| cleaned_output = output_text[0].strip('```python\n```') | |
| # Convert the cleaned string into a dictionary | |
| return ast.literal_eval(cleaned_output) | |
| def load_VL(model_name = "Qwen/Qwen2-VL-7B-Instruct"): | |
| """ | |
| Loads a vision-language (VL) model and its associated processor. | |
| Args: | |
| model_name (str): The name or path of the pre-trained model to load. | |
| Defaults to "Qwen/Qwen2-VL-7B-Instruct". | |
| Returns: | |
| tuple: A tuple containing the loaded model and its processor. | |
| """ | |
| from transformers import Qwen2VLForConditionalGeneration, AutoProcessor | |
| model = Qwen2VLForConditionalGeneration.from_pretrained( | |
| model_name, torch_dtype="auto", device_map="auto") | |
| processor = AutoProcessor.from_pretrained(model_name) | |
| return model, processor | |
| def llm_dim(llm, img, device, scale = 1): | |
| """ | |
| Extracts dimensions from a mechanical drawing image using a vision-language (VL) model. | |
| Args: | |
| llm (tuple): A tuple containing the loaded VL model and its processor. | |
| img (numpy.ndarray): The input image from which dimensions need to be extracted. | |
| device (torch.device): The device (e.g., CPU or GPU) to run the model on. | |
| scale (float, optional): Scaling factor for resizing the image. Defaults to 1. | |
| Returns: | |
| list: A list of strings representing the extracted dimensions from the image. | |
| """ | |
| resized_img = cv2.resize(img, (int(img.shape[1]*scale), int(img.shape[0]*scale)), interpolation=cv2.INTER_AREA) | |
| img = Image.fromarray(resized_img) | |
| messages = [ | |
| {"role": "system", | |
| "content": [{"type": "text", "text": '''You are a specialized OCR system capable of reading mechanical drawings. You read: | |
| Measurements, usually scattered and oriented text in the image and with arrows in the surroundings. If tolerances are present, read them as "nominal" "upper" "lower". e.g: "10 +0.1 0" | |
| Angles, usually oriented text with arrows in the surroundings | |
| Do not include surface finishes'''},], | |
| }, | |
| {"role": "user", | |
| "content": [{"type": "image","image": img,}, | |
| {"type": "text", "text": "Based on the image, return ONLY A PYTHON LIST OF STRINGS extracting dimensions"},], | |
| }] | |
| output_text = call_VL(model=llm[0], processor=llm[1], device = device, messages=messages) | |
| print(output_text) | |
| return output_text | |
| def llm_table(tables, llm, img, device, query): | |
| """ | |
| Extracts specific information from tables in a mechanical drawing image using a vision-language (VL) model. | |
| Args: | |
| tables (list): A list of table bounding boxes to process. | |
| llm (tuple): A tuple containing the loaded VL model and its processor. | |
| img (numpy.ndarray): The input image containing the tables. | |
| device (torch.device): The device (e.g., CPU or GPU) to run the model on. | |
| query (list): A list of strings specifying the information to extract. | |
| Returns: | |
| dict: A Python dictionary containing the extracted information based on the query. | |
| """ | |
| for b in tables[0]: | |
| tab_img = img[b.y : b.y + b.h, b.x : b.x + b.w][:] | |
| tab_img = Image.fromarray(tab_img) | |
| query_string = ', '.join(query) | |
| messages = [ | |
| {"role": "user", | |
| "content": [{"type": "image","image": tab_img,}, | |
| {"type": "text", "text": f"Based on the image, return only a python dictionary extracting this information: {query_string}"},], | |
| }] | |
| llm_dict = call_VL(model=llm[0], processor=llm[1], device = device, messages = messages) | |
| return llm_dict | |
| def convert_img(img): | |
| pil_img=Image.fromarray(img) | |
| buf = io.BytesIO() | |
| pil_img.save(buf, format='JPEG') | |
| byte_im = buf.getvalue() | |
| return base64.b64encode(byte_im).decode('utf-8') | |
| def gpt4_dim(img): | |
| """ | |
| This function uses GPT-4 to extract dimensions from a mechanical drawing image using OCR. | |
| Args: | |
| img (numpy.ndarray): The input image of a mechanical drawing. | |
| Returns: | |
| list: A Python list of strings containing extracted dimensions from the image. | |
| """ | |
| from openai import OpenAI | |
| load_dotenv() | |
| API_KEY = os.getenv("OPENAI_API_KEY") | |
| client = OpenAI(api_key=API_KEY) | |
| img_ = convert_img(img) | |
| messages = [ | |
| {"role": "system", | |
| "content": [{"type": "text", "text": '''You are a specialized OCR system capable of reading mechanical drawings. You read: | |
| Measurements, usually scattered and oriented text in the image and with arrows in the surroundings. If tolerances are present, read them as "nominal" "upper" "lower". e.g: "10 +0.1 0" | |
| Angles, usually oriented text with arrows in the surroundings | |
| Feature Control Frames, usually in boxes, return either the symbol or its description, then the rest of the text'''},], | |
| }, | |
| {"role": "user", | |
| "content": [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_}", "detail": "high"}}, | |
| {"type": "text", "text": "Based on the image, return ONLY A PYTHON LIST OF STRINGS extracting dimensions"},], | |
| }] | |
| response = client.chat.completions.create(model="gpt-4o", messages=messages, max_tokens=3000) | |
| assistant_response=response.choices[0].message.content | |
| cleaned_output = assistant_response.strip('```python\n```') | |
| # Convert the cleaned string into a dictionary | |
| return ast.literal_eval(cleaned_output) | |
| def ask_gpt(messages): | |
| from openai import OpenAI | |
| load_dotenv() | |
| API_KEY = os.getenv("OPENAI_API_KEY") | |
| client = OpenAI(api_key=API_KEY) | |
| response = client.chat.completions.create(model="gpt-4o", messages=messages, max_tokens=3000) | |
| assistant_response=response.choices[0].message.content | |
| return assistant_response |