File size: 7,346 Bytes
3f42a6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
from dotenv import load_dotenv
import cv2, os, ast, io, base64
from PIL import Image

def call_VL(model, processor, device, messages):
    """
    Processes vision-language (VL) data using a model and processor.

    Args:
        model: The vision-language model used for generating outputs.
        processor: Preprocessing utility to format inputs and decode outputs.
        device: The computational device (e.g., 'cuda' or 'cpu') where processing occurs.
        messages: A list of messages containing text and visual information.

    Returns:
        dict: A dictionary representation of the model's processed output.

    Raises:
        ValueError, SyntaxError: If the model reply is not a valid Python literal.
    """
    import re
    from qwen_vl_utils import process_vision_info

    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(device)
    generated_ids = model.generate(**inputs, max_new_tokens=512)
    # Drop the prompt tokens so only the newly generated continuation is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    # BUG FIX: the previous `.strip('```python\n```')` stripped any of the
    # CHARACTERS ` p y t h o n \n from both ends (str.strip takes a char set,
    # not a prefix/suffix), which could eat legitimate payload characters and
    # did not reliably remove a fenced code block. Extract the fenced body
    # explicitly instead; fall back to the raw text when no fence is present.
    raw = output_text[0].strip()
    fence = re.search(r"```(?:python)?\s*(.*?)\s*```", raw, re.DOTALL)
    cleaned_output = fence.group(1) if fence else raw
    # Convert the cleaned string into a Python object (expected: dict or list).
    return ast.literal_eval(cleaned_output)

def load_VL(model_name = "Qwen/Qwen2-VL-7B-Instruct"):
    """
    Load a Qwen2-VL vision-language model together with its processor.

    Args:
        model_name (str): Hugging Face model id or local checkpoint path.
                          Defaults to "Qwen/Qwen2-VL-7B-Instruct".

    Returns:
        tuple: (model, processor) ready for inference.
    """
    from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

    # device_map="auto" lets transformers place the weights across available
    # devices; torch_dtype="auto" picks the checkpoint's native precision.
    vl_model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto",
    )
    vl_processor = AutoProcessor.from_pretrained(model_name)
    return vl_model, vl_processor

def llm_dim(llm, img, device, scale = 1):
    """
    Run the VL model as an OCR engine over a mechanical drawing to extract
    dimension strings (measurements with tolerances, angles).

    Args:
        llm (tuple): A tuple containing the loaded VL model and its processor.
        img (numpy.ndarray): The input image from which dimensions need to be extracted.
        device (torch.device): The device (e.g., CPU or GPU) to run the model on.
        scale (float, optional): Scaling factor for resizing the image. Defaults to 1.

    Returns:
        list: A list of strings representing the extracted dimensions from the image.
    """
    # Resize first (INTER_AREA suits downscaling), then hand PIL a copy the
    # chat template can embed.
    target_size = (int(img.shape[1] * scale), int(img.shape[0] * scale))
    pil_img = Image.fromarray(cv2.resize(img, target_size, interpolation=cv2.INTER_AREA))

    system_text = '''You are a specialized OCR system capable of reading mechanical drawings. You read:
                        Measurements, usually scattered and oriented text in the image and with arrows in the surroundings. If tolerances are present, read them as "nominal" "upper" "lower". e.g: "10 +0.1 0"
                        Angles, usually oriented text with arrows in the surroundings
                        Do not include surface finishes'''
    user_text = "Based on the image, return ONLY A PYTHON LIST OF STRINGS extracting dimensions"

    messages = [
        {"role": "system", "content": [{"type": "text", "text": system_text}]},
        {"role": "user", "content": [{"type": "image", "image": pil_img},
                                     {"type": "text", "text": user_text}]},
    ]
    output_text = call_VL(model=llm[0], processor=llm[1], device=device, messages=messages)
    print(output_text)
    return output_text

def llm_table(tables, llm, img, device, query):
    """
    Extracts specific information from tables in a mechanical drawing image using a vision-language (VL) model.

    Args:
        tables (list): A list of table bounding boxes to process.
        llm (tuple): A tuple containing the loaded VL model and its processor.
        img (numpy.ndarray): The input image containing the tables.
        device (torch.device): The device (e.g., CPU or GPU) to run the model on.
        query (list): A list of strings specifying the information to extract.

    Returns:
        dict: A Python dictionary containing the extracted information based on the query.
    """
    # NOTE(review): tab_img is overwritten on every iteration, so only the LAST
    # bounding box in tables[0] is actually cropped and sent to the model --
    # confirm whether all tables should be processed. Also raises NameError on
    # the next line if tables[0] is empty.
    for b in tables[0]:
        # Crop the table region; boxes appear to expose x/y/w/h attributes.
        tab_img = img[b.y : b.y + b.h, b.x : b.x + b.w][:]
    tab_img = Image.fromarray(tab_img)

    # Fold the requested fields into a single comma-separated prompt string.
    query_string = ', '.join(query)
    messages = [
        {"role": "user",
            "content": [{"type": "image","image": tab_img,},
                        {"type": "text", "text": f"Based on the image, return only a python dictionary extracting this information: {query_string}"},],
        }]
    
    # call_VL parses the model reply with ast.literal_eval into a dict.
    llm_dict = call_VL(model=llm[0], processor=llm[1], device = device, messages = messages)
    return llm_dict

def convert_img(img):
    """
    Encode a numpy image array as a base64 JPEG string.

    Args:
        img (numpy.ndarray): Image array convertible by PIL.Image.fromarray.

    Returns:
        str: Base64-encoded JPEG bytes, decoded as UTF-8 text.
    """
    buffer = io.BytesIO()
    # Serialize through an in-memory buffer; no temp file touches disk.
    Image.fromarray(img).save(buffer, format='JPEG')
    return base64.b64encode(buffer.getvalue()).decode('utf-8')

def gpt4_dim(img):
    """
    This function uses GPT-4 to extract dimensions from a mechanical drawing image using OCR.

    Args:
        img (numpy.ndarray): The input image of a mechanical drawing.

    Returns:
        list: A Python list of strings containing extracted dimensions from the image.

    Raises:
        ValueError, SyntaxError: If the model reply is not a valid Python literal.
    """
    import re
    from openai import OpenAI

    # API key comes from .env / environment; OPENAI_API_KEY must be set.
    load_dotenv()
    API_KEY = os.getenv("OPENAI_API_KEY")
    client = OpenAI(api_key=API_KEY)
    img_ = convert_img(img)

    messages = [
        {"role": "system",
            "content": [{"type": "text", "text": '''You are a specialized OCR system capable of reading mechanical drawings. You read:
                        Measurements, usually scattered and oriented text in the image and with arrows in the surroundings. If tolerances are present, read them as "nominal" "upper" "lower". e.g: "10 +0.1 0"
                        Angles, usually oriented text with arrows in the surroundings
                        Feature Control Frames, usually in boxes, return either the symbol or its description, then the rest of the text'''},],
        },
        {"role": "user",
            "content": [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_}", "detail": "high"}},
                        {"type": "text", "text": "Based on the image, return ONLY A PYTHON LIST OF STRINGS extracting dimensions"},],
        }]

    response = client.chat.completions.create(model="gpt-4o", messages=messages, max_tokens=3000)
    assistant_response = response.choices[0].message.content
    # BUG FIX: the previous `.strip('```python\n```')` stripped any of the
    # CHARACTERS ` p y t h o n \n from both ends (str.strip takes a char set,
    # not a prefix/suffix), which could eat legitimate payload characters and
    # did not reliably remove a fenced code block. Extract the fenced body
    # explicitly instead; fall back to the raw text when no fence is present.
    raw = assistant_response.strip()
    fence = re.search(r"```(?:python)?\s*(.*?)\s*```", raw, re.DOTALL)
    cleaned_output = fence.group(1) if fence else raw
    # Convert the cleaned string into a Python object (expected: list of str).
    return ast.literal_eval(cleaned_output)

def ask_gpt(messages):
    """
    Send a chat-completion request to GPT-4o and return the assistant's reply.

    Args:
        messages (list): OpenAI chat-format message dicts.

    Returns:
        str: Content of the assistant's response message.
    """
    from openai import OpenAI

    # API key comes from .env / environment; OPENAI_API_KEY must be set.
    load_dotenv()
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    completion = client.chat.completions.create(
        model="gpt-4o", messages=messages, max_tokens=3000)
    return completion.choices[0].message.content