File size: 6,888 Bytes
36255e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import torch
import cv2
from PIL import Image
import numpy as np
from io import BytesIO
import torch
from ultralytics import YOLO
from torchvision import transforms
from transformers import TableTransformerForObjectDetection
from PIL import ImageDraw
import numpy as np
import csv
import easyocr
from tqdm.auto import tqdm
import csv


# Pick GPU when available; everything below moves tensors/models to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
# new v1.1 checkpoints require no timm anymore
# Table-structure-recognition model (detects rows/columns/cells inside a cropped table).
structure_model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-structure-recognition-v1.1-all")
structure_model.to(device)
print("")

from torchvision import transforms

class MaxResize(object):
    """Callable transform that rescales a PIL image so its longer side equals ``max_size``.

    Aspect ratio is preserved. Note: images whose longer side is smaller than
    ``max_size`` are upscaled, not left alone.
    """

    def __init__(self, max_size=800):
        self.max_size = max_size

    def __call__(self, image):
        w, h = image.size
        scale = self.max_size / max(w, h)
        target = (int(round(w * scale)), int(round(h * scale)))
        return image.resize(target)

# Preprocessing for the detection model: resize (longer side -> 800 px),
# convert to tensor, then normalize with ImageNet mean/std.
detection_transform = transforms.Compose([
    MaxResize(800),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# for output bounding box post-processing
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (cx, cy, w, h) to (x0, y0, x1, y1) corner format.

    Args:
        x: tensor of shape (..., 4) in center format.

    Returns:
        Tensor of the same shape in corner format.
    """
    x_c, y_c, w, h = x.unbind(-1)
    half_w = 0.5 * w
    half_h = 0.5 * h
    # Stack on the LAST dim (the original used dim=1, which only worked for
    # 2-D (N, 4) inputs); dim=-1 handles any number of leading batch dims
    # and is identical for the 2-D case.
    return torch.stack((x_c - half_w, y_c - half_h, x_c + half_w, y_c + half_h), dim=-1)


def rescale_bboxes(out_bbox, size):
    """Convert normalized (cx, cy, w, h) boxes to absolute (x0, y0, x1, y1) pixels.

    Args:
        out_bbox: tensor of normalized boxes in center format.
        size: (width, height) of the target image.
    """
    img_w, img_h = size
    scale = torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    return box_cxcywh_to_xyxy(out_bbox) * scale


# update id2label to include "no object"
# The model reserves the last logit index (== number of real classes) for
# "no detection"; add it to the label map so every argmax index resolves.
id2label = structure_model.config.id2label
id2label[len(structure_model.config.id2label)] = "no object"


def outputs_to_objects(outputs, img_size, id2label):
    """Convert raw model outputs into a list of detected-object dicts.

    Args:
        outputs: model output with ``logits`` and ``pred_boxes`` (batch size 1).
        img_size: (width, height) of the source image, used to rescale boxes.
        id2label: class-index -> label-name map (must contain "no object").

    Returns:
        List of {'label', 'score', 'bbox'} dicts with absolute-pixel boxes,
        excluding "no object" predictions.
    """
    probas = outputs.logits.softmax(-1).max(-1)
    labels = probas.indices.detach().cpu().numpy()[0]
    scores = probas.values.detach().cpu().numpy()[0]
    boxes = rescale_bboxes(outputs['pred_boxes'].detach().cpu()[0], img_size)

    detected = []
    for label, score, box in zip(labels, scores, boxes.tolist()):
        name = id2label[int(label)]
        # skip the background / "no detection" class
        if name != 'no object':
            detected.append({
                'label': name,
                'score': float(score),
                'bbox': [float(coord) for coord in box],
            })
    return detected

# Preprocessing for the structure model: resize (longer side -> 1000 px),
# convert to tensor, then normalize with ImageNet mean/std.
structure_transform = transforms.Compose([
    MaxResize(1000),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

def TSR(cropped_image):
    """Run table-structure recognition on a cropped table image.

    Args:
        cropped_image: PIL image of a single table.

    Returns:
        Tuple of (copy of the image with red outlines drawn around each
        detected structure element, list of detection dicts from
        ``outputs_to_objects``).
    """
    pixel_values = structure_transform(cropped_image).unsqueeze(0).to(device)

    # forward pass (inference only, no gradients)
    with torch.no_grad():
        outputs = structure_model(pixel_values)

    # Ensure id2label contains "no object" exactly once.  The original code
    # unconditionally did `d[len(d)] = "no object"` here on EVERY call, which
    # appended a brand-new key each time and grew the dict unboundedly.
    structure_id2label = structure_model.config.id2label
    if "no object" not in structure_id2label.values():
        structure_id2label[len(structure_id2label)] = "no object"

    cells = outputs_to_objects(outputs, cropped_image.size, structure_id2label)

    # Draw the detected boxes on a copy so the input image is left untouched.
    cropped_table_visualized = cropped_image.copy()
    draw = ImageDraw.Draw(cropped_table_visualized)
    for cell in cells:
        draw.rectangle(cell["bbox"], outline="red")

    return cropped_table_visualized, cells
############# Visualizing rows and columns on cropped image


## Modified

def get_cell_coordinates_by_row(table_data):
    """Build per-row cell bounding boxes from detected table rows and columns.

    Each cell takes its vertical extent from its row and its horizontal
    extent from its column.

    Args:
        table_data: list of detection dicts with 'label' and 'bbox' keys
            (as produced by ``outputs_to_objects``).

    Returns:
        List, ordered top-to-bottom, of dicts:
        {'row': row bbox, 'cells': [...], 'cell_count': int}, where each
        cell entry is {'column': column bbox,
                       'cell': [xmin, ymin, xmax, ymax]} ordered left-to-right.
    """
    # Rows sorted top-to-bottom, columns sorted left-to-right.
    rows = sorted(
        (entry for entry in table_data if entry['label'] == 'table row'),
        key=lambda entry: entry['bbox'][1],
    )
    columns = sorted(
        (entry for entry in table_data if entry['label'] == 'table column'),
        key=lambda entry: entry['bbox'][0],
    )

    cell_coordinates = []
    for row in rows:
        # Vertical span comes from the row box.
        ymin, ymax = row['bbox'][1], row['bbox'][3]
        cells = [
            {
                'column': column['bbox'],
                # Horizontal span comes from the column box.
                'cell': [column['bbox'][0], ymin, column['bbox'][2], ymax],
            }
            for column in columns
        ]
        # Columns were pre-sorted, so cells are already left-to-right.
        cell_coordinates.append({
            'row': row['bbox'],
            'cells': cells,
            'cell_count': len(cells),
        })

    # Rows were pre-sorted, so the result is already top-to-bottom.
    return cell_coordinates


# Initialize the EasyOCR reader
# Module-level singleton used by apply_ocr below.
reader = easyocr.Reader(['en'])  # this needs to run only once to load the model into memory

def apply_ocr(cell_coordinates, cropped_image):
    """OCR every cell of a table and collect the recognized text row by row.

    Args:
        cell_coordinates: output of ``get_cell_coordinates_by_row``.
        cropped_image: PIL image of the table the coordinates refer to.

    Returns:
        List of rows, each a list of strings; "NAN" marks a cell where no
        text was detected.
    """
    data = []
    for row in tqdm(cell_coordinates):
        row_text = []
        for cell in row["cells"]:
            # Crop the cell and convert once to a numpy array for easyocr.
            # (The original converted to np.array twice, copying needlessly.)
            cell_image = np.array(cropped_image.crop(cell["cell"]))
            result = reader.readtext(cell_image)
            if result:
                # Each result entry is (bbox, text, confidence); join the text pieces.
                row_text.append(" ".join(piece[1] for piece in result))
            else:
                row_text.append("NAN")  # placeholder when no text is detected
        data.append(row_text)
    return data


# Print the extracted text for each row
"""for idx, row_data in enumerate(data):
    print(f"Row {idx + 1}: {row_data}")"""

def op_csv(data):
    """Write rows of extracted table text to 'extract.csv'.

    Args:
        data: iterable of rows, each an iterable of strings.

    Returns:
        The output CSV path on success, or None if writing failed
        (best-effort: the error is printed, not raised).
    """
    output_csv_file = 'extract.csv'
    try:
        with open(output_csv_file, mode='w', newline='', encoding='utf-8') as file:
            # writerows handles the per-row loop in one call
            csv.writer(file).writerows(data)
        print(f"Data successfully written to {output_csv_file}")
        return output_csv_file
    except Exception as e:
        print(f"An error occurred: {e}")
        # Make the failure return value explicit (the original fell off the
        # end and returned None implicitly).
        return None