Kushalguptaiitb
/

table_test

@@ -1,497 +0,0 @@
-import cv2
-import os
-import supervision as sv  # pip install supervision
-from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
-from pdf2image import convert_from_path
-import numpy as np
-from PIL import Image
-import json
-import pytesseract
-import pandas as pd
-from sentence_transformers import SentenceTransformer, util
-from PyPDF2 import PdfReader
-from datetime import datetime
-import torch
-import logging
-from utils.utils_code import log_time_taken
-from concurrent.futures import ProcessPoolExecutor, as_completed
-import multiprocessing
-import sys
-import gc
-from src.table_processing.tree_structured_json import tree_structured_headers_pipeline
-from config.set_config import set_configuration
-set_config_project = set_configuration()
-layout_model_weights_path = set_config_project.layout_model_weights_path
-no_of_threads = set_config_project.no_of_threads
-from src.docling.ttsr_docling import tsr_inference_image, tsr_inference
-from src.table_processing.table_classification_extraction import process_table_classification_extraction_pipeline
-from src.table_processing.put_table_header import put_table_header_pipeline
-import gc
-from src.layout_detection.load_model import load_model_for_process
# Configure logging: timestamped records at INFO level and above.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Force the 'spawn' start method for worker processes, overriding any
# start method that was selected earlier (force=True).
multiprocessing.set_start_method('spawn', force=True)
def load_torch(version):
    """Import torch, giving priority to a vendored copy for *version*.

    Args:
        version: Version string selecting the vendored directory. Known
            values are "2.2.2" and "2.6.0"; any other value leaves
            ``sys.path`` untouched and is reported with a warning.

    Returns:
        The imported ``torch`` module.
    """
    vendored_dirs = {
        "2.2.2": "./torch_2_2_2",
        "2.6.0": "./torch_2_6_0",
    }
    vendored_dir = vendored_dirs.get(version)
    if vendored_dir is None:
        # Previously an unknown version was ignored without any trace.
        logger.warning(f"Unsupported Torch version requested: {version}; sys.path left unchanged")
    elif vendored_dir not in sys.path:
        # Guard against stacking duplicate entries on repeated calls.
        # The directory is relative, so it resolves against the current
        # working directory at import time.
        sys.path.insert(0, vendored_dir)
    # NOTE(review): this file also has a plain `import torch` among its
    # top-level imports; if that runs first, the import below returns the
    # already-loaded module and the path entry has no effect — confirm intent.
    import torch
    logger.info(f"Using Torch Version: {torch.__version__}")
    return torch
torch = load_torch("2.2.2")
def get_file_name_without_extension(file_path):
    """Return the last path component of *file_path* minus its final extension."""
    base_name = os.path.basename(file_path)
    stem = os.path.splitext(base_name)[0]
    return stem
def convert_numpy(data):
    """Recursively replace NumPy/pandas values with plain Python equivalents.

    Used to make nested results serialisable with ``json``.

    Conversions:
        * dict   -> dict (values converted; NumPy scalar keys unwrapped)
        * list   -> list, plain tuple -> tuple (items converted)
        * np.bool_ -> bool, np.integer -> int, np.floating -> float
        * np.ndarray -> nested lists via ``tolist()``
        * pd.DataFrame -> list of row dicts (``orient='records'``)

    Any other value (including named tuples) is returned unchanged.
    """
    if isinstance(data, dict):
        return {
            # json.dump rejects NumPy scalars as keys, so unwrap them too.
            (key.item() if isinstance(key, np.generic) else key): convert_numpy(value)
            for key, value in data.items()
        }
    if isinstance(data, list):
        return [convert_numpy(item) for item in data]
    if isinstance(data, tuple) and not hasattr(data, '_fields'):
        # Plain tuples may hide NumPy scalars; named tuples are left alone
        # because they cannot be rebuilt from a single iterable.
        return tuple(convert_numpy(item) for item in data)
    if isinstance(data, np.bool_):
        # np.bool_ is neither np.integer nor JSON serialisable.
        return bool(data)
    if isinstance(data, np.integer):
        return int(data)
    if isinstance(data, np.floating):
        return float(data)
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, pd.DataFrame):
        # Recurse so that any NumPy values left inside the records are
        # converted as well.
        return convert_numpy(data.to_dict(orient='records'))
    return data
def filter_layout_blocks(input_data):
    """Flatten a mapping of block lists into one list, in the mapping's value order."""
    return [block for page_blocks in input_data.values() for block in page_blocks]
def convert_pdf_to_images(file_path, batch_size=20, dpi=100):
    """Return a generator yielding the PDF's pages as lists of images, batch by batch.

    Each batch is rasterised only when the generator reaches it, so at most
    ``batch_size`` page images are produced per step instead of the whole
    document being rendered up front.

    Args:
        file_path: Path to the PDF file.
        batch_size: Maximum number of pages per yielded list; must be >= 1.
        dpi: Rendering resolution passed to ``convert_from_path``.

    Returns:
        A generator of lists of page images, in page order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Imported here so the block does not depend on a new file-level import.
    from pdf2image import pdfinfo_from_path

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")
    # Read the page count eagerly so an unreadable PDF fails at call time,
    # not on first iteration.
    total_pages = int(pdfinfo_from_path(file_path)["Pages"])

    def page_generator():
        # first_page / last_page are 1-based and inclusive.
        for start_page in range(1, total_pages + 1, batch_size):
            end_page = min(start_page + batch_size - 1, total_pages)
            yield convert_from_path(
                file_path,
                dpi=dpi,
                first_page=start_page,
                last_page=end_page,
            )
    return page_generator()
def read_json(json_file):
    """Load and return the JSON document stored at *json_file*.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        return json.load(file)
def _order_blocks_by_column(blocks):
    """Order blocks column by column: left to right, top to bottom within a column.

    Blocks are first sorted by ``bbox[0]`` (xmin). A new column starts when a
    block's xmin lies to the right of the previous block's ``bbox[2]`` (xmax);
    each finished column is then sorted by ``bbox[1]`` (ymin).
    """
    def min_y(block):
        return block['bbox'][1]

    ordered = []
    column = []
    prev_xmax = None
    for block in sorted(blocks, key=lambda item: item['bbox'][0]):
        if column:
            # The original compared against both the raw xmax and its
            # integer truncation; taking the larger of the two is equivalent.
            boundary = max(prev_xmax, int(prev_xmax))
            if block['bbox'][0] > boundary:
                ordered.extend(sorted(column, key=min_y))
                column = []
        column.append(block)
        # Only the immediately preceding block defines the column boundary.
        prev_xmax = block['bbox'][2]
    if column:
        ordered.extend(sorted(column, key=min_y))
    return ordered


def _sort_pages_and_dump(data, output_filepath):
    """Apply column ordering to every entry of *data* and write the result as JSON."""
    sorted_pages = {key: _order_blocks_by_column(blocks) for key, blocks in data.items()}
    with open(output_filepath, 'w', encoding='utf-8') as f:
        json.dump(sorted_pages, f, indent=4)
    return sorted_pages, output_filepath


def filter_and_sort_headers(data, modified_json_output_filepath):
    """Sort each page's blocks into column order and save them as JSON.

    Args:
        data: Mapping of page key to a list of blocks; every block must have
            a ``'bbox'`` sequence ``[xmin, ymin, xmax, ymax]``.
        modified_json_output_filepath: File the sorted mapping is written to
            (overwritten if it exists).

    Returns:
        Tuple ``(sorted_data, modified_json_output_filepath)``.
    """
    return _sort_pages_and_dump(data, modified_json_output_filepath)


def filter_and_sort_layouts(data, modified_json_output_filepath):
    """Sort each page's layout blocks into column order and save them as JSON.

    Applies the same ordering as ``filter_and_sort_headers``; see that
    function for the expected shape of *data*.

    Returns:
        Tuple ``(sorted_layout_data, modified_json_output_filepath)``.
    """
    return _sort_pages_and_dump(data, modified_json_output_filepath)
@log_time_taken
def layout_detection(img_path, model, image_processor, threshold=0.6, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Run the layout model on one page image and draw the detections on it.

    Args:
        img_path: Path of the page image to analyse.
        model: Detection model; called as ``model(**inputs)`` and read for
            ``model.config.id2label``.
        image_processor: Processor used for preprocessing and for
            ``post_process_object_detection``.
        threshold: Score threshold passed to the post-processing step.
        device: Device the input tensors are moved to (the model is expected
            to already be on it).

    Returns:
        Tuple ``(annotated_image, detections, results)``:
        ``annotated_image`` is a BGR array with boxes and labels drawn,
        ``detections`` is the ``sv.Detections`` object, and ``results`` is
        the post-processed output with its tensors moved to the CPU.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    try:
        # Close the file handle as soon as the pixel data has been copied.
        with Image.open(img_path) as source_image:
            image = source_image.convert("RGB")
        inputs = image_processor(images=[image], return_tensors="pt")
        # Move inputs to the same device as the model
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        # PIL reports (width, height); the post-processor is given (height, width).
        results = image_processor.post_process_object_detection(
            outputs,
            target_sizes=torch.tensor([image.size[::-1]], device=device),
            threshold=threshold
        )[0]
        # Move results to CPU for further processing
        results = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in results.items()}
        # Convert to supervision Detections format for compatibility
        xyxy = results["boxes"].numpy()
        confidence = results["scores"].numpy()
        class_id = results["labels"].numpy()
        # int() turns the NumPy label into a plain int before the lookup.
        class_name = [model.config.id2label[int(label_id)] for label_id in class_id]
        detections = sv.Detections(
            xyxy=xyxy,
            confidence=confidence,
            class_id=class_id,
            data={"class_name": class_name}
        )
        # Custom bounding box color (Red)
        bbox_color = sv.Color(r=255, g=0, b=0)
        bounding_box_annotator = sv.BoxAnnotator(color=bbox_color)
        label_annotator = sv.LabelAnnotator()
        # Annotation is done on a BGR copy of the page.
        image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        annotated_image = bounding_box_annotator.annotate(scene=image_cv, detections=detections)
        annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
        # Drop the large tensors before asking the allocator to release memory.
        del inputs, outputs
        # startswith() also covers indexed devices such as 'cuda:0'.
        if str(device).startswith('cuda'):
            torch.cuda.empty_cache()
        gc.collect()
        return annotated_image, detections, results
    except Exception as e:
        logger.error(f"Error in layout_detection for {img_path}: {str(e)}")
        raise
def enhance_dpi(image, new_dpi=300, old_dpi=150):
    """Resize *image* by the ratio ``new_dpi / old_dpi`` using Lanczos resampling.

    Both DPI values are truncated to integers before the ratio is computed,
    and the scaled width and height are truncated to integers as well.
    """
    source_dpi = int(old_dpi)
    target_dpi = int(new_dpi)
    ratio = target_dpi / source_dpi
    target_width = int(image.width * ratio)
    target_height = int(image.height * ratio)
    return image.resize((target_width, target_height), Image.LANCZOS)
def extract_text_from_bbox(image, bbox):
    """OCR the region of *image* described by *bbox* and return its text.

    Args:
        image: A PIL image or a NumPy array.
        bbox: Mapping with ``'xmin'``, ``'ymin'``, ``'xmax'`` and ``'ymax'``
            pixel coordinates. The region is widened by 20 px horizontally
            and 5 px vertically, clipped to the image bounds.

    Returns:
        The string returned by ``pytesseract.image_to_string``, or ``""``
        when the clipped region contains no pixels.

    Raises:
        TypeError: If *image* is neither a PIL image nor a NumPy array.
    """
    if isinstance(image, Image.Image):
        image = np.array(image)
    elif not isinstance(image, np.ndarray):
        raise TypeError("Unsupported image type. The image should be either a PIL Image or a NumPy array.")
    image_height, image_width = image.shape[:2]
    ymin = max(0, int(bbox['ymin'] - 5))
    ymax = min(image_height, int(bbox['ymax'] + 5))
    xmin = max(0, int(bbox['xmin'] - 20))
    xmax = min(image_width, int(bbox['xmax'] + 20))
    cropped_image = image[ymin:ymax, xmin:xmax]
    if cropped_image.size == 0:
        # A degenerate or out-of-bounds box leaves nothing to OCR; the colour
        # conversion below would raise on an empty array.
        return ""
    if cropped_image.ndim == 2:
        # Single-channel input is already grayscale, and the 3-channel
        # conversions below cannot be applied to it.
        gray_image = np.array(enhance_dpi(Image.fromarray(cropped_image)))
    else:
        # NOTE(review): the two channel swaps around the resize cancel out,
        # so BGR2GRAY sees the channels in the order the caller supplied.
        # For RGB input that applies BGR weights — confirm this is intended.
        cropped_image_pil = Image.fromarray(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
        high_dpi_image = enhance_dpi(cropped_image_pil)
        high_dpi_image_cv = cv2.cvtColor(np.array(high_dpi_image), cv2.COLOR_RGB2BGR)
        gray_image = cv2.cvtColor(high_dpi_image_cv, cv2.COLOR_BGR2GRAY)
    custom_config = r'--oem 3 --psm 6 -c tessedit_create_alto=1'
    return pytesseract.image_to_string(gray_image, config=custom_config)
# Per-process cache of loaded SentenceTransformer models, keyed by model name,
# so repeated calls do not reload the model each time.
_SENTENCE_MODEL_CACHE = {}


def check_extracted_text_headers(extracted_text, header_list, model_name='all-MiniLM-L6-v2', threshold=0.8):
    """Report whether any DataFrame column name resembles one of *header_list*.

    Args:
        extracted_text: Candidate table; anything that is not a
            ``pd.DataFrame`` yields ``False`` immediately.
        header_list: Reference header strings to compare against.
        model_name: SentenceTransformer model used for the embeddings.
        threshold: Cosine similarity that must be exceeded for a match.

    Returns:
        ``True`` as soon as one (reference header, column name) pair exceeds
        *threshold*, otherwise ``False``.
    """
    if not isinstance(extracted_text, pd.DataFrame):
        return False
    # Column labels may be non-strings (e.g. integers); the encoder needs text.
    extracted_headers = [str(column) for column in extracted_text.columns]
    if len(extracted_headers) == 0 or len(header_list) == 0:
        # Nothing to compare, so skip loading the model.
        logger.info("No matching headers found.")
        return False
    model = _SENTENCE_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SENTENCE_MODEL_CACHE[model_name] = model
    extracted_embeddings = model.encode(extracted_headers, convert_to_tensor=True)
    header_embeddings = model.encode(header_list, convert_to_tensor=True)
    # Rows follow header_list, columns follow extracted_headers.
    similarity_matrix = util.pytorch_cos_sim(header_embeddings, extracted_embeddings)
    for i, header in enumerate(header_list):
        for j, extracted_header in enumerate(extracted_headers):
            if similarity_matrix[i][j] > threshold:
                logger.info(f"Matching header found: {extracted_header} (similar to {header})")
                return True
    logger.info("No matching headers found.")
    return False
def process_page(args):
    """Detect, crop and extract the layout blocks of a single PDF page.

    Intended to run in a worker process.

    Args:
        args: Tuple ``(page_img, current_page_num, file_name,
            pdf_images_path, bbox_images_path)``.

    Returns:
        Tuple ``(page_number, page_information, class_names)`` where
        ``page_number`` is the page number as a string,
        ``page_information`` is a list with one dict per detected block, and
        ``class_names`` is what ``load_model_for_process`` returned.

    Side effects:
        Writes the page image, the annotated page image and one crop per
        detected block to disk.

    Raises:
        Exception: Any failure is logged and re-raised unchanged.
    """
    (page_img, current_page_num, file_name, pdf_images_path, bbox_images_path) = args
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    try:
        model, image_processor, class_names = load_model_for_process()
        model.to(device)  # Ensure model is on the correct device
        image = np.array(page_img)
        # shape[:2] also works for single-channel pages, which have no third axis.
        h, w = image.shape[:2]
        page_number = str(current_page_num)
        img_output_filename = f"{file_name}_page_no_{page_number}.jpeg"
        img_output_filepath = os.path.join(pdf_images_path, img_output_filename)
        pil_image = Image.fromarray(image)
        pil_image.save(img_output_filepath)
        cropped_images_path = os.path.join(pdf_images_path, f"{file_name}_cropped_images")
        os.makedirs(cropped_images_path, exist_ok=True)
        bbox_image, page_detections_info, _ = layout_detection(img_output_filepath, model, image_processor, device=device)
        logger.info(f"Processed layout detection for page {page_number}")
        # layout_detection annotates a BGR array; convert it back to RGB so
        # the saved preview does not have red and blue swapped.
        pil_bbox_image = Image.fromarray(cv2.cvtColor(bbox_image, cv2.COLOR_BGR2RGB))
        bbox_output_filename = f"bbox_{file_name}_page_no_{page_number}.jpeg"
        bbox_output_filepath = os.path.join(bbox_images_path, bbox_output_filename)
        pil_bbox_image.save(bbox_output_filepath)
        page_information = []
        for idx, bbox in enumerate(page_detections_info.xyxy):
            label_name = page_detections_info.data['class_name'][idx]
            class_id = page_detections_info.class_id[idx]
            score = page_detections_info.confidence[idx]
            # The reported bbox is padded by 10 px and clipped to the page;
            # the crops below use the unpadded detection box.
            ymin = max(0, bbox[1] - 10)
            ymax = min(h, bbox[3] + 10)
            xmin = max(0, bbox[0] - 10)
            xmax = min(w, bbox[2] + 10)
            new_bbox = {
                "xmin": int(bbox[0]),
                "ymin": int(bbox[1]),
                "xmax": int(bbox[2]),
                "ymax": int(bbox[3])
            }
            cropped_labels_images_path = os.path.join(cropped_images_path, f"{file_name}_{label_name}_cropped_images")
            os.makedirs(cropped_labels_images_path, exist_ok=True)
            crop_label_image_filename = f"{file_name}_label_name{label_name}_page_no_{page_number}_id_{idx + 1}.png"
            crop_label_image_filename_filepath = os.path.join(cropped_labels_images_path, crop_label_image_filename)
            crop_label_image_bbox = (new_bbox["xmin"], new_bbox["ymin"], new_bbox["xmax"], new_bbox["ymax"])
            cropped_label_pil_image = pil_image.crop(crop_label_image_bbox)
            cropped_label_pil_image.save(crop_label_image_filename_filepath)
            if label_name == 'Table':
                # The table crop is the same box as the label crop; reuse it.
                df_post_processed, df_original = tsr_inference_image(cropped_label_pil_image)
                extracted_text = df_post_processed
                if isinstance(df_original, pd.DataFrame):
                    extracted_df_markdown = df_original.to_markdown()
                else:
                    extracted_df_markdown = df_original
            else:
                extracted_text = extract_text_from_bbox(image, new_bbox)
                extracted_df_markdown = ""
            # NOTE(review): concatenating block index and page number is
            # ambiguous across pages (block 1 on page 11 and block 11 on
            # page 1 both give 111). Kept as is because other code may depend
            # on this format — confirm before changing.
            page_block_id = int(f"{idx + 1}{current_page_num}")
            page_information.append({
                'page_block_id': page_block_id,
                'label_name': label_name,
                'pdf_page_id': current_page_num,
                'pdf_name': file_name,
                'label_id': class_id,
                'yolo_detection_confidence_score': score,
                'bbox': [xmin, ymin, xmax, ymax],
                'page_img_width': w,
                'page_img_height': h,
                'extracted_text': [extracted_text],
                "extracted_table_markdown": [extracted_df_markdown]
            })
        # Drop the large objects before the cache is cleared in ``finally``.
        del image, bbox_image, model, image_processor
        return page_number, page_information, class_names
    except Exception as e:
        logger.error(f"Error processing page {current_page_num}: {str(e)}")
        raise
    finally:
        # Release memory on failure as well as on success.
        if device == 'cuda':
            torch.cuda.empty_cache()
        gc.collect()
@log_time_taken
def yolov10_layout_pipeline(file_name, file_path, directory_path):
    """Run layout detection over every page of a PDF and write the JSON outputs.

    Pages are rendered to images, processed in parallel worker processes via
    ``process_page``, and the collected blocks are written, sorted and
    post-processed into several JSON files under
    ``<directory_path>/<pdf stem>_json_output``.

    Args:
        file_name: Name used only for the first log line; it is then replaced
            by the stem derived from *file_path*.
        file_path: Path to the input file; must end in ``.pdf``.
        directory_path: Directory under which image and JSON output folders
            are created.

    Returns:
        A 13-tuple: ``(json_output_path, layout_list_data, class_names,
        sorted_data, modified_json_output_filepath, pdf_images_path,
        file_name, sorted_layout_data, sorted_layout_json_filepath,
        tree_structured_organized_json_data,
        tree_structured_json_output_path, filtered_table_header_data,
        filtered_table_header_data_json_path)``. ``class_names`` is ``None``
        when no page was processed.

    Raises:
        ValueError: If *file_path* does not end in ``.pdf``.
        Exception: Any processing failure is logged and re-raised.
    """
    if not file_path.lower().endswith('.pdf'):
        raise ValueError("Input file must be a PDF.")
    logger.info(f"Starting processing for {file_name}")
    start_time = datetime.now()
    file_name = get_file_name_without_extension(file_path)
    pdf_images_path = os.path.join(directory_path, f"{file_name}_images")
    os.makedirs(pdf_images_path, exist_ok=True)
    bbox_images_path = os.path.join(pdf_images_path, f"{file_name}_bbox_images")
    os.makedirs(bbox_images_path, exist_ok=True)
    json_output_path = os.path.join(directory_path, f"{file_name}_json_output")
    os.makedirs(json_output_path, exist_ok=True)
    total_pages_processed = 0
    data_pdf = {}
    # Stays None if the PDF yields no pages, so the return statement below
    # never touches an unbound name.
    class_names = None
    try:
        page_generator = convert_pdf_to_images(file_path, batch_size=20, dpi=150)
        page_args = []
        for pages in page_generator:
            if not pages:
                break
            for page_num, page_img in enumerate(pages):
                current_page_num = total_pages_processed + page_num + 1
                logger.info(f"Processing file {file_name}, page {current_page_num}")
                page_args.append((
                    page_img,
                    current_page_num,
                    file_name,
                    pdf_images_path,
                    bbox_images_path
                ))
            total_pages_processed += len(pages)
        logger.info(f"Total pages to process: {total_pages_processed}")
        with ProcessPoolExecutor(max_workers=no_of_threads) as executor:
            # arg[1] is the page number; kept for error reporting.
            future_to_page = {executor.submit(process_page, arg): arg[1] for arg in page_args}
            for future in as_completed(future_to_page):
                page_number = future_to_page[future]
                try:
                    page_number, page_information, class_names = future.result()
                    data_pdf[page_number] = page_information
                except Exception as e:
                    logger.error(f"Error processing page {page_number}: {str(e)}")
                    raise
        # Workers finish in arbitrary order; restore page order so the JSON
        # outputs are deterministic. Keys are page numbers as strings.
        data_pdf = {page_key: data_pdf[page_key] for page_key in sorted(data_pdf, key=int)}
        logger.info(f"Processed pages: {data_pdf.keys()}")
        layout_json_file_path = os.path.join(json_output_path, f"yolo_model_detections_{file_name}.json")
        user_modification_json_file_path = os.path.join(json_output_path, f"user_modified_{file_name}.json")
        tree_structured_json_output_path = os.path.join(json_output_path, f"tree_structured_headers_{file_name}.json")
        data_pdf = convert_numpy(data_pdf)
        layout_list_data = filter_layout_blocks(data_pdf)
        with open(layout_json_file_path, 'w') as json_file:
            json.dump(data_pdf, json_file, indent=4)
        with open(user_modification_json_file_path, 'w') as json_file:
            json.dump(data_pdf, json_file, indent=4)
        # The two sort steps rewrite the files dumped above with sorted content.
        sorted_data, modified_json_output_filepath = filter_and_sort_headers(data_pdf, user_modification_json_file_path)
        tree_structured_organized_json_data = tree_structured_headers_pipeline(user_modification_json_file_path, tree_structured_json_output_path)
        sorted_layout_data, sorted_layout_json_filepath = filter_and_sort_layouts(data_pdf, layout_json_file_path)
        filtered_table_header_data, filtered_table_header_data_json_path = put_table_header_pipeline(user_modification_json_file_path, json_output_path, file_name)
        end_time = datetime.now()
        logger.info(f"Processed {file_name} from {start_time} to {end_time}, duration: {end_time - start_time}")
        logger.info(f"JSON file created at: {modified_json_output_filepath}")
        return (
            json_output_path,
            layout_list_data,
            class_names,
            sorted_data,
            modified_json_output_filepath,
            pdf_images_path,
            file_name,
            sorted_layout_data,
            sorted_layout_json_filepath,
            tree_structured_organized_json_data,
            tree_structured_json_output_path,
            filtered_table_header_data,
            filtered_table_header_data_json_path
        )
    except Exception as e:
        logger.error(f"Error in yolov10_layout_pipeline: {str(e)}")
        raise
    finally:
        # Ensure GPU memory is cleared
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()
# Example usage: python <this file> [pdf_path] [output_directory]
# Either argument may be omitted; the defaults below are then used.
if __name__ == "__main__":
    default_pdf_path = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/Flexstone_Investor_Report_Test.pdf"
    default_output_directory = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/clearstreet_docs/iqeq_docling_heron_bbox_images"
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else default_pdf_path
    output_directory = sys.argv[2] if len(sys.argv) > 2 else default_output_directory
    file_name = get_file_name_without_extension(pdf_path)
    yolov10_layout_pipeline(file_name, pdf_path, output_directory)