Upload 3 files

Browse files

Files changed (3) hide show

layout_detection_docling_heron.py +498 -0
load_model.py +106 -0
post_process_portfolio_company_json.py +375 -0

layout_detection_docling_heron.py ADDED Viewed

	@@ -0,0 +1,498 @@

+import cv2
+import os
+import supervision as sv  # pip install supervision
+from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
+from pdf2image import convert_from_path
+import numpy as np
+from PIL import Image
+import json
+import pytesseract
+import pandas as pd
+from sentence_transformers import SentenceTransformer, util
+from PyPDF2 import PdfReader
+from datetime import datetime
+import torch
+import logging
+from utils.utils_code import log_time_taken
+from concurrent.futures import ProcessPoolExecutor, as_completed
+import multiprocessing
+import sys
+import gc
+from src.table_processing.tree_structured_json import tree_structured_headers_pipeline
+from config.set_config import set_configuration
+set_config_project = set_configuration()
+layout_model_weights_path = set_config_project.layout_model_weights_path
+no_of_threads = set_config_project.no_of_threads
+from src.docling.ttsr_docling import tsr_inference_image, tsr_inference
+from src.table_processing.table_classification_extraction import process_table_classification_extraction_pipeline
+from src.table_processing.put_table_header import put_table_header_pipeline
+import gc
+from src.layout_detection.load_model import load_model_for_process
+# Set multiprocessing start method
+multiprocessing.set_start_method('spawn', force=True)
+logger = logging.getLogger(__name__)
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+def load_torch(version):
+    """Optionally prepend a vendored torch directory to ``sys.path`` and return ``torch``.
+    Args:
+        version: "2.2.2" selects ``./torch_2_2_2`` and "2.6.0" selects
+            ``./torch_2_6_0``; any other value leaves ``sys.path`` untouched.
+    Returns:
+        The module object produced by ``import torch``.
+    """
+    known_builds = (("2.2.2", "./torch_2_2_2"), ("2.6.0", "./torch_2_6_0"))
+    for known_version, vendored_dir in known_builds:
+        if version == known_version:
+            sys.path.insert(0, vendored_dir)
+            break
+    # NOTE(review): this module already executes ``import torch`` near the top, so
+    # the import below is presumably answered from sys.modules and the path change
+    # may not influence which build is returned -- confirm.
+    import torch
+    logger.info(f"Using Torch Version: {torch.__version__}")
+    return torch
+# Rebind the module-level ``torch`` name to whatever load_torch returns for "2.2.2".
+torch = load_torch("2.2.2")
+# Model identifier handed to load_model_for_process by process_page.
+MODEL_NAME_DOCLING = "ds4sd/docling-layout-heron"
+def get_file_name_without_extension(file_path):
+    """Return the last component of ``file_path`` with its final extension removed."""
+    base_name = os.path.basename(file_path)
+    stem, _extension = os.path.splitext(base_name)
+    return stem
+def convert_numpy(data):
+    """Recursively convert NumPy / pandas values into plain Python objects.
+    Dicts, lists and tuples are walked. NumPy integer, floating and boolean
+    scalars become ``int`` / ``float`` / ``bool``, arrays become nested lists and
+    DataFrames become a list of row dicts. Anything else is returned unchanged.
+    Args:
+        data: Arbitrarily nested structure to convert.
+    Returns:
+        A structure of the same shape that ``json.dump`` can serialise, provided
+        the remaining leaf values are themselves serialisable.
+    """
+    if isinstance(data, dict):
+        return {key: convert_numpy(value) for key, value in data.items()}
+    if isinstance(data, list):
+        return [convert_numpy(item) for item in data]
+    if type(data) is tuple:
+        # Exact type check on purpose: tuple subclasses (e.g. namedtuples) are
+        # left as they are, plain tuples get their items converted.
+        return tuple(convert_numpy(item) for item in data)
+    if isinstance(data, np.bool_):
+        # np.bool_ is neither np.integer nor np.floating and json cannot encode it.
+        return bool(data)
+    if isinstance(data, np.integer):
+        return int(data)
+    if isinstance(data, np.floating):
+        return float(data)
+    if isinstance(data, np.ndarray):
+        return data.tolist()
+    if isinstance(data, pd.DataFrame):
+        # Walk the records as well so no NumPy scalar survives inside them.
+        return convert_numpy(data.to_dict(orient='records'))
+    return data
+def filter_layout_blocks(input_data):
+    """Flatten a ``{key: [block, ...]}`` mapping into a single list of blocks.
+    Blocks are emitted in the mapping's iteration order and none is dropped.
+    """
+    return [block for page_blocks in input_data.values() for block in page_blocks]
+def convert_pdf_to_images(file_path, batch_size=20, dpi=100):
+    """Render a PDF and return a generator over batches of page images.
+    Every page is rendered immediately by ``convert_from_path``; only the slicing
+    into batches happens lazily.
+    Args:
+        file_path: Path of the PDF to render.
+        batch_size: Maximum number of pages per yielded batch.
+        dpi: Resolution forwarded to ``convert_from_path``.
+    Returns:
+        A generator yielding lists of consecutive page images.
+    """
+    rendered_pages = convert_from_path(file_path, dpi=dpi)
+    page_count = len(rendered_pages)
+    def batches():
+        for offset in range(0, page_count, batch_size):
+            # Slicing clamps at the end of the list, so the last batch may be short.
+            yield rendered_pages[offset:offset + batch_size]
+    return batches()
+def read_json(json_file):
+    """Load and return the JSON document stored at ``json_file``.
+    The file is decoded as UTF-8 explicitly so the result does not depend on the
+    platform's default encoding.
+    Args:
+        json_file: Path of the JSON file to read.
+    Returns:
+        The decoded JSON value.
+    """
+    with open(json_file, 'r', encoding='utf-8') as file:
+        return json.load(file)
+def filter_and_sort_headers(data, modified_json_output_filepath):
+    """Re-order each key's blocks column by column and write the result as JSON.
+    Blocks are first sorted by their left edge (``bbox[0]``). A block whose left
+    edge lies beyond the previous block's right edge (``bbox[2]``) closes the
+    current group; each group is emitted sorted by its top edge (``bbox[1]``).
+    Args:
+        data: Mapping of key to a list of block dicts carrying a ``'bbox'`` entry.
+        modified_json_output_filepath: Path the re-ordered mapping is written to.
+    Returns:
+        ``(sorted_data, modified_json_output_filepath)``.
+    """
+    def left_edge(block):
+        return block['bbox'][0]
+    def top_edge(block):
+        return block['bbox'][1]
+    def reading_order(blocks):
+        ordered = []
+        column = []
+        last_block = None
+        for block in sorted(blocks, key=left_edge):
+            if last_block:
+                last_xmax = last_block['bbox'][2]
+                last_xmax_truncated = int(last_block['bbox'][2])
+                starts_new_column = (
+                    block['bbox'][0] > last_xmax
+                    and block['bbox'][0] > last_xmax_truncated
+                )
+                if starts_new_column and column:
+                    ordered.extend(sorted(column, key=top_edge))
+                    column = []
+            column.append(block)
+            last_block = block
+        if column:
+            ordered.extend(sorted(column, key=top_edge))
+        return ordered
+    sorted_data = {key: reading_order(blocks) for key, blocks in data.items()}
+    with open(modified_json_output_filepath, 'w') as f:
+        json.dump(sorted_data, f, indent=4)
+    return sorted_data, modified_json_output_filepath
+def filter_and_sort_layouts(data, modified_json_output_filepath):
+    """Group each key's layout blocks into columns, order them and save as JSON.
+    Within one key the blocks are sorted left to right by ``bbox[0]``. Whenever a
+    block starts to the right of the previous block's ``bbox[2]`` the pending
+    group is flushed, sorted top to bottom by ``bbox[1]``.
+    Args:
+        data: Mapping of key to a list of block dicts carrying a ``'bbox'`` entry.
+        modified_json_output_filepath: Destination path of the JSON dump.
+    Returns:
+        ``(sorted_layout_data, modified_json_output_filepath)``.
+    """
+    def by_xmin(block):
+        return block['bbox'][0]
+    def by_ymin(block):
+        return block['bbox'][1]
+    sorted_layout_data = {}
+    for key, blocks in data.items():
+        ordered_blocks = []
+        pending_group = []
+        previous = None
+        for candidate in sorted(blocks, key=by_xmin):
+            if previous:
+                previous_xmax = previous['bbox'][2]
+                previous_xmax_int = int(previous['bbox'][2])
+                if candidate['bbox'][0] > previous_xmax and candidate['bbox'][0] > previous_xmax_int:
+                    if pending_group:
+                        ordered_blocks += sorted(pending_group, key=by_ymin)
+                        pending_group = []
+            pending_group.append(candidate)
+            previous = candidate
+        if pending_group:
+            ordered_blocks += sorted(pending_group, key=by_ymin)
+        sorted_layout_data[key] = ordered_blocks
+    with open(modified_json_output_filepath, 'w') as f:
+        json.dump(sorted_layout_data, f, indent=4)
+    return sorted_layout_data, modified_json_output_filepath
+@log_time_taken
+def layout_detection(img_path, model, image_processor, threshold=0.6, device='cuda' if torch.cuda.is_available() else 'cpu'):
+    """Run the layout model on one page image and draw the detections on it.
+    Args:
+        img_path: Path of the page image to analyse.
+        model: Detection model; it is called as ``model(**inputs)`` and its
+            ``config.id2label`` supplies the class names.
+        image_processor: Processor used for preprocessing and for
+            ``post_process_object_detection``.
+        threshold: Score threshold forwarded to post-processing.
+        device: Device the inputs are moved to. The default is evaluated once,
+            when the function is defined.
+    Returns:
+        ``(annotated_image, detections, results)``: the page as a BGR array with
+        boxes and labels drawn, the ``sv.Detections`` object, and the
+        post-processed result dict with its tensors moved to the CPU.
+    Raises:
+        Exception: Any failure is logged with the image path and re-raised.
+    """
+    try:
+        # Image.open keeps the file open; convert() returns an independent image,
+        # so the handle is closed right after the conversion instead of leaking
+        # until garbage collection.
+        with Image.open(img_path) as source_image:
+            image = source_image.convert("RGB")
+        # Process image with the Docling Heron model
+        inputs = image_processor(images=[image], return_tensors="pt")
+        # Move inputs to the same device as the model
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = model(**inputs)
+        # Post-process the results; image.size is (width, height), the model wants (height, width)
+        results = image_processor.post_process_object_detection(
+            outputs,
+            target_sizes=torch.tensor([image.size[::-1]], device=device),
+            threshold=threshold
+        )[0]
+        # Move results to CPU for further processing
+        results = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in results.items()}
+        # Convert to supervision Detections format for compatibility
+        xyxy = results["boxes"].numpy()
+        confidence = results["scores"].numpy()
+        class_id = results["labels"].numpy()
+        class_name = [model.config.id2label[label_id] for label_id in class_id]
+        detections = sv.Detections(
+            xyxy=xyxy,
+            confidence=confidence,
+            class_id=class_id,
+            data={"class_name": class_name}
+        )
+        # Custom bounding box color (Red)
+        bbox_color = sv.Color(r=255, g=0, b=0)
+        bounding_box_annotator = sv.BoxAnnotator(color=bbox_color)
+        label_annotator = sv.LabelAnnotator()
+        # Annotate a BGR copy of the page (OpenCV channel order)
+        image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+        annotated_image = bounding_box_annotator.annotate(scene=image_cv, detections=detections)
+        annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)
+        # Clean up
+        del inputs, outputs
+        if device == 'cuda':
+            torch.cuda.empty_cache()
+        gc.collect()
+        return annotated_image, detections, results
+    except Exception as e:
+        logger.error(f"Error in layout_detection for {img_path}: {str(e)}")
+        raise
+def enhance_dpi(image, new_dpi=300, old_dpi=150):
+    """Resize ``image`` by the ratio ``new_dpi / old_dpi`` with Lanczos resampling.
+    Args:
+        image: Image exposing ``width``, ``height`` and ``resize``.
+        new_dpi: Target resolution; coerced with ``int``.
+        old_dpi: Resolution the image currently represents; coerced with ``int``.
+    Returns:
+        The resized image.
+    """
+    source_dpi, target_dpi = int(old_dpi), int(new_dpi)
+    scale = target_dpi / source_dpi
+    target_width = int(image.width * scale)
+    target_height = int(image.height * scale)
+    return image.resize((target_width, target_height), Image.LANCZOS)
+def extract_text_from_bbox(image, bbox):
+    """OCR the region of ``image`` described by ``bbox``.
+    The box is padded by 5 px vertically and 20 px horizontally, clamped to the
+    image, upscaled with enhance_dpi, converted to grayscale and passed to
+    Tesseract.
+    Args:
+        image: PIL image or NumPy array of the page.
+        bbox: Dict with ``'xmin'``, ``'ymin'``, ``'xmax'`` and ``'ymax'``.
+    Returns:
+        The text returned by ``pytesseract.image_to_string``.
+    Raises:
+        TypeError: If ``image`` is neither a PIL image nor a NumPy array.
+    """
+    if isinstance(image, Image.Image):
+        image = np.array(image)
+    elif not isinstance(image, np.ndarray):
+        raise TypeError("Unsupported image type. The image should be either a PIL Image or a NumPy array.")
+    page_height, page_width = image.shape[:2]
+    top = max(0, int(bbox['ymin'] - 5))
+    bottom = min(page_height, int(bbox['ymax'] + 5))
+    left = max(0, int(bbox['xmin'] - 20))
+    right = min(page_width, int(bbox['xmax'] + 20))
+    region = image[top:bottom, left:right]
+    # NOTE(review): the region is treated as BGR here; callers passing RGB data
+    # get their channels swapped before the grayscale step -- confirm intent.
+    region_pil = Image.fromarray(cv2.cvtColor(region, cv2.COLOR_BGR2RGB))
+    upscaled = enhance_dpi(region_pil)
+    upscaled_cv = cv2.cvtColor(np.array(upscaled), cv2.COLOR_RGB2BGR)
+    grayscale = cv2.cvtColor(upscaled_cv, cv2.COLOR_BGR2GRAY)
+    tesseract_config = r'--oem 3 --psm 6 -c tessedit_create_alto=1'
+    return pytesseract.image_to_string(grayscale, config=tesseract_config)
+def check_extracted_text_headers(extracted_text, header_list, model_name='all-MiniLM-L6-v2', threshold=0.8):
+    """Report whether any DataFrame column is semantically close to a wanted header.
+    Args:
+        extracted_text: Candidate table; anything that is not a DataFrame yields False.
+        header_list: Header strings to look for.
+        model_name: SentenceTransformer model to load (loaded on every call).
+        threshold: Cosine similarity a pair must exceed to count as a match.
+    Returns:
+        True as soon as one (header, column) pair exceeds ``threshold``, else False.
+    """
+    if not isinstance(extracted_text, pd.DataFrame):
+        return False
+    encoder = SentenceTransformer(model_name)
+    extracted_headers = list(extracted_text.columns)
+    column_vectors = encoder.encode(extracted_headers, convert_to_tensor=True)
+    wanted_vectors = encoder.encode(header_list, convert_to_tensor=True)
+    # Rows follow header_list, columns follow extracted_headers.
+    similarity_matrix = util.pytorch_cos_sim(wanted_vectors, column_vectors)
+    first_match = next(
+        (
+            (extracted_header, header)
+            for row, header in enumerate(header_list)
+            for col, extracted_header in enumerate(extracted_headers)
+            if similarity_matrix[row][col] > threshold
+        ),
+        None,
+    )
+    if first_match is None:
+        logger.info("No matching headers found.")
+        return False
+    extracted_header, header = first_match
+    logger.info(f"Matching header found: {extracted_header} (similar to {header})")
+    return True
+def process_page(args):
+    """Detect, crop and read every layout block on a single PDF page.
+    Used as the worker submitted to the process pool in yolov10_layout_pipeline;
+    it loads its own model instead of receiving one.
+    Args:
+        args: Tuple ``(page_img, current_page_num, file_name, pdf_images_path,
+            bbox_images_path)``.
+    Returns:
+        ``(page_number, page_information, class_names)`` where ``page_number`` is
+        the page number as a string, ``page_information`` holds one dict per
+        detected block and ``class_names`` comes from load_model_for_process.
+    Raises:
+        Exception: Any failure is logged with the page number and re-raised.
+    """
+    (page_img, current_page_num, file_name, pdf_images_path, bbox_images_path) = args
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    try:
+        model, image_processor, class_names = load_model_for_process(model_name=MODEL_NAME_DOCLING)
+        model.to(device)  # Ensure model is on the correct device
+        image = np.array(page_img)
+        h, w, _ = image.shape
+        page_number = str(current_page_num)
+        img_output_filename = f"{file_name}_page_no_{page_number}.jpeg"
+        img_output_filepath = os.path.join(pdf_images_path, img_output_filename)
+        pil_image = Image.fromarray(image)
+        pil_image.save(img_output_filepath)
+        cropped_images_path = os.path.join(pdf_images_path, f"{file_name}_cropped_images")
+        os.makedirs(cropped_images_path, exist_ok=True)
+        bbox_image, page_detections_info, results_info = layout_detection(img_output_filepath, model, image_processor, device=device)
+        logger.info(f"Processed layout detection for page {page_number}")
+        # layout_detection annotates an RGB2BGR-converted copy and returns it as
+        # BGR, while Image.fromarray expects RGB; convert back so the saved
+        # overlay does not have red and blue swapped.
+        pil_bbox_image = Image.fromarray(cv2.cvtColor(bbox_image, cv2.COLOR_BGR2RGB))
+        bbox_output_filename = f"bbox_{file_name}_page_no_{page_number}.jpeg"
+        bbox_output_filepath = os.path.join(bbox_images_path, bbox_output_filename)
+        pil_bbox_image.save(bbox_output_filepath)
+        page_information = []
+        for idx, bbox in enumerate(page_detections_info.xyxy):
+            label_name = page_detections_info.data['class_name'][idx]
+            class_id = page_detections_info.class_id[idx]
+            score = page_detections_info.confidence[idx]
+            # Padded (10 px) and clamped box: this is what gets stored under 'bbox'.
+            ymin = max(0, bbox[1] - 10)
+            ymax = min(h, bbox[3] + 10)
+            xmin = max(0, bbox[0] - 10)
+            xmax = min(w, bbox[2] + 10)
+            # Unpadded integer box: this is what is cropped and OCR'd.
+            new_bbox = {
+                "xmin": int(bbox[0]),
+                "ymin": int(bbox[1]),
+                "xmax": int(bbox[2]),
+                "ymax": int(bbox[3])
+            }
+            cropped_labels_images_path = os.path.join(cropped_images_path, f"{file_name}_{label_name}_cropped_images")
+            os.makedirs(cropped_labels_images_path, exist_ok=True)
+            crop_label_image_filename = f"{file_name}_label_name{label_name}_page_no_{page_number}_id_{idx + 1}.png"
+            crop_label_image_filename_filepath = os.path.join(cropped_labels_images_path, crop_label_image_filename)
+            crop_label_image_bbox = (new_bbox["xmin"], new_bbox["ymin"], new_bbox["xmax"], new_bbox["ymax"])
+            cropped_label_pil_image = pil_image.crop(crop_label_image_bbox)
+            cropped_label_pil_image.save(crop_label_image_filename_filepath)
+            if label_name == 'Table':
+                # The table crop uses the same box as the saved label crop, so reuse it.
+                df_post_processed, df_original = tsr_inference_image(cropped_label_pil_image)
+                extracted_text = df_post_processed
+                if isinstance(df_original, pd.DataFrame):
+                    extracted_df_markdown = df_original.to_markdown()
+                else:
+                    extracted_df_markdown = df_original
+            else:
+                extracted_text = extract_text_from_bbox(image, new_bbox)
+                extracted_df_markdown = ""
+            # NOTE(review): concatenating block index and page number is ambiguous
+            # (block 1 on page 11 and block 11 on page 1 both give 111). Kept as is
+            # because the id format is part of the emitted JSON.
+            page_block_id = int(f"{str(idx + 1) + str(current_page_num)}")
+            page_information.append({
+                'page_block_id': page_block_id,
+                'label_name': label_name,
+                'pdf_page_id': current_page_num,
+                'pdf_name': file_name,
+                'label_id': class_id,
+                'yolo_detection_confidence_score': score,
+                'bbox': [xmin, ymin, xmax, ymax],
+                'page_img_width': w,
+                'page_img_height': h,
+                'extracted_text': [extracted_text],
+                "extracted_table_markdown": [extracted_df_markdown]
+            })
+        # Clean up
+        del image, bbox_image, model, image_processor
+        if device == 'cuda':
+            torch.cuda.empty_cache()
+        gc.collect()
+        return page_number, page_information, class_names
+    except Exception as e:
+        logger.error(f"Error processing page {current_page_num}: {str(e)}")
+        raise
+@log_time_taken
+def yolov10_layout_pipeline(file_name, file_path, directory_path):
+    """Run layout detection over a whole PDF and write the derived JSON files.
+    Pages are rendered, processed in a process pool by process_page, then passed
+    through the header sorting, tree structuring, layout sorting and table header
+    steps.
+    Args:
+        file_name: Only used for the first log line; it is then replaced by the
+            name derived from ``file_path``.
+        file_path: Path of the input PDF (must end in ``.pdf``).
+        directory_path: Directory under which image and JSON folders are created.
+    Returns:
+        A 13-tuple: ``(json_output_path, layout_list_data, class_names,
+        sorted_data, modified_json_output_filepath, pdf_images_path, file_name,
+        sorted_layout_data, sorted_layout_json_filepath,
+        tree_structured_organized_json_data, tree_structured_json_output_path,
+        filtered_table_header_data, filtered_table_header_data_json_path)``.
+        ``class_names`` is None when the PDF yields no pages.
+    Raises:
+        ValueError: If ``file_path`` does not end in ``.pdf``.
+        Exception: Any processing failure is logged and re-raised.
+    """
+    if not file_path.lower().endswith('.pdf'):
+        raise ValueError("Input file must be a PDF.")
+    logger.info(f"Starting processing for {file_name}")
+    start_time = datetime.now()
+    file_name = get_file_name_without_extension(file_path)
+    pdf_images_path = os.path.join(directory_path, f"{file_name}_images")
+    os.makedirs(pdf_images_path, exist_ok=True)
+    bbox_images_path = os.path.join(pdf_images_path, f"{file_name}_bbox_images")
+    os.makedirs(bbox_images_path, exist_ok=True)
+    json_output_path = os.path.join(directory_path, f"{file_name}_json_output")
+    os.makedirs(json_output_path, exist_ok=True)
+    total_pages_processed = 0
+    data_pdf = {}
+    # Stays None when no page is processed, instead of being unbound at return.
+    class_names = None
+    try:
+        page_generator = convert_pdf_to_images(file_path, batch_size=20, dpi=150)
+        page_args = []
+        for pages in page_generator:
+            if not pages:
+                break
+            for page_num, page_img in enumerate(pages):
+                current_page_num = total_pages_processed + page_num + 1
+                logger.info(f"Processing file {file_name}, page {current_page_num}")
+                page_args.append((
+                    page_img,
+                    current_page_num,
+                    file_name,
+                    pdf_images_path,
+                    bbox_images_path
+                ))
+            total_pages_processed += len(pages)
+        logger.info(f"Total pages to process: {total_pages_processed}")
+        with ProcessPoolExecutor(max_workers=no_of_threads) as executor:
+            future_to_page = {executor.submit(process_page, arg): arg[1] for arg in page_args}
+            for future in as_completed(future_to_page):
+                page_number = future_to_page[future]
+                try:
+                    result = future.result()
+                    page_number, page_information, class_names = result
+                    data_pdf[page_number] = page_information
+                except Exception as e:
+                    logger.error(f"Error processing page {page_number}: {str(e)}")
+                    raise
+        # as_completed yields futures in completion order, so re-key the results
+        # by ascending page number; otherwise the JSON files and the flattened
+        # block list would be ordered differently from run to run.
+        data_pdf = dict(sorted(data_pdf.items(), key=lambda item: int(item[0])))
+        logger.info(f"Processed pages: {data_pdf.keys()}")
+        layout_json_file_path = os.path.join(json_output_path, f"yolo_model_detections_{file_name}.json")
+        user_modification_json_file_path = os.path.join(json_output_path, f"user_modified_{file_name}.json")
+        tree_structured_json_output_path = os.path.join(json_output_path, f"tree_structured_headers_{file_name}.json")
+        data_pdf = convert_numpy(data_pdf)
+        layout_list_data = filter_layout_blocks(data_pdf)
+        with open(layout_json_file_path, 'w') as json_file:
+            json.dump(data_pdf, json_file, indent=4)
+        with open(user_modification_json_file_path, 'w') as json_file:
+            json.dump(data_pdf, json_file, indent=4)
+        # Both sort helpers overwrite the file at the path they are given.
+        sorted_data, modified_json_output_filepath = filter_and_sort_headers(data_pdf, user_modification_json_file_path)
+        tree_structured_organized_json_data = tree_structured_headers_pipeline(user_modification_json_file_path, tree_structured_json_output_path)
+        sorted_layout_data, sorted_layout_json_filepath = filter_and_sort_layouts(data_pdf, layout_json_file_path)
+        filtered_table_header_data, filtered_table_header_data_json_path = put_table_header_pipeline(user_modification_json_file_path, json_output_path, file_name)
+        end_time = datetime.now()
+        logger.info(f"Processed {file_name} from {start_time} to {end_time}, duration: {end_time - start_time}")
+        logger.info(f"JSON file created at: {modified_json_output_filepath}")
+        return (
+            json_output_path,
+            layout_list_data,
+            class_names,
+            sorted_data,
+            modified_json_output_filepath,
+            pdf_images_path,
+            file_name,
+            sorted_layout_data,
+            sorted_layout_json_filepath,
+            tree_structured_organized_json_data,
+            tree_structured_json_output_path,
+            filtered_table_header_data,
+            filtered_table_header_data_json_path
+        )
+    except Exception as e:
+        logger.error(f"Error in yolov10_layout_pipeline: {str(e)}")
+        raise
+    finally:
+        # Ensure GPU memory is cleared
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        gc.collect()
+# Example usage
+# NOTE(review): both paths below are absolute paths from one specific machine;
+# adjust them before running this module as a script.
+if __name__ == "__main__":
+    pdf_path = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/Flexstone_Investor_Report_Test.pdf"
+    output_directory = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/clearstreet_docs/iqeq_docling_heron_bbox_images"
+    # The pipeline recomputes the name from pdf_path itself; this value only
+    # appears in its first log line.
+    file_name = get_file_name_without_extension(pdf_path)
+    yolov10_layout_pipeline(file_name, pdf_path, output_directory)

load_model.py ADDED Viewed

	@@ -0,0 +1,106 @@

+# from ultralytics import YOLOv10
+# import torch
+# from config.set_config import set_configuration
+# set_config_project = set_configuration()
+# layout_model_weights_path = set_config_project.layout_model_weights_path
+# no_of_threads = set_config_project.no_of_threads
+# def load_model_for_process(detection_model_path=layout_model_weights_path):
+#     """
+#     Load model in each subprocess to avoid CUDA initialization issues
+#     Returns:
+#         Model loaded in appropriate device
+#     """
+#     # Your model loading logic
+#     device = "cuda" if torch.cuda.is_available() else "cpu"
+#     # print(f"Using device: {device}")
+#     model = YOLOv10(detection_model_path).to(device)
+#     class_names = model.names
+#     class_names["11"] = "Table-header"
+#     class_names["12"] = "Portfolio-Company-Table"
+#     return model, class_names
+import torch
+from ultralytics import YOLO
+layout_model_weights_path = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/db_structured_chunking/structure_chunking/model_weights/yolov12_epoch60.pt"
+# def load_model_for_process(detection_model_path=layout_model_weights_path):
+#     """
+#     Load model in each subprocess to avoid CUDA initialization issues
+#     Returns:
+#         Model loaded in appropriate device
+#     """
+#     # Your model loading logic
+#     device = "cuda" if torch.cuda.is_available() else "cpu"
+#     # print(f"Using device: {device}")
+#     model = YOLO(detection_model_path).to(device)
+#     class_names = model.names
+#     class_names["11"] = "Table-header"
+#     class_names["12"] = "Portfolio-Company-Table"
+#     print("YOLOV12"*10)
+#     return model, class_names
+'''Below code for docling heron model'''
+from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
+def load_model_for_process(model_name="ds4sd/docling-layout-heron"):
+    """
+    Load the Docling Heron detector and its image processor for the calling process.
+    Args:
+        model_name: Identifier passed to both ``from_pretrained`` calls.
+    Returns:
+        Tuple of (model, image_processor, class_names). The model is moved to
+        CUDA when available, otherwise to the CPU; class_names maps integer ids
+        0-18 to label strings.
+    """
+    if torch.cuda.is_available():
+        device = "cuda"
+    else:
+        device = "cpu"
+    print(f"Using device: {device}")
+    # Load the image processor first, then the model
+    image_processor = RTDetrImageProcessor.from_pretrained(model_name)
+    model = RTDetrV2ForObjectDetection.from_pretrained(model_name).to(device)
+    # Label strings in id order; the position in this list is the class id.
+    ordered_labels = [
+        "Caption",
+        "Footnote",
+        "Formula",
+        "List-item",
+        "Page-footer",
+        "Page-header",
+        "Picture",
+        "Section-header",
+        "Table",
+        "Text",
+        "Title",
+        "Document Index",
+        "Code",
+        "Checkbox-Selected",
+        "Checkbox-Unselected",
+        "Form",
+        "Key-Value Region",
+        # Additional classes for compatibility with existing pipeline
+        "Table-header",
+        "Portfolio-Company-Table",
+    ]
+    class_names = dict(enumerate(ordered_labels))
+    return model, image_processor, class_names

post_process_portfolio_company_json.py ADDED Viewed

	@@ -0,0 +1,375 @@

+import json
+import os
+from fuzzywuzzy import fuzz
+from typing import List, Dict, Any
+import yaml
+import warnings
+import pandas as pd
+# Constants
+# Earlier, shorter identifier list kept for reference:
+# PORTFOLIO_COMPANY_LIST_IDENTIFIER = ["portfolio company or platforms", "portfolio company"]
+# Column-header phrases used by extract_portfolio_companies_from_table to locate
+# the company-name column. fuzzy_match lower-cases both sides, so entries that
+# differ only by case (or are repeated) are redundant.
+PORTFOLIO_COMPANY_LIST_IDENTIFIER = ["portfolio company or platforms","\u20acm","$m","Unrealised fair market valuation","Realised proceeds in the period","Portfolio Company or Platforms","portfolio company", "active investment", "realized/unrealized company","Realized Company","Unrealized Company", "quoted/unquoted company", "portfolio investment", "portfolio company"]
+# Default minimum fuzz.partial_ratio score for fuzzy_match to report a match.
+FUZZY_MATCH_THRESHOLD = 70
+# Cell values that fuzzy-match any of these are skipped when collecting company names.
+EXCLUDE_COMPANY_NAMES = ["total", "subtotal","Total","Investments","Fund"]
+def get_file_name_without_extension(file_path: str) -> str:
+    """Return the base name of ``file_path`` with its final extension stripped."""
+    _directory, file_name = os.path.split(file_path)
+    stem, _extension = os.path.splitext(file_name)
+    return stem
+def fuzzy_match(text: str, patterns: List[str], threshold: int = FUZZY_MATCH_THRESHOLD) -> bool:
+    """Return True when ``text`` partially matches at least one pattern.
+    Both sides are lower-cased and compared with ``fuzz.partial_ratio``; a score
+    of ``threshold`` or more counts as a match.
+    """
+    lowered_text = str(text).lower()
+    return any(
+        fuzz.partial_ratio(lowered_text, pattern.lower()) >= threshold
+        for pattern in patterns
+    )
+def extract_portfolio_companies_from_table(table_data: Dict) -> List[str]:
+    """Collect the company names listed in one table.
+    The first column header that fuzzy-matches PORTFOLIO_COMPANY_LIST_IDENTIFIER
+    is taken as the company column. Its non-empty cell values are returned,
+    except those that fuzzy-match EXCLUDE_COMPANY_NAMES. An empty list is
+    returned when the table has no ``table_info`` or no matching header.
+    """
+    if not table_data.get("table_info"):
+        return []
+    headers = table_data.get("table_column_header", [])
+    # Index of the first header that looks like a company column, if any.
+    company_column = next(
+        (
+            position
+            for position, header in enumerate(headers)
+            if fuzzy_match(header, PORTFOLIO_COMPANY_LIST_IDENTIFIER)
+        ),
+        None,
+    )
+    if company_column is None:
+        return []
+    company_column_name = table_data["table_column_header"][company_column]
+    print("company_column::",company_column)
+    print("cpmpany_column_name::",company_column_name)
+    companies = []
+    for row in table_data["table_info"]:
+        if isinstance(row, dict):
+            candidate = str(row.get(company_column_name, "")).strip()
+            if candidate and not fuzzy_match(candidate, EXCLUDE_COMPANY_NAMES):
+                companies.append(candidate)
+    return companies
+def get_portfolio_company_list(intermediate_data: List[Dict]) -> List[str]:
+    """Extract the unique portfolio companies from all tables in the document.
+    Companies are returned in the order they are first encountered, so the result
+    is the same on every run (a plain ``set`` would make the order depend on
+    string hashing).
+    Args:
+        intermediate_data: Document entries; only those with a ``table_content``
+            key are inspected.
+    Returns:
+        List of distinct company names.
+    """
+    # A dict is used as an insertion-ordered set.
+    unique_companies: Dict[str, None] = {}
+    for entry in intermediate_data:
+        if "table_content" not in entry:
+            continue
+        for table in entry["table_content"]:
+            for company in extract_portfolio_companies_from_table(table):
+                unique_companies.setdefault(company, None)
+    return list(unique_companies)
+def merge_content_under_same_header(
+    intermediate_data: List[Dict],
+    portfolio_company_list: List[str],
+    start_index: int
+) -> tuple[Dict, int]:
+    """
+    Merge the entries that follow ``start_index`` and share its header.
+    Merging stops at the first entry with a different header, at the first entry
+    whose content contains a known company name, or at the first entry holding a
+    table from which companies can be extracted.
+    Args:
+        intermediate_data: Document entries in reading order.
+        portfolio_company_list: Company names searched for in entry content.
+        start_index: Index of the entry that starts the merged section.
+    Returns:
+        ``(merged_entry, next_index)``: the merged section and the index of the
+        first entry that was not merged.
+    """
+    first_entry = intermediate_data[start_index]
+    merged_entry = {
+        "header": first_entry["header"],
+        "content": first_entry.get("content", ""),
+        # Copy the list: it is extended below and must not alias the caller's data.
+        "table_content": list(first_entry.get("table_content", [])),
+        "label_name": first_entry["label_name"],
+        "page_number": first_entry["page_number"],
+        "pdf_name": first_entry["pdf_name"]
+    }
+    current_index = start_index + 1
+    while current_index < len(intermediate_data):
+        current_entry = intermediate_data[current_index]
+        # Check if we're still under the same header
+        if current_entry["header"] != merged_entry["header"]:
+            break
+        # Check if current entry matches any portfolio company
+        content_match = any(company in current_entry.get("content", "")
+                          for company in portfolio_company_list)
+        table_match = any(
+            extract_portfolio_companies_from_table(table)
+            for table in current_entry.get("table_content", [])
+        )
+        if content_match or table_match:
+            break
+        # Merge content
+        if "content" in current_entry:
+            if merged_entry["content"]:
+                merged_entry["content"] += "\n" + current_entry["content"]
+            else:
+                merged_entry["content"] = current_entry["content"]
+        # Merge tables
+        if "table_content" in current_entry:
+            merged_entry["table_content"].extend(current_entry["table_content"])
+        current_index += 1
+    return merged_entry, current_index
+def process_table_page_ids(merged_output):
+    """
+    Fold the page ids of each entry's tables into its comma-separated ``page_number``.
+    Entries without a ``table_content`` key are left untouched. For the others,
+    the existing page numbers are combined with every ``metadata.table_page_id``
+    found in their tables, de-duplicated and written back in ascending order.
+    Args:
+        merged_output (list): Merged entries (dicts); modified in place.
+    Returns:
+        list: The same ``merged_output`` list.
+    """
+    for entry in merged_output:
+        # Only process entries that have table_content
+        if 'table_content' not in entry:
+            continue
+        raw_page_numbers = entry.get('page_number')
+        page_numbers = set()
+        if raw_page_numbers:
+            # str() tolerates an integer page_number; blank fragments (as in
+            # "1,,2") and padding are dropped so int() below cannot fail on them.
+            page_numbers = {
+                part.strip()
+                for part in str(raw_page_numbers).split(',')
+                if part.strip()
+            }
+        # Add unique page numbers from table_content metadata
+        for table in entry['table_content']:
+            if 'metadata' in table and 'table_page_id' in table['metadata']:
+                page_numbers.add(str(table['metadata']['table_page_id']))
+        # Update the page_number key with sorted, unique page numbers
+        if page_numbers:
+            entry['page_number'] = ','.join(sorted(page_numbers, key=int))
+    return merged_output
+################################################################################################################
+## The function below handles more than one occurrence of underlying_assets
+def merge_portfolio_company_sections(intermediate_data: List[Dict]) -> tuple[List[Dict], List[str], List[str]]:
+    """Merge all content and tables under the same portfolio company header until next company is found.
+    Returns:
+        - merged_output: List of merged document sections
+        - fuzzy_matched_companies: List of companies that were fuzzy matched in headers
+        - portfolio_companies: List of all portfolio companies found in tables
+    """
+    portfolio_companies = get_portfolio_company_list(intermediate_data)
+    print(f"Extracted portfolio companies: {portfolio_companies}")
+    merged_output = []
+    fuzzy_matched_companies = set()
+    current_chunk = None
+    active_company = None
+    for entry in intermediate_data:
+        # Find all companies in this entry's header
+        # header_companies = []
+        # for company in portfolio_companies:
+        #     if fuzzy_match(entry["header"], [company], threshold=90):
+        #         header_companies.append(company)
+        #         fuzzy_matched_companies.add(company)
+        entry_copy = entry.copy()
+        header_companies = match_company_names(entry["header"], portfolio_companies)
+        if header_companies:
+            print("&"*100)
+            print("*"*100)
+            print("entry_header::", entry["header"])
+            print("page number of header::", entry["page_number"])
+            print("*"*100)
+            print("header_companies::", header_companies)
+            print("*"*100)
+            # If we have an active chunk, finalize it before starting new one
+            if current_chunk:
+                merged_output.append(current_chunk)
+                current_chunk = None
+                active_company = None
+            # Start new chunk with the first matched company
+            # (in case multiple companies matched, we take the first one)
+            active_company = header_companies[0]
+            current_chunk = {
+                "page_number": entry["page_number"],
+                "pdf_name": entry["pdf_name"],
+                "header": entry["header"],
+                "label_name": entry["label_name"],
+                "content": entry.get("content", ""),
+                "table_content": entry.get("table_content", []),
+                "matched_company": active_company
+            }
+            # If multiple companies matched, create separate chunks for others
+            for additional_company in header_companies[1:]:
+                merged_output.append({
+                    "page_number": entry["page_number"],
+                    "pdf_name": entry["pdf_name"],
+                    "header": entry["header"],
+                    "label_name": entry["label_name"],
+                    "content": entry.get("content", ""),
+                    "table_content": entry.get("table_content", []),
+                    "matched_company": additional_company
+                })
+        elif current_chunk:
+            # Continue adding to current chunk if no new company detected
+            if "content" in entry:
+                if current_chunk["content"]:
+                    current_chunk["content"] += "\n\n" + entry["content"]
+                    current_chunk["page_number"] += "," + str(entry["page_number"])
+                    page_numbers_list = list(dict.fromkeys(str(current_chunk["page_number"]).split(",")))
+                    page_numbers_list = [num.strip() for num in page_numbers_list if num.strip()]
+                    current_chunk["page_number"] = ",".join(page_numbers_list)
+                else:
+                    current_chunk["content"] = entry["content"]
+                    current_chunk["page_number"] = str(entry["page_number"])
+            if "table_content" in entry:
+                current_chunk["table_content"].extend(entry["table_content"])
+                if current_chunk["page_number"]:
+                    if "metadata" in entry["table_content"]:
+                        if "table_page_id" in entry["table_content"]["metadata"]:
+                            current_chunk["page_number"] += "," + str(entry["table_content"]["metadata"]["table_page_id"])
+                    current_chunk["page_number"] += "," + str(entry["page_number"])
+                    page_numbers_list = list(dict.fromkeys(str(current_chunk["page_number"]).split(",")))
+                    page_numbers_list = [num.strip() for num in page_numbers_list if num.strip()]
+                    current_chunk["page_number"] = ",".join(page_numbers_list)
+            # if "page_number" in entry:
+            #     if current_chunk["page_number"]:
+            #         current_chunk["page_number"] += "," + str(entry["page_number"])
+            #     else:
+            #         current_chunk["page_number"] = str(entry["page_number"])
+        else:
+            # Ensure Unique page numbers for this entry
+            entry_copy = entry.copy()
+            if "page_number" in entry_copy :
+                page_numbers_list = list(dict.fromkeys(str(entry_copy["page_number"]).split(",")))
+                page_numbers_list = [num.strip() for num in page_numbers_list if num.strip()]
+                entry_copy["page_number"] = ",".join(page_numbers_list)
+            # Content before any company section
+            merged_output.append(entry_copy)
+    # Add the last active chunk if it exists
+    if current_chunk:
+        # Ensure Unique page numbers for last entry
+        page_numbers_list = list(dict.fromkeys(str(current_chunk["page_number"]).split(",")))
+        page_numbers_list = [num.strip() for num in page_numbers_list if num.strip()]
+        entry_copy["page_number"] = ",".join(page_numbers_list)
+        merged_output.append(current_chunk)
+    merged_output_new = process_table_page_ids(merged_output=merged_output)
+    return merged_output_new, list(fuzzy_matched_companies), portfolio_companies
+################################################################################################
+## The code below implements abbreviation-based company-name matching
+import re
+def match_company_names(header_text: str, companies: List[str], threshold: int = FUZZY_MATCH_THRESHOLD) -> List[str]:
+    """Match company names in text, first checking header text abbreviations, then company abbreviations."""
+    header_text = str(header_text).lower().strip()
+    matched_companies = []
+    # Generate possible abbreviations for header_text
+    header_abbreviations = [
+        ''.join(word[0] for word in header_text.split() if word),  # First letters of each word
+        re.sub(r'[aeiou\s]', '', header_text),  # Remove vowels and spaces
+        header_text.replace(' ', '')  # Remove spaces
+    ]
+    for company in companies:
+        company_lower = company.lower()
+        # First check: header text (full or abbreviated) against company full name
+        for header_pattern in [header_text] + header_abbreviations:
+            if fuzz.partial_ratio(header_pattern, company_lower) >= threshold:
+                matched_companies.append(company)
+                break
+        else:
+            # Second check: header text against company abbreviations
+            company_abbreviations = [
+                ''.join(word[0] for word in company_lower.split() if word),  # First letters of each word
+                re.sub(r'[aeiou\s]', '', company_lower),  # Remove vowels and spaces
+                company_lower.replace(' ', '')  # Remove spaces
+            ]
+            for company_pattern in company_abbreviations:
+                if fuzz.partial_ratio(header_text, company_pattern) >= threshold:
+                    matched_companies.append(company)
+                    break
+    return list(dict.fromkeys(matched_companies))  # Remove duplicates while preserving order
+################################################################################################################
+def process_document_company_wise(
+    intermediate_str_chunk_json: List[Dict],
+    output_directory: str,
+    file_name: str
+) -> List[Dict]:
+    """Process the document and return merged content in original format."""
+    # Convert string input to dict if needed
+    if isinstance(intermediate_str_chunk_json, str):
+        intermediate_str_chunk_json = json.loads(intermediate_str_chunk_json)
+    # Merge content by company sections
+    # merged_content,matched_company_list = merge_portfolio_company_sections(intermediate_str_chunk_json)
+    merged_content,matched_company_list,portfolio_company_list = merge_portfolio_company_sections(intermediate_str_chunk_json)
+    # merged_content[0]["companies_list"] = matched_company_list
+    merged_content[0]["portfolio_companies_list_fuzzy_matched"] = matched_company_list
+    merged_content[0]["portfolio_companies_list_before"] = portfolio_company_list
+    # Ensure output directory exists
+    os.makedirs(output_directory, exist_ok=True)
+    # Save output
+    output_path = os.path.join(output_directory, f"{file_name}_h2h_merged_output.json")
+    with open(output_path, "w", encoding="utf-8") as f:
+        json.dump(merged_content, f, indent=4, ensure_ascii=False)
+    print(f"Saved merged output to {output_path}")
+    return merged_content
+def read_json(file_path):
+    """Reads a JSON file and returns the parsed data."""
+    with open(file_path, 'r', encoding='utf-8') as file:
+        data = json.load(file)
+    return data
+# Example usage
+if __name__ == "__main__":
+    input_str_chunk_json_path="/shared_disk/kushal/db_str_chunking/new_ws_structured_code/Triton2023Q4_patria_sample_output/Triton2023Q4_patria_sample_json_output/Triton2023Q4_patria_sample_final_h2h_extraction.json"
+    input_json = read_json(input_str_chunk_json_path)
+    # Process the data
+    result = process_document_company_wise(
+        intermediate_str_chunk_json=input_json,
+        output_directory="db_structured_chunking/structure_chunking/src/iqeq_modification/testing_sample/output",
+        file_name="sample_report"
+    )
+    print("Processing complete.")
+    # print(json.dumps(result, indent=2))