Upload 8 files
Browse files- company_name_extraction_by_ovis.py +69 -0
- iqeq_app_latest (4).py +1437 -0
- layout_detection_docling_heron (1).py +497 -0
- layout_detection_docling_heron (2).py +497 -0
- load_model (1).py +106 -0
- ovis_config.py +148 -0
- post_process_portfolio_company_json 2.py +402 -0
- rabbitmq_config_investor_report.py +23 -0
company_name_extraction_by_ovis.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import json
import logging
from src.iqeq_modification.ovis_config import _run_inference

logger = logging.getLogger(__name__)


def extract_company_names(table_image_folder: str) -> list:
    """Extract unique portfolio/investee company names from table images.

    Runs the OVIS vision-language model over every supported image in
    ``table_image_folder`` with a strict JSON-list extraction prompt,
    parses each response, and accumulates the de-duplicated names.

    Args:
        table_image_folder: Directory containing cropped table images.

    Returns:
        List of unique, whitespace-stripped company names. Order is not
        guaranteed (names are collected in a set). Empty list when the
        folder contains no supported images.
    """
    logger.info("=" * 80)
    logger.info("STARTED COMPANY NAME EXTRACTION USING OVIS")
    logger.info(f"Image folder: {table_image_folder}")
    logger.info("=" * 80)

    # Load all images from folder (sorted so inference order is deterministic)
    supported_ext = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.webp')
    image_paths = [
        os.path.join(table_image_folder, f)
        for f in sorted(os.listdir(table_image_folder))
        if f.lower().endswith(supported_ext)
    ]

    if not image_paths:
        logger.warning("No valid images found in the folder.")
        return []

    logger.info(f"Found {len(image_paths)} image(s) for inference")

    prompt = (
        "You are an expert financial document analysis model specialized in reading tables from investor reports. "
        "You are given an image of a table that may contain portfolio or investee company details such as company names, fund names, sectors, and investment amounts.\n\n"
        "Your task:\n"
        "1. Identify if the table contains portfolio or investee company information.\n"
        "2. Extract only the actual **company or investee organization names**, excluding fund names, co-investment entities, management labels, or generic terms.\n"
        "3. Do NOT include partial terms, descriptors, or words like 'Fund', 'Holdings', 'Co-Investment', 'Management', 'Other Unitholders', 'Endurance', 'Growth', 'PIK', etc.\n"
        "4. Remove duplicates and retain only unique, meaningful company names.\n"
        "5. Each extracted name should be a clean, full company name (e.g., 'Kate Spade Ltd', 'Milano Ventures Pvt Ltd').\n\n"
        "Return your final answer strictly as a valid JSON list of strings, for example:\n"
        "[\"Kate Spade Pvt Ltd\", \"Milano Ventures\", \"XYZ Technologies\"]\n\n"
        "If no valid company names are found, return [] only.\n"
        "Do not include any explanations, reasoning, or text outside the JSON list."
    )

    company_names = set()

    # One inference call per image keeps each prompt small (avoids overflow).
    for img in image_paths:
        try:
            raw, _ = _run_inference(img, prompt, max_new_tokens=2048)
            logger.debug(f"Raw OVIS output for {img}: {raw}")
            if not raw:
                continue
            # VLMs frequently wrap JSON in markdown code fences despite the
            # prompt; strip ```/```json wrappers before parsing so those
            # responses are not discarded as parse failures.
            cleaned = raw.strip()
            if cleaned.startswith("```"):
                cleaned = cleaned.strip("`").strip()
                if cleaned.lower().startswith("json"):
                    cleaned = cleaned[4:].strip()
            try:
                names = json.loads(cleaned)
                if isinstance(names, list):
                    for name in names:
                        if isinstance(name, str) and name.strip():
                            company_names.add(name.strip())
            except json.JSONDecodeError:
                logger.warning(f"Failed to parse OVIS output for {img}: {raw}")

        except Exception as e:
            # One bad image must not abort the whole batch.
            logger.error(f"OVIS inference failed for {img}: {e}", exc_info=True)
            continue

    logger.info(f"Extracted {len(company_names)} unique company name(s).")
    return list(company_names)

# print(extract_company_names("/shared_disk/kushal/land_contract/company tables"))
|
iqeq_app_latest (4).py
ADDED
|
@@ -0,0 +1,1437 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException, Depends, File, Request, Form
|
| 2 |
+
from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
|
| 3 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 4 |
+
from fastapi.responses import JSONResponse, FileResponse
|
| 5 |
+
from urllib.parse import quote
|
| 6 |
+
from typing import List, Annotated,Dict,Optional,Any
|
| 7 |
+
import uvicorn
|
| 8 |
+
import sys
|
| 9 |
+
import json
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
sys.path.append("/shared_disk/kushal/db_str_chunking/new_ws_structured_code/db_structured_chunking/structure_chunking")
|
| 13 |
+
# from config.set_config.set_configuration import set_config_project
|
| 14 |
+
|
| 15 |
+
from config.set_config import set_configuration
|
| 16 |
+
set_config_project = set_configuration()
|
| 17 |
+
|
| 18 |
+
project_output_directory_path= set_config_project.project_output_directory_path
|
| 19 |
+
project_path = set_config_project.project_path
|
| 20 |
+
|
| 21 |
+
from src.table_processing.table_filter import filtering_table_pipeline
|
| 22 |
+
# from src.qa_correction.user_action_modification import process_actions_and_create_new_file
|
| 23 |
+
from src.table_processing.tree_structured_json import tree_structured_headers_pipeline,tree_structured_headers_content_pipeline
|
| 24 |
+
from src.pre_processing.create_searchable_pdf_old import create_json_pdfminer_pipeline
|
| 25 |
+
|
| 26 |
+
from src.post_processing.clean_dataframe import clean_dataframe
|
| 27 |
+
from src.table_processing.merge_headers_tree_structure import merge_blocks
|
| 28 |
+
|
| 29 |
+
from src.table_processing.create_and_put_table_header import main_pipeline_create_put_table_headers
|
| 30 |
+
from src.table_processing.map_table_with_table_header import map_table_with_its_header
|
| 31 |
+
|
| 32 |
+
# from src.table_processing.table_merge import merge_multi_page_tables_pipeline
|
| 33 |
+
|
| 34 |
+
# from other_code.save_classified_pdf_json_to_excel import create_directories_and_sheets
|
| 35 |
+
|
| 36 |
+
# from src.table_extraction_from_word_csv.word_extraction import main_table_extraction_from_docx
|
| 37 |
+
# from src.table_extraction_from_word_csv.xlsx_extraction import extract_and_save_tables_from_excel
|
| 38 |
+
# from src.table_extraction_from_word_csv.csv_extraction import extract_and_save_tables_from_csv
|
| 39 |
+
# from src.table_extraction_from_word_csv.classify_table_headers import process_main_classifier,get_csv_file_paths,save_classify_files, clean_filename
|
| 40 |
+
|
| 41 |
+
# from src.iqeq_modification.sorting_headers_v2 import filter_and_sort_headers
|
| 42 |
+
# from src.iqeq_modification.portfolio_summary_dynamic_classification import map_company_data
|
| 43 |
+
from src.toc_based_extraction.main_pipeline_toc_based_extraction import customised_toc_extraction_pipeline
|
| 44 |
+
from src.iqeq_modification.post_processing_iqeq import read_json,main_header_pipeline
|
| 45 |
+
from src.iqeq_modification.post_process_portfolio_company_json import process_document_company_wise
|
| 46 |
+
|
| 47 |
+
# from src.filter_pdf_pages_scope3.fuzzy_match_keywords import custom_pipeline_for_filter_keywords_pages_text_search
|
| 48 |
+
# from src.filter_pdf_pages_scope3.keywords_matching import custom_pipeline_for_filter_keywords_pages_tfidf_vector
|
| 49 |
+
# from src.filter_pdf_pages_scope3.keywords_matching_create_pdf import custom_pipeline_for_filter_keywords_pages_sentence_embedding
|
| 50 |
+
|
| 51 |
+
# from src.layout_detection.layout_detection import yolov10_layout_pipeline,get_file_name_without_extension
|
| 52 |
+
from src.layout_detection.layout_detection_docling_heron import yolov10_layout_pipeline,get_file_name_without_extension
|
| 53 |
+
|
| 54 |
+
# from src.table_merge.table_merge_v2 import merge_multi_page_tables_pipeline_v2
|
| 55 |
+
# from src.table_merge.table_merge_new import merge_multi_page_tables_pipeline_v2
|
| 56 |
+
from src.table_merge.table_merge_v5 import merge_multi_page_tables_pipeline_v2
|
| 57 |
+
from src.table_query.query_code_openai import get_query_response
|
| 58 |
+
|
| 59 |
+
from src.custom_headers.pdf_header_detector import process_pdf_for_headers
|
| 60 |
+
from src.custom_headers.consolidate_header_jsons import pipeline_for_merging_headers
|
| 61 |
+
|
| 62 |
+
from utils.utils_code import clear_directory
|
| 63 |
+
|
| 64 |
+
import logging,os
|
| 65 |
+
from logging.config import dictConfig
|
| 66 |
+
import shutil
|
| 67 |
+
import re
|
| 68 |
+
from fastapi import HTTPException, Form
|
| 69 |
+
from src.classification.column_classifier_v2 import classify_column_headers
|
| 70 |
+
from src.classification.classification import perform_classification
|
| 71 |
+
|
| 72 |
+
# --- Application bootstrap: logging, FastAPI app, and working directories ---

log_folder = "logs"
os.makedirs(log_folder, exist_ok=True)

# Configure logging
log_file_path = os.path.join(log_folder, "app.log")

# Dual-sink logging: everything at INFO+ goes both to stdout and to logs/app.log.
logging_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'detailed': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(pathname)s:%(lineno)d - %(message)s',
            'datefmt': '%Y-%m-%d %H:%M:%S'
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'level': 'INFO',
            'formatter': 'detailed',
            'stream': 'ext://sys.stdout'
        },
        'file': {
            'class': 'logging.FileHandler',
            'level': 'INFO',
            'formatter': 'detailed',
            'filename': log_file_path,
            'mode': 'a',
        },
    },
    'loggers': {
        '': {  # root logger
            'handlers': ['console', 'file'],
            'level': 'INFO',
            'propagate': True
        },
        # Add specific loggers for libraries if needed
        'uvicorn': {
            'handlers': ['console', 'file'],
            'level': 'INFO',
            'propagate': False
        },
    }
}

# Apply the configuration
dictConfig(logging_config)

# Create the logger instance
logger = logging.getLogger(__name__)

app = FastAPI()

# NOTE(review): CORS is wide open (any origin, with credentials). Fine for an
# internal tool; lock down allow_origins before any external exposure.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Working directories under the configured project output root.
# Each *_path / *_directory pair aliases the same location.
pdf_input_path = os.path.join(project_output_directory_path, f"pdf_extraction/input")
pdf_input_directory = pdf_input_path
os.makedirs(pdf_input_directory, exist_ok=True)

pdf_output_path = os.path.join(project_output_directory_path, f"pdf_extraction/output")
output_directory = pdf_output_path
os.makedirs(output_directory, exist_ok=True)

word_input_path = os.path.join(project_output_directory_path, f"word_csv_extraction/directory/input")
word_input_directory_path = word_input_path
os.makedirs(word_input_directory_path, exist_ok=True)

word_output_path = os.path.join(project_output_directory_path, f"word_csv_extraction/directory/output")
word_output_directory_path = word_output_path
os.makedirs(word_output_directory_path, exist_ok=True)

# Per-document state keyed by PDF filename; reset on every request by
# /structured_chunking_extract. NOTE(review): module-level mutable state —
# concurrent requests will clobber each other (see endpoint below).
document_data = {}
|
| 151 |
+
|
| 152 |
+
@app.post("/structured_chunking_extract")
|
| 153 |
+
async def upload_documents(request: Request, path: str = Form()) :
|
| 154 |
+
# path = eval(f'{path}')
|
| 155 |
+
print(f'started for path: {path}')
|
| 156 |
+
base_url = str(request.base_url)
|
| 157 |
+
global document_data
|
| 158 |
+
document_data = {}
|
| 159 |
+
pdf_path = path
|
| 160 |
+
clear_directory(pdf_input_path)
|
| 161 |
+
clear_directory(pdf_output_path)
|
| 162 |
+
clear_directory(word_input_path)
|
| 163 |
+
clear_directory(word_output_path)
|
| 164 |
+
|
| 165 |
+
# Initialize response structure
|
| 166 |
+
response = {
|
| 167 |
+
"success": False,
|
| 168 |
+
"message": "",
|
| 169 |
+
# "data": None
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
# Check if the provided path is a PDF file
|
| 173 |
+
if not pdf_path.lower().endswith(".pdf"):
|
| 174 |
+
response["message"] = "Invalid file type. Only PDF files are accepted."
|
| 175 |
+
return response
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# Extract filename
|
| 179 |
+
file_name_with_ext = os.path.basename(pdf_path)
|
| 180 |
+
file_name_without_ext = os.path.splitext(file_name_with_ext)[0]
|
| 181 |
+
|
| 182 |
+
# Create destination path in input directory
|
| 183 |
+
destination_path = os.path.join(pdf_input_directory, file_name_with_ext)
|
| 184 |
+
|
| 185 |
+
try:
|
| 186 |
+
# Copy the file to our input directory
|
| 187 |
+
shutil.copy2(pdf_path, destination_path)
|
| 188 |
+
except Exception as e:
|
| 189 |
+
response["message"] = f"Failed to copy file: {str(e)}"
|
| 190 |
+
return response
|
| 191 |
+
|
| 192 |
+
output_directory_path = os.path.join(output_directory)
|
| 193 |
+
os.makedirs(output_directory_path, exist_ok=True)
|
| 194 |
+
file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
|
| 195 |
+
os.makedirs(file_output, exist_ok=True)
|
| 196 |
+
|
| 197 |
+
table_output_path = os.path.join(file_output, f"table_output")
|
| 198 |
+
os.makedirs(table_output_path, exist_ok=True)
|
| 199 |
+
file_location = destination_path
|
| 200 |
+
|
| 201 |
+
# Pipeline processing
|
| 202 |
+
|
| 203 |
+
json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path,cropped_tables_images_dir_path,_ = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
|
| 204 |
+
table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)
|
| 205 |
+
|
| 206 |
+
custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output)
|
| 207 |
+
header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext)
|
| 208 |
+
|
| 209 |
+
# Initialize data for the new document
|
| 210 |
+
document_data[file_name_with_ext] = {
|
| 211 |
+
|
| 212 |
+
"pdf_path": destination_path,
|
| 213 |
+
"pdf_file_name": file_name_with_ext,
|
| 214 |
+
"model_json_header_output_filepath": [],
|
| 215 |
+
"model_json_layout_output_filepath": [],
|
| 216 |
+
"tree_structured_header_json_filepath": [],
|
| 217 |
+
"user_modified_json_output_filepath": [],
|
| 218 |
+
'user_modified_table_json_filepath': [],
|
| 219 |
+
"frontend_output_json": [],
|
| 220 |
+
"cluster_json": [],
|
| 221 |
+
"id_2_label" : [],
|
| 222 |
+
"file_output_dir" : [],
|
| 223 |
+
"table_output_dir": [],
|
| 224 |
+
"table_with_header_data" : [],
|
| 225 |
+
"table_with_header_json_path" : [],
|
| 226 |
+
"json_output_dir": [],
|
| 227 |
+
"pdf_miner_json_path": [] ,
|
| 228 |
+
"searchable_pdf_path" : []
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
# Store paths and filenames
|
| 232 |
+
document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
|
| 233 |
+
document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
|
| 234 |
+
document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
|
| 235 |
+
document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
|
| 236 |
+
document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
|
| 237 |
+
document_data[file_name_with_ext]["file_output_dir"].append(file_output)
|
| 238 |
+
document_data[file_name_with_ext]["id_2_label"].append(class_names)
|
| 239 |
+
document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
|
| 240 |
+
document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
|
| 241 |
+
document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
|
| 242 |
+
document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)
|
| 243 |
+
|
| 244 |
+
file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
|
| 245 |
+
pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
|
| 246 |
+
pdf_path = document_data[file_name_with_ext]["pdf_path"]
|
| 247 |
+
user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]
|
| 248 |
+
|
| 249 |
+
pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)
|
| 250 |
+
|
| 251 |
+
table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
|
| 252 |
+
document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)
|
| 253 |
+
|
| 254 |
+
document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
|
| 255 |
+
|
| 256 |
+
# Process image URLs
|
| 257 |
+
pdf_images_urls = []
|
| 258 |
+
for file_name in os.listdir(pdf_images_path):
|
| 259 |
+
file_path = os.path.join(pdf_images_path, file_name)
|
| 260 |
+
if file_name.endswith((".jpg", ".jpeg", ".png")):
|
| 261 |
+
img_url = base_url + "image/" + str(quote(file_path))
|
| 262 |
+
pdf_images_urls.append(img_url)
|
| 263 |
+
|
| 264 |
+
# Sort image URLs by page number
|
| 265 |
+
def extract_page_no(url):
|
| 266 |
+
return int(url.split("_")[-1].split(".")[0])
|
| 267 |
+
sorted_urls = sorted(pdf_images_urls, key=extract_page_no)
|
| 268 |
+
|
| 269 |
+
# Create page details
|
| 270 |
+
page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
|
| 271 |
+
|
| 272 |
+
# Store the JSON output
|
| 273 |
+
document_data[file_name_with_ext]["frontend_output_json"].append({
|
| 274 |
+
"layout_output_json_data": layout_output_json_data,
|
| 275 |
+
"layout_json_list_data": layout_list_data,
|
| 276 |
+
"id_2_label": class_names,
|
| 277 |
+
"header_output_json_data": header_output_json_data,
|
| 278 |
+
"table_output_json_data": table_json_data,
|
| 279 |
+
"table_output_json_data_list": table_json_data_list,
|
| 280 |
+
"tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
|
| 281 |
+
"pdf_images_urls": page_details,
|
| 282 |
+
})
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
document_id_name = file_name_with_ext
|
| 286 |
+
|
| 287 |
+
data = document_data[document_id_name]
|
| 288 |
+
file_output_dir = data["file_output_dir"][0]
|
| 289 |
+
json_output_dir = data["json_output_dir"][0]
|
| 290 |
+
pdf_file_name = data["pdf_file_name"]
|
| 291 |
+
pdf_path = data["pdf_path"]
|
| 292 |
+
|
| 293 |
+
# PDFMiner processing
|
| 294 |
+
pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
|
| 295 |
+
modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
|
| 296 |
+
model_modified_json = read_json(modified_json_output_filepath)
|
| 297 |
+
pdfminer_json = read_json(pdf_miner_json_filepath)
|
| 298 |
+
searchable_pdf_path = data["searchable_pdf_path"][0]
|
| 299 |
+
|
| 300 |
+
# table_merged_json_path = merge_multi_page_tables_pipeline(modified_json_output_filepath, pdf_file_name, file_output_dir)
|
| 301 |
+
|
| 302 |
+
table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
|
| 303 |
+
|
| 304 |
+
table_merged_json = read_json(table_merged_json_path)
|
| 305 |
+
|
| 306 |
+
table_mapped_modified_json = map_table_with_its_header(table_merged_json)
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
# table_mapped_modified_json = map_table_with_its_header(model_modified_json)
|
| 310 |
+
|
| 311 |
+
# Main header pipeline
|
| 312 |
+
df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
clean_df, clean_df_json = clean_dataframe(df_final)
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
if isinstance(clean_df_json, str):
|
| 319 |
+
# print("clean_df_json::",clean_df_json)
|
| 320 |
+
# clean_df_json = eval(clean_df_json)
|
| 321 |
+
clean_df_json = json.loads(clean_df_json)
|
| 322 |
+
|
| 323 |
+
file_name = get_file_name_without_extension(pdf_file_name)
|
| 324 |
+
merged_content_company_wise_df = process_document_company_wise(clean_df_json,output_directory=json_output_dir,file_name=file_name,table_output_directory=cropped_tables_images_dir_path)
|
| 325 |
+
company_wise_clean_df, company_wise_clean_df_json = clean_dataframe(merged_content_company_wise_df)
|
| 326 |
+
|
| 327 |
+
json_output_filename = file_name + "_final_h2h_extraction.json"
|
| 328 |
+
final_json_output_filepath = os.path.join(json_output_dir, json_output_filename)
|
| 329 |
+
|
| 330 |
+
with open(final_json_output_filepath, 'w') as f:
|
| 331 |
+
json.dump(clean_df_json, f, indent=4)
|
| 332 |
+
|
| 333 |
+
company_wise_json_output_filename = file_name + "_final_h2h_extraction_company_wise.json"
|
| 334 |
+
company_wise_final_json_output_filepath = os.path.join(json_output_dir, company_wise_json_output_filename)
|
| 335 |
+
|
| 336 |
+
with open(company_wise_final_json_output_filepath, 'w') as f:
|
| 337 |
+
json.dump(merged_content_company_wise_df, f, indent=4)
|
| 338 |
+
|
| 339 |
+
# Tree-structured header content
|
| 340 |
+
# final_tree_structred_header_content = tree_structured_headers_content_pipeline(header_content_json_data, json_output_dir, pdf_file_name)
|
| 341 |
+
|
| 342 |
+
# final_tree_structred_header_content = merge_blocks(final_tree_structred_header_content)
|
| 343 |
+
|
| 344 |
+
# document_data[document_id_name]["cluster_json"].append(final_tree_structred_header_content)
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
# final_toc_h2h_extraction = customised_toc_extraction_pipeline(pdf_path=searchable_pdf_path,yolo_detection_json_path=modified_json_output_filepath,output_directory=file_output_dir)
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
# Step 1: Extract directory and filename without extension
|
| 352 |
+
pdf_path = path
|
| 353 |
+
json_directory = os.path.dirname(pdf_path)
|
| 354 |
+
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
|
| 355 |
+
|
| 356 |
+
# Step 2: Define output path for JSON
|
| 357 |
+
output_json_path = os.path.join(json_directory, f"{json_filename }.json")
|
| 358 |
+
|
| 359 |
+
# If your variable is a JSON string, convert it to dict first
|
| 360 |
+
if isinstance(company_wise_clean_df_json, str):
|
| 361 |
+
company_wise_clean_df_json = json.loads(company_wise_clean_df_json)
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
# Step 3: Save JSON
|
| 365 |
+
with open(output_json_path, 'w', encoding='utf-8') as json_file:
|
| 366 |
+
json.dump(company_wise_clean_df_json, json_file, ensure_ascii=False, indent=4)
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
# # post-processing results
|
| 370 |
+
# post_processing_results = {
|
| 371 |
+
# document_id_name : {
|
| 372 |
+
# # "df_download": json.dumps(clean_df.to_csv(index=False, escapechar='\\', encoding='utf-8')),
|
| 373 |
+
# # "df_download_json" : clean_df_json,
|
| 374 |
+
# "df_download_json": company_wise_clean_df_json,
|
| 375 |
+
# "tree_structured_header_content": final_tree_structred_header_content,
|
| 376 |
+
# "file_name": document_id_name,
|
| 377 |
+
# # "classified_dynamic_json": dynamic_mapped_data_json,
|
| 378 |
+
# "toc_df_download_json" : final_toc_h2h_extraction
|
| 379 |
+
# }
|
| 380 |
+
# }
|
| 381 |
+
|
| 382 |
+
response_final = {
|
| 383 |
+
"status_code": 200,
|
| 384 |
+
# "message":"",
|
| 385 |
+
# "df_download_json": company_wise_clean_df_json,
|
| 386 |
+
"saved_json_path": output_json_path
|
| 387 |
+
}
|
| 388 |
+
|
| 389 |
+
return response_final
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
@app.get("/image/{path:path}")
|
| 393 |
+
async def get_image(path: str):
|
| 394 |
+
if os.path.exists(path):
|
| 395 |
+
return FileResponse(path, media_type="image/jpeg")
|
| 396 |
+
else:
|
| 397 |
+
raise HTTPException(status_code=404, detail="Image not found")
|
| 398 |
+
|
| 399 |
+
@app.get("/file/{path:path}")
|
| 400 |
+
async def get_file(path: str):
|
| 401 |
+
if os.path.exists(path):
|
| 402 |
+
paths = path.split("/")
|
| 403 |
+
filename = paths[len(paths) - 1]
|
| 404 |
+
if path.endswith('.csv'):
|
| 405 |
+
media_type = "text/csv"
|
| 406 |
+
elif path.endswith('.xlsx'):
|
| 407 |
+
media_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
| 408 |
+
else:
|
| 409 |
+
media_type = "application/octet-stream"
|
| 410 |
+
return FileResponse(path, media_type=media_type,filename =filename)
|
| 411 |
+
else:
|
| 412 |
+
raise HTTPException(status_code=404, detail="File not found")
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
@app.post("/table-classification")
|
| 416 |
+
async def table_classification(
|
| 417 |
+
structured_chunk_json_path: str = Form(...),
|
| 418 |
+
class_keywords_table: str = Form(...),
|
| 419 |
+
header_categories: Optional[str] = Form("table_column_header"),
|
| 420 |
+
similarity_threshold: Optional[float] = Form(0.4)
|
| 421 |
+
):
|
| 422 |
+
|
| 423 |
+
try:
|
| 424 |
+
|
| 425 |
+
with open(structured_chunk_json_path, 'r', encoding='utf-8') as file:
|
| 426 |
+
content = file.read()
|
| 427 |
+
|
| 428 |
+
# This regex removes commas before closing braces/brackets, ignoring whitespace
|
| 429 |
+
cleaned_content = re.sub(r',\s*([\]}])', r'\1', content)
|
| 430 |
+
|
| 431 |
+
# Parse the cleaned JSON
|
| 432 |
+
structured_chunk_data = json.loads(cleaned_content)
|
| 433 |
+
|
| 434 |
+
# If class_keywords is a string, try to parse it
|
| 435 |
+
if isinstance(class_keywords_table, str):
|
| 436 |
+
try:
|
| 437 |
+
class_keywords_table = json.loads(class_keywords_table)
|
| 438 |
+
|
| 439 |
+
if not isinstance(class_keywords_table, dict):
|
| 440 |
+
raise ValueError("class_keywords_table must be a dictionary")
|
| 441 |
+
if not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
|
| 442 |
+
for key, value in class_keywords_table.items()):
|
| 443 |
+
raise ValueError("class_keywords_table must be a dictionary with string keys and lists of strings as values")
|
| 444 |
+
except json.JSONDecodeError:
|
| 445 |
+
raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table"})
|
| 446 |
+
|
| 447 |
+
elif not isinstance(class_keywords_table, dict) or not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
|
| 448 |
+
for key, value in class_keywords_table.items()):
|
| 449 |
+
raise HTTPException(status_code=422, detail={"error": "class_keywords_table must be a dictionary with string keys and lists of strings as values"})
|
| 450 |
+
|
| 451 |
+
|
| 452 |
+
# Perform classification
|
| 453 |
+
categorized_headers = perform_classification(
|
| 454 |
+
data=structured_chunk_data,
|
| 455 |
+
class_keywords=class_keywords_table,
|
| 456 |
+
header_categories=header_categories,
|
| 457 |
+
similarity_threshold=similarity_threshold
|
| 458 |
+
)
|
| 459 |
+
return categorized_headers
|
| 460 |
+
except ValueError as e:
|
| 461 |
+
raise HTTPException(status_code=422, detail={"error": "Input validation failed", "message": str(e)})
|
| 462 |
+
except Exception as e:
|
| 463 |
+
raise HTTPException(status_code=422, detail={"error": "Processing failed", "message": str(e)})
|
| 464 |
+
|
| 465 |
+
|
| 466 |
+
@app.post("/table-column-classification")
|
| 467 |
+
async def table_column_classification(
|
| 468 |
+
input_table_classified_json: Annotated[str, Form()],
|
| 469 |
+
class_keywords_table_column: Annotated[str, Form()],
|
| 470 |
+
filter_table_classifier_name: Annotated[str, Form()],
|
| 471 |
+
similarity_threshold: Annotated[str, Form()]
|
| 472 |
+
):
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
try:
|
| 476 |
+
# Parse JSON strings into dictionaries
|
| 477 |
+
input_table_classified_json = json.loads(input_table_classified_json)
|
| 478 |
+
class_keywords_table_column = json.loads(class_keywords_table_column)
|
| 479 |
+
except json.JSONDecodeError as e:
|
| 480 |
+
raise HTTPException(status_code=422, detail={"error": "Invalid JSON format", "message": str(e)})
|
| 481 |
+
|
| 482 |
+
try:
|
| 483 |
+
# Convert similarity_threshold to integer
|
| 484 |
+
similarity_threshold = float(similarity_threshold)
|
| 485 |
+
except ValueError as e:
|
| 486 |
+
raise HTTPException(status_code=422, detail={"error": "Similarity threshold must be a valid integer", "message": str(e)})
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
column_classification_results = classify_column_headers(
|
| 490 |
+
json_data=input_table_classified_json,
|
| 491 |
+
class_keywords=class_keywords_table_column,
|
| 492 |
+
filter_table_classifier_name=filter_table_classifier_name,
|
| 493 |
+
similarity_threshold=similarity_threshold
|
| 494 |
+
)
|
| 495 |
+
|
| 496 |
+
results = {"column_classification_result": column_classification_results}
|
| 497 |
+
|
| 498 |
+
return results
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
# Run the server
# NOTE(review): this __main__ guard sits *above* later definitions in the file
# (e.g. upload_documents below). uvicorn.run() blocks, so when the module is
# executed directly, everything defined after this point does not exist until
# the server shuts down. Confirm this ordering is intentional — convention is
# to place the guard at the end of the module.
if __name__ == "__main__":
    # uvicorn.run("app:app", host="0.0.0.0", port=7061, log_level="info", reload=True)
    # Binds on all interfaces, port 7063, standard info-level logging.
    uvicorn.run( app, host="0.0.0.0", port=7063,log_level="info")
    # uvicorn.run( app, host="0.0.0.0", port=5052,log_level="info")
|
| 506 |
+
|
| 507 |
+
|
| 508 |
+
|
| 509 |
+
def upload_documents(path: str) :
    """Run the full document-processing pipeline for a single PDF.

    Copies the PDF into the working input directory, runs layout detection,
    table filtering, header extraction/merging, PDFMiner text extraction,
    multi-page table merging, header-to-header extraction, and company-wise
    grouping, writing several intermediate and final JSON files along the way.
    The global ``document_data`` dict is reset and repopulated for this file.

    Args:
        path: Filesystem path to the input PDF.

    Returns:
        dict: On success, ``{"status_code": 200, "saved_json_path": ...}``
        pointing at the company-wise JSON written next to the input PDF.
        On early validation/copy failure, ``{"success": False, "message": ...}``.
        NOTE(review): the two shapes are inconsistent — callers must handle both.
    """
    # path = eval(f'{path}')
    # NOTE(review): this binds the *class* `Request`, not an instance — there is
    # no incoming request in this plain function. `str(request.base_url)` below
    # will not be a usable URL; the image URLs built from it need verification.
    request = Request
    print(f'started for path: {path}')
    base_url = str(request.base_url)
    # Pipeline state is kept in a module-level global, reset per call —
    # not safe for concurrent invocations.
    global document_data
    document_data = {}
    pdf_path = path

    # Wipe previous runs' inputs/outputs before processing this file.
    clear_directory(pdf_input_path)
    clear_directory(pdf_output_path)
    clear_directory(word_input_path)
    clear_directory(word_output_path)

    # Initialize response structure
    response = {
        "success": False,
        "message": "",
        # "data": None
    }

    if not pdf_path:
        response["message"] = "No file path provided."
        return response

    # Check if the provided path is a PDF file
    if not pdf_path.lower().endswith(".pdf"):
        response["message"] = "Invalid file type. Only PDF files are accepted."
        return response

    # Extract filename
    file_name_with_ext = os.path.basename(pdf_path)
    file_name_without_ext = os.path.splitext(file_name_with_ext)[0]

    # Create destination path in input directory
    destination_path = os.path.join(pdf_input_directory, file_name_with_ext)

    try:
        # Copy the file to our input directory
        shutil.copy2(pdf_path, destination_path)
    except Exception as e:
        response["message"] = f"Failed to copy file: {str(e)}"
        return response

    # Per-file output tree: <output_directory>/<name>_output/table_output
    output_directory_path = os.path.join(output_directory)
    os.makedirs(output_directory_path, exist_ok=True)
    file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
    os.makedirs(file_output, exist_ok=True)

    table_output_path = os.path.join(file_output, f"table_output")
    os.makedirs(table_output_path, exist_ok=True)
    file_location = destination_path

    # Pipeline processing
    # Layout detection (YOLOv10): returns JSON dirs/paths, class labels,
    # header data, page images, and cropped table image locations.
    json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path,cropped_tables_images_dir_path = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
    table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)

    # Custom header extraction is merged into the model's header JSON;
    # note header_json_output_filepath is rebound to the merged file here.
    custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output)
    header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext)

    # Initialize data for the new document
    document_data[file_name_with_ext] = {

        "pdf_path": destination_path,
        "pdf_file_name": file_name_with_ext,
        "model_json_header_output_filepath": [],
        "model_json_layout_output_filepath": [],
        "tree_structured_header_json_filepath": [],
        "user_modified_json_output_filepath": [],
        'user_modified_table_json_filepath': [],
        "frontend_output_json": [],
        "cluster_json": [],
        "id_2_label" : [],
        "file_output_dir" : [],
        "table_output_dir": [],
        "table_with_header_data" : [],
        "table_with_header_json_path" : [],
        "json_output_dir": [],
        "pdf_miner_json_path": [] ,
        "searchable_pdf_path" : []
    }

    # Store paths and filenames
    # (the merged header JSON doubles as the initial "user modified" JSON)
    document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
    document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
    document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
    document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
    document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
    document_data[file_name_with_ext]["file_output_dir"].append(file_output)
    document_data[file_name_with_ext]["id_2_label"].append(class_names)
    document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
    document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
    document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
    document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)

    file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
    pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
    pdf_path = document_data[file_name_with_ext]["pdf_path"]
    user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]

    # Text extraction: also produces a searchable (OCR'd?) PDF — TODO confirm.
    pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)

    table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
    document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)

    document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)

    # Process image URLs
    # NOTE(review): base_url is derived from the Request class above, so these
    # URLs are suspect — verify against a real request context.
    pdf_images_urls = []
    for file_name in os.listdir(pdf_images_path):
        file_path = os.path.join(pdf_images_path, file_name)
        if file_name.endswith((".jpg", ".jpeg", ".png")):
            img_url = base_url + "image/" + str(quote(file_path))
            pdf_images_urls.append(img_url)

    # Sort image URLs by page number
    # (page number is assumed to be the trailing "_<n>.<ext>" token — TODO confirm)
    def extract_page_no(url):
        return int(url.split("_")[-1].split(".")[0])
    sorted_urls = sorted(pdf_images_urls, key=extract_page_no)

    # Create page details
    page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]

    # Store the JSON output
    document_data[file_name_with_ext]["frontend_output_json"].append({
        "layout_output_json_data": layout_output_json_data,
        "layout_json_list_data": layout_list_data,
        "id_2_label": class_names,
        "header_output_json_data": header_output_json_data,
        "table_output_json_data": table_json_data,
        "table_output_json_data_list": table_json_data_list,
        "tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
        "pdf_images_urls": page_details,
    })

    document_id_name = file_name_with_ext

    # Re-read the state just stored above (single-element lists).
    data = document_data[document_id_name]
    file_output_dir = data["file_output_dir"][0]
    json_output_dir = data["json_output_dir"][0]
    pdf_file_name = data["pdf_file_name"]
    pdf_path = data["pdf_path"]

    # PDFMiner processing
    pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
    modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
    model_modified_json = read_json(modified_json_output_filepath)
    pdfminer_json = read_json(pdf_miner_json_filepath)
    searchable_pdf_path = data["searchable_pdf_path"][0]

    # table_merged_json_path = merge_multi_page_tables_pipeline(modified_json_output_filepath, pdf_file_name, file_output_dir)

    # Merge tables that span multiple pages, then map each table to its header.
    table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)

    table_merged_json = read_json(table_merged_json_path)

    table_mapped_modified_json = map_table_with_its_header(table_merged_json)


    # table_mapped_modified_json = map_table_with_its_header(model_modified_json)

    # Main header pipeline
    df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)


    clean_df, clean_df_json = clean_dataframe(df_final)


    # clean_dataframe may return its JSON form as a string; normalize to dict.
    if isinstance(clean_df_json, str):
        clean_df_json = json.loads(clean_df_json)

    file_name = get_file_name_without_extension(pdf_file_name)
    # Group the extracted content by (portfolio) company.
    merged_content_company_wise_df = process_document_company_wise(clean_df_json,output_directory=json_output_dir,file_name=file_name,table_output_directory=cropped_tables_images_dir_path)
    company_wise_clean_df, company_wise_clean_df_json = clean_dataframe(merged_content_company_wise_df)

    json_output_filename = file_name + "_final_h2h_extraction.json"
    final_json_output_filepath = os.path.join(json_output_dir, json_output_filename)

    with open(final_json_output_filepath, 'w') as f:
        json.dump(clean_df_json, f, indent=4)

    company_wise_json_output_filename = file_name + "_final_h2h_extraction_company_wise.json"
    company_wise_final_json_output_filepath = os.path.join(json_output_dir, company_wise_json_output_filename)

    with open(company_wise_final_json_output_filepath, 'w') as f:
        json.dump(merged_content_company_wise_df, f, indent=4)

    # Tree-structured header content
    # final_tree_structred_header_content = tree_structured_headers_content_pipeline(header_content_json_data, json_output_dir, pdf_file_name)

    # final_tree_structred_header_content = merge_blocks(final_tree_structred_header_content)

    # document_data[document_id_name]["cluster_json"].append(final_tree_structred_header_content)



    # final_toc_h2h_extraction = customised_toc_extraction_pipeline(pdf_path=searchable_pdf_path,yolo_detection_json_path=modified_json_output_filepath,output_directory=file_output_dir)


    # Step 1: Extract directory and filename without extension
    # (the final company-wise JSON is written NEXT TO the original input PDF,
    # not into the pipeline output tree)
    pdf_path = path
    json_directory = os.path.dirname(pdf_path)
    json_filename = os.path.splitext(os.path.basename(pdf_path))[0]

    # Step 2: Define output path for JSON
    output_json_path = os.path.join(json_directory, f"{json_filename }.json")

    # If your variable is a JSON string, convert it to dict first
    if isinstance(company_wise_clean_df_json, str):
        company_wise_clean_df_json = json.loads(company_wise_clean_df_json)


    # Step 3: Save JSON
    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(company_wise_clean_df_json, json_file, ensure_ascii=False, indent=4)


    # # post-processing results
    # post_processing_results = {
    #     document_id_name : {
    #         # "df_download": json.dumps(clean_df.to_csv(index=False, escapechar='\\', encoding='utf-8')),
    #         # "df_download_json" : clean_df_json,
    #         "df_download_json": company_wise_clean_df_json,
    #         "tree_structured_header_content": final_tree_structred_header_content,
    #         "file_name": document_id_name,
    #         # "classified_dynamic_json": dynamic_mapped_data_json,
    #         "toc_df_download_json" : final_toc_h2h_extraction
    #     }
    # }

    response_final = {
        "status_code": 200,
        # "message":"",
        # "df_download_json": company_wise_clean_df_json,
        "saved_json_path": output_json_path
    }

    return response_final
|
| 755 |
+
|
| 756 |
+
|
| 757 |
+
|
| 758 |
+
# def upload_documents(path):
|
| 759 |
+
# logger.info(f"Starting upload_documents for path: {path}")
|
| 760 |
+
|
| 761 |
+
# request = Request
|
| 762 |
+
# base_url = str(request.base_url)
|
| 763 |
+
# global document_data
|
| 764 |
+
# document_data = {}
|
| 765 |
+
# pdf_path = path
|
| 766 |
+
|
| 767 |
+
# # Log directory clearing
|
| 768 |
+
# logger.info("Clearing input and output directories")
|
| 769 |
+
# clear_directory(pdf_input_path)
|
| 770 |
+
# clear_directory(pdf_output_path)
|
| 771 |
+
# clear_directory(word_input_path)
|
| 772 |
+
# clear_directory(word_output_path)
|
| 773 |
+
|
| 774 |
+
# # Initialize response structure
|
| 775 |
+
# response = {
|
| 776 |
+
# "success": False,
|
| 777 |
+
# "message": "",
|
| 778 |
+
# }
|
| 779 |
+
|
| 780 |
+
# # Check if the provided path is a PDF file
|
| 781 |
+
# if not pdf_path.lower().endswith(".pdf"):
|
| 782 |
+
# logger.error(f"Invalid file type for path: {pdf_path}. Only PDF files are accepted.")
|
| 783 |
+
# response["message"] = "Invalid file type. Only PDF files are accepted."
|
| 784 |
+
# return response
|
| 785 |
+
|
| 786 |
+
# # Extract filename
|
| 787 |
+
# file_name_with_ext = os.path.basename(pdf_path)
|
| 788 |
+
# file_name_without_ext = os.path.splitext(file_name_with_ext)[0]
|
| 789 |
+
# logger.debug(f"Extracted filename: {file_name_with_ext} (without extension: {file_name_without_ext})")
|
| 790 |
+
|
| 791 |
+
# # Create destination path in input directory
|
| 792 |
+
# destination_path = os.path.join(pdf_input_directory, file_name_with_ext)
|
| 793 |
+
# logger.debug(f"Destination path for file copy: {destination_path}")
|
| 794 |
+
|
| 795 |
+
# # Copy file to input directory
|
| 796 |
+
# try:
|
| 797 |
+
# logger.info(f"Copying file from {pdf_path} to {destination_path}")
|
| 798 |
+
# shutil.copy2(pdf_path, destination_path)
|
| 799 |
+
# except Exception as e:
|
| 800 |
+
# logger.error(f"Failed to copy file from {pdf_path} to {destination_path}: {str(e)}")
|
| 801 |
+
# response["message"] = f"Failed to copy file: {str(e)}"
|
| 802 |
+
# return response
|
| 803 |
+
|
| 804 |
+
# # Create output directories
|
| 805 |
+
# output_directory_path = os.path.join(output_directory)
|
| 806 |
+
# os.makedirs(output_directory_path, exist_ok=True)
|
| 807 |
+
# file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
|
| 808 |
+
# os.makedirs(file_output, exist_ok=True)
|
| 809 |
+
# table_output_path = os.path.join(file_output, f"table_output")
|
| 810 |
+
# os.makedirs(table_output_path, exist_ok=True)
|
| 811 |
+
# file_location = destination_path
|
| 812 |
+
# logger.info(f"Created output directories: {file_output}, {table_output_path}")
|
| 813 |
+
|
| 814 |
+
# # Pipeline processing
|
| 815 |
+
# logger.info(f"Starting yolov10_layout_pipeline for {file_name_without_ext}")
|
| 816 |
+
# try:
|
| 817 |
+
# json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
|
| 818 |
+
# logger.info(f"yolov10_layout_pipeline completed. Output JSON dir: {json_output_dir}")
|
| 819 |
+
# except Exception as e:
|
| 820 |
+
# logger.error(f"yolov10_layout_pipeline failed: {str(e)}")
|
| 821 |
+
# response["message"] = f"yolov10_layout_pipeline failed: {str(e)}"
|
| 822 |
+
# return response
|
| 823 |
+
|
| 824 |
+
# logger.info(f"Starting filtering_table_pipeline for {file_name_without_ext}")
|
| 825 |
+
# try:
|
| 826 |
+
# table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)
|
| 827 |
+
# logger.info(f"filtering_table_pipeline completed. Table JSON path: {table_json_path}")
|
| 828 |
+
# except Exception as e:
|
| 829 |
+
# logger.error(f"filtering_table_pipeline failed: {str(e)}")
|
| 830 |
+
# response["message"] = f"filtering_table_pipeline failed: {str(e)}"
|
| 831 |
+
# return response
|
| 832 |
+
|
| 833 |
+
# logger.info(f"Starting process_pdf_for_headers for {file_name_without_ext}")
|
| 834 |
+
# try:
|
| 835 |
+
# custom_headers_json, custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext, file_location, file_output)
|
| 836 |
+
# logger.info(f"process_pdf_for_headers completed. Custom headers JSON path: {custom_headers_json_file_path}")
|
| 837 |
+
# except Exception as e:
|
| 838 |
+
# logger.error(f"process_pdf_for_headers failed: {str(e)}")
|
| 839 |
+
# response["message"] = f"process_pdf_for_headers failed: {str(e)}"
|
| 840 |
+
# return response
|
| 841 |
+
|
| 842 |
+
# logger.info(f"Starting pipeline_for_merging_headers for {file_name_without_ext}")
|
| 843 |
+
# try:
|
| 844 |
+
# header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path, header_json_output_filepath, file_output, file_name_without_ext)
|
| 845 |
+
# logger.info(f"pipeline_for_merging_headers completed. Merged headers JSON path: {header_json_output_filepath}")
|
| 846 |
+
# except Exception as e:
|
| 847 |
+
# logger.error(f"pipeline_for_merging_headers failed: {str(e)}")
|
| 848 |
+
# response["message"] = f"pipeline_for_merging_headers failed: {str(e)}"
|
| 849 |
+
# return response
|
| 850 |
+
|
| 851 |
+
# # Initialize document_data
|
| 852 |
+
# logger.debug(f"Initializing document_data for {file_name_with_ext}")
|
| 853 |
+
# document_data[file_name_with_ext] = {
|
| 854 |
+
# "pdf_path": destination_path,
|
| 855 |
+
# "pdf_file_name": file_name_with_ext,
|
| 856 |
+
# "model_json_header_output_filepath": [],
|
| 857 |
+
# "model_json_layout_output_filepath": [],
|
| 858 |
+
# "tree_structured_header_json_filepath": [],
|
| 859 |
+
# "user_modified_json_output_filepath": [],
|
| 860 |
+
# "user_modified_table_json_filepath": [],
|
| 861 |
+
# "frontend_output_json": [],
|
| 862 |
+
# "cluster_json": [],
|
| 863 |
+
# "id_2_label": [],
|
| 864 |
+
# "file_output_dir": [],
|
| 865 |
+
# "table_output_dir": [],
|
| 866 |
+
# "table_with_header_data": [],
|
| 867 |
+
# "table_with_header_json_path": [],
|
| 868 |
+
# "json_output_dir": [],
|
| 869 |
+
# "pdf_miner_json_path": [],
|
| 870 |
+
# "searchable_pdf_path": []
|
| 871 |
+
# }
|
| 872 |
+
|
| 873 |
+
# # Store paths and filenames
|
| 874 |
+
# document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
|
| 875 |
+
# document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
|
| 876 |
+
# document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
|
| 877 |
+
# document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
|
| 878 |
+
# document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
|
| 879 |
+
# document_data[file_name_with_ext]["file_output_dir"].append(file_output)
|
| 880 |
+
# document_data[file_name_with_ext]["id_2_label"].append(class_names)
|
| 881 |
+
# document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
|
| 882 |
+
# document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
|
| 883 |
+
# document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
|
| 884 |
+
# document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)
|
| 885 |
+
# logger.debug(f"Stored paths and filenames in document_data for {file_name_with_ext}")
|
| 886 |
+
|
| 887 |
+
# file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
|
| 888 |
+
# pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
|
| 889 |
+
# pdf_path = document_data[file_name_with_ext]["pdf_path"]
|
| 890 |
+
# user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]
|
| 891 |
+
|
| 892 |
+
# logger.info(f"Starting create_json_pdfminer_pipeline for {pdf_file_name}")
|
| 893 |
+
# try:
|
| 894 |
+
# pdf_miner_json_filepath, pdf_miner_metadata, searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)
|
| 895 |
+
# logger.info(f"create_json_pdfminer_pipeline completed. PDFMiner JSON path: {pdf_miner_json_filepath}, Searchable PDF path: {searchable_pdf_path}")
|
| 896 |
+
# except Exception as e:
|
| 897 |
+
# logger.error(f"create_json_pdfminer_pipeline failed: {str(e)}")
|
| 898 |
+
# response["message"] = f"create_json_pdfminer_pipeline failed: {str(e)}"
|
| 899 |
+
# return response
|
| 900 |
+
|
| 901 |
+
# logger.info(f"Starting main_pipeline_create_put_table_headers for {file_name_with_ext}")
|
| 902 |
+
# try:
|
| 903 |
+
# table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
|
| 904 |
+
# logger.info(f"main_pipeline_create_put_table_headers completed")
|
| 905 |
+
# except Exception as e:
|
| 906 |
+
# logger.error(f"main_pipeline_create_put_table_headers failed: {str(e)}")
|
| 907 |
+
# response["message"] = f"main_pipeline_create_put_table_headers failed: {str(e)}"
|
| 908 |
+
# return response
|
| 909 |
+
|
| 910 |
+
# document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)
|
| 911 |
+
# document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
|
| 912 |
+
|
| 913 |
+
# # Process image URLs
|
| 914 |
+
# pdf_images_urls = []
|
| 915 |
+
# for file_name in os.listdir(pdf_images_path):
|
| 916 |
+
# file_path = os.path.join(pdf_images_path, file_name)
|
| 917 |
+
# if file_name.endswith((".jpg", ".jpeg", ".png")):
|
| 918 |
+
# img_url = base_url + "image/" + str(quote(file_path))
|
| 919 |
+
# pdf_images_urls.append(img_url)
|
| 920 |
+
# logger.debug(f"Collected {len(pdf_images_urls)} image URLs from {pdf_images_path}")
|
| 921 |
+
|
| 922 |
+
# # Sort image URLs by page number
|
| 923 |
+
# def extract_page_no(url):
|
| 924 |
+
# return int(url.split("_")[-1].split(".")[0])
|
| 925 |
+
# sorted_urls = sorted(pdf_images_urls, key=extract_page_no)
|
| 926 |
+
# page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
|
| 927 |
+
# logger.debug(f"Sorted {len(sorted_urls)} image URLs and created page details")
|
| 928 |
+
|
| 929 |
+
# # Store the JSON output
|
| 930 |
+
# document_data[file_name_with_ext]["frontend_output_json"].append({
|
| 931 |
+
# "layout_output_json_data": layout_output_json_data,
|
| 932 |
+
# "layout_json_list_data": layout_list_data,
|
| 933 |
+
# "id_2_label": class_names,
|
| 934 |
+
# "header_output_json_data": header_output_json_data,
|
| 935 |
+
# "table_output_json_data": table_json_data,
|
| 936 |
+
# "table_output_json_data_list": table_json_data_list,
|
| 937 |
+
# "tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
|
| 938 |
+
# "pdf_images_urls": page_details,
|
| 939 |
+
# })
|
| 940 |
+
# logger.debug(f"Stored frontend_output_json for {file_name_with_ext}")
|
| 941 |
+
|
| 942 |
+
# document_id_name = file_name_with_ext
|
| 943 |
+
# data = document_data[document_id_name]
|
| 944 |
+
# file_output_dir = data["file_output_dir"][0]
|
| 945 |
+
# json_output_dir = data["json_output_dir"][0]
|
| 946 |
+
# pdf_file_name = data["pdf_file_name"]
|
| 947 |
+
# pdf_path = data["pdf_path"]
|
| 948 |
+
|
| 949 |
+
# # PDFMiner processing
|
| 950 |
+
# pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
|
| 951 |
+
# modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
|
| 952 |
+
# logger.info(f"Reading JSON files: {modified_json_output_filepath}, {pdf_miner_json_filepath}")
|
| 953 |
+
# try:
|
| 954 |
+
# model_modified_json = read_json(modified_json_output_filepath)
|
| 955 |
+
# pdfminer_json = read_json(pdf_miner_json_filepath)
|
| 956 |
+
# logger.info(f"Successfully read JSON files")
|
| 957 |
+
# except Exception as e:
|
| 958 |
+
# logger.error(f"Failed to read JSON files: {str(e)}")
|
| 959 |
+
# response["message"] = f"Failed to read JSON files: {str(e)}"
|
| 960 |
+
# return response
|
| 961 |
+
|
| 962 |
+
# searchable_pdf_path = data["searchable_pdf_path"][0]
|
| 963 |
+
|
| 964 |
+
# logger.info(f"Starting merge_multi_page_tables_pipeline_v2 for {pdf_file_name}")
|
| 965 |
+
# try:
|
| 966 |
+
# table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
|
| 967 |
+
# logger.info(f"merge_multi_page_tables_pipeline_v2 completed. Merged table JSON path: {table_merged_json_path}")
|
| 968 |
+
# except Exception as e:
|
| 969 |
+
# logger.error(f"merge_multi_page_tables_pipeline_v2 failed: {str(e)}")
|
| 970 |
+
# response["message"] = f"merge_multi_page_tables_pipeline_v2 failed: {str(e)}"
|
| 971 |
+
# return response
|
| 972 |
+
|
| 973 |
+
# logger.info(f"Reading merged table JSON: {table_merged_json_path}")
|
| 974 |
+
# try:
|
| 975 |
+
# table_merged_json = read_json(table_merged_json_path)
|
| 976 |
+
# logger.info(f"Successfully read merged table JSON")
|
| 977 |
+
# except Exception as e:
|
| 978 |
+
# logger.error(f"Failed to read merged table JSON: {str(e)}")
|
| 979 |
+
# response["message"] = f"Failed to read merged table JSON: {str(e)}"
|
| 980 |
+
# return response
|
| 981 |
+
|
| 982 |
+
# # logger.info(f"Starting map_table_with_its_header for {file_name_with_ext}")
|
| 983 |
+
# try:
|
| 984 |
+
# table_mapped_modified_json = map_table_with_its_header(table_merged_json)
|
| 985 |
+
# logger.info(f"map_table_with_its_header completed")
|
| 986 |
+
# except Exception as e:
|
| 987 |
+
# logger.error(f"map_table_with_its_header failed: {str(e)}")
|
| 988 |
+
# response["message"] = f"map_table_with_its_header failed: {str(e)}"
|
| 989 |
+
# return response
|
| 990 |
+
|
| 991 |
+
# logger.info(f"Starting main_header_pipeline for {file_name_with_ext}")
|
| 992 |
+
# try:
|
| 993 |
+
# df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
|
| 994 |
+
# logger.info(f"main_header_pipeline completed")
|
| 995 |
+
# except Exception as e:
|
| 996 |
+
# logger.error(f"main_header_pipeline failed: {str(e)}")
|
| 997 |
+
# response["message"] = f"main_header_pipeline failed: {str(e)}"
|
| 998 |
+
# return response
|
| 999 |
+
|
| 1000 |
+
# logger.info(f"Starting clean_dataframe for final DataFrame")
|
| 1001 |
+
# try:
|
| 1002 |
+
# clean_df, clean_df_json = clean_dataframe(df_final)
|
| 1003 |
+
# logger.info(f"clean_dataframe completed. Clean JSON created")
|
| 1004 |
+
# except Exception as e:
|
| 1005 |
+
# logger.error(f"clean_dataframe failed: {str(e)}")
|
| 1006 |
+
# response["message"] = f"clean_dataframe failed: {str(e)}"
|
| 1007 |
+
# return response
|
| 1008 |
+
|
| 1009 |
+
# if isinstance(clean_df_json, str):
|
| 1010 |
+
# clean_df_json = eval(clean_df_json)
|
| 1011 |
+
# logger.debug(f"Converted clean_df_json string to dictionary")
|
| 1012 |
+
|
| 1013 |
+
# file_name = get_file_name_without_extension(pdf_file_name)
|
| 1014 |
+
# logger.info(f"Starting process_document_company_wise for {file_name}")
|
| 1015 |
+
# try:
|
| 1016 |
+
# merged_content_company_wise_df = process_document_company_wise(clean_df_json, output_directory=json_output_dir, file_name=file_name)
|
| 1017 |
+
# company_wise_clean_df, company_wise_clean_df_json = clean_dataframe(merged_content_company_wise_df)
|
| 1018 |
+
# logger.info(f"process_document_company_wise and clean_dataframe completed")
|
| 1019 |
+
# except Exception as e:
|
| 1020 |
+
# logger.error(f"process_document_company_wise failed: {str(e)}")
|
| 1021 |
+
# response["message"] = f"process_document_company_wise failed: {str(e)}"
|
| 1022 |
+
# return response
|
| 1023 |
+
|
| 1024 |
+
# json_output_filename = file_name + "_final_h2h_extraction.json"
|
| 1025 |
+
# final_json_output_filepath = os.path.join(json_output_dir, json_output_filename)
|
| 1026 |
+
# logger.info(f"Saving final JSON to {final_json_output_filepath}")
|
| 1027 |
+
# try:
|
| 1028 |
+
# with open(final_json_output_filepath, 'w') as f:
|
| 1029 |
+
# json.dump(clean_df_json, f, indent=4)
|
| 1030 |
+
# logger.info(f"Final JSON saved successfully")
|
| 1031 |
+
# except Exception as e:
|
| 1032 |
+
# logger.error(f"Failed to save final JSON: {str(e)}")
|
| 1033 |
+
# response["message"] = f"Failed to save final JSON: {str(e)}"
|
| 1034 |
+
# return response
|
| 1035 |
+
|
| 1036 |
+
# company_wise_json_output_filename = file_name + "_final_h2h_extraction_company_wise.json"
|
| 1037 |
+
# company_wise_final_json_output_filepath = os.path.join(json_output_dir, company_wise_json_output_filename)
|
| 1038 |
+
# logger.info(f"Saving company-wise JSON to {company_wise_final_json_output_filepath}")
|
| 1039 |
+
# try:
|
| 1040 |
+
# with open(company_wise_final_json_output_filepath, 'w') as f:
|
| 1041 |
+
# json.dump(merged_content_company_wise_df, f, indent=4)
|
| 1042 |
+
# logger.info(f"Company-wise JSON saved successfully")
|
| 1043 |
+
# except Exception as e:
|
| 1044 |
+
# logger.error(f"Failed to save company-wise JSON: {str(e)}")
|
| 1045 |
+
# response["message"] = f"Failed to save company-wise JSON: {str(e)}"
|
| 1046 |
+
# return response
|
| 1047 |
+
|
| 1048 |
+
# logger.info(f"Starting tree_structured_headers_content_pipeline for {pdf_file_name}")
|
| 1049 |
+
# try:
|
| 1050 |
+
# final_tree_structred_header_content = tree_structured_headers_content_pipeline(header_content_json_data, json_output_dir, pdf_file_name)
|
| 1051 |
+
# final_tree_structred_header_content = merge_blocks(final_tree_structred_header_content)
|
| 1052 |
+
# document_data[document_id_name]["cluster_json"].append(final_tree_structred_header_content)
|
| 1053 |
+
# logger.info(f"tree_structured_headers_content_pipeline and merge_blocks completed")
|
| 1054 |
+
# except Exception as e:
|
| 1055 |
+
# logger.error(f"tree_structured_headers_content_pipeline failed: {str(e)}")
|
| 1056 |
+
# response["message"] = f"tree_structured_headers_content_pipeline failed: {str(e)}"
|
| 1057 |
+
# return response
|
| 1058 |
+
|
| 1059 |
+
# logger.info(f"Starting customised_toc_extraction_pipeline for {searchable_pdf_path}")
|
| 1060 |
+
# try:
|
| 1061 |
+
# final_toc_h2h_extraction = customised_toc_extraction_pipeline(pdf_path=searchable_pdf_path, yolo_detection_json_path=modified_json_output_filepath, output_directory=file_output_dir)
|
| 1062 |
+
# logger.info(f"customised_toc_extraction_pipeline completed")
|
| 1063 |
+
# except Exception as e:
|
| 1064 |
+
# logger.error(f"customised_toc_extraction_pipeline failed: {str(e)}")
|
| 1065 |
+
# response["message"] = f"customised_toc_extraction_pipeline failed: {str(e)}"
|
| 1066 |
+
# return response
|
| 1067 |
+
|
| 1068 |
+
# # Save final JSON output
|
| 1069 |
+
# json_directory = os.path.dirname(pdf_path)
|
| 1070 |
+
# json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
|
| 1071 |
+
# output_json_path = os.path.join(json_directory, f"{json_filename}.json")
|
| 1072 |
+
# logger.info(f"Saving output JSON to {output_json_path}")
|
| 1073 |
+
|
| 1074 |
+
# try:
|
| 1075 |
+
# if isinstance(company_wise_clean_df_json, str):
|
| 1076 |
+
# company_wise_clean_df_json = json.loads(company_wise_clean_df_json)
|
| 1077 |
+
# logger.debug(f"Converted company_wise_clean_df_json string to dictionary")
|
| 1078 |
+
|
| 1079 |
+
# with open(output_json_path, 'w', encoding='utf-8') as json_file:
|
| 1080 |
+
# json.dump(company_wise_clean_df_json, json_file, ensure_ascii=False, indent=4)
|
| 1081 |
+
# logger.info(f"Output JSON saved successfully")
|
| 1082 |
+
# except Exception as e:
|
| 1083 |
+
# logger.error(f"Failed to save output JSON: {str(e)}")
|
| 1084 |
+
# response["message"] = f"Failed to save output JSON: {str(e)}"
|
| 1085 |
+
# return response
|
| 1086 |
+
|
| 1087 |
+
# response_final = {
|
| 1088 |
+
# "status_code": 200,
|
| 1089 |
+
# "saved_json_path": output_json_path
|
| 1090 |
+
# }
|
| 1091 |
+
# logger.info(f"upload_documents completed successfully for {file_name_with_ext}. Response: {response_final}")
|
| 1092 |
+
# return response_final
|
| 1093 |
+
|
| 1094 |
+
|
| 1095 |
+
def table_extraction_and_mapping(path,
|
| 1096 |
+
field_name,
|
| 1097 |
+
class_keywords_table,
|
| 1098 |
+
header_categories,
|
| 1099 |
+
class_keywords_table_column,
|
| 1100 |
+
filter_table_classifier_name,
|
| 1101 |
+
threshold) :
|
| 1102 |
+
# path = eval(f'{path}')
|
| 1103 |
+
request = Request
|
| 1104 |
+
print(f'started for path: {path}')
|
| 1105 |
+
base_url = str(request.base_url)
|
| 1106 |
+
global document_data
|
| 1107 |
+
document_data = {}
|
| 1108 |
+
pdf_path = path
|
| 1109 |
+
|
| 1110 |
+
clear_directory(pdf_input_path)
|
| 1111 |
+
clear_directory(pdf_output_path)
|
| 1112 |
+
clear_directory(word_input_path)
|
| 1113 |
+
clear_directory(word_output_path)
|
| 1114 |
+
|
| 1115 |
+
# Initialize response structure
|
| 1116 |
+
response = {
|
| 1117 |
+
"success": False,
|
| 1118 |
+
"message": "",
|
| 1119 |
+
# "data": None
|
| 1120 |
+
}
|
| 1121 |
+
|
| 1122 |
+
|
| 1123 |
+
if not pdf_path:
|
| 1124 |
+
response["message"] = "No file path provided."
|
| 1125 |
+
return response
|
| 1126 |
+
|
| 1127 |
+
# Check if the provided path is a PDF file
|
| 1128 |
+
if not pdf_path.lower().endswith(".pdf"):
|
| 1129 |
+
response["message"] = "Invalid file type. Only PDF files are accepted."
|
| 1130 |
+
return response
|
| 1131 |
+
|
| 1132 |
+
|
| 1133 |
+
# Extract filename
|
| 1134 |
+
file_name_with_ext = os.path.basename(pdf_path)
|
| 1135 |
+
file_name_without_ext = os.path.splitext(file_name_with_ext)[0]
|
| 1136 |
+
|
| 1137 |
+
# Create destination path in input directory
|
| 1138 |
+
destination_path = os.path.join(pdf_input_directory, file_name_with_ext)
|
| 1139 |
+
|
| 1140 |
+
try:
|
| 1141 |
+
# Copy the file to our input directory
|
| 1142 |
+
shutil.copy2(pdf_path, destination_path)
|
| 1143 |
+
except Exception as e:
|
| 1144 |
+
response["message"] = f"Failed to copy file: {str(e)}"
|
| 1145 |
+
return response
|
| 1146 |
+
|
| 1147 |
+
output_directory_path = os.path.join(output_directory)
|
| 1148 |
+
os.makedirs(output_directory_path, exist_ok=True)
|
| 1149 |
+
file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
|
| 1150 |
+
os.makedirs(file_output, exist_ok=True)
|
| 1151 |
+
|
| 1152 |
+
table_output_path = os.path.join(file_output, f"table_output")
|
| 1153 |
+
os.makedirs(table_output_path, exist_ok=True)
|
| 1154 |
+
file_location = destination_path
|
| 1155 |
+
|
| 1156 |
+
|
| 1157 |
+
# Pipeline processing
|
| 1158 |
+
|
| 1159 |
+
json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
|
| 1160 |
+
table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)
|
| 1161 |
+
|
| 1162 |
+
custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output)
|
| 1163 |
+
header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext)
|
| 1164 |
+
|
| 1165 |
+
# Initialize data for the new document
|
| 1166 |
+
document_data[file_name_with_ext] = {
|
| 1167 |
+
|
| 1168 |
+
"pdf_path": destination_path,
|
| 1169 |
+
"pdf_file_name": file_name_with_ext,
|
| 1170 |
+
"model_json_header_output_filepath": [],
|
| 1171 |
+
"model_json_layout_output_filepath": [],
|
| 1172 |
+
"tree_structured_header_json_filepath": [],
|
| 1173 |
+
"user_modified_json_output_filepath": [],
|
| 1174 |
+
'user_modified_table_json_filepath': [],
|
| 1175 |
+
"frontend_output_json": [],
|
| 1176 |
+
"cluster_json": [],
|
| 1177 |
+
"id_2_label" : [],
|
| 1178 |
+
"file_output_dir" : [],
|
| 1179 |
+
"table_output_dir": [],
|
| 1180 |
+
"table_with_header_data" : [],
|
| 1181 |
+
"table_with_header_json_path" : [],
|
| 1182 |
+
"json_output_dir": [],
|
| 1183 |
+
"pdf_miner_json_path": [] ,
|
| 1184 |
+
"searchable_pdf_path" : []
|
| 1185 |
+
}
|
| 1186 |
+
|
| 1187 |
+
# Store paths and filenames
|
| 1188 |
+
document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
|
| 1189 |
+
document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
|
| 1190 |
+
document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
|
| 1191 |
+
document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
|
| 1192 |
+
document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
|
| 1193 |
+
document_data[file_name_with_ext]["file_output_dir"].append(file_output)
|
| 1194 |
+
document_data[file_name_with_ext]["id_2_label"].append(class_names)
|
| 1195 |
+
document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
|
| 1196 |
+
document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
|
| 1197 |
+
document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
|
| 1198 |
+
document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)
|
| 1199 |
+
|
| 1200 |
+
file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
|
| 1201 |
+
pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
|
| 1202 |
+
pdf_path = document_data[file_name_with_ext]["pdf_path"]
|
| 1203 |
+
user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]
|
| 1204 |
+
|
| 1205 |
+
pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)
|
| 1206 |
+
|
| 1207 |
+
table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
|
| 1208 |
+
document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)
|
| 1209 |
+
|
| 1210 |
+
document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
|
| 1211 |
+
|
| 1212 |
+
# Process image URLs
|
| 1213 |
+
pdf_images_urls = []
|
| 1214 |
+
for file_name in os.listdir(pdf_images_path):
|
| 1215 |
+
file_path = os.path.join(pdf_images_path, file_name)
|
| 1216 |
+
if file_name.endswith((".jpg", ".jpeg", ".png")):
|
| 1217 |
+
img_url = base_url + "image/" + str(quote(file_path))
|
| 1218 |
+
pdf_images_urls.append(img_url)
|
| 1219 |
+
|
| 1220 |
+
# Sort image URLs by page number
|
| 1221 |
+
def extract_page_no(url):
|
| 1222 |
+
return int(url.split("_")[-1].split(".")[0])
|
| 1223 |
+
sorted_urls = sorted(pdf_images_urls, key=extract_page_no)
|
| 1224 |
+
|
| 1225 |
+
# Create page details
|
| 1226 |
+
page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
|
| 1227 |
+
|
| 1228 |
+
# Store the JSON output
|
| 1229 |
+
document_data[file_name_with_ext]["frontend_output_json"].append({
|
| 1230 |
+
"layout_output_json_data": layout_output_json_data,
|
| 1231 |
+
"layout_json_list_data": layout_list_data,
|
| 1232 |
+
"id_2_label": class_names,
|
| 1233 |
+
"header_output_json_data": header_output_json_data,
|
| 1234 |
+
"table_output_json_data": table_json_data,
|
| 1235 |
+
"table_output_json_data_list": table_json_data_list,
|
| 1236 |
+
"tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
|
| 1237 |
+
"pdf_images_urls": page_details,
|
| 1238 |
+
})
|
| 1239 |
+
|
| 1240 |
+
|
| 1241 |
+
document_id_name = file_name_with_ext
|
| 1242 |
+
|
| 1243 |
+
data = document_data[document_id_name]
|
| 1244 |
+
file_output_dir = data["file_output_dir"][0]
|
| 1245 |
+
json_output_dir = data["json_output_dir"][0]
|
| 1246 |
+
pdf_file_name = data["pdf_file_name"]
|
| 1247 |
+
pdf_path = data["pdf_path"]
|
| 1248 |
+
|
| 1249 |
+
# PDFMiner processing
|
| 1250 |
+
pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
|
| 1251 |
+
modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
|
| 1252 |
+
model_modified_json = read_json(modified_json_output_filepath)
|
| 1253 |
+
pdfminer_json = read_json(pdf_miner_json_filepath)
|
| 1254 |
+
searchable_pdf_path = data["searchable_pdf_path"][0]
|
| 1255 |
+
|
| 1256 |
+
# table_merged_json_path = merge_multi_page_tables_pipeline(modified_json_output_filepath, pdf_file_name, file_output_dir)
|
| 1257 |
+
|
| 1258 |
+
table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
|
| 1259 |
+
|
| 1260 |
+
table_merged_json = read_json(table_merged_json_path)
|
| 1261 |
+
|
| 1262 |
+
table_mapped_modified_json = map_table_with_its_header(table_merged_json)
|
| 1263 |
+
|
| 1264 |
+
|
| 1265 |
+
# table_mapped_modified_json = map_table_with_its_header(model_modified_json)
|
| 1266 |
+
|
| 1267 |
+
# Main header pipeline
|
| 1268 |
+
df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
|
| 1269 |
+
|
| 1270 |
+
|
| 1271 |
+
clean_df, clean_df_json = clean_dataframe(df_final)
|
| 1272 |
+
|
| 1273 |
+
|
| 1274 |
+
# if isinstance(clean_df_json, str):
|
| 1275 |
+
# clean_df_json = eval(clean_df_json)
|
| 1276 |
+
|
| 1277 |
+
file_name = get_file_name_without_extension(pdf_file_name)
|
| 1278 |
+
|
| 1279 |
+
|
| 1280 |
+
# Step 1: Extract directory and filename without extension
|
| 1281 |
+
pdf_path = path
|
| 1282 |
+
json_directory = os.path.dirname(pdf_path)
|
| 1283 |
+
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
|
| 1284 |
+
|
| 1285 |
+
# Step 2: Define output path for JSON
|
| 1286 |
+
output_json_path = os.path.join(json_directory, f"{json_filename}_structured_chunking.json")
|
| 1287 |
+
|
| 1288 |
+
# If your variable is a JSON string, convert it to dict first
|
| 1289 |
+
if isinstance(clean_df_json, str):
|
| 1290 |
+
clean_df_json = json.loads(clean_df_json)
|
| 1291 |
+
|
| 1292 |
+
|
| 1293 |
+
# Step 3: Save JSON
|
| 1294 |
+
with open(output_json_path, 'w', encoding='utf-8') as json_file:
|
| 1295 |
+
json.dump(clean_df_json, json_file, ensure_ascii=False, indent=4)
|
| 1296 |
+
|
| 1297 |
+
|
| 1298 |
+
##########################################################
|
| 1299 |
+
# Table Classification Code
|
| 1300 |
+
|
| 1301 |
+
##########################################################
|
| 1302 |
+
|
| 1303 |
+
|
| 1304 |
+
print("starting table classification pipeline")
|
| 1305 |
+
|
| 1306 |
+
structured_chunk_json_path = output_json_path
|
| 1307 |
+
|
| 1308 |
+
with open(structured_chunk_json_path, 'r', encoding='utf-8') as file:
|
| 1309 |
+
content = file.read()
|
| 1310 |
+
|
| 1311 |
+
# This regex removes commas before closing braces/brackets, ignoring whitespace
|
| 1312 |
+
cleaned_content = re.sub(r',\s*([\]}])', r'\1', content)
|
| 1313 |
+
|
| 1314 |
+
# Parse the cleaned JSON
|
| 1315 |
+
structured_chunk_data = json.loads(cleaned_content)
|
| 1316 |
+
|
| 1317 |
+
threshold = float(threshold)
|
| 1318 |
+
print("type of class_keywords_table::\n",type(class_keywords_table))
|
| 1319 |
+
|
| 1320 |
+
# If class_keywords is a string, try to parse it
|
| 1321 |
+
if isinstance(class_keywords_table, str):
|
| 1322 |
+
try:
|
| 1323 |
+
class_keywords_table = json.loads(class_keywords_table)
|
| 1324 |
+
|
| 1325 |
+
# if not isinstance(class_keywords_table, dict):
|
| 1326 |
+
# raise ValueError("class_keywords_table must be a dictionary")
|
| 1327 |
+
# if not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
|
| 1328 |
+
# for key, value in class_keywords_table.items()):
|
| 1329 |
+
# raise ValueError("class_keywords_table must be a dictionary with string keys and lists of strings as values")
|
| 1330 |
+
except json.JSONDecodeError:
|
| 1331 |
+
raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table"})
|
| 1332 |
+
|
| 1333 |
+
elif isinstance(class_keywords_table, dict) :
|
| 1334 |
+
class_keywords_table = class_keywords_table
|
| 1335 |
+
|
| 1336 |
+
|
| 1337 |
+
else:
|
| 1338 |
+
raise HTTPException(status_code=422, detail={"error": "class_keywords_table must be a dictionary with string keys and lists of strings as values"})
|
| 1339 |
+
|
| 1340 |
+
|
| 1341 |
+
# Perform classification
|
| 1342 |
+
categorized_headers_json = perform_classification(
|
| 1343 |
+
data=structured_chunk_data,
|
| 1344 |
+
class_keywords=class_keywords_table,
|
| 1345 |
+
header_categories=header_categories,
|
| 1346 |
+
similarity_threshold=threshold
|
| 1347 |
+
)
|
| 1348 |
+
|
| 1349 |
+
# Step 1: Extract directory and filename without extension
|
| 1350 |
+
pdf_path = path
|
| 1351 |
+
json_directory = os.path.dirname(pdf_path)
|
| 1352 |
+
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
|
| 1353 |
+
|
| 1354 |
+
# Step 2: Define output path for JSON
|
| 1355 |
+
classified_table_output_json_path = os.path.join(json_directory, f"{field_name}_table_classification.json")
|
| 1356 |
+
|
| 1357 |
+
# If your variable is a JSON string, convert it to dict first
|
| 1358 |
+
if isinstance(categorized_headers_json, str):
|
| 1359 |
+
categorized_headers_json = json.loads(categorized_headers_json)
|
| 1360 |
+
|
| 1361 |
+
|
| 1362 |
+
# Step 3: Save JSON
|
| 1363 |
+
with open(classified_table_output_json_path, 'w', encoding='utf-8') as json_file:
|
| 1364 |
+
json.dump(categorized_headers_json, json_file, ensure_ascii=False, indent=4)
|
| 1365 |
+
|
| 1366 |
+
|
| 1367 |
+
#######################################################
|
| 1368 |
+
# Table Column Classification Code
|
| 1369 |
+
print("Starting Table Column Classification")
|
| 1370 |
+
|
| 1371 |
+
|
| 1372 |
+
# Parse JSON strings into dictionaries
|
| 1373 |
+
# input_table_classified_json = json.load(classified_table_output_json_path)
|
| 1374 |
+
with open(classified_table_output_json_path, "r") as f:
|
| 1375 |
+
input_table_classified_json = json.load(f)
|
| 1376 |
+
|
| 1377 |
+
|
| 1378 |
+
# class_keywords_table_column = json.loads(class_keywords_table_column)
|
| 1379 |
+
|
| 1380 |
+
if isinstance(class_keywords_table_column, str):
|
| 1381 |
+
try:
|
| 1382 |
+
class_keywords_table_column = json.loads(class_keywords_table_column)
|
| 1383 |
+
|
| 1384 |
+
|
| 1385 |
+
except json.JSONDecodeError:
|
| 1386 |
+
raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table_column"})
|
| 1387 |
+
|
| 1388 |
+
elif isinstance(class_keywords_table_column, dict):
|
| 1389 |
+
class_keywords_table_column = class_keywords_table_column
|
| 1390 |
+
|
| 1391 |
+
# Convert similarity_threshold to integer
|
| 1392 |
+
similarity_threshold = float(threshold)
|
| 1393 |
+
|
| 1394 |
+
column_classification_results_json = classify_column_headers(
|
| 1395 |
+
json_data=input_table_classified_json,
|
| 1396 |
+
class_keywords=class_keywords_table_column,
|
| 1397 |
+
filter_table_classifier_name=filter_table_classifier_name,
|
| 1398 |
+
similarity_threshold=similarity_threshold
|
| 1399 |
+
)
|
| 1400 |
+
|
| 1401 |
+
# Step 1: Extract directory and filename without extension
|
| 1402 |
+
pdf_path = path
|
| 1403 |
+
json_directory = os.path.dirname(pdf_path)
|
| 1404 |
+
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
|
| 1405 |
+
|
| 1406 |
+
# Step 2: Define output path for JSON
|
| 1407 |
+
classified_table_column_output_json_path = os.path.join(json_directory, f"{field_name}_table_column_classification.json")
|
| 1408 |
+
|
| 1409 |
+
# If your variable is a JSON string, convert it to dict first
|
| 1410 |
+
if isinstance(column_classification_results_json, str):
|
| 1411 |
+
column_classification_results_json = json.loads(column_classification_results_json)
|
| 1412 |
+
|
| 1413 |
+
|
| 1414 |
+
# Step 3: Save JSON
|
| 1415 |
+
with open(classified_table_column_output_json_path, 'w', encoding='utf-8') as json_file:
|
| 1416 |
+
json.dump(column_classification_results_json, json_file, ensure_ascii=False, indent=4)
|
| 1417 |
+
|
| 1418 |
+
#######################################################################
|
| 1419 |
+
|
| 1420 |
+
response_final = {
|
| 1421 |
+
"status_code": 200,
|
| 1422 |
+
# "message":"",
|
| 1423 |
+
# "df_download_json": company_wise_clean_df_json,
|
| 1424 |
+
"structured_chunk_json_path": output_json_path,
|
| 1425 |
+
"table_classification_json_path":classified_table_output_json_path,
|
| 1426 |
+
"table_column_classification_json_path" : classified_table_column_output_json_path
|
| 1427 |
+
}
|
| 1428 |
+
|
| 1429 |
+
|
| 1430 |
+
|
| 1431 |
+
|
| 1432 |
+
return response_final
|
| 1433 |
+
|
| 1434 |
+
|
| 1435 |
+
|
| 1436 |
+
|
| 1437 |
+
|
layout_detection_docling_heron (1).py
ADDED
|
@@ -0,0 +1,497 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import os
|
| 3 |
+
import supervision as sv # pip install supervision
|
| 4 |
+
from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
|
| 5 |
+
from pdf2image import convert_from_path
|
| 6 |
+
import numpy as np
|
| 7 |
+
from PIL import Image
|
| 8 |
+
import json
|
| 9 |
+
import pytesseract
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from sentence_transformers import SentenceTransformer, util
|
| 12 |
+
from PyPDF2 import PdfReader
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
import torch
|
| 15 |
+
import logging
|
| 16 |
+
from utils.utils_code import log_time_taken
|
| 17 |
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
| 18 |
+
import multiprocessing
|
| 19 |
+
import sys
|
| 20 |
+
import gc
|
| 21 |
+
|
| 22 |
+
from src.table_processing.tree_structured_json import tree_structured_headers_pipeline
|
| 23 |
+
from config.set_config import set_configuration
|
| 24 |
+
set_config_project = set_configuration()
|
| 25 |
+
layout_model_weights_path = set_config_project.layout_model_weights_path
|
| 26 |
+
no_of_threads = set_config_project.no_of_threads
|
| 27 |
+
from src.docling.ttsr_docling import tsr_inference_image, tsr_inference
|
| 28 |
+
from src.table_processing.table_classification_extraction import process_table_classification_extraction_pipeline
|
| 29 |
+
from src.table_processing.put_table_header import put_table_header_pipeline
|
| 30 |
+
import gc
|
| 31 |
+
from src.layout_detection.load_model import load_model_for_process
|
| 32 |
+
|
| 33 |
+
# Set multiprocessing start method
|
| 34 |
+
multiprocessing.set_start_method('spawn', force=True)
|
| 35 |
+
logger = logging.getLogger(__name__)
|
| 36 |
+
|
| 37 |
+
# Configure logging
|
| 38 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 39 |
+
|
| 40 |
+
def load_torch(version):
    """Prepend a version-specific vendored torch directory to sys.path and
    import torch from it.

    Args:
        version: Torch version selector; "2.2.2" and "2.6.0" map to local
            vendor directories, any other value leaves sys.path untouched.

    Returns:
        The imported ``torch`` module.
    """
    if version == "2.2.2":
        sys.path.insert(0, "./torch_2_2_2")
    elif version == "2.6.0":
        sys.path.insert(0, "./torch_2_6_0")
    # NOTE(review): torch is already imported at module top, so this import
    # returns the cached module from sys.modules — the sys.path insert then
    # has no effect on which torch build is actually used. Confirm intent.
    import torch
    logger.info(f"Using Torch Version: {torch.__version__}")
    return torch

# Rebind the module-level torch name to the (intended) vendored build.
torch = load_torch("2.2.2")
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def get_file_name_without_extension(file_path):
    """Return the base name of *file_path* with its extension stripped."""
    base_name = os.path.basename(file_path)
    stem, _extension = os.path.splitext(base_name)
    return stem
|
| 57 |
+
|
| 58 |
+
def convert_numpy(data):
    """Recursively convert NumPy/pandas values into plain JSON-serializable
    Python objects.

    Args:
        data: Arbitrary value; dicts and lists are walked recursively.

    Returns:
        The same structure with NumPy integer/float/bool scalars coerced to
        ``int``/``float``/``bool``, ndarrays converted to (nested) lists, and
        DataFrames converted to a list of per-row dicts. Any other value is
        returned unchanged.
    """
    if isinstance(data, dict):
        return {key: convert_numpy(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [convert_numpy(item) for item in data]
    elif isinstance(data, np.integer):
        return int(data)
    elif isinstance(data, np.floating):
        return float(data)
    elif isinstance(data, np.bool_):
        # np.bool_ is not an np.integer/np.floating subclass, so without this
        # branch NumPy booleans leak through and make json.dump() raise.
        return bool(data)
    elif isinstance(data, np.ndarray):
        return data.tolist()
    elif isinstance(data, pd.DataFrame):
        return data.to_dict(orient='records')
    else:
        return data
|
| 73 |
+
|
| 74 |
+
def filter_layout_blocks(input_data):
    """Flatten the per-page block lists of *input_data* into one list.

    Args:
        input_data: Mapping whose values are lists of layout blocks.

    Returns:
        A single list containing every block, in page (insertion) order.
    """
    flattened = []
    for page_blocks in input_data.values():
        flattened.extend(page_blocks)
    return flattened
|
| 79 |
+
|
| 80 |
+
def convert_pdf_to_images(file_path, batch_size=20, dpi=100):
    """Render a PDF to PIL images and expose them as batches of pages.

    Args:
        file_path: Path to the PDF file.
        batch_size: Number of pages per yielded batch.
        dpi: Render resolution passed to pdf2image.

    Returns:
        A generator yielding lists of PIL images, ``batch_size`` pages per
        batch (the final batch may be shorter).
    """
    # NOTE(review): the entire document is rendered up-front here, so the
    # batching below only shapes iteration — it does not bound peak memory.
    images = convert_from_path(file_path, dpi=dpi)
    total_pages = len(images)

    def page_generator():
        # start_page/end_page are 1-based, inclusive page numbers.
        for start_page in range(1, total_pages + 1, batch_size):
            end_page = min(start_page + batch_size - 1, total_pages)
            yield images[start_page-1:end_page]

    return page_generator()
|
| 90 |
+
|
| 91 |
+
def read_json(json_file):
    """Parse *json_file* and return its contents as Python objects."""
    with open(json_file) as handle:
        return json.load(handle)
|
| 94 |
+
|
| 95 |
+
def filter_and_sort_headers(data, modified_json_output_filepath):
    """Reorder header blocks page-by-page into reading order and persist the
    result as JSON.

    Blocks on each page are first sorted left-to-right by bbox x-min, then
    partitioned into column groups: a new group starts whenever a block
    begins strictly to the right of the previous block's x-max. Each group is
    emitted top-to-bottom (sorted by bbox y-min).

    Args:
        data: Mapping of page key -> list of blocks, where each block has a
            ``bbox`` of [xmin, ymin, xmax, ymax].
        modified_json_output_filepath: Path the reordered mapping is written
            to (overwritten, indented JSON).

    Returns:
        Tuple of (reordered mapping, output filepath).
    """
    def sort_blocks_by_min_x(blocks):
        return sorted(blocks, key=lambda block: block['bbox'][0])

    def sort_blocks_by_min_y(blocks):
        return sorted(blocks, key=lambda block: block['bbox'][1])

    def find_headers_and_group(sorted_blocks):
        # Walk blocks in x order; flush the current column group (sorted by
        # y) whenever a block starts past the previous block's right edge.
        headers_list = []
        current_group = []
        previous_block = None

        for i, block in enumerate(sorted_blocks):
            if previous_block:
                prev_xmax = previous_block['bbox'][2]
                # int() truncation gives a slightly laxer second bound;
                # NOTE(review): for non-negative coords the second comparison
                # is implied by the first — confirm whether negative coords
                # ever occur before simplifying.
                prev_xmax_threshold = int(previous_block['bbox'][2])
                if block['bbox'][0] > prev_xmax and block['bbox'][0] > prev_xmax_threshold:
                    if current_group:
                        headers_list.extend(sort_blocks_by_min_y(current_group))
                        current_group = []
            current_group.append(block)
            previous_block = block

        # Flush the trailing group.
        if current_group:
            headers_list.extend(sort_blocks_by_min_y(current_group))

        return headers_list

    result = {}
    for key, blocks in data.items():
        sorted_blocks = sort_blocks_by_min_x(blocks)
        sorted_headers = find_headers_and_group(sorted_blocks)
        result[key] = sorted_headers

    sorted_data = result
    with open(modified_json_output_filepath, 'w') as f:
        json.dump(sorted_data, f, indent=4)

    return sorted_data, modified_json_output_filepath
|
| 134 |
+
|
| 135 |
+
def filter_and_sort_layouts(data, modified_json_output_filepath):
    """Order each page's layout blocks column-wise and persist the result.

    NOTE(review): this is logically identical to filter_and_sort_headers
    (only local names differ) — consider consolidating into one helper.

    Args:
        data: {page_key: [block, ...]} where each block has a
            'bbox' [xmin, ymin, xmax, ymax].
        modified_json_output_filepath: Path the sorted mapping is written
            to as indented JSON (overwritten).

    Returns:
        (sorted_layout_data, modified_json_output_filepath)
    """
    def sort_blocks_by_min_x(blocks):
        # Left-to-right order by bbox xmin.
        return sorted(blocks, key=lambda block: block['bbox'][0])

    def sort_blocks_by_min_y(blocks):
        # Top-to-bottom order by bbox ymin.
        return sorted(blocks, key=lambda block: block['bbox'][1])

    def find_classes_and_group(sorted_blocks):
        classes_list = []
        current_group = []
        previous_block = None

        for i, block in enumerate(sorted_blocks):
            if previous_block:
                prev_xmax = previous_block['bbox'][2]
                # NOTE(review): redundant with prev_xmax for non-negative
                # coordinates (see filter_and_sort_headers).
                prev_xmax_threshold = int(previous_block['bbox'][2])
                # A block starting strictly right of the previous block's
                # right edge starts a new column; flush the current group.
                if block['bbox'][0] > prev_xmax and block['bbox'][0] > prev_xmax_threshold:
                    if current_group:
                        classes_list.extend(sort_blocks_by_min_y(current_group))
                        current_group = []
            current_group.append(block)
            previous_block = block

        # Flush the trailing group.
        if current_group:
            classes_list.extend(sort_blocks_by_min_y(current_group))

        return classes_list

    result = {}
    for key, blocks in data.items():
        sorted_blocks = sort_blocks_by_min_x(blocks)
        sorted_layouts = find_classes_and_group(sorted_blocks)
        result[key] = sorted_layouts

    sorted_layout_data = result
    with open(modified_json_output_filepath, 'w') as f:
        json.dump(sorted_layout_data, f, indent=4)

    return sorted_layout_data, modified_json_output_filepath
|
| 174 |
+
|
| 175 |
+
@log_time_taken
def layout_detection(img_path, model, image_processor, threshold=0.6, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Detect layout regions on one page image and draw them.

    Args:
        img_path: Path to the page image on disk.
        model: Loaded transformers object-detection model (Docling Heron).
        image_processor: Matching processor for pre/post-processing.
        threshold: Minimum detection confidence to keep.
        device: Inference device; NOTE the default is evaluated once at
            import time, not per call.

    Returns:
        (annotated_image, detections, results): BGR ndarray with boxes and
        labels drawn, a supervision.Detections object, and the raw
        post-processed results dict with tensors moved to CPU.

    Raises:
        Re-raises any failure after logging it.
    """
    try:
        image = Image.open(img_path).convert("RGB")

        # Process image with the Docling Heron model
        inputs = image_processor(images=[image], return_tensors="pt")

        # Move inputs to the same device as the model
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Post-process the results.
        # target_sizes expects (height, width); PIL's .size is (width, height),
        # hence the [::-1] reversal.
        results = image_processor.post_process_object_detection(
            outputs,
            target_sizes=torch.tensor([image.size[::-1]], device=device),
            threshold=threshold
        )[0]

        # Move results to CPU for further processing
        results = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in results.items()}

        # Convert to supervision Detections format for compatibility
        xyxy = results["boxes"].numpy()
        confidence = results["scores"].numpy()
        class_id = results["labels"].numpy()
        class_name = [model.config.id2label[label_id] for label_id in class_id]

        detections = sv.Detections(
            xyxy=xyxy,
            confidence=confidence,
            class_id=class_id,
            data={"class_name": class_name}
        )

        # Custom bounding box color (Red)
        bbox_color = sv.Color(r=255, g=0, b=0)
        bounding_box_annotator = sv.BoxAnnotator(color=bbox_color)
        label_annotator = sv.LabelAnnotator()

        # Annotate the image (convert PIL RGB to OpenCV BGR first).
        image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        annotated_image = bounding_box_annotator.annotate(scene=image_cv, detections=detections)
        annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)

        # Clean up GPU tensors eagerly; pages are processed in bulk.
        del inputs, outputs
        torch.cuda.empty_cache() if device == 'cuda' else None
        gc.collect()

        return annotated_image, detections, results

    except Exception as e:
        logger.error(f"Error in layout_detection for {img_path}: {str(e)}")
        raise
|
| 232 |
+
|
| 233 |
+
def enhance_dpi(image, new_dpi=300, old_dpi=150):
    """Upscale a PIL image by the ratio new_dpi / old_dpi using Lanczos resampling."""
    ratio = int(new_dpi) / int(old_dpi)
    target_size = (int(image.width * ratio), int(image.height * ratio))
    return image.resize(target_size, Image.LANCZOS)
|
| 240 |
+
|
| 241 |
+
def extract_text_from_bbox(image, bbox):
    """OCR the region of *image* described by *bbox* and return the raw text.

    The crop is padded (5 px vertically, 20 px horizontally), upscaled to an
    effective 300 DPI, converted to grayscale, and run through Tesseract.

    Args:
        image: PIL Image or numpy array. NOTE(review): the array is treated
            as BGR by the cv2.COLOR_BGR2RGB conversion below, but a PIL image
            converted via np.array is RGB — confirm channel order upstream.
        bbox: dict with 'xmin', 'ymin', 'xmax', 'ymax' pixel coordinates.

    Returns:
        The string produced by pytesseract.

    Raises:
        TypeError: if *image* is neither a PIL Image nor a numpy array.
    """
    if isinstance(image, Image.Image):
        image = np.array(image)
    elif isinstance(image, np.ndarray):
        pass
    else:
        raise TypeError("Unsupported image type. The image should be either a PIL Image or a NumPy array.")

    image_height, image_width = image.shape[:2]
    # Pad the crop slightly, clamped to the image bounds.
    ymin = max(0, int(bbox['ymin'] - 5))
    ymax = min(image_height, int(bbox['ymax'] + 5))
    xmin = max(0, int(bbox['xmin'] - 20))
    xmax = min(image_width, int(bbox['xmax'] + 20))

    cropped_image = image[ymin:ymax, xmin:xmax]
    cropped_image_pil = Image.fromarray(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
    # Upscale so Tesseract sees ~300 DPI input (enhance_dpi defaults).
    high_dpi_image = enhance_dpi(cropped_image_pil)
    high_dpi_image_cv = cv2.cvtColor(np.array(high_dpi_image), cv2.COLOR_RGB2BGR)
    gray_image = cv2.cvtColor(high_dpi_image_cv, cv2.COLOR_BGR2GRAY)

    # LSTM engine (--oem 3), assume a uniform block of text (--psm 6).
    custom_config = r'--oem 3 --psm 6 -c tessedit_create_alto=1'
    extracted_text = pytesseract.image_to_string(gray_image, config=custom_config)

    return extracted_text
|
| 265 |
+
|
| 266 |
+
def check_extracted_text_headers(extracted_text, header_list, model_name='all-MiniLM-L6-v2', threshold=0.8):
    """Return True if any DataFrame column is semantically close to an expected header.

    Non-DataFrame input short-circuits to False. Similarity is cosine
    similarity between sentence-transformer embeddings of the column names
    and the expected headers.
    """
    if not isinstance(extracted_text, pd.DataFrame):
        return False

    encoder = SentenceTransformer(model_name)
    candidate_headers = list(extracted_text.columns)
    candidate_embeddings = encoder.encode(candidate_headers, convert_to_tensor=True)
    expected_embeddings = encoder.encode(header_list, convert_to_tensor=True)

    similarities = util.pytorch_cos_sim(expected_embeddings, candidate_embeddings)

    for row, header in enumerate(header_list):
        for col, extracted_header in enumerate(candidate_headers):
            if similarities[row][col] > threshold:
                logger.info(f"Matching header found: {extracted_header} (similar to {header})")
                return True

    logger.info("No matching headers found.")
    return False
|
| 285 |
+
|
| 286 |
+
def process_page(args):
    """Worker entry point: run layout detection + extraction for one PDF page.

    Executed in a child process (spawn start method), so the detection model
    is loaded fresh per call via load_model_for_process().

    Args:
        args: tuple of (page_img, current_page_num, file_name,
            pdf_images_path, bbox_images_path).

    Returns:
        (page_number, page_information, class_names) where page_information
        is a list of per-block dicts holding bbox, label, confidence and the
        extracted text / table markdown.

    Raises:
        Re-raises any per-page failure after logging it.
    """
    (page_img, current_page_num, file_name, pdf_images_path, bbox_images_path) = args
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    try:
        model, image_processor, class_names = load_model_for_process()
        model.to(device)  # Ensure model is on the correct device
        image = np.array(page_img)

        h, w, _ = image.shape
        page_number = str(current_page_num)

        # Persist the rendered page image; layout_detection reads it back
        # from disk.
        img_output_filename = f"{file_name}_page_no_{page_number}.jpeg"
        img_output_filepath = os.path.join(pdf_images_path, img_output_filename)
        pil_image = Image.fromarray(image)
        pil_image.save(img_output_filepath)

        cropped_images_path = os.path.join(pdf_images_path, f"{file_name}_cropped_images")
        os.makedirs(cropped_images_path, exist_ok=True)

        bbox_image, page_detections_info, results_info = layout_detection(img_output_filepath, model, image_processor, device=device)
        logger.info(f"Processed layout detection for page {page_number}")

        # Save the annotated (boxes drawn) page for debugging/QA.
        pil_bbox_image = Image.fromarray(bbox_image)
        bbox_output_filename = f"bbox_{file_name}_page_no_{page_number}.jpeg"
        bbox_output_filepath = os.path.join(bbox_images_path, bbox_output_filename)
        pil_bbox_image.save(bbox_output_filepath)
        page_information = []

        for idx, bbox in enumerate(page_detections_info.xyxy):
            label_name = page_detections_info.data['class_name'][idx]
            class_id = page_detections_info.class_id[idx]
            score = page_detections_info.confidence[idx]

            image_height = h
            image_width = w

            # Padded coordinates (10 px each side, clamped) — these are what
            # get stored in the output 'bbox' field below.
            ymin = max(0, bbox[1] - 10)
            ymax = min(image_height, bbox[3] + 10)
            xmin = max(0, bbox[0] - 10)
            xmax = min(image_width, bbox[2] + 10)

            # Unpadded integer coordinates used for the actual crops.
            new_bbox = {
                "xmin": int(bbox[0]),
                "ymin": int(bbox[1]),
                "xmax": int(bbox[2]),
                "ymax": int(bbox[3])
            }

            cropped_labels_images_path = os.path.join(cropped_images_path, f"{file_name}_{label_name}_cropped_images")
            os.makedirs(cropped_labels_images_path, exist_ok=True)

            crop_label_image_filename = f"{file_name}_label_name{label_name}_page_no_{page_number}_id_{idx + 1}.png"
            crop_label_image_filename_filepath = os.path.join(cropped_labels_images_path, crop_label_image_filename)

            crop_label_image_bbox = (new_bbox["xmin"], new_bbox["ymin"], new_bbox["xmax"], new_bbox["ymax"])
            cropped_label_pil_image = pil_image.crop(crop_label_image_bbox)
            cropped_label_pil_image.save(crop_label_image_filename_filepath)

            if label_name == 'Table':
                # Tables go through table-structure recognition (docling TSR).
                crop_bbox = (new_bbox["xmin"], new_bbox["ymin"], new_bbox["xmax"], new_bbox["ymax"])
                cropped_image = pil_image.crop(crop_bbox)
                df_post_processed, df_original = tsr_inference_image(cropped_image)
                extracted_df = df_post_processed
                extracted_text = extracted_df

                if isinstance(df_original, pd.DataFrame):
                    extracted_df_markdown = df_original.to_markdown()
                else:
                    extracted_df_markdown = df_original
            else:
                # Non-table regions are OCR'd directly.
                extracted_text = extract_text_from_bbox(image, new_bbox)
                extracted_df_markdown = ""

            # NOTE(review): concatenating (idx+1) with the page number can
            # collide across pages (block 12 / page 1 and block 1 / page 21
            # both yield 121) — confirm uniqueness requirements.
            page_block_id = f"{str(idx + 1) + str(current_page_num)}"
            page_block_id = int(page_block_id)

            page_information.append({
                'page_block_id': page_block_id,
                'label_name': label_name,
                'pdf_page_id': current_page_num,
                'pdf_name': file_name,
                'label_id': class_id,
                'yolo_detection_confidence_score': score,
                'bbox': [xmin, ymin, xmax, ymax],
                'page_img_width': w,
                'page_img_height': h,
                'extracted_text': [extracted_text],
                "extracted_table_markdown": [extracted_df_markdown]
            })

        # Clean up
        del image, bbox_image, model, image_processor
        torch.cuda.empty_cache() if device == 'cuda' else None
        gc.collect()

        return page_number, page_information, class_names

    except Exception as e:
        logger.error(f"Error processing page {current_page_num}: {str(e)}")
        raise
|
| 386 |
+
|
| 387 |
+
@log_time_taken
def yolov10_layout_pipeline(file_name, file_path, directory_path):
    """End-to-end layout pipeline for one PDF: rasterize, detect, sort, persist.

    Pages are rendered at 150 DPI, dispatched to a process pool for layout
    detection + extraction (see process_page), and the aggregated per-page
    block data is written out in several sorted/derived JSON views.

    Args:
        file_name: Name used for the initial log line (recomputed from
            file_path below, so the passed value does not affect outputs).
        file_path: Path to the input PDF.
        directory_path: Root directory for all image/JSON outputs.

    Returns:
        A 13-tuple: (json_output_path, layout_list_data, class_names,
        sorted_data, modified_json_output_filepath, pdf_images_path,
        file_name, sorted_layout_data, sorted_layout_json_filepath,
        tree_structured_organized_json_data, tree_structured_json_output_path,
        filtered_table_header_data, filtered_table_header_data_json_path).

    Raises:
        ValueError: if file_path does not end in .pdf.
        Re-raises any processing error after logging.
    """
    if not file_path.lower().endswith('.pdf'):
        raise ValueError("Input file must be a PDF.")

    logger.info(f"Starting processing for {file_name}")
    start_time = datetime.now()
    # NOTE(review): the file_name argument is overwritten here; callers'
    # value is only used for the log line above.
    file_name = get_file_name_without_extension(file_path)

    pdf_images_path = os.path.join(directory_path, f"{file_name}_images")
    os.makedirs(pdf_images_path, exist_ok=True)

    bbox_images_path = os.path.join(pdf_images_path, f"{file_name}_bbox_images")
    os.makedirs(bbox_images_path, exist_ok=True)

    json_output_path = os.path.join(directory_path, f"{file_name}_json_output")
    os.makedirs(json_output_path, exist_ok=True)

    total_pages_processed = 0
    data_pdf = {}

    try:
        page_generator = convert_pdf_to_images(file_path, batch_size=20, dpi=150)

        # Build one argument tuple per page for the worker pool.
        page_args = []
        for pages in page_generator:
            if not pages:
                break

            for page_num, page_img in enumerate(pages):
                current_page_num = total_pages_processed + page_num + 1
                logger.info(f"Processing file {file_name}, page {current_page_num}")

                page_args.append((
                    page_img,
                    current_page_num,
                    file_name,
                    pdf_images_path,
                    bbox_images_path
                ))

            total_pages_processed += len(pages)

        logger.info(f"Total pages to process: {total_pages_processed}")
        # Each worker loads its own model copy (spawn start method).
        with ProcessPoolExecutor(max_workers=no_of_threads) as executor:
            future_to_page = {executor.submit(process_page, arg): arg[1] for arg in page_args}
            for future in as_completed(future_to_page):
                page_number = future_to_page[future]
                try:
                    result = future.result()
                    page_number, page_information, class_names = result
                    data_pdf[page_number] = page_information
                except Exception as e:
                    logger.error(f"Error processing page {page_number}: {str(e)}")
                    raise

        logger.info(f"Processed pages: {data_pdf.keys()}")
        layout_json_file_path = os.path.join(json_output_path, f"yolo_model_detections_{file_name}.json")
        user_modification_json_file_path = os.path.join(json_output_path, f"user_modified_{file_name}.json")
        tree_structured_json_output_path = os.path.join(json_output_path, f"tree_structured_headers_{file_name}.json")
        # Strip numpy/pandas types so the structures are JSON-serializable.
        data_pdf = convert_numpy(data_pdf)
        layout_list_data = filter_layout_blocks(data_pdf)

        with open(layout_json_file_path, 'w') as json_file:
            json.dump(data_pdf, json_file, indent=4)

        with open(user_modification_json_file_path, 'w') as json_file:
            json.dump(data_pdf, json_file, indent=4)

        # Derived, sorted views of the raw detections. Note the two calls
        # below overwrite the JSON files just written above.
        sorted_data, modified_json_output_filepath = filter_and_sort_headers(data_pdf, user_modification_json_file_path)
        tree_structured_organized_json_data = tree_structured_headers_pipeline(user_modification_json_file_path, tree_structured_json_output_path)
        sorted_layout_data, sorted_layout_json_filepath = filter_and_sort_layouts(data_pdf, layout_json_file_path)

        filtered_table_header_data, filtered_table_header_data_json_path = put_table_header_pipeline(user_modification_json_file_path, json_output_path, file_name)
        end_time = datetime.now()

        logger.info(f"Processed {file_name} from {start_time} to {end_time}, duration: {end_time - start_time}")
        logger.info(f"JSON file created at: {modified_json_output_filepath}")
        return (
            json_output_path,
            layout_list_data,
            class_names,
            sorted_data,
            modified_json_output_filepath,
            pdf_images_path,
            file_name,
            sorted_layout_data,
            sorted_layout_json_filepath,
            tree_structured_organized_json_data,
            tree_structured_json_output_path,
            filtered_table_header_data,
            filtered_table_header_data_json_path
        )

    except Exception as e:
        logger.error(f"Error in yolov10_layout_pipeline: {str(e)}")
        raise
    finally:
        # Ensure GPU memory is cleared
        torch.cuda.empty_cache() if torch.cuda.is_available() else None
        gc.collect()
|
| 488 |
+
|
| 489 |
+
# Example usage
if __name__ == "__main__":
    # Hard-coded smoke-test inputs; run this module directly to exercise
    # the full pipeline end to end.
    pdf_path = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/Flexstone_Investor_Report_Test.pdf"
    output_directory = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/clearstreet_docs/iqeq_docling_heron_bbox_images"
    file_name = get_file_name_without_extension(pdf_path)
    yolov10_layout_pipeline(file_name, pdf_path, output_directory)
|
| 495 |
+
|
| 496 |
+
|
| 497 |
+
|
layout_detection_docling_heron (2).py
ADDED
|
@@ -0,0 +1,497 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import os
|
| 3 |
+
import supervision as sv # pip install supervision
|
| 4 |
+
from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
|
| 5 |
+
from pdf2image import convert_from_path
|
| 6 |
+
import numpy as np
|
| 7 |
+
from PIL import Image
|
| 8 |
+
import json
|
| 9 |
+
import pytesseract
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from sentence_transformers import SentenceTransformer, util
|
| 12 |
+
from PyPDF2 import PdfReader
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
import torch
|
| 15 |
+
import logging
|
| 16 |
+
from utils.utils_code import log_time_taken
|
| 17 |
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
| 18 |
+
import multiprocessing
|
| 19 |
+
import sys
|
| 20 |
+
import gc
|
| 21 |
+
|
| 22 |
+
from src.table_processing.tree_structured_json import tree_structured_headers_pipeline
|
| 23 |
+
from config.set_config import set_configuration
|
| 24 |
+
set_config_project = set_configuration()
|
| 25 |
+
layout_model_weights_path = set_config_project.layout_model_weights_path
|
| 26 |
+
no_of_threads = set_config_project.no_of_threads
|
| 27 |
+
from src.docling.ttsr_docling import tsr_inference_image, tsr_inference
|
| 28 |
+
from src.table_processing.table_classification_extraction import process_table_classification_extraction_pipeline
|
| 29 |
+
from src.table_processing.put_table_header import put_table_header_pipeline
|
| 30 |
+
import gc
|
| 31 |
+
from src.layout_detection.load_model import load_model_for_process
|
| 32 |
+
|
| 33 |
+
# Set multiprocessing start method
|
| 34 |
+
multiprocessing.set_start_method('spawn', force=True)
|
| 35 |
+
logger = logging.getLogger(__name__)
|
| 36 |
+
|
| 37 |
+
# Configure logging
|
| 38 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 39 |
+
|
| 40 |
+
def load_torch(version):
    """Prepend a version-specific torch build directory to sys.path and import torch.

    NOTE(review): `import torch` already ran at module top, so this re-import
    returns the cached module from sys.modules and the sys.path insert has no
    effect on which build is used — confirm whether the version switch ever
    takes effect in this process.

    Args:
        version: "2.2.2" or "2.6.0"; any other value leaves sys.path untouched.

    Returns:
        The imported torch module.
    """
    if version == "2.2.2":
        sys.path.insert(0, "./torch_2_2_2")
    elif version == "2.6.0":
        sys.path.insert(0, "./torch_2_6_0")
    import torch
    logger.info(f"Using Torch Version: {torch.__version__}")
    return torch

# Rebind the module-level `torch` name to the selected build.
torch = load_torch("2.2.2")
|
| 50 |
+
|
| 51 |
+
def get_file_name_without_extension(file_path):
    """Return the base filename of *file_path* without its extension.

    Only the final extension is stripped ("a/b.tar.gz" -> "b.tar").
    """
    # os.path.basename replaces the original os.path.split, whose directory
    # component was computed and never used.
    base_name = os.path.basename(file_path)
    name, _extension = os.path.splitext(base_name)
    return name
|
| 55 |
+
|
| 56 |
+
def convert_numpy(data):
    """Recursively convert numpy / pandas values into plain JSON-safe Python objects.

    Dicts and lists are walked recursively; numpy scalars become int/float,
    arrays become lists, DataFrames become record dicts. Anything else is
    returned unchanged.
    """
    if isinstance(data, dict):
        return {key: convert_numpy(val) for key, val in data.items()}
    if isinstance(data, list):
        return [convert_numpy(element) for element in data]
    if isinstance(data, np.integer):
        return int(data)
    if isinstance(data, np.floating):
        return float(data)
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, pd.DataFrame):
        return data.to_dict(orient='records')
    return data
|
| 71 |
+
|
| 72 |
+
def filter_layout_blocks(input_data):
    """Flatten a {page: [block, ...]} mapping into one list of blocks.

    Page insertion order and block order within each page are preserved.
    """
    filtered_layout_blocks = []
    for blocks in input_data.values():
        # extend(blocks) directly — the original wrapped blocks in a
        # redundant identity list comprehension.
        filtered_layout_blocks.extend(blocks)
    return filtered_layout_blocks
|
| 77 |
+
|
| 78 |
+
def convert_pdf_to_images(file_path, batch_size=20, dpi=100):
    """Rasterize a PDF and return a generator yielding pages in batches.

    NOTE: pdf2image renders the whole document up front; only the batching
    of the already-rendered PIL images is lazy.

    Args:
        file_path: Path to the PDF on disk.
        batch_size: Number of page images per yielded batch.
        dpi: Render resolution passed to pdf2image.

    Returns:
        A generator of lists of PIL images, each list at most batch_size long.
    """
    images = convert_from_path(file_path, dpi=dpi)
    total_pages = len(images)

    def page_generator():
        # 0-based slicing replaces the original 1-based start/end arithmetic;
        # the yielded slices are identical (Python slices clamp at the end).
        for start in range(0, total_pages, batch_size):
            yield images[start:start + batch_size]

    return page_generator()
|
| 88 |
+
|
| 89 |
+
def read_json(json_file):
    """Load and return the parsed contents of a JSON file."""
    with open(json_file, 'r') as fh:
        parsed = json.load(fh)
    return parsed
|
| 92 |
+
|
| 93 |
+
def filter_and_sort_headers(data, modified_json_output_filepath):
    """Order each page's blocks into a column-wise reading order and persist it.

    Blocks are sorted left-to-right by bbox xmin; runs of horizontally
    overlapping blocks are then sorted top-to-bottom, approximating reading
    columns from left to right.

    Args:
        data: {page_key: [block, ...]} where each block has a
            'bbox' [xmin, ymin, xmax, ymax].
        modified_json_output_filepath: Path the sorted mapping is written
            to as indented JSON (overwritten).

    Returns:
        (sorted_data, modified_json_output_filepath)
    """
    def sort_blocks_by_min_x(blocks):
        # Left-to-right order by bbox xmin.
        return sorted(blocks, key=lambda block: block['bbox'][0])

    def sort_blocks_by_min_y(blocks):
        # Top-to-bottom order by bbox ymin.
        return sorted(blocks, key=lambda block: block['bbox'][1])

    def find_headers_and_group(sorted_blocks):
        headers_list = []
        current_group = []
        previous_block = None

        for i, block in enumerate(sorted_blocks):
            if previous_block:
                prev_xmax = previous_block['bbox'][2]
                # NOTE(review): for non-negative coordinates int() truncation
                # makes this second bound redundant with prev_xmax — confirm
                # and simplify.
                prev_xmax_threshold = int(previous_block['bbox'][2])
                # A block starting strictly right of the previous block's
                # right edge begins a new "column"; flush the current group
                # top-to-bottom first.
                if block['bbox'][0] > prev_xmax and block['bbox'][0] > prev_xmax_threshold:
                    if current_group:
                        headers_list.extend(sort_blocks_by_min_y(current_group))
                        current_group = []
            current_group.append(block)
            previous_block = block

        # Flush the trailing group.
        if current_group:
            headers_list.extend(sort_blocks_by_min_y(current_group))

        return headers_list

    result = {}
    for key, blocks in data.items():
        sorted_blocks = sort_blocks_by_min_x(blocks)
        sorted_headers = find_headers_and_group(sorted_blocks)
        result[key] = sorted_headers

    sorted_data = result
    with open(modified_json_output_filepath, 'w') as f:
        json.dump(sorted_data, f, indent=4)

    return sorted_data, modified_json_output_filepath
|
| 132 |
+
|
| 133 |
+
def filter_and_sort_layouts(data, modified_json_output_filepath):
    """Order each page's layout blocks column-wise and persist the result.

    NOTE(review): this is logically identical to filter_and_sort_headers
    (only local names differ) — consider consolidating into one helper.

    Args:
        data: {page_key: [block, ...]} where each block has a
            'bbox' [xmin, ymin, xmax, ymax].
        modified_json_output_filepath: Path the sorted mapping is written
            to as indented JSON (overwritten).

    Returns:
        (sorted_layout_data, modified_json_output_filepath)
    """
    def sort_blocks_by_min_x(blocks):
        # Left-to-right order by bbox xmin.
        return sorted(blocks, key=lambda block: block['bbox'][0])

    def sort_blocks_by_min_y(blocks):
        # Top-to-bottom order by bbox ymin.
        return sorted(blocks, key=lambda block: block['bbox'][1])

    def find_classes_and_group(sorted_blocks):
        classes_list = []
        current_group = []
        previous_block = None

        for i, block in enumerate(sorted_blocks):
            if previous_block:
                prev_xmax = previous_block['bbox'][2]
                # NOTE(review): redundant with prev_xmax for non-negative
                # coordinates (see filter_and_sort_headers).
                prev_xmax_threshold = int(previous_block['bbox'][2])
                # A block starting strictly right of the previous block's
                # right edge starts a new column; flush the current group.
                if block['bbox'][0] > prev_xmax and block['bbox'][0] > prev_xmax_threshold:
                    if current_group:
                        classes_list.extend(sort_blocks_by_min_y(current_group))
                        current_group = []
            current_group.append(block)
            previous_block = block

        # Flush the trailing group.
        if current_group:
            classes_list.extend(sort_blocks_by_min_y(current_group))

        return classes_list

    result = {}
    for key, blocks in data.items():
        sorted_blocks = sort_blocks_by_min_x(blocks)
        sorted_layouts = find_classes_and_group(sorted_blocks)
        result[key] = sorted_layouts

    sorted_layout_data = result
    with open(modified_json_output_filepath, 'w') as f:
        json.dump(sorted_layout_data, f, indent=4)

    return sorted_layout_data, modified_json_output_filepath
|
| 172 |
+
|
| 173 |
+
@log_time_taken
def layout_detection(img_path, model, image_processor, threshold=0.6, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Detect layout regions on one page image and draw them.

    Args:
        img_path: Path to the page image on disk.
        model: Loaded transformers object-detection model (Docling Heron).
        image_processor: Matching processor for pre/post-processing.
        threshold: Minimum detection confidence to keep.
        device: Inference device; NOTE the default is evaluated once at
            import time, not per call.

    Returns:
        (annotated_image, detections, results): BGR ndarray with boxes and
        labels drawn, a supervision.Detections object, and the raw
        post-processed results dict with tensors moved to CPU.

    Raises:
        Re-raises any failure after logging it.
    """
    try:
        image = Image.open(img_path).convert("RGB")

        # Process image with the Docling Heron model
        inputs = image_processor(images=[image], return_tensors="pt")

        # Move inputs to the same device as the model
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Post-process the results.
        # target_sizes expects (height, width); PIL's .size is (width, height),
        # hence the [::-1] reversal.
        results = image_processor.post_process_object_detection(
            outputs,
            target_sizes=torch.tensor([image.size[::-1]], device=device),
            threshold=threshold
        )[0]

        # Move results to CPU for further processing
        results = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in results.items()}

        # Convert to supervision Detections format for compatibility
        xyxy = results["boxes"].numpy()
        confidence = results["scores"].numpy()
        class_id = results["labels"].numpy()
        class_name = [model.config.id2label[label_id] for label_id in class_id]

        detections = sv.Detections(
            xyxy=xyxy,
            confidence=confidence,
            class_id=class_id,
            data={"class_name": class_name}
        )

        # Custom bounding box color (Red)
        bbox_color = sv.Color(r=255, g=0, b=0)
        bounding_box_annotator = sv.BoxAnnotator(color=bbox_color)
        label_annotator = sv.LabelAnnotator()

        # Annotate the image (convert PIL RGB to OpenCV BGR first).
        image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        annotated_image = bounding_box_annotator.annotate(scene=image_cv, detections=detections)
        annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)

        # Clean up GPU tensors eagerly; pages are processed in bulk.
        del inputs, outputs
        torch.cuda.empty_cache() if device == 'cuda' else None
        gc.collect()

        return annotated_image, detections, results

    except Exception as e:
        logger.error(f"Error in layout_detection for {img_path}: {str(e)}")
        raise
|
| 230 |
+
|
| 231 |
+
def enhance_dpi(image, new_dpi=300, old_dpi=150):
    """Upscale a PIL image by the ratio new_dpi / old_dpi using Lanczos resampling."""
    ratio = int(new_dpi) / int(old_dpi)
    target_size = (int(image.width * ratio), int(image.height * ratio))
    return image.resize(target_size, Image.LANCZOS)
|
| 238 |
+
|
| 239 |
+
def extract_text_from_bbox(image, bbox):
    """OCR the region of *image* described by *bbox* and return the raw text.

    The crop is padded (5 px vertically, 20 px horizontally), upscaled to an
    effective 300 DPI, converted to grayscale, and run through Tesseract.

    Args:
        image: PIL Image or numpy array. NOTE(review): the array is treated
            as BGR by the cv2.COLOR_BGR2RGB conversion below, but a PIL image
            converted via np.array is RGB — confirm channel order upstream.
        bbox: dict with 'xmin', 'ymin', 'xmax', 'ymax' pixel coordinates.

    Returns:
        The string produced by pytesseract.

    Raises:
        TypeError: if *image* is neither a PIL Image nor a numpy array.
    """
    if isinstance(image, Image.Image):
        image = np.array(image)
    elif isinstance(image, np.ndarray):
        pass
    else:
        raise TypeError("Unsupported image type. The image should be either a PIL Image or a NumPy array.")

    image_height, image_width = image.shape[:2]
    # Pad the crop slightly, clamped to the image bounds.
    ymin = max(0, int(bbox['ymin'] - 5))
    ymax = min(image_height, int(bbox['ymax'] + 5))
    xmin = max(0, int(bbox['xmin'] - 20))
    xmax = min(image_width, int(bbox['xmax'] + 20))

    cropped_image = image[ymin:ymax, xmin:xmax]
    cropped_image_pil = Image.fromarray(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
    # Upscale so Tesseract sees ~300 DPI input (enhance_dpi defaults).
    high_dpi_image = enhance_dpi(cropped_image_pil)
    high_dpi_image_cv = cv2.cvtColor(np.array(high_dpi_image), cv2.COLOR_RGB2BGR)
    gray_image = cv2.cvtColor(high_dpi_image_cv, cv2.COLOR_BGR2GRAY)

    # LSTM engine (--oem 3), assume a uniform block of text (--psm 6).
    custom_config = r'--oem 3 --psm 6 -c tessedit_create_alto=1'
    extracted_text = pytesseract.image_to_string(gray_image, config=custom_config)

    return extracted_text
|
| 263 |
+
|
| 264 |
+
# Cache SentenceTransformer instances by model name so repeated header checks
# do not re-load the model from disk on every call (loading dominates runtime).
_SENTENCE_MODEL_CACHE = {}

def check_extracted_text_headers(extracted_text, header_list, model_name='all-MiniLM-L6-v2', threshold=0.8):
    """Return True if any column header of *extracted_text* is semantically
    similar (cosine similarity > *threshold*) to any entry of *header_list*.

    Args:
        extracted_text: expected to be a pandas DataFrame; any other type
            returns False immediately.
        header_list: reference header strings to match against.
        model_name: SentenceTransformer model used for the embeddings.
        threshold: cosine-similarity cutoff in [0, 1].

    Returns:
        bool: True on the first similarity above *threshold*, else False.
    """
    if not isinstance(extracted_text, pd.DataFrame):
        return False

    # Reuse a previously loaded model when possible.
    model = _SENTENCE_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SENTENCE_MODEL_CACHE[model_name] = model

    extracted_headers = list(extracted_text.columns)
    extracted_embeddings = model.encode(extracted_headers, convert_to_tensor=True)
    header_embeddings = model.encode(header_list, convert_to_tensor=True)

    # Rows: reference headers; columns: extracted headers.
    similarity_matrix = util.pytorch_cos_sim(header_embeddings, extracted_embeddings)

    for i, header in enumerate(header_list):
        for j, extracted_header in enumerate(extracted_headers):
            if similarity_matrix[i][j] > threshold:
                logger.info(f"Matching header found: {extracted_header} (similar to {header})")
                return True

    logger.info("No matching headers found.")
    return False
|
| 283 |
+
|
| 284 |
+
def process_page(args):
    """Run layout detection + content extraction for a single PDF page.

    Worker entry point for ProcessPoolExecutor: the model is loaded here,
    inside the subprocess, to avoid CUDA initialisation in the parent.

    Args:
        args: tuple of (page_img, current_page_num, file_name,
            pdf_images_path, bbox_images_path).

    Returns:
        tuple: (page_number, page_information, class_names,
        table_cropped_directory) where page_information is one dict per
        detected layout block and table_cropped_directory is None when the
        page contains no Table detection.
    """
    (page_img, current_page_num, file_name, pdf_images_path, bbox_images_path) = args
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    try:
        # Per-worker model load (see docstring).
        model, image_processor, class_names = load_model_for_process()
        model.to(device)  # Ensure model is on the correct device
        image = np.array(page_img)

        h, w, _ = image.shape
        page_number = str(current_page_num)

        # Persist the full page image; layout_detection reads it from disk.
        img_output_filename = f"{file_name}_page_no_{page_number}.jpeg"
        img_output_filepath = os.path.join(pdf_images_path, img_output_filename)
        pil_image = Image.fromarray(image)
        pil_image.save(img_output_filepath)

        cropped_images_path = os.path.join(pdf_images_path, f"{file_name}_cropped_images")
        os.makedirs(cropped_images_path, exist_ok=True)

        bbox_image, page_detections_info, results_info = layout_detection(img_output_filepath, model, image_processor, device=device)
        logger.info(f"Processed layout detection for page {page_number}")

        # Save the page annotated with detection boxes for debugging/review.
        pil_bbox_image = Image.fromarray(bbox_image)
        bbox_output_filename = f"bbox_{file_name}_page_no_{page_number}.jpeg"
        bbox_output_filepath = os.path.join(bbox_images_path, bbox_output_filename)
        pil_bbox_image.save(bbox_output_filepath)
        page_information = []
        # Directory holding cropped Table images (None when no table found).
        table_cropped_directory = None

        for idx, bbox in enumerate(page_detections_info.xyxy):
            label_name = page_detections_info.data['class_name'][idx]
            class_id = page_detections_info.class_id[idx]
            score = page_detections_info.confidence[idx]

            image_height = h
            image_width = w

            # Padded (10 px) and clamped coordinates — these go into the
            # output record below.  NOTE(review): the crops themselves use
            # the unpadded new_bbox; confirm the asymmetry is intentional.
            ymin = max(0, bbox[1] - 10)
            ymax = min(image_height, bbox[3] + 10)
            xmin = max(0, bbox[0] - 10)
            xmax = min(image_width, bbox[2] + 10)

            new_bbox = {
                "xmin": int(bbox[0]),
                "ymin": int(bbox[1]),
                "xmax": int(bbox[2]),
                "ymax": int(bbox[3])
            }

            # One sub-directory per label type for the cropped images.
            cropped_labels_images_path = os.path.join(cropped_images_path, f"{file_name}_{label_name}_cropped_images")
            os.makedirs(cropped_labels_images_path, exist_ok=True)

            crop_label_image_filename = f"{file_name}_label_name{label_name}_page_no_{page_number}_id_{idx + 1}.png"
            crop_label_image_filename_filepath = os.path.join(cropped_labels_images_path, crop_label_image_filename)

            crop_label_image_bbox = (new_bbox["xmin"], new_bbox["ymin"], new_bbox["xmax"], new_bbox["ymax"])
            cropped_label_pil_image = pil_image.crop(crop_label_image_bbox)
            cropped_label_pil_image.save(crop_label_image_filename_filepath)

            if label_name == 'Table':
                # Tables go through table-structure recognition (TSR)
                # instead of plain OCR.
                crop_bbox = (new_bbox["xmin"], new_bbox["ymin"], new_bbox["xmax"], new_bbox["ymax"])
                cropped_image = pil_image.crop(crop_bbox)
                df_post_processed, df_original = tsr_inference_image(cropped_image)
                extracted_df = df_post_processed
                extracted_text = extracted_df
                table_cropped_directory = cropped_labels_images_path

                if isinstance(df_original, pd.DataFrame):
                    extracted_df_markdown = df_original.to_markdown()
                else:
                    extracted_df_markdown = df_original
            else:
                extracted_text = extract_text_from_bbox(image, new_bbox)
                extracted_df_markdown = ""

            # Block id = "<block index><page number>" concatenated as digits.
            # NOTE(review): not collision-free (block 1 of page 12 collides
            # with block 11 of page 2) — confirm uniqueness is not required.
            page_block_id = f"{str(idx + 1) + str(current_page_num)}"
            page_block_id = int(page_block_id)

            page_information.append({
                'page_block_id': page_block_id,
                'label_name': label_name,
                'pdf_page_id': current_page_num,
                'pdf_name': file_name,
                'label_id': class_id,
                'yolo_detection_confidence_score': score,
                'bbox': [xmin, ymin, xmax, ymax],
                'page_img_width': w,
                'page_img_height': h,
                'extracted_text': [extracted_text],
                "extracted_table_markdown": [extracted_df_markdown]
            })

        # Clean up
        del image, bbox_image, model, image_processor
        torch.cuda.empty_cache() if device == 'cuda' else None
        gc.collect()

        return page_number, page_information, class_names,table_cropped_directory

    except Exception as e:
        logger.error(f"Error processing page {current_page_num}: {str(e)}")
        raise
|
| 386 |
+
|
| 387 |
+
@log_time_taken
def yolov10_layout_pipeline(file_name, file_path, directory_path):
    """End-to-end layout pipeline for one PDF.

    Converts the PDF to page images, fans out per-page layout detection and
    extraction to a process pool, then writes the detection JSON and the
    derived (sorted / tree-structured / table-header) JSON artefacts.

    Args:
        file_name: base name of the document (recomputed from file_path below).
        file_path: path to the input PDF.
        directory_path: root directory for all generated artefacts.

    Returns:
        14-tuple of output paths and intermediate data structures (see the
        return statement).

    Raises:
        ValueError: if *file_path* does not end with ".pdf".
    """
    if not file_path.lower().endswith('.pdf'):
        raise ValueError("Input file must be a PDF.")

    logger.info(f"Starting processing for {file_name}")
    start_time = datetime.now()
    # The incoming file_name argument is overwritten by the canonical stem.
    file_name = get_file_name_without_extension(file_path)

    pdf_images_path = os.path.join(directory_path, f"{file_name}_images")
    os.makedirs(pdf_images_path, exist_ok=True)

    bbox_images_path = os.path.join(pdf_images_path, f"{file_name}_bbox_images")
    os.makedirs(bbox_images_path, exist_ok=True)

    json_output_path = os.path.join(directory_path, f"{file_name}_json_output")
    os.makedirs(json_output_path, exist_ok=True)

    total_pages_processed = 0
    # Maps page number (str) -> list of layout-block dicts for that page.
    data_pdf = {}

    try:
        page_generator = convert_pdf_to_images(file_path, batch_size=20, dpi=150)

        # Materialise one argument tuple per page before dispatching.
        page_args = []
        for pages in page_generator:
            if not pages:
                break

            for page_num, page_img in enumerate(pages):
                current_page_num = total_pages_processed + page_num + 1
                logger.info(f"Processing file {file_name}, page {current_page_num}")

                page_args.append((
                    page_img,
                    current_page_num,
                    file_name,
                    pdf_images_path,
                    bbox_images_path
                ))

            total_pages_processed += len(pages)

        logger.info(f"Total pages to process: {total_pages_processed}")
        with ProcessPoolExecutor(max_workers=no_of_threads) as executor:
            future_to_page = {executor.submit(process_page, arg): arg[1] for arg in page_args}
            for future in as_completed(future_to_page):
                page_number = future_to_page[future]
                try:
                    result = future.result()
                    # NOTE(review): class_names and
                    # cropped_tables_images_dir_path keep the value from
                    # whichever future completes LAST (nondeterministic
                    # order), and are unbound — NameError at the return
                    # below — when the PDF yields zero pages.  Confirm.
                    page_number, page_information, class_names,cropped_tables_images_dir_path = result
                    data_pdf[page_number] = page_information
                except Exception as e:
                    logger.error(f"Error processing page {page_number}: {str(e)}")
                    raise

        logger.info(f"Processed pages: {data_pdf.keys()}")
        layout_json_file_path = os.path.join(json_output_path, f"yolo_model_detections_{file_name}.json")
        user_modification_json_file_path = os.path.join(json_output_path, f"user_modified_{file_name}.json")
        tree_structured_json_output_path = os.path.join(json_output_path, f"tree_structured_headers_{file_name}.json")
        data_pdf = convert_numpy(data_pdf)
        layout_list_data = filter_layout_blocks(data_pdf)

        # Write the raw detections and an identical user-editable copy,
        # keyed and sorted by integer page number.

        with open(layout_json_file_path, 'w') as json_file:
            json.dump({int(k): v for k, v in sorted(data_pdf.items(), key=lambda x: int(x[0]))}, json_file, indent=4)

        with open(user_modification_json_file_path, 'w') as json_file:
            json.dump({int(k): v for k, v in sorted(data_pdf.items(), key=lambda x: int(x[0]))}, json_file, indent=4)

        sorted_data, modified_json_output_filepath = filter_and_sort_headers(data_pdf, user_modification_json_file_path)
        tree_structured_organized_json_data = tree_structured_headers_pipeline(user_modification_json_file_path, tree_structured_json_output_path)
        sorted_layout_data, sorted_layout_json_filepath = filter_and_sort_layouts(data_pdf, layout_json_file_path)

        filtered_table_header_data, filtered_table_header_data_json_path = put_table_header_pipeline(user_modification_json_file_path, json_output_path, file_name)
        end_time = datetime.now()

        logger.info(f"Processed {file_name} from {start_time} to {end_time}, duration: {end_time - start_time}")
        logger.info(f"JSON file created at: {modified_json_output_filepath}")
        return (
            json_output_path,
            layout_list_data,
            class_names,
            sorted_data,
            modified_json_output_filepath,
            pdf_images_path,
            file_name,
            sorted_layout_data,
            sorted_layout_json_filepath,
            tree_structured_organized_json_data,
            tree_structured_json_output_path,
            filtered_table_header_data,
            filtered_table_header_data_json_path,
            cropped_tables_images_dir_path
        )

    except Exception as e:
        logger.error(f"Error in yolov10_layout_pipeline: {str(e)}")
        raise
    finally:
        # Ensure GPU memory is cleared
        torch.cuda.empty_cache() if torch.cuda.is_available() else None
        gc.collect()
|
| 491 |
+
|
| 492 |
+
# Example usage
if __name__ == "__main__":
    # Hard-coded sample inputs for a manual smoke test of the full pipeline.
    pdf_path = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/Flexstone_Investor_Report_Test.pdf"
    output_directory = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/clearstreet_docs/iqeq_docling_heron_bbox_images"
    file_name = get_file_name_without_extension(pdf_path)
    yolov10_layout_pipeline(file_name, pdf_path, output_directory)
|
load_model (1).py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# from ultralytics import YOLOv10
|
| 3 |
+
import torch
|
| 4 |
+
from config.set_config import set_configuration
|
| 5 |
+
|
| 6 |
+
set_config_project = set_configuration()
|
| 7 |
+
layout_model_weights_path = set_config_project.layout_model_weights_path
|
| 8 |
+
no_of_threads = set_config_project.no_of_threads
|
| 9 |
+
|
| 10 |
+
# def load_model_for_process(detection_model_path=layout_model_weights_path):
|
| 11 |
+
# """
|
| 12 |
+
# Load model in each subprocess to avoid CUDA initialization issues
|
| 13 |
+
|
| 14 |
+
# Returns:
|
| 15 |
+
# Model loaded in appropriate device
|
| 16 |
+
# """
|
| 17 |
+
# # Your model loading logic
|
| 18 |
+
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 19 |
+
# # print(f"Using device: {device}")
|
| 20 |
+
|
| 21 |
+
# model = YOLOv10(detection_model_path).to(device)
|
| 22 |
+
# class_names = model.names
|
| 23 |
+
# class_names["11"] = "Table-header"
|
| 24 |
+
# class_names["12"] = "Portfolio-Company-Table"
|
| 25 |
+
|
| 26 |
+
# return model, class_names
|
| 27 |
+
|
| 28 |
+
import torch
|
| 29 |
+
|
| 30 |
+
from ultralytics import YOLO
|
| 31 |
+
|
| 32 |
+
# def load_model_for_process(detection_model_path=layout_model_weights_path):
|
| 33 |
+
# """
|
| 34 |
+
# Load model in each subprocess to avoid CUDA initialization issues
|
| 35 |
+
|
| 36 |
+
# Returns:
|
| 37 |
+
# Model loaded in appropriate device
|
| 38 |
+
# """
|
| 39 |
+
# # Your model loading logic
|
| 40 |
+
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 41 |
+
# # print(f"Using device: {device}")
|
| 42 |
+
|
| 43 |
+
# model = YOLO(detection_model_path).to(device)
|
| 44 |
+
# class_names = model.names
|
| 45 |
+
# class_names["11"] = "Table-header"
|
| 46 |
+
# class_names["12"] = "Portfolio-Company-Table"
|
| 47 |
+
# print("YOLOV12"*10)
|
| 48 |
+
|
| 49 |
+
# return model, class_names
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
'''Below code for docling heron model'''
|
| 53 |
+
|
| 54 |
+
from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
|
| 55 |
+
# MODEL_NAME_DOCLING = "ds4sd/docling-layout-heron"
|
| 56 |
+
MODEL_NAME_DOCLING = layout_model_weights_path
|
| 57 |
+
|
| 58 |
+
def load_model_for_process(model_name=MODEL_NAME_DOCLING):
    """Instantiate the Docling Heron RT-DETRv2 layout model for one worker.

    Loading happens inside each subprocess so CUDA is initialised per
    worker rather than in the parent process.

    Returns:
        tuple: (model, image_processor, class_names) where class_names maps
        integer class ids to human-readable layout labels.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Processor first, then the detection model on the chosen device.
    image_processor = RTDetrImageProcessor.from_pretrained(model_name)
    model = RTDetrV2ForObjectDetection.from_pretrained(model_name).to(device)

    # Ids 0-16 are the Docling Heron label set; 17-18 are extra labels kept
    # for compatibility with the existing pipeline.
    label_order = [
        "Caption",
        "Footnote",
        "Formula",
        "List-item",
        "Page-footer",
        "Page-header",
        "Picture",
        "Section-header",
        "Table",
        "Text",
        "Title",
        "Document Index",
        "Code",
        "Checkbox-Selected",
        "Checkbox-Unselected",
        "Form",
        "Key-Value Region",
        "Table-header",
        "Portfolio-Company-Table",
    ]
    class_names = dict(enumerate(label_order))

    return model, image_processor, class_names
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
|
ovis_config.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import math
|
| 3 |
+
import random
|
| 4 |
+
import logging
|
| 5 |
+
import cv2
|
| 6 |
+
import numpy as np
|
| 7 |
+
from PIL import Image
|
| 8 |
+
from transformers import AutoModelForCausalLM
|
| 9 |
+
|
| 10 |
+
# Setup logger with proper configuration
logger = logging.getLogger("OvisModel")
logger.setLevel(logging.DEBUG)

# Create console handler if not already exists.  The guard prevents
# duplicate handlers — and therefore duplicated log lines — when this
# module is imported more than once.
if not logger.handlers:
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)

    # Create formatter
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)

    # Add handler to logger
    logger.addHandler(console_handler)
|
| 25 |
+
|
| 26 |
+
# ─── Load model & tokenizers once ─────────────────────────────────────────────
# Module-level singleton: the OVIS model is loaded at import time and shared
# by every call to _run_inference in this module.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL_NAME = "AIDC-AI/Ovis2.5-9B"

# NOTE(review): trust_remote_code=True executes model-repository code on
# load — acceptable only if the model source is pinned and trusted.
_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    multimodal_max_length=32768,
    trust_remote_code=True
).to(DEVICE)
|
| 36 |
+
|
| 37 |
+
def _preprocess_image(img, max_size=1024):
    """Prepare an input image for the OVIS model.

    Loads the image from a path if needed, denoises and Otsu-binarises a
    grayscale copy, converts back to RGB, and downscales so the longest
    side is at most *max_size* pixels.
    """
    if isinstance(img, str):
        img = Image.open(img).convert("RGB")

    # Report the incoming dimensions before any processing.
    orig_w, orig_h = img.size
    logger.info(f"Original image size: {orig_w}x{orig_h} (WxH)")

    # Grayscale -> denoise -> Otsu threshold yields a clean black/white page.
    gray = np.array(img.convert("L"))
    gray = cv2.fastNlMeansDenoising(gray, h=30)
    _, gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    processed_img = Image.fromarray(gray).convert("RGB")

    # Downscale only when the longest side exceeds the budget.
    w, h = processed_img.size
    longest_side = max(w, h)
    if longest_side > max_size:
        scale = max_size / longest_side
        new_size = (int(w * scale), int(h * scale))
        processed_img = processed_img.resize(new_size, Image.LANCZOS)
        logger.info(f"Image resized from {w}x{h} to {new_size[0]}x{new_size[1]} (WxH), scale factor: {scale:.3f}")
    else:
        logger.info(f"Image size {w}x{h} (WxH) - no resizing needed")

    return processed_img
|
| 70 |
+
|
| 71 |
+
def _run_inference(imgs, prompt_text, max_new_tokens):
    """Run one OVIS chat-style generation over optional image(s) plus text.

    Args:
        imgs: None, a single image (path or PIL.Image), or a list of them.
            Only the FIRST image is actually sent to the model.
        prompt_text: user prompt appended after the image content.
        max_new_tokens: generation budget.

    Returns:
        tuple: (decoded_text, confidence) — confidence falls in roughly
        [0.8, 1.0); see the review notes below about its derivation.

    Raises:
        TypeError: for unsupported image inputs.
        ValueError: when neither text nor an image is provided.
    """
    messages_content = []

    if imgs:
        if not isinstance(imgs, list):
            imgs = [imgs]

        # Limit to only 1 image for processing.
        if len(imgs) > 1:
            # Capture the count BEFORE truncating; previously len(imgs) was
            # logged after the slice, so this always reported "1 out of 1".
            original_count = len(imgs)
            imgs = imgs[:1]
            logger.info(f"Limited to processing first 1 out of {original_count} images for OVIS inference")
        logger.info(f"Processing {len(imgs)} image(s) for OVIS inference")

        # Open and preprocess image(s); paths and PIL images both go through
        # the same preprocessing routine.
        pil_imgs = []
        for img in imgs:
            if isinstance(img, (str, Image.Image)):
                pil_img = _preprocess_image(img)
            else:
                raise TypeError(f"Unsupported image type: {type(img)}")
            pil_imgs.append(pil_img)

        # Add preprocessed image(s)
        messages_content.extend([{"type": "image", "image": img} for img in pil_imgs])

    # Add text prompt
    if prompt_text:
        messages_content.append({"type": "text", "text": prompt_text})

    if not messages_content:
        raise ValueError("You must provide at least text or one image.")

    messages = [{"role": "user", "content": messages_content}]

    input_ids, pixel_values, grid_thws = _model.preprocess_inputs(
        messages=messages,
        add_generation_prompt=True
    )

    input_ids = input_ids.to(DEVICE)
    pixel_values = pixel_values.to(DEVICE, dtype=_model.dtype) if pixel_values is not None else None
    grid_thws = grid_thws.to(DEVICE) if grid_thws is not None else None

    with torch.inference_mode():
        outputs = _model.generate(
            inputs=input_ids,
            pixel_values=pixel_values,
            grid_thws=grid_thws,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=_model.text_tokenizer.eos_token_id,
            pad_token_id=_model.text_tokenizer.pad_token_id,
            return_dict_in_generate=True,
            output_scores=True
        )

    decoded = _model.text_tokenizer.decode(outputs.sequences[0], skip_special_tokens=True).strip()

    # Per-token values taken from the raw generation scores.
    # NOTE(review): generate() returns unnormalised logits in .scores, not
    # probabilities — confirm before treating the value below as one.
    gen_len = len(outputs.scores)
    generated_ids = outputs.sequences[0][-gen_len:]
    top_probs = [
        float(score[0, token_id].item())
        for score, token_id in zip(outputs.scores, generated_ids)
    ]
    if top_probs:
        # Geometric mean of the per-token scores.
        confidence = math.exp(sum(math.log(p) for p in top_probs) / len(top_probs))
    else:
        # Model produced no tokens; avoid a ZeroDivisionError.
        confidence = 0.0
    # NOTE(review): this rescaling looks arbitrary — (100 - conf) * 0.015 is
    # ~1.485 for conf in (0, 1], which forces the random fallback branch
    # below on almost every call.  Confirm the intended formula.
    confidence = (100 - confidence) * 0.015

    torch.cuda.empty_cache()

    if confidence < 0.99 and confidence > 0.8:
        return decoded, round(confidence, 2)
    else:
        return decoded, random.uniform(0.8, 0.85)
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
|
post_process_portfolio_company_json 2.py
ADDED
|
@@ -0,0 +1,402 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from fuzzywuzzy import fuzz
|
| 4 |
+
from typing import List, Dict, Any
|
| 5 |
+
from src.iqeq_modification.company_name_extraction_by_ovis import extract_company_names
|
| 6 |
+
|
| 7 |
+
# Header strings that mark a table column as containing portfolio-company
# names.  NOTE(review): "portfolio company" appears twice in this list —
# harmless for membership matching, but the duplicate could be dropped.
PORTFOLIO_COMPANY_LIST_IDENTIFIER = ["column_1","portfolio company or platforms","\u20acm","$m","Unrealised fair market valuation","Realised proceeds in the period","Portfolio Company or Platforms","portfolio company", "active investment", "realized/unrealized company","Realized Company","Unrealized Company", "quoted/unquoted company", "portfolio investment", "portfolio company"]
# Minimum fuzzywuzzy partial_ratio score to count as a match.
# NOTE(review): 30 is extremely permissive — confirm this is intentional.
FUZZY_MATCH_THRESHOLD = 30
# Row values that are aggregates or fund-level labels, not real companies.
EXCLUDE_COMPANY_NAMES = ["total", "subtotal","Total","Investments","Fund"]
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def get_file_name_without_extension(file_path: str) -> str:
    """Return the base name of *file_path* with its final extension removed."""
    base_name = os.path.basename(file_path)
    stem, _extension = os.path.splitext(base_name)
    return stem
|
| 15 |
+
|
| 16 |
+
def fuzzy_match(text: str, patterns: List[str], threshold: int = FUZZY_MATCH_THRESHOLD) -> bool:
    """Return True when *text* fuzzy-matches at least one of *patterns*."""
    needle = str(text).lower()
    return any(
        fuzz.partial_ratio(needle, candidate.lower()) >= threshold
        for candidate in patterns
    )
|
| 23 |
+
|
| 24 |
+
def extract_portfolio_companies_from_table(table_data: Dict) -> List[str]:
    """Extract company names from a portfolio-company table.

    Finds the first column whose header fuzzy-matches a known
    portfolio-company identifier, then collects each row's value in that
    column, skipping aggregate rows such as "Total".

    Args:
        table_data: table dict with "table_column_header" (list of header
            strings) and "table_info" (list of row dicts keyed by header).

    Returns:
        List of company-name strings (possibly empty).
    """
    companies: List[str] = []
    if not table_data.get("table_info"):
        return companies

    # Locate the first header that looks like a company column.
    company_column = None
    for i, header in enumerate(table_data.get("table_column_header", [])):
        if fuzzy_match(header, PORTFOLIO_COMPANY_LIST_IDENTIFIER):
            company_column = i
            break

    if company_column is None:
        return companies

    # Get the column name that contains companies.
    # (Leftover debug print statements — one with a typo — removed so this
    # no longer pollutes stdout on every table.)
    company_column_name = table_data["table_column_header"][company_column]

    # Extract companies, skipping blanks and aggregate rows.
    for row in table_data["table_info"]:
        if not isinstance(row, dict):
            continue
        company_name = str(row.get(company_column_name, "")).strip()
        if company_name and not fuzzy_match(company_name, EXCLUDE_COMPANY_NAMES):
            companies.append(company_name)

    return companies
|
| 54 |
+
|
| 55 |
+
def get_portfolio_company_list(intermediate_data: List[Dict]) -> List[str]:
    """Collect the unique portfolio companies found in every table of the document."""
    found = set()

    for entry in intermediate_data:
        if "table_content" not in entry:
            continue
        # Union in the companies contributed by each table on this entry.
        for table in entry["table_content"]:
            found.update(extract_portfolio_companies_from_table(table))

    return list(found)
|
| 67 |
+
|
| 68 |
+
def merge_content_under_same_header(
    intermediate_data: List[Dict],
    portfolio_company_list: List[str],
    start_index: int
) -> "tuple[Dict, int]":
    """
    Merge content under the same header until next company match is found.

    Starting at ``intermediate_data[start_index]``, consecutive entries that
    share the same ``header`` are folded into one entry; merging stops at
    the first entry whose text or tables mention a portfolio company.

    Returns merged content and the next index to process.
    """
    merged_entry = {
        "header": intermediate_data[start_index]["header"],
        "content": intermediate_data[start_index].get("content", ""),
        "table_content": intermediate_data[start_index].get("table_content", []),
        "label_name": intermediate_data[start_index]["label_name"],
        "page_number": intermediate_data[start_index]["page_number"],
        "pdf_name": intermediate_data[start_index]["pdf_name"]
    }

    current_index = start_index + 1
    while current_index < len(intermediate_data):
        current_entry = intermediate_data[current_index]

        # Check if we're still under the same header
        if current_entry["header"] != merged_entry["header"]:
            break

        # Check if current entry matches any portfolio company
        # (substring test: a company mentioned mid-paragraph also stops merging)
        content_match = any(company in current_entry.get("content", "")
                          for company in portfolio_company_list)
        table_match = False
        for table in current_entry.get("table_content", []):
            if extract_portfolio_companies_from_table(table):
                table_match = True
                break

        if content_match or table_match:
            break

        # Merge content
        if "content" in current_entry:
            if merged_entry["content"]:
                merged_entry["content"] += "\n" + current_entry["content"]
            else:
                merged_entry["content"] = current_entry["content"]

        # Merge tables
        # NOTE(review): extend() mutates the list object borrowed from
        # intermediate_data[start_index], so the source entry's
        # table_content grows too — confirm this aliasing is intended.
        if "table_content" in current_entry:
            merged_entry["table_content"].extend(current_entry["table_content"])

        current_index += 1

    return merged_entry, current_index
|
| 120 |
+
|
| 121 |
+
def process_table_page_ids(merged_output):
    """Merge each entry's page_number with the table pages it references.

    For every merged section that carries ``table_content``, the existing
    comma-separated ``page_number`` value is unioned with each table's
    ``metadata.table_page_id`` and rewritten as a sorted, comma-separated
    string of unique page numbers.

    Args:
        merged_output: list of merged section dicts (mutated in place).

    Returns:
        The same list, with ``page_number`` values updated.
    """
    for current_merged_entry in merged_output:
        # Only process entries that have table_content.
        if 'table_content' not in current_merged_entry:
            continue

        # Seed with the existing page numbers; drop empty/whitespace
        # fragments so a value like "3," cannot crash the int-keyed sort
        # below with int('') (previous behavior).
        raw_pages = current_merged_entry.get('page_number') or ''
        page_numbers = {part.strip() for part in raw_pages.split(',') if part.strip()}

        # Add unique page numbers from table_content metadata.
        for table in current_merged_entry['table_content']:
            if 'metadata' in table and 'table_page_id' in table['metadata']:
                page_numbers.add(str(table['metadata']['table_page_id']))

        # Rewrite as sorted, unique, comma-separated numbers.
        if page_numbers:
            current_merged_entry['page_number'] = ','.join(sorted(page_numbers, key=int))

    return merged_output
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
################################################################################################################
|
| 152 |
+
## Below function for more than one occurence of underlying_assets
|
| 153 |
+
import re

# stopwords to remove (customize for your use case)
STOPWORDS = {"invoice", "copy", "draft", "statement", "report", "doc"}
LEGAL_SUFFIXES = {"pvt", "ltd", "private", "limited", "inc", "co", "company", "llc"}


def clean_company_name(raw_name: str) -> str:
    """Normalize a raw company-name string to a clean, title-cased core name.

    Strips dates, standalone numbers, generic document stopwords and legal
    suffixes (Pvt/Ltd/Inc/...), keeping only the distinctive name tokens.
    """
    # Lower-case and trim so the token filters below are case-insensitive.
    text = raw_name.strip().lower()

    # Drop dates in YYYY-MM-DD / YYYY/MM/DD and DD-MM-YYYY / DD/MM/YYYY form.
    text = re.sub(r"\b\d{4}[-/]\d{2}[-/]\d{2}\b", "", text)
    text = re.sub(r"\b\d{2}[-/]\d{2}[-/]\d{4}\b", "", text)

    # Drop standalone numbers / codes.
    text = re.sub(r"\b\d+\b", "", text)

    # Tokenize on non-word characters, filter document stopwords first,
    # then strip legal suffixes while keeping the core name tokens.
    words = [w for w in re.split(r"\W+", text) if w and w not in STOPWORDS]
    core_words = [w for w in words if w not in LEGAL_SUFFIXES]

    # Reassemble and title-case the remaining tokens.
    return " ".join(core_words).strip().title()
|
| 185 |
+
|
| 186 |
+
def _dedupe_page_numbers(page_number):
    """Collapse a comma-separated page-number string to unique, stripped values (order preserved)."""
    nums = list(dict.fromkeys(str(page_number).split(",")))
    return ",".join(num.strip() for num in nums if num.strip())


def merge_portfolio_company_sections(intermediate_data, table_output_dir):
    """Merge all content and tables under the same portfolio company header
    until the next company header is found.

    Args:
        intermediate_data (list[dict]): Header-to-header document sections.
        table_output_dir (str): Folder of table images passed to the Ovis
            company-name extractor.

    Returns:
        tuple:
            - merged_output: list of merged document sections
            - fuzzy_matched_companies: companies fuzzy-matched in headers
              (accumulated across all entries)
            - portfolio_companies: all portfolio companies found in tables
    """
    # portfolio_companies = get_portfolio_company_list(intermediate_data)
    portfolio_companies = extract_company_names(table_image_folder=table_output_dir)

    print(f"Extracted portfolio companies: {portfolio_companies}")
    portfolio_companies = [clean_company_name(c) for c in portfolio_companies]
    print(f"Clean extracted portfolio companies: {portfolio_companies}")

    merged_output = []
    # BUG FIX: accumulate fuzzy matches across ALL entries.  Previously the
    # per-entry result overwrote this name each iteration (so only the last
    # entry's matches were returned) and a NameError was raised when
    # intermediate_data was empty.
    fuzzy_matched_companies = []
    current_chunk = None
    active_company = None

    for entry in intermediate_data:
        entry_copy = entry.copy()

        header_companies, entry_fuzzy_matches = match_company_names(entry["header"], portfolio_companies)
        for company in entry_fuzzy_matches:
            if company not in fuzzy_matched_companies:
                fuzzy_matched_companies.append(company)

        if header_companies:
            print("&" * 100)
            print("*" * 100)
            print("entry_header::", entry["header"])
            print("page number of header::", entry["page_number"])
            print("*" * 100)
            print("header_companies::", header_companies)
            print("*" * 100)

            # If we have an active chunk, finalize it before starting a new one.
            if current_chunk:
                merged_output.append(current_chunk)
                current_chunk = None
                active_company = None

            # Start a new chunk with the first matched company
            # (when multiple companies matched, we take the first one).
            active_company = header_companies[0]
            current_chunk = {
                "page_number": entry["page_number"],
                "pdf_name": entry["pdf_name"],
                "header": entry["header"],
                "label_name": entry["label_name"],
                "content": entry.get("content", ""),
                "table_content": entry.get("table_content", []),
                "matched_company": active_company,
            }

            # If multiple companies matched, create separate chunks for the others.
            for additional_company in header_companies[1:]:
                merged_output.append({
                    "page_number": entry["page_number"],
                    "pdf_name": entry["pdf_name"],
                    "header": entry["header"],
                    "label_name": entry["label_name"],
                    "content": entry.get("content", ""),
                    "table_content": entry.get("table_content", []),
                    "matched_company": additional_company,
                })

        elif current_chunk:
            # No new company detected: keep extending the current chunk.
            if "content" in entry:
                if current_chunk["content"]:
                    current_chunk["content"] += "\n\n" + entry["content"]
                    current_chunk["page_number"] += "," + str(entry["page_number"])
                    current_chunk["page_number"] = _dedupe_page_numbers(current_chunk["page_number"])
                else:
                    current_chunk["content"] = entry["content"]
                    current_chunk["page_number"] = str(entry["page_number"])

            if "table_content" in entry:
                current_chunk["table_content"].extend(entry["table_content"])
                if current_chunk["page_number"]:
                    # BUG FIX: entry["table_content"] is a LIST of table dicts,
                    # so the old check `"metadata" in entry["table_content"]`
                    # tested list membership and never fired.  Pull the
                    # table_page_id out of each table's metadata instead.
                    for table in entry["table_content"]:
                        if isinstance(table, dict) and "table_page_id" in table.get("metadata", {}):
                            current_chunk["page_number"] += "," + str(table["metadata"]["table_page_id"])

                    current_chunk["page_number"] += "," + str(entry["page_number"])
                    current_chunk["page_number"] = _dedupe_page_numbers(current_chunk["page_number"])

        else:
            # Content before any company section: pass through with unique page numbers.
            entry_copy = entry.copy()
            if "page_number" in entry_copy:
                entry_copy["page_number"] = _dedupe_page_numbers(entry_copy["page_number"])
            merged_output.append(entry_copy)

    # Add the last active chunk if it exists.
    if current_chunk:
        # BUG FIX: the deduped page numbers were previously written to
        # entry_copy instead of current_chunk, so the final chunk kept
        # duplicate page numbers.
        current_chunk["page_number"] = _dedupe_page_numbers(current_chunk["page_number"])
        merged_output.append(current_chunk)

    merged_output_new = process_table_page_ids(merged_output=merged_output)

    return merged_output_new, fuzzy_matched_companies, portfolio_companies
|
| 299 |
+
|
| 300 |
+
################################################################################################
|
| 301 |
+
|
 | 302 | 
+
 ## Below code for using abbreviation functionality
 | 
| 303 |
+
|
| 304 |
+
import re

def match_company_names(header_text: str, companies: List[str], threshold: int = FUZZY_MATCH_THRESHOLD) -> "tuple[list[str], list[str]]":
    """Match company names in a header, trying full text and abbreviations.

    First compares the header (and abbreviated forms of it) against each full
    company name; when that fails, compares the header against abbreviated
    forms of the company name.  Matching uses ``fuzz.partial_ratio``.

    Args:
        header_text: Section header text to search in.
        companies: Candidate portfolio company names.
        threshold: Minimum partial-ratio score (0-100) to accept a match.

    Returns:
        ``(matched_companies, fuzzy_matched_companies)``: both duplicate-free
        and order-preserving; currently identical, since every match is also
        recorded as a fuzzy match.  (BUG FIX: annotation previously claimed a
        single ``List[str]`` although a 2-tuple was returned.)
    """
    header_text = str(header_text).lower().strip()
    matched_companies = []
    fuzzy_matched_companies = []

    # Candidate abbreviated forms of the header itself.
    header_abbreviations = [
        ''.join(word[0] for word in header_text.split() if word),  # initials of each word
        re.sub(r'[aeiou\s]', '', header_text),                     # vowels and spaces removed
        header_text.replace(' ', '')                               # spaces removed
    ]

    for company in companies:
        company_lower = company.lower()

        # First check: header text (full or abbreviated) against company full name.
        for header_pattern in [header_text] + header_abbreviations:
            if fuzz.partial_ratio(header_pattern, company_lower) >= threshold:
                matched_companies.append(company)
                fuzzy_matched_companies.append(company)  # record as fuzzy match
                break
        else:
            # Second check: header text against company abbreviations.
            company_abbreviations = [
                ''.join(word[0] for word in company_lower.split() if word),  # initials of each word
                re.sub(r'[aeiou\s]', '', company_lower),                     # vowels and spaces removed
                company_lower.replace(' ', '')                               # spaces removed
            ]
            for company_pattern in company_abbreviations:
                if fuzz.partial_ratio(header_text, company_pattern) >= threshold:
                    matched_companies.append(company)
                    fuzzy_matched_companies.append(company)  # record as fuzzy match
                    break

    # Remove duplicates while preserving order.
    matched_companies = list(dict.fromkeys(matched_companies))
    fuzzy_matched_companies = list(dict.fromkeys(fuzzy_matched_companies))

    return matched_companies, fuzzy_matched_companies
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
################################################################################################################
|
| 349 |
+
|
| 350 |
+
def process_document_company_wise(
    intermediate_str_chunk_json: List[Dict],
    output_directory: str,
    file_name: str,
    table_output_directory: str,
) -> List[Dict]:
    """Merge a header-to-header chunk JSON company-wise and persist the result.

    Args:
        intermediate_str_chunk_json: Parsed chunk entries, or a JSON string of them.
        output_directory: Directory for the merged-output JSON (created if missing).
        file_name: Base name used for the output file.
        table_output_directory: Folder of table images used for company-name
            extraction.

    Returns:
        The merged content list, also written to
        ``<output_directory>/<file_name>_h2h_merged_output.json``.
    """
    # Accept a raw JSON string as well as already-parsed data.
    if isinstance(intermediate_str_chunk_json, str):
        intermediate_str_chunk_json = json.loads(intermediate_str_chunk_json)

    merged_content, matched_company_list, portfolio_company_list = merge_portfolio_company_sections(
        intermediate_str_chunk_json, table_output_directory
    )

    # BUG FIX: guard against an empty document before indexing [0].
    if merged_content:
        merged_content[0]["portfolio_companies_list_fuzzy_matched"] = matched_company_list
        merged_content[0]["portfolio_companies_list_before"] = portfolio_company_list

    print("matched_company_list::", matched_company_list)
    print("portfolio_company_list::", portfolio_company_list)

    # Ensure output directory exists.
    os.makedirs(output_directory, exist_ok=True)

    # Save output.
    output_path = os.path.join(output_directory, f"{file_name}_h2h_merged_output.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(merged_content, f, indent=4, ensure_ascii=False)
    print(f"Saved merged output to {output_path}")

    return merged_content
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def read_json(file_path):
    """Read *file_path* as UTF-8 JSON and return the parsed object."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
# # Example usage
|
| 389 |
+
# Example usage
if __name__ == "__main__":
    input_str_chunk_json_path = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/Triton2023Q4_patria_sample_output/Triton2023Q4_patria_sample_json_output/Triton2023Q4_patria_sample_final_h2h_extraction.json"
    input_json = read_json(input_str_chunk_json_path)

    # BUG FIX: process_document_company_wise requires table_output_directory;
    # the previous call omitted it and raised TypeError before doing any work.
    # TODO(review): confirm the correct table-image folder for this sample run.
    result = process_document_company_wise(
        intermediate_str_chunk_json=input_json,
        output_directory="db_structured_chunking/structure_chunking/src/iqeq_modification/testing_sample/output",
        file_name="sample_report",
        table_output_directory="db_structured_chunking/structure_chunking/src/iqeq_modification/testing_sample/table_images",
    )

    print("Processing complete.")
    # print(json.dumps(result, indent=2))
|
| 402 |
+
|
rabbitmq_config_investor_report.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

# RabbitMQ connection configuration.
# Every value can be overridden via environment variables; the literals below
# are fallback defaults only.
RABBITMQ = {
    "HOST": os.getenv("RABBITMQ_HOST", "10.221.162.2"),
    "PORT": int(os.getenv("RABBITMQ_PORT", 5672)),
    "VIRTUAL_HOST": os.getenv("RABBITMQ_VHOST", "/"),
    "USERNAME": os.getenv("RABBITMQ_USER", "iqeq"),
    # SECURITY NOTE(review): a real-looking credential is hard-coded as the
    # fallback default — move it to the environment / a secrets store and
    # drop the literal.
    "PASSWORD": os.getenv("RABBITMQ_PASS", "Wissen@123"),
    # Exchange settings
    "EXCHANGE_NAME": os.getenv("RABBITMQ_EXCHANGE", "priority_topic_exchange"),
    "EXCHANGE_TYPE": os.getenv("RABBITMQ_EXCHANGE_TYPE", "topic"),
    # Queue names
    "QUEUES": {
        "INPUT_FILE_QUEUE": os.getenv("INPUT_FILE_QUEUE", "structure_chunking_input_file_queue"),
        # "FILE_RESPONSE_QUEUE": os.getenv("FILE_RESPONSE_QUEUE", "structure_chunking_file_response_queue"),,
        "FILE_RESPONSE_QUEUE": os.getenv("FILE_RESPONSE_QUEUE", "IQEQ_Response")
    }
}
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|