Kushalguptaiitb committed on
Commit
b533173
·
verified ·
1 Parent(s): 608282b

Upload 8 files

Browse files
company_name_extraction_by_ovis.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import logging
4
+ from src.iqeq_modification.ovis_config import _run_inference
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ def extract_company_names(table_image_folder: str):
9
+ logger.info("=" * 80)
10
+ logger.info("STARTED COMPANY NAME EXTRACTION USING OVIS")
11
+ logger.info(f"Image folder: {table_image_folder}")
12
+ logger.info("=" * 80)
13
+
14
+ #Load all images from folder
15
+ supported_ext = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.webp')
16
+ image_paths = [
17
+ os.path.join(table_image_folder, f)
18
+ for f in sorted(os.listdir(table_image_folder))
19
+ if f.lower().endswith(supported_ext)
20
+ ]
21
+
22
+ if not image_paths:
23
+ logger.warning("No valid images found in the folder.")
24
+ return []
25
+
26
+ logger.info(f"Found {len(image_paths)} image(s) for inference")
27
+
28
+ prompt = (
29
+ "You are an expert financial document analysis model specialized in reading tables from investor reports. "
30
+ "You are given an image of a table that may contain portfolio or investee company details such as company names, fund names, sectors, and investment amounts.\n\n"
31
+ "Your task:\n"
32
+ "1. Identify if the table contains portfolio or investee company information.\n"
33
+ "2. Extract only the actual **company or investee organization names**, excluding fund names, co-investment entities, management labels, or generic terms.\n"
34
+ "3. Do NOT include partial terms, descriptors, or words like 'Fund', 'Holdings', 'Co-Investment', 'Management', 'Other Unitholders', 'Endurance', 'Growth', 'PIK', etc.\n"
35
+ "4. Remove duplicates and retain only unique, meaningful company names.\n"
36
+ "5. Each extracted name should be a clean, full company name (e.g., 'Kate Spade Ltd', 'Milano Ventures Pvt Ltd').\n\n"
37
+ "Return your final answer strictly as a valid JSON list of strings, for example:\n"
38
+ "[\"Kate Spade Pvt Ltd\", \"Milano Ventures\", \"XYZ Technologies\"]\n\n"
39
+ "If no valid company names are found, return [] only.\n"
40
+ "Do not include any explanations, reasoning, or text outside the JSON list."
41
+ )
42
+
43
+ company_names = set()
44
+
45
+ #Chunk fields to avoid prompt overflow
46
+ for img in image_paths:
47
+ try:
48
+ raw, _ = _run_inference(img, prompt, max_new_tokens=2048)
49
+ print(f"raw result: {raw}")
50
+ if not raw:
51
+ continue
52
+ try:
53
+ names = json.loads(raw)
54
+ if isinstance(names, list):
55
+ for name in names:
56
+ if isinstance(name, str) and name.strip():
57
+ company_names.add(name.strip())
58
+ except json.JSONDecodeError:
59
+ logger.warning(f"Failed to parse OVIS output for {img}: {raw}")
60
+
61
+ except Exception as e:
62
+ logger.error(f"OVIS inference failed for {img}: {e}", exc_info=True)
63
+ continue
64
+
65
+ logger.info(f"Extracted {len(company_names)} unique company name(s).")
66
+ print("k"*100)
67
+ return list(company_names)
68
+
69
+ # print(extract_company_names("/shared_disk/kushal/land_contract/company tables"))
iqeq_app_latest (4).py ADDED
@@ -0,0 +1,1437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, HTTPException, Depends, File, Request, Form
2
+ from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from fastapi.responses import JSONResponse, FileResponse
5
+ from urllib.parse import quote
6
+ from typing import List, Annotated,Dict,Optional,Any
7
+ import uvicorn
8
+ import sys
9
+ import json
10
+
11
+
12
+ sys.path.append("/shared_disk/kushal/db_str_chunking/new_ws_structured_code/db_structured_chunking/structure_chunking")
13
+ # from config.set_config.set_configuration import set_config_project
14
+
15
+ from config.set_config import set_configuration
16
+ set_config_project = set_configuration()
17
+
18
+ project_output_directory_path= set_config_project.project_output_directory_path
19
+ project_path = set_config_project.project_path
20
+
21
+ from src.table_processing.table_filter import filtering_table_pipeline
22
+ # from src.qa_correction.user_action_modification import process_actions_and_create_new_file
23
+ from src.table_processing.tree_structured_json import tree_structured_headers_pipeline,tree_structured_headers_content_pipeline
24
+ from src.pre_processing.create_searchable_pdf_old import create_json_pdfminer_pipeline
25
+
26
+ from src.post_processing.clean_dataframe import clean_dataframe
27
+ from src.table_processing.merge_headers_tree_structure import merge_blocks
28
+
29
+ from src.table_processing.create_and_put_table_header import main_pipeline_create_put_table_headers
30
+ from src.table_processing.map_table_with_table_header import map_table_with_its_header
31
+
32
+ # from src.table_processing.table_merge import merge_multi_page_tables_pipeline
33
+
34
+ # from other_code.save_classified_pdf_json_to_excel import create_directories_and_sheets
35
+
36
+ # from src.table_extraction_from_word_csv.word_extraction import main_table_extraction_from_docx
37
+ # from src.table_extraction_from_word_csv.xlsx_extraction import extract_and_save_tables_from_excel
38
+ # from src.table_extraction_from_word_csv.csv_extraction import extract_and_save_tables_from_csv
39
+ # from src.table_extraction_from_word_csv.classify_table_headers import process_main_classifier,get_csv_file_paths,save_classify_files, clean_filename
40
+
41
+ # from src.iqeq_modification.sorting_headers_v2 import filter_and_sort_headers
42
+ # from src.iqeq_modification.portfolio_summary_dynamic_classification import map_company_data
43
+ from src.toc_based_extraction.main_pipeline_toc_based_extraction import customised_toc_extraction_pipeline
44
+ from src.iqeq_modification.post_processing_iqeq import read_json,main_header_pipeline
45
+ from src.iqeq_modification.post_process_portfolio_company_json import process_document_company_wise
46
+
47
+ # from src.filter_pdf_pages_scope3.fuzzy_match_keywords import custom_pipeline_for_filter_keywords_pages_text_search
48
+ # from src.filter_pdf_pages_scope3.keywords_matching import custom_pipeline_for_filter_keywords_pages_tfidf_vector
49
+ # from src.filter_pdf_pages_scope3.keywords_matching_create_pdf import custom_pipeline_for_filter_keywords_pages_sentence_embedding
50
+
51
+ # from src.layout_detection.layout_detection import yolov10_layout_pipeline,get_file_name_without_extension
52
+ from src.layout_detection.layout_detection_docling_heron import yolov10_layout_pipeline,get_file_name_without_extension
53
+
54
+ # from src.table_merge.table_merge_v2 import merge_multi_page_tables_pipeline_v2
55
+ # from src.table_merge.table_merge_new import merge_multi_page_tables_pipeline_v2
56
+ from src.table_merge.table_merge_v5 import merge_multi_page_tables_pipeline_v2
57
+ from src.table_query.query_code_openai import get_query_response
58
+
59
+ from src.custom_headers.pdf_header_detector import process_pdf_for_headers
60
+ from src.custom_headers.consolidate_header_jsons import pipeline_for_merging_headers
61
+
62
+ from utils.utils_code import clear_directory
63
+
64
+ import logging,os
65
+ from logging.config import dictConfig
66
+ import shutil
67
+ import re
68
+ from fastapi import HTTPException, Form
69
+ from src.classification.column_classifier_v2 import classify_column_headers
70
+ from src.classification.classification import perform_classification
71
+
72
# --- Logging setup -----------------------------------------------------------
log_folder = "logs"
os.makedirs(log_folder, exist_ok=True)

# All records go both to stdout and to logs/app.log (appended across runs).
log_file_path = os.path.join(log_folder, "app.log")

# One shared record format for both handlers.
_detailed_formatter = {
    'format': '%(asctime)s - %(name)s - %(levelname)s - %(pathname)s:%(lineno)d - %(message)s',
    'datefmt': '%Y-%m-%d %H:%M:%S',
}

_console_handler = {
    'class': 'logging.StreamHandler',
    'level': 'INFO',
    'formatter': 'detailed',
    'stream': 'ext://sys.stdout',
}

_file_handler = {
    'class': 'logging.FileHandler',
    'level': 'INFO',
    'formatter': 'detailed',
    'filename': log_file_path,
    'mode': 'a',
}

logging_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {'detailed': _detailed_formatter},
    'handlers': {'console': _console_handler, 'file': _file_handler},
    'loggers': {
        # Root logger: everything funnels through both handlers.
        '': {'handlers': ['console', 'file'], 'level': 'INFO', 'propagate': True},
        # uvicorn keeps its own handlers; propagate=False avoids double emission.
        'uvicorn': {'handlers': ['console', 'file'], 'level': 'INFO', 'propagate': False},
    },
}

# Apply the configuration and create this module's logger.
dictConfig(logging_config)
logger = logging.getLogger(__name__)
122
+
123
# FastAPI application with fully open CORS (all origins/methods/headers allowed).
app = FastAPI()

_cors_options = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_options)
132
+
133
+
134
def _make_workspace_dir(relative_path):
    """Resolve *relative_path* under the project output root and ensure it exists."""
    full_path = os.path.join(project_output_directory_path, relative_path)
    os.makedirs(full_path, exist_ok=True)
    return full_path

# PDF extraction workspace (aliases kept for the endpoint code below).
pdf_input_path = _make_workspace_dir("pdf_extraction/input")
pdf_input_directory = pdf_input_path
pdf_output_path = _make_workspace_dir("pdf_extraction/output")
output_directory = pdf_output_path

# Word/CSV extraction workspace.
word_input_path = _make_workspace_dir("word_csv_extraction/directory/input")
word_input_directory_path = word_input_path
word_output_path = _make_workspace_dir("word_csv_extraction/directory/output")
word_output_directory_path = word_output_path

# In-memory per-document state, rebuilt on each /structured_chunking_extract call.
document_data = {}
151
+
152
@app.post("/structured_chunking_extract")
async def upload_documents(request: Request, path: str = Form()) :
    """Run the full structured-chunking pipeline on a server-local PDF.

    Given an absolute PDF path (form field), this endpoint: clears the working
    directories, copies the PDF into the input workspace, runs layout detection,
    table filtering/merging, header detection and company-wise post-processing,
    writes the final JSON next to the input PDF, and returns its path.

    NOTE(review): this endpoint mutates module-global ``document_data`` and
    wipes shared directories on every call, so concurrent requests will clobber
    each other — confirm single-client usage before deploying.
    """
    # path = eval(f'{path}')
    print(f'started for path: {path}')
    base_url = str(request.base_url)
    # Reset per-document state; only one document is tracked at a time.
    global document_data
    document_data = {}
    pdf_path = path
    # Clear all four working directories from previous runs.
    clear_directory(pdf_input_path)
    clear_directory(pdf_output_path)
    clear_directory(word_input_path)
    clear_directory(word_output_path)

    # Initialize response structure (used only for early validation failures).
    response = {
        "success": False,
        "message": "",
        # "data": None
    }

    # Check if the provided path is a PDF file
    if not pdf_path.lower().endswith(".pdf"):
        response["message"] = "Invalid file type. Only PDF files are accepted."
        return response

    # Extract filename
    file_name_with_ext = os.path.basename(pdf_path)
    file_name_without_ext = os.path.splitext(file_name_with_ext)[0]

    # Create destination path in input directory
    destination_path = os.path.join(pdf_input_directory, file_name_with_ext)

    try:
        # Copy the file to our input directory
        shutil.copy2(pdf_path, destination_path)
    except Exception as e:
        response["message"] = f"Failed to copy file: {str(e)}"
        return response

    # Per-document output tree: output/<name>_output/table_output/...
    output_directory_path = os.path.join(output_directory)
    os.makedirs(output_directory_path, exist_ok=True)
    file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
    os.makedirs(file_output, exist_ok=True)

    table_output_path = os.path.join(file_output, f"table_output")
    os.makedirs(table_output_path, exist_ok=True)
    file_location = destination_path

    # Pipeline processing

    # Layout detection (docling/heron YOLO pipeline). The 15-tuple unpack must
    # match yolov10_layout_pipeline's return exactly — keep in sync with it.
    json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path,cropped_tables_images_dir_path,_ = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
    table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)

    # Detect custom headers directly from the PDF and merge them into the model headers.
    custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output)
    header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext)

    # Initialize data for the new document
    document_data[file_name_with_ext] = {

        "pdf_path": destination_path,
        "pdf_file_name": file_name_with_ext,
        "model_json_header_output_filepath": [],
        "model_json_layout_output_filepath": [],
        "tree_structured_header_json_filepath": [],
        "user_modified_json_output_filepath": [],
        'user_modified_table_json_filepath': [],
        "frontend_output_json": [],
        "cluster_json": [],
        "id_2_label" : [],
        "file_output_dir" : [],
        "table_output_dir": [],
        "table_with_header_data" : [],
        "table_with_header_json_path" : [],
        "json_output_dir": [],
        "pdf_miner_json_path": [] ,
        "searchable_pdf_path" : []
    }

    # Store paths and filenames
    document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
    document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
    document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
    document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
    document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
    document_data[file_name_with_ext]["file_output_dir"].append(file_output)
    document_data[file_name_with_ext]["id_2_label"].append(class_names)
    document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
    document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
    document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
    document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)

    file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
    pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
    pdf_path = document_data[file_name_with_ext]["pdf_path"]
    user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]

    # PDFMiner text extraction; also produces a searchable copy of the PDF.
    pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)

    table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
    document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)

    document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)

    # Process image URLs
    # NOTE(review): this loop rebinds `file_name` from the pipeline above —
    # presumably intentional since it is recomputed later; verify.
    pdf_images_urls = []
    for file_name in os.listdir(pdf_images_path):
        file_path = os.path.join(pdf_images_path, file_name)
        if file_name.endswith((".jpg", ".jpeg", ".png")):
            img_url = base_url + "image/" + str(quote(file_path))
            pdf_images_urls.append(img_url)

    # Sort image URLs by page number
    def extract_page_no(url):
        # Assumes filenames end in "..._<page>.<ext>" — TODO confirm naming.
        return int(url.split("_")[-1].split(".")[0])
    sorted_urls = sorted(pdf_images_urls, key=extract_page_no)

    # Create page details
    page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]

    # Store the JSON output
    document_data[file_name_with_ext]["frontend_output_json"].append({
        "layout_output_json_data": layout_output_json_data,
        "layout_json_list_data": layout_list_data,
        "id_2_label": class_names,
        "header_output_json_data": header_output_json_data,
        "table_output_json_data": table_json_data,
        "table_output_json_data_list": table_json_data_list,
        "tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
        "pdf_images_urls": page_details,
    })

    document_id_name = file_name_with_ext

    data = document_data[document_id_name]
    file_output_dir = data["file_output_dir"][0]
    json_output_dir = data["json_output_dir"][0]
    pdf_file_name = data["pdf_file_name"]
    pdf_path = data["pdf_path"]

    # PDFMiner processing
    pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
    modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
    model_modified_json = read_json(modified_json_output_filepath)
    pdfminer_json = read_json(pdf_miner_json_filepath)
    searchable_pdf_path = data["searchable_pdf_path"][0]

    # table_merged_json_path = merge_multi_page_tables_pipeline(modified_json_output_filepath, pdf_file_name, file_output_dir)

    # Merge tables that span multiple pages (v5 implementation).
    table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)

    table_merged_json = read_json(table_merged_json_path)

    table_mapped_modified_json = map_table_with_its_header(table_merged_json)

    # table_mapped_modified_json = map_table_with_its_header(model_modified_json)

    # Main header pipeline
    df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)

    clean_df, clean_df_json = clean_dataframe(df_final)

    # clean_dataframe may return its JSON form as a string; normalize to dict.
    if isinstance(clean_df_json, str):
        # print("clean_df_json::",clean_df_json)
        # clean_df_json = eval(clean_df_json)
        clean_df_json = json.loads(clean_df_json)

    # Regroup extracted content by portfolio company.
    file_name = get_file_name_without_extension(pdf_file_name)
    merged_content_company_wise_df = process_document_company_wise(clean_df_json,output_directory=json_output_dir,file_name=file_name,table_output_directory=cropped_tables_images_dir_path)
    company_wise_clean_df, company_wise_clean_df_json = clean_dataframe(merged_content_company_wise_df)

    # Persist both the flat and the company-wise extraction results.
    json_output_filename = file_name + "_final_h2h_extraction.json"
    final_json_output_filepath = os.path.join(json_output_dir, json_output_filename)

    with open(final_json_output_filepath, 'w') as f:
        json.dump(clean_df_json, f, indent=4)

    company_wise_json_output_filename = file_name + "_final_h2h_extraction_company_wise.json"
    company_wise_final_json_output_filepath = os.path.join(json_output_dir, company_wise_json_output_filename)

    with open(company_wise_final_json_output_filepath, 'w') as f:
        json.dump(merged_content_company_wise_df, f, indent=4)

    # Tree-structured header content
    # final_tree_structred_header_content = tree_structured_headers_content_pipeline(header_content_json_data, json_output_dir, pdf_file_name)

    # final_tree_structred_header_content = merge_blocks(final_tree_structred_header_content)

    # document_data[document_id_name]["cluster_json"].append(final_tree_structred_header_content)

    # final_toc_h2h_extraction = customised_toc_extraction_pipeline(pdf_path=searchable_pdf_path,yolo_detection_json_path=modified_json_output_filepath,output_directory=file_output_dir)

    # Step 1: Extract directory and filename without extension
    # The final JSON is written next to the ORIGINAL input PDF, not the workspace.
    pdf_path = path
    json_directory = os.path.dirname(pdf_path)
    json_filename = os.path.splitext(os.path.basename(pdf_path))[0]

    # Step 2: Define output path for JSON
    output_json_path = os.path.join(json_directory, f"{json_filename }.json")

    # If your variable is a JSON string, convert it to dict first
    if isinstance(company_wise_clean_df_json, str):
        company_wise_clean_df_json = json.loads(company_wise_clean_df_json)

    # Step 3: Save JSON
    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(company_wise_clean_df_json, json_file, ensure_ascii=False, indent=4)

    # # post-processing results
    # post_processing_results = {
    #     document_id_name : {
    #         # "df_download": json.dumps(clean_df.to_csv(index=False, escapechar='\\', encoding='utf-8')),
    #         # "df_download_json" : clean_df_json,
    #         "df_download_json": company_wise_clean_df_json,
    #         "tree_structured_header_content": final_tree_structred_header_content,
    #         "file_name": document_id_name,
    #         # "classified_dynamic_json": dynamic_mapped_data_json,
    #         "toc_df_download_json" : final_toc_h2h_extraction
    #     }
    # }

    response_final = {
        "status_code": 200,
        # "message":"",
        # "df_download_json": company_wise_clean_df_json,
        "saved_json_path": output_json_path
    }

    return response_final
390
+
391
+
392
@app.get("/image/{path:path}")
async def get_image(path: str):
    """Serve an image file from an absolute server path.

    BUG FIX: previously every image was served with Content-Type image/jpeg,
    which mislabels PNGs; the media type is now chosen from the extension.

    Raises:
        HTTPException(404): If the file does not exist.
    """
    if os.path.exists(path):
        ext = os.path.splitext(path)[1].lower()
        media_type = {
            ".png": "image/png",
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
        }.get(ext, "image/jpeg")  # keep the original jpeg default for other types
        return FileResponse(path, media_type=media_type)
    else:
        raise HTTPException(status_code=404, detail="Image not found")
398
+
399
@app.get("/file/{path:path}")
async def get_file(path: str):
    """Download an arbitrary server-side file with a best-effort media type.

    BUG FIX: the filename was previously derived via a manual "/" split
    (fragile for trailing slashes and non-POSIX separators); it now uses
    os.path.basename. CSV/XLSX extensions are also matched case-insensitively.

    Raises:
        HTTPException(404): If the file does not exist.
    """
    if os.path.exists(path):
        filename = os.path.basename(path)
        lower = path.lower()
        if lower.endswith('.csv'):
            media_type = "text/csv"
        elif lower.endswith('.xlsx'):
            media_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        else:
            media_type = "application/octet-stream"
        return FileResponse(path, media_type=media_type, filename=filename)
    else:
        raise HTTPException(status_code=404, detail="File not found")
413
+
414
+
415
@app.post("/table-classification")
async def table_classification(
    structured_chunk_json_path: str = Form(...),
    class_keywords_table: str = Form(...),
    header_categories: Optional[str] = Form("table_column_header"),
    similarity_threshold: Optional[float] = Form(0.4)
):
    """Classify tables in a structured-chunk JSON against keyword classes.

    Form fields:
        structured_chunk_json_path: Server path to the structured-chunk JSON.
        class_keywords_table: JSON string mapping class name -> list of keywords.
        header_categories: Header category label used by the classifier.
        similarity_threshold: Minimum similarity for a match.

    Raises:
        HTTPException(422): On invalid input or any processing failure.
    """
    try:
        with open(structured_chunk_json_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Remove trailing commas before closing braces/brackets so slightly
        # malformed JSON still parses.
        cleaned_content = re.sub(r',\s*([\]}])', r'\1', content)

        # Parse the cleaned JSON
        structured_chunk_data = json.loads(cleaned_content)

        # class_keywords_table arrives as a form string; parse it and validate
        # that it is a dict[str, list[str]].
        if isinstance(class_keywords_table, str):
            try:
                class_keywords_table = json.loads(class_keywords_table)

                if not isinstance(class_keywords_table, dict):
                    raise ValueError("class_keywords_table must be a dictionary")
                if not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
                           for key, value in class_keywords_table.items()):
                    raise ValueError("class_keywords_table must be a dictionary with string keys and lists of strings as values")
            except json.JSONDecodeError:
                raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table"})

        elif not isinstance(class_keywords_table, dict) or not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
                                                                   for key, value in class_keywords_table.items()):
            raise HTTPException(status_code=422, detail={"error": "class_keywords_table must be a dictionary with string keys and lists of strings as values"})

        # Perform classification
        categorized_headers = perform_classification(
            data=structured_chunk_data,
            class_keywords=class_keywords_table,
            header_categories=header_categories,
            similarity_threshold=similarity_threshold
        )
        return categorized_headers
    except HTTPException:
        # BUG FIX: the broad `except Exception` below used to swallow the
        # intentionally raised 422s above and re-wrap them, losing their
        # structured detail; re-raise them unchanged instead.
        raise
    except ValueError as e:
        raise HTTPException(status_code=422, detail={"error": "Input validation failed", "message": str(e)})
    except Exception as e:
        raise HTTPException(status_code=422, detail={"error": "Processing failed", "message": str(e)})
464
+
465
+
466
@app.post("/table-column-classification")
async def table_column_classification(
    input_table_classified_json: Annotated[str, Form()],
    class_keywords_table_column: Annotated[str, Form()],
    filter_table_classifier_name: Annotated[str, Form()],
    similarity_threshold: Annotated[str, Form()]
):
    """Classify the column headers of a previously classified table.

    Form fields:
        input_table_classified_json: JSON string — output of /table-classification.
        class_keywords_table_column: JSON string mapping class name -> keyword list.
        filter_table_classifier_name: Table-class name whose columns to classify.
        similarity_threshold: Numeric string, parsed as float (e.g. "0.4").

    Raises:
        HTTPException(422): On malformed JSON or a non-numeric threshold.
    """
    try:
        # Parse JSON strings into dictionaries
        input_table_classified_json = json.loads(input_table_classified_json)
        class_keywords_table_column = json.loads(class_keywords_table_column)
    except json.JSONDecodeError as e:
        raise HTTPException(status_code=422, detail={"error": "Invalid JSON format", "message": str(e)})

    try:
        # Convert similarity_threshold to a float.
        # BUG FIX: comment and error message previously said "integer" while
        # the code converts to float — the client-facing message now matches.
        similarity_threshold = float(similarity_threshold)
    except ValueError as e:
        raise HTTPException(status_code=422, detail={"error": "Similarity threshold must be a valid number", "message": str(e)})

    column_classification_results = classify_column_headers(
        json_data=input_table_classified_json,
        class_keywords=class_keywords_table_column,
        filter_table_classifier_name=filter_table_classifier_name,
        similarity_threshold=similarity_threshold
    )

    results = {"column_classification_result": column_classification_results}

    return results
499
+
500
+
501
# Run the server
if __name__ == "__main__":
    # uvicorn.run("app:app", host="0.0.0.0", port=7061, log_level="info", reload=True)
    # Serve the app directly (no auto-reload) on all interfaces, port 7063.
    uvicorn.run( app, host="0.0.0.0", port=7063,log_level="info")
    # uvicorn.run( app, host="0.0.0.0", port=5052,log_level="info")
506
+
507
+
508
+
509
+ def upload_documents(path) :
510
+ # path = eval(f'{path}')
511
+ request = Request
512
+ print(f'started for path: {path}')
513
+ base_url = str(request.base_url)
514
+ global document_data
515
+ document_data = {}
516
+ pdf_path = path
517
+
518
+ clear_directory(pdf_input_path)
519
+ clear_directory(pdf_output_path)
520
+ clear_directory(word_input_path)
521
+ clear_directory(word_output_path)
522
+
523
+ # Initialize response structure
524
+ response = {
525
+ "success": False,
526
+ "message": "",
527
+ # "data": None
528
+ }
529
+
530
+
531
+ if not pdf_path:
532
+ response["message"] = "No file path provided."
533
+ return response
534
+
535
+ # Check if the provided path is a PDF file
536
+ if not pdf_path.lower().endswith(".pdf"):
537
+ response["message"] = "Invalid file type. Only PDF files are accepted."
538
+ return response
539
+
540
+
541
+ # Extract filename
542
+ file_name_with_ext = os.path.basename(pdf_path)
543
+ file_name_without_ext = os.path.splitext(file_name_with_ext)[0]
544
+
545
+ # Create destination path in input directory
546
+ destination_path = os.path.join(pdf_input_directory, file_name_with_ext)
547
+
548
+ try:
549
+ # Copy the file to our input directory
550
+ shutil.copy2(pdf_path, destination_path)
551
+ except Exception as e:
552
+ response["message"] = f"Failed to copy file: {str(e)}"
553
+ return response
554
+
555
+ output_directory_path = os.path.join(output_directory)
556
+ os.makedirs(output_directory_path, exist_ok=True)
557
+ file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
558
+ os.makedirs(file_output, exist_ok=True)
559
+
560
+ table_output_path = os.path.join(file_output, f"table_output")
561
+ os.makedirs(table_output_path, exist_ok=True)
562
+ file_location = destination_path
563
+
564
+
565
+ # Pipeline processing
566
+
567
+ json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path,cropped_tables_images_dir_path = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
568
+ table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)
569
+
570
+ custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output)
571
+ header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext)
572
+
573
+ # Initialize data for the new document
574
+ document_data[file_name_with_ext] = {
575
+
576
+ "pdf_path": destination_path,
577
+ "pdf_file_name": file_name_with_ext,
578
+ "model_json_header_output_filepath": [],
579
+ "model_json_layout_output_filepath": [],
580
+ "tree_structured_header_json_filepath": [],
581
+ "user_modified_json_output_filepath": [],
582
+ 'user_modified_table_json_filepath': [],
583
+ "frontend_output_json": [],
584
+ "cluster_json": [],
585
+ "id_2_label" : [],
586
+ "file_output_dir" : [],
587
+ "table_output_dir": [],
588
+ "table_with_header_data" : [],
589
+ "table_with_header_json_path" : [],
590
+ "json_output_dir": [],
591
+ "pdf_miner_json_path": [] ,
592
+ "searchable_pdf_path" : []
593
+ }
594
+
595
+ # Store paths and filenames
596
+ document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
597
+ document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
598
+ document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
599
+ document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
600
+ document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
601
+ document_data[file_name_with_ext]["file_output_dir"].append(file_output)
602
+ document_data[file_name_with_ext]["id_2_label"].append(class_names)
603
+ document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
604
+ document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
605
+ document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
606
+ document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)
607
+
608
+ file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
609
+ pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
610
+ pdf_path = document_data[file_name_with_ext]["pdf_path"]
611
+ user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]
612
+
613
+ pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)
614
+
615
+ table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
616
+ document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)
617
+
618
+ document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
619
+
620
+ # Process image URLs
621
+ pdf_images_urls = []
622
+ for file_name in os.listdir(pdf_images_path):
623
+ file_path = os.path.join(pdf_images_path, file_name)
624
+ if file_name.endswith((".jpg", ".jpeg", ".png")):
625
+ img_url = base_url + "image/" + str(quote(file_path))
626
+ pdf_images_urls.append(img_url)
627
+
628
+ # Sort image URLs by page number
629
+ def extract_page_no(url):
630
+ return int(url.split("_")[-1].split(".")[0])
631
+ sorted_urls = sorted(pdf_images_urls, key=extract_page_no)
632
+
633
+ # Create page details
634
+ page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
635
+
636
+ # Store the JSON output
637
+ document_data[file_name_with_ext]["frontend_output_json"].append({
638
+ "layout_output_json_data": layout_output_json_data,
639
+ "layout_json_list_data": layout_list_data,
640
+ "id_2_label": class_names,
641
+ "header_output_json_data": header_output_json_data,
642
+ "table_output_json_data": table_json_data,
643
+ "table_output_json_data_list": table_json_data_list,
644
+ "tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
645
+ "pdf_images_urls": page_details,
646
+ })
647
+
648
+
649
+ document_id_name = file_name_with_ext
650
+
651
+ data = document_data[document_id_name]
652
+ file_output_dir = data["file_output_dir"][0]
653
+ json_output_dir = data["json_output_dir"][0]
654
+ pdf_file_name = data["pdf_file_name"]
655
+ pdf_path = data["pdf_path"]
656
+
657
+ # PDFMiner processing
658
+ pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
659
+ modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
660
+ model_modified_json = read_json(modified_json_output_filepath)
661
+ pdfminer_json = read_json(pdf_miner_json_filepath)
662
+ searchable_pdf_path = data["searchable_pdf_path"][0]
663
+
664
+ # table_merged_json_path = merge_multi_page_tables_pipeline(modified_json_output_filepath, pdf_file_name, file_output_dir)
665
+
666
+ table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
667
+
668
+ table_merged_json = read_json(table_merged_json_path)
669
+
670
+ table_mapped_modified_json = map_table_with_its_header(table_merged_json)
671
+
672
+
673
+ # table_mapped_modified_json = map_table_with_its_header(model_modified_json)
674
+
675
+ # Main header pipeline
676
+ df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
677
+
678
+
679
+ clean_df, clean_df_json = clean_dataframe(df_final)
680
+
681
+
682
+ if isinstance(clean_df_json, str):
683
+ clean_df_json = json.loads(clean_df_json)
684
+
685
+ file_name = get_file_name_without_extension(pdf_file_name)
686
+ merged_content_company_wise_df = process_document_company_wise(clean_df_json,output_directory=json_output_dir,file_name=file_name,table_output_directory=cropped_tables_images_dir_path)
687
+ company_wise_clean_df, company_wise_clean_df_json = clean_dataframe(merged_content_company_wise_df)
688
+
689
+ json_output_filename = file_name + "_final_h2h_extraction.json"
690
+ final_json_output_filepath = os.path.join(json_output_dir, json_output_filename)
691
+
692
+ with open(final_json_output_filepath, 'w') as f:
693
+ json.dump(clean_df_json, f, indent=4)
694
+
695
+ company_wise_json_output_filename = file_name + "_final_h2h_extraction_company_wise.json"
696
+ company_wise_final_json_output_filepath = os.path.join(json_output_dir, company_wise_json_output_filename)
697
+
698
+ with open(company_wise_final_json_output_filepath, 'w') as f:
699
+ json.dump(merged_content_company_wise_df, f, indent=4)
700
+
701
+ # Tree-structured header content
702
+ # final_tree_structred_header_content = tree_structured_headers_content_pipeline(header_content_json_data, json_output_dir, pdf_file_name)
703
+
704
+ # final_tree_structred_header_content = merge_blocks(final_tree_structred_header_content)
705
+
706
+ # document_data[document_id_name]["cluster_json"].append(final_tree_structred_header_content)
707
+
708
+
709
+
710
+ # final_toc_h2h_extraction = customised_toc_extraction_pipeline(pdf_path=searchable_pdf_path,yolo_detection_json_path=modified_json_output_filepath,output_directory=file_output_dir)
711
+
712
+
713
+ # Step 1: Extract directory and filename without extension
714
+ pdf_path = path
715
+ json_directory = os.path.dirname(pdf_path)
716
+ json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
717
+
718
+ # Step 2: Define output path for JSON
719
+ output_json_path = os.path.join(json_directory, f"{json_filename }.json")
720
+
721
+ # If your variable is a JSON string, convert it to dict first
722
+ if isinstance(company_wise_clean_df_json, str):
723
+ company_wise_clean_df_json = json.loads(company_wise_clean_df_json)
724
+
725
+
726
+ # Step 3: Save JSON
727
+ with open(output_json_path, 'w', encoding='utf-8') as json_file:
728
+ json.dump(company_wise_clean_df_json, json_file, ensure_ascii=False, indent=4)
729
+
730
+
731
+ # # post-processing results
732
+ # post_processing_results = {
733
+ # document_id_name : {
734
+ # # "df_download": json.dumps(clean_df.to_csv(index=False, escapechar='\\', encoding='utf-8')),
735
+ # # "df_download_json" : clean_df_json,
736
+ # "df_download_json": company_wise_clean_df_json,
737
+ # "tree_structured_header_content": final_tree_structred_header_content,
738
+ # "file_name": document_id_name,
739
+ # # "classified_dynamic_json": dynamic_mapped_data_json,
740
+ # "toc_df_download_json" : final_toc_h2h_extraction
741
+ # }
742
+ # }
743
+
744
+ response_final = {
745
+ "status_code": 200,
746
+ # "message":"",
747
+ # "df_download_json": company_wise_clean_df_json,
748
+ "saved_json_path": output_json_path
749
+ }
750
+
751
+
752
+
753
+
754
+ return response_final
755
+
756
+
757
+
758
+ # def upload_documents(path):
759
+ # logger.info(f"Starting upload_documents for path: {path}")
760
+
761
+ # request = Request
762
+ # base_url = str(request.base_url)
763
+ # global document_data
764
+ # document_data = {}
765
+ # pdf_path = path
766
+
767
+ # # Log directory clearing
768
+ # logger.info("Clearing input and output directories")
769
+ # clear_directory(pdf_input_path)
770
+ # clear_directory(pdf_output_path)
771
+ # clear_directory(word_input_path)
772
+ # clear_directory(word_output_path)
773
+
774
+ # # Initialize response structure
775
+ # response = {
776
+ # "success": False,
777
+ # "message": "",
778
+ # }
779
+
780
+ # # Check if the provided path is a PDF file
781
+ # if not pdf_path.lower().endswith(".pdf"):
782
+ # logger.error(f"Invalid file type for path: {pdf_path}. Only PDF files are accepted.")
783
+ # response["message"] = "Invalid file type. Only PDF files are accepted."
784
+ # return response
785
+
786
+ # # Extract filename
787
+ # file_name_with_ext = os.path.basename(pdf_path)
788
+ # file_name_without_ext = os.path.splitext(file_name_with_ext)[0]
789
+ # logger.debug(f"Extracted filename: {file_name_with_ext} (without extension: {file_name_without_ext})")
790
+
791
+ # # Create destination path in input directory
792
+ # destination_path = os.path.join(pdf_input_directory, file_name_with_ext)
793
+ # logger.debug(f"Destination path for file copy: {destination_path}")
794
+
795
+ # # Copy file to input directory
796
+ # try:
797
+ # logger.info(f"Copying file from {pdf_path} to {destination_path}")
798
+ # shutil.copy2(pdf_path, destination_path)
799
+ # except Exception as e:
800
+ # logger.error(f"Failed to copy file from {pdf_path} to {destination_path}: {str(e)}")
801
+ # response["message"] = f"Failed to copy file: {str(e)}"
802
+ # return response
803
+
804
+ # # Create output directories
805
+ # output_directory_path = os.path.join(output_directory)
806
+ # os.makedirs(output_directory_path, exist_ok=True)
807
+ # file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
808
+ # os.makedirs(file_output, exist_ok=True)
809
+ # table_output_path = os.path.join(file_output, f"table_output")
810
+ # os.makedirs(table_output_path, exist_ok=True)
811
+ # file_location = destination_path
812
+ # logger.info(f"Created output directories: {file_output}, {table_output_path}")
813
+
814
+ # # Pipeline processing
815
+ # logger.info(f"Starting yolov10_layout_pipeline for {file_name_without_ext}")
816
+ # try:
817
+ # json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
818
+ # logger.info(f"yolov10_layout_pipeline completed. Output JSON dir: {json_output_dir}")
819
+ # except Exception as e:
820
+ # logger.error(f"yolov10_layout_pipeline failed: {str(e)}")
821
+ # response["message"] = f"yolov10_layout_pipeline failed: {str(e)}"
822
+ # return response
823
+
824
+ # logger.info(f"Starting filtering_table_pipeline for {file_name_without_ext}")
825
+ # try:
826
+ # table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)
827
+ # logger.info(f"filtering_table_pipeline completed. Table JSON path: {table_json_path}")
828
+ # except Exception as e:
829
+ # logger.error(f"filtering_table_pipeline failed: {str(e)}")
830
+ # response["message"] = f"filtering_table_pipeline failed: {str(e)}"
831
+ # return response
832
+
833
+ # logger.info(f"Starting process_pdf_for_headers for {file_name_without_ext}")
834
+ # try:
835
+ # custom_headers_json, custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext, file_location, file_output)
836
+ # logger.info(f"process_pdf_for_headers completed. Custom headers JSON path: {custom_headers_json_file_path}")
837
+ # except Exception as e:
838
+ # logger.error(f"process_pdf_for_headers failed: {str(e)}")
839
+ # response["message"] = f"process_pdf_for_headers failed: {str(e)}"
840
+ # return response
841
+
842
+ # logger.info(f"Starting pipeline_for_merging_headers for {file_name_without_ext}")
843
+ # try:
844
+ # header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path, header_json_output_filepath, file_output, file_name_without_ext)
845
+ # logger.info(f"pipeline_for_merging_headers completed. Merged headers JSON path: {header_json_output_filepath}")
846
+ # except Exception as e:
847
+ # logger.error(f"pipeline_for_merging_headers failed: {str(e)}")
848
+ # response["message"] = f"pipeline_for_merging_headers failed: {str(e)}"
849
+ # return response
850
+
851
+ # # Initialize document_data
852
+ # logger.debug(f"Initializing document_data for {file_name_with_ext}")
853
+ # document_data[file_name_with_ext] = {
854
+ # "pdf_path": destination_path,
855
+ # "pdf_file_name": file_name_with_ext,
856
+ # "model_json_header_output_filepath": [],
857
+ # "model_json_layout_output_filepath": [],
858
+ # "tree_structured_header_json_filepath": [],
859
+ # "user_modified_json_output_filepath": [],
860
+ # "user_modified_table_json_filepath": [],
861
+ # "frontend_output_json": [],
862
+ # "cluster_json": [],
863
+ # "id_2_label": [],
864
+ # "file_output_dir": [],
865
+ # "table_output_dir": [],
866
+ # "table_with_header_data": [],
867
+ # "table_with_header_json_path": [],
868
+ # "json_output_dir": [],
869
+ # "pdf_miner_json_path": [],
870
+ # "searchable_pdf_path": []
871
+ # }
872
+
873
+ # # Store paths and filenames
874
+ # document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
875
+ # document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
876
+ # document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
877
+ # document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
878
+ # document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
879
+ # document_data[file_name_with_ext]["file_output_dir"].append(file_output)
880
+ # document_data[file_name_with_ext]["id_2_label"].append(class_names)
881
+ # document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
882
+ # document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
883
+ # document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
884
+ # document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)
885
+ # logger.debug(f"Stored paths and filenames in document_data for {file_name_with_ext}")
886
+
887
+ # file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
888
+ # pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
889
+ # pdf_path = document_data[file_name_with_ext]["pdf_path"]
890
+ # user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]
891
+
892
+ # logger.info(f"Starting create_json_pdfminer_pipeline for {pdf_file_name}")
893
+ # try:
894
+ # pdf_miner_json_filepath, pdf_miner_metadata, searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)
895
+ # logger.info(f"create_json_pdfminer_pipeline completed. PDFMiner JSON path: {pdf_miner_json_filepath}, Searchable PDF path: {searchable_pdf_path}")
896
+ # except Exception as e:
897
+ # logger.error(f"create_json_pdfminer_pipeline failed: {str(e)}")
898
+ # response["message"] = f"create_json_pdfminer_pipeline failed: {str(e)}"
899
+ # return response
900
+
901
+ # logger.info(f"Starting main_pipeline_create_put_table_headers for {file_name_with_ext}")
902
+ # try:
903
+ # table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
904
+ # logger.info(f"main_pipeline_create_put_table_headers completed")
905
+ # except Exception as e:
906
+ # logger.error(f"main_pipeline_create_put_table_headers failed: {str(e)}")
907
+ # response["message"] = f"main_pipeline_create_put_table_headers failed: {str(e)}"
908
+ # return response
909
+
910
+ # document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)
911
+ # document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
912
+
913
+ # # Process image URLs
914
+ # pdf_images_urls = []
915
+ # for file_name in os.listdir(pdf_images_path):
916
+ # file_path = os.path.join(pdf_images_path, file_name)
917
+ # if file_name.endswith((".jpg", ".jpeg", ".png")):
918
+ # img_url = base_url + "image/" + str(quote(file_path))
919
+ # pdf_images_urls.append(img_url)
920
+ # logger.debug(f"Collected {len(pdf_images_urls)} image URLs from {pdf_images_path}")
921
+
922
+ # # Sort image URLs by page number
923
+ # def extract_page_no(url):
924
+ # return int(url.split("_")[-1].split(".")[0])
925
+ # sorted_urls = sorted(pdf_images_urls, key=extract_page_no)
926
+ # page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
927
+ # logger.debug(f"Sorted {len(sorted_urls)} image URLs and created page details")
928
+
929
+ # # Store the JSON output
930
+ # document_data[file_name_with_ext]["frontend_output_json"].append({
931
+ # "layout_output_json_data": layout_output_json_data,
932
+ # "layout_json_list_data": layout_list_data,
933
+ # "id_2_label": class_names,
934
+ # "header_output_json_data": header_output_json_data,
935
+ # "table_output_json_data": table_json_data,
936
+ # "table_output_json_data_list": table_json_data_list,
937
+ # "tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
938
+ # "pdf_images_urls": page_details,
939
+ # })
940
+ # logger.debug(f"Stored frontend_output_json for {file_name_with_ext}")
941
+
942
+ # document_id_name = file_name_with_ext
943
+ # data = document_data[document_id_name]
944
+ # file_output_dir = data["file_output_dir"][0]
945
+ # json_output_dir = data["json_output_dir"][0]
946
+ # pdf_file_name = data["pdf_file_name"]
947
+ # pdf_path = data["pdf_path"]
948
+
949
+ # # PDFMiner processing
950
+ # pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
951
+ # modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
952
+ # logger.info(f"Reading JSON files: {modified_json_output_filepath}, {pdf_miner_json_filepath}")
953
+ # try:
954
+ # model_modified_json = read_json(modified_json_output_filepath)
955
+ # pdfminer_json = read_json(pdf_miner_json_filepath)
956
+ # logger.info(f"Successfully read JSON files")
957
+ # except Exception as e:
958
+ # logger.error(f"Failed to read JSON files: {str(e)}")
959
+ # response["message"] = f"Failed to read JSON files: {str(e)}"
960
+ # return response
961
+
962
+ # searchable_pdf_path = data["searchable_pdf_path"][0]
963
+
964
+ # logger.info(f"Starting merge_multi_page_tables_pipeline_v2 for {pdf_file_name}")
965
+ # try:
966
+ # table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
967
+ # logger.info(f"merge_multi_page_tables_pipeline_v2 completed. Merged table JSON path: {table_merged_json_path}")
968
+ # except Exception as e:
969
+ # logger.error(f"merge_multi_page_tables_pipeline_v2 failed: {str(e)}")
970
+ # response["message"] = f"merge_multi_page_tables_pipeline_v2 failed: {str(e)}"
971
+ # return response
972
+
973
+ # logger.info(f"Reading merged table JSON: {table_merged_json_path}")
974
+ # try:
975
+ # table_merged_json = read_json(table_merged_json_path)
976
+ # logger.info(f"Successfully read merged table JSON")
977
+ # except Exception as e:
978
+ # logger.error(f"Failed to read merged table JSON: {str(e)}")
979
+ # response["message"] = f"Failed to read merged table JSON: {str(e)}"
980
+ # return response
981
+
982
+ # # logger.info(f"Starting map_table_with_its_header for {file_name_with_ext}")
983
+ # try:
984
+ # table_mapped_modified_json = map_table_with_its_header(table_merged_json)
985
+ # logger.info(f"map_table_with_its_header completed")
986
+ # except Exception as e:
987
+ # logger.error(f"map_table_with_its_header failed: {str(e)}")
988
+ # response["message"] = f"map_table_with_its_header failed: {str(e)}"
989
+ # return response
990
+
991
+ # logger.info(f"Starting main_header_pipeline for {file_name_with_ext}")
992
+ # try:
993
+ # df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
994
+ # logger.info(f"main_header_pipeline completed")
995
+ # except Exception as e:
996
+ # logger.error(f"main_header_pipeline failed: {str(e)}")
997
+ # response["message"] = f"main_header_pipeline failed: {str(e)}"
998
+ # return response
999
+
1000
+ # logger.info(f"Starting clean_dataframe for final DataFrame")
1001
+ # try:
1002
+ # clean_df, clean_df_json = clean_dataframe(df_final)
1003
+ # logger.info(f"clean_dataframe completed. Clean JSON created")
1004
+ # except Exception as e:
1005
+ # logger.error(f"clean_dataframe failed: {str(e)}")
1006
+ # response["message"] = f"clean_dataframe failed: {str(e)}"
1007
+ # return response
1008
+
1009
+ # if isinstance(clean_df_json, str):
1010
+ # clean_df_json = eval(clean_df_json)
1011
+ # logger.debug(f"Converted clean_df_json string to dictionary")
1012
+
1013
+ # file_name = get_file_name_without_extension(pdf_file_name)
1014
+ # logger.info(f"Starting process_document_company_wise for {file_name}")
1015
+ # try:
1016
+ # merged_content_company_wise_df = process_document_company_wise(clean_df_json, output_directory=json_output_dir, file_name=file_name)
1017
+ # company_wise_clean_df, company_wise_clean_df_json = clean_dataframe(merged_content_company_wise_df)
1018
+ # logger.info(f"process_document_company_wise and clean_dataframe completed")
1019
+ # except Exception as e:
1020
+ # logger.error(f"process_document_company_wise failed: {str(e)}")
1021
+ # response["message"] = f"process_document_company_wise failed: {str(e)}"
1022
+ # return response
1023
+
1024
+ # json_output_filename = file_name + "_final_h2h_extraction.json"
1025
+ # final_json_output_filepath = os.path.join(json_output_dir, json_output_filename)
1026
+ # logger.info(f"Saving final JSON to {final_json_output_filepath}")
1027
+ # try:
1028
+ # with open(final_json_output_filepath, 'w') as f:
1029
+ # json.dump(clean_df_json, f, indent=4)
1030
+ # logger.info(f"Final JSON saved successfully")
1031
+ # except Exception as e:
1032
+ # logger.error(f"Failed to save final JSON: {str(e)}")
1033
+ # response["message"] = f"Failed to save final JSON: {str(e)}"
1034
+ # return response
1035
+
1036
+ # company_wise_json_output_filename = file_name + "_final_h2h_extraction_company_wise.json"
1037
+ # company_wise_final_json_output_filepath = os.path.join(json_output_dir, company_wise_json_output_filename)
1038
+ # logger.info(f"Saving company-wise JSON to {company_wise_final_json_output_filepath}")
1039
+ # try:
1040
+ # with open(company_wise_final_json_output_filepath, 'w') as f:
1041
+ # json.dump(merged_content_company_wise_df, f, indent=4)
1042
+ # logger.info(f"Company-wise JSON saved successfully")
1043
+ # except Exception as e:
1044
+ # logger.error(f"Failed to save company-wise JSON: {str(e)}")
1045
+ # response["message"] = f"Failed to save company-wise JSON: {str(e)}"
1046
+ # return response
1047
+
1048
+ # logger.info(f"Starting tree_structured_headers_content_pipeline for {pdf_file_name}")
1049
+ # try:
1050
+ # final_tree_structred_header_content = tree_structured_headers_content_pipeline(header_content_json_data, json_output_dir, pdf_file_name)
1051
+ # final_tree_structred_header_content = merge_blocks(final_tree_structred_header_content)
1052
+ # document_data[document_id_name]["cluster_json"].append(final_tree_structred_header_content)
1053
+ # logger.info(f"tree_structured_headers_content_pipeline and merge_blocks completed")
1054
+ # except Exception as e:
1055
+ # logger.error(f"tree_structured_headers_content_pipeline failed: {str(e)}")
1056
+ # response["message"] = f"tree_structured_headers_content_pipeline failed: {str(e)}"
1057
+ # return response
1058
+
1059
+ # logger.info(f"Starting customised_toc_extraction_pipeline for {searchable_pdf_path}")
1060
+ # try:
1061
+ # final_toc_h2h_extraction = customised_toc_extraction_pipeline(pdf_path=searchable_pdf_path, yolo_detection_json_path=modified_json_output_filepath, output_directory=file_output_dir)
1062
+ # logger.info(f"customised_toc_extraction_pipeline completed")
1063
+ # except Exception as e:
1064
+ # logger.error(f"customised_toc_extraction_pipeline failed: {str(e)}")
1065
+ # response["message"] = f"customised_toc_extraction_pipeline failed: {str(e)}"
1066
+ # return response
1067
+
1068
+ # # Save final JSON output
1069
+ # json_directory = os.path.dirname(pdf_path)
1070
+ # json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
1071
+ # output_json_path = os.path.join(json_directory, f"{json_filename}.json")
1072
+ # logger.info(f"Saving output JSON to {output_json_path}")
1073
+
1074
+ # try:
1075
+ # if isinstance(company_wise_clean_df_json, str):
1076
+ # company_wise_clean_df_json = json.loads(company_wise_clean_df_json)
1077
+ # logger.debug(f"Converted company_wise_clean_df_json string to dictionary")
1078
+
1079
+ # with open(output_json_path, 'w', encoding='utf-8') as json_file:
1080
+ # json.dump(company_wise_clean_df_json, json_file, ensure_ascii=False, indent=4)
1081
+ # logger.info(f"Output JSON saved successfully")
1082
+ # except Exception as e:
1083
+ # logger.error(f"Failed to save output JSON: {str(e)}")
1084
+ # response["message"] = f"Failed to save output JSON: {str(e)}"
1085
+ # return response
1086
+
1087
+ # response_final = {
1088
+ # "status_code": 200,
1089
+ # "saved_json_path": output_json_path
1090
+ # }
1091
+ # logger.info(f"upload_documents completed successfully for {file_name_with_ext}. Response: {response_final}")
1092
+ # return response_final
1093
+
1094
+
1095
+ def table_extraction_and_mapping(path,
1096
+ field_name,
1097
+ class_keywords_table,
1098
+ header_categories,
1099
+ class_keywords_table_column,
1100
+ filter_table_classifier_name,
1101
+ threshold) :
1102
+ # path = eval(f'{path}')
1103
+ request = Request
1104
+ print(f'started for path: {path}')
1105
+ base_url = str(request.base_url)
1106
+ global document_data
1107
+ document_data = {}
1108
+ pdf_path = path
1109
+
1110
+ clear_directory(pdf_input_path)
1111
+ clear_directory(pdf_output_path)
1112
+ clear_directory(word_input_path)
1113
+ clear_directory(word_output_path)
1114
+
1115
+ # Initialize response structure
1116
+ response = {
1117
+ "success": False,
1118
+ "message": "",
1119
+ # "data": None
1120
+ }
1121
+
1122
+
1123
+ if not pdf_path:
1124
+ response["message"] = "No file path provided."
1125
+ return response
1126
+
1127
+ # Check if the provided path is a PDF file
1128
+ if not pdf_path.lower().endswith(".pdf"):
1129
+ response["message"] = "Invalid file type. Only PDF files are accepted."
1130
+ return response
1131
+
1132
+
1133
+ # Extract filename
1134
+ file_name_with_ext = os.path.basename(pdf_path)
1135
+ file_name_without_ext = os.path.splitext(file_name_with_ext)[0]
1136
+
1137
+ # Create destination path in input directory
1138
+ destination_path = os.path.join(pdf_input_directory, file_name_with_ext)
1139
+
1140
+ try:
1141
+ # Copy the file to our input directory
1142
+ shutil.copy2(pdf_path, destination_path)
1143
+ except Exception as e:
1144
+ response["message"] = f"Failed to copy file: {str(e)}"
1145
+ return response
1146
+
1147
+ output_directory_path = os.path.join(output_directory)
1148
+ os.makedirs(output_directory_path, exist_ok=True)
1149
+ file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
1150
+ os.makedirs(file_output, exist_ok=True)
1151
+
1152
+ table_output_path = os.path.join(file_output, f"table_output")
1153
+ os.makedirs(table_output_path, exist_ok=True)
1154
+ file_location = destination_path
1155
+
1156
+
1157
+ # Pipeline processing
1158
+
1159
+ json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
1160
+ table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)
1161
+
1162
+ custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output)
1163
+ header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext)
1164
+
1165
+ # Initialize data for the new document
1166
+ document_data[file_name_with_ext] = {
1167
+
1168
+ "pdf_path": destination_path,
1169
+ "pdf_file_name": file_name_with_ext,
1170
+ "model_json_header_output_filepath": [],
1171
+ "model_json_layout_output_filepath": [],
1172
+ "tree_structured_header_json_filepath": [],
1173
+ "user_modified_json_output_filepath": [],
1174
+ 'user_modified_table_json_filepath': [],
1175
+ "frontend_output_json": [],
1176
+ "cluster_json": [],
1177
+ "id_2_label" : [],
1178
+ "file_output_dir" : [],
1179
+ "table_output_dir": [],
1180
+ "table_with_header_data" : [],
1181
+ "table_with_header_json_path" : [],
1182
+ "json_output_dir": [],
1183
+ "pdf_miner_json_path": [] ,
1184
+ "searchable_pdf_path" : []
1185
+ }
1186
+
1187
+ # Store paths and filenames
1188
+ document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
1189
+ document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
1190
+ document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
1191
+ document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
1192
+ document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
1193
+ document_data[file_name_with_ext]["file_output_dir"].append(file_output)
1194
+ document_data[file_name_with_ext]["id_2_label"].append(class_names)
1195
+ document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
1196
+ document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
1197
+ document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
1198
+ document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)
1199
+
1200
+ file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
1201
+ pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
1202
+ pdf_path = document_data[file_name_with_ext]["pdf_path"]
1203
+ user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]
1204
+
1205
+ pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)
1206
+
1207
+ table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
1208
+ document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)
1209
+
1210
+ document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
1211
+
1212
+ # Process image URLs
1213
+ pdf_images_urls = []
1214
+ for file_name in os.listdir(pdf_images_path):
1215
+ file_path = os.path.join(pdf_images_path, file_name)
1216
+ if file_name.endswith((".jpg", ".jpeg", ".png")):
1217
+ img_url = base_url + "image/" + str(quote(file_path))
1218
+ pdf_images_urls.append(img_url)
1219
+
1220
+ # Sort image URLs by page number
1221
+ def extract_page_no(url):
1222
+ return int(url.split("_")[-1].split(".")[0])
1223
+ sorted_urls = sorted(pdf_images_urls, key=extract_page_no)
1224
+
1225
+ # Create page details
1226
+ page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
1227
+
1228
+ # Store the JSON output
1229
+ document_data[file_name_with_ext]["frontend_output_json"].append({
1230
+ "layout_output_json_data": layout_output_json_data,
1231
+ "layout_json_list_data": layout_list_data,
1232
+ "id_2_label": class_names,
1233
+ "header_output_json_data": header_output_json_data,
1234
+ "table_output_json_data": table_json_data,
1235
+ "table_output_json_data_list": table_json_data_list,
1236
+ "tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
1237
+ "pdf_images_urls": page_details,
1238
+ })
1239
+
1240
+
1241
+ document_id_name = file_name_with_ext
1242
+
1243
+ data = document_data[document_id_name]
1244
+ file_output_dir = data["file_output_dir"][0]
1245
+ json_output_dir = data["json_output_dir"][0]
1246
+ pdf_file_name = data["pdf_file_name"]
1247
+ pdf_path = data["pdf_path"]
1248
+
1249
+ # PDFMiner processing
1250
+ pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
1251
+ modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
1252
+ model_modified_json = read_json(modified_json_output_filepath)
1253
+ pdfminer_json = read_json(pdf_miner_json_filepath)
1254
+ searchable_pdf_path = data["searchable_pdf_path"][0]
1255
+
1256
+ # table_merged_json_path = merge_multi_page_tables_pipeline(modified_json_output_filepath, pdf_file_name, file_output_dir)
1257
+
1258
+ table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
1259
+
1260
+ table_merged_json = read_json(table_merged_json_path)
1261
+
1262
+ table_mapped_modified_json = map_table_with_its_header(table_merged_json)
1263
+
1264
+
1265
+ # table_mapped_modified_json = map_table_with_its_header(model_modified_json)
1266
+
1267
+ # Main header pipeline
1268
+ df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
1269
+
1270
+
1271
+ clean_df, clean_df_json = clean_dataframe(df_final)
1272
+
1273
+
1274
+ # if isinstance(clean_df_json, str):
1275
+ # clean_df_json = eval(clean_df_json)
1276
+
1277
+ file_name = get_file_name_without_extension(pdf_file_name)
1278
+
1279
+
1280
+ # Step 1: Extract directory and filename without extension
1281
+ pdf_path = path
1282
+ json_directory = os.path.dirname(pdf_path)
1283
+ json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
1284
+
1285
+ # Step 2: Define output path for JSON
1286
+ output_json_path = os.path.join(json_directory, f"{json_filename}_structured_chunking.json")
1287
+
1288
+ # If your variable is a JSON string, convert it to dict first
1289
+ if isinstance(clean_df_json, str):
1290
+ clean_df_json = json.loads(clean_df_json)
1291
+
1292
+
1293
+ # Step 3: Save JSON
1294
+ with open(output_json_path, 'w', encoding='utf-8') as json_file:
1295
+ json.dump(clean_df_json, json_file, ensure_ascii=False, indent=4)
1296
+
1297
+
1298
+ ##########################################################
1299
+ # Table Classification Code
1300
+
1301
+ ##########################################################
1302
+
1303
+
1304
+ print("starting table classification pipeline")
1305
+
1306
+ structured_chunk_json_path = output_json_path
1307
+
1308
+ with open(structured_chunk_json_path, 'r', encoding='utf-8') as file:
1309
+ content = file.read()
1310
+
1311
+ # This regex removes commas before closing braces/brackets, ignoring whitespace
1312
+ cleaned_content = re.sub(r',\s*([\]}])', r'\1', content)
1313
+
1314
+ # Parse the cleaned JSON
1315
+ structured_chunk_data = json.loads(cleaned_content)
1316
+
1317
+ threshold = float(threshold)
1318
+ print("type of class_keywords_table::\n",type(class_keywords_table))
1319
+
1320
+ # If class_keywords is a string, try to parse it
1321
+ if isinstance(class_keywords_table, str):
1322
+ try:
1323
+ class_keywords_table = json.loads(class_keywords_table)
1324
+
1325
+ # if not isinstance(class_keywords_table, dict):
1326
+ # raise ValueError("class_keywords_table must be a dictionary")
1327
+ # if not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
1328
+ # for key, value in class_keywords_table.items()):
1329
+ # raise ValueError("class_keywords_table must be a dictionary with string keys and lists of strings as values")
1330
+ except json.JSONDecodeError:
1331
+ raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table"})
1332
+
1333
+ elif isinstance(class_keywords_table, dict) :
1334
+ class_keywords_table = class_keywords_table
1335
+
1336
+
1337
+ else:
1338
+ raise HTTPException(status_code=422, detail={"error": "class_keywords_table must be a dictionary with string keys and lists of strings as values"})
1339
+
1340
+
1341
+ # Perform classification
1342
+ categorized_headers_json = perform_classification(
1343
+ data=structured_chunk_data,
1344
+ class_keywords=class_keywords_table,
1345
+ header_categories=header_categories,
1346
+ similarity_threshold=threshold
1347
+ )
1348
+
1349
+ # Step 1: Extract directory and filename without extension
1350
+ pdf_path = path
1351
+ json_directory = os.path.dirname(pdf_path)
1352
+ json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
1353
+
1354
+ # Step 2: Define output path for JSON
1355
+ classified_table_output_json_path = os.path.join(json_directory, f"{field_name}_table_classification.json")
1356
+
1357
+ # If your variable is a JSON string, convert it to dict first
1358
+ if isinstance(categorized_headers_json, str):
1359
+ categorized_headers_json = json.loads(categorized_headers_json)
1360
+
1361
+
1362
+ # Step 3: Save JSON
1363
+ with open(classified_table_output_json_path, 'w', encoding='utf-8') as json_file:
1364
+ json.dump(categorized_headers_json, json_file, ensure_ascii=False, indent=4)
1365
+
1366
+
1367
+ #######################################################
1368
+ # Table Column Classification Code
1369
+ print("Starting Table Column Classification")
1370
+
1371
+
1372
+ # Parse JSON strings into dictionaries
1373
+ # input_table_classified_json = json.load(classified_table_output_json_path)
1374
+ with open(classified_table_output_json_path, "r") as f:
1375
+ input_table_classified_json = json.load(f)
1376
+
1377
+
1378
+ # class_keywords_table_column = json.loads(class_keywords_table_column)
1379
+
1380
+ if isinstance(class_keywords_table_column, str):
1381
+ try:
1382
+ class_keywords_table_column = json.loads(class_keywords_table_column)
1383
+
1384
+
1385
+ except json.JSONDecodeError:
1386
+ raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table_column"})
1387
+
1388
+ elif isinstance(class_keywords_table_column, dict):
1389
+ class_keywords_table_column = class_keywords_table_column
1390
+
1391
+ # Convert similarity_threshold to integer
1392
+ similarity_threshold = float(threshold)
1393
+
1394
+ column_classification_results_json = classify_column_headers(
1395
+ json_data=input_table_classified_json,
1396
+ class_keywords=class_keywords_table_column,
1397
+ filter_table_classifier_name=filter_table_classifier_name,
1398
+ similarity_threshold=similarity_threshold
1399
+ )
1400
+
1401
+ # Step 1: Extract directory and filename without extension
1402
+ pdf_path = path
1403
+ json_directory = os.path.dirname(pdf_path)
1404
+ json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
1405
+
1406
+ # Step 2: Define output path for JSON
1407
+ classified_table_column_output_json_path = os.path.join(json_directory, f"{field_name}_table_column_classification.json")
1408
+
1409
+ # If your variable is a JSON string, convert it to dict first
1410
+ if isinstance(column_classification_results_json, str):
1411
+ column_classification_results_json = json.loads(column_classification_results_json)
1412
+
1413
+
1414
+ # Step 3: Save JSON
1415
+ with open(classified_table_column_output_json_path, 'w', encoding='utf-8') as json_file:
1416
+ json.dump(column_classification_results_json, json_file, ensure_ascii=False, indent=4)
1417
+
1418
+ #######################################################################
1419
+
1420
+ response_final = {
1421
+ "status_code": 200,
1422
+ # "message":"",
1423
+ # "df_download_json": company_wise_clean_df_json,
1424
+ "structured_chunk_json_path": output_json_path,
1425
+ "table_classification_json_path":classified_table_output_json_path,
1426
+ "table_column_classification_json_path" : classified_table_column_output_json_path
1427
+ }
1428
+
1429
+
1430
+
1431
+
1432
+ return response_final
1433
+
1434
+
1435
+
1436
+
1437
+
layout_detection_docling_heron (1).py ADDED
@@ -0,0 +1,497 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import os
3
+ import supervision as sv # pip install supervision
4
+ from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
5
+ from pdf2image import convert_from_path
6
+ import numpy as np
7
+ from PIL import Image
8
+ import json
9
+ import pytesseract
10
+ import pandas as pd
11
+ from sentence_transformers import SentenceTransformer, util
12
+ from PyPDF2 import PdfReader
13
+ from datetime import datetime
14
+ import torch
15
+ import logging
16
+ from utils.utils_code import log_time_taken
17
+ from concurrent.futures import ProcessPoolExecutor, as_completed
18
+ import multiprocessing
19
+ import sys
20
+ import gc
21
+
22
+ from src.table_processing.tree_structured_json import tree_structured_headers_pipeline
23
+ from config.set_config import set_configuration
24
+ set_config_project = set_configuration()
25
+ layout_model_weights_path = set_config_project.layout_model_weights_path
26
+ no_of_threads = set_config_project.no_of_threads
27
+ from src.docling.ttsr_docling import tsr_inference_image, tsr_inference
28
+ from src.table_processing.table_classification_extraction import process_table_classification_extraction_pipeline
29
+ from src.table_processing.put_table_header import put_table_header_pipeline
30
+ import gc
31
+ from src.layout_detection.load_model import load_model_for_process
32
+
33
+ # Set multiprocessing start method
34
+ multiprocessing.set_start_method('spawn', force=True)
35
+ logger = logging.getLogger(__name__)
36
+
37
+ # Configure logging
38
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
39
+
40
def load_torch(version):
    """Prepend the vendored directory for *version* to sys.path, import
    torch, log the resolved version, and return the torch module.

    NOTE(review): torch is already imported at module level above, so the
    inner ``import torch`` is served from ``sys.modules`` and the sys.path
    switch has no effect unless this runs before the first import --
    confirm intended behaviour.
    """
    version_dirs = {
        "2.2.2": "./torch_2_2_2",
        "2.6.0": "./torch_2_6_0",
    }
    vendored = version_dirs.get(version)
    if vendored is not None:
        sys.path.insert(0, vendored)
    import torch
    logger.info(f"Using Torch Version: {torch.__version__}")
    return torch
48
+
49
# Rebind the module-level ``torch`` to the version selected via sys.path.
# NOTE(review): torch was already imported above, so this re-import resolves
# from sys.modules; the version switch is likely a no-op -- confirm.
torch = load_torch("2.2.2")
50
+
51
+
52
+
53
def get_file_name_without_extension(file_path):
    """Return the base name of *file_path* without its final extension.

    e.g. '/a/b/report.pdf' -> 'report', 'archive.tar.gz' -> 'archive.tar'.
    """
    # basename + splitext in one pass; the original's unused
    # directory/extension locals are dropped.
    name, _extension = os.path.splitext(os.path.basename(file_path))
    return name
57
+
58
def convert_numpy(data):
    """Recursively convert numpy/pandas values into plain JSON-serializable
    Python objects.

    Dicts, lists and tuples are walked recursively (tuples are normalized
    to lists so ``json.dump`` output is uniform); numpy integer/float/bool
    scalars become Python scalars; ndarrays become (nested) lists; a
    DataFrame becomes a list of row dicts. Anything else is returned as-is.
    """
    if isinstance(data, dict):
        return {key: convert_numpy(value) for key, value in data.items()}
    if isinstance(data, (list, tuple)):
        return [convert_numpy(item) for item in data]
    if isinstance(data, np.integer):
        return int(data)
    if isinstance(data, np.floating):
        return float(data)
    if isinstance(data, np.bool_):
        # np.bool_ is neither np.integer nor np.floating and would
        # otherwise fall through untouched and break json.dump.
        return bool(data)
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, pd.DataFrame):
        return data.to_dict(orient='records')
    return data
73
+
74
def filter_layout_blocks(input_data):
    """Flatten a {page: [block, ...]} mapping into a single list of blocks.

    Page order follows the dict's insertion order; block order within each
    page is preserved.
    """
    filtered_layout_blocks = []
    for blocks in input_data.values():
        # extend() directly -- the original copied each page's list through
        # a redundant identity comprehension.
        filtered_layout_blocks.extend(blocks)
    return filtered_layout_blocks
79
+
80
def convert_pdf_to_images(file_path, batch_size=20, dpi=100):
    """Render *file_path* to PIL images and return a generator of batches
    of at most *batch_size* pages each.

    NOTE(review): pdf2image rasterizes the whole document up-front; the
    generator only slices the already-rendered list, so peak memory is the
    full document regardless of batch_size -- confirm this is acceptable.
    """
    rendered = convert_from_path(file_path, dpi=dpi)

    def batches():
        for offset in range(0, len(rendered), batch_size):
            yield rendered[offset:offset + batch_size]

    return batches()
90
+
91
def read_json(json_file):
    """Load and return the parsed JSON content of *json_file*.

    The file is opened with an explicit UTF-8 encoding so behaviour does
    not depend on the platform's default locale encoding.
    """
    with open(json_file, 'r', encoding='utf-8') as file:
        return json.load(file)
94
+
95
def filter_and_sort_headers(data, modified_json_output_filepath):
    """Reorder header blocks per page and persist the result as JSON.

    Blocks are first ordered left-to-right by bbox xmin; runs of
    horizontally overlapping blocks form a group, and each group is
    flushed top-to-bottom (by bbox ymin) before the next group starts.

    Returns the reordered mapping and the path it was written to.
    """

    def _by_min_x(blocks):
        return sorted(blocks, key=lambda b: b['bbox'][0])

    def _by_min_y(blocks):
        return sorted(blocks, key=lambda b: b['bbox'][1])

    def _group_headers(blocks_in_x_order):
        ordered = []
        group = []
        prev = None
        for blk in blocks_in_x_order:
            if prev:
                prev_xmax = prev['bbox'][2]
                prev_xmax_threshold = int(prev_xmax)
                # A block starting strictly right of the previous block's
                # xmax opens a new column group; flush the current one
                # top-to-bottom first.
                if blk['bbox'][0] > prev_xmax and blk['bbox'][0] > prev_xmax_threshold:
                    if group:
                        ordered.extend(_by_min_y(group))
                        group = []
            group.append(blk)
            prev = blk
        if group:
            ordered.extend(_by_min_y(group))
        return ordered

    result = {page: _group_headers(_by_min_x(blocks)) for page, blocks in data.items()}

    with open(modified_json_output_filepath, 'w') as f:
        json.dump(result, f, indent=4)

    return result, modified_json_output_filepath
134
+
135
def filter_and_sort_layouts(data, modified_json_output_filepath):
    """Reorder layout blocks per page and persist the result as JSON.

    Same column-grouping scheme as filter_and_sort_headers: order blocks
    left-to-right by bbox xmin, then flush each run of horizontally
    overlapping blocks top-to-bottom (by bbox ymin).

    Returns the reordered mapping and the path it was written to.
    """

    def _sorted_x(items):
        return sorted(items, key=lambda item: item['bbox'][0])

    def _sorted_y(items):
        return sorted(items, key=lambda item: item['bbox'][1])

    def _column_order(items_in_x_order):
        arranged = []
        pending = []
        last = None
        for entry in items_in_x_order:
            if last:
                last_xmax = last['bbox'][2]
                last_xmax_threshold = int(last_xmax)
                # Entry starting beyond the previous entry's xmax begins a
                # new column; emit the pending column top-to-bottom.
                if entry['bbox'][0] > last_xmax and entry['bbox'][0] > last_xmax_threshold:
                    if pending:
                        arranged.extend(_sorted_y(pending))
                        pending = []
            pending.append(entry)
            last = entry
        if pending:
            arranged.extend(_sorted_y(pending))
        return arranged

    sorted_layout_data = {page: _column_order(_sorted_x(blocks)) for page, blocks in data.items()}

    with open(modified_json_output_filepath, 'w') as f:
        json.dump(sorted_layout_data, f, indent=4)

    return sorted_layout_data, modified_json_output_filepath
174
+
175
@log_time_taken
def layout_detection(img_path, model, image_processor, threshold=0.6, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Run the Docling Heron (RT-DETR) layout model on one page image.

    Parameters
    ----------
    img_path : str
        Path to the page image; opened and converted to RGB.
    model : RTDetrV2ForObjectDetection
        Detection model already on *device* (caller's responsibility).
    image_processor : RTDetrImageProcessor
        Pre/post-processor matching the model.
    threshold : float
        Confidence threshold applied during post-processing.
    device : str
        'cuda' if available, else 'cpu'. NOTE: the default is evaluated
        once at import time, not per call.

    Returns
    -------
    (annotated_image, detections, results)
        BGR numpy image with boxes and labels drawn, a supervision
        ``Detections`` object, and the raw post-processed dict
        (tensors moved to CPU).

    Raises
    ------
    Exception
        Any failure is logged with the image path and re-raised.
    """
    try:
        image = Image.open(img_path).convert("RGB")

        # Preprocess the image for the model.
        inputs = image_processor(images=[image], return_tensors="pt")

        # Move inputs to the same device as the model.
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Post-process; PIL's image.size is (w, h) so reverse to (h, w).
        results = image_processor.post_process_object_detection(
            outputs,
            target_sizes=torch.tensor([image.size[::-1]], device=device),
            threshold=threshold
        )[0]

        # Move results to CPU for further processing.
        results = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in results.items()}

        # Convert to supervision Detections format for compatibility.
        xyxy = results["boxes"].numpy()
        confidence = results["scores"].numpy()
        class_id = results["labels"].numpy()
        class_name = [model.config.id2label[label_id] for label_id in class_id]

        detections = sv.Detections(
            xyxy=xyxy,
            confidence=confidence,
            class_id=class_id,
            data={"class_name": class_name}
        )

        # Custom bounding box color (Red)
        bbox_color = sv.Color(r=255, g=0, b=0)
        bounding_box_annotator = sv.BoxAnnotator(color=bbox_color)
        label_annotator = sv.LabelAnnotator()

        # Annotate the image (PIL RGB -> OpenCV BGR before drawing).
        image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        annotated_image = bounding_box_annotator.annotate(scene=image_cv, detections=detections)
        annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)

        # Release per-page tensors promptly.
        del inputs, outputs
        torch.cuda.empty_cache() if device == 'cuda' else None
        gc.collect()

        return annotated_image, detections, results

    except Exception as e:
        logger.error(f"Error in layout_detection for {img_path}: {str(e)}")
        raise
232
+
233
def enhance_dpi(image, new_dpi=300, old_dpi=150):
    """Upscale a PIL *image* by the ratio new_dpi/old_dpi using Lanczos
    resampling and return the resized copy."""
    scale = int(new_dpi) / int(old_dpi)
    target_size = (int(image.width * scale), int(image.height * scale))
    return image.resize(target_size, Image.LANCZOS)
240
+
241
def extract_text_from_bbox(image, bbox):
    """OCR the region of *image* described by *bbox* with Tesseract.

    Parameters
    ----------
    image : PIL.Image.Image | np.ndarray
        Page image; PIL images are converted to a numpy array first.
    bbox : dict
        Pixel coordinates with keys 'xmin', 'ymin', 'xmax', 'ymax'.

    Returns
    -------
    str
        Text extracted from the padded, upscaled, grayscaled crop.

    Raises
    ------
    TypeError
        If *image* is neither a PIL Image nor a numpy array.
    """
    if isinstance(image, Image.Image):
        image = np.array(image)
    elif isinstance(image, np.ndarray):
        pass
    else:
        raise TypeError("Unsupported image type. The image should be either a PIL Image or a NumPy array.")

    image_height, image_width = image.shape[:2]
    # Pad the crop (5 px vertically, 20 px horizontally), clamped to the
    # image bounds, so glyphs at the box edges are not clipped.
    ymin = max(0, int(bbox['ymin'] - 5))
    ymax = min(image_height, int(bbox['ymax'] + 5))
    xmin = max(0, int(bbox['xmin'] - 20))
    xmax = min(image_width, int(bbox['xmax'] + 20))

    cropped_image = image[ymin:ymax, xmin:xmax]
    # NOTE(review): cvtColor here assumes the array is BGR, but PIL inputs
    # converted above are RGB -- channel order may be swapped; confirm.
    cropped_image_pil = Image.fromarray(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
    # Upscale before OCR to improve Tesseract accuracy on small text.
    high_dpi_image = enhance_dpi(cropped_image_pil)
    high_dpi_image_cv = cv2.cvtColor(np.array(high_dpi_image), cv2.COLOR_RGB2BGR)
    gray_image = cv2.cvtColor(high_dpi_image_cv, cv2.COLOR_BGR2GRAY)

    # OCR engine mode 3 (default), page segmentation mode 6 (assume a
    # single uniform block of text).
    custom_config = r'--oem 3 --psm 6 -c tessedit_create_alto=1'
    extracted_text = pytesseract.image_to_string(gray_image, config=custom_config)

    return extracted_text
265
+
266
def check_extracted_text_headers(extracted_text, header_list, model_name='all-MiniLM-L6-v2', threshold=0.8):
    """Return True if any column header of *extracted_text* is semantically
    similar (cosine similarity > *threshold*) to any expected header in
    *header_list*.

    Non-DataFrame inputs are rejected immediately with False.

    NOTE(review): the SentenceTransformer model is re-loaded on every call;
    consider caching it if this runs in a hot path.
    """
    if not isinstance(extracted_text, pd.DataFrame):
        return False

    model = SentenceTransformer(model_name)
    extracted_headers = list(extracted_text.columns)
    extracted_embeddings = model.encode(extracted_headers, convert_to_tensor=True)
    header_embeddings = model.encode(header_list, convert_to_tensor=True)

    # Rows index expected headers, columns index extracted headers.
    similarity_matrix = util.pytorch_cos_sim(header_embeddings, extracted_embeddings)

    # Short-circuit on the first pair that clears the threshold.
    for i, header in enumerate(header_list):
        for j, extracted_header in enumerate(extracted_headers):
            if similarity_matrix[i][j] > threshold:
                logger.info(f"Matching header found: {extracted_header} (similar to {header})")
                return True

    logger.info("No matching headers found.")
    return False
285
+
286
def process_page(args):
    """Worker entry point: run layout detection and content extraction for
    one PDF page.

    Executed inside a ProcessPoolExecutor (spawn start method), so it
    loads its own model copy via load_model_for_process() and receives
    everything it needs in one picklable tuple.

    Packed parameters in *args*:
        page_img         rendered page image (PIL).
        current_page_num 1-based page number.
        file_name        PDF base name used to build output file names.
        pdf_images_path  directory for page images and crops.
        bbox_images_path directory for annotated (bbox-drawn) page images.

    Returns:
        (page_number: str, page_information: list[dict], class_names)
    """
    (page_img, current_page_num, file_name, pdf_images_path, bbox_images_path) = args
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    try:
        # Each worker process loads its own model instance.
        model, image_processor, class_names = load_model_for_process()
        model.to(device)  # Ensure model is on the correct device
        image = np.array(page_img)

        h, w, _ = image.shape
        page_number = str(current_page_num)

        # Persist the raw page image; layout_detection re-reads it from disk.
        img_output_filename = f"{file_name}_page_no_{page_number}.jpeg"
        img_output_filepath = os.path.join(pdf_images_path, img_output_filename)
        pil_image = Image.fromarray(image)
        pil_image.save(img_output_filepath)

        cropped_images_path = os.path.join(pdf_images_path, f"{file_name}_cropped_images")
        os.makedirs(cropped_images_path, exist_ok=True)

        bbox_image, page_detections_info, results_info = layout_detection(img_output_filepath, model, image_processor, device=device)
        logger.info(f"Processed layout detection for page {page_number}")

        # Save the annotated page image for review/debugging.
        pil_bbox_image = Image.fromarray(bbox_image)
        bbox_output_filename = f"bbox_{file_name}_page_no_{page_number}.jpeg"
        bbox_output_filepath = os.path.join(bbox_images_path, bbox_output_filename)
        pil_bbox_image.save(bbox_output_filepath)
        page_information = []

        for idx, bbox in enumerate(page_detections_info.xyxy):
            label_name = page_detections_info.data['class_name'][idx]
            class_id = page_detections_info.class_id[idx]
            score = page_detections_info.confidence[idx]

            image_height = h
            image_width = w

            # Padded coordinates (10 px margin, clamped to the page) kept
            # for the output record's 'bbox' field.
            ymin = max(0, bbox[1] - 10)
            ymax = min(image_height, bbox[3] + 10)
            xmin = max(0, bbox[0] - 10)
            xmax = min(image_width, bbox[2] + 10)

            # Unpadded integer coordinates used for the actual crops.
            new_bbox = {
                "xmin": int(bbox[0]),
                "ymin": int(bbox[1]),
                "xmax": int(bbox[2]),
                "ymax": int(bbox[3])
            }

            cropped_labels_images_path = os.path.join(cropped_images_path, f"{file_name}_{label_name}_cropped_images")
            os.makedirs(cropped_labels_images_path, exist_ok=True)

            crop_label_image_filename = f"{file_name}_label_name{label_name}_page_no_{page_number}_id_{idx + 1}.png"
            crop_label_image_filename_filepath = os.path.join(cropped_labels_images_path, crop_label_image_filename)

            crop_label_image_bbox = (new_bbox["xmin"], new_bbox["ymin"], new_bbox["xmax"], new_bbox["ymax"])
            cropped_label_pil_image = pil_image.crop(crop_label_image_bbox)
            cropped_label_pil_image.save(crop_label_image_filename_filepath)

            if label_name == 'Table':
                # Tables go through docling table-structure recognition
                # instead of plain OCR.
                crop_bbox = (new_bbox["xmin"], new_bbox["ymin"], new_bbox["xmax"], new_bbox["ymax"])
                cropped_image = pil_image.crop(crop_bbox)
                df_post_processed, df_original = tsr_inference_image(cropped_image)
                extracted_df = df_post_processed
                extracted_text = extracted_df

                if isinstance(df_original, pd.DataFrame):
                    extracted_df_markdown = df_original.to_markdown()
                else:
                    extracted_df_markdown = df_original
            else:
                extracted_text = extract_text_from_bbox(image, new_bbox)
                extracted_df_markdown = ""

            # NOTE(review): id is built by string concatenation
            # "<idx+1><page>", which can collide across pages
            # (block 11/page 1 vs block 1/page 11) -- confirm uniqueness
            # requirements.
            page_block_id = f"{str(idx + 1) + str(current_page_num)}"
            page_block_id = int(page_block_id)

            page_information.append({
                'page_block_id': page_block_id,
                'label_name': label_name,
                'pdf_page_id': current_page_num,
                'pdf_name': file_name,
                'label_id': class_id,
                'yolo_detection_confidence_score': score,
                'bbox': [xmin, ymin, xmax, ymax],
                'page_img_width': w,
                'page_img_height': h,
                'extracted_text': [extracted_text],
                "extracted_table_markdown": [extracted_df_markdown]
            })

        # Free per-page resources before the worker returns.
        del image, bbox_image, model, image_processor
        torch.cuda.empty_cache() if device == 'cuda' else None
        gc.collect()

        return page_number, page_information, class_names

    except Exception as e:
        logger.error(f"Error processing page {current_page_num}: {str(e)}")
        raise
386
+
387
@log_time_taken
def yolov10_layout_pipeline(file_name, file_path, directory_path):
    """End-to-end layout pipeline for one PDF: render pages, detect layout
    blocks per page in worker processes, then sort/persist the results and
    build the derived header/table JSON artifacts.

    Parameters
    ----------
    file_name : str
        Base name of the PDF (re-derived from *file_path* below, so the
        incoming value is effectively ignored).
    file_path : str
        Path to the input PDF; anything else raises ValueError.
    directory_path : str
        Root output directory; images and JSON subfolders are created here.

    Returns
    -------
    13-tuple of output paths and parsed data (see the return statement);
    callers unpack positionally.

    Raises
    ------
    ValueError
        If *file_path* does not end in '.pdf'.
    Exception
        Any per-page or pipeline failure is logged and re-raised.
    """
    if not file_path.lower().endswith('.pdf'):
        raise ValueError("Input file must be a PDF.")

    logger.info(f"Starting processing for {file_name}")
    start_time = datetime.now()
    # Re-derive the base name so it matches the actual file on disk.
    file_name = get_file_name_without_extension(file_path)

    pdf_images_path = os.path.join(directory_path, f"{file_name}_images")
    os.makedirs(pdf_images_path, exist_ok=True)

    bbox_images_path = os.path.join(pdf_images_path, f"{file_name}_bbox_images")
    os.makedirs(bbox_images_path, exist_ok=True)

    json_output_path = os.path.join(directory_path, f"{file_name}_json_output")
    os.makedirs(json_output_path, exist_ok=True)

    total_pages_processed = 0
    data_pdf = {}

    try:
        page_generator = convert_pdf_to_images(file_path, batch_size=20, dpi=150)

        # Collect one picklable argument tuple per page for the workers.
        page_args = []
        for pages in page_generator:
            if not pages:
                break

            for page_num, page_img in enumerate(pages):
                current_page_num = total_pages_processed + page_num + 1
                logger.info(f"Processing file {file_name}, page {current_page_num}")

                page_args.append((
                    page_img,
                    current_page_num,
                    file_name,
                    pdf_images_path,
                    bbox_images_path
                ))

            total_pages_processed += len(pages)

        logger.info(f"Total pages to process: {total_pages_processed}")
        # Fan out page processing across worker processes; results arrive
        # in completion order and are keyed by page number string.
        with ProcessPoolExecutor(max_workers=no_of_threads) as executor:
            future_to_page = {executor.submit(process_page, arg): arg[1] for arg in page_args}
            for future in as_completed(future_to_page):
                page_number = future_to_page[future]
                try:
                    result = future.result()
                    page_number, page_information, class_names = result
                    data_pdf[page_number] = page_information
                except Exception as e:
                    logger.error(f"Error processing page {page_number}: {str(e)}")
                    raise

        logger.info(f"Processed pages: {data_pdf.keys()}")
        layout_json_file_path = os.path.join(json_output_path, f"yolo_model_detections_{file_name}.json")
        user_modification_json_file_path = os.path.join(json_output_path, f"user_modified_{file_name}.json")
        tree_structured_json_output_path = os.path.join(json_output_path, f"tree_structured_headers_{file_name}.json")
        # Strip numpy types so the detections are JSON-serializable.
        data_pdf = convert_numpy(data_pdf)
        layout_list_data = filter_layout_blocks(data_pdf)

        with open(layout_json_file_path, 'w') as json_file:
            json.dump(data_pdf, json_file, indent=4)

        with open(user_modification_json_file_path, 'w') as json_file:
            json.dump(data_pdf, json_file, indent=4)

        # NOTE: filter_and_sort_headers/layouts rewrite the two JSON files
        # just written above with the sorted block order.
        sorted_data, modified_json_output_filepath = filter_and_sort_headers(data_pdf, user_modification_json_file_path)
        tree_structured_organized_json_data = tree_structured_headers_pipeline(user_modification_json_file_path, tree_structured_json_output_path)
        sorted_layout_data, sorted_layout_json_filepath = filter_and_sort_layouts(data_pdf, layout_json_file_path)

        filtered_table_header_data, filtered_table_header_data_json_path = put_table_header_pipeline(user_modification_json_file_path, json_output_path, file_name)
        end_time = datetime.now()

        logger.info(f"Processed {file_name} from {start_time} to {end_time}, duration: {end_time - start_time}")
        logger.info(f"JSON file created at: {modified_json_output_filepath}")
        # NOTE(review): class_names comes from whichever worker result was
        # processed last; assumed identical across workers -- confirm.
        return (
            json_output_path,
            layout_list_data,
            class_names,
            sorted_data,
            modified_json_output_filepath,
            pdf_images_path,
            file_name,
            sorted_layout_data,
            sorted_layout_json_filepath,
            tree_structured_organized_json_data,
            tree_structured_json_output_path,
            filtered_table_header_data,
            filtered_table_header_data_json_path
        )

    except Exception as e:
        logger.error(f"Error in yolov10_layout_pipeline: {str(e)}")
        raise
    finally:
        # Ensure GPU memory is cleared
        torch.cuda.empty_cache() if torch.cuda.is_available() else None
        gc.collect()
488
+
489
# Example usage / ad-hoc smoke test: runs the full layout pipeline on a
# hard-coded PDF.  Paths are developer-machine specific.
if __name__ == "__main__":
    pdf_path = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/Flexstone_Investor_Report_Test.pdf"
    output_directory = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/clearstreet_docs/iqeq_docling_heron_bbox_images"
    file_name = get_file_name_without_extension(pdf_path)
    yolov10_layout_pipeline(file_name, pdf_path, output_directory)
495
+
496
+
497
+
layout_detection_docling_heron (2).py ADDED
@@ -0,0 +1,497 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import cv2
2
+ import os
3
+ import supervision as sv # pip install supervision
4
+ from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
5
+ from pdf2image import convert_from_path
6
+ import numpy as np
7
+ from PIL import Image
8
+ import json
9
+ import pytesseract
10
+ import pandas as pd
11
+ from sentence_transformers import SentenceTransformer, util
12
+ from PyPDF2 import PdfReader
13
+ from datetime import datetime
14
+ import torch
15
+ import logging
16
+ from utils.utils_code import log_time_taken
17
+ from concurrent.futures import ProcessPoolExecutor, as_completed
18
+ import multiprocessing
19
+ import sys
20
+ import gc
21
+
22
+ from src.table_processing.tree_structured_json import tree_structured_headers_pipeline
23
+ from config.set_config import set_configuration
24
+ set_config_project = set_configuration()
25
+ layout_model_weights_path = set_config_project.layout_model_weights_path
26
+ no_of_threads = set_config_project.no_of_threads
27
+ from src.docling.ttsr_docling import tsr_inference_image, tsr_inference
28
+ from src.table_processing.table_classification_extraction import process_table_classification_extraction_pipeline
29
+ from src.table_processing.put_table_header import put_table_header_pipeline
30
+ import gc
31
+ from src.layout_detection.load_model import load_model_for_process
32
+
33
+ # Set multiprocessing start method
34
+ multiprocessing.set_start_method('spawn', force=True)
35
+ logger = logging.getLogger(__name__)
36
+
37
+ # Configure logging
38
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
39
+
40
def load_torch(version):
    """Prepend a version-specific torch build directory to sys.path and return torch.

    NOTE(review): ``torch`` is already imported at the top of this module, so the
    ``import torch`` below returns the cached module object — the sys.path
    insertion only influences the *first* import in a fresh process. Confirm this
    function actually selects the intended build before relying on it.
    """
    if version == "2.2.2":
        sys.path.insert(0, "./torch_2_2_2")
    elif version == "2.6.0":
        sys.path.insert(0, "./torch_2_6_0")
    # Any other version string falls through with no path change.
    import torch
    logger.info(f"Using Torch Version: {torch.__version__}")
    return torch

# Rebind the module-level name so subsequent code uses the selected build.
torch = load_torch("2.2.2")
50
+
51
def get_file_name_without_extension(file_path):
    """Return the base name of *file_path* with its final extension stripped."""
    base_name = os.path.basename(file_path)
    stem, _extension = os.path.splitext(base_name)
    return stem
55
+
56
def convert_numpy(data):
    """Recursively convert numpy scalars/arrays and pandas DataFrames into
    plain JSON-serializable Python objects (dict/list/int/float).

    Unrecognized values are returned unchanged.
    """
    if isinstance(data, dict):
        return {name: convert_numpy(value) for name, value in data.items()}
    if isinstance(data, list):
        return [convert_numpy(element) for element in data]
    if isinstance(data, np.integer):
        return int(data)
    if isinstance(data, np.floating):
        return float(data)
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, pd.DataFrame):
        # One dict per row, keyed by column name.
        return data.to_dict(orient='records')
    return data
71
+
72
def filter_layout_blocks(input_data):
    """Flatten the per-page lists of layout blocks into one list.

    Args:
        input_data: Mapping of page key -> list of block dicts.

    Returns:
        A single list containing every block, in page-iteration order.
    """
    # The original built a throwaway copy via `[block for block in blocks]`
    # before extending; a flat comprehension does the same in one pass.
    return [block for blocks in input_data.values() for block in blocks]
77
+
78
def convert_pdf_to_images(file_path, batch_size=20, dpi=100):
    """Render every page of a PDF and return a generator of page batches.

    Note: all pages are rasterized eagerly up front; only the batching is lazy.
    """
    pages = convert_from_path(file_path, dpi=dpi)
    page_count = len(pages)

    def batches():
        # Yield consecutive slices of at most `batch_size` pages.
        for offset in range(0, page_count, batch_size):
            yield pages[offset:offset + batch_size]

    return batches()
88
+
89
def read_json(json_file):
    """Load and return the parsed JSON content of *json_file*.

    Args:
        json_file: Path to a JSON file.

    Returns:
        The deserialized Python object.
    """
    # Explicit UTF-8 avoids platform-dependent default encodings (e.g. Windows).
    with open(json_file, 'r', encoding='utf-8') as file:
        return json.load(file)
92
+
93
def filter_and_sort_headers(data, modified_json_output_filepath):
    """Reorder each page's header blocks into column-wise reading order and
    persist the result as JSON.

    Blocks are first sorted left-to-right by bbox xmin; runs of horizontally
    overlapping blocks ("columns") are then emitted top-to-bottom.

    Args:
        data: Mapping of page key -> list of block dicts with a ``bbox``
            [xmin, ymin, xmax, ymax] entry.
        modified_json_output_filepath: Destination JSON path.

    Returns:
        Tuple of (reordered mapping, output file path).
    """
    def sort_blocks_by_min_x(blocks):
        return sorted(blocks, key=lambda block: block['bbox'][0])

    def sort_blocks_by_min_y(blocks):
        return sorted(blocks, key=lambda block: block['bbox'][1])

    def find_headers_and_group(sorted_blocks):
        headers_list = []
        current_group = []
        previous_block = None

        for i, block in enumerate(sorted_blocks):
            if previous_block:
                prev_xmax = previous_block['bbox'][2]
                # NOTE(review): this "threshold" is just int(prev_xmax), so the
                # second comparison is effectively redundant with the first —
                # confirm whether a real tolerance was intended here.
                prev_xmax_threshold = int(previous_block['bbox'][2])
                if block['bbox'][0] > prev_xmax and block['bbox'][0] > prev_xmax_threshold:
                    # A horizontal gap starts a new column: flush the current
                    # column top-to-bottom before starting the next.
                    if current_group:
                        headers_list.extend(sort_blocks_by_min_y(current_group))
                        current_group = []
            current_group.append(block)
            previous_block = block

        # Flush the final column.
        if current_group:
            headers_list.extend(sort_blocks_by_min_y(current_group))

        return headers_list

    result = {}
    for key, blocks in data.items():
        sorted_blocks = sort_blocks_by_min_x(blocks)
        sorted_headers = find_headers_and_group(sorted_blocks)
        result[key] = sorted_headers

    sorted_data = result
    with open(modified_json_output_filepath, 'w') as f:
        json.dump(sorted_data, f, indent=4)

    return sorted_data, modified_json_output_filepath
132
+
133
def filter_and_sort_layouts(data, modified_json_output_filepath):
    """Reorder each page's layout blocks into column-wise reading order
    (left-to-right columns, top-to-bottom within a column) and write the
    result to *modified_json_output_filepath* as JSON.

    Returns (reordered mapping, output file path).
    """

    def _by_min_x(blocks):
        return sorted(blocks, key=lambda b: b['bbox'][0])

    def _by_min_y(blocks):
        return sorted(blocks, key=lambda b: b['bbox'][1])

    def _group_columns(blocks_sorted):
        ordered = []
        pending = []
        last_block = None
        for blk in blocks_sorted:
            if last_block is not None:
                last_xmax = last_block['bbox'][2]
                # A block starting strictly right of the previous block's xmax
                # begins a new column; flush the current column top-to-bottom.
                if blk['bbox'][0] > last_xmax and blk['bbox'][0] > int(last_xmax):
                    if pending:
                        ordered.extend(_by_min_y(pending))
                        pending = []
            pending.append(blk)
            last_block = blk
        if pending:
            ordered.extend(_by_min_y(pending))
        return ordered

    sorted_layout_data = {
        page: _group_columns(_by_min_x(blocks)) for page, blocks in data.items()
    }

    with open(modified_json_output_filepath, 'w') as f:
        json.dump(sorted_layout_data, f, indent=4)

    return sorted_layout_data, modified_json_output_filepath
172
+
173
@log_time_taken
def layout_detection(img_path, model, image_processor, threshold=0.6, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Run Docling Heron (RT-DETR) layout detection on one page image.

    Args:
        img_path: Path to the page image file.
        model: Loaded RTDetrV2ForObjectDetection instance.
        image_processor: Matching RTDetrImageProcessor.
        threshold: Confidence cutoff for kept detections.
        device: Inference device. NOTE(review): this default is evaluated once
            at import time, not per call — confirm that is intended.

    Returns:
        Tuple of (annotated BGR image, supervision Detections, raw results dict).

    Raises:
        Re-raises any exception after logging it.
    """
    try:
        image = Image.open(img_path).convert("RGB")

        # Process image with the Docling Heron model
        inputs = image_processor(images=[image], return_tensors="pt")

        # Move inputs to the same device as the model
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Post-process the results (target size is (height, width), hence [::-1])
        results = image_processor.post_process_object_detection(
            outputs,
            target_sizes=torch.tensor([image.size[::-1]], device=device),
            threshold=threshold
        )[0]

        # Move results to CPU for further processing
        results = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in results.items()}

        # Convert to supervision Detections format for compatibility
        xyxy = results["boxes"].numpy()
        confidence = results["scores"].numpy()
        class_id = results["labels"].numpy()
        class_name = [model.config.id2label[label_id] for label_id in class_id]

        detections = sv.Detections(
            xyxy=xyxy,
            confidence=confidence,
            class_id=class_id,
            data={"class_name": class_name}
        )

        # Custom bounding box color (Red)
        bbox_color = sv.Color(r=255, g=0, b=0)
        bounding_box_annotator = sv.BoxAnnotator(color=bbox_color)
        label_annotator = sv.LabelAnnotator()

        # Annotate the image (supervision expects an OpenCV-style BGR array)
        image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        annotated_image = bounding_box_annotator.annotate(scene=image_cv, detections=detections)
        annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)

        # Clean up intermediate tensors to limit GPU memory growth
        del inputs, outputs
        torch.cuda.empty_cache() if device == 'cuda' else None
        gc.collect()

        return annotated_image, detections, results

    except Exception as e:
        logger.error(f"Error in layout_detection for {img_path}: {str(e)}")
        raise
230
+
231
def enhance_dpi(image, new_dpi=300, old_dpi=150):
    """Upscale *image* so its pixel density matches *new_dpi* relative to
    *old_dpi*, using high-quality Lanczos resampling.

    Returns the resized PIL image (the input is not modified).
    """
    ratio = int(new_dpi) / int(old_dpi)
    target_size = (int(image.width * ratio), int(image.height * ratio))
    return image.resize(target_size, Image.LANCZOS)
238
+
239
def extract_text_from_bbox(image, bbox):
    """OCR the region of *image* described by *bbox* with Tesseract.

    The crop is padded (5px vertically, 20px horizontally), upscaled via
    enhance_dpi, converted to grayscale, and passed to pytesseract.

    Args:
        image: PIL Image or NumPy array of the full page.
        bbox: Dict with xmin/ymin/xmax/ymax pixel coordinates.

    Returns:
        The extracted text string.

    Raises:
        TypeError: If *image* is neither a PIL Image nor a NumPy array.
    """
    if isinstance(image, Image.Image):
        image = np.array(image)
    elif isinstance(image, np.ndarray):
        pass
    else:
        raise TypeError("Unsupported image type. The image should be either a PIL Image or a NumPy array.")

    image_height, image_width = image.shape[:2]
    # Pad the box slightly (clamped to image bounds) so glyph edges aren't cut.
    ymin = max(0, int(bbox['ymin'] - 5))
    ymax = min(image_height, int(bbox['ymax'] + 5))
    xmin = max(0, int(bbox['xmin'] - 20))
    xmax = min(image_width, int(bbox['xmax'] + 20))

    cropped_image = image[ymin:ymax, xmin:xmax]
    # NOTE(review): this conversion assumes the array is BGR-ordered; arrays
    # coming straight from PIL are RGB, so red/blue channels may be swapped
    # here — harmless for the later grayscale step, but confirm intent.
    cropped_image_pil = Image.fromarray(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
    high_dpi_image = enhance_dpi(cropped_image_pil)
    high_dpi_image_cv = cv2.cvtColor(np.array(high_dpi_image), cv2.COLOR_RGB2BGR)
    gray_image = cv2.cvtColor(high_dpi_image_cv, cv2.COLOR_BGR2GRAY)

    # LSTM engine (--oem 3), uniform text block (--psm 6).
    custom_config = r'--oem 3 --psm 6 -c tessedit_create_alto=1'
    extracted_text = pytesseract.image_to_string(gray_image, config=custom_config)

    return extracted_text
263
+
264
def check_extracted_text_headers(extracted_text, header_list, model_name='all-MiniLM-L6-v2', threshold=0.8):
    """Return True if any DataFrame column name is semantically similar to any
    expected header in *header_list* (cosine similarity above *threshold*).

    Args:
        extracted_text: Expected to be a pandas DataFrame; anything else
            returns False immediately.
        header_list: Reference header strings to match against.
        model_name: SentenceTransformer model used for embeddings.
        threshold: Cosine-similarity cutoff.
    """
    if not isinstance(extracted_text, pd.DataFrame):
        return False

    # NOTE(review): the SentenceTransformer model is re-loaded on every call;
    # consider caching it at module level if this runs per table.
    model = SentenceTransformer(model_name)
    extracted_headers = list(extracted_text.columns)
    extracted_embeddings = model.encode(extracted_headers, convert_to_tensor=True)
    header_embeddings = model.encode(header_list, convert_to_tensor=True)

    # Rows = reference headers, columns = extracted headers.
    similarity_matrix = util.pytorch_cos_sim(header_embeddings, extracted_embeddings)

    for i, header in enumerate(header_list):
        for j, extracted_header in enumerate(extracted_headers):
            if similarity_matrix[i][j] > threshold:
                logger.info(f"Matching header found: {extracted_header} (similar to {header})")
                return True

    logger.info("No matching headers found.")
    return False
283
+
284
def process_page(args):
    """Worker entry point: run layout detection + extraction on one PDF page.

    Designed for ProcessPoolExecutor — the model is loaded inside the worker
    to avoid sharing CUDA state across processes.

    Args:
        args: Tuple of (page PIL image, 1-based page number, file name,
            directory for page images, directory for annotated bbox images).

    Returns:
        Tuple of (page number as str, list of per-block info dicts,
        class-name mapping, path of the last Table crop directory or None).

    Raises:
        Re-raises any exception after logging it.
    """
    (page_img, current_page_num, file_name, pdf_images_path, bbox_images_path) = args
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    try:
        # Load the detector in-process (safe for spawned workers).
        model, image_processor, class_names = load_model_for_process()
        model.to(device)  # Ensure model is on the correct device
        image = np.array(page_img)

        h, w, _ = image.shape
        page_number = str(current_page_num)

        # Persist the raw page image; layout_detection re-reads it from disk.
        img_output_filename = f"{file_name}_page_no_{page_number}.jpeg"
        img_output_filepath = os.path.join(pdf_images_path, img_output_filename)
        pil_image = Image.fromarray(image)
        pil_image.save(img_output_filepath)

        cropped_images_path = os.path.join(pdf_images_path, f"{file_name}_cropped_images")
        os.makedirs(cropped_images_path, exist_ok=True)

        bbox_image, page_detections_info, results_info = layout_detection(img_output_filepath, model, image_processor, device=device)
        logger.info(f"Processed layout detection for page {page_number}")

        # Save the annotated (bounding-boxed) page for debugging/review.
        pil_bbox_image = Image.fromarray(bbox_image)
        bbox_output_filename = f"bbox_{file_name}_page_no_{page_number}.jpeg"
        bbox_output_filepath = os.path.join(bbox_images_path, bbox_output_filename)
        pil_bbox_image.save(bbox_output_filepath)
        page_information = []
        table_cropped_directory = None

        for idx, bbox in enumerate(page_detections_info.xyxy):
            label_name = page_detections_info.data['class_name'][idx]
            class_id = page_detections_info.class_id[idx]
            score = page_detections_info.confidence[idx]

            image_height = h
            image_width = w

            # Padded coordinates (10px) used only in the reported 'bbox' field;
            # the actual crops below use the unpadded new_bbox.
            ymin = max(0, bbox[1] - 10)
            ymax = min(image_height, bbox[3] + 10)
            xmin = max(0, bbox[0] - 10)
            xmax = min(image_width, bbox[2] + 10)

            new_bbox = {
                "xmin": int(bbox[0]),
                "ymin": int(bbox[1]),
                "xmax": int(bbox[2]),
                "ymax": int(bbox[3])
            }

            cropped_labels_images_path = os.path.join(cropped_images_path, f"{file_name}_{label_name}_cropped_images")
            os.makedirs(cropped_labels_images_path, exist_ok=True)

            crop_label_image_filename = f"{file_name}_label_name{label_name}_page_no_{page_number}_id_{idx + 1}.png"
            crop_label_image_filename_filepath = os.path.join(cropped_labels_images_path, crop_label_image_filename)

            crop_label_image_bbox = (new_bbox["xmin"], new_bbox["ymin"], new_bbox["xmax"], new_bbox["ymax"])
            cropped_label_pil_image = pil_image.crop(crop_label_image_bbox)
            cropped_label_pil_image.save(crop_label_image_filename_filepath)

            if label_name == 'Table':
                # Tables go through table-structure recognition instead of OCR.
                crop_bbox = (new_bbox["xmin"], new_bbox["ymin"], new_bbox["xmax"], new_bbox["ymax"])
                cropped_image = pil_image.crop(crop_bbox)
                df_post_processed, df_original = tsr_inference_image(cropped_image)
                extracted_df = df_post_processed
                extracted_text = extracted_df
                # Only the most recent Table directory is returned for the page.
                table_cropped_directory = cropped_labels_images_path

                if isinstance(df_original, pd.DataFrame):
                    extracted_df_markdown = df_original.to_markdown()
                else:
                    extracted_df_markdown = df_original
            else:
                extracted_text = extract_text_from_bbox(image, new_bbox)
                extracted_df_markdown = ""

            # Block id is the string concat of (idx+1) and the page number,
            # parsed back to int — e.g. block 2 on page 13 -> 213.
            page_block_id = f"{str(idx + 1) + str(current_page_num)}"
            page_block_id = int(page_block_id)

            page_information.append({
                'page_block_id': page_block_id,
                'label_name': label_name,
                'pdf_page_id': current_page_num,
                'pdf_name': file_name,
                'label_id': class_id,
                'yolo_detection_confidence_score': score,
                'bbox': [xmin, ymin, xmax, ymax],
                'page_img_width': w,
                'page_img_height': h,
                'extracted_text': [extracted_text],
                "extracted_table_markdown": [extracted_df_markdown]
            })

        # Clean up
        del image, bbox_image, model, image_processor
        torch.cuda.empty_cache() if device == 'cuda' else None
        gc.collect()

        return page_number, page_information, class_names,table_cropped_directory

    except Exception as e:
        logger.error(f"Error processing page {current_page_num}: {str(e)}")
        raise
386
+
387
@log_time_taken
def yolov10_layout_pipeline(file_name, file_path, directory_path):
    """End-to-end layout pipeline for a single PDF: rasterize pages, detect
    layout blocks per page in parallel worker processes, then derive and
    persist the sorted/tree-structured JSON outputs.

    Args:
        file_name: Display name (recomputed from *file_path* below, so the
            passed value is effectively ignored).
        file_path: Path to the input PDF (must end in .pdf).
        directory_path: Root output directory for images and JSON files.

    Returns:
        A 14-tuple of output paths and derived data structures.

    Raises:
        ValueError: If the input is not a PDF.
        Re-raises any processing exception after logging.

    NOTE(review): if the PDF yields zero pages, ``class_names`` and
    ``cropped_tables_images_dir_path`` are never assigned and the return
    statement raises NameError — confirm and guard if empty PDFs can occur.
    """
    if not file_path.lower().endswith('.pdf'):
        raise ValueError("Input file must be a PDF.")

    logger.info(f"Starting processing for {file_name}")
    start_time = datetime.now()
    file_name = get_file_name_without_extension(file_path)

    pdf_images_path = os.path.join(directory_path, f"{file_name}_images")
    os.makedirs(pdf_images_path, exist_ok=True)

    bbox_images_path = os.path.join(pdf_images_path, f"{file_name}_bbox_images")
    os.makedirs(bbox_images_path, exist_ok=True)

    json_output_path = os.path.join(directory_path, f"{file_name}_json_output")
    os.makedirs(json_output_path, exist_ok=True)

    total_pages_processed = 0
    data_pdf = {}

    try:
        page_generator = convert_pdf_to_images(file_path, batch_size=20, dpi=150)

        # Collect one argument tuple per page for the worker pool.
        page_args = []
        for pages in page_generator:
            if not pages:
                break

            for page_num, page_img in enumerate(pages):
                current_page_num = total_pages_processed + page_num + 1
                logger.info(f"Processing file {file_name}, page {current_page_num}")

                page_args.append((
                    page_img,
                    current_page_num,
                    file_name,
                    pdf_images_path,
                    bbox_images_path
                ))

            total_pages_processed += len(pages)

        logger.info(f"Total pages to process: {total_pages_processed}")
        with ProcessPoolExecutor(max_workers=no_of_threads) as executor:
            future_to_page = {executor.submit(process_page, arg): arg[1] for arg in page_args}
            for future in as_completed(future_to_page):
                page_number = future_to_page[future]
                try:
                    result = future.result()
                    page_number, page_information, class_names,cropped_tables_images_dir_path = result
                    data_pdf[page_number] = page_information
                except Exception as e:
                    logger.error(f"Error processing page {page_number}: {str(e)}")
                    raise

        logger.info(f"Processed pages: {data_pdf.keys()}")
        layout_json_file_path = os.path.join(json_output_path, f"yolo_model_detections_{file_name}.json")
        user_modification_json_file_path = os.path.join(json_output_path, f"user_modified_{file_name}.json")
        tree_structured_json_output_path = os.path.join(json_output_path, f"tree_structured_headers_{file_name}.json")
        # Convert numpy/pandas values so the page data is JSON-serializable.
        data_pdf = convert_numpy(data_pdf)
        layout_list_data = filter_layout_blocks(data_pdf)

        # Replace the existing JSON writing blocks in the yolov10_layout_pipeline function with the following:

        # Both files start from the same page-sorted detection data; the
        # "user_modified" copy is later rewritten by filter_and_sort_headers.
        with open(layout_json_file_path, 'w') as json_file:
            json.dump({int(k): v for k, v in sorted(data_pdf.items(), key=lambda x: int(x[0]))}, json_file, indent=4)

        with open(user_modification_json_file_path, 'w') as json_file:
            json.dump({int(k): v for k, v in sorted(data_pdf.items(), key=lambda x: int(x[0]))}, json_file, indent=4)

        sorted_data, modified_json_output_filepath = filter_and_sort_headers(data_pdf, user_modification_json_file_path)
        tree_structured_organized_json_data = tree_structured_headers_pipeline(user_modification_json_file_path, tree_structured_json_output_path)
        sorted_layout_data, sorted_layout_json_filepath = filter_and_sort_layouts(data_pdf, layout_json_file_path)

        filtered_table_header_data, filtered_table_header_data_json_path = put_table_header_pipeline(user_modification_json_file_path, json_output_path, file_name)
        end_time = datetime.now()

        logger.info(f"Processed {file_name} from {start_time} to {end_time}, duration: {end_time - start_time}")
        logger.info(f"JSON file created at: {modified_json_output_filepath}")
        return (
            json_output_path,
            layout_list_data,
            class_names,
            sorted_data,
            modified_json_output_filepath,
            pdf_images_path,
            file_name,
            sorted_layout_data,
            sorted_layout_json_filepath,
            tree_structured_organized_json_data,
            tree_structured_json_output_path,
            filtered_table_header_data,
            filtered_table_header_data_json_path,
            cropped_tables_images_dir_path
        )

    except Exception as e:
        logger.error(f"Error in yolov10_layout_pipeline: {str(e)}")
        raise
    finally:
        # Ensure GPU memory is cleared
        torch.cuda.empty_cache() if torch.cuda.is_available() else None
        gc.collect()
491
+
492
# Example usage
if __name__ == "__main__":
    # NOTE(review): hard-coded absolute paths from a development machine —
    # parameterize via CLI arguments or environment variables before reuse.
    pdf_path = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/Flexstone_Investor_Report_Test.pdf"
    output_directory = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/clearstreet_docs/iqeq_docling_heron_bbox_images"
    file_name = get_file_name_without_extension(pdf_path)
    yolov10_layout_pipeline(file_name, pdf_path, output_directory)
load_model (1).py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # from ultralytics import YOLOv10
3
+ import torch
4
+ from config.set_config import set_configuration
5
+
6
+ set_config_project = set_configuration()
7
+ layout_model_weights_path = set_config_project.layout_model_weights_path
8
+ no_of_threads = set_config_project.no_of_threads
9
+
10
+ # def load_model_for_process(detection_model_path=layout_model_weights_path):
11
+ # """
12
+ # Load model in each subprocess to avoid CUDA initialization issues
13
+
14
+ # Returns:
15
+ # Model loaded in appropriate device
16
+ # """
17
+ # # Your model loading logic
18
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
19
+ # # print(f"Using device: {device}")
20
+
21
+ # model = YOLOv10(detection_model_path).to(device)
22
+ # class_names = model.names
23
+ # class_names["11"] = "Table-header"
24
+ # class_names["12"] = "Portfolio-Company-Table"
25
+
26
+ # return model, class_names
27
+
28
+ import torch
29
+
30
+ from ultralytics import YOLO
31
+
32
+ # def load_model_for_process(detection_model_path=layout_model_weights_path):
33
+ # """
34
+ # Load model in each subprocess to avoid CUDA initialization issues
35
+
36
+ # Returns:
37
+ # Model loaded in appropriate device
38
+ # """
39
+ # # Your model loading logic
40
+ # device = "cuda" if torch.cuda.is_available() else "cpu"
41
+ # # print(f"Using device: {device}")
42
+
43
+ # model = YOLO(detection_model_path).to(device)
44
+ # class_names = model.names
45
+ # class_names["11"] = "Table-header"
46
+ # class_names["12"] = "Portfolio-Company-Table"
47
+ # print("YOLOV12"*10)
48
+
49
+ # return model, class_names
50
+
51
+
52
+ '''Below code for docling heron model'''
53
+
54
+ from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
55
+ # MODEL_NAME_DOCLING = "ds4sd/docling-layout-heron"
56
+ MODEL_NAME_DOCLING = layout_model_weights_path
57
+
58
def load_model_for_process(model_name=MODEL_NAME_DOCLING):
    """
    Load the Docling Heron layout model and its image processor inside the
    calling (sub)process to avoid CUDA initialization issues across forks.

    Returns:
        Tuple of (model, image_processor, class_names) where class_names maps
        integer label ids to human-readable layout class names.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Processor first, then the detector moved onto the chosen device.
    image_processor = RTDetrImageProcessor.from_pretrained(model_name)
    model = RTDetrV2ForObjectDetection.from_pretrained(model_name).to(device)

    label_names = [
        "Caption", "Footnote", "Formula", "List-item", "Page-footer",
        "Page-header", "Picture", "Section-header", "Table", "Text",
        "Title", "Document Index", "Code", "Checkbox-Selected",
        "Checkbox-Unselected", "Form", "Key-Value Region",
        # Additional classes for compatibility with the existing pipeline.
        "Table-header", "Portfolio-Company-Table",
    ]
    class_names = dict(enumerate(label_names))

    return model, image_processor, class_names
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
ovis_config.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import math
3
+ import random
4
+ import logging
5
+ import cv2
6
+ import numpy as np
7
+ from PIL import Image
8
+ from transformers import AutoModelForCausalLM
9
+
10
+ # Setup logger with proper configuration
11
+ logger = logging.getLogger("OvisModel")
12
+ logger.setLevel(logging.DEBUG)
13
+
14
+ # Create console handler if not already exists
15
+ if not logger.handlers:
16
+ console_handler = logging.StreamHandler()
17
+ console_handler.setLevel(logging.DEBUG)
18
+
19
+ # Create formatter
20
+ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
21
+ console_handler.setFormatter(formatter)
22
+
23
+ # Add handler to logger
24
+ logger.addHandler(console_handler)
25
+
26
+ # ─── Load model & tokenizers once ─────────────────────────────────────────────
27
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
28
+ MODEL_NAME = "AIDC-AI/Ovis2.5-9B"
29
+
30
+ _model = AutoModelForCausalLM.from_pretrained(
31
+ MODEL_NAME,
32
+ torch_dtype=torch.bfloat16,
33
+ multimodal_max_length=32768,
34
+ trust_remote_code=True
35
+ ).to(DEVICE)
36
+
37
def _preprocess_image(img, max_size=1024):
    """
    Complete image preprocessing for OVIS model including:
    1. Format conversion
    2. Denoising and thresholding
    3. Resizing for optimal model performance

    Args:
        img: File path or PIL Image.
        max_size: Longest allowed edge in pixels after resizing.

    Returns:
        A preprocessed RGB PIL image (binarized black/white content).
    """
    if isinstance(img, str):
        img = Image.open(img).convert("RGB")

    # Log original size
    original_size = img.size  # (width, height)
    logger.info(f"Original image size: {original_size[0]}x{original_size[1]} (WxH)")

    # Convert to grayscale and apply denoising + thresholding for better OCR-like processing.
    # Denoise *before* Otsu so noise doesn't skew the automatic threshold.
    img_array = np.array(img.convert("L"))
    img_array = cv2.fastNlMeansDenoising(img_array, h=30)
    _, img_array = cv2.threshold(img_array, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Convert back to PIL for resizing (model expects RGB input)
    processed_img = Image.fromarray(img_array).convert("RGB")

    # Downscale only if the longest edge exceeds max_size; aspect ratio preserved.
    w, h = processed_img.size
    if max(w, h) > max_size:
        scale = max_size / max(w, h)
        new_size = (int(w * scale), int(h * scale))
        processed_img = processed_img.resize(new_size, Image.LANCZOS)
        logger.info(f"Image resized from {w}x{h} to {new_size[0]}x{new_size[1]} (WxH), scale factor: {scale:.3f}")
    else:
        logger.info(f"Image size {w}x{h} (WxH) - no resizing needed")

    return processed_img
70
+
71
def _run_inference(imgs, prompt_text, max_new_tokens):
    """Run one OVIS multimodal generation pass.

    Args:
        imgs: None, a single image (path or PIL), or a list thereof.
            Only the FIRST image is actually used.
        prompt_text: Text prompt appended after the image content.
        max_new_tokens: Generation length cap.

    Returns:
        Tuple of (decoded text, confidence float in roughly [0.8, 0.99]).

    Raises:
        TypeError: For unsupported image types.
        ValueError: If neither text nor an image is given.
    """
    messages_content = []

    if imgs:
        if not isinstance(imgs, list):
            imgs = [imgs]

        # Limit to only 1 image for processing
        if len(imgs) > 1:
            imgs = imgs[:1]
            # NOTE(review): truncation happens before this log, so len(imgs)
            # is always 1 here and the original count is never reported.
            logger.info(f"Limited to processing first 1 out of {len(imgs)} images for OVIS inference")
        logger.info(f"Processing {len(imgs)} image(s) for OVIS inference")

        # ✅ Open and preprocess image(s) properly
        pil_imgs = []
        for img in imgs:
            if isinstance(img, str):
                pil_img = _preprocess_image(img)  # Open + preprocess path
            elif isinstance(img, Image.Image):
                pil_img = _preprocess_image(img)
            else:
                raise TypeError(f"Unsupported image type: {type(img)}")
            pil_imgs.append(pil_img)

        # Add preprocessed image(s)
        messages_content.extend([{"type": "image", "image": img} for img in pil_imgs])

    # Add text prompt
    if prompt_text:
        messages_content.append({"type": "text", "text": prompt_text})

    if not messages_content:
        raise ValueError("You must provide at least text or one image.")

    messages = [{"role": "user", "content": messages_content}]

    input_ids, pixel_values, grid_thws = _model.preprocess_inputs(
        messages=messages,
        add_generation_prompt=True
    )

    input_ids = input_ids.to(DEVICE)
    pixel_values = pixel_values.to(DEVICE, dtype=_model.dtype) if pixel_values is not None else None
    grid_thws = grid_thws.to(DEVICE) if grid_thws is not None else None

    with torch.inference_mode():
        outputs = _model.generate(
            inputs=input_ids,
            pixel_values=pixel_values,
            grid_thws=grid_thws,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding for deterministic output
            eos_token_id=_model.text_tokenizer.eos_token_id,
            pad_token_id=_model.text_tokenizer.pad_token_id,
            return_dict_in_generate=True,
            output_scores=True
        )

    decoded = _model.text_tokenizer.decode(outputs.sequences[0], skip_special_tokens=True).strip()

    # Per-step score of each generated token.
    gen_len = len(outputs.scores)
    generated_ids = outputs.sequences[0][-gen_len:]
    top_probs = [
        float(score[0, token_id].item())
        for score, token_id in zip(outputs.scores, generated_ids)
    ]
    # NOTE(review): `outputs.scores` holds raw logits, not probabilities —
    # math.log() on a non-positive logit raises ValueError. Confirm whether a
    # softmax (or generate's compute_transition_scores) was intended here.
    confidence = math.exp(sum(math.log(p) for p in top_probs) / len(top_probs))
    # NOTE(review): empirical rescaling; for confidence near 0-1 this yields
    # ~1.48, which fails the range check below — verify the formula.
    confidence = (100 - confidence) * 0.015

    torch.cuda.empty_cache()

    # Values outside (0.8, 0.99) are replaced by a random placeholder score.
    if confidence < 0.99 and confidence > 0.8:
        return decoded, round(confidence, 2)
    else:
        return decoded, random.uniform(0.8, 0.85)
+ return decoded, random.uniform(0.8, 0.85)
146
+
147
+
148
+
post_process_portfolio_company_json 2.py ADDED
@@ -0,0 +1,402 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from fuzzywuzzy import fuzz
4
+ from typing import List, Dict, Any
5
+ from src.iqeq_modification.company_name_extraction_by_ovis import extract_company_names
6
+
7
+ PORTFOLIO_COMPANY_LIST_IDENTIFIER = ["column_1","portfolio company or platforms","\u20acm","$m","Unrealised fair market valuation","Realised proceeds in the period","Portfolio Company or Platforms","portfolio company", "active investment", "realized/unrealized company","Realized Company","Unrealized Company", "quoted/unquoted company", "portfolio investment", "portfolio company"]
8
+ FUZZY_MATCH_THRESHOLD = 30
9
+ EXCLUDE_COMPANY_NAMES = ["total", "subtotal","Total","Investments","Fund"]
10
+
11
+
12
def get_file_name_without_extension(file_path: str) -> str:
    """Return the base name of *file_path* with its final extension removed."""
    base = os.path.basename(file_path)
    stem, _ = os.path.splitext(base)
    return stem
15
+
16
def fuzzy_match(text: str, patterns: List[str], threshold: int = FUZZY_MATCH_THRESHOLD) -> bool:
    """Return True if *text* fuzzy-matches any pattern at or above *threshold*.

    Args:
        text: Candidate string (coerced to str and lowercased).
        patterns: Reference strings to compare against.
        threshold: Minimum ``fuzz.partial_ratio`` score (0-100) to count
            as a match. NOTE: the module default of 30 is very lenient.

    Returns:
        True on the first pattern that reaches the threshold, else False.
    """
    lowered = str(text).lower()
    # any() short-circuits on the first match, like the original loop.
    return any(
        fuzz.partial_ratio(lowered, pattern.lower()) >= threshold
        for pattern in patterns
    )
23
+
24
def extract_portfolio_companies_from_table(table_data: Dict) -> List[str]:
    """Extract company names from a portfolio company table.

    Locates the first column whose header fuzzy-matches a known
    portfolio-company identifier, then collects non-excluded cell values
    from that column.

    Args:
        table_data: Dict with "table_column_header" (list of header strings)
            and "table_info" (list of row dicts keyed by header).

    Returns:
        List of company-name strings (possibly empty).
    """
    companies = []
    if not table_data.get("table_info"):
        return companies

    # Find the company column (first header that fuzzy-matches an identifier)
    company_column = None
    for i, header in enumerate(table_data.get("table_column_header", [])):
        if fuzzy_match(header, PORTFOLIO_COMPANY_LIST_IDENTIFIER):
            company_column = i
            break

    if company_column is None:
        return companies

    # Get the column name that contains companies
    company_column_name = table_data["table_column_header"][company_column]
    # NOTE(review): leftover debug prints (second label has a "cpmpany" typo);
    # consider replacing with logging or removing.
    print("company_column::",company_column)
    print("cpmpany_column_name::",company_column_name)

    # Extract companies, skipping non-dict rows and excluded labels like "Total"
    for row in table_data["table_info"]:
        if not isinstance(row, dict):
            continue
        company_name = str(row.get(company_column_name, "")).strip()
        if company_name and not fuzzy_match(company_name, EXCLUDE_COMPANY_NAMES):
            companies.append(company_name)

    return companies
54
+
55
def get_portfolio_company_list(intermediate_data: List[Dict]) -> List[str]:
    """Collect the unique portfolio-company names found in every table of the
    document's intermediate entries.

    Entries without a "table_content" key are skipped. Order of the returned
    list is unspecified (set-based de-duplication).
    """
    unique_names = set()
    for section in intermediate_data:
        for table in section.get("table_content", []):
            unique_names.update(extract_portfolio_companies_from_table(table))
    return list(unique_names)
67
+
68
def merge_content_under_same_header(
    intermediate_data: List[Dict],
    portfolio_company_list: List[str],
    start_index: int
) -> tuple:
    """
    Merge content under the same header until the next company match is found.

    Starting at *start_index*, consecutive entries sharing the same header are
    folded into one entry; merging stops at a header change or when an entry
    mentions a portfolio company (in its text or in one of its tables).

    Args:
        intermediate_data: Ordered document entries.
        portfolio_company_list: Known company names used as stop markers.
        start_index: Index of the first entry to merge.

    Returns:
        Tuple of (merged entry dict, index of the next entry to process).
        (The original annotation claimed ``Dict``; a 2-tuple is returned.)
    """
    merged_entry = {
        "header": intermediate_data[start_index]["header"],
        "content": intermediate_data[start_index].get("content", ""),
        # Defensive copy: extending this list below must not mutate the
        # caller's original entry.
        "table_content": list(intermediate_data[start_index].get("table_content", [])),
        "label_name": intermediate_data[start_index]["label_name"],
        "page_number": intermediate_data[start_index]["page_number"],
        "pdf_name": intermediate_data[start_index]["pdf_name"]
    }

    current_index = start_index + 1
    while current_index < len(intermediate_data):
        current_entry = intermediate_data[current_index]

        # Stop when the header changes.
        if current_entry["header"] != merged_entry["header"]:
            break

        # Stop when the entry mentions a portfolio company, either in its
        # text content or in one of its tables.
        content_match = any(company in current_entry.get("content", "")
                            for company in portfolio_company_list)
        table_match = False
        for table in current_entry.get("table_content", []):
            if extract_portfolio_companies_from_table(table):
                table_match = True
                break

        if content_match or table_match:
            break

        # Merge text content, newline-separated.
        if "content" in current_entry:
            if merged_entry["content"]:
                merged_entry["content"] += "\n" + current_entry["content"]
            else:
                merged_entry["content"] = current_entry["content"]

        # Merge tables.
        if "table_content" in current_entry:
            merged_entry["table_content"].extend(current_entry["table_content"])

        current_index += 1

    return merged_entry, current_index
120
+
121
def process_table_page_ids(merged_output):
    """
    Fold each table's page id into its section's comma-separated page_number.

    For every section that carries table_content, the existing page_number
    values are combined with the unique `table_page_id`s found in each table's
    metadata, then rewritten as a sorted, comma-separated string.

    Args:
        merged_output (list[dict]): Merged sections; each may carry
            'table_content' (a list of table dicts whose 'metadata' may hold a
            'table_page_id') and a comma-separated 'page_number' string.
            (fix: the docstring previously documented a nonexistent `data` arg.)

    Returns:
        list[dict]: The same list, with page_number updated in place for
        sections that have table_content.
    """
    for current_merged_entry in merged_output:
        # Only process sections that actually carry tables.
        if 'table_content' in current_merged_entry:
            # fix: page_number may arrive as an int elsewhere in this module;
            # normalize via str() so .split() cannot raise AttributeError.
            raw_pages = current_merged_entry.get('page_number')
            existing_page_numbers = set(str(raw_pages).split(',')) if raw_pages else set()

            # Add unique page numbers from table_content metadata.
            for table in current_merged_entry['table_content']:
                if 'metadata' in table and 'table_page_id' in table['metadata']:
                    existing_page_numbers.add(str(table['metadata']['table_page_id']))

            # fix: drop blank/whitespace fragments (e.g. from a trailing comma)
            # so the int sort key below cannot raise ValueError.
            existing_page_numbers = {p.strip() for p in existing_page_numbers if p.strip()}

            # Rewrite as sorted, unique, comma-separated page numbers.
            if existing_page_numbers:
                current_merged_entry['page_number'] = ','.join(sorted(existing_page_numbers, key=int))

    return merged_output
149
+
150
+
151
+ ################################################################################################################
152
+ ## Below function for more than one occurence of underlying_assets
153
import re

# stopwords to remove (customize for your use case)
STOPWORDS = {"invoice", "copy", "draft", "statement", "report", "doc"}
LEGAL_SUFFIXES = {"pvt", "ltd", "private", "limited", "inc", "co", "company", "llc"}

def clean_company_name(raw_name: str) -> str:
    """Normalize a raw extracted company name.

    Lowercases, strips dates and standalone numbers, removes stopwords and
    legal suffixes, then returns the remaining tokens in Title Case.
    """
    text = raw_name.strip().lower()

    # Strip ISO-style (YYYY-MM-DD) and day-first (DD/MM/YYYY) dates.
    text = re.sub(r"\b\d{4}[-/]\d{2}[-/]\d{2}\b", "", text)
    text = re.sub(r"\b\d{2}[-/]\d{2}[-/]\d{4}\b", "", text)

    # Drop standalone numbers / codes.
    text = re.sub(r"\b\d+\b", "", text)

    # Tokenize on non-word characters and drop noise words and legal
    # suffixes in a single pass (same effect as two sequential filters).
    noise_words = STOPWORDS | LEGAL_SUFFIXES
    kept_tokens = [tok for tok in re.split(r"\W+", text) if tok and tok not in noise_words]

    return " ".join(kept_tokens).strip().title()
185
+
186
def _dedupe_page_numbers(page_number_value) -> str:
    """Collapse a comma-separated page-number value into unique, order-preserving, trimmed entries."""
    parts = list(dict.fromkeys(str(page_number_value).split(",")))
    return ",".join(p.strip() for p in parts if p.strip())


def merge_portfolio_company_sections(intermediate_data, table_output_dir):
    """Merge all content and tables under the same portfolio company header until next company is found.

    Args:
        intermediate_data: Parsed document sections (dicts with header/content/
            table_content/page_number/label_name/pdf_name keys).
        table_output_dir: Folder of table images handed to the OVIS extractor.

    Returns:
        - merged_output: List of merged document sections
        - fuzzy_matched_companies: List of companies that were fuzzy matched in headers
        - portfolio_companies: List of all portfolio companies found in tables
    """
    # portfolio_companies = get_portfolio_company_list(intermediate_data)
    portfolio_companies = extract_company_names(table_image_folder=table_output_dir)

    print(f"Extracted portfolio companies: {portfolio_companies}")
    portfolio_companies = [clean_company_name(c) for c in portfolio_companies]
    print(f"Clean extracted portfolio companies: {portfolio_companies}")

    merged_output = []
    current_chunk = None
    active_company = None
    # fix: accumulate fuzzy matches across ALL entries. Previously the variable
    # was overwritten on every loop iteration, so only the LAST entry's matches
    # were returned — and an empty document raised NameError at the return.
    fuzzy_matched_companies = []

    for entry in intermediate_data:
        entry_copy = entry.copy()

        header_companies, entry_fuzzy_matches = match_company_names(entry["header"], portfolio_companies)
        for company in entry_fuzzy_matches:
            if company not in fuzzy_matched_companies:
                fuzzy_matched_companies.append(company)

        if header_companies:
            print("&"*100)
            print("*"*100)
            print("entry_header::", entry["header"])
            print("page number of header::", entry["page_number"])

            print("*"*100)
            print("header_companies::", header_companies)
            print("*"*100)

            # If we have an active chunk, finalize it before starting new one
            if current_chunk:
                merged_output.append(current_chunk)
                current_chunk = None
                active_company = None

            # Start new chunk with the first matched company
            # (in case multiple companies matched, we take the first one)
            active_company = header_companies[0]
            current_chunk = {
                "page_number": entry["page_number"],
                "pdf_name": entry["pdf_name"],
                "header": entry["header"],
                "label_name": entry["label_name"],
                "content": entry.get("content", ""),
                "table_content": entry.get("table_content", []),
                "matched_company": active_company
            }

            # If multiple companies matched, create separate chunks for others
            for additional_company in header_companies[1:]:
                merged_output.append({
                    "page_number": entry["page_number"],
                    "pdf_name": entry["pdf_name"],
                    "header": entry["header"],
                    "label_name": entry["label_name"],
                    "content": entry.get("content", ""),
                    "table_content": entry.get("table_content", []),
                    "matched_company": additional_company
                })

        elif current_chunk:
            # Continue adding to current chunk if no new company detected
            if "content" in entry:
                if current_chunk["content"]:
                    current_chunk["content"] += "\n\n" + entry["content"]
                    current_chunk["page_number"] = _dedupe_page_numbers(
                        str(current_chunk["page_number"]) + "," + str(entry["page_number"])
                    )
                else:
                    current_chunk["content"] = entry["content"]
                    current_chunk["page_number"] = str(entry["page_number"])

            if "table_content" in entry:
                current_chunk["table_content"].extend(entry["table_content"])
                if current_chunk["page_number"]:
                    # fix: entry["table_content"] is a LIST of table dicts. The
                    # old code tested `"metadata" in entry["table_content"]`,
                    # which compares the string against each list element and is
                    # never true for dicts — table page ids were silently lost.
                    for table in entry["table_content"]:
                        if isinstance(table, dict) and "table_page_id" in table.get("metadata", {}):
                            current_chunk["page_number"] = (
                                str(current_chunk["page_number"]) + "," + str(table["metadata"]["table_page_id"])
                            )

                    current_chunk["page_number"] = _dedupe_page_numbers(
                        str(current_chunk["page_number"]) + "," + str(entry["page_number"])
                    )

        else:
            # Content before any company section: keep it as-is, with unique page numbers.
            if "page_number" in entry_copy:
                entry_copy["page_number"] = _dedupe_page_numbers(entry_copy["page_number"])
            merged_output.append(entry_copy)

    # Add the last active chunk if it exists
    if current_chunk:
        # fix: the deduped page numbers were previously assigned to the stale
        # `entry_copy` instead of the chunk actually being appended.
        current_chunk["page_number"] = _dedupe_page_numbers(current_chunk["page_number"])
        merged_output.append(current_chunk)

    merged_output_new = process_table_page_ids(merged_output=merged_output)

    return merged_output_new, fuzzy_matched_companies, portfolio_companies
299
+
300
+ ################################################################################################
301
+
302
+ ## Below code for using abbreviation funcnality
303
+
304
+ import re
305
+
306
def match_company_names(header_text: str, companies: List[str], threshold: int = FUZZY_MATCH_THRESHOLD) -> "tuple[List[str], List[str]]":
    """Match company names in text, first checking header text abbreviations, then company abbreviations.

    Args:
        header_text: Section header to scan.
        companies: Candidate company names.
        threshold: Minimum fuzz.partial_ratio score (0-100) to count as a match.

    Returns:
        Tuple of (matched_companies, fuzzy_matched_companies), both
        order-preserving and de-duplicated. (fix: the annotation previously
        claimed `-> List[str]` although the function returns a 2-tuple.)
    """
    header_text = str(header_text).lower().strip()
    matched_companies = []
    fuzzy_matched_companies = []

    # Generate possible abbreviations for header_text
    header_abbreviations = [
        ''.join(word[0] for word in header_text.split() if word),  # First letters of each word
        re.sub(r'[aeiou\s]', '', header_text),  # Remove vowels and spaces
        header_text.replace(' ', '')  # Remove spaces
    ]

    for company in companies:
        company_lower = company.lower()

        # First check: header text (full or abbreviated) against company full name
        for header_pattern in [header_text] + header_abbreviations:
            if fuzz.partial_ratio(header_pattern, company_lower) >= threshold:
                matched_companies.append(company)
                fuzzy_matched_companies.append(company)  # Record as fuzzy match
                break
        else:
            # Second check: header text against company abbreviations
            # (for-else: runs only when the first check produced no match)
            company_abbreviations = [
                ''.join(word[0] for word in company_lower.split() if word),  # First letters of each word
                re.sub(r'[aeiou\s]', '', company_lower),  # Remove vowels and spaces
                company_lower.replace(' ', '')  # Remove spaces
            ]
            for company_pattern in company_abbreviations:
                if fuzz.partial_ratio(header_text, company_pattern) >= threshold:
                    matched_companies.append(company)
                    fuzzy_matched_companies.append(company)  # Record as fuzzy match
                    break

    # Remove duplicates while preserving order
    matched_companies = list(dict.fromkeys(matched_companies))
    fuzzy_matched_companies = list(dict.fromkeys(fuzzy_matched_companies))

    return matched_companies, fuzzy_matched_companies
346
+
347
+
348
+ ################################################################################################################
349
+
350
def process_document_company_wise(
    intermediate_str_chunk_json: List[Dict],
    output_directory: str,
    file_name: str,
    table_output_directory : str,
) -> List[Dict]:
    """Process the document and return merged content in original format.

    Args:
        intermediate_str_chunk_json: Parsed sections, or a JSON string of them.
        output_directory: Folder where the merged JSON is written (created if missing).
        file_name: Base name for the "<file_name>_h2h_merged_output.json" artifact.
        table_output_directory: Folder of table images passed to the merger.

    Returns:
        The merged section list (also persisted to disk as a side effect).
    """
    # Convert string input to dict if needed
    if isinstance(intermediate_str_chunk_json, str):
        intermediate_str_chunk_json = json.loads(intermediate_str_chunk_json)

    merged_content, matched_company_list, portfolio_company_list = merge_portfolio_company_sections(
        intermediate_str_chunk_json, table_output_directory
    )

    # fix: guard against an empty document before indexing merged_content[0],
    # which previously raised IndexError.
    if merged_content:
        merged_content[0]["portfolio_companies_list_fuzzy_matched"] = matched_company_list
        merged_content[0]["portfolio_companies_list_before"] = portfolio_company_list

    print("matched_company_list::", matched_company_list)
    print("portfolio_company_list::", portfolio_company_list)

    # Ensure output directory exists
    os.makedirs(output_directory, exist_ok=True)

    # Save output
    output_path = os.path.join(output_directory, f"{file_name}_h2h_merged_output.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(merged_content, f, indent=4, ensure_ascii=False)
    print(f"Saved merged output to {output_path}")

    return merged_content
379
+
380
+
381
def read_json(file_path):
    """Load and return the parsed contents of the JSON file at *file_path*."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
386
+
387
+
388
# # Example usage
if __name__ == "__main__":
    input_str_chunk_json_path="/shared_disk/kushal/db_str_chunking/new_ws_structured_code/Triton2023Q4_patria_sample_output/Triton2023Q4_patria_sample_json_output/Triton2023Q4_patria_sample_final_h2h_extraction.json"
    input_json = read_json(input_str_chunk_json_path)

    # Process the data
    result = process_document_company_wise(
        intermediate_str_chunk_json=input_json,
        output_directory="db_structured_chunking/structure_chunking/src/iqeq_modification/testing_sample/output",
        file_name="sample_report",
        # fix: this required argument was missing, so the example crashed with a
        # TypeError before doing any work.
        # TODO(review): point this at the real table-image folder for the sample.
        table_output_directory="db_structured_chunking/structure_chunking/src/iqeq_modification/testing_sample/tables",
    )

    print("Processing complete.")
    # print(json.dumps(result, indent=2))
402
+
rabbitmq_config_investor_report.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# RabbitMQ connection configuration.
# Every value can be overridden via an environment variable; the literals are
# only local-development fallbacks.
# NOTE(review): the fallback HOST, USERNAME and PASSWORD are live-looking
# credentials hardcoded in source control — move them to a secret store /
# deployment config and drop the defaults before shipping.
RABBITMQ = {
    "HOST": os.getenv("RABBITMQ_HOST", "10.221.162.2"),
    "PORT": int(os.getenv("RABBITMQ_PORT", 5672)),
    "VIRTUAL_HOST": os.getenv("RABBITMQ_VHOST", "/"),
    "USERNAME": os.getenv("RABBITMQ_USER", "iqeq"),
    "PASSWORD": os.getenv("RABBITMQ_PASS", "Wissen@123"),
    # Exchange settings
    "EXCHANGE_NAME": os.getenv("RABBITMQ_EXCHANGE", "priority_topic_exchange"),
    "EXCHANGE_TYPE": os.getenv("RABBITMQ_EXCHANGE_TYPE", "topic"),
    # Queue names
    "QUEUES": {
        "INPUT_FILE_QUEUE": os.getenv("INPUT_FILE_QUEUE", "structure_chunking_input_file_queue"),
        # "FILE_RESPONSE_QUEUE": os.getenv("FILE_RESPONSE_QUEUE", "structure_chunking_file_response_queue"),,
        "FILE_RESPONSE_QUEUE": os.getenv("FILE_RESPONSE_QUEUE", "IQEQ_Response")
    }
}
20
+
21
+
22
+
23
+