from fastapi import FastAPI, HTTPException, Depends, File, Request, Form
from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, FileResponse
from urllib.parse import quote
from typing import List, Annotated, Dict, Optional, Any
import uvicorn
import sys
import json
sys.path.append("/shared_disk/kushal/db_str_chunking/new_ws_structured_code/db_structured_chunking/structure_chunking")
from config.set_config import set_configuration
set_config_project = set_configuration()
project_output_directory_path = set_config_project.project_output_directory_path
project_path = set_config_project.project_path
from src.table_processing.table_filter import filtering_table_pipeline
from src.table_processing.tree_structured_json import tree_structured_headers_pipeline,tree_structured_headers_content_pipeline
from src.pre_processing.create_searchable_pdf_old import create_json_pdfminer_pipeline
from src.post_processing.clean_dataframe import clean_dataframe
from src.table_processing.merge_headers_tree_structure import merge_blocks
from src.table_processing.create_and_put_table_header import main_pipeline_create_put_table_headers
from src.table_processing.map_table_with_table_header import map_table_with_its_header
from src.toc_based_extraction.main_pipeline_toc_based_extraction import customised_toc_extraction_pipeline
from src.iqeq_modification.post_processing_iqeq import read_json,main_header_pipeline
from src.iqeq_modification.post_process_portfolio_company_json import process_document_company_wise
from src.layout_detection.layout_detection_docling_heron import yolov10_layout_pipeline,get_file_name_without_extension
from src.table_merge.table_merge_v5 import merge_multi_page_tables_pipeline_v2
from src.table_query.query_code_openai import get_query_response
from src.custom_headers.pdf_header_detector import process_pdf_for_headers
from src.custom_headers.consolidate_header_jsons import pipeline_for_merging_headers
from utils.utils_code import clear_directory
import logging, os
from logging.config import dictConfig
import shutil
import re
from src.classification.column_classifier_v2 import classify_column_headers
from src.classification.classification import perform_classification
log_folder = "logs"
os.makedirs(log_folder, exist_ok=True)
# Configure logging
log_file_path = os.path.join(log_folder, "app.log")
logging_config = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'detailed': {
'format': '%(asctime)s - %(name)s - %(levelname)s - %(pathname)s:%(lineno)d - %(message)s',
'datefmt': '%Y-%m-%d %H:%M:%S'
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',
'level': 'INFO',
'formatter': 'detailed',
'stream': 'ext://sys.stdout'
},
'file': {
'class': 'logging.FileHandler',
'level': 'INFO',
'formatter': 'detailed',
'filename': log_file_path,
'mode': 'a',
},
},
'loggers': {
'': { # root logger
'handlers': ['console', 'file'],
'level': 'INFO',
'propagate': True
},
# Add specific loggers for libraries if needed
'uvicorn': {
'handlers': ['console', 'file'],
'level': 'INFO',
'propagate': False
},
}
}
# Apply the configuration
dictConfig(logging_config)
# Create the logger instance
logger = logging.getLogger(__name__)
app = FastAPI()
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
pdf_input_path = os.path.join(project_output_directory_path, "pdf_extraction/input")
pdf_input_directory = pdf_input_path
os.makedirs(pdf_input_directory, exist_ok=True)
pdf_output_path = os.path.join(project_output_directory_path, "pdf_extraction/output")
output_directory = pdf_output_path
os.makedirs(output_directory, exist_ok=True)
word_input_path = os.path.join(project_output_directory_path, "word_csv_extraction/directory/input")
word_input_directory_path = word_input_path
os.makedirs(word_input_directory_path, exist_ok=True)
word_output_path = os.path.join(project_output_directory_path, "word_csv_extraction/directory/output")
word_output_directory_path = word_output_path
os.makedirs(word_output_directory_path, exist_ok=True)
document_data = {}
@app.post("/structured_chunking_extract")
async def upload_documents(request: Request, path: str = Form()):
    logger.info(f'started for path: {path}')
base_url = str(request.base_url)
global document_data
document_data = {}
pdf_path = path
clear_directory(pdf_input_path)
clear_directory(pdf_output_path)
clear_directory(word_input_path)
clear_directory(word_output_path)
    # Initialize response structure
    response = {
        "success": False,
        "message": ""
    }
    # Validate the provided path: it must point to a PDF file
    if not pdf_path or not pdf_path.lower().endswith(".pdf"):
        response["message"] = "Invalid file type. Only PDF files are accepted."
        return response
# Extract filename
file_name_with_ext = os.path.basename(pdf_path)
file_name_without_ext = os.path.splitext(file_name_with_ext)[0]
# Create destination path in input directory
destination_path = os.path.join(pdf_input_directory, file_name_with_ext)
try:
# Copy the file to our input directory
shutil.copy2(pdf_path, destination_path)
except Exception as e:
response["message"] = f"Failed to copy file: {str(e)}"
return response
    output_directory_path = output_directory
    os.makedirs(output_directory_path, exist_ok=True)
    file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
    os.makedirs(file_output, exist_ok=True)
    table_output_path = os.path.join(file_output, "table_output")
    os.makedirs(table_output_path, exist_ok=True)
file_location = destination_path
# Pipeline processing
    json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path, cropped_tables_images_dir_path, _ = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
    table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)
    custom_headers_json, custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext, file_location, file_output)
    header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path, header_json_output_filepath, file_output, file_name_without_ext)
# Initialize data for the new document
document_data[file_name_with_ext] = {
"pdf_path": destination_path,
"pdf_file_name": file_name_with_ext,
"model_json_header_output_filepath": [],
"model_json_layout_output_filepath": [],
"tree_structured_header_json_filepath": [],
"user_modified_json_output_filepath": [],
'user_modified_table_json_filepath': [],
"frontend_output_json": [],
"cluster_json": [],
"id_2_label" : [],
"file_output_dir" : [],
"table_output_dir": [],
"table_with_header_data" : [],
"table_with_header_json_path" : [],
"json_output_dir": [],
"pdf_miner_json_path": [] ,
"searchable_pdf_path" : []
}
# Store paths and filenames
document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
document_data[file_name_with_ext]["file_output_dir"].append(file_output)
document_data[file_name_with_ext]["id_2_label"].append(class_names)
document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)
file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
pdf_path = document_data[file_name_with_ext]["pdf_path"]
user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]
pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)
table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)
document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
# Process image URLs
pdf_images_urls = []
for file_name in os.listdir(pdf_images_path):
file_path = os.path.join(pdf_images_path, file_name)
if file_name.endswith((".jpg", ".jpeg", ".png")):
img_url = base_url + "image/" + str(quote(file_path))
pdf_images_urls.append(img_url)
# Sort image URLs by page number
def extract_page_no(url):
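        # e.g. ".../report_page_12.png" -> 12; assumes page images are named "<stem>_<page_no>.<ext>"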
return int(url.split("_")[-1].split(".")[0])
sorted_urls = sorted(pdf_images_urls, key=extract_page_no)
# Create page details
page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
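    # e.g. [{"url_block_id": 1, "pdf_page_url": "http://host:7063/image/...page_1.png"}, ...]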
# Store the JSON output
document_data[file_name_with_ext]["frontend_output_json"].append({
"layout_output_json_data": layout_output_json_data,
"layout_json_list_data": layout_list_data,
"id_2_label": class_names,
"header_output_json_data": header_output_json_data,
"table_output_json_data": table_json_data,
"table_output_json_data_list": table_json_data_list,
"tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
"pdf_images_urls": page_details,
})
document_id_name = file_name_with_ext
data = document_data[document_id_name]
file_output_dir = data["file_output_dir"][0]
json_output_dir = data["json_output_dir"][0]
pdf_file_name = data["pdf_file_name"]
pdf_path = data["pdf_path"]
# PDFMiner processing
pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
model_modified_json = read_json(modified_json_output_filepath)
pdfminer_json = read_json(pdf_miner_json_filepath)
searchable_pdf_path = data["searchable_pdf_path"][0]
# table_merged_json_path = merge_multi_page_tables_pipeline(modified_json_output_filepath, pdf_file_name, file_output_dir)
table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
table_merged_json = read_json(table_merged_json_path)
table_mapped_modified_json = map_table_with_its_header(table_merged_json)
# table_mapped_modified_json = map_table_with_its_header(model_modified_json)
# Main header pipeline
df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
clean_df, clean_df_json = clean_dataframe(df_final)
    if isinstance(clean_df_json, str):
        clean_df_json = json.loads(clean_df_json)
file_name = get_file_name_without_extension(pdf_file_name)
merged_content_company_wise_df = process_document_company_wise(clean_df_json,output_directory=json_output_dir,file_name=file_name,table_output_directory=cropped_tables_images_dir_path)
company_wise_clean_df, company_wise_clean_df_json = clean_dataframe(merged_content_company_wise_df)
json_output_filename = file_name + "_final_h2h_extraction.json"
final_json_output_filepath = os.path.join(json_output_dir, json_output_filename)
with open(final_json_output_filepath, 'w') as f:
json.dump(clean_df_json, f, indent=4)
company_wise_json_output_filename = file_name + "_final_h2h_extraction_company_wise.json"
company_wise_final_json_output_filepath = os.path.join(json_output_dir, company_wise_json_output_filename)
with open(company_wise_final_json_output_filepath, 'w') as f:
json.dump(merged_content_company_wise_df, f, indent=4)
# Step 1: Extract directory and filename without extension
pdf_path = path
json_directory = os.path.dirname(pdf_path)
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
# Step 2: Define output path for JSON
    output_json_path = os.path.join(json_directory, f"{json_filename}.json")
# If your variable is a JSON string, convert it to dict first
if isinstance(company_wise_clean_df_json, str):
company_wise_clean_df_json = json.loads(company_wise_clean_df_json)
# Step 3: Save JSON
with open(output_json_path, 'w', encoding='utf-8') as json_file:
json.dump(company_wise_clean_df_json, json_file, ensure_ascii=False, indent=4)
    response_final = {
        "status_code": 200,
        "saved_json_path": output_json_path
    }
return response_final
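# Example (hypothetical client code; assumes the server from the __main__ block
# below, running on port 7063):
#   import requests
#   resp = requests.post(
#       "http://localhost:7063/structured_chunking_extract",
#       data={"path": "/path/to/report.pdf"},  # server-side path to a .pdf file
#   )
#   print(resp.json())  # e.g. {"status_code": 200, "saved_json_path": "/path/to/report.json"}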
@app.get("/image/{path:path}")
async def get_image(path: str):
if os.path.exists(path):
return FileResponse(path, media_type="image/jpeg")
else:
raise HTTPException(status_code=404, detail="Image not found")
@app.get("/file/{path:path}")
async def get_file(path: str):
if os.path.exists(path):
        filename = os.path.basename(path)
if path.endswith('.csv'):
media_type = "text/csv"
elif path.endswith('.xlsx'):
media_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
else:
media_type = "application/octet-stream"
        return FileResponse(path, media_type=media_type, filename=filename)
else:
raise HTTPException(status_code=404, detail="File not found")
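# Example (hypothetical host/paths): the {path} segment is the absolute
# server-side file path, URL-encoded, e.g.
#   GET http://localhost:7063/image/%2Fdata%2Foutput%2Fdoc_page_1.png
#   GET http://localhost:7063/file/%2Fdata%2Foutput%2Ftables.xlsx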
@app.post("/table-classification")
async def table_classification(
structured_chunk_json_path: str = Form(...),
class_keywords_table: str = Form(...),
header_categories: Optional[str] = Form("table_column_header"),
similarity_threshold: Optional[float] = Form(0.4)
):
try:
with open(structured_chunk_json_path, 'r', encoding='utf-8') as file:
content = file.read()
# This regex removes commas before closing braces/brackets, ignoring whitespace
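        # e.g. '{"a": 1,}' -> '{"a": 1}' and '[1, 2, ]' -> '[1, 2]'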
cleaned_content = re.sub(r',\s*([\]}])', r'\1', content)
# Parse the cleaned JSON
structured_chunk_data = json.loads(cleaned_content)
# If class_keywords is a string, try to parse it
if isinstance(class_keywords_table, str):
try:
class_keywords_table = json.loads(class_keywords_table)
if not isinstance(class_keywords_table, dict):
raise ValueError("class_keywords_table must be a dictionary")
if not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
for key, value in class_keywords_table.items()):
raise ValueError("class_keywords_table must be a dictionary with string keys and lists of strings as values")
except json.JSONDecodeError:
raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table"})
elif not isinstance(class_keywords_table, dict) or not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
for key, value in class_keywords_table.items()):
raise HTTPException(status_code=422, detail={"error": "class_keywords_table must be a dictionary with string keys and lists of strings as values"})
# Perform classification
categorized_headers = perform_classification(
data=structured_chunk_data,
class_keywords=class_keywords_table,
header_categories=header_categories,
similarity_threshold=similarity_threshold
)
return categorized_headers
except ValueError as e:
raise HTTPException(status_code=422, detail={"error": "Input validation failed", "message": str(e)})
except Exception as e:
raise HTTPException(status_code=422, detail={"error": "Processing failed", "message": str(e)})
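# Example (hypothetical values): class_keywords_table is a JSON object mapping
# each class name to a list of keyword strings, sent as a form field:
#   curl -X POST http://localhost:7063/table-classification \
#        -F structured_chunk_json_path=/path/to/doc_structured_chunking.json \
#        -F 'class_keywords_table={"portfolio_summary": ["portfolio", "fair value"]}' \
#        -F header_categories=table_column_header \
#        -F similarity_threshold=0.4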
@app.post("/table-column-classification")
async def table_column_classification(
input_table_classified_json: Annotated[str, Form()],
class_keywords_table_column: Annotated[str, Form()],
filter_table_classifier_name: Annotated[str, Form()],
similarity_threshold: Annotated[str, Form()]
):
try:
# Parse JSON strings into dictionaries
input_table_classified_json = json.loads(input_table_classified_json)
class_keywords_table_column = json.loads(class_keywords_table_column)
except json.JSONDecodeError as e:
raise HTTPException(status_code=422, detail={"error": "Invalid JSON format", "message": str(e)})
    try:
        # Convert similarity_threshold to a float
        similarity_threshold = float(similarity_threshold)
    except ValueError as e:
        raise HTTPException(status_code=422, detail={"error": "Similarity threshold must be a valid number", "message": str(e)})
column_classification_results = classify_column_headers(
json_data=input_table_classified_json,
class_keywords=class_keywords_table_column,
filter_table_classifier_name=filter_table_classifier_name,
similarity_threshold=similarity_threshold
)
results = {"column_classification_result": column_classification_results}
return results
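# Example (hypothetical values): all four fields are form strings; the two JSON
# fields are parsed server-side:
#   curl -X POST http://localhost:7063/table-column-classification \
#        -F 'input_table_classified_json={"...": "..."}' \
#        -F 'class_keywords_table_column={"nav": ["net asset value"]}' \
#        -F filter_table_classifier_name=portfolio_summary \
#        -F similarity_threshold=0.4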
# Run the server. Note: uvicorn.run() blocks here when the file is executed
# directly, so the helper functions defined below are only reachable via import.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7063, log_level="info")
def upload_documents(path):
    """Offline variant of the /structured_chunking_extract endpoint (no HTTP request)."""
    logger.info(f'started for path: {path}')
    # No Request object is available here; assume the server's own address
    # (from the uvicorn.run() call above) when building image URLs.
    base_url = "http://0.0.0.0:7063/"
global document_data
document_data = {}
pdf_path = path
clear_directory(pdf_input_path)
clear_directory(pdf_output_path)
clear_directory(word_input_path)
clear_directory(word_output_path)
    # Initialize response structure
    response = {
        "success": False,
        "message": ""
    }
if not pdf_path:
response["message"] = "No file path provided."
return response
# Check if the provided path is a PDF file
if not pdf_path.lower().endswith(".pdf"):
response["message"] = "Invalid file type. Only PDF files are accepted."
return response
# Extract filename
file_name_with_ext = os.path.basename(pdf_path)
file_name_without_ext = os.path.splitext(file_name_with_ext)[0]
# Create destination path in input directory
destination_path = os.path.join(pdf_input_directory, file_name_with_ext)
try:
# Copy the file to our input directory
shutil.copy2(pdf_path, destination_path)
except Exception as e:
response["message"] = f"Failed to copy file: {str(e)}"
return response
    output_directory_path = output_directory
    os.makedirs(output_directory_path, exist_ok=True)
    file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
    os.makedirs(file_output, exist_ok=True)
    table_output_path = os.path.join(file_output, "table_output")
    os.makedirs(table_output_path, exist_ok=True)
file_location = destination_path
    # Pipeline processing (same 15-value unpacking as the endpoint above)
    json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path, cropped_tables_images_dir_path, _ = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)
custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output)
header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext)
# Initialize data for the new document
document_data[file_name_with_ext] = {
"pdf_path": destination_path,
"pdf_file_name": file_name_with_ext,
"model_json_header_output_filepath": [],
"model_json_layout_output_filepath": [],
"tree_structured_header_json_filepath": [],
"user_modified_json_output_filepath": [],
'user_modified_table_json_filepath': [],
"frontend_output_json": [],
"cluster_json": [],
"id_2_label" : [],
"file_output_dir" : [],
"table_output_dir": [],
"table_with_header_data" : [],
"table_with_header_json_path" : [],
"json_output_dir": [],
"pdf_miner_json_path": [] ,
"searchable_pdf_path" : []
}
# Store paths and filenames
document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
document_data[file_name_with_ext]["file_output_dir"].append(file_output)
document_data[file_name_with_ext]["id_2_label"].append(class_names)
document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)
file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
pdf_path = document_data[file_name_with_ext]["pdf_path"]
user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]
pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)
table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)
document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
# Process image URLs
pdf_images_urls = []
for file_name in os.listdir(pdf_images_path):
file_path = os.path.join(pdf_images_path, file_name)
if file_name.endswith((".jpg", ".jpeg", ".png")):
img_url = base_url + "image/" + str(quote(file_path))
pdf_images_urls.append(img_url)
# Sort image URLs by page number
def extract_page_no(url):
return int(url.split("_")[-1].split(".")[0])
sorted_urls = sorted(pdf_images_urls, key=extract_page_no)
# Create page details
page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
# Store the JSON output
document_data[file_name_with_ext]["frontend_output_json"].append({
"layout_output_json_data": layout_output_json_data,
"layout_json_list_data": layout_list_data,
"id_2_label": class_names,
"header_output_json_data": header_output_json_data,
"table_output_json_data": table_json_data,
"table_output_json_data_list": table_json_data_list,
"tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
"pdf_images_urls": page_details,
})
document_id_name = file_name_with_ext
data = document_data[document_id_name]
file_output_dir = data["file_output_dir"][0]
json_output_dir = data["json_output_dir"][0]
pdf_file_name = data["pdf_file_name"]
pdf_path = data["pdf_path"]
# PDFMiner processing
pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
model_modified_json = read_json(modified_json_output_filepath)
pdfminer_json = read_json(pdf_miner_json_filepath)
searchable_pdf_path = data["searchable_pdf_path"][0]
    table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
    table_merged_json = read_json(table_merged_json_path)
    table_mapped_modified_json = map_table_with_its_header(table_merged_json)
# Main header pipeline
df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
clean_df, clean_df_json = clean_dataframe(df_final)
if isinstance(clean_df_json, str):
clean_df_json = json.loads(clean_df_json)
file_name = get_file_name_without_extension(pdf_file_name)
merged_content_company_wise_df = process_document_company_wise(clean_df_json,output_directory=json_output_dir,file_name=file_name,table_output_directory=cropped_tables_images_dir_path)
company_wise_clean_df, company_wise_clean_df_json = clean_dataframe(merged_content_company_wise_df)
json_output_filename = file_name + "_final_h2h_extraction.json"
final_json_output_filepath = os.path.join(json_output_dir, json_output_filename)
with open(final_json_output_filepath, 'w') as f:
json.dump(clean_df_json, f, indent=4)
company_wise_json_output_filename = file_name + "_final_h2h_extraction_company_wise.json"
company_wise_final_json_output_filepath = os.path.join(json_output_dir, company_wise_json_output_filename)
with open(company_wise_final_json_output_filepath, 'w') as f:
json.dump(merged_content_company_wise_df, f, indent=4)
# Step 1: Extract directory and filename without extension
pdf_path = path
json_directory = os.path.dirname(pdf_path)
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
# Step 2: Define output path for JSON
    output_json_path = os.path.join(json_directory, f"{json_filename}.json")
# If your variable is a JSON string, convert it to dict first
if isinstance(company_wise_clean_df_json, str):
company_wise_clean_df_json = json.loads(company_wise_clean_df_json)
# Step 3: Save JSON
with open(output_json_path, 'w', encoding='utf-8') as json_file:
json.dump(company_wise_clean_df_json, json_file, ensure_ascii=False, indent=4)
    response_final = {
        "status_code": 200,
        "saved_json_path": output_json_path
    }
return response_final
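# Example (hypothetical path): run the extraction pipeline directly, without the API:
#   result = upload_documents("/path/to/report.pdf")
#   print(result["saved_json_path"])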
def table_extraction_and_mapping(path,
                                 field_name,
                                 class_keywords_table,
                                 header_categories,
                                 class_keywords_table_column,
                                 filter_table_classifier_name,
                                 threshold):
    """Offline pipeline: structured chunking, then table and table-column classification."""
    logger.info(f'started for path: {path}')
    # No Request object is available here; assume the server's own address
    # (from the uvicorn.run() call above) when building image URLs.
    base_url = "http://0.0.0.0:7063/"
global document_data
document_data = {}
pdf_path = path
clear_directory(pdf_input_path)
clear_directory(pdf_output_path)
clear_directory(word_input_path)
clear_directory(word_output_path)
    # Initialize response structure
    response = {
        "success": False,
        "message": ""
    }
if not pdf_path:
response["message"] = "No file path provided."
return response
# Check if the provided path is a PDF file
if not pdf_path.lower().endswith(".pdf"):
response["message"] = "Invalid file type. Only PDF files are accepted."
return response
# Extract filename
file_name_with_ext = os.path.basename(pdf_path)
file_name_without_ext = os.path.splitext(file_name_with_ext)[0]
# Create destination path in input directory
destination_path = os.path.join(pdf_input_directory, file_name_with_ext)
try:
# Copy the file to our input directory
shutil.copy2(pdf_path, destination_path)
except Exception as e:
response["message"] = f"Failed to copy file: {str(e)}"
return response
    output_directory_path = output_directory
    os.makedirs(output_directory_path, exist_ok=True)
    file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
    os.makedirs(file_output, exist_ok=True)
    table_output_path = os.path.join(file_output, "table_output")
    os.makedirs(table_output_path, exist_ok=True)
file_location = destination_path
    # Pipeline processing (same 15-value unpacking as the endpoint above)
    json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path, cropped_tables_images_dir_path, _ = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)
custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output)
header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext)
# Initialize data for the new document
document_data[file_name_with_ext] = {
"pdf_path": destination_path,
"pdf_file_name": file_name_with_ext,
"model_json_header_output_filepath": [],
"model_json_layout_output_filepath": [],
"tree_structured_header_json_filepath": [],
"user_modified_json_output_filepath": [],
'user_modified_table_json_filepath': [],
"frontend_output_json": [],
"cluster_json": [],
"id_2_label" : [],
"file_output_dir" : [],
"table_output_dir": [],
"table_with_header_data" : [],
"table_with_header_json_path" : [],
"json_output_dir": [],
"pdf_miner_json_path": [] ,
"searchable_pdf_path" : []
}
# Store paths and filenames
document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
document_data[file_name_with_ext]["file_output_dir"].append(file_output)
document_data[file_name_with_ext]["id_2_label"].append(class_names)
document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)
file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
pdf_path = document_data[file_name_with_ext]["pdf_path"]
user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]
pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)
table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)
document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
# Process image URLs
pdf_images_urls = []
for file_name in os.listdir(pdf_images_path):
file_path = os.path.join(pdf_images_path, file_name)
if file_name.endswith((".jpg", ".jpeg", ".png")):
img_url = base_url + "image/" + str(quote(file_path))
pdf_images_urls.append(img_url)
# Sort image URLs by page number
def extract_page_no(url):
return int(url.split("_")[-1].split(".")[0])
sorted_urls = sorted(pdf_images_urls, key=extract_page_no)
# Create page details
page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
# Store the JSON output
document_data[file_name_with_ext]["frontend_output_json"].append({
"layout_output_json_data": layout_output_json_data,
"layout_json_list_data": layout_list_data,
"id_2_label": class_names,
"header_output_json_data": header_output_json_data,
"table_output_json_data": table_json_data,
"table_output_json_data_list": table_json_data_list,
"tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
"pdf_images_urls": page_details,
})
document_id_name = file_name_with_ext
data = document_data[document_id_name]
file_output_dir = data["file_output_dir"][0]
json_output_dir = data["json_output_dir"][0]
pdf_file_name = data["pdf_file_name"]
pdf_path = data["pdf_path"]
# PDFMiner processing
pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
model_modified_json = read_json(modified_json_output_filepath)
pdfminer_json = read_json(pdf_miner_json_filepath)
searchable_pdf_path = data["searchable_pdf_path"][0]
    table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
    table_merged_json = read_json(table_merged_json_path)
    table_mapped_modified_json = map_table_with_its_header(table_merged_json)
# Main header pipeline
df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
clean_df, clean_df_json = clean_dataframe(df_final)
file_name = get_file_name_without_extension(pdf_file_name)
# Step 1: Extract directory and filename without extension
pdf_path = path
json_directory = os.path.dirname(pdf_path)
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
# Step 2: Define output path for JSON
output_json_path = os.path.join(json_directory, f"{json_filename}_structured_chunking.json")
# If your variable is a JSON string, convert it to dict first
if isinstance(clean_df_json, str):
clean_df_json = json.loads(clean_df_json)
# Step 3: Save JSON
with open(output_json_path, 'w', encoding='utf-8') as json_file:
json.dump(clean_df_json, json_file, ensure_ascii=False, indent=4)
    ##########################################################
    # Table Classification
    ##########################################################
    logger.info("starting table classification pipeline")
structured_chunk_json_path = output_json_path
with open(structured_chunk_json_path, 'r', encoding='utf-8') as file:
content = file.read()
# This regex removes commas before closing braces/brackets, ignoring whitespace
cleaned_content = re.sub(r',\s*([\]}])', r'\1', content)
# Parse the cleaned JSON
structured_chunk_data = json.loads(cleaned_content)
threshold = float(threshold)
print("type of class_keywords_table::\n",type(class_keywords_table))
# If class_keywords is a string, try to parse it
if isinstance(class_keywords_table, str):
try:
class_keywords_table = json.loads(class_keywords_table)
# if not isinstance(class_keywords_table, dict):
# raise ValueError("class_keywords_table must be a dictionary")
# if not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
# for key, value in class_keywords_table.items()):
# raise ValueError("class_keywords_table must be a dictionary with string keys and lists of strings as values")
except json.JSONDecodeError:
raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table"})
elif isinstance(class_keywords_table, dict) :
class_keywords_table = class_keywords_table
else:
raise HTTPException(status_code=422, detail={"error": "class_keywords_table must be a dictionary with string keys and lists of strings as values"})
# Perform classification
categorized_headers_json = perform_classification(
data=structured_chunk_data,
class_keywords=class_keywords_table,
header_categories=header_categories,
similarity_threshold=threshold
)
# Step 1: Extract directory and filename without extension
pdf_path = path
json_directory = os.path.dirname(pdf_path)
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
# Step 2: Define output path for JSON
classified_table_output_json_path = os.path.join(json_directory, f"{field_name}_table_classification.json")
# If your variable is a JSON string, convert it to dict first
if isinstance(categorized_headers_json, str):
categorized_headers_json = json.loads(categorized_headers_json)
# Step 3: Save JSON
with open(classified_table_output_json_path, 'w', encoding='utf-8') as json_file:
json.dump(categorized_headers_json, json_file, ensure_ascii=False, indent=4)
#######################################################
# Table Column Classification Code
print("Starting Table Column Classification")
# Parse JSON strings into dictionaries
# input_table_classified_json = json.load(classified_table_output_json_path)
with open(classified_table_output_json_path, "r") as f:
input_table_classified_json = json.load(f)
# class_keywords_table_column = json.loads(class_keywords_table_column)
if isinstance(class_keywords_table_column, str):
try:
class_keywords_table_column = json.loads(class_keywords_table_column)
except json.JSONDecodeError:
raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table_column"})
elif isinstance(class_keywords_table_column, dict):
class_keywords_table_column = class_keywords_table_column
    # Convert the threshold to a float for the column classifier
    similarity_threshold = float(threshold)
column_classification_results_json = classify_column_headers(
json_data=input_table_classified_json,
class_keywords=class_keywords_table_column,
filter_table_classifier_name=filter_table_classifier_name,
similarity_threshold=similarity_threshold
)
# Step 1: Extract directory and filename without extension
pdf_path = path
json_directory = os.path.dirname(pdf_path)
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
# Step 2: Define output path for JSON
classified_table_column_output_json_path = os.path.join(json_directory, f"{field_name}_table_column_classification.json")
# If your variable is a JSON string, convert it to dict first
if isinstance(column_classification_results_json, str):
column_classification_results_json = json.loads(column_classification_results_json)
# Step 3: Save JSON
with open(classified_table_column_output_json_path, 'w', encoding='utf-8') as json_file:
json.dump(column_classification_results_json, json_file, ensure_ascii=False, indent=4)
#######################################################################
    response_final = {
        "status_code": 200,
        "structured_chunk_json_path": output_json_path,
        "table_classification_json_path": classified_table_output_json_path,
        "table_column_classification_json_path": classified_table_column_output_json_path
    }
return response_final
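# Example (hypothetical values): end-to-end extraction plus classification in one call:
#   result = table_extraction_and_mapping(
#       path="/path/to/report.pdf",
#       field_name="portfolio_summary",
#       class_keywords_table='{"portfolio_summary": ["portfolio", "fair value"]}',
#       header_categories="table_column_header",
#       class_keywords_table_column='{"nav": ["net asset value"]}',
#       filter_table_classifier_name="portfolio_summary",
#       threshold=0.4,
#   )
#   print(result["table_column_classification_json_path"])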