from fastapi import FastAPI, HTTPException, Depends, File, Request, Form
from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, FileResponse
from urllib.parse import quote
from typing import List, Annotated, Dict, Optional, Any
import uvicorn
import sys
import json
sys.path.append("/shared_disk/kushal/db_str_chunking/new_ws_structured_code/db_structured_chunking/structure_chunking")
from config.set_config import set_configuration
set_config_project = set_configuration()
project_output_directory_path = set_config_project.project_output_directory_path
project_path = set_config_project.project_path
from src.table_processing.table_filter import filtering_table_pipeline
from src.table_processing.tree_structured_json import tree_structured_headers_pipeline,tree_structured_headers_content_pipeline
from src.pre_processing.create_searchable_pdf_old import create_json_pdfminer_pipeline
from src.post_processing.clean_dataframe import clean_dataframe
from src.table_processing.merge_headers_tree_structure import merge_blocks
from src.table_processing.create_and_put_table_header import main_pipeline_create_put_table_headers
from src.table_processing.map_table_with_table_header import map_table_with_its_header
from src.toc_based_extraction.main_pipeline_toc_based_extraction import customised_toc_extraction_pipeline
from src.iqeq_modification.post_processing_iqeq import read_json,main_header_pipeline
from src.iqeq_modification.post_process_portfolio_company_json import process_document_company_wise
from src.layout_detection.layout_detection_docling_heron import yolov10_layout_pipeline,get_file_name_without_extension
from src.table_merge.table_merge_v5 import merge_multi_page_tables_pipeline_v2
from src.table_query.query_code_openai import get_query_response
from src.custom_headers.pdf_header_detector import process_pdf_for_headers
from src.custom_headers.consolidate_header_jsons import pipeline_for_merging_headers
from utils.utils_code import clear_directory
import logging, os
from logging.config import dictConfig
import shutil
import re
from src.classification.column_classifier_v2 import classify_column_headers
from src.classification.classification import perform_classification
log_folder = "logs"
os.makedirs(log_folder, exist_ok=True)
# Configure logging
log_file_path = os.path.join(log_folder, "app.log")
logging_config = {
'version': 1,
'disable_existing_loggers': False,
'formatters': {
'detailed': {
'format': '%(asctime)s - %(name)s - %(levelname)s - %(pathname)s:%(lineno)d - %(message)s',
'datefmt': '%Y-%m-%d %H:%M:%S'
},
},
'handlers': {
'console': {
'class': 'logging.StreamHandler',
'level': 'INFO',
'formatter': 'detailed',
'stream': 'ext://sys.stdout'
},
'file': {
'class': 'logging.FileHandler',
'level': 'INFO',
'formatter': 'detailed',
'filename': log_file_path,
'mode': 'a',
},
},
'loggers': {
'': { # root logger
'handlers': ['console', 'file'],
'level': 'INFO',
'propagate': True
},
# Add specific loggers for libraries if needed
'uvicorn': {
'handlers': ['console', 'file'],
'level': 'INFO',
'propagate': False
},
}
}
# Apply the configuration
dictConfig(logging_config)
# Create the logger instance
logger = logging.getLogger(__name__)
app = FastAPI()
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
pdf_input_path = os.path.join(project_output_directory_path, "pdf_extraction/input")
pdf_input_directory = pdf_input_path
os.makedirs(pdf_input_directory, exist_ok=True)
pdf_output_path = os.path.join(project_output_directory_path, "pdf_extraction/output")
output_directory = pdf_output_path
os.makedirs(output_directory, exist_ok=True)
word_input_path = os.path.join(project_output_directory_path, "word_csv_extraction/directory/input")
word_input_directory_path = word_input_path
os.makedirs(word_input_directory_path, exist_ok=True)
word_output_path = os.path.join(project_output_directory_path, "word_csv_extraction/directory/output")
word_output_directory_path = word_output_path
os.makedirs(word_output_directory_path, exist_ok=True)
document_data = {}
@app.post("/structured_chunking_extract")
async def upload_documents(request: Request, path: str = Form()):
    logger.info(f'started for path: {path}')
base_url = str(request.base_url)
global document_data
document_data = {}
pdf_path = path
clear_directory(pdf_input_path)
clear_directory(pdf_output_path)
clear_directory(word_input_path)
clear_directory(word_output_path)
    # Initialize response structure
    response = {
        "success": False,
        "message": ""
    }
    # Validate the provided path: it must point to a PDF file
    if not pdf_path or not pdf_path.lower().endswith(".pdf"):
        response["message"] = "Invalid file type. Only PDF files are accepted."
        return response
# Extract filename
file_name_with_ext = os.path.basename(pdf_path)
file_name_without_ext = os.path.splitext(file_name_with_ext)[0]
# Create destination path in input directory
destination_path = os.path.join(pdf_input_directory, file_name_with_ext)
try:
# Copy the file to our input directory
shutil.copy2(pdf_path, destination_path)
except Exception as e:
response["message"] = f"Failed to copy file: {str(e)}"
return response
    output_directory_path = output_directory
    os.makedirs(output_directory_path, exist_ok=True)
    file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
    os.makedirs(file_output, exist_ok=True)
    table_output_path = os.path.join(file_output, "table_output")
    os.makedirs(table_output_path, exist_ok=True)
file_location = destination_path
# Pipeline processing
    json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path, cropped_tables_images_dir_path, _ = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
    table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)
    custom_headers_json, custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext, file_location, file_output)
    header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path, header_json_output_filepath, file_output, file_name_without_ext)
# Initialize data for the new document
document_data[file_name_with_ext] = {
"pdf_path": destination_path,
"pdf_file_name": file_name_with_ext,
"model_json_header_output_filepath": [],
"model_json_layout_output_filepath": [],
"tree_structured_header_json_filepath": [],
"user_modified_json_output_filepath": [],
'user_modified_table_json_filepath': [],
"frontend_output_json": [],
"cluster_json": [],
"id_2_label" : [],
"file_output_dir" : [],
"table_output_dir": [],
"table_with_header_data" : [],
"table_with_header_json_path" : [],
"json_output_dir": [],
"pdf_miner_json_path": [] ,
"searchable_pdf_path" : []
}
# Store paths and filenames
document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
document_data[file_name_with_ext]["file_output_dir"].append(file_output)
document_data[file_name_with_ext]["id_2_label"].append(class_names)
document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)
file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
pdf_path = document_data[file_name_with_ext]["pdf_path"]
user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]
pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)
table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)
document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
# Process image URLs
pdf_images_urls = []
for file_name in os.listdir(pdf_images_path):
file_path = os.path.join(pdf_images_path, file_name)
if file_name.endswith((".jpg", ".jpeg", ".png")):
img_url = base_url + "image/" + str(quote(file_path))
pdf_images_urls.append(img_url)
# Sort image URLs by page number
def extract_page_no(url):
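        # e.g. ".../report_page_12.png" -> 12; assumes page images are named "<stem>_<page_no>.<ext>"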
return int(url.split("_")[-1].split(".")[0])
sorted_urls = sorted(pdf_images_urls, key=extract_page_no)
# Create page details
page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
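    # e.g. [{"url_block_id": 1, "pdf_page_url": "http://host:7063/image/...page_1.png"}, ...]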
# Store the JSON output
document_data[file_name_with_ext]["frontend_output_json"].append({
"layout_output_json_data": layout_output_json_data,
"layout_json_list_data": layout_list_data,
"id_2_label": class_names,
"header_output_json_data": header_output_json_data,
"table_output_json_data": table_json_data,
"table_output_json_data_list": table_json_data_list,
"tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
"pdf_images_urls": page_details,
})
document_id_name = file_name_with_ext
data = document_data[document_id_name]
file_output_dir = data["file_output_dir"][0]
json_output_dir = data["json_output_dir"][0]
pdf_file_name = data["pdf_file_name"]
pdf_path = data["pdf_path"]
# PDFMiner processing
pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
model_modified_json = read_json(modified_json_output_filepath)
pdfminer_json = read_json(pdf_miner_json_filepath)
searchable_pdf_path = data["searchable_pdf_path"][0]
# table_merged_json_path = merge_multi_page_tables_pipeline(modified_json_output_filepath, pdf_file_name, file_output_dir)
table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
table_merged_json = read_json(table_merged_json_path)
table_mapped_modified_json = map_table_with_its_header(table_merged_json)
# table_mapped_modified_json = map_table_with_its_header(model_modified_json)
# Main header pipeline
df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
clean_df, clean_df_json = clean_dataframe(df_final)
    if isinstance(clean_df_json, str):
        clean_df_json = json.loads(clean_df_json)
file_name = get_file_name_without_extension(pdf_file_name)
merged_content_company_wise_df = process_document_company_wise(clean_df_json,output_directory=json_output_dir,file_name=file_name,table_output_directory=cropped_tables_images_dir_path)
company_wise_clean_df, company_wise_clean_df_json = clean_dataframe(merged_content_company_wise_df)
json_output_filename = file_name + "_final_h2h_extraction.json"
final_json_output_filepath = os.path.join(json_output_dir, json_output_filename)
with open(final_json_output_filepath, 'w') as f:
json.dump(clean_df_json, f, indent=4)
company_wise_json_output_filename = file_name + "_final_h2h_extraction_company_wise.json"
company_wise_final_json_output_filepath = os.path.join(json_output_dir, company_wise_json_output_filename)
with open(company_wise_final_json_output_filepath, 'w') as f:
json.dump(merged_content_company_wise_df, f, indent=4)
# Step 1: Extract directory and filename without extension
pdf_path = path
json_directory = os.path.dirname(pdf_path)
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
# Step 2: Define output path for JSON
    output_json_path = os.path.join(json_directory, f"{json_filename}.json")
# If your variable is a JSON string, convert it to dict first
if isinstance(company_wise_clean_df_json, str):
company_wise_clean_df_json = json.loads(company_wise_clean_df_json)
# Step 3: Save JSON
with open(output_json_path, 'w', encoding='utf-8') as json_file:
json.dump(company_wise_clean_df_json, json_file, ensure_ascii=False, indent=4)
    response_final = {
        "status_code": 200,
        "saved_json_path": output_json_path
    }
return response_final
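# Example (hypothetical client code; assumes the server from the __main__ block
# below, running on port 7063):
#   import requests
#   resp = requests.post(
#       "http://localhost:7063/structured_chunking_extract",
#       data={"path": "/path/to/report.pdf"},  # server-side path to a .pdf file
#   )
#   print(resp.json())  # e.g. {"status_code": 200, "saved_json_path": "/path/to/report.json"}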
@app.get("/image/{path:path}")
async def get_image(path: str):
if os.path.exists(path):
return FileResponse(path, media_type="image/jpeg")
else:
raise HTTPException(status_code=404, detail="Image not found")
@app.get("/file/{path:path}")
async def get_file(path: str):
if os.path.exists(path):
        filename = os.path.basename(path)
if path.endswith('.csv'):
media_type = "text/csv"
elif path.endswith('.xlsx'):
media_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
else:
media_type = "application/octet-stream"
        return FileResponse(path, media_type=media_type, filename=filename)
else:
raise HTTPException(status_code=404, detail="File not found")
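# Example (hypothetical host/paths): the {path} segment is the absolute
# server-side file path, URL-encoded, e.g.
#   GET http://localhost:7063/image/%2Fdata%2Foutput%2Fdoc_page_1.png
#   GET http://localhost:7063/file/%2Fdata%2Foutput%2Ftables.xlsx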
@app.post("/table-classification")
async def table_classification(
structured_chunk_json_path: str = Form(...),
class_keywords_table: str = Form(...),
header_categories: Optional[str] = Form("table_column_header"),
similarity_threshold: Optional[float] = Form(0.4)
):
try:
with open(structured_chunk_json_path, 'r', encoding='utf-8') as file:
content = file.read()
# This regex removes commas before closing braces/brackets, ignoring whitespace
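        # e.g. '{"a": 1,}' -> '{"a": 1}' and '[1, 2, ]' -> '[1, 2]'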
cleaned_content = re.sub(r',\s*([\]}])', r'\1', content)
# Parse the cleaned JSON
structured_chunk_data = json.loads(cleaned_content)
# If class_keywords is a string, try to parse it
if isinstance(class_keywords_table, str):
try:
class_keywords_table = json.loads(class_keywords_table)
if not isinstance(class_keywords_table, dict):
raise ValueError("class_keywords_table must be a dictionary")
if not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
for key, value in class_keywords_table.items()):
raise ValueError("class_keywords_table must be a dictionary with string keys and lists of strings as values")
except json.JSONDecodeError:
raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table"})
elif not isinstance(class_keywords_table, dict) or not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
for key, value in class_keywords_table.items()):
raise HTTPException(status_code=422, detail={"error": "class_keywords_table must be a dictionary with string keys and lists of strings as values"})
# Perform classification
categorized_headers = perform_classification(
data=structured_chunk_data,
class_keywords=class_keywords_table,
header_categories=header_categories,
similarity_threshold=similarity_threshold
)
return categorized_headers
except ValueError as e:
raise HTTPException(status_code=422, detail={"error": "Input validation failed", "message": str(e)})
except Exception as e:
raise HTTPException(status_code=422, detail={"error": "Processing failed", "message": str(e)})
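# Example (hypothetical values): class_keywords_table is a JSON object mapping
# each class name to a list of keyword strings, sent as a form field:
#   curl -X POST http://localhost:7063/table-classification \
#        -F structured_chunk_json_path=/path/to/doc_structured_chunking.json \
#        -F 'class_keywords_table={"portfolio_summary": ["portfolio", "fair value"]}' \
#        -F header_categories=table_column_header \
#        -F similarity_threshold=0.4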
@app.post("/table-column-classification")
async def table_column_classification(
input_table_classified_json: Annotated[str, Form()],
class_keywords_table_column: Annotated[str, Form()],
filter_table_classifier_name: Annotated[str, Form()],
similarity_threshold: Annotated[str, Form()]
):
try:
# Parse JSON strings into dictionaries
input_table_classified_json = json.loads(input_table_classified_json)
class_keywords_table_column = json.loads(class_keywords_table_column)
except json.JSONDecodeError as e:
raise HTTPException(status_code=422, detail={"error": "Invalid JSON format", "message": str(e)})
    try:
        # Convert similarity_threshold to a float
        similarity_threshold = float(similarity_threshold)
    except ValueError as e:
        raise HTTPException(status_code=422, detail={"error": "Similarity threshold must be a valid number", "message": str(e)})
column_classification_results = classify_column_headers(
json_data=input_table_classified_json,
class_keywords=class_keywords_table_column,
filter_table_classifier_name=filter_table_classifier_name,
similarity_threshold=similarity_threshold
)
results = {"column_classification_result": column_classification_results}
return results
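# Example (hypothetical values): all four fields are form strings; the two JSON
# fields are parsed server-side:
#   curl -X POST http://localhost:7063/table-column-classification \
#        -F 'input_table_classified_json={"...": "..."}' \
#        -F 'class_keywords_table_column={"nav": ["net asset value"]}' \
#        -F filter_table_classifier_name=portfolio_summary \
#        -F similarity_threshold=0.4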
# Run the server. Note: uvicorn.run() blocks here when the file is executed
# directly, so the helper functions defined below are only reachable via import.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7063, log_level="info")
def upload_documents(path):
    """Offline variant of the /structured_chunking_extract endpoint (no HTTP request)."""
    logger.info(f'started for path: {path}')
    # No Request object is available here; assume the server's own address
    # (from the uvicorn.run() call above) when building image URLs.
    base_url = "http://0.0.0.0:7063/"
global document_data
document_data = {}
pdf_path = path
clear_directory(pdf_input_path)
clear_directory(pdf_output_path)
clear_directory(word_input_path)
clear_directory(word_output_path)
    # Initialize response structure
    response = {
        "success": False,
        "message": ""
    }
if not pdf_path:
response["message"] = "No file path provided."
return response
# Check if the provided path is a PDF file
if not pdf_path.lower().endswith(".pdf"):
response["message"] = "Invalid file type. Only PDF files are accepted."
return response
# Extract filename
file_name_with_ext = os.path.basename(pdf_path)
file_name_without_ext = os.path.splitext(file_name_with_ext)[0]
# Create destination path in input directory
destination_path = os.path.join(pdf_input_directory, file_name_with_ext)
try:
# Copy the file to our input directory
shutil.copy2(pdf_path, destination_path)
except Exception as e:
response["message"] = f"Failed to copy file: {str(e)}"
return response
    output_directory_path = output_directory
    os.makedirs(output_directory_path, exist_ok=True)
    file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
    os.makedirs(file_output, exist_ok=True)
    table_output_path = os.path.join(file_output, "table_output")
    os.makedirs(table_output_path, exist_ok=True)
file_location = destination_path
    # Pipeline processing (same 15-value unpacking as the endpoint above)
    json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path, cropped_tables_images_dir_path, _ = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)
custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output)
header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext)
# Initialize data for the new document
document_data[file_name_with_ext] = {
"pdf_path": destination_path,
"pdf_file_name": file_name_with_ext,
"model_json_header_output_filepath": [],
"model_json_layout_output_filepath": [],
"tree_structured_header_json_filepath": [],
"user_modified_json_output_filepath": [],
'user_modified_table_json_filepath': [],
"frontend_output_json": [],
"cluster_json": [],
"id_2_label" : [],
"file_output_dir" : [],
"table_output_dir": [],
"table_with_header_data" : [],
"table_with_header_json_path" : [],
"json_output_dir": [],
"pdf_miner_json_path": [] ,
"searchable_pdf_path" : []
}
# Store paths and filenames
document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
document_data[file_name_with_ext]["file_output_dir"].append(file_output)
document_data[file_name_with_ext]["id_2_label"].append(class_names)
document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)
file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
pdf_path = document_data[file_name_with_ext]["pdf_path"]
user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]
pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)
table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)
document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
# Process image URLs
pdf_images_urls = []
for file_name in os.listdir(pdf_images_path):
file_path = os.path.join(pdf_images_path, file_name)
if file_name.endswith((".jpg", ".jpeg", ".png")):
img_url = base_url + "image/" + str(quote(file_path))
pdf_images_urls.append(img_url)
# Sort image URLs by page number
def extract_page_no(url):
return int(url.split("_")[-1].split(".")[0])
sorted_urls = sorted(pdf_images_urls, key=extract_page_no)
# Create page details
page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
# Store the JSON output
document_data[file_name_with_ext]["frontend_output_json"].append({
"layout_output_json_data": layout_output_json_data,
"layout_json_list_data": layout_list_data,
"id_2_label": class_names,
"header_output_json_data": header_output_json_data,
"table_output_json_data": table_json_data,
"table_output_json_data_list": table_json_data_list,
"tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
"pdf_images_urls": page_details,
})
document_id_name = file_name_with_ext
data = document_data[document_id_name]
file_output_dir = data["file_output_dir"][0]
json_output_dir = data["json_output_dir"][0]
pdf_file_name = data["pdf_file_name"]
pdf_path = data["pdf_path"]
# PDFMiner processing
pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
model_modified_json = read_json(modified_json_output_filepath)
pdfminer_json = read_json(pdf_miner_json_filepath)
searchable_pdf_path = data["searchable_pdf_path"][0]
    table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
    table_merged_json = read_json(table_merged_json_path)
    table_mapped_modified_json = map_table_with_its_header(table_merged_json)
# Main header pipeline
df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
clean_df, clean_df_json = clean_dataframe(df_final)
if isinstance(clean_df_json, str):
clean_df_json = json.loads(clean_df_json)
file_name = get_file_name_without_extension(pdf_file_name)
merged_content_company_wise_df = process_document_company_wise(clean_df_json,output_directory=json_output_dir,file_name=file_name,table_output_directory=cropped_tables_images_dir_path)
company_wise_clean_df, company_wise_clean_df_json = clean_dataframe(merged_content_company_wise_df)
json_output_filename = file_name + "_final_h2h_extraction.json"
final_json_output_filepath = os.path.join(json_output_dir, json_output_filename)
with open(final_json_output_filepath, 'w') as f:
json.dump(clean_df_json, f, indent=4)
company_wise_json_output_filename = file_name + "_final_h2h_extraction_company_wise.json"
company_wise_final_json_output_filepath = os.path.join(json_output_dir, company_wise_json_output_filename)
with open(company_wise_final_json_output_filepath, 'w') as f:
json.dump(merged_content_company_wise_df, f, indent=4)
# Step 1: Extract directory and filename without extension
pdf_path = path
json_directory = os.path.dirname(pdf_path)
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
# Step 2: Define output path for JSON
    output_json_path = os.path.join(json_directory, f"{json_filename}.json")
# If your variable is a JSON string, convert it to dict first
if isinstance(company_wise_clean_df_json, str):
company_wise_clean_df_json = json.loads(company_wise_clean_df_json)
# Step 3: Save JSON
with open(output_json_path, 'w', encoding='utf-8') as json_file:
json.dump(company_wise_clean_df_json, json_file, ensure_ascii=False, indent=4)
    response_final = {
        "status_code": 200,
        "saved_json_path": output_json_path
    }
return response_final
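# Example (hypothetical path): run the extraction pipeline directly, without the API:
#   result = upload_documents("/path/to/report.pdf")
#   print(result["saved_json_path"])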
def table_extraction_and_mapping(path,
                                 field_name,
                                 class_keywords_table,
                                 header_categories,
                                 class_keywords_table_column,
                                 filter_table_classifier_name,
                                 threshold):
    """Offline pipeline: structured chunking, then table and table-column classification."""
    logger.info(f'started for path: {path}')
    # No Request object is available here; assume the server's own address
    # (from the uvicorn.run() call above) when building image URLs.
    base_url = "http://0.0.0.0:7063/"
global document_data
document_data = {}
pdf_path = path
clear_directory(pdf_input_path)
clear_directory(pdf_output_path)
clear_directory(word_input_path)
clear_directory(word_output_path)
    # Initialize response structure
    response = {
        "success": False,
        "message": ""
    }
if not pdf_path:
response["message"] = "No file path provided."
return response
# Check if the provided path is a PDF file
if not pdf_path.lower().endswith(".pdf"):
response["message"] = "Invalid file type. Only PDF files are accepted."
return response
# Extract filename
file_name_with_ext = os.path.basename(pdf_path)
file_name_without_ext = os.path.splitext(file_name_with_ext)[0]
# Create destination path in input directory
destination_path = os.path.join(pdf_input_directory, file_name_with_ext)
try:
# Copy the file to our input directory
shutil.copy2(pdf_path, destination_path)
except Exception as e:
response["message"] = f"Failed to copy file: {str(e)}"
return response
    output_directory_path = output_directory
    os.makedirs(output_directory_path, exist_ok=True)
    file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
    os.makedirs(file_output, exist_ok=True)
    table_output_path = os.path.join(file_output, "table_output")
    os.makedirs(table_output_path, exist_ok=True)
file_location = destination_path
    # Pipeline processing (same 15-value unpacking as the endpoint above)
    json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path, cropped_tables_images_dir_path, _ = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)
custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output)
header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext)
# Initialize data for the new document
document_data[file_name_with_ext] = {
"pdf_path": destination_path,
"pdf_file_name": file_name_with_ext,
"model_json_header_output_filepath": [],
"model_json_layout_output_filepath": [],
"tree_structured_header_json_filepath": [],
"user_modified_json_output_filepath": [],
'user_modified_table_json_filepath': [],
"frontend_output_json": [],
"cluster_json": [],
"id_2_label" : [],
"file_output_dir" : [],
"table_output_dir": [],
"table_with_header_data" : [],
"table_with_header_json_path" : [],
"json_output_dir": [],
"pdf_miner_json_path": [] ,
"searchable_pdf_path" : []
}
# Store paths and filenames
document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
document_data[file_name_with_ext]["file_output_dir"].append(file_output)
document_data[file_name_with_ext]["id_2_label"].append(class_names)
document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)
file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
pdf_path = document_data[file_name_with_ext]["pdf_path"]
user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]
pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)
table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)
document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
# Process image URLs
pdf_images_urls = []
for file_name in os.listdir(pdf_images_path):
file_path = os.path.join(pdf_images_path, file_name)
if file_name.endswith((".jpg", ".jpeg", ".png")):
img_url = base_url + "image/" + str(quote(file_path))
pdf_images_urls.append(img_url)
# Sort image URLs by page number
def extract_page_no(url):
return int(url.split("_")[-1].split(".")[0])
sorted_urls = sorted(pdf_images_urls, key=extract_page_no)
# Create page details
page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
# Store the JSON output
document_data[file_name_with_ext]["frontend_output_json"].append({
"layout_output_json_data": layout_output_json_data,
"layout_json_list_data": layout_list_data,
"id_2_label": class_names,
"header_output_json_data": header_output_json_data,
"table_output_json_data": table_json_data,
"table_output_json_data_list": table_json_data_list,
"tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
"pdf_images_urls": page_details,
})
document_id_name = file_name_with_ext
data = document_data[document_id_name]
file_output_dir = data["file_output_dir"][0]
json_output_dir = data["json_output_dir"][0]
pdf_file_name = data["pdf_file_name"]
pdf_path = data["pdf_path"]
# PDFMiner processing
pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
model_modified_json = read_json(modified_json_output_filepath)
pdfminer_json = read_json(pdf_miner_json_filepath)
searchable_pdf_path = data["searchable_pdf_path"][0]
    table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
    table_merged_json = read_json(table_merged_json_path)
    table_mapped_modified_json = map_table_with_its_header(table_merged_json)
# Main header pipeline
df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
clean_df, clean_df_json = clean_dataframe(df_final)
file_name = get_file_name_without_extension(pdf_file_name)
# Step 1: Extract directory and filename without extension
pdf_path = path
json_directory = os.path.dirname(pdf_path)
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
# Step 2: Define output path for JSON
output_json_path = os.path.join(json_directory, f"{json_filename}_structured_chunking.json")
# If your variable is a JSON string, convert it to dict first
if isinstance(clean_df_json, str):
clean_df_json = json.loads(clean_df_json)
# Step 3: Save JSON
with open(output_json_path, 'w', encoding='utf-8') as json_file:
json.dump(clean_df_json, json_file, ensure_ascii=False, indent=4)
    ##########################################################
    # Table Classification
    ##########################################################
    logger.info("starting table classification pipeline")
structured_chunk_json_path = output_json_path
with open(structured_chunk_json_path, 'r', encoding='utf-8') as file:
content = file.read()
# This regex removes commas before closing braces/brackets, ignoring whitespace
cleaned_content = re.sub(r',\s*([\]}])', r'\1', content)
# Parse the cleaned JSON
structured_chunk_data = json.loads(cleaned_content)
threshold = float(threshold)
print("type of class_keywords_table::\n",type(class_keywords_table))
# If class_keywords is a string, try to parse it
if isinstance(class_keywords_table, str):
try:
class_keywords_table = json.loads(class_keywords_table)
# if not isinstance(class_keywords_table, dict):
# raise ValueError("class_keywords_table must be a dictionary")
# if not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
# for key, value in class_keywords_table.items()):
# raise ValueError("class_keywords_table must be a dictionary with string keys and lists of strings as values")
except json.JSONDecodeError:
raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table"})
elif isinstance(class_keywords_table, dict) :
class_keywords_table = class_keywords_table
else:
raise HTTPException(status_code=422, detail={"error": "class_keywords_table must be a dictionary with string keys and lists of strings as values"})
# Perform classification
categorized_headers_json = perform_classification(
data=structured_chunk_data,
class_keywords=class_keywords_table,
header_categories=header_categories,
similarity_threshold=threshold
)
# Step 1: Extract directory and filename without extension
pdf_path = path
json_directory = os.path.dirname(pdf_path)
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
# Step 2: Define output path for JSON
classified_table_output_json_path = os.path.join(json_directory, f"{field_name}_table_classification.json")
# If your variable is a JSON string, convert it to dict first
if isinstance(categorized_headers_json, str):
categorized_headers_json = json.loads(categorized_headers_json)
# Step 3: Save JSON
with open(classified_table_output_json_path, 'w', encoding='utf-8') as json_file:
json.dump(categorized_headers_json, json_file, ensure_ascii=False, indent=4)
#######################################################
# Table Column Classification Code
print("Starting Table Column Classification")
# Parse JSON strings into dictionaries
# input_table_classified_json = json.load(classified_table_output_json_path)
with open(classified_table_output_json_path, "r") as f:
input_table_classified_json = json.load(f)
# class_keywords_table_column = json.loads(class_keywords_table_column)
if isinstance(class_keywords_table_column, str):
try:
class_keywords_table_column = json.loads(class_keywords_table_column)
except json.JSONDecodeError:
raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table_column"})
elif isinstance(class_keywords_table_column, dict):
class_keywords_table_column = class_keywords_table_column
    # Convert the threshold to a float for the column classifier
    similarity_threshold = float(threshold)
column_classification_results_json = classify_column_headers(
json_data=input_table_classified_json,
class_keywords=class_keywords_table_column,
filter_table_classifier_name=filter_table_classifier_name,
similarity_threshold=similarity_threshold
)
# Step 1: Extract directory and filename without extension
pdf_path = path
json_directory = os.path.dirname(pdf_path)
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
# Step 2: Define output path for JSON
classified_table_column_output_json_path = os.path.join(json_directory, f"{field_name}_table_column_classification.json")
# If your variable is a JSON string, convert it to dict first
if isinstance(column_classification_results_json, str):
column_classification_results_json = json.loads(column_classification_results_json)
# Step 3: Save JSON
with open(classified_table_column_output_json_path, 'w', encoding='utf-8') as json_file:
json.dump(column_classification_results_json, json_file, ensure_ascii=False, indent=4)
#######################################################################
    response_final = {
        "status_code": 200,
        "structured_chunk_json_path": output_json_path,
        "table_classification_json_path": classified_table_output_json_path,
        "table_column_classification_json_path": classified_table_column_output_json_path
    }
return response_final
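# Example (hypothetical values): end-to-end extraction plus classification in one call:
#   result = table_extraction_and_mapping(
#       path="/path/to/report.pdf",
#       field_name="portfolio_summary",
#       class_keywords_table='{"portfolio_summary": ["portfolio", "fair value"]}',
#       header_categories="table_column_header",
#       class_keywords_table_column='{"nav": ["net asset value"]}',
#       filter_table_classifier_name="portfolio_summary",
#       threshold=0.4,
#   )
#   print(result["table_column_classification_json_path"])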