| from fastapi import FastAPI, HTTPException, Depends, File, Request, Form |
| from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm |
| from fastapi.middleware.cors import CORSMiddleware |
| from fastapi.responses import JSONResponse, FileResponse |
| from urllib.parse import quote |
| from typing import List, Annotated,Dict,Optional,Any |
| import uvicorn |
| import sys |
| import json |
|
|
|
|
# NOTE(review): hard-coded absolute path appended to sys.path so the project
# packages below resolve -- this breaks on any other machine; confirm whether
# it should come from configuration/packaging instead.
sys.path.append("/shared_disk/kushal/db_str_chunking/new_ws_structured_code/db_structured_chunking/structure_chunking")


from config.set_config import set_configuration
# Project-wide configuration object; supplies the root paths used to build
# the working directories further down in this module.
set_config_project = set_configuration()


project_output_directory_path= set_config_project.project_output_directory_path
project_path = set_config_project.project_path
|
|
| from src.table_processing.table_filter import filtering_table_pipeline |
| |
| from src.table_processing.tree_structured_json import tree_structured_headers_pipeline,tree_structured_headers_content_pipeline |
| from src.pre_processing.create_searchable_pdf_old import create_json_pdfminer_pipeline |
|
|
| from src.post_processing.clean_dataframe import clean_dataframe |
| from src.table_processing.merge_headers_tree_structure import merge_blocks |
|
|
| from src.table_processing.create_and_put_table_header import main_pipeline_create_put_table_headers |
| from src.table_processing.map_table_with_table_header import map_table_with_its_header |
|
|
| |
|
|
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| from src.toc_based_extraction.main_pipeline_toc_based_extraction import customised_toc_extraction_pipeline |
| from src.iqeq_modification.post_processing_iqeq import read_json,main_header_pipeline |
| from src.iqeq_modification.post_process_portfolio_company_json import process_document_company_wise |
|
|
| |
| |
| |
|
|
| |
| from src.layout_detection.layout_detection_docling_heron import yolov10_layout_pipeline,get_file_name_without_extension |
|
|
| |
| |
| from src.table_merge.table_merge_v5 import merge_multi_page_tables_pipeline_v2 |
| from src.table_query.query_code_openai import get_query_response |
|
|
| from src.custom_headers.pdf_header_detector import process_pdf_for_headers |
| from src.custom_headers.consolidate_header_jsons import pipeline_for_merging_headers |
|
|
| from utils.utils_code import clear_directory |
|
|
| import logging,os |
| from logging.config import dictConfig |
| import shutil |
| import re |
| from fastapi import HTTPException, Form |
| from src.classification.column_classifier_v2 import classify_column_headers |
| from src.classification.classification import perform_classification |
|
|
# Directory that holds the service log file; created eagerly so the file
# handler configured below can open logs/app.log without failing.
log_folder = "logs"
os.makedirs(log_folder, exist_ok=True)


log_file_path = os.path.join(log_folder, "app.log")
|
|
# dictConfig schema: one detailed formatter, console + append-mode file
# handlers, applied to both the root logger and uvicorn's logger.
logging_config = {
    'version': 1,
    # Keep loggers created before this config (e.g. library loggers) active.
    'disable_existing_loggers': False,
    'formatters': {
        'detailed': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(pathname)s:%(lineno)d - %(message)s',
            'datefmt': '%Y-%m-%d %H:%M:%S'
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'level': 'INFO',
            'formatter': 'detailed',
            'stream': 'ext://sys.stdout'
        },
        'file': {
            'class': 'logging.FileHandler',
            'level': 'INFO',
            'formatter': 'detailed',
            'filename': log_file_path,
            # Append across restarts rather than truncating.
            'mode': 'a',
        },
    },
    'loggers': {
        # Root logger: everything at INFO+ goes to both handlers.
        '': {
            'handlers': ['console', 'file'],
            'level': 'INFO',
            'propagate': True
        },

        # uvicorn logs handled directly; propagate=False avoids duplicate
        # records via the root logger.
        'uvicorn': {
            'handlers': ['console', 'file'],
            'level': 'INFO',
            'propagate': False
        },
    }
}
|
|
| |
# Install the logging configuration defined above (console + file handlers).
dictConfig(logging_config)


# Module-level logger for this service.
logger = logging.getLogger(__name__)


app = FastAPI()


# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# rejected by browsers for credentialed requests and is overly permissive --
# confirm this API is only reachable from trusted networks.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
|
|
# Scratch directories for the PDF and word/CSV extraction pipelines, created
# up-front so the endpoints can assume they exist.  Each path is exposed under
# two module-level names because both spellings are referenced elsewhere in
# this module.  (Fix: dropped pointless f-string prefixes -- the literals have
# no placeholders.)
pdf_input_path = os.path.join(project_output_directory_path, "pdf_extraction/input")
pdf_input_directory = pdf_input_path
os.makedirs(pdf_input_directory, exist_ok=True)


pdf_output_path = os.path.join(project_output_directory_path, "pdf_extraction/output")
output_directory = pdf_output_path
os.makedirs(output_directory, exist_ok=True)


word_input_path = os.path.join(project_output_directory_path, "word_csv_extraction/directory/input")
word_input_directory_path = word_input_path
os.makedirs(word_input_directory_path, exist_ok=True)


word_output_path = os.path.join(project_output_directory_path, "word_csv_extraction/directory/output")
word_output_directory_path = word_output_path
os.makedirs(word_output_directory_path, exist_ok=True)


# Per-document pipeline state keyed by uploaded file name.  NOTE(review):
# module-level mutable state reset by each request -- not safe under
# concurrent requests; confirm single-client usage.
document_data = {}
|
|
@app.post("/structured_chunking_extract")
async def upload_documents(request: Request, path: str = Form()) :
    """Run the full structured-chunking extraction pipeline on a local PDF.

    Form field ``path`` is an absolute path *on the server host* to the PDF.
    The file is copied into the pipeline input directory, run through layout
    detection, table filtering, custom-header merging, pdfminer extraction,
    multi-page table merge/mapping and company-wise post-processing, and the
    final JSON is written next to the input PDF as ``<name>.json``.

    Returns ``{"status_code": 200, "saved_json_path": ...}`` on success, or
    ``{"success": False, "message": ...}`` on validation/copy failure.
    """

    print(f'started for path: {path}')
    # Base URL of this service, used to build /image URLs for the frontend.
    base_url = str(request.base_url)
    # NOTE(review): global state wiped on every request -- concurrent
    # requests will clobber each other's pipeline state; confirm usage.
    global document_data
    document_data = {}
    pdf_path = path
    # Clear all working directories left over from the previous run.
    clear_directory(pdf_input_path)
    clear_directory(pdf_output_path)
    clear_directory(word_input_path)
    clear_directory(word_output_path)


    # Error-shaped response used by the early-return validation paths below.
    response = {
        "success": False,
        "message": "",

    }


    if not pdf_path.lower().endswith(".pdf"):
        response["message"] = "Invalid file type. Only PDF files are accepted."
        return response


    file_name_with_ext = os.path.basename(pdf_path)
    file_name_without_ext = os.path.splitext(file_name_with_ext)[0]


    # Copy the source PDF into the pipeline's input directory.
    destination_path = os.path.join(pdf_input_directory, file_name_with_ext)

    try:

        shutil.copy2(pdf_path, destination_path)
    except Exception as e:
        response["message"] = f"Failed to copy file: {str(e)}"
        return response


    # Per-file output tree: <output>/<name>_output and its table_output dir.
    output_directory_path = os.path.join(output_directory)
    os.makedirs(output_directory_path, exist_ok=True)
    file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
    os.makedirs(file_output, exist_ok=True)


    table_output_path = os.path.join(file_output, f"table_output")
    os.makedirs(table_output_path, exist_ok=True)
    file_location = destination_path


    # Layout + header detection (15 return values; the last is unused here).
    json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path,cropped_tables_images_dir_path,_ = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
    table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)


    # Detect custom headers and merge them into the model header JSON.
    custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output)
    header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext)


    # Bookkeeping entry for this document; each list-valued field receives a
    # single element via the appends below.
    document_data[file_name_with_ext] = {

        "pdf_path": destination_path,
        "pdf_file_name": file_name_with_ext,
        "model_json_header_output_filepath": [],
        "model_json_layout_output_filepath": [],
        "tree_structured_header_json_filepath": [],
        "user_modified_json_output_filepath": [],
        'user_modified_table_json_filepath': [],
        "frontend_output_json": [],
        "cluster_json": [],
        "id_2_label" : [],
        "file_output_dir" : [],
        "table_output_dir": [],
        "table_with_header_data" : [],
        "table_with_header_json_path" : [],
        "json_output_dir": [],
        "pdf_miner_json_path": [] ,
        "searchable_pdf_path" : []
    }


    document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
    document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
    document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
    document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
    document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
    document_data[file_name_with_ext]["file_output_dir"].append(file_output)
    document_data[file_name_with_ext]["id_2_label"].append(class_names)
    document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
    document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
    document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
    document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)


    file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
    pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
    pdf_path = document_data[file_name_with_ext]["pdf_path"]
    user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]


    # pdfminer text extraction + searchable PDF, then attach table headers
    # into the tree-structured JSON.
    pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)

    table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
    document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)


    document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)


    # Build page-image URLs served back via the /image endpoint.
    pdf_images_urls = []
    for file_name in os.listdir(pdf_images_path):
        file_path = os.path.join(pdf_images_path, file_name)
        if file_name.endswith((".jpg", ".jpeg", ".png")):
            img_url = base_url + "image/" + str(quote(file_path))
            pdf_images_urls.append(img_url)


    # Sort by the page number embedded before the extension: ..._<page>.<ext>
    def extract_page_no(url):
        return int(url.split("_")[-1].split(".")[0])
    sorted_urls = sorted(pdf_images_urls, key=extract_page_no)


    page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]


    document_data[file_name_with_ext]["frontend_output_json"].append({
        "layout_output_json_data": layout_output_json_data,
        "layout_json_list_data": layout_list_data,
        "id_2_label": class_names,
        "header_output_json_data": header_output_json_data,
        "table_output_json_data": table_json_data,
        "table_output_json_data_list": table_json_data_list,
        "tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
        "pdf_images_urls": page_details,
    })




    document_id_name = file_name_with_ext


    # Read the bookkeeping entry back (same values stored just above).
    data = document_data[document_id_name]
    file_output_dir = data["file_output_dir"][0]
    json_output_dir = data["json_output_dir"][0]
    pdf_file_name = data["pdf_file_name"]
    pdf_path = data["pdf_path"]


    pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
    modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
    model_modified_json = read_json(modified_json_output_filepath)
    pdfminer_json = read_json(pdf_miner_json_filepath)
    searchable_pdf_path = data["searchable_pdf_path"][0]


    # Merge tables spanning multiple pages, then map each table to its header.
    table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)

    table_merged_json = read_json(table_merged_json_path)

    table_mapped_modified_json = map_table_with_its_header(table_merged_json)


    # Header-to-content extraction, then dataframe cleanup.
    df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)


    clean_df, clean_df_json = clean_dataframe(df_final)


    # clean_dataframe may return its JSON form as a string; normalize to objects.
    if isinstance(clean_df_json, str):


        clean_df_json = json.loads(clean_df_json)


    # Company-wise regrouping of the cleaned output.
    file_name = get_file_name_without_extension(pdf_file_name)
    merged_content_company_wise_df = process_document_company_wise(clean_df_json,output_directory=json_output_dir,file_name=file_name,table_output_directory=cropped_tables_images_dir_path)
    company_wise_clean_df, company_wise_clean_df_json = clean_dataframe(merged_content_company_wise_df)


    json_output_filename = file_name + "_final_h2h_extraction.json"
    final_json_output_filepath = os.path.join(json_output_dir, json_output_filename)


    with open(final_json_output_filepath, 'w') as f:
        json.dump(clean_df_json, f, indent=4)


    company_wise_json_output_filename = file_name + "_final_h2h_extraction_company_wise.json"
    company_wise_final_json_output_filepath = os.path.join(json_output_dir, company_wise_json_output_filename)


    # NOTE(review): dumps merged_content_company_wise_df directly -- assumes
    # process_document_company_wise returns JSON-serializable data, not a
    # pandas DataFrame; confirm.
    with open(company_wise_final_json_output_filepath, 'w') as f:
        json.dump(merged_content_company_wise_df, f, indent=4)


    # The final result is written next to the *input* PDF path the caller
    # supplied (not into the pipeline output tree).
    pdf_path = path
    json_directory = os.path.dirname(pdf_path)
    json_filename = os.path.splitext(os.path.basename(pdf_path))[0]


    output_json_path = os.path.join(json_directory, f"{json_filename }.json")


    if isinstance(company_wise_clean_df_json, str):
        company_wise_clean_df_json = json.loads(company_wise_clean_df_json)


    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(company_wise_clean_df_json, json_file, ensure_ascii=False, indent=4)


    response_final = {
        "status_code": 200,


        "saved_json_path": output_json_path
    }


    return response_final
|
|
|
|
@app.get("/image/{path:path}")
async def get_image(path: str):
    """Serve a generated page image from an absolute filesystem path."""
    if not os.path.exists(path):
        raise HTTPException(status_code=404, detail="Image not found")
    return FileResponse(path, media_type="image/jpeg")
| |
@app.get("/file/{path:path}")
async def get_file(path: str):
    """Stream a file for download, choosing the media type from its extension."""
    if not os.path.exists(path):
        raise HTTPException(status_code=404, detail="File not found")
    # Download name is the last '/'-separated component of the path.
    filename = path.split("/")[-1]
    if path.endswith('.csv'):
        media_type = "text/csv"
    elif path.endswith('.xlsx'):
        media_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    else:
        media_type = "application/octet-stream"
    return FileResponse(path, media_type=media_type, filename=filename)
|
|
|
|
@app.post("/table-classification")
async def table_classification(
    structured_chunk_json_path: str = Form(...),
    class_keywords_table: str = Form(...),
    header_categories: Optional[str] = Form("table_column_header"),
    similarity_threshold: Optional[float] = Form(0.4)
):
    """Classify tables in a structured-chunk JSON file using keyword classes.

    Form fields:
        structured_chunk_json_path: server-side path to the extraction JSON.
        class_keywords_table: JSON string mapping class name -> list of keywords.
        header_categories: header category label(s) forwarded to the classifier.
        similarity_threshold: match threshold forwarded to the classifier.

    Returns the classifier output; raises HTTPException 422 on invalid input
    or processing failure.
    """

    try:

        with open(structured_chunk_json_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Strip trailing commas before ']' or '}' so slightly malformed JSON
        # (a common hand-edit artifact) still parses.
        cleaned_content = re.sub(r',\s*([\]}])', r'\1', content)

        structured_chunk_data = json.loads(cleaned_content)

        # Accept class_keywords_table either as a JSON string (the normal
        # form-field case) or as an already-parsed dict; enforce the shape
        # {str: [str, ...]} in both cases.
        if isinstance(class_keywords_table, str):
            try:
                class_keywords_table = json.loads(class_keywords_table)

                if not isinstance(class_keywords_table, dict):
                    raise ValueError("class_keywords_table must be a dictionary")
                if not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
                           for key, value in class_keywords_table.items()):
                    raise ValueError("class_keywords_table must be a dictionary with string keys and lists of strings as values")
            except json.JSONDecodeError:
                raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table"})

        elif not isinstance(class_keywords_table, dict) or not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
                                                                   for key, value in class_keywords_table.items()):
            raise HTTPException(status_code=422, detail={"error": "class_keywords_table must be a dictionary with string keys and lists of strings as values"})

        categorized_headers = perform_classification(
            data=structured_chunk_data,
            class_keywords=class_keywords_table,
            header_categories=header_categories,
            similarity_threshold=similarity_threshold
        )
        return categorized_headers
    except HTTPException:
        # Bug fix: HTTPException is a subclass of Exception, so the broad
        # handler below used to re-wrap the intentional 422s raised above
        # into a generic "Processing failed" detail.  Re-raise them intact.
        raise
    except ValueError as e:
        raise HTTPException(status_code=422, detail={"error": "Input validation failed", "message": str(e)})
    except Exception as e:
        raise HTTPException(status_code=422, detail={"error": "Processing failed", "message": str(e)})
|
|
|
|
@app.post("/table-column-classification")
async def table_column_classification(
    input_table_classified_json: Annotated[str, Form()],
    class_keywords_table_column: Annotated[str, Form()],
    filter_table_classifier_name: Annotated[str, Form()],
    similarity_threshold: Annotated[str, Form()]
):
    """Classify table column headers for tables with a given classifier label.

    Form fields:
        input_table_classified_json: JSON string of table-classification output.
        class_keywords_table_column: JSON string mapping class name -> keyword list.
        filter_table_classifier_name: only tables carrying this label are processed.
        similarity_threshold: numeric string, parsed as float.

    Returns:
        {"column_classification_result": <classifier output>}

    Raises:
        HTTPException: 422 on malformed JSON or a non-numeric threshold.
    """

    try:

        input_table_classified_json = json.loads(input_table_classified_json)
        class_keywords_table_column = json.loads(class_keywords_table_column)
    except json.JSONDecodeError as e:
        raise HTTPException(status_code=422, detail={"error": "Invalid JSON format", "message": str(e)})


    try:

        similarity_threshold = float(similarity_threshold)
    except ValueError as e:
        # Bug fix: the value is parsed as a float, but the error message
        # previously said "integer", misleading API clients.
        raise HTTPException(status_code=422, detail={"error": "Similarity threshold must be a valid number", "message": str(e)})


    column_classification_results = classify_column_headers(
        json_data=input_table_classified_json,
        class_keywords=class_keywords_table_column,
        filter_table_classifier_name=filter_table_classifier_name,
        similarity_threshold=similarity_threshold
    )


    results = {"column_classification_result": column_classification_results}


    return results
|
|
|
|
| |
if __name__ == "__main__":
    # Serve the API on all interfaces.  NOTE(review): uvicorn.run() blocks, so
    # when this file is run as a script the definitions that follow this guard
    # in the file are only executed after the server shuts down.
    uvicorn.run( app, host="0.0.0.0", port=7063,log_level="info")
| |
|
|
|
|
|
|
# NOTE(review): near-duplicate of the /structured_chunking_extract endpoint as
# a plain function.  Because it shares the endpoint's name, importing this
# module rebinds `upload_documents` to this function (the registered route
# keeps the original object).  Confirm whether this copy is still needed.
def upload_documents(path) :
    """Run the structured-chunking pipeline on *path* outside of FastAPI.

    Mirrors the /structured_chunking_extract endpoint but takes only a file
    path.  Returns {"status_code": 200, "saved_json_path": ...} on success or
    {"success": False, "message": ...} on validation/copy failure.
    """

    # NOTE(review): this binds the *class* Request, so `request.base_url` below
    # is the property descriptor object and str() of it is a repr like
    # "<property object at ...>" -- the generated image URLs are garbage.
    # A base_url would need to be supplied some other way; confirm intent.
    request = Request
    print(f'started for path: {path}')
    base_url = str(request.base_url)
    # NOTE(review): shares and resets the module-global pipeline state.
    global document_data
    document_data = {}
    pdf_path = path


    # Clear working directories left over from the previous run.
    clear_directory(pdf_input_path)
    clear_directory(pdf_output_path)
    clear_directory(word_input_path)
    clear_directory(word_output_path)


    # Error-shaped response used by the early-return validation paths below.
    response = {
        "success": False,
        "message": "",

    }


    if not pdf_path:
        response["message"] = "No file path provided."
        return response


    if not pdf_path.lower().endswith(".pdf"):
        response["message"] = "Invalid file type. Only PDF files are accepted."
        return response


    file_name_with_ext = os.path.basename(pdf_path)
    file_name_without_ext = os.path.splitext(file_name_with_ext)[0]


    # Copy the source PDF into the pipeline's input directory.
    destination_path = os.path.join(pdf_input_directory, file_name_with_ext)

    try:

        shutil.copy2(pdf_path, destination_path)
    except Exception as e:
        response["message"] = f"Failed to copy file: {str(e)}"
        return response


    # Per-file output tree: <output>/<name>_output and its table_output dir.
    output_directory_path = os.path.join(output_directory)
    os.makedirs(output_directory_path, exist_ok=True)
    file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
    os.makedirs(file_output, exist_ok=True)


    table_output_path = os.path.join(file_output, f"table_output")
    os.makedirs(table_output_path, exist_ok=True)
    file_location = destination_path


    # NOTE(review): unpacks 14 values here while the endpoint version unpacks
    # 15 -- if the pipeline's arity is fixed, one of the two call sites will
    # raise ValueError; confirm against yolov10_layout_pipeline.
    json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path,cropped_tables_images_dir_path = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
    table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)

    # Detect custom headers and merge them into the model header JSON.
    custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output)
    header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext)


    # Bookkeeping entry for this document; each list-valued field receives a
    # single element via the appends below.
    document_data[file_name_with_ext] = {

        "pdf_path": destination_path,
        "pdf_file_name": file_name_with_ext,
        "model_json_header_output_filepath": [],
        "model_json_layout_output_filepath": [],
        "tree_structured_header_json_filepath": [],
        "user_modified_json_output_filepath": [],
        'user_modified_table_json_filepath': [],
        "frontend_output_json": [],
        "cluster_json": [],
        "id_2_label" : [],
        "file_output_dir" : [],
        "table_output_dir": [],
        "table_with_header_data" : [],
        "table_with_header_json_path" : [],
        "json_output_dir": [],
        "pdf_miner_json_path": [] ,
        "searchable_pdf_path" : []
    }


    document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
    document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
    document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
    document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
    document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
    document_data[file_name_with_ext]["file_output_dir"].append(file_output)
    document_data[file_name_with_ext]["id_2_label"].append(class_names)
    document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
    document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
    document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
    document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)


    file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
    pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
    pdf_path = document_data[file_name_with_ext]["pdf_path"]
    user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]


    # pdfminer text extraction + searchable PDF, then attach table headers.
    pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)

    table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
    document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)


    document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)


    # Build page-image URLs (broken here; see the base_url note above).
    pdf_images_urls = []
    for file_name in os.listdir(pdf_images_path):
        file_path = os.path.join(pdf_images_path, file_name)
        if file_name.endswith((".jpg", ".jpeg", ".png")):
            img_url = base_url + "image/" + str(quote(file_path))
            pdf_images_urls.append(img_url)


    # Sort by the page number embedded before the extension: ..._<page>.<ext>
    def extract_page_no(url):
        return int(url.split("_")[-1].split(".")[0])
    sorted_urls = sorted(pdf_images_urls, key=extract_page_no)


    page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]


    document_data[file_name_with_ext]["frontend_output_json"].append({
        "layout_output_json_data": layout_output_json_data,
        "layout_json_list_data": layout_list_data,
        "id_2_label": class_names,
        "header_output_json_data": header_output_json_data,
        "table_output_json_data": table_json_data,
        "table_output_json_data_list": table_json_data_list,
        "tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
        "pdf_images_urls": page_details,
    })




    document_id_name = file_name_with_ext


    # Read the bookkeeping entry back (same values stored just above).
    data = document_data[document_id_name]
    file_output_dir = data["file_output_dir"][0]
    json_output_dir = data["json_output_dir"][0]
    pdf_file_name = data["pdf_file_name"]
    pdf_path = data["pdf_path"]


    pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
    modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
    model_modified_json = read_json(modified_json_output_filepath)
    pdfminer_json = read_json(pdf_miner_json_filepath)
    searchable_pdf_path = data["searchable_pdf_path"][0]


    # Merge tables spanning multiple pages, then map each table to its header.
    table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)

    table_merged_json = read_json(table_merged_json_path)

    table_mapped_modified_json = map_table_with_its_header(table_merged_json)


    # Header-to-content extraction, then dataframe cleanup.
    df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)


    clean_df, clean_df_json = clean_dataframe(df_final)


    # clean_dataframe may return its JSON form as a string; normalize to objects.
    if isinstance(clean_df_json, str):
        clean_df_json = json.loads(clean_df_json)


    # Company-wise regrouping of the cleaned output.
    file_name = get_file_name_without_extension(pdf_file_name)
    merged_content_company_wise_df = process_document_company_wise(clean_df_json,output_directory=json_output_dir,file_name=file_name,table_output_directory=cropped_tables_images_dir_path)
    company_wise_clean_df, company_wise_clean_df_json = clean_dataframe(merged_content_company_wise_df)


    json_output_filename = file_name + "_final_h2h_extraction.json"
    final_json_output_filepath = os.path.join(json_output_dir, json_output_filename)


    with open(final_json_output_filepath, 'w') as f:
        json.dump(clean_df_json, f, indent=4)


    company_wise_json_output_filename = file_name + "_final_h2h_extraction_company_wise.json"
    company_wise_final_json_output_filepath = os.path.join(json_output_dir, company_wise_json_output_filename)


    # NOTE(review): dumps merged_content_company_wise_df directly -- assumes
    # it is JSON-serializable, not a pandas DataFrame; confirm.
    with open(company_wise_final_json_output_filepath, 'w') as f:
        json.dump(merged_content_company_wise_df, f, indent=4)


    # The final result is written next to the *input* PDF path supplied by
    # the caller (not into the pipeline output tree).
    pdf_path = path
    json_directory = os.path.dirname(pdf_path)
    json_filename = os.path.splitext(os.path.basename(pdf_path))[0]


    output_json_path = os.path.join(json_directory, f"{json_filename }.json")


    if isinstance(company_wise_clean_df_json, str):
        company_wise_clean_df_json = json.loads(company_wise_clean_df_json)


    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(company_wise_clean_df_json, json_file, ensure_ascii=False, indent=4)


    response_final = {
        "status_code": 200,


        "saved_json_path": output_json_path
    }




    return response_final
|
|
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
|
|
| def table_extraction_and_mapping(path, |
| field_name, |
| class_keywords_table, |
| header_categories, |
| class_keywords_table_column, |
| filter_table_classifier_name, |
| threshold) : |
| |
| request = Request |
| print(f'started for path: {path}') |
| base_url = str(request.base_url) |
| global document_data |
| document_data = {} |
| pdf_path = path |
|
|
| clear_directory(pdf_input_path) |
| clear_directory(pdf_output_path) |
| clear_directory(word_input_path) |
| clear_directory(word_output_path) |
|
|
| |
| response = { |
| "success": False, |
| "message": "", |
| |
| } |
|
|
| |
| if not pdf_path: |
| response["message"] = "No file path provided." |
| return response |
| |
| |
| if not pdf_path.lower().endswith(".pdf"): |
| response["message"] = "Invalid file type. Only PDF files are accepted." |
| return response |
|
|
| |
| |
| file_name_with_ext = os.path.basename(pdf_path) |
| file_name_without_ext = os.path.splitext(file_name_with_ext)[0] |
|
|
| |
| destination_path = os.path.join(pdf_input_directory, file_name_with_ext) |
| |
| try: |
| |
| shutil.copy2(pdf_path, destination_path) |
| except Exception as e: |
| response["message"] = f"Failed to copy file: {str(e)}" |
| return response |
|
|
| output_directory_path = os.path.join(output_directory) |
| os.makedirs(output_directory_path, exist_ok=True) |
| file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output") |
| os.makedirs(file_output, exist_ok=True) |
|
|
| table_output_path = os.path.join(file_output, f"table_output") |
| os.makedirs(table_output_path, exist_ok=True) |
| file_location = destination_path |
| |
|
|
| |
| |
| json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output) |
| table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext) |
| |
| custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output) |
| header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext) |
|
|
| |
| document_data[file_name_with_ext] = { |
| |
| "pdf_path": destination_path, |
| "pdf_file_name": file_name_with_ext, |
| "model_json_header_output_filepath": [], |
| "model_json_layout_output_filepath": [], |
| "tree_structured_header_json_filepath": [], |
| "user_modified_json_output_filepath": [], |
| 'user_modified_table_json_filepath': [], |
| "frontend_output_json": [], |
| "cluster_json": [], |
| "id_2_label" : [], |
| "file_output_dir" : [], |
| "table_output_dir": [], |
| "table_with_header_data" : [], |
| "table_with_header_json_path" : [], |
| "json_output_dir": [], |
| "pdf_miner_json_path": [] , |
| "searchable_pdf_path" : [] |
| } |
|
|
| |
# Record every artifact path/object produced by the layout stage under this
# document's registry entry.
document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
# NOTE(review): the "user modified" entry starts out identical to the model
# header output — presumably replaced once a user edits headers; confirm.
document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
document_data[file_name_with_ext]["file_output_dir"].append(file_output)
document_data[file_name_with_ext]["id_2_label"].append(class_names)
document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)

# Read back the values the pdfminer stage needs (index 0 = first/only run).
file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
pdf_path = document_data[file_name_with_ext]["pdf_path"]
user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]

# Stage 2: build a pdfminer text-extraction JSON plus a searchable PDF.
pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)

# Inject table headers (located via the pdfminer text) into the tree-structured JSON.
table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)

document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
|
|
| |
# Build public URLs for every rendered page image of the PDF; `base_url` comes
# from enclosing scope and the path is percent-encoded for the /image/ route.
pdf_images_urls = []
for file_name in os.listdir(pdf_images_path):
    file_path = os.path.join(pdf_images_path, file_name)
    if file_name.endswith((".jpg", ".jpeg", ".png")):
        img_url = base_url + "image/" + str(quote(file_path))
        pdf_images_urls.append(img_url)
|
|
| |
def extract_page_no(url):
    """Return the page number embedded in a page-image URL.

    Expects URLs whose basename ends in ``..._<page>.<ext>``: the digits
    between the final underscore and the first following dot are parsed.
    """
    tail = url.rsplit("_", 1)[-1]
    page_str, _, _ = tail.partition(".")
    return int(page_str)
# Order the page-image URLs by their embedded page number so the frontend
# renders pages in document order.
sorted_urls = sorted(pdf_images_urls, key=extract_page_no)

# One entry per page with a 1-based id for the frontend.
page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
|
|
| |
# Payload consumed by the frontend viewer: raw layout + header + table data,
# the tree-structured headers (with table headers injected), and the ordered
# page-image URL entries built above.
document_data[file_name_with_ext]["frontend_output_json"].append({
    "layout_output_json_data": layout_output_json_data,
    "layout_json_list_data": layout_list_data,
    "id_2_label": class_names,
    "header_output_json_data": header_output_json_data,
    "table_output_json_data": table_json_data,
    "table_output_json_data_list": table_json_data_list,
    "tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
    "pdf_images_urls": page_details,
})
|
|
|
|
# Stage 3 setup: re-read everything the post-processing stage needs from the
# registry (same key as above; index 0 = first/only run).
document_id_name = file_name_with_ext

data = document_data[document_id_name]
file_output_dir = data["file_output_dir"][0]
json_output_dir = data["json_output_dir"][0]
pdf_file_name = data["pdf_file_name"]
pdf_path = data["pdf_path"]

pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
model_modified_json = read_json(modified_json_output_filepath)
pdfminer_json = read_json(pdf_miner_json_filepath)
searchable_pdf_path = data["searchable_pdf_path"][0]
|
|
| |
| |
# Merge tables that continue across page boundaries into single logical tables.
table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
table_merged_json = read_json(table_merged_json_path)
# Associate each merged table with its table-header block.
table_mapped_modified_json = map_table_with_its_header(table_merged_json)

# Flatten the header/content structure into a dataframe + header-content JSON.
df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
# Clean the dataframe; returns both the cleaned frame and its JSON form.
clean_df, clean_df_json = clean_dataframe(df_final)
| |
|
|
| |
| |
|
|
file_name = get_file_name_without_extension(pdf_file_name)

# Write the final structured-chunking JSON next to the source PDF.
# NOTE(review): `path` comes from enclosing scope — presumably the uploaded
# PDF's path; confirm against the endpoint signature.
pdf_path = path
json_directory = os.path.dirname(pdf_path)
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]

output_json_path = os.path.join(json_directory, f"{json_filename}_structured_chunking.json")

# clean_dataframe may hand back a JSON string; normalise to Python objects
# before dumping so the output file is valid JSON, not a quoted string.
if isinstance(clean_df_json, str):
    clean_df_json = json.loads(clean_df_json)

with open(output_json_path, 'w', encoding='utf-8') as json_file:
    json.dump(clean_df_json, json_file, ensure_ascii=False, indent=4)
|
|
|
|
| |
| |
| |
| |
|
|
|
|
print("starting table classification pipeline")

structured_chunk_json_path = output_json_path

with open(structured_chunk_json_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Strip trailing commas before ] or } so json.loads does not reject the file.
cleaned_content = re.sub(r',\s*([\]}])', r'\1', content)

structured_chunk_data = json.loads(cleaned_content)
|
|
# Normalise classifier inputs: `threshold` must be numeric and
# `class_keywords_table` a dict mapping class name -> keyword list.
threshold = float(threshold)
print("type of class_keywords_table::\n",type(class_keywords_table))

if isinstance(class_keywords_table, str):
    # Multipart/form submissions deliver the mapping as a JSON string.
    try:
        class_keywords_table = json.loads(class_keywords_table)
    except json.JSONDecodeError:
        raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table"})
elif isinstance(class_keywords_table, dict):
    # Already the shape we need — nothing to do.
    # (Replaced a redundant self-assignment `class_keywords_table = class_keywords_table`.)
    pass
else:
    raise HTTPException(status_code=422, detail={"error": "class_keywords_table must be a dictionary with string keys and lists of strings as values"})
|
|
| |
| |
# Classify each structured chunk's tables into the user-supplied categories.
categorized_headers_json = perform_classification(
    data=structured_chunk_data,
    class_keywords=class_keywords_table,
    header_categories=header_categories,
    similarity_threshold=threshold
)

pdf_path = path
json_directory = os.path.dirname(pdf_path)
# NOTE(review): json_filename is computed but unused below — the output file
# is named after `field_name`. Confirm that is intentional.
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]

classified_table_output_json_path = os.path.join(json_directory, f"{field_name}_table_classification.json")

# perform_classification may return a JSON string; normalise before dumping.
if isinstance(categorized_headers_json, str):
    categorized_headers_json = json.loads(categorized_headers_json)

with open(classified_table_output_json_path, 'w', encoding='utf-8') as json_file:
    json.dump(categorized_headers_json, json_file, ensure_ascii=False, indent=4)
|
|
| |
| |
| |
print("Starting Table Column Classification")

# Reload the table-classification output as input for column classification.
# encoding='utf-8' added: the file was written as UTF-8 above, and every other
# open() in this pipeline passes encoding explicitly — relying on the platform
# default here could mis-decode non-ASCII content on some systems.
with open(classified_table_output_json_path, "r", encoding="utf-8") as f:
    input_table_classified_json = json.load(f)
|
|
|
|
| |
|
|
# Normalise class_keywords_table_column exactly like class_keywords_table above.
if isinstance(class_keywords_table_column, str):
    try:
        class_keywords_table_column = json.loads(class_keywords_table_column)
    except json.JSONDecodeError:
        raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table_column"})
elif isinstance(class_keywords_table_column, dict):
    pass  # already a dict (replaced a redundant self-assignment)
else:
    # Guard added for parity with the class_keywords_table validation above:
    # previously an unexpected type fell through silently and failed later
    # inside classify_column_headers with a less actionable error.
    raise HTTPException(status_code=422, detail={"error": "class_keywords_table_column must be a dictionary with string keys and lists of strings as values"})

similarity_threshold = float(threshold)

# Classify column headers within the tables selected by filter_table_classifier_name.
column_classification_results_json = classify_column_headers(
    json_data=input_table_classified_json,
    class_keywords=class_keywords_table_column,
    filter_table_classifier_name=filter_table_classifier_name,
    similarity_threshold=similarity_threshold
)
|
|
| |
pdf_path = path
json_directory = os.path.dirname(pdf_path)
# NOTE(review): json_filename is unused here — the output is named after
# `field_name`, mirroring the table-classification output above. Confirm intended.
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]

classified_table_column_output_json_path = os.path.join(json_directory, f"{field_name}_table_column_classification.json")

# classify_column_headers may return a JSON string; normalise before dumping.
if isinstance(column_classification_results_json, str):
    column_classification_results_json = json.loads(column_classification_results_json)

with open(classified_table_column_output_json_path, 'w', encoding='utf-8') as json_file:
    json.dump(column_classification_results_json, json_file, ensure_ascii=False, indent=4)
|
|
| |
| |
# API response: paths to the three JSON artefacts produced by this pipeline.
response_final = {
    "status_code": 200,
    "structured_chunk_json_path": output_json_path,
    "table_classification_json_path":classified_table_output_json_path,
    "table_column_classification_json_path" : classified_table_column_output_json_path
}

return response_final
|
|
|
|
|
|
|
|
|
|
|
|