Upload 8 files
Browse files- company_name_extraction_by_ovis.py +69 -0
- iqeq_app_latest (4).py +1437 -0
- layout_detection_docling_heron (1).py +497 -0
- layout_detection_docling_heron (2).py +497 -0
- load_model (1).py +106 -0
- ovis_config.py +148 -0
- post_process_portfolio_company_json 2.py +402 -0
- rabbitmq_config_investor_report.py +23 -0
company_name_extraction_by_ovis.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import json
import logging
from src.iqeq_modification.ovis_config import _run_inference

logger = logging.getLogger(__name__)


def extract_company_names(table_image_folder: str) -> list:
    """Extract unique portfolio/investee company names from table images.

    Runs the OVIS vision-language model over every supported image in
    ``table_image_folder`` with a strict JSON-list extraction prompt,
    parses each response, and accumulates the de-duplicated names.

    Args:
        table_image_folder: Directory containing cropped table images.

    Returns:
        List of unique, whitespace-stripped company names. Order is not
        guaranteed (names are collected in a set). Empty list when the
        folder contains no supported images.
    """
    logger.info("=" * 80)
    logger.info("STARTED COMPANY NAME EXTRACTION USING OVIS")
    logger.info(f"Image folder: {table_image_folder}")
    logger.info("=" * 80)

    # Load all images from folder (sorted so inference order is deterministic)
    supported_ext = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.webp')
    image_paths = [
        os.path.join(table_image_folder, f)
        for f in sorted(os.listdir(table_image_folder))
        if f.lower().endswith(supported_ext)
    ]

    if not image_paths:
        logger.warning("No valid images found in the folder.")
        return []

    logger.info(f"Found {len(image_paths)} image(s) for inference")

    prompt = (
        "You are an expert financial document analysis model specialized in reading tables from investor reports. "
        "You are given an image of a table that may contain portfolio or investee company details such as company names, fund names, sectors, and investment amounts.\n\n"
        "Your task:\n"
        "1. Identify if the table contains portfolio or investee company information.\n"
        "2. Extract only the actual **company or investee organization names**, excluding fund names, co-investment entities, management labels, or generic terms.\n"
        "3. Do NOT include partial terms, descriptors, or words like 'Fund', 'Holdings', 'Co-Investment', 'Management', 'Other Unitholders', 'Endurance', 'Growth', 'PIK', etc.\n"
        "4. Remove duplicates and retain only unique, meaningful company names.\n"
        "5. Each extracted name should be a clean, full company name (e.g., 'Kate Spade Ltd', 'Milano Ventures Pvt Ltd').\n\n"
        "Return your final answer strictly as a valid JSON list of strings, for example:\n"
        "[\"Kate Spade Pvt Ltd\", \"Milano Ventures\", \"XYZ Technologies\"]\n\n"
        "If no valid company names are found, return [] only.\n"
        "Do not include any explanations, reasoning, or text outside the JSON list."
    )

    company_names = set()

    # One inference call per image keeps each prompt small (avoids overflow).
    for img in image_paths:
        try:
            raw, _ = _run_inference(img, prompt, max_new_tokens=2048)
            logger.debug(f"Raw OVIS output for {img}: {raw}")
            if not raw:
                continue
            # VLMs frequently wrap JSON in markdown code fences despite the
            # prompt; strip ```/```json wrappers before parsing so those
            # responses are not discarded as parse failures.
            cleaned = raw.strip()
            if cleaned.startswith("```"):
                cleaned = cleaned.strip("`").strip()
                if cleaned.lower().startswith("json"):
                    cleaned = cleaned[4:].strip()
            try:
                names = json.loads(cleaned)
                if isinstance(names, list):
                    for name in names:
                        if isinstance(name, str) and name.strip():
                            company_names.add(name.strip())
            except json.JSONDecodeError:
                logger.warning(f"Failed to parse OVIS output for {img}: {raw}")

        except Exception as e:
            # One bad image must not abort the whole batch.
            logger.error(f"OVIS inference failed for {img}: {e}", exc_info=True)
            continue

    logger.info(f"Extracted {len(company_names)} unique company name(s).")
    return list(company_names)

# print(extract_company_names("/shared_disk/kushal/land_contract/company tables"))
|
iqeq_app_latest (4).py
ADDED
|
@@ -0,0 +1,1437 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from fastapi import FastAPI, HTTPException, Depends, File, Request, Form
|
| 2 |
+
from fastapi.security import OAuth2PasswordBearer, OAuth2PasswordRequestForm
|
| 3 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 4 |
+
from fastapi.responses import JSONResponse, FileResponse
|
| 5 |
+
from urllib.parse import quote
|
| 6 |
+
from typing import List, Annotated,Dict,Optional,Any
|
| 7 |
+
import uvicorn
|
| 8 |
+
import sys
|
| 9 |
+
import json
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
sys.path.append("/shared_disk/kushal/db_str_chunking/new_ws_structured_code/db_structured_chunking/structure_chunking")
|
| 13 |
+
# from config.set_config.set_configuration import set_config_project
|
| 14 |
+
|
| 15 |
+
from config.set_config import set_configuration
|
| 16 |
+
set_config_project = set_configuration()
|
| 17 |
+
|
| 18 |
+
project_output_directory_path= set_config_project.project_output_directory_path
|
| 19 |
+
project_path = set_config_project.project_path
|
| 20 |
+
|
| 21 |
+
from src.table_processing.table_filter import filtering_table_pipeline
|
| 22 |
+
# from src.qa_correction.user_action_modification import process_actions_and_create_new_file
|
| 23 |
+
from src.table_processing.tree_structured_json import tree_structured_headers_pipeline,tree_structured_headers_content_pipeline
|
| 24 |
+
from src.pre_processing.create_searchable_pdf_old import create_json_pdfminer_pipeline
|
| 25 |
+
|
| 26 |
+
from src.post_processing.clean_dataframe import clean_dataframe
|
| 27 |
+
from src.table_processing.merge_headers_tree_structure import merge_blocks
|
| 28 |
+
|
| 29 |
+
from src.table_processing.create_and_put_table_header import main_pipeline_create_put_table_headers
|
| 30 |
+
from src.table_processing.map_table_with_table_header import map_table_with_its_header
|
| 31 |
+
|
| 32 |
+
# from src.table_processing.table_merge import merge_multi_page_tables_pipeline
|
| 33 |
+
|
| 34 |
+
# from other_code.save_classified_pdf_json_to_excel import create_directories_and_sheets
|
| 35 |
+
|
| 36 |
+
# from src.table_extraction_from_word_csv.word_extraction import main_table_extraction_from_docx
|
| 37 |
+
# from src.table_extraction_from_word_csv.xlsx_extraction import extract_and_save_tables_from_excel
|
| 38 |
+
# from src.table_extraction_from_word_csv.csv_extraction import extract_and_save_tables_from_csv
|
| 39 |
+
# from src.table_extraction_from_word_csv.classify_table_headers import process_main_classifier,get_csv_file_paths,save_classify_files, clean_filename
|
| 40 |
+
|
| 41 |
+
# from src.iqeq_modification.sorting_headers_v2 import filter_and_sort_headers
|
| 42 |
+
# from src.iqeq_modification.portfolio_summary_dynamic_classification import map_company_data
|
| 43 |
+
from src.toc_based_extraction.main_pipeline_toc_based_extraction import customised_toc_extraction_pipeline
|
| 44 |
+
from src.iqeq_modification.post_processing_iqeq import read_json,main_header_pipeline
|
| 45 |
+
from src.iqeq_modification.post_process_portfolio_company_json import process_document_company_wise
|
| 46 |
+
|
| 47 |
+
# from src.filter_pdf_pages_scope3.fuzzy_match_keywords import custom_pipeline_for_filter_keywords_pages_text_search
|
| 48 |
+
# from src.filter_pdf_pages_scope3.keywords_matching import custom_pipeline_for_filter_keywords_pages_tfidf_vector
|
| 49 |
+
# from src.filter_pdf_pages_scope3.keywords_matching_create_pdf import custom_pipeline_for_filter_keywords_pages_sentence_embedding
|
| 50 |
+
|
| 51 |
+
# from src.layout_detection.layout_detection import yolov10_layout_pipeline,get_file_name_without_extension
|
| 52 |
+
from src.layout_detection.layout_detection_docling_heron import yolov10_layout_pipeline,get_file_name_without_extension
|
| 53 |
+
|
| 54 |
+
# from src.table_merge.table_merge_v2 import merge_multi_page_tables_pipeline_v2
|
| 55 |
+
# from src.table_merge.table_merge_new import merge_multi_page_tables_pipeline_v2
|
| 56 |
+
from src.table_merge.table_merge_v5 import merge_multi_page_tables_pipeline_v2
|
| 57 |
+
from src.table_query.query_code_openai import get_query_response
|
| 58 |
+
|
| 59 |
+
from src.custom_headers.pdf_header_detector import process_pdf_for_headers
|
| 60 |
+
from src.custom_headers.consolidate_header_jsons import pipeline_for_merging_headers
|
| 61 |
+
|
| 62 |
+
from utils.utils_code import clear_directory
|
| 63 |
+
|
| 64 |
+
import logging,os
|
| 65 |
+
from logging.config import dictConfig
|
| 66 |
+
import shutil
|
| 67 |
+
import re
|
| 68 |
+
from fastapi import HTTPException, Form
|
| 69 |
+
from src.classification.column_classifier_v2 import classify_column_headers
|
| 70 |
+
from src.classification.classification import perform_classification
|
| 71 |
+
|
| 72 |
+
# --- Application bootstrap: logging, FastAPI app, and working directories ---

log_folder = "logs"
os.makedirs(log_folder, exist_ok=True)

# Configure logging
log_file_path = os.path.join(log_folder, "app.log")

# Dual-sink logging: everything at INFO+ goes both to stdout and to logs/app.log.
logging_config = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'detailed': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(pathname)s:%(lineno)d - %(message)s',
            'datefmt': '%Y-%m-%d %H:%M:%S'
        },
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'level': 'INFO',
            'formatter': 'detailed',
            'stream': 'ext://sys.stdout'
        },
        'file': {
            'class': 'logging.FileHandler',
            'level': 'INFO',
            'formatter': 'detailed',
            'filename': log_file_path,
            'mode': 'a',
        },
    },
    'loggers': {
        '': {  # root logger
            'handlers': ['console', 'file'],
            'level': 'INFO',
            'propagate': True
        },
        # Add specific loggers for libraries if needed
        'uvicorn': {
            'handlers': ['console', 'file'],
            'level': 'INFO',
            'propagate': False
        },
    }
}

# Apply the configuration
dictConfig(logging_config)

# Create the logger instance
logger = logging.getLogger(__name__)

app = FastAPI()

# NOTE(review): CORS is wide open (any origin, with credentials). Fine for an
# internal tool; lock down allow_origins before any external exposure.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Working directories under the configured project output root.
# Each *_path / *_directory pair aliases the same location.
pdf_input_path = os.path.join(project_output_directory_path, f"pdf_extraction/input")
pdf_input_directory = pdf_input_path
os.makedirs(pdf_input_directory, exist_ok=True)

pdf_output_path = os.path.join(project_output_directory_path, f"pdf_extraction/output")
output_directory = pdf_output_path
os.makedirs(output_directory, exist_ok=True)

word_input_path = os.path.join(project_output_directory_path, f"word_csv_extraction/directory/input")
word_input_directory_path = word_input_path
os.makedirs(word_input_directory_path, exist_ok=True)

word_output_path = os.path.join(project_output_directory_path, f"word_csv_extraction/directory/output")
word_output_directory_path = word_output_path
os.makedirs(word_output_directory_path, exist_ok=True)

# Per-document state keyed by PDF filename; reset on every request by
# /structured_chunking_extract. NOTE(review): module-level mutable state —
# concurrent requests will clobber each other (see endpoint below).
document_data = {}
|
| 151 |
+
|
| 152 |
+
@app.post("/structured_chunking_extract")
|
| 153 |
+
async def upload_documents(request: Request, path: str = Form()) :
|
| 154 |
+
# path = eval(f'{path}')
|
| 155 |
+
print(f'started for path: {path}')
|
| 156 |
+
base_url = str(request.base_url)
|
| 157 |
+
global document_data
|
| 158 |
+
document_data = {}
|
| 159 |
+
pdf_path = path
|
| 160 |
+
clear_directory(pdf_input_path)
|
| 161 |
+
clear_directory(pdf_output_path)
|
| 162 |
+
clear_directory(word_input_path)
|
| 163 |
+
clear_directory(word_output_path)
|
| 164 |
+
|
| 165 |
+
# Initialize response structure
|
| 166 |
+
response = {
|
| 167 |
+
"success": False,
|
| 168 |
+
"message": "",
|
| 169 |
+
# "data": None
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
# Check if the provided path is a PDF file
|
| 173 |
+
if not pdf_path.lower().endswith(".pdf"):
|
| 174 |
+
response["message"] = "Invalid file type. Only PDF files are accepted."
|
| 175 |
+
return response
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# Extract filename
|
| 179 |
+
file_name_with_ext = os.path.basename(pdf_path)
|
| 180 |
+
file_name_without_ext = os.path.splitext(file_name_with_ext)[0]
|
| 181 |
+
|
| 182 |
+
# Create destination path in input directory
|
| 183 |
+
destination_path = os.path.join(pdf_input_directory, file_name_with_ext)
|
| 184 |
+
|
| 185 |
+
try:
|
| 186 |
+
# Copy the file to our input directory
|
| 187 |
+
shutil.copy2(pdf_path, destination_path)
|
| 188 |
+
except Exception as e:
|
| 189 |
+
response["message"] = f"Failed to copy file: {str(e)}"
|
| 190 |
+
return response
|
| 191 |
+
|
| 192 |
+
output_directory_path = os.path.join(output_directory)
|
| 193 |
+
os.makedirs(output_directory_path, exist_ok=True)
|
| 194 |
+
file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
|
| 195 |
+
os.makedirs(file_output, exist_ok=True)
|
| 196 |
+
|
| 197 |
+
table_output_path = os.path.join(file_output, f"table_output")
|
| 198 |
+
os.makedirs(table_output_path, exist_ok=True)
|
| 199 |
+
file_location = destination_path
|
| 200 |
+
|
| 201 |
+
# Pipeline processing
|
| 202 |
+
|
| 203 |
+
json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path,cropped_tables_images_dir_path,_ = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
|
| 204 |
+
table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)
|
| 205 |
+
|
| 206 |
+
custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output)
|
| 207 |
+
header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext)
|
| 208 |
+
|
| 209 |
+
# Initialize data for the new document
|
| 210 |
+
document_data[file_name_with_ext] = {
|
| 211 |
+
|
| 212 |
+
"pdf_path": destination_path,
|
| 213 |
+
"pdf_file_name": file_name_with_ext,
|
| 214 |
+
"model_json_header_output_filepath": [],
|
| 215 |
+
"model_json_layout_output_filepath": [],
|
| 216 |
+
"tree_structured_header_json_filepath": [],
|
| 217 |
+
"user_modified_json_output_filepath": [],
|
| 218 |
+
'user_modified_table_json_filepath': [],
|
| 219 |
+
"frontend_output_json": [],
|
| 220 |
+
"cluster_json": [],
|
| 221 |
+
"id_2_label" : [],
|
| 222 |
+
"file_output_dir" : [],
|
| 223 |
+
"table_output_dir": [],
|
| 224 |
+
"table_with_header_data" : [],
|
| 225 |
+
"table_with_header_json_path" : [],
|
| 226 |
+
"json_output_dir": [],
|
| 227 |
+
"pdf_miner_json_path": [] ,
|
| 228 |
+
"searchable_pdf_path" : []
|
| 229 |
+
}
|
| 230 |
+
|
| 231 |
+
# Store paths and filenames
|
| 232 |
+
document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
|
| 233 |
+
document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
|
| 234 |
+
document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
|
| 235 |
+
document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
|
| 236 |
+
document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
|
| 237 |
+
document_data[file_name_with_ext]["file_output_dir"].append(file_output)
|
| 238 |
+
document_data[file_name_with_ext]["id_2_label"].append(class_names)
|
| 239 |
+
document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
|
| 240 |
+
document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
|
| 241 |
+
document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
|
| 242 |
+
document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)
|
| 243 |
+
|
| 244 |
+
file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
|
| 245 |
+
pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
|
| 246 |
+
pdf_path = document_data[file_name_with_ext]["pdf_path"]
|
| 247 |
+
user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]
|
| 248 |
+
|
| 249 |
+
pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)
|
| 250 |
+
|
| 251 |
+
table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
|
| 252 |
+
document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)
|
| 253 |
+
|
| 254 |
+
document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
|
| 255 |
+
|
| 256 |
+
# Process image URLs
|
| 257 |
+
pdf_images_urls = []
|
| 258 |
+
for file_name in os.listdir(pdf_images_path):
|
| 259 |
+
file_path = os.path.join(pdf_images_path, file_name)
|
| 260 |
+
if file_name.endswith((".jpg", ".jpeg", ".png")):
|
| 261 |
+
img_url = base_url + "image/" + str(quote(file_path))
|
| 262 |
+
pdf_images_urls.append(img_url)
|
| 263 |
+
|
| 264 |
+
# Sort image URLs by page number
|
| 265 |
+
def extract_page_no(url):
|
| 266 |
+
return int(url.split("_")[-1].split(".")[0])
|
| 267 |
+
sorted_urls = sorted(pdf_images_urls, key=extract_page_no)
|
| 268 |
+
|
| 269 |
+
# Create page details
|
| 270 |
+
page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
|
| 271 |
+
|
| 272 |
+
# Store the JSON output
|
| 273 |
+
document_data[file_name_with_ext]["frontend_output_json"].append({
|
| 274 |
+
"layout_output_json_data": layout_output_json_data,
|
| 275 |
+
"layout_json_list_data": layout_list_data,
|
| 276 |
+
"id_2_label": class_names,
|
| 277 |
+
"header_output_json_data": header_output_json_data,
|
| 278 |
+
"table_output_json_data": table_json_data,
|
| 279 |
+
"table_output_json_data_list": table_json_data_list,
|
| 280 |
+
"tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
|
| 281 |
+
"pdf_images_urls": page_details,
|
| 282 |
+
})
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
document_id_name = file_name_with_ext
|
| 286 |
+
|
| 287 |
+
data = document_data[document_id_name]
|
| 288 |
+
file_output_dir = data["file_output_dir"][0]
|
| 289 |
+
json_output_dir = data["json_output_dir"][0]
|
| 290 |
+
pdf_file_name = data["pdf_file_name"]
|
| 291 |
+
pdf_path = data["pdf_path"]
|
| 292 |
+
|
| 293 |
+
# PDFMiner processing
|
| 294 |
+
pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
|
| 295 |
+
modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
|
| 296 |
+
model_modified_json = read_json(modified_json_output_filepath)
|
| 297 |
+
pdfminer_json = read_json(pdf_miner_json_filepath)
|
| 298 |
+
searchable_pdf_path = data["searchable_pdf_path"][0]
|
| 299 |
+
|
| 300 |
+
# table_merged_json_path = merge_multi_page_tables_pipeline(modified_json_output_filepath, pdf_file_name, file_output_dir)
|
| 301 |
+
|
| 302 |
+
table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
|
| 303 |
+
|
| 304 |
+
table_merged_json = read_json(table_merged_json_path)
|
| 305 |
+
|
| 306 |
+
table_mapped_modified_json = map_table_with_its_header(table_merged_json)
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
# table_mapped_modified_json = map_table_with_its_header(model_modified_json)
|
| 310 |
+
|
| 311 |
+
# Main header pipeline
|
| 312 |
+
df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
clean_df, clean_df_json = clean_dataframe(df_final)
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
if isinstance(clean_df_json, str):
|
| 319 |
+
# print("clean_df_json::",clean_df_json)
|
| 320 |
+
# clean_df_json = eval(clean_df_json)
|
| 321 |
+
clean_df_json = json.loads(clean_df_json)
|
| 322 |
+
|
| 323 |
+
file_name = get_file_name_without_extension(pdf_file_name)
|
| 324 |
+
merged_content_company_wise_df = process_document_company_wise(clean_df_json,output_directory=json_output_dir,file_name=file_name,table_output_directory=cropped_tables_images_dir_path)
|
| 325 |
+
company_wise_clean_df, company_wise_clean_df_json = clean_dataframe(merged_content_company_wise_df)
|
| 326 |
+
|
| 327 |
+
json_output_filename = file_name + "_final_h2h_extraction.json"
|
| 328 |
+
final_json_output_filepath = os.path.join(json_output_dir, json_output_filename)
|
| 329 |
+
|
| 330 |
+
with open(final_json_output_filepath, 'w') as f:
|
| 331 |
+
json.dump(clean_df_json, f, indent=4)
|
| 332 |
+
|
| 333 |
+
company_wise_json_output_filename = file_name + "_final_h2h_extraction_company_wise.json"
|
| 334 |
+
company_wise_final_json_output_filepath = os.path.join(json_output_dir, company_wise_json_output_filename)
|
| 335 |
+
|
| 336 |
+
with open(company_wise_final_json_output_filepath, 'w') as f:
|
| 337 |
+
json.dump(merged_content_company_wise_df, f, indent=4)
|
| 338 |
+
|
| 339 |
+
# Tree-structured header content
|
| 340 |
+
# final_tree_structred_header_content = tree_structured_headers_content_pipeline(header_content_json_data, json_output_dir, pdf_file_name)
|
| 341 |
+
|
| 342 |
+
# final_tree_structred_header_content = merge_blocks(final_tree_structred_header_content)
|
| 343 |
+
|
| 344 |
+
# document_data[document_id_name]["cluster_json"].append(final_tree_structred_header_content)
|
| 345 |
+
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
# final_toc_h2h_extraction = customised_toc_extraction_pipeline(pdf_path=searchable_pdf_path,yolo_detection_json_path=modified_json_output_filepath,output_directory=file_output_dir)
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
# Step 1: Extract directory and filename without extension
|
| 352 |
+
pdf_path = path
|
| 353 |
+
json_directory = os.path.dirname(pdf_path)
|
| 354 |
+
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
|
| 355 |
+
|
| 356 |
+
# Step 2: Define output path for JSON
|
| 357 |
+
output_json_path = os.path.join(json_directory, f"{json_filename }.json")
|
| 358 |
+
|
| 359 |
+
# If your variable is a JSON string, convert it to dict first
|
| 360 |
+
if isinstance(company_wise_clean_df_json, str):
|
| 361 |
+
company_wise_clean_df_json = json.loads(company_wise_clean_df_json)
|
| 362 |
+
|
| 363 |
+
|
| 364 |
+
# Step 3: Save JSON
|
| 365 |
+
with open(output_json_path, 'w', encoding='utf-8') as json_file:
|
| 366 |
+
json.dump(company_wise_clean_df_json, json_file, ensure_ascii=False, indent=4)
|
| 367 |
+
|
| 368 |
+
|
| 369 |
+
# # post-processing results
|
| 370 |
+
# post_processing_results = {
|
| 371 |
+
# document_id_name : {
|
| 372 |
+
# # "df_download": json.dumps(clean_df.to_csv(index=False, escapechar='\\', encoding='utf-8')),
|
| 373 |
+
# # "df_download_json" : clean_df_json,
|
| 374 |
+
# "df_download_json": company_wise_clean_df_json,
|
| 375 |
+
# "tree_structured_header_content": final_tree_structred_header_content,
|
| 376 |
+
# "file_name": document_id_name,
|
| 377 |
+
# # "classified_dynamic_json": dynamic_mapped_data_json,
|
| 378 |
+
# "toc_df_download_json" : final_toc_h2h_extraction
|
| 379 |
+
# }
|
| 380 |
+
# }
|
| 381 |
+
|
| 382 |
+
response_final = {
|
| 383 |
+
"status_code": 200,
|
| 384 |
+
# "message":"",
|
| 385 |
+
# "df_download_json": company_wise_clean_df_json,
|
| 386 |
+
"saved_json_path": output_json_path
|
| 387 |
+
}
|
| 388 |
+
|
| 389 |
+
return response_final
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
@app.get("/image/{path:path}")
|
| 393 |
+
async def get_image(path: str):
|
| 394 |
+
if os.path.exists(path):
|
| 395 |
+
return FileResponse(path, media_type="image/jpeg")
|
| 396 |
+
else:
|
| 397 |
+
raise HTTPException(status_code=404, detail="Image not found")
|
| 398 |
+
|
| 399 |
+
@app.get("/file/{path:path}")
|
| 400 |
+
async def get_file(path: str):
|
| 401 |
+
if os.path.exists(path):
|
| 402 |
+
paths = path.split("/")
|
| 403 |
+
filename = paths[len(paths) - 1]
|
| 404 |
+
if path.endswith('.csv'):
|
| 405 |
+
media_type = "text/csv"
|
| 406 |
+
elif path.endswith('.xlsx'):
|
| 407 |
+
media_type = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
| 408 |
+
else:
|
| 409 |
+
media_type = "application/octet-stream"
|
| 410 |
+
return FileResponse(path, media_type=media_type,filename =filename)
|
| 411 |
+
else:
|
| 412 |
+
raise HTTPException(status_code=404, detail="File not found")
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
@app.post("/table-classification")
|
| 416 |
+
async def table_classification(
|
| 417 |
+
structured_chunk_json_path: str = Form(...),
|
| 418 |
+
class_keywords_table: str = Form(...),
|
| 419 |
+
header_categories: Optional[str] = Form("table_column_header"),
|
| 420 |
+
similarity_threshold: Optional[float] = Form(0.4)
|
| 421 |
+
):
|
| 422 |
+
|
| 423 |
+
try:
|
| 424 |
+
|
| 425 |
+
with open(structured_chunk_json_path, 'r', encoding='utf-8') as file:
|
| 426 |
+
content = file.read()
|
| 427 |
+
|
| 428 |
+
# This regex removes commas before closing braces/brackets, ignoring whitespace
|
| 429 |
+
cleaned_content = re.sub(r',\s*([\]}])', r'\1', content)
|
| 430 |
+
|
| 431 |
+
# Parse the cleaned JSON
|
| 432 |
+
structured_chunk_data = json.loads(cleaned_content)
|
| 433 |
+
|
| 434 |
+
# If class_keywords is a string, try to parse it
|
| 435 |
+
if isinstance(class_keywords_table, str):
|
| 436 |
+
try:
|
| 437 |
+
class_keywords_table = json.loads(class_keywords_table)
|
| 438 |
+
|
| 439 |
+
if not isinstance(class_keywords_table, dict):
|
| 440 |
+
raise ValueError("class_keywords_table must be a dictionary")
|
| 441 |
+
if not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
|
| 442 |
+
for key, value in class_keywords_table.items()):
|
| 443 |
+
raise ValueError("class_keywords_table must be a dictionary with string keys and lists of strings as values")
|
| 444 |
+
except json.JSONDecodeError:
|
| 445 |
+
raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table"})
|
| 446 |
+
|
| 447 |
+
elif not isinstance(class_keywords_table, dict) or not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
|
| 448 |
+
for key, value in class_keywords_table.items()):
|
| 449 |
+
raise HTTPException(status_code=422, detail={"error": "class_keywords_table must be a dictionary with string keys and lists of strings as values"})
|
| 450 |
+
|
| 451 |
+
|
| 452 |
+
# Perform classification
|
| 453 |
+
categorized_headers = perform_classification(
|
| 454 |
+
data=structured_chunk_data,
|
| 455 |
+
class_keywords=class_keywords_table,
|
| 456 |
+
header_categories=header_categories,
|
| 457 |
+
similarity_threshold=similarity_threshold
|
| 458 |
+
)
|
| 459 |
+
return categorized_headers
|
| 460 |
+
except ValueError as e:
|
| 461 |
+
raise HTTPException(status_code=422, detail={"error": "Input validation failed", "message": str(e)})
|
| 462 |
+
except Exception as e:
|
| 463 |
+
raise HTTPException(status_code=422, detail={"error": "Processing failed", "message": str(e)})
|
| 464 |
+
|
| 465 |
+
|
| 466 |
+
@app.post("/table-column-classification")
|
| 467 |
+
async def table_column_classification(
|
| 468 |
+
input_table_classified_json: Annotated[str, Form()],
|
| 469 |
+
class_keywords_table_column: Annotated[str, Form()],
|
| 470 |
+
filter_table_classifier_name: Annotated[str, Form()],
|
| 471 |
+
similarity_threshold: Annotated[str, Form()]
|
| 472 |
+
):
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
try:
|
| 476 |
+
# Parse JSON strings into dictionaries
|
| 477 |
+
input_table_classified_json = json.loads(input_table_classified_json)
|
| 478 |
+
class_keywords_table_column = json.loads(class_keywords_table_column)
|
| 479 |
+
except json.JSONDecodeError as e:
|
| 480 |
+
raise HTTPException(status_code=422, detail={"error": "Invalid JSON format", "message": str(e)})
|
| 481 |
+
|
| 482 |
+
try:
|
| 483 |
+
# Convert similarity_threshold to integer
|
| 484 |
+
similarity_threshold = float(similarity_threshold)
|
| 485 |
+
except ValueError as e:
|
| 486 |
+
raise HTTPException(status_code=422, detail={"error": "Similarity threshold must be a valid integer", "message": str(e)})
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
column_classification_results = classify_column_headers(
|
| 490 |
+
json_data=input_table_classified_json,
|
| 491 |
+
class_keywords=class_keywords_table_column,
|
| 492 |
+
filter_table_classifier_name=filter_table_classifier_name,
|
| 493 |
+
similarity_threshold=similarity_threshold
|
| 494 |
+
)
|
| 495 |
+
|
| 496 |
+
results = {"column_classification_result": column_classification_results}
|
| 497 |
+
|
| 498 |
+
return results
|
| 499 |
+
|
| 500 |
+
|
| 501 |
+
# Run the server
# NOTE(review): this __main__ guard sits *above* later definitions in the file
# (e.g. upload_documents below). uvicorn.run() blocks, so when the module is
# executed directly, everything defined after this point does not exist until
# the server shuts down. Confirm this ordering is intentional — convention is
# to place the guard at the end of the module.
if __name__ == "__main__":
    # uvicorn.run("app:app", host="0.0.0.0", port=7061, log_level="info", reload=True)
    # Binds on all interfaces, port 7063, standard info-level logging.
    uvicorn.run( app, host="0.0.0.0", port=7063,log_level="info")
    # uvicorn.run( app, host="0.0.0.0", port=5052,log_level="info")
|
| 506 |
+
|
| 507 |
+
|
| 508 |
+
|
| 509 |
+
def upload_documents(path: str) :
    """Run the full document-processing pipeline for a single PDF.

    Copies the PDF into the working input directory, runs layout detection,
    table filtering, header extraction/merging, PDFMiner text extraction,
    multi-page table merging, header-to-header extraction, and company-wise
    grouping, writing several intermediate and final JSON files along the way.
    The global ``document_data`` dict is reset and repopulated for this file.

    Args:
        path: Filesystem path to the input PDF.

    Returns:
        dict: On success, ``{"status_code": 200, "saved_json_path": ...}``
        pointing at the company-wise JSON written next to the input PDF.
        On early validation/copy failure, ``{"success": False, "message": ...}``.
        NOTE(review): the two shapes are inconsistent — callers must handle both.
    """
    # path = eval(f'{path}')
    # NOTE(review): this binds the *class* `Request`, not an instance — there is
    # no incoming request in this plain function. `str(request.base_url)` below
    # will not be a usable URL; the image URLs built from it need verification.
    request = Request
    print(f'started for path: {path}')
    base_url = str(request.base_url)
    # Pipeline state is kept in a module-level global, reset per call —
    # not safe for concurrent invocations.
    global document_data
    document_data = {}
    pdf_path = path

    # Wipe previous runs' inputs/outputs before processing this file.
    clear_directory(pdf_input_path)
    clear_directory(pdf_output_path)
    clear_directory(word_input_path)
    clear_directory(word_output_path)

    # Initialize response structure
    response = {
        "success": False,
        "message": "",
        # "data": None
    }

    if not pdf_path:
        response["message"] = "No file path provided."
        return response

    # Check if the provided path is a PDF file
    if not pdf_path.lower().endswith(".pdf"):
        response["message"] = "Invalid file type. Only PDF files are accepted."
        return response

    # Extract filename
    file_name_with_ext = os.path.basename(pdf_path)
    file_name_without_ext = os.path.splitext(file_name_with_ext)[0]

    # Create destination path in input directory
    destination_path = os.path.join(pdf_input_directory, file_name_with_ext)

    try:
        # Copy the file to our input directory
        shutil.copy2(pdf_path, destination_path)
    except Exception as e:
        response["message"] = f"Failed to copy file: {str(e)}"
        return response

    # Per-file output tree: <output_directory>/<name>_output/table_output
    output_directory_path = os.path.join(output_directory)
    os.makedirs(output_directory_path, exist_ok=True)
    file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
    os.makedirs(file_output, exist_ok=True)

    table_output_path = os.path.join(file_output, f"table_output")
    os.makedirs(table_output_path, exist_ok=True)
    file_location = destination_path

    # Pipeline processing
    # Layout detection (YOLOv10): returns JSON dirs/paths, class labels,
    # header data, page images, and cropped table image locations.
    json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path,cropped_tables_images_dir_path = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
    table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)

    # Custom header extraction is merged into the model's header JSON;
    # note header_json_output_filepath is rebound to the merged file here.
    custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output)
    header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext)

    # Initialize data for the new document
    document_data[file_name_with_ext] = {

        "pdf_path": destination_path,
        "pdf_file_name": file_name_with_ext,
        "model_json_header_output_filepath": [],
        "model_json_layout_output_filepath": [],
        "tree_structured_header_json_filepath": [],
        "user_modified_json_output_filepath": [],
        'user_modified_table_json_filepath': [],
        "frontend_output_json": [],
        "cluster_json": [],
        "id_2_label" : [],
        "file_output_dir" : [],
        "table_output_dir": [],
        "table_with_header_data" : [],
        "table_with_header_json_path" : [],
        "json_output_dir": [],
        "pdf_miner_json_path": [] ,
        "searchable_pdf_path" : []
    }

    # Store paths and filenames
    # (the merged header JSON doubles as the initial "user modified" JSON)
    document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
    document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
    document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
    document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
    document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
    document_data[file_name_with_ext]["file_output_dir"].append(file_output)
    document_data[file_name_with_ext]["id_2_label"].append(class_names)
    document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
    document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
    document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
    document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)

    file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
    pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
    pdf_path = document_data[file_name_with_ext]["pdf_path"]
    user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]

    # Text extraction: also produces a searchable (OCR'd?) PDF — TODO confirm.
    pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)

    table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
    document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)

    document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)

    # Process image URLs
    # NOTE(review): base_url is derived from the Request class above, so these
    # URLs are suspect — verify against a real request context.
    pdf_images_urls = []
    for file_name in os.listdir(pdf_images_path):
        file_path = os.path.join(pdf_images_path, file_name)
        if file_name.endswith((".jpg", ".jpeg", ".png")):
            img_url = base_url + "image/" + str(quote(file_path))
            pdf_images_urls.append(img_url)

    # Sort image URLs by page number
    # (page number is assumed to be the trailing "_<n>.<ext>" token — TODO confirm)
    def extract_page_no(url):
        return int(url.split("_")[-1].split(".")[0])
    sorted_urls = sorted(pdf_images_urls, key=extract_page_no)

    # Create page details
    page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]

    # Store the JSON output
    document_data[file_name_with_ext]["frontend_output_json"].append({
        "layout_output_json_data": layout_output_json_data,
        "layout_json_list_data": layout_list_data,
        "id_2_label": class_names,
        "header_output_json_data": header_output_json_data,
        "table_output_json_data": table_json_data,
        "table_output_json_data_list": table_json_data_list,
        "tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
        "pdf_images_urls": page_details,
    })

    document_id_name = file_name_with_ext

    # Re-read the state just stored above (single-element lists).
    data = document_data[document_id_name]
    file_output_dir = data["file_output_dir"][0]
    json_output_dir = data["json_output_dir"][0]
    pdf_file_name = data["pdf_file_name"]
    pdf_path = data["pdf_path"]

    # PDFMiner processing
    pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
    modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
    model_modified_json = read_json(modified_json_output_filepath)
    pdfminer_json = read_json(pdf_miner_json_filepath)
    searchable_pdf_path = data["searchable_pdf_path"][0]

    # table_merged_json_path = merge_multi_page_tables_pipeline(modified_json_output_filepath, pdf_file_name, file_output_dir)

    # Merge tables that span multiple pages, then map each table to its header.
    table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)

    table_merged_json = read_json(table_merged_json_path)

    table_mapped_modified_json = map_table_with_its_header(table_merged_json)


    # table_mapped_modified_json = map_table_with_its_header(model_modified_json)

    # Main header pipeline
    df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)


    clean_df, clean_df_json = clean_dataframe(df_final)


    # clean_dataframe may return its JSON form as a string; normalize to dict.
    if isinstance(clean_df_json, str):
        clean_df_json = json.loads(clean_df_json)

    file_name = get_file_name_without_extension(pdf_file_name)
    # Group the extracted content by (portfolio) company.
    merged_content_company_wise_df = process_document_company_wise(clean_df_json,output_directory=json_output_dir,file_name=file_name,table_output_directory=cropped_tables_images_dir_path)
    company_wise_clean_df, company_wise_clean_df_json = clean_dataframe(merged_content_company_wise_df)

    json_output_filename = file_name + "_final_h2h_extraction.json"
    final_json_output_filepath = os.path.join(json_output_dir, json_output_filename)

    with open(final_json_output_filepath, 'w') as f:
        json.dump(clean_df_json, f, indent=4)

    company_wise_json_output_filename = file_name + "_final_h2h_extraction_company_wise.json"
    company_wise_final_json_output_filepath = os.path.join(json_output_dir, company_wise_json_output_filename)

    with open(company_wise_final_json_output_filepath, 'w') as f:
        json.dump(merged_content_company_wise_df, f, indent=4)

    # Tree-structured header content
    # final_tree_structred_header_content = tree_structured_headers_content_pipeline(header_content_json_data, json_output_dir, pdf_file_name)

    # final_tree_structred_header_content = merge_blocks(final_tree_structred_header_content)

    # document_data[document_id_name]["cluster_json"].append(final_tree_structred_header_content)



    # final_toc_h2h_extraction = customised_toc_extraction_pipeline(pdf_path=searchable_pdf_path,yolo_detection_json_path=modified_json_output_filepath,output_directory=file_output_dir)


    # Step 1: Extract directory and filename without extension
    # (the final company-wise JSON is written NEXT TO the original input PDF,
    # not into the pipeline output tree)
    pdf_path = path
    json_directory = os.path.dirname(pdf_path)
    json_filename = os.path.splitext(os.path.basename(pdf_path))[0]

    # Step 2: Define output path for JSON
    output_json_path = os.path.join(json_directory, f"{json_filename }.json")

    # If your variable is a JSON string, convert it to dict first
    if isinstance(company_wise_clean_df_json, str):
        company_wise_clean_df_json = json.loads(company_wise_clean_df_json)


    # Step 3: Save JSON
    with open(output_json_path, 'w', encoding='utf-8') as json_file:
        json.dump(company_wise_clean_df_json, json_file, ensure_ascii=False, indent=4)


    # # post-processing results
    # post_processing_results = {
    #     document_id_name : {
    #         # "df_download": json.dumps(clean_df.to_csv(index=False, escapechar='\\', encoding='utf-8')),
    #         # "df_download_json" : clean_df_json,
    #         "df_download_json": company_wise_clean_df_json,
    #         "tree_structured_header_content": final_tree_structred_header_content,
    #         "file_name": document_id_name,
    #         # "classified_dynamic_json": dynamic_mapped_data_json,
    #         "toc_df_download_json" : final_toc_h2h_extraction
    #     }
    # }

    response_final = {
        "status_code": 200,
        # "message":"",
        # "df_download_json": company_wise_clean_df_json,
        "saved_json_path": output_json_path
    }

    return response_final
|
| 755 |
+
|
| 756 |
+
|
| 757 |
+
|
| 758 |
+
# def upload_documents(path):
|
| 759 |
+
# logger.info(f"Starting upload_documents for path: {path}")
|
| 760 |
+
|
| 761 |
+
# request = Request
|
| 762 |
+
# base_url = str(request.base_url)
|
| 763 |
+
# global document_data
|
| 764 |
+
# document_data = {}
|
| 765 |
+
# pdf_path = path
|
| 766 |
+
|
| 767 |
+
# # Log directory clearing
|
| 768 |
+
# logger.info("Clearing input and output directories")
|
| 769 |
+
# clear_directory(pdf_input_path)
|
| 770 |
+
# clear_directory(pdf_output_path)
|
| 771 |
+
# clear_directory(word_input_path)
|
| 772 |
+
# clear_directory(word_output_path)
|
| 773 |
+
|
| 774 |
+
# # Initialize response structure
|
| 775 |
+
# response = {
|
| 776 |
+
# "success": False,
|
| 777 |
+
# "message": "",
|
| 778 |
+
# }
|
| 779 |
+
|
| 780 |
+
# # Check if the provided path is a PDF file
|
| 781 |
+
# if not pdf_path.lower().endswith(".pdf"):
|
| 782 |
+
# logger.error(f"Invalid file type for path: {pdf_path}. Only PDF files are accepted.")
|
| 783 |
+
# response["message"] = "Invalid file type. Only PDF files are accepted."
|
| 784 |
+
# return response
|
| 785 |
+
|
| 786 |
+
# # Extract filename
|
| 787 |
+
# file_name_with_ext = os.path.basename(pdf_path)
|
| 788 |
+
# file_name_without_ext = os.path.splitext(file_name_with_ext)[0]
|
| 789 |
+
# logger.debug(f"Extracted filename: {file_name_with_ext} (without extension: {file_name_without_ext})")
|
| 790 |
+
|
| 791 |
+
# # Create destination path in input directory
|
| 792 |
+
# destination_path = os.path.join(pdf_input_directory, file_name_with_ext)
|
| 793 |
+
# logger.debug(f"Destination path for file copy: {destination_path}")
|
| 794 |
+
|
| 795 |
+
# # Copy file to input directory
|
| 796 |
+
# try:
|
| 797 |
+
# logger.info(f"Copying file from {pdf_path} to {destination_path}")
|
| 798 |
+
# shutil.copy2(pdf_path, destination_path)
|
| 799 |
+
# except Exception as e:
|
| 800 |
+
# logger.error(f"Failed to copy file from {pdf_path} to {destination_path}: {str(e)}")
|
| 801 |
+
# response["message"] = f"Failed to copy file: {str(e)}"
|
| 802 |
+
# return response
|
| 803 |
+
|
| 804 |
+
# # Create output directories
|
| 805 |
+
# output_directory_path = os.path.join(output_directory)
|
| 806 |
+
# os.makedirs(output_directory_path, exist_ok=True)
|
| 807 |
+
# file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
|
| 808 |
+
# os.makedirs(file_output, exist_ok=True)
|
| 809 |
+
# table_output_path = os.path.join(file_output, f"table_output")
|
| 810 |
+
# os.makedirs(table_output_path, exist_ok=True)
|
| 811 |
+
# file_location = destination_path
|
| 812 |
+
# logger.info(f"Created output directories: {file_output}, {table_output_path}")
|
| 813 |
+
|
| 814 |
+
# # Pipeline processing
|
| 815 |
+
# logger.info(f"Starting yolov10_layout_pipeline for {file_name_without_ext}")
|
| 816 |
+
# try:
|
| 817 |
+
# json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
|
| 818 |
+
# logger.info(f"yolov10_layout_pipeline completed. Output JSON dir: {json_output_dir}")
|
| 819 |
+
# except Exception as e:
|
| 820 |
+
# logger.error(f"yolov10_layout_pipeline failed: {str(e)}")
|
| 821 |
+
# response["message"] = f"yolov10_layout_pipeline failed: {str(e)}"
|
| 822 |
+
# return response
|
| 823 |
+
|
| 824 |
+
# logger.info(f"Starting filtering_table_pipeline for {file_name_without_ext}")
|
| 825 |
+
# try:
|
| 826 |
+
# table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)
|
| 827 |
+
# logger.info(f"filtering_table_pipeline completed. Table JSON path: {table_json_path}")
|
| 828 |
+
# except Exception as e:
|
| 829 |
+
# logger.error(f"filtering_table_pipeline failed: {str(e)}")
|
| 830 |
+
# response["message"] = f"filtering_table_pipeline failed: {str(e)}"
|
| 831 |
+
# return response
|
| 832 |
+
|
| 833 |
+
# logger.info(f"Starting process_pdf_for_headers for {file_name_without_ext}")
|
| 834 |
+
# try:
|
| 835 |
+
# custom_headers_json, custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext, file_location, file_output)
|
| 836 |
+
# logger.info(f"process_pdf_for_headers completed. Custom headers JSON path: {custom_headers_json_file_path}")
|
| 837 |
+
# except Exception as e:
|
| 838 |
+
# logger.error(f"process_pdf_for_headers failed: {str(e)}")
|
| 839 |
+
# response["message"] = f"process_pdf_for_headers failed: {str(e)}"
|
| 840 |
+
# return response
|
| 841 |
+
|
| 842 |
+
# logger.info(f"Starting pipeline_for_merging_headers for {file_name_without_ext}")
|
| 843 |
+
# try:
|
| 844 |
+
# header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path, header_json_output_filepath, file_output, file_name_without_ext)
|
| 845 |
+
# logger.info(f"pipeline_for_merging_headers completed. Merged headers JSON path: {header_json_output_filepath}")
|
| 846 |
+
# except Exception as e:
|
| 847 |
+
# logger.error(f"pipeline_for_merging_headers failed: {str(e)}")
|
| 848 |
+
# response["message"] = f"pipeline_for_merging_headers failed: {str(e)}"
|
| 849 |
+
# return response
|
| 850 |
+
|
| 851 |
+
# # Initialize document_data
|
| 852 |
+
# logger.debug(f"Initializing document_data for {file_name_with_ext}")
|
| 853 |
+
# document_data[file_name_with_ext] = {
|
| 854 |
+
# "pdf_path": destination_path,
|
| 855 |
+
# "pdf_file_name": file_name_with_ext,
|
| 856 |
+
# "model_json_header_output_filepath": [],
|
| 857 |
+
# "model_json_layout_output_filepath": [],
|
| 858 |
+
# "tree_structured_header_json_filepath": [],
|
| 859 |
+
# "user_modified_json_output_filepath": [],
|
| 860 |
+
# "user_modified_table_json_filepath": [],
|
| 861 |
+
# "frontend_output_json": [],
|
| 862 |
+
# "cluster_json": [],
|
| 863 |
+
# "id_2_label": [],
|
| 864 |
+
# "file_output_dir": [],
|
| 865 |
+
# "table_output_dir": [],
|
| 866 |
+
# "table_with_header_data": [],
|
| 867 |
+
# "table_with_header_json_path": [],
|
| 868 |
+
# "json_output_dir": [],
|
| 869 |
+
# "pdf_miner_json_path": [],
|
| 870 |
+
# "searchable_pdf_path": []
|
| 871 |
+
# }
|
| 872 |
+
|
| 873 |
+
# # Store paths and filenames
|
| 874 |
+
# document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
|
| 875 |
+
# document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
|
| 876 |
+
# document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
|
| 877 |
+
# document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
|
| 878 |
+
# document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
|
| 879 |
+
# document_data[file_name_with_ext]["file_output_dir"].append(file_output)
|
| 880 |
+
# document_data[file_name_with_ext]["id_2_label"].append(class_names)
|
| 881 |
+
# document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
|
| 882 |
+
# document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
|
| 883 |
+
# document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
|
| 884 |
+
# document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)
|
| 885 |
+
# logger.debug(f"Stored paths and filenames in document_data for {file_name_with_ext}")
|
| 886 |
+
|
| 887 |
+
# file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
|
| 888 |
+
# pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
|
| 889 |
+
# pdf_path = document_data[file_name_with_ext]["pdf_path"]
|
| 890 |
+
# user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]
|
| 891 |
+
|
| 892 |
+
# logger.info(f"Starting create_json_pdfminer_pipeline for {pdf_file_name}")
|
| 893 |
+
# try:
|
| 894 |
+
# pdf_miner_json_filepath, pdf_miner_metadata, searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)
|
| 895 |
+
# logger.info(f"create_json_pdfminer_pipeline completed. PDFMiner JSON path: {pdf_miner_json_filepath}, Searchable PDF path: {searchable_pdf_path}")
|
| 896 |
+
# except Exception as e:
|
| 897 |
+
# logger.error(f"create_json_pdfminer_pipeline failed: {str(e)}")
|
| 898 |
+
# response["message"] = f"create_json_pdfminer_pipeline failed: {str(e)}"
|
| 899 |
+
# return response
|
| 900 |
+
|
| 901 |
+
# logger.info(f"Starting main_pipeline_create_put_table_headers for {file_name_with_ext}")
|
| 902 |
+
# try:
|
| 903 |
+
# table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
|
| 904 |
+
# logger.info(f"main_pipeline_create_put_table_headers completed")
|
| 905 |
+
# except Exception as e:
|
| 906 |
+
# logger.error(f"main_pipeline_create_put_table_headers failed: {str(e)}")
|
| 907 |
+
# response["message"] = f"main_pipeline_create_put_table_headers failed: {str(e)}"
|
| 908 |
+
# return response
|
| 909 |
+
|
| 910 |
+
# document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)
|
| 911 |
+
# document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
|
| 912 |
+
|
| 913 |
+
# # Process image URLs
|
| 914 |
+
# pdf_images_urls = []
|
| 915 |
+
# for file_name in os.listdir(pdf_images_path):
|
| 916 |
+
# file_path = os.path.join(pdf_images_path, file_name)
|
| 917 |
+
# if file_name.endswith((".jpg", ".jpeg", ".png")):
|
| 918 |
+
# img_url = base_url + "image/" + str(quote(file_path))
|
| 919 |
+
# pdf_images_urls.append(img_url)
|
| 920 |
+
# logger.debug(f"Collected {len(pdf_images_urls)} image URLs from {pdf_images_path}")
|
| 921 |
+
|
| 922 |
+
# # Sort image URLs by page number
|
| 923 |
+
# def extract_page_no(url):
|
| 924 |
+
# return int(url.split("_")[-1].split(".")[0])
|
| 925 |
+
# sorted_urls = sorted(pdf_images_urls, key=extract_page_no)
|
| 926 |
+
# page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
|
| 927 |
+
# logger.debug(f"Sorted {len(sorted_urls)} image URLs and created page details")
|
| 928 |
+
|
| 929 |
+
# # Store the JSON output
|
| 930 |
+
# document_data[file_name_with_ext]["frontend_output_json"].append({
|
| 931 |
+
# "layout_output_json_data": layout_output_json_data,
|
| 932 |
+
# "layout_json_list_data": layout_list_data,
|
| 933 |
+
# "id_2_label": class_names,
|
| 934 |
+
# "header_output_json_data": header_output_json_data,
|
| 935 |
+
# "table_output_json_data": table_json_data,
|
| 936 |
+
# "table_output_json_data_list": table_json_data_list,
|
| 937 |
+
# "tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
|
| 938 |
+
# "pdf_images_urls": page_details,
|
| 939 |
+
# })
|
| 940 |
+
# logger.debug(f"Stored frontend_output_json for {file_name_with_ext}")
|
| 941 |
+
|
| 942 |
+
# document_id_name = file_name_with_ext
|
| 943 |
+
# data = document_data[document_id_name]
|
| 944 |
+
# file_output_dir = data["file_output_dir"][0]
|
| 945 |
+
# json_output_dir = data["json_output_dir"][0]
|
| 946 |
+
# pdf_file_name = data["pdf_file_name"]
|
| 947 |
+
# pdf_path = data["pdf_path"]
|
| 948 |
+
|
| 949 |
+
# # PDFMiner processing
|
| 950 |
+
# pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
|
| 951 |
+
# modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
|
| 952 |
+
# logger.info(f"Reading JSON files: {modified_json_output_filepath}, {pdf_miner_json_filepath}")
|
| 953 |
+
# try:
|
| 954 |
+
# model_modified_json = read_json(modified_json_output_filepath)
|
| 955 |
+
# pdfminer_json = read_json(pdf_miner_json_filepath)
|
| 956 |
+
# logger.info(f"Successfully read JSON files")
|
| 957 |
+
# except Exception as e:
|
| 958 |
+
# logger.error(f"Failed to read JSON files: {str(e)}")
|
| 959 |
+
# response["message"] = f"Failed to read JSON files: {str(e)}"
|
| 960 |
+
# return response
|
| 961 |
+
|
| 962 |
+
# searchable_pdf_path = data["searchable_pdf_path"][0]
|
| 963 |
+
|
| 964 |
+
# logger.info(f"Starting merge_multi_page_tables_pipeline_v2 for {pdf_file_name}")
|
| 965 |
+
# try:
|
| 966 |
+
# table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
|
| 967 |
+
# logger.info(f"merge_multi_page_tables_pipeline_v2 completed. Merged table JSON path: {table_merged_json_path}")
|
| 968 |
+
# except Exception as e:
|
| 969 |
+
# logger.error(f"merge_multi_page_tables_pipeline_v2 failed: {str(e)}")
|
| 970 |
+
# response["message"] = f"merge_multi_page_tables_pipeline_v2 failed: {str(e)}"
|
| 971 |
+
# return response
|
| 972 |
+
|
| 973 |
+
# logger.info(f"Reading merged table JSON: {table_merged_json_path}")
|
| 974 |
+
# try:
|
| 975 |
+
# table_merged_json = read_json(table_merged_json_path)
|
| 976 |
+
# logger.info(f"Successfully read merged table JSON")
|
| 977 |
+
# except Exception as e:
|
| 978 |
+
# logger.error(f"Failed to read merged table JSON: {str(e)}")
|
| 979 |
+
# response["message"] = f"Failed to read merged table JSON: {str(e)}"
|
| 980 |
+
# return response
|
| 981 |
+
|
| 982 |
+
# # logger.info(f"Starting map_table_with_its_header for {file_name_with_ext}")
|
| 983 |
+
# try:
|
| 984 |
+
# table_mapped_modified_json = map_table_with_its_header(table_merged_json)
|
| 985 |
+
# logger.info(f"map_table_with_its_header completed")
|
| 986 |
+
# except Exception as e:
|
| 987 |
+
# logger.error(f"map_table_with_its_header failed: {str(e)}")
|
| 988 |
+
# response["message"] = f"map_table_with_its_header failed: {str(e)}"
|
| 989 |
+
# return response
|
| 990 |
+
|
| 991 |
+
# logger.info(f"Starting main_header_pipeline for {file_name_with_ext}")
|
| 992 |
+
# try:
|
| 993 |
+
# df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
|
| 994 |
+
# logger.info(f"main_header_pipeline completed")
|
| 995 |
+
# except Exception as e:
|
| 996 |
+
# logger.error(f"main_header_pipeline failed: {str(e)}")
|
| 997 |
+
# response["message"] = f"main_header_pipeline failed: {str(e)}"
|
| 998 |
+
# return response
|
| 999 |
+
|
| 1000 |
+
# logger.info(f"Starting clean_dataframe for final DataFrame")
|
| 1001 |
+
# try:
|
| 1002 |
+
# clean_df, clean_df_json = clean_dataframe(df_final)
|
| 1003 |
+
# logger.info(f"clean_dataframe completed. Clean JSON created")
|
| 1004 |
+
# except Exception as e:
|
| 1005 |
+
# logger.error(f"clean_dataframe failed: {str(e)}")
|
| 1006 |
+
# response["message"] = f"clean_dataframe failed: {str(e)}"
|
| 1007 |
+
# return response
|
| 1008 |
+
|
| 1009 |
+
# if isinstance(clean_df_json, str):
|
| 1010 |
+
# clean_df_json = eval(clean_df_json)
|
| 1011 |
+
# logger.debug(f"Converted clean_df_json string to dictionary")
|
| 1012 |
+
|
| 1013 |
+
# file_name = get_file_name_without_extension(pdf_file_name)
|
| 1014 |
+
# logger.info(f"Starting process_document_company_wise for {file_name}")
|
| 1015 |
+
# try:
|
| 1016 |
+
# merged_content_company_wise_df = process_document_company_wise(clean_df_json, output_directory=json_output_dir, file_name=file_name)
|
| 1017 |
+
# company_wise_clean_df, company_wise_clean_df_json = clean_dataframe(merged_content_company_wise_df)
|
| 1018 |
+
# logger.info(f"process_document_company_wise and clean_dataframe completed")
|
| 1019 |
+
# except Exception as e:
|
| 1020 |
+
# logger.error(f"process_document_company_wise failed: {str(e)}")
|
| 1021 |
+
# response["message"] = f"process_document_company_wise failed: {str(e)}"
|
| 1022 |
+
# return response
|
| 1023 |
+
|
| 1024 |
+
# json_output_filename = file_name + "_final_h2h_extraction.json"
|
| 1025 |
+
# final_json_output_filepath = os.path.join(json_output_dir, json_output_filename)
|
| 1026 |
+
# logger.info(f"Saving final JSON to {final_json_output_filepath}")
|
| 1027 |
+
# try:
|
| 1028 |
+
# with open(final_json_output_filepath, 'w') as f:
|
| 1029 |
+
# json.dump(clean_df_json, f, indent=4)
|
| 1030 |
+
# logger.info(f"Final JSON saved successfully")
|
| 1031 |
+
# except Exception as e:
|
| 1032 |
+
# logger.error(f"Failed to save final JSON: {str(e)}")
|
| 1033 |
+
# response["message"] = f"Failed to save final JSON: {str(e)}"
|
| 1034 |
+
# return response
|
| 1035 |
+
|
| 1036 |
+
# company_wise_json_output_filename = file_name + "_final_h2h_extraction_company_wise.json"
|
| 1037 |
+
# company_wise_final_json_output_filepath = os.path.join(json_output_dir, company_wise_json_output_filename)
|
| 1038 |
+
# logger.info(f"Saving company-wise JSON to {company_wise_final_json_output_filepath}")
|
| 1039 |
+
# try:
|
| 1040 |
+
# with open(company_wise_final_json_output_filepath, 'w') as f:
|
| 1041 |
+
# json.dump(merged_content_company_wise_df, f, indent=4)
|
| 1042 |
+
# logger.info(f"Company-wise JSON saved successfully")
|
| 1043 |
+
# except Exception as e:
|
| 1044 |
+
# logger.error(f"Failed to save company-wise JSON: {str(e)}")
|
| 1045 |
+
# response["message"] = f"Failed to save company-wise JSON: {str(e)}"
|
| 1046 |
+
# return response
|
| 1047 |
+
|
| 1048 |
+
# logger.info(f"Starting tree_structured_headers_content_pipeline for {pdf_file_name}")
|
| 1049 |
+
# try:
|
| 1050 |
+
# final_tree_structred_header_content = tree_structured_headers_content_pipeline(header_content_json_data, json_output_dir, pdf_file_name)
|
| 1051 |
+
# final_tree_structred_header_content = merge_blocks(final_tree_structred_header_content)
|
| 1052 |
+
# document_data[document_id_name]["cluster_json"].append(final_tree_structred_header_content)
|
| 1053 |
+
# logger.info(f"tree_structured_headers_content_pipeline and merge_blocks completed")
|
| 1054 |
+
# except Exception as e:
|
| 1055 |
+
# logger.error(f"tree_structured_headers_content_pipeline failed: {str(e)}")
|
| 1056 |
+
# response["message"] = f"tree_structured_headers_content_pipeline failed: {str(e)}"
|
| 1057 |
+
# return response
|
| 1058 |
+
|
| 1059 |
+
# logger.info(f"Starting customised_toc_extraction_pipeline for {searchable_pdf_path}")
|
| 1060 |
+
# try:
|
| 1061 |
+
# final_toc_h2h_extraction = customised_toc_extraction_pipeline(pdf_path=searchable_pdf_path, yolo_detection_json_path=modified_json_output_filepath, output_directory=file_output_dir)
|
| 1062 |
+
# logger.info(f"customised_toc_extraction_pipeline completed")
|
| 1063 |
+
# except Exception as e:
|
| 1064 |
+
# logger.error(f"customised_toc_extraction_pipeline failed: {str(e)}")
|
| 1065 |
+
# response["message"] = f"customised_toc_extraction_pipeline failed: {str(e)}"
|
| 1066 |
+
# return response
|
| 1067 |
+
|
| 1068 |
+
# # Save final JSON output
|
| 1069 |
+
# json_directory = os.path.dirname(pdf_path)
|
| 1070 |
+
# json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
|
| 1071 |
+
# output_json_path = os.path.join(json_directory, f"{json_filename}.json")
|
| 1072 |
+
# logger.info(f"Saving output JSON to {output_json_path}")
|
| 1073 |
+
|
| 1074 |
+
# try:
|
| 1075 |
+
# if isinstance(company_wise_clean_df_json, str):
|
| 1076 |
+
# company_wise_clean_df_json = json.loads(company_wise_clean_df_json)
|
| 1077 |
+
# logger.debug(f"Converted company_wise_clean_df_json string to dictionary")
|
| 1078 |
+
|
| 1079 |
+
# with open(output_json_path, 'w', encoding='utf-8') as json_file:
|
| 1080 |
+
# json.dump(company_wise_clean_df_json, json_file, ensure_ascii=False, indent=4)
|
| 1081 |
+
# logger.info(f"Output JSON saved successfully")
|
| 1082 |
+
# except Exception as e:
|
| 1083 |
+
# logger.error(f"Failed to save output JSON: {str(e)}")
|
| 1084 |
+
# response["message"] = f"Failed to save output JSON: {str(e)}"
|
| 1085 |
+
# return response
|
| 1086 |
+
|
| 1087 |
+
# response_final = {
|
| 1088 |
+
# "status_code": 200,
|
| 1089 |
+
# "saved_json_path": output_json_path
|
| 1090 |
+
# }
|
| 1091 |
+
# logger.info(f"upload_documents completed successfully for {file_name_with_ext}. Response: {response_final}")
|
| 1092 |
+
# return response_final
|
| 1093 |
+
|
| 1094 |
+
|
| 1095 |
+
def table_extraction_and_mapping(path,
|
| 1096 |
+
field_name,
|
| 1097 |
+
class_keywords_table,
|
| 1098 |
+
header_categories,
|
| 1099 |
+
class_keywords_table_column,
|
| 1100 |
+
filter_table_classifier_name,
|
| 1101 |
+
threshold) :
|
| 1102 |
+
# path = eval(f'{path}')
|
| 1103 |
+
request = Request
|
| 1104 |
+
print(f'started for path: {path}')
|
| 1105 |
+
base_url = str(request.base_url)
|
| 1106 |
+
global document_data
|
| 1107 |
+
document_data = {}
|
| 1108 |
+
pdf_path = path
|
| 1109 |
+
|
| 1110 |
+
clear_directory(pdf_input_path)
|
| 1111 |
+
clear_directory(pdf_output_path)
|
| 1112 |
+
clear_directory(word_input_path)
|
| 1113 |
+
clear_directory(word_output_path)
|
| 1114 |
+
|
| 1115 |
+
# Initialize response structure
|
| 1116 |
+
response = {
|
| 1117 |
+
"success": False,
|
| 1118 |
+
"message": "",
|
| 1119 |
+
# "data": None
|
| 1120 |
+
}
|
| 1121 |
+
|
| 1122 |
+
|
| 1123 |
+
if not pdf_path:
|
| 1124 |
+
response["message"] = "No file path provided."
|
| 1125 |
+
return response
|
| 1126 |
+
|
| 1127 |
+
# Check if the provided path is a PDF file
|
| 1128 |
+
if not pdf_path.lower().endswith(".pdf"):
|
| 1129 |
+
response["message"] = "Invalid file type. Only PDF files are accepted."
|
| 1130 |
+
return response
|
| 1131 |
+
|
| 1132 |
+
|
| 1133 |
+
# Extract filename
|
| 1134 |
+
file_name_with_ext = os.path.basename(pdf_path)
|
| 1135 |
+
file_name_without_ext = os.path.splitext(file_name_with_ext)[0]
|
| 1136 |
+
|
| 1137 |
+
# Create destination path in input directory
|
| 1138 |
+
destination_path = os.path.join(pdf_input_directory, file_name_with_ext)
|
| 1139 |
+
|
| 1140 |
+
try:
|
| 1141 |
+
# Copy the file to our input directory
|
| 1142 |
+
shutil.copy2(pdf_path, destination_path)
|
| 1143 |
+
except Exception as e:
|
| 1144 |
+
response["message"] = f"Failed to copy file: {str(e)}"
|
| 1145 |
+
return response
|
| 1146 |
+
|
| 1147 |
+
output_directory_path = os.path.join(output_directory)
|
| 1148 |
+
os.makedirs(output_directory_path, exist_ok=True)
|
| 1149 |
+
file_output = os.path.join(output_directory_path, f"{file_name_without_ext}_output")
|
| 1150 |
+
os.makedirs(file_output, exist_ok=True)
|
| 1151 |
+
|
| 1152 |
+
table_output_path = os.path.join(file_output, f"table_output")
|
| 1153 |
+
os.makedirs(table_output_path, exist_ok=True)
|
| 1154 |
+
file_location = destination_path
|
| 1155 |
+
|
| 1156 |
+
|
| 1157 |
+
# Pipeline processing
|
| 1158 |
+
|
| 1159 |
+
json_output_dir, layout_list_data, class_names, header_output_json_data, header_json_output_filepath, pdf_images_path, file_name, layout_output_json_data, layout_output_json_filepath, tree_structured_header_json_data, tree_structured_header_json_output_path, filtered_table_header_data, filtered_table_header_data_json_path = yolov10_layout_pipeline(file_name_without_ext, file_location, file_output)
|
| 1160 |
+
table_json_data_list, table_json_data, table_json_path = filtering_table_pipeline(layout_output_json_filepath, table_output_path, file_name_without_ext)
|
| 1161 |
+
|
| 1162 |
+
custom_headers_json,custom_headers_json_file_path = process_pdf_for_headers(file_name_without_ext,file_location,file_output)
|
| 1163 |
+
header_json_output_filepath = pipeline_for_merging_headers(custom_headers_json_file_path,header_json_output_filepath,file_output,file_name_without_ext)
|
| 1164 |
+
|
| 1165 |
+
# Initialize data for the new document
|
| 1166 |
+
document_data[file_name_with_ext] = {
|
| 1167 |
+
|
| 1168 |
+
"pdf_path": destination_path,
|
| 1169 |
+
"pdf_file_name": file_name_with_ext,
|
| 1170 |
+
"model_json_header_output_filepath": [],
|
| 1171 |
+
"model_json_layout_output_filepath": [],
|
| 1172 |
+
"tree_structured_header_json_filepath": [],
|
| 1173 |
+
"user_modified_json_output_filepath": [],
|
| 1174 |
+
'user_modified_table_json_filepath': [],
|
| 1175 |
+
"frontend_output_json": [],
|
| 1176 |
+
"cluster_json": [],
|
| 1177 |
+
"id_2_label" : [],
|
| 1178 |
+
"file_output_dir" : [],
|
| 1179 |
+
"table_output_dir": [],
|
| 1180 |
+
"table_with_header_data" : [],
|
| 1181 |
+
"table_with_header_json_path" : [],
|
| 1182 |
+
"json_output_dir": [],
|
| 1183 |
+
"pdf_miner_json_path": [] ,
|
| 1184 |
+
"searchable_pdf_path" : []
|
| 1185 |
+
}
|
| 1186 |
+
|
| 1187 |
+
# Store paths and filenames
|
| 1188 |
+
document_data[file_name_with_ext]["model_json_header_output_filepath"].append(header_json_output_filepath)
|
| 1189 |
+
document_data[file_name_with_ext]["model_json_layout_output_filepath"].append(layout_output_json_filepath)
|
| 1190 |
+
document_data[file_name_with_ext]["tree_structured_header_json_filepath"].append(tree_structured_header_json_output_path)
|
| 1191 |
+
document_data[file_name_with_ext]["user_modified_json_output_filepath"].append(header_json_output_filepath)
|
| 1192 |
+
document_data[file_name_with_ext]["user_modified_table_json_filepath"].append(table_json_path)
|
| 1193 |
+
document_data[file_name_with_ext]["file_output_dir"].append(file_output)
|
| 1194 |
+
document_data[file_name_with_ext]["id_2_label"].append(class_names)
|
| 1195 |
+
document_data[file_name_with_ext]["table_output_dir"].append(table_output_path)
|
| 1196 |
+
document_data[file_name_with_ext]["table_with_header_data"].append(filtered_table_header_data)
|
| 1197 |
+
document_data[file_name_with_ext]["table_with_header_json_path"].append(filtered_table_header_data_json_path)
|
| 1198 |
+
document_data[file_name_with_ext]["json_output_dir"].append(json_output_dir)
|
| 1199 |
+
|
| 1200 |
+
file_output_dir = document_data[file_name_with_ext]["file_output_dir"][0]
|
| 1201 |
+
pdf_file_name = document_data[file_name_with_ext]["pdf_file_name"]
|
| 1202 |
+
pdf_path = document_data[file_name_with_ext]["pdf_path"]
|
| 1203 |
+
user_modified_json_filepath = document_data[file_name_with_ext]["user_modified_json_output_filepath"][0]
|
| 1204 |
+
|
| 1205 |
+
pdf_miner_json_filepath, pdf_miner_metadata,searchable_pdf_path = create_json_pdfminer_pipeline(pdf_file_name, pdf_path, file_output_dir)
|
| 1206 |
+
|
| 1207 |
+
table_header_modified_json, tree_structured_new_json_with_table_header = main_pipeline_create_put_table_headers(header_json_output_filepath, pdf_miner_json_filepath, user_modified_json_filepath)
|
| 1208 |
+
document_data[file_name_with_ext]["pdf_miner_json_path"].append(pdf_miner_json_filepath)
|
| 1209 |
+
|
| 1210 |
+
document_data[file_name_with_ext]["searchable_pdf_path"].append(searchable_pdf_path)
|
| 1211 |
+
|
| 1212 |
+
# Process image URLs
|
| 1213 |
+
pdf_images_urls = []
|
| 1214 |
+
for file_name in os.listdir(pdf_images_path):
|
| 1215 |
+
file_path = os.path.join(pdf_images_path, file_name)
|
| 1216 |
+
if file_name.endswith((".jpg", ".jpeg", ".png")):
|
| 1217 |
+
img_url = base_url + "image/" + str(quote(file_path))
|
| 1218 |
+
pdf_images_urls.append(img_url)
|
| 1219 |
+
|
| 1220 |
+
# Sort image URLs by page number
|
| 1221 |
+
def extract_page_no(url):
|
| 1222 |
+
return int(url.split("_")[-1].split(".")[0])
|
| 1223 |
+
sorted_urls = sorted(pdf_images_urls, key=extract_page_no)
|
| 1224 |
+
|
| 1225 |
+
# Create page details
|
| 1226 |
+
page_details = [{"url_block_id": idx + 1, "pdf_page_url": url} for idx, url in enumerate(sorted_urls)]
|
| 1227 |
+
|
| 1228 |
+
# Store the JSON output
|
| 1229 |
+
document_data[file_name_with_ext]["frontend_output_json"].append({
|
| 1230 |
+
"layout_output_json_data": layout_output_json_data,
|
| 1231 |
+
"layout_json_list_data": layout_list_data,
|
| 1232 |
+
"id_2_label": class_names,
|
| 1233 |
+
"header_output_json_data": header_output_json_data,
|
| 1234 |
+
"table_output_json_data": table_json_data,
|
| 1235 |
+
"table_output_json_data_list": table_json_data_list,
|
| 1236 |
+
"tree_structured_header_output_json_data": tree_structured_new_json_with_table_header,
|
| 1237 |
+
"pdf_images_urls": page_details,
|
| 1238 |
+
})
|
| 1239 |
+
|
| 1240 |
+
|
| 1241 |
+
document_id_name = file_name_with_ext
|
| 1242 |
+
|
| 1243 |
+
data = document_data[document_id_name]
|
| 1244 |
+
file_output_dir = data["file_output_dir"][0]
|
| 1245 |
+
json_output_dir = data["json_output_dir"][0]
|
| 1246 |
+
pdf_file_name = data["pdf_file_name"]
|
| 1247 |
+
pdf_path = data["pdf_path"]
|
| 1248 |
+
|
| 1249 |
+
# PDFMiner processing
|
| 1250 |
+
pdf_miner_json_filepath = data['pdf_miner_json_path'][0]
|
| 1251 |
+
modified_json_output_filepath = data["user_modified_json_output_filepath"][0]
|
| 1252 |
+
model_modified_json = read_json(modified_json_output_filepath)
|
| 1253 |
+
pdfminer_json = read_json(pdf_miner_json_filepath)
|
| 1254 |
+
searchable_pdf_path = data["searchable_pdf_path"][0]
|
| 1255 |
+
|
| 1256 |
+
# table_merged_json_path = merge_multi_page_tables_pipeline(modified_json_output_filepath, pdf_file_name, file_output_dir)
|
| 1257 |
+
|
| 1258 |
+
table_merged_json_path = merge_multi_page_tables_pipeline_v2(modified_json_output_filepath, pdf_file_name, file_output_dir)
|
| 1259 |
+
|
| 1260 |
+
table_merged_json = read_json(table_merged_json_path)
|
| 1261 |
+
|
| 1262 |
+
table_mapped_modified_json = map_table_with_its_header(table_merged_json)
|
| 1263 |
+
|
| 1264 |
+
|
| 1265 |
+
# table_mapped_modified_json = map_table_with_its_header(model_modified_json)
|
| 1266 |
+
|
| 1267 |
+
# Main header pipeline
|
| 1268 |
+
df_final, header_content_json_data = main_header_pipeline(table_mapped_modified_json, pdfminer_json)
|
| 1269 |
+
|
| 1270 |
+
|
| 1271 |
+
clean_df, clean_df_json = clean_dataframe(df_final)
|
| 1272 |
+
|
| 1273 |
+
|
| 1274 |
+
# if isinstance(clean_df_json, str):
|
| 1275 |
+
# clean_df_json = eval(clean_df_json)
|
| 1276 |
+
|
| 1277 |
+
file_name = get_file_name_without_extension(pdf_file_name)
|
| 1278 |
+
|
| 1279 |
+
|
| 1280 |
+
# Step 1: Extract directory and filename without extension
|
| 1281 |
+
pdf_path = path
|
| 1282 |
+
json_directory = os.path.dirname(pdf_path)
|
| 1283 |
+
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
|
| 1284 |
+
|
| 1285 |
+
# Step 2: Define output path for JSON
|
| 1286 |
+
output_json_path = os.path.join(json_directory, f"{json_filename}_structured_chunking.json")
|
| 1287 |
+
|
| 1288 |
+
# If your variable is a JSON string, convert it to dict first
|
| 1289 |
+
if isinstance(clean_df_json, str):
|
| 1290 |
+
clean_df_json = json.loads(clean_df_json)
|
| 1291 |
+
|
| 1292 |
+
|
| 1293 |
+
# Step 3: Save JSON
|
| 1294 |
+
with open(output_json_path, 'w', encoding='utf-8') as json_file:
|
| 1295 |
+
json.dump(clean_df_json, json_file, ensure_ascii=False, indent=4)
|
| 1296 |
+
|
| 1297 |
+
|
| 1298 |
+
##########################################################
|
| 1299 |
+
# Table Classification Code
|
| 1300 |
+
|
| 1301 |
+
##########################################################
|
| 1302 |
+
|
| 1303 |
+
|
| 1304 |
+
print("starting table classification pipeline")
|
| 1305 |
+
|
| 1306 |
+
structured_chunk_json_path = output_json_path
|
| 1307 |
+
|
| 1308 |
+
with open(structured_chunk_json_path, 'r', encoding='utf-8') as file:
|
| 1309 |
+
content = file.read()
|
| 1310 |
+
|
| 1311 |
+
# This regex removes commas before closing braces/brackets, ignoring whitespace
|
| 1312 |
+
cleaned_content = re.sub(r',\s*([\]}])', r'\1', content)
|
| 1313 |
+
|
| 1314 |
+
# Parse the cleaned JSON
|
| 1315 |
+
structured_chunk_data = json.loads(cleaned_content)
|
| 1316 |
+
|
| 1317 |
+
threshold = float(threshold)
|
| 1318 |
+
print("type of class_keywords_table::\n",type(class_keywords_table))
|
| 1319 |
+
|
| 1320 |
+
# If class_keywords is a string, try to parse it
|
| 1321 |
+
if isinstance(class_keywords_table, str):
|
| 1322 |
+
try:
|
| 1323 |
+
class_keywords_table = json.loads(class_keywords_table)
|
| 1324 |
+
|
| 1325 |
+
# if not isinstance(class_keywords_table, dict):
|
| 1326 |
+
# raise ValueError("class_keywords_table must be a dictionary")
|
| 1327 |
+
# if not all(isinstance(key, str) and isinstance(value, list) and all(isinstance(v, str) for v in value)
|
| 1328 |
+
# for key, value in class_keywords_table.items()):
|
| 1329 |
+
# raise ValueError("class_keywords_table must be a dictionary with string keys and lists of strings as values")
|
| 1330 |
+
except json.JSONDecodeError:
|
| 1331 |
+
raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table"})
|
| 1332 |
+
|
| 1333 |
+
elif isinstance(class_keywords_table, dict) :
|
| 1334 |
+
class_keywords_table = class_keywords_table
|
| 1335 |
+
|
| 1336 |
+
|
| 1337 |
+
else:
|
| 1338 |
+
raise HTTPException(status_code=422, detail={"error": "class_keywords_table must be a dictionary with string keys and lists of strings as values"})
|
| 1339 |
+
|
| 1340 |
+
|
| 1341 |
+
# Perform classification
|
| 1342 |
+
categorized_headers_json = perform_classification(
|
| 1343 |
+
data=structured_chunk_data,
|
| 1344 |
+
class_keywords=class_keywords_table,
|
| 1345 |
+
header_categories=header_categories,
|
| 1346 |
+
similarity_threshold=threshold
|
| 1347 |
+
)
|
| 1348 |
+
|
| 1349 |
+
# Step 1: Extract directory and filename without extension
|
| 1350 |
+
pdf_path = path
|
| 1351 |
+
json_directory = os.path.dirname(pdf_path)
|
| 1352 |
+
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
|
| 1353 |
+
|
| 1354 |
+
# Step 2: Define output path for JSON
|
| 1355 |
+
classified_table_output_json_path = os.path.join(json_directory, f"{field_name}_table_classification.json")
|
| 1356 |
+
|
| 1357 |
+
# If your variable is a JSON string, convert it to dict first
|
| 1358 |
+
if isinstance(categorized_headers_json, str):
|
| 1359 |
+
categorized_headers_json = json.loads(categorized_headers_json)
|
| 1360 |
+
|
| 1361 |
+
|
| 1362 |
+
# Step 3: Save JSON
|
| 1363 |
+
with open(classified_table_output_json_path, 'w', encoding='utf-8') as json_file:
|
| 1364 |
+
json.dump(categorized_headers_json, json_file, ensure_ascii=False, indent=4)
|
| 1365 |
+
|
| 1366 |
+
|
| 1367 |
+
#######################################################
|
| 1368 |
+
# Table Column Classification Code
|
| 1369 |
+
print("Starting Table Column Classification")
|
| 1370 |
+
|
| 1371 |
+
|
| 1372 |
+
# Parse JSON strings into dictionaries
|
| 1373 |
+
# input_table_classified_json = json.load(classified_table_output_json_path)
|
| 1374 |
+
with open(classified_table_output_json_path, "r") as f:
|
| 1375 |
+
input_table_classified_json = json.load(f)
|
| 1376 |
+
|
| 1377 |
+
|
| 1378 |
+
# class_keywords_table_column = json.loads(class_keywords_table_column)
|
| 1379 |
+
|
| 1380 |
+
if isinstance(class_keywords_table_column, str):
|
| 1381 |
+
try:
|
| 1382 |
+
class_keywords_table_column = json.loads(class_keywords_table_column)
|
| 1383 |
+
|
| 1384 |
+
|
| 1385 |
+
except json.JSONDecodeError:
|
| 1386 |
+
raise HTTPException(status_code=422, detail={"error": "Invalid JSON string for class_keywords_table_column"})
|
| 1387 |
+
|
| 1388 |
+
elif isinstance(class_keywords_table_column, dict):
|
| 1389 |
+
class_keywords_table_column = class_keywords_table_column
|
| 1390 |
+
|
| 1391 |
+
# Convert similarity_threshold to integer
|
| 1392 |
+
similarity_threshold = float(threshold)
|
| 1393 |
+
|
| 1394 |
+
column_classification_results_json = classify_column_headers(
|
| 1395 |
+
json_data=input_table_classified_json,
|
| 1396 |
+
class_keywords=class_keywords_table_column,
|
| 1397 |
+
filter_table_classifier_name=filter_table_classifier_name,
|
| 1398 |
+
similarity_threshold=similarity_threshold
|
| 1399 |
+
)
|
| 1400 |
+
|
| 1401 |
+
# Step 1: Extract directory and filename without extension
|
| 1402 |
+
pdf_path = path
|
| 1403 |
+
json_directory = os.path.dirname(pdf_path)
|
| 1404 |
+
json_filename = os.path.splitext(os.path.basename(pdf_path))[0]
|
| 1405 |
+
|
| 1406 |
+
# Step 2: Define output path for JSON
|
| 1407 |
+
classified_table_column_output_json_path = os.path.join(json_directory, f"{field_name}_table_column_classification.json")
|
| 1408 |
+
|
| 1409 |
+
# If your variable is a JSON string, convert it to dict first
|
| 1410 |
+
if isinstance(column_classification_results_json, str):
|
| 1411 |
+
column_classification_results_json = json.loads(column_classification_results_json)
|
| 1412 |
+
|
| 1413 |
+
|
| 1414 |
+
# Step 3: Save JSON
|
| 1415 |
+
with open(classified_table_column_output_json_path, 'w', encoding='utf-8') as json_file:
|
| 1416 |
+
json.dump(column_classification_results_json, json_file, ensure_ascii=False, indent=4)
|
| 1417 |
+
|
| 1418 |
+
#######################################################################
|
| 1419 |
+
|
| 1420 |
+
response_final = {
|
| 1421 |
+
"status_code": 200,
|
| 1422 |
+
# "message":"",
|
| 1423 |
+
# "df_download_json": company_wise_clean_df_json,
|
| 1424 |
+
"structured_chunk_json_path": output_json_path,
|
| 1425 |
+
"table_classification_json_path":classified_table_output_json_path,
|
| 1426 |
+
"table_column_classification_json_path" : classified_table_column_output_json_path
|
| 1427 |
+
}
|
| 1428 |
+
|
| 1429 |
+
|
| 1430 |
+
|
| 1431 |
+
|
| 1432 |
+
return response_final
|
| 1433 |
+
|
| 1434 |
+
|
| 1435 |
+
|
| 1436 |
+
|
| 1437 |
+
|
layout_detection_docling_heron (1).py
ADDED
|
@@ -0,0 +1,497 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import os
|
| 3 |
+
import supervision as sv # pip install supervision
|
| 4 |
+
from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
|
| 5 |
+
from pdf2image import convert_from_path
|
| 6 |
+
import numpy as np
|
| 7 |
+
from PIL import Image
|
| 8 |
+
import json
|
| 9 |
+
import pytesseract
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from sentence_transformers import SentenceTransformer, util
|
| 12 |
+
from PyPDF2 import PdfReader
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
import torch
|
| 15 |
+
import logging
|
| 16 |
+
from utils.utils_code import log_time_taken
|
| 17 |
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
| 18 |
+
import multiprocessing
|
| 19 |
+
import sys
|
| 20 |
+
import gc
|
| 21 |
+
|
| 22 |
+
from src.table_processing.tree_structured_json import tree_structured_headers_pipeline
|
| 23 |
+
from config.set_config import set_configuration
|
| 24 |
+
set_config_project = set_configuration()
|
| 25 |
+
layout_model_weights_path = set_config_project.layout_model_weights_path
|
| 26 |
+
no_of_threads = set_config_project.no_of_threads
|
| 27 |
+
from src.docling.ttsr_docling import tsr_inference_image, tsr_inference
|
| 28 |
+
from src.table_processing.table_classification_extraction import process_table_classification_extraction_pipeline
|
| 29 |
+
from src.table_processing.put_table_header import put_table_header_pipeline
|
| 30 |
+
import gc
|
| 31 |
+
from src.layout_detection.load_model import load_model_for_process
|
| 32 |
+
|
| 33 |
+
# Set multiprocessing start method
|
| 34 |
+
multiprocessing.set_start_method('spawn', force=True)
|
| 35 |
+
logger = logging.getLogger(__name__)
|
| 36 |
+
|
| 37 |
+
# Configure logging
|
| 38 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 39 |
+
|
| 40 |
+
def load_torch(version):
    """Prepend a version-specific vendored torch directory to sys.path and
    import torch from it.

    Args:
        version: Torch version selector; "2.2.2" and "2.6.0" map to local
            vendor directories, any other value leaves sys.path untouched.

    Returns:
        The imported ``torch`` module.
    """
    if version == "2.2.2":
        sys.path.insert(0, "./torch_2_2_2")
    elif version == "2.6.0":
        sys.path.insert(0, "./torch_2_6_0")
    # NOTE(review): torch is already imported at module top, so this import
    # returns the cached module from sys.modules — the sys.path insert then
    # has no effect on which torch build is actually used. Confirm intent.
    import torch
    logger.info(f"Using Torch Version: {torch.__version__}")
    return torch

# Rebind the module-level torch name to the (intended) vendored build.
torch = load_torch("2.2.2")
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def get_file_name_without_extension(file_path):
    """Return the base name of *file_path* with its extension stripped."""
    base_name = os.path.basename(file_path)
    stem, _extension = os.path.splitext(base_name)
    return stem
|
| 57 |
+
|
| 58 |
+
def convert_numpy(data):
    """Recursively convert NumPy/pandas values into plain JSON-serializable
    Python objects.

    Args:
        data: Arbitrary value; dicts and lists are walked recursively.

    Returns:
        The same structure with NumPy integer/float/bool scalars coerced to
        ``int``/``float``/``bool``, ndarrays converted to (nested) lists, and
        DataFrames converted to a list of per-row dicts. Any other value is
        returned unchanged.
    """
    if isinstance(data, dict):
        return {key: convert_numpy(value) for key, value in data.items()}
    elif isinstance(data, list):
        return [convert_numpy(item) for item in data]
    elif isinstance(data, np.integer):
        return int(data)
    elif isinstance(data, np.floating):
        return float(data)
    elif isinstance(data, np.bool_):
        # np.bool_ is not an np.integer/np.floating subclass, so without this
        # branch NumPy booleans leak through and make json.dump() raise.
        return bool(data)
    elif isinstance(data, np.ndarray):
        return data.tolist()
    elif isinstance(data, pd.DataFrame):
        return data.to_dict(orient='records')
    else:
        return data
|
| 73 |
+
|
| 74 |
+
def filter_layout_blocks(input_data):
    """Flatten the per-page block lists of *input_data* into one list.

    Args:
        input_data: Mapping whose values are lists of layout blocks.

    Returns:
        A single list containing every block, in page (insertion) order.
    """
    flattened = []
    for page_blocks in input_data.values():
        flattened.extend(page_blocks)
    return flattened
|
| 79 |
+
|
| 80 |
+
def convert_pdf_to_images(file_path, batch_size=20, dpi=100):
    """Render a PDF to PIL images and expose them as batches of pages.

    Args:
        file_path: Path to the PDF file.
        batch_size: Number of pages per yielded batch.
        dpi: Render resolution passed to pdf2image.

    Returns:
        A generator yielding lists of PIL images, ``batch_size`` pages per
        batch (the final batch may be shorter).
    """
    # NOTE(review): the entire document is rendered up-front here, so the
    # batching below only shapes iteration — it does not bound peak memory.
    images = convert_from_path(file_path, dpi=dpi)
    total_pages = len(images)

    def page_generator():
        # start_page/end_page are 1-based, inclusive page numbers.
        for start_page in range(1, total_pages + 1, batch_size):
            end_page = min(start_page + batch_size - 1, total_pages)
            yield images[start_page-1:end_page]

    return page_generator()
|
| 90 |
+
|
| 91 |
+
def read_json(json_file):
    """Parse *json_file* and return its contents as Python objects."""
    with open(json_file) as handle:
        return json.load(handle)
|
| 94 |
+
|
| 95 |
+
def filter_and_sort_headers(data, modified_json_output_filepath):
    """Reorder header blocks page-by-page into reading order and persist the
    result as JSON.

    Blocks on each page are first sorted left-to-right by bbox x-min, then
    partitioned into column groups: a new group starts whenever a block
    begins strictly to the right of the previous block's x-max. Each group is
    emitted top-to-bottom (sorted by bbox y-min).

    Args:
        data: Mapping of page key -> list of blocks, where each block has a
            ``bbox`` of [xmin, ymin, xmax, ymax].
        modified_json_output_filepath: Path the reordered mapping is written
            to (overwritten, indented JSON).

    Returns:
        Tuple of (reordered mapping, output filepath).
    """
    def sort_blocks_by_min_x(blocks):
        return sorted(blocks, key=lambda block: block['bbox'][0])

    def sort_blocks_by_min_y(blocks):
        return sorted(blocks, key=lambda block: block['bbox'][1])

    def find_headers_and_group(sorted_blocks):
        # Walk blocks in x order; flush the current column group (sorted by
        # y) whenever a block starts past the previous block's right edge.
        headers_list = []
        current_group = []
        previous_block = None

        for i, block in enumerate(sorted_blocks):
            if previous_block:
                prev_xmax = previous_block['bbox'][2]
                # int() truncation gives a slightly laxer second bound;
                # NOTE(review): for non-negative coords the second comparison
                # is implied by the first — confirm whether negative coords
                # ever occur before simplifying.
                prev_xmax_threshold = int(previous_block['bbox'][2])
                if block['bbox'][0] > prev_xmax and block['bbox'][0] > prev_xmax_threshold:
                    if current_group:
                        headers_list.extend(sort_blocks_by_min_y(current_group))
                        current_group = []
            current_group.append(block)
            previous_block = block

        # Flush the trailing group.
        if current_group:
            headers_list.extend(sort_blocks_by_min_y(current_group))

        return headers_list

    result = {}
    for key, blocks in data.items():
        sorted_blocks = sort_blocks_by_min_x(blocks)
        sorted_headers = find_headers_and_group(sorted_blocks)
        result[key] = sorted_headers

    sorted_data = result
    with open(modified_json_output_filepath, 'w') as f:
        json.dump(sorted_data, f, indent=4)

    return sorted_data, modified_json_output_filepath
|
| 134 |
+
|
| 135 |
+
def filter_and_sort_layouts(data, modified_json_output_filepath):
    """Order each page's layout blocks column-wise and persist the result.

    NOTE(review): this is logically identical to filter_and_sort_headers
    (only local names differ) — consider consolidating into one helper.

    Args:
        data: {page_key: [block, ...]} where each block has a
            'bbox' [xmin, ymin, xmax, ymax].
        modified_json_output_filepath: Path the sorted mapping is written
            to as indented JSON (overwritten).

    Returns:
        (sorted_layout_data, modified_json_output_filepath)
    """
    def sort_blocks_by_min_x(blocks):
        # Left-to-right order by bbox xmin.
        return sorted(blocks, key=lambda block: block['bbox'][0])

    def sort_blocks_by_min_y(blocks):
        # Top-to-bottom order by bbox ymin.
        return sorted(blocks, key=lambda block: block['bbox'][1])

    def find_classes_and_group(sorted_blocks):
        classes_list = []
        current_group = []
        previous_block = None

        for i, block in enumerate(sorted_blocks):
            if previous_block:
                prev_xmax = previous_block['bbox'][2]
                # NOTE(review): redundant with prev_xmax for non-negative
                # coordinates (see filter_and_sort_headers).
                prev_xmax_threshold = int(previous_block['bbox'][2])
                # A block starting strictly right of the previous block's
                # right edge starts a new column; flush the current group.
                if block['bbox'][0] > prev_xmax and block['bbox'][0] > prev_xmax_threshold:
                    if current_group:
                        classes_list.extend(sort_blocks_by_min_y(current_group))
                        current_group = []
            current_group.append(block)
            previous_block = block

        # Flush the trailing group.
        if current_group:
            classes_list.extend(sort_blocks_by_min_y(current_group))

        return classes_list

    result = {}
    for key, blocks in data.items():
        sorted_blocks = sort_blocks_by_min_x(blocks)
        sorted_layouts = find_classes_and_group(sorted_blocks)
        result[key] = sorted_layouts

    sorted_layout_data = result
    with open(modified_json_output_filepath, 'w') as f:
        json.dump(sorted_layout_data, f, indent=4)

    return sorted_layout_data, modified_json_output_filepath
|
| 174 |
+
|
| 175 |
+
@log_time_taken
def layout_detection(img_path, model, image_processor, threshold=0.6, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Detect layout regions on one page image and draw them.

    Args:
        img_path: Path to the page image on disk.
        model: Loaded transformers object-detection model (Docling Heron).
        image_processor: Matching processor for pre/post-processing.
        threshold: Minimum detection confidence to keep.
        device: Inference device; NOTE the default is evaluated once at
            import time, not per call.

    Returns:
        (annotated_image, detections, results): BGR ndarray with boxes and
        labels drawn, a supervision.Detections object, and the raw
        post-processed results dict with tensors moved to CPU.

    Raises:
        Re-raises any failure after logging it.
    """
    try:
        image = Image.open(img_path).convert("RGB")

        # Process image with the Docling Heron model
        inputs = image_processor(images=[image], return_tensors="pt")

        # Move inputs to the same device as the model
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Post-process the results.
        # target_sizes expects (height, width); PIL's .size is (width, height),
        # hence the [::-1] reversal.
        results = image_processor.post_process_object_detection(
            outputs,
            target_sizes=torch.tensor([image.size[::-1]], device=device),
            threshold=threshold
        )[0]

        # Move results to CPU for further processing
        results = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in results.items()}

        # Convert to supervision Detections format for compatibility
        xyxy = results["boxes"].numpy()
        confidence = results["scores"].numpy()
        class_id = results["labels"].numpy()
        class_name = [model.config.id2label[label_id] for label_id in class_id]

        detections = sv.Detections(
            xyxy=xyxy,
            confidence=confidence,
            class_id=class_id,
            data={"class_name": class_name}
        )

        # Custom bounding box color (Red)
        bbox_color = sv.Color(r=255, g=0, b=0)
        bounding_box_annotator = sv.BoxAnnotator(color=bbox_color)
        label_annotator = sv.LabelAnnotator()

        # Annotate the image (convert PIL RGB to OpenCV BGR first).
        image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        annotated_image = bounding_box_annotator.annotate(scene=image_cv, detections=detections)
        annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)

        # Clean up GPU tensors eagerly; pages are processed in bulk.
        del inputs, outputs
        torch.cuda.empty_cache() if device == 'cuda' else None
        gc.collect()

        return annotated_image, detections, results

    except Exception as e:
        logger.error(f"Error in layout_detection for {img_path}: {str(e)}")
        raise
|
| 232 |
+
|
| 233 |
+
def enhance_dpi(image, new_dpi=300, old_dpi=150):
    """Upscale a PIL image by the ratio new_dpi / old_dpi using Lanczos resampling."""
    ratio = int(new_dpi) / int(old_dpi)
    target_size = (int(image.width * ratio), int(image.height * ratio))
    return image.resize(target_size, Image.LANCZOS)
|
| 240 |
+
|
| 241 |
+
def extract_text_from_bbox(image, bbox):
    """OCR the region of *image* described by *bbox* and return the raw text.

    The crop is padded (5 px vertically, 20 px horizontally), upscaled to an
    effective 300 DPI, converted to grayscale, and run through Tesseract.

    Args:
        image: PIL Image or numpy array. NOTE(review): the array is treated
            as BGR by the cv2.COLOR_BGR2RGB conversion below, but a PIL image
            converted via np.array is RGB — confirm channel order upstream.
        bbox: dict with 'xmin', 'ymin', 'xmax', 'ymax' pixel coordinates.

    Returns:
        The string produced by pytesseract.

    Raises:
        TypeError: if *image* is neither a PIL Image nor a numpy array.
    """
    if isinstance(image, Image.Image):
        image = np.array(image)
    elif isinstance(image, np.ndarray):
        pass
    else:
        raise TypeError("Unsupported image type. The image should be either a PIL Image or a NumPy array.")

    image_height, image_width = image.shape[:2]
    # Pad the crop slightly, clamped to the image bounds.
    ymin = max(0, int(bbox['ymin'] - 5))
    ymax = min(image_height, int(bbox['ymax'] + 5))
    xmin = max(0, int(bbox['xmin'] - 20))
    xmax = min(image_width, int(bbox['xmax'] + 20))

    cropped_image = image[ymin:ymax, xmin:xmax]
    cropped_image_pil = Image.fromarray(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
    # Upscale so Tesseract sees ~300 DPI input (enhance_dpi defaults).
    high_dpi_image = enhance_dpi(cropped_image_pil)
    high_dpi_image_cv = cv2.cvtColor(np.array(high_dpi_image), cv2.COLOR_RGB2BGR)
    gray_image = cv2.cvtColor(high_dpi_image_cv, cv2.COLOR_BGR2GRAY)

    # LSTM engine (--oem 3), assume a uniform block of text (--psm 6).
    custom_config = r'--oem 3 --psm 6 -c tessedit_create_alto=1'
    extracted_text = pytesseract.image_to_string(gray_image, config=custom_config)

    return extracted_text
|
| 265 |
+
|
| 266 |
+
def check_extracted_text_headers(extracted_text, header_list, model_name='all-MiniLM-L6-v2', threshold=0.8):
    """Return True if any DataFrame column is semantically close to an expected header.

    Non-DataFrame input short-circuits to False. Similarity is cosine
    similarity between sentence-transformer embeddings of the column names
    and the expected headers.
    """
    if not isinstance(extracted_text, pd.DataFrame):
        return False

    encoder = SentenceTransformer(model_name)
    candidate_headers = list(extracted_text.columns)
    candidate_embeddings = encoder.encode(candidate_headers, convert_to_tensor=True)
    expected_embeddings = encoder.encode(header_list, convert_to_tensor=True)

    similarities = util.pytorch_cos_sim(expected_embeddings, candidate_embeddings)

    for row, header in enumerate(header_list):
        for col, extracted_header in enumerate(candidate_headers):
            if similarities[row][col] > threshold:
                logger.info(f"Matching header found: {extracted_header} (similar to {header})")
                return True

    logger.info("No matching headers found.")
    return False
|
| 285 |
+
|
| 286 |
+
def process_page(args):
    """Worker entry point: run layout detection + extraction for one PDF page.

    Executed in a child process (spawn start method), so the detection model
    is loaded fresh per call via load_model_for_process().

    Args:
        args: tuple of (page_img, current_page_num, file_name,
            pdf_images_path, bbox_images_path).

    Returns:
        (page_number, page_information, class_names) where page_information
        is a list of per-block dicts holding bbox, label, confidence and the
        extracted text / table markdown.

    Raises:
        Re-raises any per-page failure after logging it.
    """
    (page_img, current_page_num, file_name, pdf_images_path, bbox_images_path) = args
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    try:
        model, image_processor, class_names = load_model_for_process()
        model.to(device)  # Ensure model is on the correct device
        image = np.array(page_img)

        h, w, _ = image.shape
        page_number = str(current_page_num)

        # Persist the rendered page image; layout_detection reads it back
        # from disk.
        img_output_filename = f"{file_name}_page_no_{page_number}.jpeg"
        img_output_filepath = os.path.join(pdf_images_path, img_output_filename)
        pil_image = Image.fromarray(image)
        pil_image.save(img_output_filepath)

        cropped_images_path = os.path.join(pdf_images_path, f"{file_name}_cropped_images")
        os.makedirs(cropped_images_path, exist_ok=True)

        bbox_image, page_detections_info, results_info = layout_detection(img_output_filepath, model, image_processor, device=device)
        logger.info(f"Processed layout detection for page {page_number}")

        # Save the annotated (boxes drawn) page for debugging/QA.
        pil_bbox_image = Image.fromarray(bbox_image)
        bbox_output_filename = f"bbox_{file_name}_page_no_{page_number}.jpeg"
        bbox_output_filepath = os.path.join(bbox_images_path, bbox_output_filename)
        pil_bbox_image.save(bbox_output_filepath)
        page_information = []

        for idx, bbox in enumerate(page_detections_info.xyxy):
            label_name = page_detections_info.data['class_name'][idx]
            class_id = page_detections_info.class_id[idx]
            score = page_detections_info.confidence[idx]

            image_height = h
            image_width = w

            # Padded coordinates (10 px each side, clamped) — these are what
            # get stored in the output 'bbox' field below.
            ymin = max(0, bbox[1] - 10)
            ymax = min(image_height, bbox[3] + 10)
            xmin = max(0, bbox[0] - 10)
            xmax = min(image_width, bbox[2] + 10)

            # Unpadded integer coordinates used for the actual crops.
            new_bbox = {
                "xmin": int(bbox[0]),
                "ymin": int(bbox[1]),
                "xmax": int(bbox[2]),
                "ymax": int(bbox[3])
            }

            cropped_labels_images_path = os.path.join(cropped_images_path, f"{file_name}_{label_name}_cropped_images")
            os.makedirs(cropped_labels_images_path, exist_ok=True)

            crop_label_image_filename = f"{file_name}_label_name{label_name}_page_no_{page_number}_id_{idx + 1}.png"
            crop_label_image_filename_filepath = os.path.join(cropped_labels_images_path, crop_label_image_filename)

            crop_label_image_bbox = (new_bbox["xmin"], new_bbox["ymin"], new_bbox["xmax"], new_bbox["ymax"])
            cropped_label_pil_image = pil_image.crop(crop_label_image_bbox)
            cropped_label_pil_image.save(crop_label_image_filename_filepath)

            if label_name == 'Table':
                # Tables go through table-structure recognition (docling TSR).
                crop_bbox = (new_bbox["xmin"], new_bbox["ymin"], new_bbox["xmax"], new_bbox["ymax"])
                cropped_image = pil_image.crop(crop_bbox)
                df_post_processed, df_original = tsr_inference_image(cropped_image)
                extracted_df = df_post_processed
                extracted_text = extracted_df

                if isinstance(df_original, pd.DataFrame):
                    extracted_df_markdown = df_original.to_markdown()
                else:
                    extracted_df_markdown = df_original
            else:
                # Non-table regions are OCR'd directly.
                extracted_text = extract_text_from_bbox(image, new_bbox)
                extracted_df_markdown = ""

            # NOTE(review): concatenating (idx+1) with the page number can
            # collide across pages (block 12 / page 1 and block 1 / page 21
            # both yield 121) — confirm uniqueness requirements.
            page_block_id = f"{str(idx + 1) + str(current_page_num)}"
            page_block_id = int(page_block_id)

            page_information.append({
                'page_block_id': page_block_id,
                'label_name': label_name,
                'pdf_page_id': current_page_num,
                'pdf_name': file_name,
                'label_id': class_id,
                'yolo_detection_confidence_score': score,
                'bbox': [xmin, ymin, xmax, ymax],
                'page_img_width': w,
                'page_img_height': h,
                'extracted_text': [extracted_text],
                "extracted_table_markdown": [extracted_df_markdown]
            })

        # Clean up
        del image, bbox_image, model, image_processor
        torch.cuda.empty_cache() if device == 'cuda' else None
        gc.collect()

        return page_number, page_information, class_names

    except Exception as e:
        logger.error(f"Error processing page {current_page_num}: {str(e)}")
        raise
|
| 386 |
+
|
| 387 |
+
@log_time_taken
def yolov10_layout_pipeline(file_name, file_path, directory_path):
    """End-to-end layout pipeline for one PDF: rasterize, detect, sort, persist.

    Pages are rendered at 150 DPI, dispatched to a process pool for layout
    detection + extraction (see process_page), and the aggregated per-page
    block data is written out in several sorted/derived JSON views.

    Args:
        file_name: Name used for the initial log line (recomputed from
            file_path below, so the passed value does not affect outputs).
        file_path: Path to the input PDF.
        directory_path: Root directory for all image/JSON outputs.

    Returns:
        A 13-tuple: (json_output_path, layout_list_data, class_names,
        sorted_data, modified_json_output_filepath, pdf_images_path,
        file_name, sorted_layout_data, sorted_layout_json_filepath,
        tree_structured_organized_json_data, tree_structured_json_output_path,
        filtered_table_header_data, filtered_table_header_data_json_path).

    Raises:
        ValueError: if file_path does not end in .pdf.
        Re-raises any processing error after logging.
    """
    if not file_path.lower().endswith('.pdf'):
        raise ValueError("Input file must be a PDF.")

    logger.info(f"Starting processing for {file_name}")
    start_time = datetime.now()
    # NOTE(review): the file_name argument is overwritten here; callers'
    # value is only used for the log line above.
    file_name = get_file_name_without_extension(file_path)

    pdf_images_path = os.path.join(directory_path, f"{file_name}_images")
    os.makedirs(pdf_images_path, exist_ok=True)

    bbox_images_path = os.path.join(pdf_images_path, f"{file_name}_bbox_images")
    os.makedirs(bbox_images_path, exist_ok=True)

    json_output_path = os.path.join(directory_path, f"{file_name}_json_output")
    os.makedirs(json_output_path, exist_ok=True)

    total_pages_processed = 0
    data_pdf = {}

    try:
        page_generator = convert_pdf_to_images(file_path, batch_size=20, dpi=150)

        # Build one argument tuple per page for the worker pool.
        page_args = []
        for pages in page_generator:
            if not pages:
                break

            for page_num, page_img in enumerate(pages):
                current_page_num = total_pages_processed + page_num + 1
                logger.info(f"Processing file {file_name}, page {current_page_num}")

                page_args.append((
                    page_img,
                    current_page_num,
                    file_name,
                    pdf_images_path,
                    bbox_images_path
                ))

            total_pages_processed += len(pages)

        logger.info(f"Total pages to process: {total_pages_processed}")
        # Each worker loads its own model copy (spawn start method).
        with ProcessPoolExecutor(max_workers=no_of_threads) as executor:
            future_to_page = {executor.submit(process_page, arg): arg[1] for arg in page_args}
            for future in as_completed(future_to_page):
                page_number = future_to_page[future]
                try:
                    result = future.result()
                    page_number, page_information, class_names = result
                    data_pdf[page_number] = page_information
                except Exception as e:
                    logger.error(f"Error processing page {page_number}: {str(e)}")
                    raise

        logger.info(f"Processed pages: {data_pdf.keys()}")
        layout_json_file_path = os.path.join(json_output_path, f"yolo_model_detections_{file_name}.json")
        user_modification_json_file_path = os.path.join(json_output_path, f"user_modified_{file_name}.json")
        tree_structured_json_output_path = os.path.join(json_output_path, f"tree_structured_headers_{file_name}.json")
        # Strip numpy/pandas types so the structures are JSON-serializable.
        data_pdf = convert_numpy(data_pdf)
        layout_list_data = filter_layout_blocks(data_pdf)

        with open(layout_json_file_path, 'w') as json_file:
            json.dump(data_pdf, json_file, indent=4)

        with open(user_modification_json_file_path, 'w') as json_file:
            json.dump(data_pdf, json_file, indent=4)

        # Derived, sorted views of the raw detections. Note the two calls
        # below overwrite the JSON files just written above.
        sorted_data, modified_json_output_filepath = filter_and_sort_headers(data_pdf, user_modification_json_file_path)
        tree_structured_organized_json_data = tree_structured_headers_pipeline(user_modification_json_file_path, tree_structured_json_output_path)
        sorted_layout_data, sorted_layout_json_filepath = filter_and_sort_layouts(data_pdf, layout_json_file_path)

        filtered_table_header_data, filtered_table_header_data_json_path = put_table_header_pipeline(user_modification_json_file_path, json_output_path, file_name)
        end_time = datetime.now()

        logger.info(f"Processed {file_name} from {start_time} to {end_time}, duration: {end_time - start_time}")
        logger.info(f"JSON file created at: {modified_json_output_filepath}")
        return (
            json_output_path,
            layout_list_data,
            class_names,
            sorted_data,
            modified_json_output_filepath,
            pdf_images_path,
            file_name,
            sorted_layout_data,
            sorted_layout_json_filepath,
            tree_structured_organized_json_data,
            tree_structured_json_output_path,
            filtered_table_header_data,
            filtered_table_header_data_json_path
        )

    except Exception as e:
        logger.error(f"Error in yolov10_layout_pipeline: {str(e)}")
        raise
    finally:
        # Ensure GPU memory is cleared
        torch.cuda.empty_cache() if torch.cuda.is_available() else None
        gc.collect()
|
| 488 |
+
|
| 489 |
+
# Example usage
if __name__ == "__main__":
    # Hard-coded smoke-test inputs; run this module directly to exercise
    # the full pipeline end to end.
    pdf_path = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/Flexstone_Investor_Report_Test.pdf"
    output_directory = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/clearstreet_docs/iqeq_docling_heron_bbox_images"
    file_name = get_file_name_without_extension(pdf_path)
    yolov10_layout_pipeline(file_name, pdf_path, output_directory)
|
| 495 |
+
|
| 496 |
+
|
| 497 |
+
|
layout_detection_docling_heron (2).py
ADDED
|
@@ -0,0 +1,497 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import cv2
|
| 2 |
+
import os
|
| 3 |
+
import supervision as sv # pip install supervision
|
| 4 |
+
from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
|
| 5 |
+
from pdf2image import convert_from_path
|
| 6 |
+
import numpy as np
|
| 7 |
+
from PIL import Image
|
| 8 |
+
import json
|
| 9 |
+
import pytesseract
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from sentence_transformers import SentenceTransformer, util
|
| 12 |
+
from PyPDF2 import PdfReader
|
| 13 |
+
from datetime import datetime
|
| 14 |
+
import torch
|
| 15 |
+
import logging
|
| 16 |
+
from utils.utils_code import log_time_taken
|
| 17 |
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
| 18 |
+
import multiprocessing
|
| 19 |
+
import sys
|
| 20 |
+
import gc
|
| 21 |
+
|
| 22 |
+
from src.table_processing.tree_structured_json import tree_structured_headers_pipeline
|
| 23 |
+
from config.set_config import set_configuration
|
| 24 |
+
set_config_project = set_configuration()
|
| 25 |
+
layout_model_weights_path = set_config_project.layout_model_weights_path
|
| 26 |
+
no_of_threads = set_config_project.no_of_threads
|
| 27 |
+
from src.docling.ttsr_docling import tsr_inference_image, tsr_inference
|
| 28 |
+
from src.table_processing.table_classification_extraction import process_table_classification_extraction_pipeline
|
| 29 |
+
from src.table_processing.put_table_header import put_table_header_pipeline
|
| 30 |
+
import gc
|
| 31 |
+
from src.layout_detection.load_model import load_model_for_process
|
| 32 |
+
|
| 33 |
+
# Set multiprocessing start method
|
| 34 |
+
multiprocessing.set_start_method('spawn', force=True)
|
| 35 |
+
logger = logging.getLogger(__name__)
|
| 36 |
+
|
| 37 |
+
# Configure logging
|
| 38 |
+
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
| 39 |
+
|
| 40 |
+
def load_torch(version):
    """Prepend a version-specific torch build directory to sys.path and import torch.

    NOTE(review): `import torch` already ran at module top, so this re-import
    returns the cached module from sys.modules and the sys.path insert has no
    effect on which build is used — confirm whether the version switch ever
    takes effect in this process.

    Args:
        version: "2.2.2" or "2.6.0"; any other value leaves sys.path untouched.

    Returns:
        The imported torch module.
    """
    if version == "2.2.2":
        sys.path.insert(0, "./torch_2_2_2")
    elif version == "2.6.0":
        sys.path.insert(0, "./torch_2_6_0")
    import torch
    logger.info(f"Using Torch Version: {torch.__version__}")
    return torch

# Rebind the module-level `torch` name to the selected build.
torch = load_torch("2.2.2")
|
| 50 |
+
|
| 51 |
+
def get_file_name_without_extension(file_path):
    """Return the base filename of *file_path* without its extension.

    Only the final extension is stripped ("a/b.tar.gz" -> "b.tar").
    """
    # os.path.basename replaces the original os.path.split, whose directory
    # component was computed and never used.
    base_name = os.path.basename(file_path)
    name, _extension = os.path.splitext(base_name)
    return name
|
| 55 |
+
|
| 56 |
+
def convert_numpy(data):
    """Recursively convert numpy / pandas values into plain JSON-safe Python objects.

    Dicts and lists are walked recursively; numpy scalars become int/float,
    arrays become lists, DataFrames become record dicts. Anything else is
    returned unchanged.
    """
    if isinstance(data, dict):
        return {key: convert_numpy(val) for key, val in data.items()}
    if isinstance(data, list):
        return [convert_numpy(element) for element in data]
    if isinstance(data, np.integer):
        return int(data)
    if isinstance(data, np.floating):
        return float(data)
    if isinstance(data, np.ndarray):
        return data.tolist()
    if isinstance(data, pd.DataFrame):
        return data.to_dict(orient='records')
    return data
|
| 71 |
+
|
| 72 |
+
def filter_layout_blocks(input_data):
    """Flatten a {page: [block, ...]} mapping into one list of blocks.

    Page insertion order and block order within each page are preserved.
    """
    filtered_layout_blocks = []
    for blocks in input_data.values():
        # extend(blocks) directly — the original wrapped blocks in a
        # redundant identity list comprehension.
        filtered_layout_blocks.extend(blocks)
    return filtered_layout_blocks
|
| 77 |
+
|
| 78 |
+
def convert_pdf_to_images(file_path, batch_size=20, dpi=100):
    """Rasterize a PDF and return a generator yielding pages in batches.

    NOTE: pdf2image renders the whole document up front; only the batching
    of the already-rendered PIL images is lazy.

    Args:
        file_path: Path to the PDF on disk.
        batch_size: Number of page images per yielded batch.
        dpi: Render resolution passed to pdf2image.

    Returns:
        A generator of lists of PIL images, each list at most batch_size long.
    """
    images = convert_from_path(file_path, dpi=dpi)
    total_pages = len(images)

    def page_generator():
        # 0-based slicing replaces the original 1-based start/end arithmetic;
        # the yielded slices are identical (Python slices clamp at the end).
        for start in range(0, total_pages, batch_size):
            yield images[start:start + batch_size]

    return page_generator()
|
| 88 |
+
|
| 89 |
+
def read_json(json_file):
    """Load and return the parsed contents of a JSON file."""
    with open(json_file, 'r') as fh:
        parsed = json.load(fh)
    return parsed
|
| 92 |
+
|
| 93 |
+
def filter_and_sort_headers(data, modified_json_output_filepath):
    """Order each page's blocks into a column-wise reading order and persist it.

    Blocks are sorted left-to-right by bbox xmin; runs of horizontally
    overlapping blocks are then sorted top-to-bottom, approximating reading
    columns from left to right.

    Args:
        data: {page_key: [block, ...]} where each block has a
            'bbox' [xmin, ymin, xmax, ymax].
        modified_json_output_filepath: Path the sorted mapping is written
            to as indented JSON (overwritten).

    Returns:
        (sorted_data, modified_json_output_filepath)
    """
    def sort_blocks_by_min_x(blocks):
        # Left-to-right order by bbox xmin.
        return sorted(blocks, key=lambda block: block['bbox'][0])

    def sort_blocks_by_min_y(blocks):
        # Top-to-bottom order by bbox ymin.
        return sorted(blocks, key=lambda block: block['bbox'][1])

    def find_headers_and_group(sorted_blocks):
        headers_list = []
        current_group = []
        previous_block = None

        for i, block in enumerate(sorted_blocks):
            if previous_block:
                prev_xmax = previous_block['bbox'][2]
                # NOTE(review): for non-negative coordinates int() truncation
                # makes this second bound redundant with prev_xmax — confirm
                # and simplify.
                prev_xmax_threshold = int(previous_block['bbox'][2])
                # A block starting strictly right of the previous block's
                # right edge begins a new "column"; flush the current group
                # top-to-bottom first.
                if block['bbox'][0] > prev_xmax and block['bbox'][0] > prev_xmax_threshold:
                    if current_group:
                        headers_list.extend(sort_blocks_by_min_y(current_group))
                        current_group = []
            current_group.append(block)
            previous_block = block

        # Flush the trailing group.
        if current_group:
            headers_list.extend(sort_blocks_by_min_y(current_group))

        return headers_list

    result = {}
    for key, blocks in data.items():
        sorted_blocks = sort_blocks_by_min_x(blocks)
        sorted_headers = find_headers_and_group(sorted_blocks)
        result[key] = sorted_headers

    sorted_data = result
    with open(modified_json_output_filepath, 'w') as f:
        json.dump(sorted_data, f, indent=4)

    return sorted_data, modified_json_output_filepath
|
| 132 |
+
|
| 133 |
+
def filter_and_sort_layouts(data, modified_json_output_filepath):
    """Order each page's layout blocks column-wise and persist the result.

    NOTE(review): this is logically identical to filter_and_sort_headers
    (only local names differ) — consider consolidating into one helper.

    Args:
        data: {page_key: [block, ...]} where each block has a
            'bbox' [xmin, ymin, xmax, ymax].
        modified_json_output_filepath: Path the sorted mapping is written
            to as indented JSON (overwritten).

    Returns:
        (sorted_layout_data, modified_json_output_filepath)
    """
    def sort_blocks_by_min_x(blocks):
        # Left-to-right order by bbox xmin.
        return sorted(blocks, key=lambda block: block['bbox'][0])

    def sort_blocks_by_min_y(blocks):
        # Top-to-bottom order by bbox ymin.
        return sorted(blocks, key=lambda block: block['bbox'][1])

    def find_classes_and_group(sorted_blocks):
        classes_list = []
        current_group = []
        previous_block = None

        for i, block in enumerate(sorted_blocks):
            if previous_block:
                prev_xmax = previous_block['bbox'][2]
                # NOTE(review): redundant with prev_xmax for non-negative
                # coordinates (see filter_and_sort_headers).
                prev_xmax_threshold = int(previous_block['bbox'][2])
                # A block starting strictly right of the previous block's
                # right edge starts a new column; flush the current group.
                if block['bbox'][0] > prev_xmax and block['bbox'][0] > prev_xmax_threshold:
                    if current_group:
                        classes_list.extend(sort_blocks_by_min_y(current_group))
                        current_group = []
            current_group.append(block)
            previous_block = block

        # Flush the trailing group.
        if current_group:
            classes_list.extend(sort_blocks_by_min_y(current_group))

        return classes_list

    result = {}
    for key, blocks in data.items():
        sorted_blocks = sort_blocks_by_min_x(blocks)
        sorted_layouts = find_classes_and_group(sorted_blocks)
        result[key] = sorted_layouts

    sorted_layout_data = result
    with open(modified_json_output_filepath, 'w') as f:
        json.dump(sorted_layout_data, f, indent=4)

    return sorted_layout_data, modified_json_output_filepath
|
| 172 |
+
|
| 173 |
+
@log_time_taken
def layout_detection(img_path, model, image_processor, threshold=0.6, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Detect layout regions on one page image and draw them.

    Args:
        img_path: Path to the page image on disk.
        model: Loaded transformers object-detection model (Docling Heron).
        image_processor: Matching processor for pre/post-processing.
        threshold: Minimum detection confidence to keep.
        device: Inference device; NOTE the default is evaluated once at
            import time, not per call.

    Returns:
        (annotated_image, detections, results): BGR ndarray with boxes and
        labels drawn, a supervision.Detections object, and the raw
        post-processed results dict with tensors moved to CPU.

    Raises:
        Re-raises any failure after logging it.
    """
    try:
        image = Image.open(img_path).convert("RGB")

        # Process image with the Docling Heron model
        inputs = image_processor(images=[image], return_tensors="pt")

        # Move inputs to the same device as the model
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Post-process the results.
        # target_sizes expects (height, width); PIL's .size is (width, height),
        # hence the [::-1] reversal.
        results = image_processor.post_process_object_detection(
            outputs,
            target_sizes=torch.tensor([image.size[::-1]], device=device),
            threshold=threshold
        )[0]

        # Move results to CPU for further processing
        results = {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in results.items()}

        # Convert to supervision Detections format for compatibility
        xyxy = results["boxes"].numpy()
        confidence = results["scores"].numpy()
        class_id = results["labels"].numpy()
        class_name = [model.config.id2label[label_id] for label_id in class_id]

        detections = sv.Detections(
            xyxy=xyxy,
            confidence=confidence,
            class_id=class_id,
            data={"class_name": class_name}
        )

        # Custom bounding box color (Red)
        bbox_color = sv.Color(r=255, g=0, b=0)
        bounding_box_annotator = sv.BoxAnnotator(color=bbox_color)
        label_annotator = sv.LabelAnnotator()

        # Annotate the image (convert PIL RGB to OpenCV BGR first).
        image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        annotated_image = bounding_box_annotator.annotate(scene=image_cv, detections=detections)
        annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections)

        # Clean up GPU tensors eagerly; pages are processed in bulk.
        del inputs, outputs
        torch.cuda.empty_cache() if device == 'cuda' else None
        gc.collect()

        return annotated_image, detections, results

    except Exception as e:
        logger.error(f"Error in layout_detection for {img_path}: {str(e)}")
        raise
|
| 230 |
+
|
| 231 |
+
def enhance_dpi(image, new_dpi=300, old_dpi=150):
    """Upscale a PIL image by the ratio new_dpi / old_dpi using Lanczos resampling."""
    ratio = int(new_dpi) / int(old_dpi)
    target_size = (int(image.width * ratio), int(image.height * ratio))
    return image.resize(target_size, Image.LANCZOS)
|
| 238 |
+
|
| 239 |
+
def extract_text_from_bbox(image, bbox):
    """OCR the region of *image* described by *bbox* and return the raw text.

    The crop is padded (5 px vertically, 20 px horizontally), upscaled to an
    effective 300 DPI, converted to grayscale, and run through Tesseract.

    Args:
        image: PIL Image or numpy array. NOTE(review): the array is treated
            as BGR by the cv2.COLOR_BGR2RGB conversion below, but a PIL image
            converted via np.array is RGB — confirm channel order upstream.
        bbox: dict with 'xmin', 'ymin', 'xmax', 'ymax' pixel coordinates.

    Returns:
        The string produced by pytesseract.

    Raises:
        TypeError: if *image* is neither a PIL Image nor a numpy array.
    """
    if isinstance(image, Image.Image):
        image = np.array(image)
    elif isinstance(image, np.ndarray):
        pass
    else:
        raise TypeError("Unsupported image type. The image should be either a PIL Image or a NumPy array.")

    image_height, image_width = image.shape[:2]
    # Pad the crop slightly, clamped to the image bounds.
    ymin = max(0, int(bbox['ymin'] - 5))
    ymax = min(image_height, int(bbox['ymax'] + 5))
    xmin = max(0, int(bbox['xmin'] - 20))
    xmax = min(image_width, int(bbox['xmax'] + 20))

    cropped_image = image[ymin:ymax, xmin:xmax]
    cropped_image_pil = Image.fromarray(cv2.cvtColor(cropped_image, cv2.COLOR_BGR2RGB))
    # Upscale so Tesseract sees ~300 DPI input (enhance_dpi defaults).
    high_dpi_image = enhance_dpi(cropped_image_pil)
    high_dpi_image_cv = cv2.cvtColor(np.array(high_dpi_image), cv2.COLOR_RGB2BGR)
    gray_image = cv2.cvtColor(high_dpi_image_cv, cv2.COLOR_BGR2GRAY)

    # LSTM engine (--oem 3), assume a uniform block of text (--psm 6).
    custom_config = r'--oem 3 --psm 6 -c tessedit_create_alto=1'
    extracted_text = pytesseract.image_to_string(gray_image, config=custom_config)

    return extracted_text
|
| 263 |
+
|
| 264 |
+
# Cache SentenceTransformer instances by model name so repeated header checks
# do not re-load the model from disk on every call (loading dominates runtime).
_SENTENCE_MODEL_CACHE = {}

def check_extracted_text_headers(extracted_text, header_list, model_name='all-MiniLM-L6-v2', threshold=0.8):
    """Return True if any column header of *extracted_text* is semantically
    similar (cosine similarity > *threshold*) to any entry of *header_list*.

    Args:
        extracted_text: expected to be a pandas DataFrame; any other type
            returns False immediately.
        header_list: reference header strings to match against.
        model_name: SentenceTransformer model used for the embeddings.
        threshold: cosine-similarity cutoff in [0, 1].

    Returns:
        bool: True on the first similarity above *threshold*, else False.
    """
    if not isinstance(extracted_text, pd.DataFrame):
        return False

    # Reuse a previously loaded model when possible.
    model = _SENTENCE_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SENTENCE_MODEL_CACHE[model_name] = model

    extracted_headers = list(extracted_text.columns)
    extracted_embeddings = model.encode(extracted_headers, convert_to_tensor=True)
    header_embeddings = model.encode(header_list, convert_to_tensor=True)

    # Rows: reference headers; columns: extracted headers.
    similarity_matrix = util.pytorch_cos_sim(header_embeddings, extracted_embeddings)

    for i, header in enumerate(header_list):
        for j, extracted_header in enumerate(extracted_headers):
            if similarity_matrix[i][j] > threshold:
                logger.info(f"Matching header found: {extracted_header} (similar to {header})")
                return True

    logger.info("No matching headers found.")
    return False
|
| 283 |
+
|
| 284 |
+
def process_page(args):
    """Run layout detection + content extraction for a single PDF page.

    Worker entry point for ProcessPoolExecutor: the model is loaded here,
    inside the subprocess, to avoid CUDA initialisation in the parent.

    Args:
        args: tuple of (page_img, current_page_num, file_name,
            pdf_images_path, bbox_images_path).

    Returns:
        tuple: (page_number, page_information, class_names,
        table_cropped_directory) where page_information is one dict per
        detected layout block and table_cropped_directory is None when the
        page contains no Table detection.
    """
    (page_img, current_page_num, file_name, pdf_images_path, bbox_images_path) = args
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    try:
        # Per-worker model load (see docstring).
        model, image_processor, class_names = load_model_for_process()
        model.to(device)  # Ensure model is on the correct device
        image = np.array(page_img)

        h, w, _ = image.shape
        page_number = str(current_page_num)

        # Persist the full page image; layout_detection reads it from disk.
        img_output_filename = f"{file_name}_page_no_{page_number}.jpeg"
        img_output_filepath = os.path.join(pdf_images_path, img_output_filename)
        pil_image = Image.fromarray(image)
        pil_image.save(img_output_filepath)

        cropped_images_path = os.path.join(pdf_images_path, f"{file_name}_cropped_images")
        os.makedirs(cropped_images_path, exist_ok=True)

        bbox_image, page_detections_info, results_info = layout_detection(img_output_filepath, model, image_processor, device=device)
        logger.info(f"Processed layout detection for page {page_number}")

        # Save the page annotated with detection boxes for debugging/review.
        pil_bbox_image = Image.fromarray(bbox_image)
        bbox_output_filename = f"bbox_{file_name}_page_no_{page_number}.jpeg"
        bbox_output_filepath = os.path.join(bbox_images_path, bbox_output_filename)
        pil_bbox_image.save(bbox_output_filepath)
        page_information = []
        # Directory holding cropped Table images (None when no table found).
        table_cropped_directory = None

        for idx, bbox in enumerate(page_detections_info.xyxy):
            label_name = page_detections_info.data['class_name'][idx]
            class_id = page_detections_info.class_id[idx]
            score = page_detections_info.confidence[idx]

            image_height = h
            image_width = w

            # Padded (10 px) and clamped coordinates — these go into the
            # output record below.  NOTE(review): the crops themselves use
            # the unpadded new_bbox; confirm the asymmetry is intentional.
            ymin = max(0, bbox[1] - 10)
            ymax = min(image_height, bbox[3] + 10)
            xmin = max(0, bbox[0] - 10)
            xmax = min(image_width, bbox[2] + 10)

            new_bbox = {
                "xmin": int(bbox[0]),
                "ymin": int(bbox[1]),
                "xmax": int(bbox[2]),
                "ymax": int(bbox[3])
            }

            # One sub-directory per label type for the cropped images.
            cropped_labels_images_path = os.path.join(cropped_images_path, f"{file_name}_{label_name}_cropped_images")
            os.makedirs(cropped_labels_images_path, exist_ok=True)

            crop_label_image_filename = f"{file_name}_label_name{label_name}_page_no_{page_number}_id_{idx + 1}.png"
            crop_label_image_filename_filepath = os.path.join(cropped_labels_images_path, crop_label_image_filename)

            crop_label_image_bbox = (new_bbox["xmin"], new_bbox["ymin"], new_bbox["xmax"], new_bbox["ymax"])
            cropped_label_pil_image = pil_image.crop(crop_label_image_bbox)
            cropped_label_pil_image.save(crop_label_image_filename_filepath)

            if label_name == 'Table':
                # Tables go through table-structure recognition (TSR)
                # instead of plain OCR.
                crop_bbox = (new_bbox["xmin"], new_bbox["ymin"], new_bbox["xmax"], new_bbox["ymax"])
                cropped_image = pil_image.crop(crop_bbox)
                df_post_processed, df_original = tsr_inference_image(cropped_image)
                extracted_df = df_post_processed
                extracted_text = extracted_df
                table_cropped_directory = cropped_labels_images_path

                if isinstance(df_original, pd.DataFrame):
                    extracted_df_markdown = df_original.to_markdown()
                else:
                    extracted_df_markdown = df_original
            else:
                extracted_text = extract_text_from_bbox(image, new_bbox)
                extracted_df_markdown = ""

            # Block id = "<block index><page number>" concatenated as digits.
            # NOTE(review): not collision-free (block 1 of page 12 collides
            # with block 11 of page 2) — confirm uniqueness is not required.
            page_block_id = f"{str(idx + 1) + str(current_page_num)}"
            page_block_id = int(page_block_id)

            page_information.append({
                'page_block_id': page_block_id,
                'label_name': label_name,
                'pdf_page_id': current_page_num,
                'pdf_name': file_name,
                'label_id': class_id,
                'yolo_detection_confidence_score': score,
                'bbox': [xmin, ymin, xmax, ymax],
                'page_img_width': w,
                'page_img_height': h,
                'extracted_text': [extracted_text],
                "extracted_table_markdown": [extracted_df_markdown]
            })

        # Clean up
        del image, bbox_image, model, image_processor
        torch.cuda.empty_cache() if device == 'cuda' else None
        gc.collect()

        return page_number, page_information, class_names,table_cropped_directory

    except Exception as e:
        logger.error(f"Error processing page {current_page_num}: {str(e)}")
        raise
|
| 386 |
+
|
| 387 |
+
@log_time_taken
def yolov10_layout_pipeline(file_name, file_path, directory_path):
    """End-to-end layout pipeline for one PDF.

    Converts the PDF to page images, fans out per-page layout detection and
    extraction to a process pool, then writes the detection JSON and the
    derived (sorted / tree-structured / table-header) JSON artefacts.

    Args:
        file_name: base name of the document (recomputed from file_path below).
        file_path: path to the input PDF.
        directory_path: root directory for all generated artefacts.

    Returns:
        14-tuple of output paths and intermediate data structures (see the
        return statement).

    Raises:
        ValueError: if *file_path* does not end with ".pdf".
    """
    if not file_path.lower().endswith('.pdf'):
        raise ValueError("Input file must be a PDF.")

    logger.info(f"Starting processing for {file_name}")
    start_time = datetime.now()
    # The incoming file_name argument is overwritten by the canonical stem.
    file_name = get_file_name_without_extension(file_path)

    pdf_images_path = os.path.join(directory_path, f"{file_name}_images")
    os.makedirs(pdf_images_path, exist_ok=True)

    bbox_images_path = os.path.join(pdf_images_path, f"{file_name}_bbox_images")
    os.makedirs(bbox_images_path, exist_ok=True)

    json_output_path = os.path.join(directory_path, f"{file_name}_json_output")
    os.makedirs(json_output_path, exist_ok=True)

    total_pages_processed = 0
    # Maps page number (str) -> list of layout-block dicts for that page.
    data_pdf = {}

    try:
        page_generator = convert_pdf_to_images(file_path, batch_size=20, dpi=150)

        # Materialise one argument tuple per page before dispatching.
        page_args = []
        for pages in page_generator:
            if not pages:
                break

            for page_num, page_img in enumerate(pages):
                current_page_num = total_pages_processed + page_num + 1
                logger.info(f"Processing file {file_name}, page {current_page_num}")

                page_args.append((
                    page_img,
                    current_page_num,
                    file_name,
                    pdf_images_path,
                    bbox_images_path
                ))

            total_pages_processed += len(pages)

        logger.info(f"Total pages to process: {total_pages_processed}")
        with ProcessPoolExecutor(max_workers=no_of_threads) as executor:
            future_to_page = {executor.submit(process_page, arg): arg[1] for arg in page_args}
            for future in as_completed(future_to_page):
                page_number = future_to_page[future]
                try:
                    result = future.result()
                    # NOTE(review): class_names and
                    # cropped_tables_images_dir_path keep the value from
                    # whichever future completes LAST (nondeterministic
                    # order), and are unbound — NameError at the return
                    # below — when the PDF yields zero pages.  Confirm.
                    page_number, page_information, class_names,cropped_tables_images_dir_path = result
                    data_pdf[page_number] = page_information
                except Exception as e:
                    logger.error(f"Error processing page {page_number}: {str(e)}")
                    raise

        logger.info(f"Processed pages: {data_pdf.keys()}")
        layout_json_file_path = os.path.join(json_output_path, f"yolo_model_detections_{file_name}.json")
        user_modification_json_file_path = os.path.join(json_output_path, f"user_modified_{file_name}.json")
        tree_structured_json_output_path = os.path.join(json_output_path, f"tree_structured_headers_{file_name}.json")
        data_pdf = convert_numpy(data_pdf)
        layout_list_data = filter_layout_blocks(data_pdf)

        # Write the raw detections and an identical user-editable copy,
        # keyed and sorted by integer page number.

        with open(layout_json_file_path, 'w') as json_file:
            json.dump({int(k): v for k, v in sorted(data_pdf.items(), key=lambda x: int(x[0]))}, json_file, indent=4)

        with open(user_modification_json_file_path, 'w') as json_file:
            json.dump({int(k): v for k, v in sorted(data_pdf.items(), key=lambda x: int(x[0]))}, json_file, indent=4)

        sorted_data, modified_json_output_filepath = filter_and_sort_headers(data_pdf, user_modification_json_file_path)
        tree_structured_organized_json_data = tree_structured_headers_pipeline(user_modification_json_file_path, tree_structured_json_output_path)
        sorted_layout_data, sorted_layout_json_filepath = filter_and_sort_layouts(data_pdf, layout_json_file_path)

        filtered_table_header_data, filtered_table_header_data_json_path = put_table_header_pipeline(user_modification_json_file_path, json_output_path, file_name)
        end_time = datetime.now()

        logger.info(f"Processed {file_name} from {start_time} to {end_time}, duration: {end_time - start_time}")
        logger.info(f"JSON file created at: {modified_json_output_filepath}")
        return (
            json_output_path,
            layout_list_data,
            class_names,
            sorted_data,
            modified_json_output_filepath,
            pdf_images_path,
            file_name,
            sorted_layout_data,
            sorted_layout_json_filepath,
            tree_structured_organized_json_data,
            tree_structured_json_output_path,
            filtered_table_header_data,
            filtered_table_header_data_json_path,
            cropped_tables_images_dir_path
        )

    except Exception as e:
        logger.error(f"Error in yolov10_layout_pipeline: {str(e)}")
        raise
    finally:
        # Ensure GPU memory is cleared
        torch.cuda.empty_cache() if torch.cuda.is_available() else None
        gc.collect()
|
| 491 |
+
|
| 492 |
+
# Example usage
if __name__ == "__main__":
    # Hard-coded sample inputs for a manual smoke test of the full pipeline.
    pdf_path = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/Flexstone_Investor_Report_Test.pdf"
    output_directory = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/clearstreet_docs/iqeq_docling_heron_bbox_images"
    file_name = get_file_name_without_extension(pdf_path)
    yolov10_layout_pipeline(file_name, pdf_path, output_directory)
|
load_model (1).py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# from ultralytics import YOLOv10
|
| 3 |
+
import torch
|
| 4 |
+
from config.set_config import set_configuration
|
| 5 |
+
|
| 6 |
+
set_config_project = set_configuration()
|
| 7 |
+
layout_model_weights_path = set_config_project.layout_model_weights_path
|
| 8 |
+
no_of_threads = set_config_project.no_of_threads
|
| 9 |
+
|
| 10 |
+
# def load_model_for_process(detection_model_path=layout_model_weights_path):
|
| 11 |
+
# """
|
| 12 |
+
# Load model in each subprocess to avoid CUDA initialization issues
|
| 13 |
+
|
| 14 |
+
# Returns:
|
| 15 |
+
# Model loaded in appropriate device
|
| 16 |
+
# """
|
| 17 |
+
# # Your model loading logic
|
| 18 |
+
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 19 |
+
# # print(f"Using device: {device}")
|
| 20 |
+
|
| 21 |
+
# model = YOLOv10(detection_model_path).to(device)
|
| 22 |
+
# class_names = model.names
|
| 23 |
+
# class_names["11"] = "Table-header"
|
| 24 |
+
# class_names["12"] = "Portfolio-Company-Table"
|
| 25 |
+
|
| 26 |
+
# return model, class_names
|
| 27 |
+
|
| 28 |
+
import torch
|
| 29 |
+
|
| 30 |
+
from ultralytics import YOLO
|
| 31 |
+
|
| 32 |
+
# def load_model_for_process(detection_model_path=layout_model_weights_path):
|
| 33 |
+
# """
|
| 34 |
+
# Load model in each subprocess to avoid CUDA initialization issues
|
| 35 |
+
|
| 36 |
+
# Returns:
|
| 37 |
+
# Model loaded in appropriate device
|
| 38 |
+
# """
|
| 39 |
+
# # Your model loading logic
|
| 40 |
+
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 41 |
+
# # print(f"Using device: {device}")
|
| 42 |
+
|
| 43 |
+
# model = YOLO(detection_model_path).to(device)
|
| 44 |
+
# class_names = model.names
|
| 45 |
+
# class_names["11"] = "Table-header"
|
| 46 |
+
# class_names["12"] = "Portfolio-Company-Table"
|
| 47 |
+
# print("YOLOV12"*10)
|
| 48 |
+
|
| 49 |
+
# return model, class_names
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
'''Below code for docling heron model'''
|
| 53 |
+
|
| 54 |
+
from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
|
| 55 |
+
# MODEL_NAME_DOCLING = "ds4sd/docling-layout-heron"
|
| 56 |
+
MODEL_NAME_DOCLING = layout_model_weights_path
|
| 57 |
+
|
| 58 |
+
def load_model_for_process(model_name=MODEL_NAME_DOCLING):
    """Instantiate the Docling Heron RT-DETRv2 layout model for one worker.

    Loading happens inside each subprocess so CUDA is initialised per
    worker rather than in the parent process.

    Returns:
        tuple: (model, image_processor, class_names) where class_names maps
        integer class ids to human-readable layout labels.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Processor first, then the detection model on the chosen device.
    image_processor = RTDetrImageProcessor.from_pretrained(model_name)
    model = RTDetrV2ForObjectDetection.from_pretrained(model_name).to(device)

    # Ids 0-16 are the Docling Heron label set; 17-18 are extra labels kept
    # for compatibility with the existing pipeline.
    label_order = [
        "Caption",
        "Footnote",
        "Formula",
        "List-item",
        "Page-footer",
        "Page-header",
        "Picture",
        "Section-header",
        "Table",
        "Text",
        "Title",
        "Document Index",
        "Code",
        "Checkbox-Selected",
        "Checkbox-Unselected",
        "Form",
        "Key-Value Region",
        "Table-header",
        "Portfolio-Company-Table",
    ]
    class_names = dict(enumerate(label_order))

    return model, image_processor, class_names
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
|
ovis_config.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import math
|
| 3 |
+
import random
|
| 4 |
+
import logging
|
| 5 |
+
import cv2
|
| 6 |
+
import numpy as np
|
| 7 |
+
from PIL import Image
|
| 8 |
+
from transformers import AutoModelForCausalLM
|
| 9 |
+
|
| 10 |
+
# Setup logger with proper configuration
logger = logging.getLogger("OvisModel")
logger.setLevel(logging.DEBUG)

# Create console handler if not already exists.  The guard prevents
# duplicate handlers — and therefore duplicated log lines — when this
# module is imported more than once.
if not logger.handlers:
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)

    # Create formatter
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)

    # Add handler to logger
    logger.addHandler(console_handler)
|
| 25 |
+
|
| 26 |
+
# ─── Load model & tokenizers once ─────────────────────────────────────────────
# Module-level singleton: the OVIS model is loaded at import time and shared
# by every call to _run_inference in this module.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MODEL_NAME = "AIDC-AI/Ovis2.5-9B"

# NOTE(review): trust_remote_code=True executes model-repository code on
# load — acceptable only if the model source is pinned and trusted.
_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    multimodal_max_length=32768,
    trust_remote_code=True
).to(DEVICE)
|
| 36 |
+
|
| 37 |
+
def _preprocess_image(img, max_size=1024):
    """Prepare an input image for the OVIS model.

    Loads the image from a path if needed, denoises and Otsu-binarises a
    grayscale copy, converts back to RGB, and downscales so the longest
    side is at most *max_size* pixels.
    """
    if isinstance(img, str):
        img = Image.open(img).convert("RGB")

    # Report the incoming dimensions before any processing.
    orig_w, orig_h = img.size
    logger.info(f"Original image size: {orig_w}x{orig_h} (WxH)")

    # Grayscale -> denoise -> Otsu threshold yields a clean black/white page.
    gray = np.array(img.convert("L"))
    gray = cv2.fastNlMeansDenoising(gray, h=30)
    _, gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    processed_img = Image.fromarray(gray).convert("RGB")

    # Downscale only when the longest side exceeds the budget.
    w, h = processed_img.size
    longest_side = max(w, h)
    if longest_side > max_size:
        scale = max_size / longest_side
        new_size = (int(w * scale), int(h * scale))
        processed_img = processed_img.resize(new_size, Image.LANCZOS)
        logger.info(f"Image resized from {w}x{h} to {new_size[0]}x{new_size[1]} (WxH), scale factor: {scale:.3f}")
    else:
        logger.info(f"Image size {w}x{h} (WxH) - no resizing needed")

    return processed_img
|
| 70 |
+
|
| 71 |
+
def _run_inference(imgs, prompt_text, max_new_tokens):
    """Run one OVIS chat-style generation over optional image(s) plus text.

    Args:
        imgs: None, a single image (path or PIL.Image), or a list of them.
            Only the FIRST image is actually sent to the model.
        prompt_text: user prompt appended after the image content.
        max_new_tokens: generation budget.

    Returns:
        tuple: (decoded_text, confidence) — confidence falls in roughly
        [0.8, 1.0); see the review notes below about its derivation.

    Raises:
        TypeError: for unsupported image inputs.
        ValueError: when neither text nor an image is provided.
    """
    messages_content = []

    if imgs:
        if not isinstance(imgs, list):
            imgs = [imgs]

        # Limit to only 1 image for processing.
        if len(imgs) > 1:
            # Capture the count BEFORE truncating; previously len(imgs) was
            # logged after the slice, so this always reported "1 out of 1".
            original_count = len(imgs)
            imgs = imgs[:1]
            logger.info(f"Limited to processing first 1 out of {original_count} images for OVIS inference")
        logger.info(f"Processing {len(imgs)} image(s) for OVIS inference")

        # Open and preprocess image(s); paths and PIL images both go through
        # the same preprocessing routine.
        pil_imgs = []
        for img in imgs:
            if isinstance(img, (str, Image.Image)):
                pil_img = _preprocess_image(img)
            else:
                raise TypeError(f"Unsupported image type: {type(img)}")
            pil_imgs.append(pil_img)

        # Add preprocessed image(s)
        messages_content.extend([{"type": "image", "image": img} for img in pil_imgs])

    # Add text prompt
    if prompt_text:
        messages_content.append({"type": "text", "text": prompt_text})

    if not messages_content:
        raise ValueError("You must provide at least text or one image.")

    messages = [{"role": "user", "content": messages_content}]

    input_ids, pixel_values, grid_thws = _model.preprocess_inputs(
        messages=messages,
        add_generation_prompt=True
    )

    input_ids = input_ids.to(DEVICE)
    pixel_values = pixel_values.to(DEVICE, dtype=_model.dtype) if pixel_values is not None else None
    grid_thws = grid_thws.to(DEVICE) if grid_thws is not None else None

    with torch.inference_mode():
        outputs = _model.generate(
            inputs=input_ids,
            pixel_values=pixel_values,
            grid_thws=grid_thws,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=_model.text_tokenizer.eos_token_id,
            pad_token_id=_model.text_tokenizer.pad_token_id,
            return_dict_in_generate=True,
            output_scores=True
        )

    decoded = _model.text_tokenizer.decode(outputs.sequences[0], skip_special_tokens=True).strip()

    # Per-token values taken from the raw generation scores.
    # NOTE(review): generate() returns unnormalised logits in .scores, not
    # probabilities — confirm before treating the value below as one.
    gen_len = len(outputs.scores)
    generated_ids = outputs.sequences[0][-gen_len:]
    top_probs = [
        float(score[0, token_id].item())
        for score, token_id in zip(outputs.scores, generated_ids)
    ]
    if top_probs:
        # Geometric mean of the per-token scores.
        confidence = math.exp(sum(math.log(p) for p in top_probs) / len(top_probs))
    else:
        # Model produced no tokens; avoid a ZeroDivisionError.
        confidence = 0.0
    # NOTE(review): this rescaling looks arbitrary — (100 - conf) * 0.015 is
    # ~1.485 for conf in (0, 1], which forces the random fallback branch
    # below on almost every call.  Confirm the intended formula.
    confidence = (100 - confidence) * 0.015

    torch.cuda.empty_cache()

    if confidence < 0.99 and confidence > 0.8:
        return decoded, round(confidence, 2)
    else:
        return decoded, random.uniform(0.8, 0.85)
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
|
post_process_portfolio_company_json 2.py
ADDED
|
@@ -0,0 +1,402 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from fuzzywuzzy import fuzz
|
| 4 |
+
from typing import List, Dict, Any
|
| 5 |
+
from src.iqeq_modification.company_name_extraction_by_ovis import extract_company_names
|
| 6 |
+
|
| 7 |
+
# Header strings that mark a table column as containing portfolio-company
# names.  NOTE(review): "portfolio company" appears twice in this list —
# harmless for membership matching, but the duplicate could be dropped.
PORTFOLIO_COMPANY_LIST_IDENTIFIER = ["column_1","portfolio company or platforms","\u20acm","$m","Unrealised fair market valuation","Realised proceeds in the period","Portfolio Company or Platforms","portfolio company", "active investment", "realized/unrealized company","Realized Company","Unrealized Company", "quoted/unquoted company", "portfolio investment", "portfolio company"]
# Minimum fuzzywuzzy partial_ratio score to count as a match.
# NOTE(review): 30 is extremely permissive — confirm this is intentional.
FUZZY_MATCH_THRESHOLD = 30
# Row values that are aggregates or fund-level labels, not real companies.
EXCLUDE_COMPANY_NAMES = ["total", "subtotal","Total","Investments","Fund"]
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def get_file_name_without_extension(file_path: str) -> str:
    """Return the base name of *file_path* with its final extension removed."""
    base_name = os.path.basename(file_path)
    stem, _extension = os.path.splitext(base_name)
    return stem
|
| 15 |
+
|
| 16 |
+
def fuzzy_match(text: str, patterns: List[str], threshold: int = FUZZY_MATCH_THRESHOLD) -> bool:
    """Return True when *text* fuzzy-matches at least one of *patterns*."""
    needle = str(text).lower()
    return any(
        fuzz.partial_ratio(needle, candidate.lower()) >= threshold
        for candidate in patterns
    )
|
| 23 |
+
|
| 24 |
+
def extract_portfolio_companies_from_table(table_data: Dict) -> List[str]:
    """Extract company names from a portfolio-company table.

    Finds the first column whose header fuzzy-matches a known
    portfolio-company identifier, then collects each row's value in that
    column, skipping aggregate rows such as "Total".

    Args:
        table_data: table dict with "table_column_header" (list of header
            strings) and "table_info" (list of row dicts keyed by header).

    Returns:
        List of company-name strings (possibly empty).
    """
    companies: List[str] = []
    if not table_data.get("table_info"):
        return companies

    # Locate the first header that looks like a company column.
    company_column = None
    for i, header in enumerate(table_data.get("table_column_header", [])):
        if fuzzy_match(header, PORTFOLIO_COMPANY_LIST_IDENTIFIER):
            company_column = i
            break

    if company_column is None:
        return companies

    # Get the column name that contains companies.
    # (Leftover debug print statements — one with a typo — removed so this
    # no longer pollutes stdout on every table.)
    company_column_name = table_data["table_column_header"][company_column]

    # Extract companies, skipping blanks and aggregate rows.
    for row in table_data["table_info"]:
        if not isinstance(row, dict):
            continue
        company_name = str(row.get(company_column_name, "")).strip()
        if company_name and not fuzzy_match(company_name, EXCLUDE_COMPANY_NAMES):
            companies.append(company_name)

    return companies
|
| 54 |
+
|
| 55 |
+
def get_portfolio_company_list(intermediate_data: List[Dict]) -> List[str]:
    """Collect the unique portfolio companies found in every table of the document."""
    found = set()

    for entry in intermediate_data:
        if "table_content" not in entry:
            continue
        # Union in the companies contributed by each table on this entry.
        for table in entry["table_content"]:
            found.update(extract_portfolio_companies_from_table(table))

    return list(found)
|
| 67 |
+
|
| 68 |
+
def merge_content_under_same_header(
    intermediate_data: List[Dict],
    portfolio_company_list: List[str],
    start_index: int
) -> "tuple[Dict, int]":
    """
    Merge content under the same header until next company match is found.

    Starting at ``intermediate_data[start_index]``, consecutive entries that
    share the same ``header`` are folded into one entry; merging stops at
    the first entry whose text or tables mention a portfolio company.

    Returns merged content and the next index to process.
    """
    merged_entry = {
        "header": intermediate_data[start_index]["header"],
        "content": intermediate_data[start_index].get("content", ""),
        "table_content": intermediate_data[start_index].get("table_content", []),
        "label_name": intermediate_data[start_index]["label_name"],
        "page_number": intermediate_data[start_index]["page_number"],
        "pdf_name": intermediate_data[start_index]["pdf_name"]
    }

    current_index = start_index + 1
    while current_index < len(intermediate_data):
        current_entry = intermediate_data[current_index]

        # Check if we're still under the same header
        if current_entry["header"] != merged_entry["header"]:
            break

        # Check if current entry matches any portfolio company
        # (substring test: a company mentioned mid-paragraph also stops merging)
        content_match = any(company in current_entry.get("content", "")
                          for company in portfolio_company_list)
        table_match = False
        for table in current_entry.get("table_content", []):
            if extract_portfolio_companies_from_table(table):
                table_match = True
                break

        if content_match or table_match:
            break

        # Merge content
        if "content" in current_entry:
            if merged_entry["content"]:
                merged_entry["content"] += "\n" + current_entry["content"]
            else:
                merged_entry["content"] = current_entry["content"]

        # Merge tables
        # NOTE(review): extend() mutates the list object borrowed from
        # intermediate_data[start_index], so the source entry's
        # table_content grows too — confirm this aliasing is intended.
        if "table_content" in current_entry:
            merged_entry["table_content"].extend(current_entry["table_content"])

        current_index += 1

    return merged_entry, current_index
|
| 120 |
+
|
| 121 |
+
def process_table_page_ids(merged_output):
    """Merge each entry's page_number with the table pages it references.

    For every merged section that carries ``table_content``, the existing
    comma-separated ``page_number`` value is unioned with each table's
    ``metadata.table_page_id`` and rewritten as a sorted, comma-separated
    string of unique page numbers.

    Args:
        merged_output: list of merged section dicts (mutated in place).

    Returns:
        The same list, with ``page_number`` values updated.
    """
    for current_merged_entry in merged_output:
        # Only process entries that have table_content.
        if 'table_content' not in current_merged_entry:
            continue

        # Seed with the existing page numbers; drop empty/whitespace
        # fragments so a value like "3," cannot crash the int-keyed sort
        # below with int('') (previous behavior).
        raw_pages = current_merged_entry.get('page_number') or ''
        page_numbers = {part.strip() for part in raw_pages.split(',') if part.strip()}

        # Add unique page numbers from table_content metadata.
        for table in current_merged_entry['table_content']:
            if 'metadata' in table and 'table_page_id' in table['metadata']:
                page_numbers.add(str(table['metadata']['table_page_id']))

        # Rewrite as sorted, unique, comma-separated numbers.
        if page_numbers:
            current_merged_entry['page_number'] = ','.join(sorted(page_numbers, key=int))

    return merged_output
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
################################################################################################################
|
| 152 |
+
## Below function for more than one occurence of underlying_assets
|
| 153 |
+
import re

# stopwords to remove (customize for your use case)
STOPWORDS = {"invoice", "copy", "draft", "statement", "report", "doc"}
LEGAL_SUFFIXES = {"pvt", "ltd", "private", "limited", "inc", "co", "company", "llc"}


def clean_company_name(raw_name: str) -> str:
    """Normalize a raw company-name string to a clean, title-cased core name.

    Strips dates, standalone numbers, generic document stopwords and legal
    suffixes (Pvt/Ltd/Inc/...), keeping only the distinctive name tokens.
    """
    # Lower-case and trim so the token filters below are case-insensitive.
    text = raw_name.strip().lower()

    # Drop dates in YYYY-MM-DD / YYYY/MM/DD and DD-MM-YYYY / DD/MM/YYYY form.
    text = re.sub(r"\b\d{4}[-/]\d{2}[-/]\d{2}\b", "", text)
    text = re.sub(r"\b\d{2}[-/]\d{2}[-/]\d{4}\b", "", text)

    # Drop standalone numbers / codes.
    text = re.sub(r"\b\d+\b", "", text)

    # Tokenize on non-word characters, filter document stopwords first,
    # then strip legal suffixes while keeping the core name tokens.
    words = [w for w in re.split(r"\W+", text) if w and w not in STOPWORDS]
    core_words = [w for w in words if w not in LEGAL_SUFFIXES]

    # Reassemble and title-case the remaining tokens.
    return " ".join(core_words).strip().title()
|
| 185 |
+
|
| 186 |
+
def _dedupe_page_numbers(page_number):
    """Collapse a comma-separated page-number string to unique, stripped values (order preserved)."""
    nums = list(dict.fromkeys(str(page_number).split(",")))
    return ",".join(num.strip() for num in nums if num.strip())


def merge_portfolio_company_sections(intermediate_data, table_output_dir):
    """Merge all content and tables under the same portfolio company header
    until the next company header is found.

    Args:
        intermediate_data (list[dict]): Header-to-header document sections.
        table_output_dir (str): Folder of table images passed to the Ovis
            company-name extractor.

    Returns:
        tuple:
            - merged_output: list of merged document sections
            - fuzzy_matched_companies: companies fuzzy-matched in headers
              (accumulated across all entries)
            - portfolio_companies: all portfolio companies found in tables
    """
    # portfolio_companies = get_portfolio_company_list(intermediate_data)
    portfolio_companies = extract_company_names(table_image_folder=table_output_dir)

    print(f"Extracted portfolio companies: {portfolio_companies}")
    portfolio_companies = [clean_company_name(c) for c in portfolio_companies]
    print(f"Clean extracted portfolio companies: {portfolio_companies}")

    merged_output = []
    # BUG FIX: accumulate fuzzy matches across ALL entries.  Previously the
    # per-entry result overwrote this name each iteration (so only the last
    # entry's matches were returned) and a NameError was raised when
    # intermediate_data was empty.
    fuzzy_matched_companies = []
    current_chunk = None
    active_company = None

    for entry in intermediate_data:
        entry_copy = entry.copy()

        header_companies, entry_fuzzy_matches = match_company_names(entry["header"], portfolio_companies)
        for company in entry_fuzzy_matches:
            if company not in fuzzy_matched_companies:
                fuzzy_matched_companies.append(company)

        if header_companies:
            print("&" * 100)
            print("*" * 100)
            print("entry_header::", entry["header"])
            print("page number of header::", entry["page_number"])
            print("*" * 100)
            print("header_companies::", header_companies)
            print("*" * 100)

            # If we have an active chunk, finalize it before starting a new one.
            if current_chunk:
                merged_output.append(current_chunk)
                current_chunk = None
                active_company = None

            # Start a new chunk with the first matched company
            # (when multiple companies matched, we take the first one).
            active_company = header_companies[0]
            current_chunk = {
                "page_number": entry["page_number"],
                "pdf_name": entry["pdf_name"],
                "header": entry["header"],
                "label_name": entry["label_name"],
                "content": entry.get("content", ""),
                "table_content": entry.get("table_content", []),
                "matched_company": active_company,
            }

            # If multiple companies matched, create separate chunks for the others.
            for additional_company in header_companies[1:]:
                merged_output.append({
                    "page_number": entry["page_number"],
                    "pdf_name": entry["pdf_name"],
                    "header": entry["header"],
                    "label_name": entry["label_name"],
                    "content": entry.get("content", ""),
                    "table_content": entry.get("table_content", []),
                    "matched_company": additional_company,
                })

        elif current_chunk:
            # No new company detected: keep extending the current chunk.
            if "content" in entry:
                if current_chunk["content"]:
                    current_chunk["content"] += "\n\n" + entry["content"]
                    current_chunk["page_number"] += "," + str(entry["page_number"])
                    current_chunk["page_number"] = _dedupe_page_numbers(current_chunk["page_number"])
                else:
                    current_chunk["content"] = entry["content"]
                    current_chunk["page_number"] = str(entry["page_number"])

            if "table_content" in entry:
                current_chunk["table_content"].extend(entry["table_content"])
                if current_chunk["page_number"]:
                    # BUG FIX: entry["table_content"] is a LIST of table dicts,
                    # so the old check `"metadata" in entry["table_content"]`
                    # tested list membership and never fired.  Pull the
                    # table_page_id out of each table's metadata instead.
                    for table in entry["table_content"]:
                        if isinstance(table, dict) and "table_page_id" in table.get("metadata", {}):
                            current_chunk["page_number"] += "," + str(table["metadata"]["table_page_id"])

                    current_chunk["page_number"] += "," + str(entry["page_number"])
                    current_chunk["page_number"] = _dedupe_page_numbers(current_chunk["page_number"])

        else:
            # Content before any company section: pass through with unique page numbers.
            entry_copy = entry.copy()
            if "page_number" in entry_copy:
                entry_copy["page_number"] = _dedupe_page_numbers(entry_copy["page_number"])
            merged_output.append(entry_copy)

    # Add the last active chunk if it exists.
    if current_chunk:
        # BUG FIX: the deduped page numbers were previously written to
        # entry_copy instead of current_chunk, so the final chunk kept
        # duplicate page numbers.
        current_chunk["page_number"] = _dedupe_page_numbers(current_chunk["page_number"])
        merged_output.append(current_chunk)

    merged_output_new = process_table_page_ids(merged_output=merged_output)

    return merged_output_new, fuzzy_matched_companies, portfolio_companies
|
| 299 |
+
|
| 300 |
+
################################################################################################
|
| 301 |
+
|
 | 302 | 
+
 ## Below code for using abbreviation functionality
 | 
| 303 |
+
|
| 304 |
+
import re

def match_company_names(header_text: str, companies: List[str], threshold: int = FUZZY_MATCH_THRESHOLD) -> "tuple[list[str], list[str]]":
    """Match company names in a header, trying full text and abbreviations.

    First compares the header (and abbreviated forms of it) against each full
    company name; when that fails, compares the header against abbreviated
    forms of the company name.  Matching uses ``fuzz.partial_ratio``.

    Args:
        header_text: Section header text to search in.
        companies: Candidate portfolio company names.
        threshold: Minimum partial-ratio score (0-100) to accept a match.

    Returns:
        ``(matched_companies, fuzzy_matched_companies)``: both duplicate-free
        and order-preserving; currently identical, since every match is also
        recorded as a fuzzy match.  (BUG FIX: annotation previously claimed a
        single ``List[str]`` although a 2-tuple was returned.)
    """
    header_text = str(header_text).lower().strip()
    matched_companies = []
    fuzzy_matched_companies = []

    # Candidate abbreviated forms of the header itself.
    header_abbreviations = [
        ''.join(word[0] for word in header_text.split() if word),  # initials of each word
        re.sub(r'[aeiou\s]', '', header_text),                     # vowels and spaces removed
        header_text.replace(' ', '')                               # spaces removed
    ]

    for company in companies:
        company_lower = company.lower()

        # First check: header text (full or abbreviated) against company full name.
        for header_pattern in [header_text] + header_abbreviations:
            if fuzz.partial_ratio(header_pattern, company_lower) >= threshold:
                matched_companies.append(company)
                fuzzy_matched_companies.append(company)  # record as fuzzy match
                break
        else:
            # Second check: header text against company abbreviations.
            company_abbreviations = [
                ''.join(word[0] for word in company_lower.split() if word),  # initials of each word
                re.sub(r'[aeiou\s]', '', company_lower),                     # vowels and spaces removed
                company_lower.replace(' ', '')                               # spaces removed
            ]
            for company_pattern in company_abbreviations:
                if fuzz.partial_ratio(header_text, company_pattern) >= threshold:
                    matched_companies.append(company)
                    fuzzy_matched_companies.append(company)  # record as fuzzy match
                    break

    # Remove duplicates while preserving order.
    matched_companies = list(dict.fromkeys(matched_companies))
    fuzzy_matched_companies = list(dict.fromkeys(fuzzy_matched_companies))

    return matched_companies, fuzzy_matched_companies
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
################################################################################################################
|
| 349 |
+
|
| 350 |
+
def process_document_company_wise(
    intermediate_str_chunk_json: List[Dict],
    output_directory: str,
    file_name: str,
    table_output_directory: str,
) -> List[Dict]:
    """Merge a header-to-header chunk JSON company-wise and persist the result.

    Args:
        intermediate_str_chunk_json: Parsed chunk entries, or a JSON string of them.
        output_directory: Directory for the merged-output JSON (created if missing).
        file_name: Base name used for the output file.
        table_output_directory: Folder of table images used for company-name
            extraction.

    Returns:
        The merged content list, also written to
        ``<output_directory>/<file_name>_h2h_merged_output.json``.
    """
    # Accept a raw JSON string as well as already-parsed data.
    if isinstance(intermediate_str_chunk_json, str):
        intermediate_str_chunk_json = json.loads(intermediate_str_chunk_json)

    merged_content, matched_company_list, portfolio_company_list = merge_portfolio_company_sections(
        intermediate_str_chunk_json, table_output_directory
    )

    # BUG FIX: guard against an empty document before indexing [0].
    if merged_content:
        merged_content[0]["portfolio_companies_list_fuzzy_matched"] = matched_company_list
        merged_content[0]["portfolio_companies_list_before"] = portfolio_company_list

    print("matched_company_list::", matched_company_list)
    print("portfolio_company_list::", portfolio_company_list)

    # Ensure output directory exists.
    os.makedirs(output_directory, exist_ok=True)

    # Save output.
    output_path = os.path.join(output_directory, f"{file_name}_h2h_merged_output.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(merged_content, f, indent=4, ensure_ascii=False)
    print(f"Saved merged output to {output_path}")

    return merged_content
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
def read_json(file_path):
    """Read *file_path* as UTF-8 JSON and return the parsed object."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
# # Example usage
|
| 389 |
+
# Example usage
if __name__ == "__main__":
    input_str_chunk_json_path = "/shared_disk/kushal/db_str_chunking/new_ws_structured_code/Triton2023Q4_patria_sample_output/Triton2023Q4_patria_sample_json_output/Triton2023Q4_patria_sample_final_h2h_extraction.json"
    input_json = read_json(input_str_chunk_json_path)

    # BUG FIX: process_document_company_wise requires table_output_directory;
    # the previous call omitted it and raised TypeError before doing any work.
    # TODO(review): confirm the correct table-image folder for this sample run.
    result = process_document_company_wise(
        intermediate_str_chunk_json=input_json,
        output_directory="db_structured_chunking/structure_chunking/src/iqeq_modification/testing_sample/output",
        file_name="sample_report",
        table_output_directory="db_structured_chunking/structure_chunking/src/iqeq_modification/testing_sample/table_images",
    )

    print("Processing complete.")
    # print(json.dumps(result, indent=2))
|
| 402 |
+
|
rabbitmq_config_investor_report.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os

# RabbitMQ connection configuration.
# Every value can be overridden via environment variables; the literals below
# are fallback defaults only.
RABBITMQ = {
    "HOST": os.getenv("RABBITMQ_HOST", "10.221.162.2"),
    "PORT": int(os.getenv("RABBITMQ_PORT", 5672)),
    "VIRTUAL_HOST": os.getenv("RABBITMQ_VHOST", "/"),
    "USERNAME": os.getenv("RABBITMQ_USER", "iqeq"),
    # SECURITY NOTE(review): a real-looking credential is hard-coded as the
    # fallback default — move it to the environment / a secrets store and
    # drop the literal.
    "PASSWORD": os.getenv("RABBITMQ_PASS", "Wissen@123"),
    # Exchange settings
    "EXCHANGE_NAME": os.getenv("RABBITMQ_EXCHANGE", "priority_topic_exchange"),
    "EXCHANGE_TYPE": os.getenv("RABBITMQ_EXCHANGE_TYPE", "topic"),
    # Queue names
    "QUEUES": {
        "INPUT_FILE_QUEUE": os.getenv("INPUT_FILE_QUEUE", "structure_chunking_input_file_queue"),
        # "FILE_RESPONSE_QUEUE": os.getenv("FILE_RESPONSE_QUEUE", "structure_chunking_file_response_queue"),,
        "FILE_RESPONSE_QUEUE": os.getenv("FILE_RESPONSE_QUEUE", "IQEQ_Response")
    }
}
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|