Spaces:
Build error
Build error
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +285 -259
working_yolo_pipeline.py
CHANGED
|
@@ -1202,14 +1202,6 @@
|
|
| 1202 |
|
| 1203 |
|
| 1204 |
|
| 1205 |
-
|
| 1206 |
-
|
| 1207 |
-
|
| 1208 |
-
|
| 1209 |
-
|
| 1210 |
-
|
| 1211 |
-
|
| 1212 |
-
|
| 1213 |
|
| 1214 |
|
| 1215 |
|
|
@@ -1598,49 +1590,6 @@ def get_word_data_for_detection(page: fitz.Page, top_margin_percent=0.10, bottom
|
|
| 1598 |
return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
|
| 1599 |
|
| 1600 |
|
| 1601 |
-
# def calculate_x_gutters(word_data: list, params: Dict) -> List[int]:
|
| 1602 |
-
# if not word_data: return []
|
| 1603 |
-
# x_points = []
|
| 1604 |
-
# for _, x1, _, x2, _ in word_data: x_points.extend([x1, x2])
|
| 1605 |
-
# max_x = max(x_points)
|
| 1606 |
-
# bin_size = params['cluster_bin_size']
|
| 1607 |
-
# num_bins = int(np.ceil(max_x / bin_size))
|
| 1608 |
-
# hist, bin_edges = np.histogram(x_points, bins=num_bins, range=(0, max_x))
|
| 1609 |
-
# smoothed_hist = gaussian_filter1d(hist.astype(float), sigma=params['cluster_smoothing'])
|
| 1610 |
-
# inverted_signal = np.max(smoothed_hist) - smoothed_hist
|
| 1611 |
-
#
|
| 1612 |
-
# peaks, properties = find_peaks(
|
| 1613 |
-
# inverted_signal, height=0, distance=params['cluster_min_width'] / bin_size
|
| 1614 |
-
# )
|
| 1615 |
-
#
|
| 1616 |
-
# if not peaks.size: return []
|
| 1617 |
-
#
|
| 1618 |
-
# threshold_value = np.percentile(smoothed_hist, params['cluster_threshold_percentile'])
|
| 1619 |
-
# inverted_threshold = np.max(smoothed_hist) - threshold_value
|
| 1620 |
-
# significant_peaks = peaks[properties['peak_heights'] >= inverted_threshold]
|
| 1621 |
-
# separator_x_coords = [int(bin_edges[p]) for p in significant_peaks]
|
| 1622 |
-
#
|
| 1623 |
-
# final_separators = []
|
| 1624 |
-
# prominence_threshold = params['cluster_prominence'] * np.max(smoothed_hist)
|
| 1625 |
-
#
|
| 1626 |
-
# for x_coord in separator_x_coords:
|
| 1627 |
-
# bin_idx = np.searchsorted(bin_edges, x_coord) - 1
|
| 1628 |
-
# window_size = int(params['cluster_min_width'] / bin_size)
|
| 1629 |
-
#
|
| 1630 |
-
# left_start, left_end = max(0, bin_idx - window_size), bin_idx
|
| 1631 |
-
# right_start, right_end = bin_idx + 1, min(len(smoothed_hist), bin_idx + 1 + window_size)
|
| 1632 |
-
#
|
| 1633 |
-
# if left_end <= left_start or right_end <= right_start: continue
|
| 1634 |
-
#
|
| 1635 |
-
# avg_left_density = np.mean(smoothed_hist[left_start:left_end])
|
| 1636 |
-
# avg_right_density = np.mean(smoothed_hist[right_start:right_end])
|
| 1637 |
-
#
|
| 1638 |
-
# if avg_left_density >= prominence_threshold and avg_right_density >= prominence_threshold:
|
| 1639 |
-
# final_separators.append(x_coord)
|
| 1640 |
-
#
|
| 1641 |
-
# return sorted(final_separators)
|
| 1642 |
-
|
| 1643 |
-
|
| 1644 |
def calculate_x_gutters(word_data: list, params: Dict) -> List[int]:
|
| 1645 |
"""Calculates the X-axis histogram and detects significant gutters."""
|
| 1646 |
if not word_data: return []
|
|
@@ -1953,6 +1902,9 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
|
|
| 1953 |
final_page_predictions = []
|
| 1954 |
CHUNK_SIZE = 500
|
| 1955 |
|
|
|
|
|
|
|
|
|
|
| 1956 |
for page_data in preprocessed_data:
|
| 1957 |
page_num_1_based = page_data['page_number']
|
| 1958 |
page_num_0_based = page_num_1_based - 1
|
|
@@ -2092,189 +2044,131 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
|
|
| 2092 |
|
| 2093 |
|
| 2094 |
|
| 2095 |
-
|
| 2096 |
-
|
| 2097 |
-
#
|
| 2098 |
-
|
| 2099 |
-
|
| 2100 |
-
|
| 2101 |
-
#
|
| 2102 |
-
|
| 2103 |
-
|
| 2104 |
-
|
| 2105 |
-
|
| 2106 |
-
|
| 2107 |
-
#
|
| 2108 |
-
|
| 2109 |
-
|
| 2110 |
-
|
| 2111 |
-
#
|
| 2112 |
-
|
| 2113 |
-
|
| 2114 |
-
|
| 2115 |
-
|
| 2116 |
-
|
| 2117 |
-
|
| 2118 |
-
|
| 2119 |
-
|
| 2120 |
-
|
| 2121 |
-
|
| 2122 |
-
|
| 2123 |
-
|
| 2124 |
-
|
| 2125 |
-
|
| 2126 |
-
|
| 2127 |
-
|
| 2128 |
-
|
| 2129 |
-
|
| 2130 |
-
|
| 2131 |
-
|
| 2132 |
-
|
| 2133 |
-
|
| 2134 |
-
|
| 2135 |
-
|
| 2136 |
-
|
| 2137 |
-
|
| 2138 |
-
|
| 2139 |
-
|
| 2140 |
-
|
| 2141 |
-
|
| 2142 |
-
|
| 2143 |
-
|
| 2144 |
-
|
| 2145 |
-
|
| 2146 |
-
|
| 2147 |
-
|
| 2148 |
-
|
| 2149 |
-
|
| 2150 |
-
|
| 2151 |
-
|
| 2152 |
-
|
| 2153 |
-
|
| 2154 |
-
|
| 2155 |
-
|
| 2156 |
-
#
|
| 2157 |
-
|
| 2158 |
-
|
| 2159 |
-
|
| 2160 |
-
|
| 2161 |
-
|
| 2162 |
-
|
| 2163 |
-
|
| 2164 |
-
|
| 2165 |
-
|
| 2166 |
-
|
| 2167 |
-
|
| 2168 |
-
|
| 2169 |
-
|
| 2170 |
-
|
| 2171 |
-
|
| 2172 |
-
|
| 2173 |
-
|
| 2174 |
-
|
| 2175 |
-
|
| 2176 |
-
|
| 2177 |
-
|
| 2178 |
-
|
| 2179 |
-
|
| 2180 |
-
|
| 2181 |
-
|
| 2182 |
-
|
| 2183 |
-
|
| 2184 |
-
|
| 2185 |
-
|
| 2186 |
-
|
| 2187 |
-
|
| 2188 |
-
|
| 2189 |
-
|
| 2190 |
-
|
| 2191 |
-
|
| 2192 |
-
|
| 2193 |
-
|
| 2194 |
-
|
| 2195 |
-
|
| 2196 |
-
|
| 2197 |
-
|
| 2198 |
-
|
| 2199 |
-
|
| 2200 |
-
|
| 2201 |
-
|
| 2202 |
-
|
| 2203 |
-
|
| 2204 |
-
|
| 2205 |
-
|
| 2206 |
-
|
| 2207 |
-
|
| 2208 |
-
|
| 2209 |
-
|
| 2210 |
-
|
| 2211 |
-
|
| 2212 |
-
# sub_bboxes_pdf = chunk_bboxes_pdf[i:i + CHUNK_SIZE]
|
| 2213 |
-
#
|
| 2214 |
-
# if not sub_words: continue
|
| 2215 |
-
#
|
| 2216 |
-
# # Tokenization and Model Call (Inference)
|
| 2217 |
-
# encoded_input = tokenizer(
|
| 2218 |
-
# sub_words, boxes=sub_bboxes, truncation=True, padding="max_length",
|
| 2219 |
-
# max_length=512, return_tensors="pt"
|
| 2220 |
-
# )
|
| 2221 |
-
#
|
| 2222 |
-
# input_ids = encoded_input['input_ids'].to(device)
|
| 2223 |
-
# bbox = encoded_input['bbox'].to(device)
|
| 2224 |
-
# attention_mask = encoded_input['attention_mask'].to(device)
|
| 2225 |
-
#
|
| 2226 |
-
# with torch.no_grad():
|
| 2227 |
-
# predictions_int_list = model(input_ids, bbox, attention_mask)
|
| 2228 |
-
#
|
| 2229 |
-
# if not predictions_int_list: continue
|
| 2230 |
-
#
|
| 2231 |
-
# # Post-processing and prediction mapping (unchanged)
|
| 2232 |
-
# predictions_int = predictions_int_list[0]
|
| 2233 |
-
# word_ids = encoded_input.word_ids()
|
| 2234 |
-
# word_idx_to_pred_id = {}
|
| 2235 |
-
#
|
| 2236 |
-
# for token_idx, word_idx in enumerate(word_ids):
|
| 2237 |
-
# if word_idx is not None and word_idx < len(sub_words):
|
| 2238 |
-
# if word_idx not in word_idx_to_pred_id:
|
| 2239 |
-
# word_idx_to_pred_id[word_idx] = predictions_int[token_idx]
|
| 2240 |
-
#
|
| 2241 |
-
# for current_word_idx in range(len(sub_words)):
|
| 2242 |
-
# pred_id_or_tensor = word_idx_to_pred_id.get(current_word_idx, 0)
|
| 2243 |
-
# pred_id = pred_id_or_tensor.item() if torch.is_tensor(pred_id_or_tensor) else pred_id_or_tensor
|
| 2244 |
-
# predicted_label = ID_TO_LABEL[pred_id]
|
| 2245 |
-
#
|
| 2246 |
-
# page_raw_predictions.append({
|
| 2247 |
-
# "word": sub_words[current_word_idx],
|
| 2248 |
-
# "bbox": sub_bboxes_pdf[current_word_idx],
|
| 2249 |
-
# "predicted_label": predicted_label,
|
| 2250 |
-
# "page_number": page_num_1_based
|
| 2251 |
-
# })
|
| 2252 |
-
#
|
| 2253 |
-
# batch_inference_time = time.time() - start_time_inference_batch
|
| 2254 |
-
# total_inference_time += batch_inference_time
|
| 2255 |
-
# # Optional: Log per-batch inference time if needed for deep debugging
|
| 2256 |
-
# # print(f" [LOG] Batch inference ({len(sub_words)} words) took {batch_inference_time:.3f}s.")
|
| 2257 |
-
#
|
| 2258 |
-
# if page_raw_predictions:
|
| 2259 |
-
# final_page_predictions.append({
|
| 2260 |
-
# "page_number": page_num_1_based,
|
| 2261 |
-
# "data": page_raw_predictions
|
| 2262 |
-
# })
|
| 2263 |
-
#
|
| 2264 |
-
# print(f" [LOG] Total inference time for Page {page_num_1_based} (all batches): {total_inference_time:.2f}s")
|
| 2265 |
-
# print(f" [LOG] Total processing time for Page {page_num_1_based}: {time.time() - start_time_page:.2f}s")
|
| 2266 |
-
#
|
| 2267 |
-
# doc.close()
|
| 2268 |
-
#
|
| 2269 |
-
# total_elapsed_time = time.time() - start_time_overall
|
| 2270 |
-
# print(f"✅ LayoutLMv3 inference complete. Predicted tags for {len(final_page_predictions)} pages.")
|
| 2271 |
-
# print(f" [LOG] Overall LayoutLMv3 Inference Pipeline Duration: {total_elapsed_time:.2f}s.")
|
| 2272 |
-
# return final_page_predictions
|
| 2273 |
|
| 2274 |
|
| 2275 |
# ============================================================================
|
| 2276 |
# --- PHASE 3: BIO TO STRUCTURED JSON DECODER (Modified for In-Memory Return) ---
|
| 2277 |
# ============================================================================
|
|
|
|
|
|
|
|
|
|
| 2278 |
|
| 2279 |
def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
|
| 2280 |
"""
|
|
@@ -2333,31 +2227,41 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
|
|
| 2333 |
entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
|
| 2334 |
current_text_buffer.append(word)
|
| 2335 |
previous_entity_type = last_entity_type
|
| 2336 |
-
is_passage_label = (
|
| 2337 |
|
| 2338 |
-
|
| 2339 |
-
|
| 2340 |
-
|
| 2341 |
-
continue
|
| 2342 |
|
| 2343 |
-
|
| 2344 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2345 |
current_passage_buffer.append(word)
|
| 2346 |
last_entity_type = 'PASSAGE'
|
| 2347 |
-
|
| 2348 |
-
|
| 2349 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2350 |
|
| 2351 |
if label == 'B-QUESTION':
|
| 2352 |
if not first_question_started:
|
|
|
|
| 2353 |
header_text = ' '.join(current_text_buffer[:-1]).strip()
|
| 2354 |
if header_text or current_passage_buffer:
|
| 2355 |
metadata_item = {'type': 'METADATA', 'passage': ''}
|
| 2356 |
-
|
| 2357 |
-
|
| 2358 |
-
if header_text:
|
| 2359 |
-
metadata_item['text'] = header_text
|
| 2360 |
-
elif header_text:
|
| 2361 |
metadata_item['text'] = header_text
|
| 2362 |
structured_data.append(metadata_item)
|
| 2363 |
first_question_started = True
|
|
@@ -2382,8 +2286,13 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
|
|
| 2382 |
is_in_new_passage = False
|
| 2383 |
continue
|
| 2384 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2385 |
if current_item is not None:
|
| 2386 |
if is_in_new_passage:
|
|
|
|
| 2387 |
current_item['new_passage'] += f' {word}'
|
| 2388 |
if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
|
| 2389 |
is_in_new_passage = False
|
|
@@ -2392,18 +2301,25 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
|
|
| 2392 |
continue
|
| 2393 |
|
| 2394 |
is_in_new_passage = False
|
|
|
|
|
|
|
| 2395 |
if label.startswith('B-'):
|
| 2396 |
-
|
|
|
|
|
|
|
| 2397 |
finalize_passage_to_item(current_item, current_passage_buffer)
|
| 2398 |
current_passage_buffer = []
|
|
|
|
| 2399 |
last_entity_type = entity_type
|
| 2400 |
|
| 2401 |
if entity_type == 'PASSAGE':
|
|
|
|
|
|
|
| 2402 |
if previous_entity_type == 'OPTION' and just_finished_i_option:
|
| 2403 |
current_item['new_passage'] = word
|
| 2404 |
is_in_new_passage = True
|
| 2405 |
else:
|
| 2406 |
-
current_passage_buffer.append(word)
|
| 2407 |
elif entity_type == 'OPTION':
|
| 2408 |
current_option_key = word
|
| 2409 |
current_item['options'][current_option_key] = word
|
|
@@ -2416,20 +2332,22 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
|
|
| 2416 |
current_item['question'] += f' {word}'
|
| 2417 |
just_finished_i_option = False
|
| 2418 |
|
|
|
|
| 2419 |
elif label.startswith('I-'):
|
| 2420 |
if entity_type == 'QUESTION' and current_item.get('question'):
|
| 2421 |
current_item['question'] += f' {word}'
|
| 2422 |
last_entity_type = 'QUESTION'
|
| 2423 |
just_finished_i_option = False
|
| 2424 |
elif entity_type == 'PASSAGE':
|
|
|
|
| 2425 |
if previous_entity_type == 'OPTION' and just_finished_i_option:
|
| 2426 |
current_item['new_passage'] = word
|
| 2427 |
is_in_new_passage = True
|
| 2428 |
else:
|
| 2429 |
-
|
| 2430 |
-
|
| 2431 |
if last_entity_type == 'PASSAGE' or not current_passage_buffer:
|
| 2432 |
-
current_passage_buffer.append(word)
|
| 2433 |
last_entity_type = 'PASSAGE'
|
| 2434 |
just_finished_i_option = False
|
| 2435 |
elif entity_type == 'OPTION' and last_entity_type == 'OPTION' and current_option_key is not None:
|
|
@@ -2441,11 +2359,21 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
|
|
| 2441 |
else:
|
| 2442 |
just_finished_i_option = False
|
| 2443 |
|
|
|
|
| 2444 |
elif label == 'O':
|
| 2445 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2446 |
current_item['question'] += f' {word}'
|
| 2447 |
just_finished_i_option = False
|
| 2448 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2449 |
# --- Finalize last item ---
|
| 2450 |
if current_item is not None:
|
| 2451 |
finalize_passage_to_item(current_item, current_passage_buffer)
|
|
@@ -2475,6 +2403,74 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
|
|
| 2475 |
return structured_data
|
| 2476 |
|
| 2477 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2478 |
# ============================================================================
|
| 2479 |
# --- PHASE 4: IMAGE EMBEDDING (Modified for In-Memory Return) ---
|
| 2480 |
# ============================================================================
|
|
@@ -2554,7 +2550,7 @@ def embed_images_as_base64_in_memory(structured_data: List[Dict[str, Any]], figu
|
|
| 2554 |
# --- MAIN FUNCTION (The Callable Interface) ---
|
| 2555 |
# ============================================================================
|
| 2556 |
|
| 2557 |
-
def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Optional[List[Dict[str, Any]]]:
|
| 2558 |
"""
|
| 2559 |
Executes the full document analysis pipeline: YOLO/OCR -> LayoutLMv3 -> Structured JSON -> Base64 Image Embed.
|
| 2560 |
|
|
@@ -2638,6 +2634,16 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Op
|
|
| 2638 |
print("Pipeline aborted: Failed to convert BIO tags to structured data in Phase 3.")
|
| 2639 |
return None
|
| 2640 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2641 |
# --- D. PHASE 4: IMAGE EMBEDDING (Base64) ---
|
| 2642 |
final_result = embed_images_as_base64_in_memory(
|
| 2643 |
structured_data_list,
|
|
@@ -2675,6 +2681,7 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str) -> Op
|
|
| 2675 |
return final_result
|
| 2676 |
|
| 2677 |
|
|
|
|
| 2678 |
if __name__ == "__main__":
|
| 2679 |
parser = argparse.ArgumentParser(
|
| 2680 |
description="Complete Document Analysis Pipeline (YOLO/OCR -> LayoutLMv3 -> Structured JSON -> Base64 Image Embed).")
|
|
@@ -2684,18 +2691,38 @@ if __name__ == "__main__":
|
|
| 2684 |
default=DEFAULT_LAYOUTLMV3_MODEL_PATH,
|
| 2685 |
help="Path to the saved LayoutLMv3-CRF PyTorch model checkpoint.")
|
| 2686 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2687 |
args = parser.parse_args()
|
| 2688 |
|
| 2689 |
-
|
| 2690 |
-
final_json_data = run_document_pipeline(args.input_pdf, args.layoutlmv3_model_path)
|
| 2691 |
|
| 2692 |
-
|
| 2693 |
-
|
| 2694 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2695 |
|
| 2696 |
-
|
| 2697 |
-
|
| 2698 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2699 |
with open(final_output_path, 'w', encoding='utf-8') as f:
|
| 2700 |
json.dump(final_json_data, f, indent=2, ensure_ascii=False)
|
| 2701 |
|
|
@@ -2703,4 +2730,3 @@ if __name__ == "__main__":
|
|
| 2703 |
|
| 2704 |
|
| 2705 |
|
| 2706 |
-
|
|
|
|
| 1202 |
|
| 1203 |
|
| 1204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1205 |
|
| 1206 |
|
| 1207 |
|
|
|
|
| 1590 |
return [d for d in word_data if d[2] >= y_min and d[4] <= y_max]
|
| 1591 |
|
| 1592 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1593 |
def calculate_x_gutters(word_data: list, params: Dict) -> List[int]:
|
| 1594 |
"""Calculates the X-axis histogram and detects significant gutters."""
|
| 1595 |
if not word_data: return []
|
|
|
|
| 1902 |
final_page_predictions = []
|
| 1903 |
CHUNK_SIZE = 500
|
| 1904 |
|
| 1905 |
+
|
| 1906 |
+
all_pages_word_level_results = []
|
| 1907 |
+
|
| 1908 |
for page_data in preprocessed_data:
|
| 1909 |
page_num_1_based = page_data['page_number']
|
| 1910 |
page_num_0_based = page_num_1_based - 1
|
|
|
|
| 2044 |
|
| 2045 |
|
| 2046 |
|
| 2047 |
+
def create_label_studio_span(page_results: List[Dict[str, Any]], start_idx: int, end_idx: int,
                             label: str, score: float = 0.99) -> Dict[str, Any]:
    """Create a Label Studio span with character-level offsets and bbox.

    The page text is taken to be every ``'word'`` in ``page_results`` joined
    by single spaces; the returned ``start``/``end`` offsets index into that
    string (``end`` is exclusive).

    Args:
        page_results: Word-level records for one page. Each record is read
            for its ``'word'`` and ``'bbox'`` keys; the bbox is indexed as
            ``[x0, y0, x1, y1]``.
        start_idx: Index of the first word of the entity (inclusive).
        end_idx: Index of the last word of the entity (inclusive).
        label: Entity label stored in the span's ``labels`` list.
        score: Value stored under ``"score"``. Defaults to ``0.99``, the
            constant that used to be hard-coded here.

    Returns:
        A dict with ``from_name``/``to_name``/``type`` set to
        ``"label"``/``"text"``/``"labels"``, a ``value`` dict holding the
        offsets, span text, label and enclosing bbox, and the ``score``.

    Raises:
        ValueError: If the requested span contains no words
            (``start_idx > end_idx``).
        IndexError: If an index in the span is outside ``page_results``.
    """
    # Index one by one (rather than slicing) so an out-of-range index still
    # raises IndexError instead of silently shortening the span.
    span_items = [page_results[i] for i in range(start_idx, end_idx + 1)]
    if not span_items:
        raise ValueError(
            f"Empty span: start_idx={start_idx} is greater than end_idx={end_idx}"
        )

    # Smallest box that encloses every word box of the entity.
    x0 = min(item['bbox'][0] for item in span_items)
    y0 = min(item['bbox'][1] for item in span_items)
    x1 = max(item['bbox'][2] for item in span_items)
    y1 = max(item['bbox'][3] for item in span_items)

    # The span starts right after the space-joined words that precede it.
    # When at least one word precedes the span, one extra character accounts
    # for the separating space.
    preceding_text = " ".join(item['word'] for item in page_results[:start_idx])
    start_char = len(preceding_text)
    if start_idx != 0:
        start_char += 1

    span_text = " ".join(item['word'] for item in span_items)
    end_char = start_char + len(span_text)

    return {
        "from_name": "label",
        "to_name": "text",
        "type": "labels",
        "value": {
            "start": start_char,
            "end": end_char,
            "text": span_text,
            "labels": [label],
            "bbox": {"x": x0, "y": y0, "width": x1 - x0, "height": y1 - y0}
        },
        "score": score
    }
|
| 2084 |
+
|
| 2085 |
+
|
| 2086 |
+
|
| 2087 |
+
|
| 2088 |
+
def _iter_bio_spans(labels):
    """Yield ``(start, end, tag)`` for every entity run in a list of BIO labels.

    ``start`` and ``end`` are inclusive word indices. A run opens on a ``B-``
    label and is extended by following ``I-`` labels that carry the same tag;
    any other label closes the open run. An ``I-`` label that does not extend
    an open run does not open a new one.
    """
    open_tag = None
    open_start = None

    for idx, label in enumerate(labels):
        tag = label.split('-', 1)[-1] if '-' in label else label

        # Matching I- tag: the open entity simply keeps growing.
        if label.startswith('I-') and open_tag == tag:
            continue

        # Every other label ends whatever entity is currently open.
        if open_tag:
            yield open_start, idx - 1, open_tag

        if label.startswith('B-'):
            open_tag, open_start = tag, idx
        else:
            open_tag = open_start = None

    # An entity still open at the end of the page runs to the last word.
    if open_tag:
        yield open_start, len(labels) - 1, open_tag


def convert_raw_predictions_to_label_studio(page_data_list, output_path: str) -> None:
    """Convert raw word-level predictions (grouped by page) to Label Studio format.

    One task is produced per page that has at least one word. The task text is
    the page's words joined by single spaces, and consecutive BIO-tagged words
    are stitched into labelled spans via ``create_label_studio_span``. The
    task list is written to ``output_path`` as UTF-8 JSON and progress is
    printed to stdout.

    Args:
        page_data_list: Iterable of dicts read for ``'page_number'`` and
            ``'data'``; each entry of ``'data'`` is read for ``'word'``,
            ``'bbox'`` and ``'predicted_label'``.
        output_path: Destination file for the JSON task list (overwritten).
    """
    final_tasks = []
    total_spans = 0

    print("\n[PHASE: LABEL STUDIO CONVERSION]")

    for page_data in page_data_list:
        page_num = page_data['page_number']
        page_results = page_data['data']
        if not page_results:
            # Pages without words produce no task at all.
            continue

        original_words = [r['word'] for r in page_results]
        text_string = " ".join(original_words)

        # BIO stitching
        labels = [r['predicted_label'] for r in page_results]
        results = [
            create_label_studio_span(page_results, start, end, tag)
            for start, end, tag in _iter_bio_spans(labels)
        ]

        total_spans += len(results)
        print(f" -> Page {page_num}: Generated {len(results)} labeled spans.")

        final_tasks.append({
            "data": {
                "text": text_string,
                "original_words": original_words,
                "original_bboxes": [r['bbox'] for r in page_results]
            },
            "annotations": [{"result": results}],
            "meta": {"page_number": page_num}
        })

    with open(output_path, "w", encoding='utf-8') as f:
        json.dump(final_tasks, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Label Studio tasks created and saved to {output_path}. Total {total_spans} spans.")
|
| 2163 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2164 |
|
| 2165 |
|
| 2166 |
# ============================================================================
|
| 2167 |
# --- PHASE 3: BIO TO STRUCTURED JSON DECODER (Modified for In-Memory Return) ---
|
| 2168 |
# ============================================================================
|
| 2169 |
+
#
|
| 2170 |
+
|
| 2171 |
+
|
| 2172 |
|
| 2173 |
def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
|
| 2174 |
"""
|
|
|
|
| 2227 |
entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
|
| 2228 |
current_text_buffer.append(word)
|
| 2229 |
previous_entity_type = last_entity_type
|
| 2230 |
+
is_passage_label = (entity_type == 'PASSAGE')
|
| 2231 |
|
| 2232 |
+
# ----------------------------------------------------------------------
|
| 2233 |
+
# --- MODIFICATION AREA 1: Pre-Question Content (Metadata/Passage) ---
|
| 2234 |
+
# ----------------------------------------------------------------------
|
|
|
|
| 2235 |
|
| 2236 |
+
# If we haven't started the first question yet
|
| 2237 |
+
if not first_question_started:
|
| 2238 |
+
# Skip 'O' and non-B/I-PASSAGE tags before the first B-QUESTION
|
| 2239 |
+
if label != 'B-QUESTION' and not is_passage_label:
|
| 2240 |
+
just_finished_i_option = False
|
| 2241 |
+
is_in_new_passage = False
|
| 2242 |
+
continue
|
| 2243 |
+
|
| 2244 |
+
# Handle PASSAGE tokens before the first B-QUESTION
|
| 2245 |
+
if is_passage_label:
|
| 2246 |
+
# B-PASSAGE or I-PASSAGE always appends to the buffer here
|
| 2247 |
current_passage_buffer.append(word)
|
| 2248 |
last_entity_type = 'PASSAGE'
|
| 2249 |
+
just_finished_i_option = False
|
| 2250 |
+
is_in_new_passage = False
|
| 2251 |
+
continue
|
| 2252 |
+
|
| 2253 |
+
# ----------------------------------------------------------------------
|
| 2254 |
+
# --- Standard B-QUESTION Start (Split/Finalize previous item) ---
|
| 2255 |
+
# ----------------------------------------------------------------------
|
| 2256 |
|
| 2257 |
if label == 'B-QUESTION':
|
| 2258 |
if not first_question_started:
|
| 2259 |
+
# Handle initial header/metadata
|
| 2260 |
header_text = ' '.join(current_text_buffer[:-1]).strip()
|
| 2261 |
if header_text or current_passage_buffer:
|
| 2262 |
metadata_item = {'type': 'METADATA', 'passage': ''}
|
| 2263 |
+
finalize_passage_to_item(metadata_item, current_passage_buffer)
|
| 2264 |
+
if header_text:
|
|
|
|
|
|
|
|
|
|
| 2265 |
metadata_item['text'] = header_text
|
| 2266 |
structured_data.append(metadata_item)
|
| 2267 |
first_question_started = True
|
|
|
|
| 2286 |
is_in_new_passage = False
|
| 2287 |
continue
|
| 2288 |
|
| 2289 |
+
# ----------------------------------------------------------------------
|
| 2290 |
+
# --- Processing tokens within an active Question (current_item) ---
|
| 2291 |
+
# ----------------------------------------------------------------------
|
| 2292 |
+
|
| 2293 |
if current_item is not None:
|
| 2294 |
if is_in_new_passage:
|
| 2295 |
+
# Handle passage continuation started after an option
|
| 2296 |
current_item['new_passage'] += f' {word}'
|
| 2297 |
if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
|
| 2298 |
is_in_new_passage = False
|
|
|
|
| 2301 |
continue
|
| 2302 |
|
| 2303 |
is_in_new_passage = False
|
| 2304 |
+
|
| 2305 |
+
# Case 1: Beginning of a new entity (B- tag)
|
| 2306 |
if label.startswith('B-'):
|
| 2307 |
+
|
| 2308 |
+
# Check for termination entities
|
| 2309 |
+
if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
|
| 2310 |
finalize_passage_to_item(current_item, current_passage_buffer)
|
| 2311 |
current_passage_buffer = []
|
| 2312 |
+
|
| 2313 |
last_entity_type = entity_type
|
| 2314 |
|
| 2315 |
if entity_type == 'PASSAGE':
|
| 2316 |
+
# MODIFICATION 2: B-PASSAGE always continues the current passage buffer
|
| 2317 |
+
# unless immediately following an I-OPTION (which starts 'new_passage')
|
| 2318 |
if previous_entity_type == 'OPTION' and just_finished_i_option:
|
| 2319 |
current_item['new_passage'] = word
|
| 2320 |
is_in_new_passage = True
|
| 2321 |
else:
|
| 2322 |
+
current_passage_buffer.append(word) # Append B-PASSAGE word
|
| 2323 |
elif entity_type == 'OPTION':
|
| 2324 |
current_option_key = word
|
| 2325 |
current_item['options'][current_option_key] = word
|
|
|
|
| 2332 |
current_item['question'] += f' {word}'
|
| 2333 |
just_finished_i_option = False
|
| 2334 |
|
| 2335 |
+
# Case 2: Inside an existing entity (I- tag)
|
| 2336 |
elif label.startswith('I-'):
|
| 2337 |
if entity_type == 'QUESTION' and current_item.get('question'):
|
| 2338 |
current_item['question'] += f' {word}'
|
| 2339 |
last_entity_type = 'QUESTION'
|
| 2340 |
just_finished_i_option = False
|
| 2341 |
elif entity_type == 'PASSAGE':
|
| 2342 |
+
# MODIFICATION 3: I-PASSAGE always continues the current passage buffer
|
| 2343 |
if previous_entity_type == 'OPTION' and just_finished_i_option:
|
| 2344 |
current_item['new_passage'] = word
|
| 2345 |
is_in_new_passage = True
|
| 2346 |
else:
|
| 2347 |
+
# Ensure last entity was PASSAGE or QUESTION/initial state to append
|
| 2348 |
+
if last_entity_type == 'QUESTION': last_entity_type = 'PASSAGE'
|
| 2349 |
if last_entity_type == 'PASSAGE' or not current_passage_buffer:
|
| 2350 |
+
current_passage_buffer.append(word) # Append I-PASSAGE word
|
| 2351 |
last_entity_type = 'PASSAGE'
|
| 2352 |
just_finished_i_option = False
|
| 2353 |
elif entity_type == 'OPTION' and last_entity_type == 'OPTION' and current_option_key is not None:
|
|
|
|
| 2359 |
else:
|
| 2360 |
just_finished_i_option = False
|
| 2361 |
|
| 2362 |
+
# Case 3: Outside any entity (O tag)
|
| 2363 |
elif label == 'O':
|
| 2364 |
+
# MODIFICATION 4: Skip 'O' tokens ONLY if the last active entity was PASSAGE.
|
| 2365 |
+
# Otherwise, default O tokens append to QUESTION (original logic preserved)
|
| 2366 |
+
if last_entity_type == 'PASSAGE':
|
| 2367 |
+
# Do nothing to the passage buffer and do not change last_entity_type
|
| 2368 |
+
pass
|
| 2369 |
+
elif last_entity_type == 'QUESTION' and current_item and 'question' in current_item:
|
| 2370 |
current_item['question'] += f' {word}'
|
| 2371 |
just_finished_i_option = False
|
| 2372 |
|
| 2373 |
+
# ----------------------------------------------------------------------
|
| 2374 |
+
# --- Finalization (Unchanged) ---
|
| 2375 |
+
# ----------------------------------------------------------------------
|
| 2376 |
+
|
| 2377 |
# --- Finalize last item ---
|
| 2378 |
if current_item is not None:
|
| 2379 |
finalize_passage_to_item(current_item, current_passage_buffer)
|
|
|
|
| 2403 |
return structured_data
|
| 2404 |
|
| 2405 |
|
| 2406 |
+
def correct_misaligned_options(structured_data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Move a misplaced EQUATION/FIGURE tag back to the empty option before it.

    For each pair of neighbouring options (in dict order) of every
    non-METADATA item: if option N holds nothing but its own identifier
    (e.g. ``'(A)'``) and option N+1 contains exactly two ``EQUATIONn`` /
    ``FIGUREn`` tags, the first tag is moved to option N and option N+1 is
    rebuilt as its identifier followed by the second tag.

    NOTE: when a correction is applied, option N+1 is rebuilt from its
    identifier and the remaining tag only, so any other text it contained is
    discarded.

    Args:
        structured_data: Items to fix. Each item is read for ``'type'`` and
            ``'options'`` (a mapping of option identifier to option text),
            and for ``'question'`` when logging a correction.

    Returns:
        The same ``structured_data`` list; option dicts are updated in place.
    """
    print("\n" + "=" * 80)
    print("--- 5. STARTING POST-PROCESSING: OPTION ALIGNMENT CORRECTION ---")
    print("=" * 80)

    # Regex to find all EQUATION/FIGURE tags
    tag_pattern = re.compile(r'(EQUATION\d+|FIGURE\d+)')

    corrected_count = 0

    for item in structured_data:
        if item.get('type') == 'METADATA':
            continue

        options = item.get('options')
        if not options or len(options) < 2:
            continue

        # Snapshot the keys: only values are rewritten below, and a fix made
        # for one pair is visible when the following pair is examined.
        option_keys = list(options)

        for current_key, next_key in zip(option_keys, option_keys[1:]):
            current_value = options[current_key].strip()
            next_value = options[next_key].strip()

            # Condition 1: the current option is "empty", i.e. it contains
            # nothing besides its own identifier.
            if current_value != current_key:
                continue

            # Condition 2: the next option, with its identifier removed,
            # contains exactly two content tags.
            content_in_next = next_value.replace(next_key, '', 1).strip()
            tags_in_next = tag_pattern.findall(content_in_next)
            if len(tags_in_next) != 2:
                continue

            tag_to_move, tag_to_keep = tags_in_next
            print(
                f" -> Correction applied in Item {item.get('question', '...')}: Moving '{tag_to_move}' from {next_key} to {current_key}.")

            # The first tag belongs to the empty option; the second one stays.
            options[current_key] = f"{current_key} {tag_to_move}".strip()
            options[next_key] = f"{next_key} {tag_to_keep}".strip()

            corrected_count += 1

    print(f"✅ Option alignment correction finished. Total corrections: {corrected_count}.")
    return structured_data
|
| 2472 |
+
|
| 2473 |
+
|
| 2474 |
# ============================================================================
|
| 2475 |
# --- PHASE 4: IMAGE EMBEDDING (Modified for In-Memory Return) ---
|
| 2476 |
# ============================================================================
|
|
|
|
| 2550 |
# --- MAIN FUNCTION (The Callable Interface) ---
|
| 2551 |
# ============================================================================
|
| 2552 |
|
| 2553 |
+
def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, label_studio_output_path: str) -> Optional[List[Dict[str, Any]]]:
|
| 2554 |
"""
|
| 2555 |
Executes the full document analysis pipeline: YOLO/OCR -> LayoutLMv3 -> Structured JSON -> Base64 Image Embed.
|
| 2556 |
|
|
|
|
| 2634 |
print("Pipeline aborted: Failed to convert BIO tags to structured data in Phase 3.")
|
| 2635 |
return None
|
| 2636 |
|
| 2637 |
+
structured_data_list = correct_misaligned_options(structured_data_list)
|
| 2638 |
+
|
| 2639 |
+
try:
|
| 2640 |
+
# CHANGE: Use the provided, persistent output path
|
| 2641 |
+
convert_raw_predictions_to_label_studio(page_raw_predictions_list, label_studio_output_path)
|
| 2642 |
+
print(f"✅ Label Studio output saved to: {label_studio_output_path}")
|
| 2643 |
+
except Exception as e:
|
| 2644 |
+
print(f"❌ Error during Label Studio conversion: {e}")
|
| 2645 |
+
|
| 2646 |
+
|
| 2647 |
# --- D. PHASE 4: IMAGE EMBEDDING (Base64) ---
|
| 2648 |
final_result = embed_images_as_base64_in_memory(
|
| 2649 |
structured_data_list,
|
|
|
|
| 2681 |
return final_result
|
| 2682 |
|
| 2683 |
|
| 2684 |
+
|
| 2685 |
if __name__ == "__main__":
|
| 2686 |
parser = argparse.ArgumentParser(
|
| 2687 |
description="Complete Document Analysis Pipeline (YOLO/OCR -> LayoutLMv3 -> Structured JSON -> Base64 Image Embed).")
|
|
|
|
| 2691 |
default=DEFAULT_LAYOUTLMV3_MODEL_PATH,
|
| 2692 |
help="Path to the saved LayoutLMv3-CRF PyTorch model checkpoint.")
|
| 2693 |
|
| 2694 |
+
# NEW ARGUMENT: Optional path for the Label Studio output
|
| 2695 |
+
parser.add_argument("--ls_output_path", type=str, default=None,
|
| 2696 |
+
help="Optional path to save the Label Studio JSON task file.")
|
| 2697 |
+
|
| 2698 |
args = parser.parse_args()
|
| 2699 |
|
| 2700 |
+
pdf_name = os.path.splitext(os.path.basename(args.input_pdf))[0]
|
|
|
|
| 2701 |
|
| 2702 |
+
# 1. Define Persistent Output Paths
|
| 2703 |
+
final_output_file_name = f"{pdf_name}_final_output_embedded.json"
|
| 2704 |
+
final_output_path = os.path.abspath(final_output_file_name)
|
| 2705 |
+
|
| 2706 |
+
# 2. Determine Label Studio Output Path
|
| 2707 |
+
# If not provided, create a default path next to the script
|
| 2708 |
+
if args.ls_output_path:
|
| 2709 |
+
ls_output_path = os.path.abspath(args.ls_output_path)
|
| 2710 |
+
else:
|
| 2711 |
+
ls_output_file_name = f"{pdf_name}_label_studio_tasks.json"
|
| 2712 |
+
ls_output_path = os.path.abspath(ls_output_file_name)
|
| 2713 |
|
| 2714 |
+
# --- Call the main function (Updated to include ls_output_path) ---
|
| 2715 |
+
print(f"\n[SETUP] Label Studio Output will be saved to: {ls_output_path}")
|
| 2716 |
|
| 2717 |
+
# NOTE: run_document_pipeline takes the Label Studio output path as its third argument
|
| 2718 |
+
final_json_data = run_document_pipeline(
|
| 2719 |
+
args.input_pdf,
|
| 2720 |
+
args.layoutlmv3_model_path,
|
| 2721 |
+
ls_output_path # <--- Passing the persistent path
|
| 2722 |
+
)
|
| 2723 |
+
|
| 2724 |
+
if final_json_data:
|
| 2725 |
+
# Save the final structured output
|
| 2726 |
with open(final_output_path, 'w', encoding='utf-8') as f:
|
| 2727 |
json.dump(final_json_data, f, indent=2, ensure_ascii=False)
|
| 2728 |
|
|
|
|
| 2730 |
|
| 2731 |
|
| 2732 |
|
|
|