Spaces:

heerjtdev
/

layout_latex

Running

App Files Files Community

heerjtdev committed 20 days ago

Commit

ee597c6

verified ·

1 Parent(s): f54d98f

Update working_yolo_pipeline.py

Browse files

Files changed (1) hide show

working_yolo_pipeline.py +204 -9

working_yolo_pipeline.py CHANGED Viewed

@@ -1564,13 +1564,181 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
 # ============================================================================
 def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
     print("\n" + "=" * 80)
     print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
     print("=" * 80)
     try:
         with open(input_path, 'r', encoding='utf-8') as f:
             predictions_by_page = json.load(f)
     except Exception as e:
         print(f"❌ Error loading raw prediction file: {e}")
         return None
@@ -1579,6 +1747,9 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
     for page_item in predictions_by_page:
         if isinstance(page_item, dict) and 'data' in page_item:
             predictions.extend(page_item['data'])
     structured_data = []
     current_item = None
@@ -1593,20 +1764,27 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
     def finalize_passage_to_item(item, passage_buffer):
         if passage_buffer:
             passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
             if item.get('passage'):
                 item['passage'] += ' ' + passage_text
             else:
                 item['passage'] = passage_text
         passage_buffer.clear()
-    for item in predictions:
         word = item['word']
         label = item['predicted_label']
         entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
         current_text_buffer.append(word)
         previous_entity_type = last_entity_type
         is_passage_label = (entity_type == 'PASSAGE')
         if not first_question_started:
             if label != 'B-QUESTION' and not is_passage_label:
                 just_finished_i_option = False
@@ -1620,9 +1798,11 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
                 continue
         if label == 'B-QUESTION':
             if not first_question_started:
                 header_text = ' '.join(current_text_buffer[:-1]).strip()
                 if header_text or current_passage_buffer:
                     metadata_item = {'type': 'METADATA', 'passage': ''}
                     finalize_passage_to_item(metadata_item, current_passage_buffer)
                     if header_text: metadata_item['text'] = header_text
@@ -1634,6 +1814,7 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
                 finalize_passage_to_item(current_item, current_passage_buffer)
                 current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
                 structured_data.append(current_item)
                 current_text_buffer = [word]
             current_item = {
@@ -1647,37 +1828,46 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
         if current_item is not None:
             if is_in_new_passage:
-                # 🔑 Robust Initialization and Appending for 'new_passage'
                 if 'new_passage' not in current_item:
                     current_item['new_passage'] = word
                 else:
                     current_item['new_passage'] += f' {word}'
                 if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
                     is_in_new_passage = False
-                if label.startswith(('B-', 'I-')): last_entity_type = entity_type
                 continue
             is_in_new_passage = False
             if label.startswith('B-'):
                 if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
                     finalize_passage_to_item(current_item, current_passage_buffer)
                     current_passage_buffer = []
                 last_entity_type = entity_type
                 if entity_type == 'PASSAGE':
                     if previous_entity_type == 'OPTION' and just_finished_i_option:
-                        current_item['new_passage'] = word  # Initialize the new passage start
                         is_in_new_passage = True
                     else:
                         current_passage_buffer.append(word)
                 elif entity_type == 'OPTION':
                     current_option_key = word
                     current_item['options'][current_option_key] = word
                     just_finished_i_option = False
                 elif entity_type == 'ANSWER':
                     current_item['answer'] = word
                     current_option_key = None
                     just_finished_i_option = False
                 elif entity_type == 'QUESTION':
                     current_item['question'] += f' {word}'
                     just_finished_i_option = False
@@ -1687,7 +1877,7 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
                     current_item['question'] += f' {word}'
                 elif entity_type == 'PASSAGE':
                     if previous_entity_type == 'OPTION' and just_finished_i_option:
-                        current_item['new_passage'] = word  # Initialize the new passage start
                         is_in_new_passage = True
                     else:
                         if not current_passage_buffer: last_entity_type = 'PASSAGE'
@@ -1697,6 +1887,7 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
                     just_finished_i_option = True
                 elif entity_type == 'ANSWER':
                     current_item['answer'] += f' {word}'
                 just_finished_i_option = (entity_type == 'OPTION')
             elif label == 'O':
@@ -1704,25 +1895,29 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
                     current_item['question'] += f' {word}'
                 just_finished_i_option = False
     if current_item is not None:
         finalize_passage_to_item(current_item, current_passage_buffer)
         current_item['text'] = ' '.join(current_text_buffer).strip()
         structured_data.append(current_item)
     for item in structured_data:
         item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
         if 'new_passage' in item:
             item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
     try:
         with open(output_path, 'w', encoding='utf-8') as f:
             json.dump(structured_data, f, indent=2, ensure_ascii=False)
-    except Exception:
-        pass
     return structured_data
 def create_query_text(entry: Dict[str, Any]) -> str:
     """Combines question and options into a single string for similarity matching."""
     query_parts = []

 # ============================================================================
+# def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
+#     print("\n" + "=" * 80)
+#     print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
+#     print("=" * 80)
+#     try:
+#         with open(input_path, 'r', encoding='utf-8') as f:
+#             predictions_by_page = json.load(f)
+#     except Exception as e:
+#         print(f"❌ Error loading raw prediction file: {e}")
+#         return None
+#     predictions = []
+#     for page_item in predictions_by_page:
+#         if isinstance(page_item, dict) and 'data' in page_item:
+#             predictions.extend(page_item['data'])
+#     structured_data = []
+#     current_item = None
+#     current_option_key = None
+#     current_passage_buffer = []
+#     current_text_buffer = []
+#     first_question_started = False
+#     last_entity_type = None
+#     just_finished_i_option = False
+#     is_in_new_passage = False
+#     def finalize_passage_to_item(item, passage_buffer):
+#         if passage_buffer:
+#             passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
+#             if item.get('passage'):
+#                 item['passage'] += ' ' + passage_text
+#             else:
+#                 item['passage'] = passage_text
+#         passage_buffer.clear()
+#     for item in predictions:
+#         word = item['word']
+#         label = item['predicted_label']
+#         entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
+#         current_text_buffer.append(word)
+#         previous_entity_type = last_entity_type
+#         is_passage_label = (entity_type == 'PASSAGE')
+#         if not first_question_started:
+#             if label != 'B-QUESTION' and not is_passage_label:
+#                 just_finished_i_option = False
+#                 is_in_new_passage = False
+#                 continue
+#             if is_passage_label:
+#                 current_passage_buffer.append(word)
+#                 last_entity_type = 'PASSAGE'
+#                 just_finished_i_option = False
+#                 is_in_new_passage = False
+#                 continue
+#         if label == 'B-QUESTION':
+#             if not first_question_started:
+#                 header_text = ' '.join(current_text_buffer[:-1]).strip()
+#                 if header_text or current_passage_buffer:
+#                     metadata_item = {'type': 'METADATA', 'passage': ''}
+#                     finalize_passage_to_item(metadata_item, current_passage_buffer)
+#                     if header_text: metadata_item['text'] = header_text
+#                     structured_data.append(metadata_item)
+#                 first_question_started = True
+#                 current_text_buffer = [word]
+#             if current_item is not None:
+#                 finalize_passage_to_item(current_item, current_passage_buffer)
+#                 current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
+#                 structured_data.append(current_item)
+#                 current_text_buffer = [word]
+#             current_item = {
+#                 'question': word, 'options': {}, 'answer': '', 'passage': '', 'text': ''
+#             }
+#             current_option_key = None
+#             last_entity_type = 'QUESTION'
+#             just_finished_i_option = False
+#             is_in_new_passage = False
+#             continue
+#         if current_item is not None:
+#             if is_in_new_passage:
+#                 # 🔑 Robust Initialization and Appending for 'new_passage'
+#                 if 'new_passage' not in current_item:
+#                     current_item['new_passage'] = word
+#                 else:
+#                     current_item['new_passage'] += f' {word}'
+#                 if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
+#                     is_in_new_passage = False
+#                 if label.startswith(('B-', 'I-')): last_entity_type = entity_type
+#                 continue
+#             is_in_new_passage = False
+#             if label.startswith('B-'):
+#                 if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
+#                     finalize_passage_to_item(current_item, current_passage_buffer)
+#                     current_passage_buffer = []
+#                 last_entity_type = entity_type
+#                 if entity_type == 'PASSAGE':
+#                     if previous_entity_type == 'OPTION' and just_finished_i_option:
+#                         current_item['new_passage'] = word  # Initialize the new passage start
+#                         is_in_new_passage = True
+#                     else:
+#                         current_passage_buffer.append(word)
+#                 elif entity_type == 'OPTION':
+#                     current_option_key = word
+#                     current_item['options'][current_option_key] = word
+#                     just_finished_i_option = False
+#                 elif entity_type == 'ANSWER':
+#                     current_item['answer'] = word
+#                     current_option_key = None
+#                     just_finished_i_option = False
+#                 elif entity_type == 'QUESTION':
+#                     current_item['question'] += f' {word}'
+#                     just_finished_i_option = False
+#             elif label.startswith('I-'):
+#                 if entity_type == 'QUESTION':
+#                     current_item['question'] += f' {word}'
+#                 elif entity_type == 'PASSAGE':
+#                     if previous_entity_type == 'OPTION' and just_finished_i_option:
+#                         current_item['new_passage'] = word  # Initialize the new passage start
+#                         is_in_new_passage = True
+#                     else:
+#                         if not current_passage_buffer: last_entity_type = 'PASSAGE'
+#                         current_passage_buffer.append(word)
+#                 elif entity_type == 'OPTION' and current_option_key is not None:
+#                     current_item['options'][current_option_key] += f' {word}'
+#                     just_finished_i_option = True
+#                 elif entity_type == 'ANSWER':
+#                     current_item['answer'] += f' {word}'
+#                 just_finished_i_option = (entity_type == 'OPTION')
+#             elif label == 'O':
+#                 if last_entity_type == 'QUESTION':
+#                     current_item['question'] += f' {word}'
+#                 just_finished_i_option = False
+#     if current_item is not None:
+#         finalize_passage_to_item(current_item, current_passage_buffer)
+#         current_item['text'] = ' '.join(current_text_buffer).strip()
+#         structured_data.append(current_item)
+#     for item in structured_data:
+#         item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
+#         if 'new_passage' in item:
+#             item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
+#     try:
+#         with open(output_path, 'w', encoding='utf-8') as f:
+#             json.dump(structured_data, f, indent=2, ensure_ascii=False)
+#     except Exception:
+#         pass
+#     return structured_data
 def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
     print("\n" + "=" * 80)
     print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
+    print(f"Source: {input_path}")
     print("=" * 80)
+    start_time = time.time()
     try:
         with open(input_path, 'r', encoding='utf-8') as f:
             predictions_by_page = json.load(f)
+        print(f"✅ Successfully loaded raw predictions ({len(predictions_by_page)} pages found)")
     except Exception as e:
         print(f"❌ Error loading raw prediction file: {e}")
         return None
     for page_item in predictions_by_page:
         if isinstance(page_item, dict) and 'data' in page_item:
             predictions.extend(page_item['data'])
+    total_words = len(predictions)
+    print(f"📋 Total words to process: {total_words}")
     structured_data = []
     current_item = None
     def finalize_passage_to_item(item, passage_buffer):
         if passage_buffer:
             passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
+            print(f"   ↳ [Buffer] Finalizing passage ({len(passage_buffer)} words) into current item")
             if item.get('passage'):
                 item['passage'] += ' ' + passage_text
             else:
                 item['passage'] = passage_text
         passage_buffer.clear()
+    # Iterate through every predicted word
+    for idx, item in enumerate(predictions):
         word = item['word']
         label = item['predicted_label']
         entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
         current_text_buffer.append(word)
         previous_entity_type = last_entity_type
         is_passage_label = (entity_type == 'PASSAGE')
+        # --- LOGGING: Report progress whenever a B- label is encountered ---
+        if label.startswith('B-'):
+             print(f"[Word {idx}/{total_words}] Found Label: {label} | Word: '{word}'")
         if not first_question_started:
             if label != 'B-QUESTION' and not is_passage_label:
                 just_finished_i_option = False
                 continue
         if label == 'B-QUESTION':
+            print(f"🔍 Detection: New Question Started at word {idx}")
             if not first_question_started:
                 header_text = ' '.join(current_text_buffer[:-1]).strip()
                 if header_text or current_passage_buffer:
+                    print(f"   -> Creating METADATA item for text found before first question")
                     metadata_item = {'type': 'METADATA', 'passage': ''}
                     finalize_passage_to_item(metadata_item, current_passage_buffer)
                     if header_text: metadata_item['text'] = header_text
                 finalize_passage_to_item(current_item, current_passage_buffer)
                 current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
                 structured_data.append(current_item)
+                print(f"   -> Saved Question. Total structured items so far: {len(structured_data)}")
                 current_text_buffer = [word]
             current_item = {
         if current_item is not None:
             if is_in_new_passage:
                 if 'new_passage' not in current_item:
                     current_item['new_passage'] = word
                 else:
                     current_item['new_passage'] += f' {word}'
                 if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
+                    print(f"   ↳ [State] Exiting new_passage mode at label {label}")
                     is_in_new_passage = False
+                if label.startswith(('B-', 'I-')):
+                    last_entity_type = entity_type
                 continue
             is_in_new_passage = False
             if label.startswith('B-'):
                 if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
                     finalize_passage_to_item(current_item, current_passage_buffer)
                     current_passage_buffer = []
                 last_entity_type = entity_type
                 if entity_type == 'PASSAGE':
                     if previous_entity_type == 'OPTION' and just_finished_i_option:
+                        print(f"   ↳ [State] Transitioning to new_passage (Option -> Passage boundary)")
+                        current_item['new_passage'] = word
                         is_in_new_passage = True
                     else:
                         current_passage_buffer.append(word)
                 elif entity_type == 'OPTION':
                     current_option_key = word
                     current_item['options'][current_option_key] = word
                     just_finished_i_option = False
                 elif entity_type == 'ANSWER':
                     current_item['answer'] = word
                     current_option_key = None
                     just_finished_i_option = False
                 elif entity_type == 'QUESTION':
                     current_item['question'] += f' {word}'
                     just_finished_i_option = False
                     current_item['question'] += f' {word}'
                 elif entity_type == 'PASSAGE':
                     if previous_entity_type == 'OPTION' and just_finished_i_option:
+                        current_item['new_passage'] = word
                         is_in_new_passage = True
                     else:
                         if not current_passage_buffer: last_entity_type = 'PASSAGE'
                     just_finished_i_option = True
                 elif entity_type == 'ANSWER':
                     current_item['answer'] += f' {word}'
                 just_finished_i_option = (entity_type == 'OPTION')
             elif label == 'O':
                     current_item['question'] += f' {word}'
                 just_finished_i_option = False
+    # Final wrap up
     if current_item is not None:
+        print(f"🏁 Finalizing the very last item...")
         finalize_passage_to_item(current_item, current_passage_buffer)
         current_item['text'] = ' '.join(current_text_buffer).strip()
         structured_data.append(current_item)
+    # Clean up and regex replacement
     for item in structured_data:
         item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
         if 'new_passage' in item:
             item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
+    print(f"💾 Saving {len(structured_data)} items to {output_path}")
     try:
         with open(output_path, 'w', encoding='utf-8') as f:
             json.dump(structured_data, f, indent=2, ensure_ascii=False)
+        print(f"✅ Decoding Complete. Total time: {time.time() - start_time:.2f}s")
+    except Exception as e:
+        print(f"⚠️ Error saving final JSON: {e}")
     return structured_data
 def create_query_text(entry: Dict[str, Any]) -> str:
     """Combines question and options into a single string for similarity matching."""
     query_parts = []