heerjtdev committed on
Commit
ee597c6
·
verified ·
1 Parent(s): f54d98f

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +204 -9
working_yolo_pipeline.py CHANGED
@@ -1564,13 +1564,181 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
1564
  # ============================================================================
1565
 
1566
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1567
  def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
1568
  print("\n" + "=" * 80)
1569
  print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
 
1570
  print("=" * 80)
 
 
 
1571
  try:
1572
  with open(input_path, 'r', encoding='utf-8') as f:
1573
  predictions_by_page = json.load(f)
 
1574
  except Exception as e:
1575
  print(f"❌ Error loading raw prediction file: {e}")
1576
  return None
@@ -1579,6 +1747,9 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
1579
  for page_item in predictions_by_page:
1580
  if isinstance(page_item, dict) and 'data' in page_item:
1581
  predictions.extend(page_item['data'])
 
 
 
1582
 
1583
  structured_data = []
1584
  current_item = None
@@ -1593,20 +1764,27 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
1593
  def finalize_passage_to_item(item, passage_buffer):
1594
  if passage_buffer:
1595
  passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
 
1596
  if item.get('passage'):
1597
  item['passage'] += ' ' + passage_text
1598
  else:
1599
  item['passage'] = passage_text
1600
  passage_buffer.clear()
1601
 
1602
- for item in predictions:
 
1603
  word = item['word']
1604
  label = item['predicted_label']
1605
  entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
1606
  current_text_buffer.append(word)
 
1607
  previous_entity_type = last_entity_type
1608
  is_passage_label = (entity_type == 'PASSAGE')
1609
 
 
 
 
 
1610
  if not first_question_started:
1611
  if label != 'B-QUESTION' and not is_passage_label:
1612
  just_finished_i_option = False
@@ -1620,9 +1798,11 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
1620
  continue
1621
 
1622
  if label == 'B-QUESTION':
 
1623
  if not first_question_started:
1624
  header_text = ' '.join(current_text_buffer[:-1]).strip()
1625
  if header_text or current_passage_buffer:
 
1626
  metadata_item = {'type': 'METADATA', 'passage': ''}
1627
  finalize_passage_to_item(metadata_item, current_passage_buffer)
1628
  if header_text: metadata_item['text'] = header_text
@@ -1634,6 +1814,7 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
1634
  finalize_passage_to_item(current_item, current_passage_buffer)
1635
  current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
1636
  structured_data.append(current_item)
 
1637
  current_text_buffer = [word]
1638
 
1639
  current_item = {
@@ -1647,37 +1828,46 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
1647
 
1648
  if current_item is not None:
1649
  if is_in_new_passage:
1650
- # 🔑 Robust Initialization and Appending for 'new_passage'
1651
  if 'new_passage' not in current_item:
1652
  current_item['new_passage'] = word
1653
  else:
1654
  current_item['new_passage'] += f' {word}'
1655
-
1656
  if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
 
1657
  is_in_new_passage = False
1658
- if label.startswith(('B-', 'I-')): last_entity_type = entity_type
 
 
1659
  continue
 
1660
  is_in_new_passage = False
1661
 
1662
  if label.startswith('B-'):
1663
  if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
1664
  finalize_passage_to_item(current_item, current_passage_buffer)
1665
  current_passage_buffer = []
 
1666
  last_entity_type = entity_type
 
1667
  if entity_type == 'PASSAGE':
1668
  if previous_entity_type == 'OPTION' and just_finished_i_option:
1669
- current_item['new_passage'] = word # Initialize the new passage start
 
1670
  is_in_new_passage = True
1671
  else:
1672
  current_passage_buffer.append(word)
 
1673
  elif entity_type == 'OPTION':
1674
  current_option_key = word
1675
  current_item['options'][current_option_key] = word
1676
  just_finished_i_option = False
 
1677
  elif entity_type == 'ANSWER':
1678
  current_item['answer'] = word
1679
  current_option_key = None
1680
  just_finished_i_option = False
 
1681
  elif entity_type == 'QUESTION':
1682
  current_item['question'] += f' {word}'
1683
  just_finished_i_option = False
@@ -1687,7 +1877,7 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
1687
  current_item['question'] += f' {word}'
1688
  elif entity_type == 'PASSAGE':
1689
  if previous_entity_type == 'OPTION' and just_finished_i_option:
1690
- current_item['new_passage'] = word # Initialize the new passage start
1691
  is_in_new_passage = True
1692
  else:
1693
  if not current_passage_buffer: last_entity_type = 'PASSAGE'
@@ -1697,6 +1887,7 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
1697
  just_finished_i_option = True
1698
  elif entity_type == 'ANSWER':
1699
  current_item['answer'] += f' {word}'
 
1700
  just_finished_i_option = (entity_type == 'OPTION')
1701
 
1702
  elif label == 'O':
@@ -1704,25 +1895,29 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
1704
  current_item['question'] += f' {word}'
1705
  just_finished_i_option = False
1706
 
 
1707
  if current_item is not None:
 
1708
  finalize_passage_to_item(current_item, current_passage_buffer)
1709
  current_item['text'] = ' '.join(current_text_buffer).strip()
1710
  structured_data.append(current_item)
1711
 
 
1712
  for item in structured_data:
1713
  item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
1714
  if 'new_passage' in item:
1715
  item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
1716
 
 
1717
  try:
1718
  with open(output_path, 'w', encoding='utf-8') as f:
1719
  json.dump(structured_data, f, indent=2, ensure_ascii=False)
1720
- except Exception:
1721
- pass
 
1722
 
1723
  return structured_data
1724
 
1725
-
1726
  def create_query_text(entry: Dict[str, Any]) -> str:
1727
  """Combines question and options into a single string for similarity matching."""
1728
  query_parts = []
 
1564
  # ============================================================================
1565
 
1566
 
1567
+ # def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
1568
+ # print("\n" + "=" * 80)
1569
+ # print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
1570
+ # print("=" * 80)
1571
+ # try:
1572
+ # with open(input_path, 'r', encoding='utf-8') as f:
1573
+ # predictions_by_page = json.load(f)
1574
+ # except Exception as e:
1575
+ # print(f"❌ Error loading raw prediction file: {e}")
1576
+ # return None
1577
+
1578
+ # predictions = []
1579
+ # for page_item in predictions_by_page:
1580
+ # if isinstance(page_item, dict) and 'data' in page_item:
1581
+ # predictions.extend(page_item['data'])
1582
+
1583
+ # structured_data = []
1584
+ # current_item = None
1585
+ # current_option_key = None
1586
+ # current_passage_buffer = []
1587
+ # current_text_buffer = []
1588
+ # first_question_started = False
1589
+ # last_entity_type = None
1590
+ # just_finished_i_option = False
1591
+ # is_in_new_passage = False
1592
+
1593
+ # def finalize_passage_to_item(item, passage_buffer):
1594
+ # if passage_buffer:
1595
+ # passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
1596
+ # if item.get('passage'):
1597
+ # item['passage'] += ' ' + passage_text
1598
+ # else:
1599
+ # item['passage'] = passage_text
1600
+ # passage_buffer.clear()
1601
+
1602
+ # for item in predictions:
1603
+ # word = item['word']
1604
+ # label = item['predicted_label']
1605
+ # entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
1606
+ # current_text_buffer.append(word)
1607
+ # previous_entity_type = last_entity_type
1608
+ # is_passage_label = (entity_type == 'PASSAGE')
1609
+
1610
+ # if not first_question_started:
1611
+ # if label != 'B-QUESTION' and not is_passage_label:
1612
+ # just_finished_i_option = False
1613
+ # is_in_new_passage = False
1614
+ # continue
1615
+ # if is_passage_label:
1616
+ # current_passage_buffer.append(word)
1617
+ # last_entity_type = 'PASSAGE'
1618
+ # just_finished_i_option = False
1619
+ # is_in_new_passage = False
1620
+ # continue
1621
+
1622
+ # if label == 'B-QUESTION':
1623
+ # if not first_question_started:
1624
+ # header_text = ' '.join(current_text_buffer[:-1]).strip()
1625
+ # if header_text or current_passage_buffer:
1626
+ # metadata_item = {'type': 'METADATA', 'passage': ''}
1627
+ # finalize_passage_to_item(metadata_item, current_passage_buffer)
1628
+ # if header_text: metadata_item['text'] = header_text
1629
+ # structured_data.append(metadata_item)
1630
+ # first_question_started = True
1631
+ # current_text_buffer = [word]
1632
+
1633
+ # if current_item is not None:
1634
+ # finalize_passage_to_item(current_item, current_passage_buffer)
1635
+ # current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
1636
+ # structured_data.append(current_item)
1637
+ # current_text_buffer = [word]
1638
+
1639
+ # current_item = {
1640
+ # 'question': word, 'options': {}, 'answer': '', 'passage': '', 'text': ''
1641
+ # }
1642
+ # current_option_key = None
1643
+ # last_entity_type = 'QUESTION'
1644
+ # just_finished_i_option = False
1645
+ # is_in_new_passage = False
1646
+ # continue
1647
+
1648
+ # if current_item is not None:
1649
+ # if is_in_new_passage:
1650
+ # # 🔑 Robust Initialization and Appending for 'new_passage'
1651
+ # if 'new_passage' not in current_item:
1652
+ # current_item['new_passage'] = word
1653
+ # else:
1654
+ # current_item['new_passage'] += f' {word}'
1655
+
1656
+ # if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
1657
+ # is_in_new_passage = False
1658
+ # if label.startswith(('B-', 'I-')): last_entity_type = entity_type
1659
+ # continue
1660
+ # is_in_new_passage = False
1661
+
1662
+ # if label.startswith('B-'):
1663
+ # if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
1664
+ # finalize_passage_to_item(current_item, current_passage_buffer)
1665
+ # current_passage_buffer = []
1666
+ # last_entity_type = entity_type
1667
+ # if entity_type == 'PASSAGE':
1668
+ # if previous_entity_type == 'OPTION' and just_finished_i_option:
1669
+ # current_item['new_passage'] = word # Initialize the new passage start
1670
+ # is_in_new_passage = True
1671
+ # else:
1672
+ # current_passage_buffer.append(word)
1673
+ # elif entity_type == 'OPTION':
1674
+ # current_option_key = word
1675
+ # current_item['options'][current_option_key] = word
1676
+ # just_finished_i_option = False
1677
+ # elif entity_type == 'ANSWER':
1678
+ # current_item['answer'] = word
1679
+ # current_option_key = None
1680
+ # just_finished_i_option = False
1681
+ # elif entity_type == 'QUESTION':
1682
+ # current_item['question'] += f' {word}'
1683
+ # just_finished_i_option = False
1684
+
1685
+ # elif label.startswith('I-'):
1686
+ # if entity_type == 'QUESTION':
1687
+ # current_item['question'] += f' {word}'
1688
+ # elif entity_type == 'PASSAGE':
1689
+ # if previous_entity_type == 'OPTION' and just_finished_i_option:
1690
+ # current_item['new_passage'] = word # Initialize the new passage start
1691
+ # is_in_new_passage = True
1692
+ # else:
1693
+ # if not current_passage_buffer: last_entity_type = 'PASSAGE'
1694
+ # current_passage_buffer.append(word)
1695
+ # elif entity_type == 'OPTION' and current_option_key is not None:
1696
+ # current_item['options'][current_option_key] += f' {word}'
1697
+ # just_finished_i_option = True
1698
+ # elif entity_type == 'ANSWER':
1699
+ # current_item['answer'] += f' {word}'
1700
+ # just_finished_i_option = (entity_type == 'OPTION')
1701
+
1702
+ # elif label == 'O':
1703
+ # if last_entity_type == 'QUESTION':
1704
+ # current_item['question'] += f' {word}'
1705
+ # just_finished_i_option = False
1706
+
1707
+ # if current_item is not None:
1708
+ # finalize_passage_to_item(current_item, current_passage_buffer)
1709
+ # current_item['text'] = ' '.join(current_text_buffer).strip()
1710
+ # structured_data.append(current_item)
1711
+
1712
+ # for item in structured_data:
1713
+ # item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
1714
+ # if 'new_passage' in item:
1715
+ # item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
1716
+
1717
+ # try:
1718
+ # with open(output_path, 'w', encoding='utf-8') as f:
1719
+ # json.dump(structured_data, f, indent=2, ensure_ascii=False)
1720
+ # except Exception:
1721
+ # pass
1722
+
1723
+ # return structured_data
1724
+
1725
+
1726
+
1727
+
1728
+
1729
+
1730
  def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
1731
  print("\n" + "=" * 80)
1732
  print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
1733
+ print(f"Source: {input_path}")
1734
  print("=" * 80)
1735
+
1736
+ start_time = time.time()
1737
+
1738
  try:
1739
  with open(input_path, 'r', encoding='utf-8') as f:
1740
  predictions_by_page = json.load(f)
1741
+ print(f"βœ… Successfully loaded raw predictions ({len(predictions_by_page)} pages found)")
1742
  except Exception as e:
1743
  print(f"❌ Error loading raw prediction file: {e}")
1744
  return None
 
1747
  for page_item in predictions_by_page:
1748
  if isinstance(page_item, dict) and 'data' in page_item:
1749
  predictions.extend(page_item['data'])
1750
+
1751
+ total_words = len(predictions)
1752
+ print(f"πŸ“‹ Total words to process: {total_words}")
1753
 
1754
  structured_data = []
1755
  current_item = None
 
1764
  def finalize_passage_to_item(item, passage_buffer):
1765
  if passage_buffer:
1766
  passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
1767
+ print(f" ↳ [Buffer] Finalizing passage ({len(passage_buffer)} words) into current item")
1768
  if item.get('passage'):
1769
  item['passage'] += ' ' + passage_text
1770
  else:
1771
  item['passage'] = passage_text
1772
  passage_buffer.clear()
1773
 
1774
+ # Iterate through every predicted word
1775
+ for idx, item in enumerate(predictions):
1776
  word = item['word']
1777
  label = item['predicted_label']
1778
  entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
1779
  current_text_buffer.append(word)
1780
+
1781
  previous_entity_type = last_entity_type
1782
  is_passage_label = (entity_type == 'PASSAGE')
1783
 
1784
+ # --- LOGGING: Report every B- label (entity start) together with its word index ---
1785
+ if label.startswith('B-'):
1786
+ print(f"[Word {idx}/{total_words}] Found Label: {label} | Word: '{word}'")
1787
+
1788
  if not first_question_started:
1789
  if label != 'B-QUESTION' and not is_passage_label:
1790
  just_finished_i_option = False
 
1798
  continue
1799
 
1800
  if label == 'B-QUESTION':
1801
+ print(f"πŸ” Detection: New Question Started at word {idx}")
1802
  if not first_question_started:
1803
  header_text = ' '.join(current_text_buffer[:-1]).strip()
1804
  if header_text or current_passage_buffer:
1805
+ print(f" -> Creating METADATA item for text found before first question")
1806
  metadata_item = {'type': 'METADATA', 'passage': ''}
1807
  finalize_passage_to_item(metadata_item, current_passage_buffer)
1808
  if header_text: metadata_item['text'] = header_text
 
1814
  finalize_passage_to_item(current_item, current_passage_buffer)
1815
  current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
1816
  structured_data.append(current_item)
1817
+ print(f" -> Saved Question. Total structured items so far: {len(structured_data)}")
1818
  current_text_buffer = [word]
1819
 
1820
  current_item = {
 
1828
 
1829
  if current_item is not None:
1830
  if is_in_new_passage:
 
1831
  if 'new_passage' not in current_item:
1832
  current_item['new_passage'] = word
1833
  else:
1834
  current_item['new_passage'] += f' {word}'
1835
+
1836
  if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
1837
+ print(f" ↳ [State] Exiting new_passage mode at label {label}")
1838
  is_in_new_passage = False
1839
+
1840
+ if label.startswith(('B-', 'I-')):
1841
+ last_entity_type = entity_type
1842
  continue
1843
+
1844
  is_in_new_passage = False
1845
 
1846
  if label.startswith('B-'):
1847
  if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
1848
  finalize_passage_to_item(current_item, current_passage_buffer)
1849
  current_passage_buffer = []
1850
+
1851
  last_entity_type = entity_type
1852
+
1853
  if entity_type == 'PASSAGE':
1854
  if previous_entity_type == 'OPTION' and just_finished_i_option:
1855
+ print(f" ↳ [State] Transitioning to new_passage (Option -> Passage boundary)")
1856
+ current_item['new_passage'] = word
1857
  is_in_new_passage = True
1858
  else:
1859
  current_passage_buffer.append(word)
1860
+
1861
  elif entity_type == 'OPTION':
1862
  current_option_key = word
1863
  current_item['options'][current_option_key] = word
1864
  just_finished_i_option = False
1865
+
1866
  elif entity_type == 'ANSWER':
1867
  current_item['answer'] = word
1868
  current_option_key = None
1869
  just_finished_i_option = False
1870
+
1871
  elif entity_type == 'QUESTION':
1872
  current_item['question'] += f' {word}'
1873
  just_finished_i_option = False
 
1877
  current_item['question'] += f' {word}'
1878
  elif entity_type == 'PASSAGE':
1879
  if previous_entity_type == 'OPTION' and just_finished_i_option:
1880
+ current_item['new_passage'] = word
1881
  is_in_new_passage = True
1882
  else:
1883
  if not current_passage_buffer: last_entity_type = 'PASSAGE'
 
1887
  just_finished_i_option = True
1888
  elif entity_type == 'ANSWER':
1889
  current_item['answer'] += f' {word}'
1890
+
1891
  just_finished_i_option = (entity_type == 'OPTION')
1892
 
1893
  elif label == 'O':
 
1895
  current_item['question'] += f' {word}'
1896
  just_finished_i_option = False
1897
 
1898
+ # Final wrap up
1899
  if current_item is not None:
1900
+ print(f"🏁 Finalizing the very last item...")
1901
  finalize_passage_to_item(current_item, current_passage_buffer)
1902
  current_item['text'] = ' '.join(current_text_buffer).strip()
1903
  structured_data.append(current_item)
1904
 
1905
+ # Clean up and regex replacement
1906
  for item in structured_data:
1907
  item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
1908
  if 'new_passage' in item:
1909
  item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
1910
 
1911
+ print(f"πŸ’Ύ Saving {len(structured_data)} items to {output_path}")
1912
  try:
1913
  with open(output_path, 'w', encoding='utf-8') as f:
1914
  json.dump(structured_data, f, indent=2, ensure_ascii=False)
1915
+ print(f"βœ… Decoding Complete. Total time: {time.time() - start_time:.2f}s")
1916
+ except Exception as e:
1917
+ print(f"⚠️ Error saving final JSON: {e}")
1918
 
1919
  return structured_data
1920
 
 
1921
  def create_query_text(entry: Dict[str, Any]) -> str:
1922
  """Combines question and options into a single string for similarity matching."""
1923
  query_parts = []