Spaces:
Running
Running
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +204 -9
working_yolo_pipeline.py
CHANGED
|
@@ -1564,13 +1564,181 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
|
|
| 1564 |
# ============================================================================
|
| 1565 |
|
| 1566 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1567 |
def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
|
| 1568 |
print("\n" + "=" * 80)
|
| 1569 |
print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
|
|
|
|
| 1570 |
print("=" * 80)
|
|
|
|
|
|
|
|
|
|
| 1571 |
try:
|
| 1572 |
with open(input_path, 'r', encoding='utf-8') as f:
|
| 1573 |
predictions_by_page = json.load(f)
|
|
|
|
| 1574 |
except Exception as e:
|
| 1575 |
print(f"β Error loading raw prediction file: {e}")
|
| 1576 |
return None
|
|
@@ -1579,6 +1747,9 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
|
|
| 1579 |
for page_item in predictions_by_page:
|
| 1580 |
if isinstance(page_item, dict) and 'data' in page_item:
|
| 1581 |
predictions.extend(page_item['data'])
|
|
|
|
|
|
|
|
|
|
| 1582 |
|
| 1583 |
structured_data = []
|
| 1584 |
current_item = None
|
|
@@ -1593,20 +1764,27 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
|
|
| 1593 |
def finalize_passage_to_item(item, passage_buffer):
|
| 1594 |
if passage_buffer:
|
| 1595 |
passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
|
|
|
|
| 1596 |
if item.get('passage'):
|
| 1597 |
item['passage'] += ' ' + passage_text
|
| 1598 |
else:
|
| 1599 |
item['passage'] = passage_text
|
| 1600 |
passage_buffer.clear()
|
| 1601 |
|
| 1602 |
-
|
|
|
|
| 1603 |
word = item['word']
|
| 1604 |
label = item['predicted_label']
|
| 1605 |
entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
|
| 1606 |
current_text_buffer.append(word)
|
|
|
|
| 1607 |
previous_entity_type = last_entity_type
|
| 1608 |
is_passage_label = (entity_type == 'PASSAGE')
|
| 1609 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1610 |
if not first_question_started:
|
| 1611 |
if label != 'B-QUESTION' and not is_passage_label:
|
| 1612 |
just_finished_i_option = False
|
|
@@ -1620,9 +1798,11 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
|
|
| 1620 |
continue
|
| 1621 |
|
| 1622 |
if label == 'B-QUESTION':
|
|
|
|
| 1623 |
if not first_question_started:
|
| 1624 |
header_text = ' '.join(current_text_buffer[:-1]).strip()
|
| 1625 |
if header_text or current_passage_buffer:
|
|
|
|
| 1626 |
metadata_item = {'type': 'METADATA', 'passage': ''}
|
| 1627 |
finalize_passage_to_item(metadata_item, current_passage_buffer)
|
| 1628 |
if header_text: metadata_item['text'] = header_text
|
|
@@ -1634,6 +1814,7 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
|
|
| 1634 |
finalize_passage_to_item(current_item, current_passage_buffer)
|
| 1635 |
current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
|
| 1636 |
structured_data.append(current_item)
|
|
|
|
| 1637 |
current_text_buffer = [word]
|
| 1638 |
|
| 1639 |
current_item = {
|
|
@@ -1647,37 +1828,46 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
|
|
| 1647 |
|
| 1648 |
if current_item is not None:
|
| 1649 |
if is_in_new_passage:
|
| 1650 |
-
# π Robust Initialization and Appending for 'new_passage'
|
| 1651 |
if 'new_passage' not in current_item:
|
| 1652 |
current_item['new_passage'] = word
|
| 1653 |
else:
|
| 1654 |
current_item['new_passage'] += f' {word}'
|
| 1655 |
-
|
| 1656 |
if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
|
|
|
|
| 1657 |
is_in_new_passage = False
|
| 1658 |
-
|
|
|
|
|
|
|
| 1659 |
continue
|
|
|
|
| 1660 |
is_in_new_passage = False
|
| 1661 |
|
| 1662 |
if label.startswith('B-'):
|
| 1663 |
if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
|
| 1664 |
finalize_passage_to_item(current_item, current_passage_buffer)
|
| 1665 |
current_passage_buffer = []
|
|
|
|
| 1666 |
last_entity_type = entity_type
|
|
|
|
| 1667 |
if entity_type == 'PASSAGE':
|
| 1668 |
if previous_entity_type == 'OPTION' and just_finished_i_option:
|
| 1669 |
-
|
|
|
|
| 1670 |
is_in_new_passage = True
|
| 1671 |
else:
|
| 1672 |
current_passage_buffer.append(word)
|
|
|
|
| 1673 |
elif entity_type == 'OPTION':
|
| 1674 |
current_option_key = word
|
| 1675 |
current_item['options'][current_option_key] = word
|
| 1676 |
just_finished_i_option = False
|
|
|
|
| 1677 |
elif entity_type == 'ANSWER':
|
| 1678 |
current_item['answer'] = word
|
| 1679 |
current_option_key = None
|
| 1680 |
just_finished_i_option = False
|
|
|
|
| 1681 |
elif entity_type == 'QUESTION':
|
| 1682 |
current_item['question'] += f' {word}'
|
| 1683 |
just_finished_i_option = False
|
|
@@ -1687,7 +1877,7 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
|
|
| 1687 |
current_item['question'] += f' {word}'
|
| 1688 |
elif entity_type == 'PASSAGE':
|
| 1689 |
if previous_entity_type == 'OPTION' and just_finished_i_option:
|
| 1690 |
-
current_item['new_passage'] = word
|
| 1691 |
is_in_new_passage = True
|
| 1692 |
else:
|
| 1693 |
if not current_passage_buffer: last_entity_type = 'PASSAGE'
|
|
@@ -1697,6 +1887,7 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
|
|
| 1697 |
just_finished_i_option = True
|
| 1698 |
elif entity_type == 'ANSWER':
|
| 1699 |
current_item['answer'] += f' {word}'
|
|
|
|
| 1700 |
just_finished_i_option = (entity_type == 'OPTION')
|
| 1701 |
|
| 1702 |
elif label == 'O':
|
|
@@ -1704,25 +1895,29 @@ def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) ->
|
|
| 1704 |
current_item['question'] += f' {word}'
|
| 1705 |
just_finished_i_option = False
|
| 1706 |
|
|
|
|
| 1707 |
if current_item is not None:
|
|
|
|
| 1708 |
finalize_passage_to_item(current_item, current_passage_buffer)
|
| 1709 |
current_item['text'] = ' '.join(current_text_buffer).strip()
|
| 1710 |
structured_data.append(current_item)
|
| 1711 |
|
|
|
|
| 1712 |
for item in structured_data:
|
| 1713 |
item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
|
| 1714 |
if 'new_passage' in item:
|
| 1715 |
item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
|
| 1716 |
|
|
|
|
| 1717 |
try:
|
| 1718 |
with open(output_path, 'w', encoding='utf-8') as f:
|
| 1719 |
json.dump(structured_data, f, indent=2, ensure_ascii=False)
|
| 1720 |
-
|
| 1721 |
-
|
|
|
|
| 1722 |
|
| 1723 |
return structured_data
|
| 1724 |
|
| 1725 |
-
|
| 1726 |
def create_query_text(entry: Dict[str, Any]) -> str:
|
| 1727 |
"""Combines question and options into a single string for similarity matching."""
|
| 1728 |
query_parts = []
|
|
|
|
| 1564 |
# ============================================================================
|
| 1565 |
|
| 1566 |
|
| 1567 |
+
# def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
|
| 1568 |
+
# print("\n" + "=" * 80)
|
| 1569 |
+
# print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
|
| 1570 |
+
# print("=" * 80)
|
| 1571 |
+
# try:
|
| 1572 |
+
# with open(input_path, 'r', encoding='utf-8') as f:
|
| 1573 |
+
# predictions_by_page = json.load(f)
|
| 1574 |
+
# except Exception as e:
|
| 1575 |
+
# print(f"β Error loading raw prediction file: {e}")
|
| 1576 |
+
# return None
|
| 1577 |
+
|
| 1578 |
+
# predictions = []
|
| 1579 |
+
# for page_item in predictions_by_page:
|
| 1580 |
+
# if isinstance(page_item, dict) and 'data' in page_item:
|
| 1581 |
+
# predictions.extend(page_item['data'])
|
| 1582 |
+
|
| 1583 |
+
# structured_data = []
|
| 1584 |
+
# current_item = None
|
| 1585 |
+
# current_option_key = None
|
| 1586 |
+
# current_passage_buffer = []
|
| 1587 |
+
# current_text_buffer = []
|
| 1588 |
+
# first_question_started = False
|
| 1589 |
+
# last_entity_type = None
|
| 1590 |
+
# just_finished_i_option = False
|
| 1591 |
+
# is_in_new_passage = False
|
| 1592 |
+
|
| 1593 |
+
# def finalize_passage_to_item(item, passage_buffer):
|
| 1594 |
+
# if passage_buffer:
|
| 1595 |
+
# passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
|
| 1596 |
+
# if item.get('passage'):
|
| 1597 |
+
# item['passage'] += ' ' + passage_text
|
| 1598 |
+
# else:
|
| 1599 |
+
# item['passage'] = passage_text
|
| 1600 |
+
# passage_buffer.clear()
|
| 1601 |
+
|
| 1602 |
+
# for item in predictions:
|
| 1603 |
+
# word = item['word']
|
| 1604 |
+
# label = item['predicted_label']
|
| 1605 |
+
# entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
|
| 1606 |
+
# current_text_buffer.append(word)
|
| 1607 |
+
# previous_entity_type = last_entity_type
|
| 1608 |
+
# is_passage_label = (entity_type == 'PASSAGE')
|
| 1609 |
+
|
| 1610 |
+
# if not first_question_started:
|
| 1611 |
+
# if label != 'B-QUESTION' and not is_passage_label:
|
| 1612 |
+
# just_finished_i_option = False
|
| 1613 |
+
# is_in_new_passage = False
|
| 1614 |
+
# continue
|
| 1615 |
+
# if is_passage_label:
|
| 1616 |
+
# current_passage_buffer.append(word)
|
| 1617 |
+
# last_entity_type = 'PASSAGE'
|
| 1618 |
+
# just_finished_i_option = False
|
| 1619 |
+
# is_in_new_passage = False
|
| 1620 |
+
# continue
|
| 1621 |
+
|
| 1622 |
+
# if label == 'B-QUESTION':
|
| 1623 |
+
# if not first_question_started:
|
| 1624 |
+
# header_text = ' '.join(current_text_buffer[:-1]).strip()
|
| 1625 |
+
# if header_text or current_passage_buffer:
|
| 1626 |
+
# metadata_item = {'type': 'METADATA', 'passage': ''}
|
| 1627 |
+
# finalize_passage_to_item(metadata_item, current_passage_buffer)
|
| 1628 |
+
# if header_text: metadata_item['text'] = header_text
|
| 1629 |
+
# structured_data.append(metadata_item)
|
| 1630 |
+
# first_question_started = True
|
| 1631 |
+
# current_text_buffer = [word]
|
| 1632 |
+
|
| 1633 |
+
# if current_item is not None:
|
| 1634 |
+
# finalize_passage_to_item(current_item, current_passage_buffer)
|
| 1635 |
+
# current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
|
| 1636 |
+
# structured_data.append(current_item)
|
| 1637 |
+
# current_text_buffer = [word]
|
| 1638 |
+
|
| 1639 |
+
# current_item = {
|
| 1640 |
+
# 'question': word, 'options': {}, 'answer': '', 'passage': '', 'text': ''
|
| 1641 |
+
# }
|
| 1642 |
+
# current_option_key = None
|
| 1643 |
+
# last_entity_type = 'QUESTION'
|
| 1644 |
+
# just_finished_i_option = False
|
| 1645 |
+
# is_in_new_passage = False
|
| 1646 |
+
# continue
|
| 1647 |
+
|
| 1648 |
+
# if current_item is not None:
|
| 1649 |
+
# if is_in_new_passage:
|
| 1650 |
+
# # π Robust Initialization and Appending for 'new_passage'
|
| 1651 |
+
# if 'new_passage' not in current_item:
|
| 1652 |
+
# current_item['new_passage'] = word
|
| 1653 |
+
# else:
|
| 1654 |
+
# current_item['new_passage'] += f' {word}'
|
| 1655 |
+
|
| 1656 |
+
# if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
|
| 1657 |
+
# is_in_new_passage = False
|
| 1658 |
+
# if label.startswith(('B-', 'I-')): last_entity_type = entity_type
|
| 1659 |
+
# continue
|
| 1660 |
+
# is_in_new_passage = False
|
| 1661 |
+
|
| 1662 |
+
# if label.startswith('B-'):
|
| 1663 |
+
# if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
|
| 1664 |
+
# finalize_passage_to_item(current_item, current_passage_buffer)
|
| 1665 |
+
# current_passage_buffer = []
|
| 1666 |
+
# last_entity_type = entity_type
|
| 1667 |
+
# if entity_type == 'PASSAGE':
|
| 1668 |
+
# if previous_entity_type == 'OPTION' and just_finished_i_option:
|
| 1669 |
+
# current_item['new_passage'] = word # Initialize the new passage start
|
| 1670 |
+
# is_in_new_passage = True
|
| 1671 |
+
# else:
|
| 1672 |
+
# current_passage_buffer.append(word)
|
| 1673 |
+
# elif entity_type == 'OPTION':
|
| 1674 |
+
# current_option_key = word
|
| 1675 |
+
# current_item['options'][current_option_key] = word
|
| 1676 |
+
# just_finished_i_option = False
|
| 1677 |
+
# elif entity_type == 'ANSWER':
|
| 1678 |
+
# current_item['answer'] = word
|
| 1679 |
+
# current_option_key = None
|
| 1680 |
+
# just_finished_i_option = False
|
| 1681 |
+
# elif entity_type == 'QUESTION':
|
| 1682 |
+
# current_item['question'] += f' {word}'
|
| 1683 |
+
# just_finished_i_option = False
|
| 1684 |
+
|
| 1685 |
+
# elif label.startswith('I-'):
|
| 1686 |
+
# if entity_type == 'QUESTION':
|
| 1687 |
+
# current_item['question'] += f' {word}'
|
| 1688 |
+
# elif entity_type == 'PASSAGE':
|
| 1689 |
+
# if previous_entity_type == 'OPTION' and just_finished_i_option:
|
| 1690 |
+
# current_item['new_passage'] = word # Initialize the new passage start
|
| 1691 |
+
# is_in_new_passage = True
|
| 1692 |
+
# else:
|
| 1693 |
+
# if not current_passage_buffer: last_entity_type = 'PASSAGE'
|
| 1694 |
+
# current_passage_buffer.append(word)
|
| 1695 |
+
# elif entity_type == 'OPTION' and current_option_key is not None:
|
| 1696 |
+
# current_item['options'][current_option_key] += f' {word}'
|
| 1697 |
+
# just_finished_i_option = True
|
| 1698 |
+
# elif entity_type == 'ANSWER':
|
| 1699 |
+
# current_item['answer'] += f' {word}'
|
| 1700 |
+
# just_finished_i_option = (entity_type == 'OPTION')
|
| 1701 |
+
|
| 1702 |
+
# elif label == 'O':
|
| 1703 |
+
# if last_entity_type == 'QUESTION':
|
| 1704 |
+
# current_item['question'] += f' {word}'
|
| 1705 |
+
# just_finished_i_option = False
|
| 1706 |
+
|
| 1707 |
+
# if current_item is not None:
|
| 1708 |
+
# finalize_passage_to_item(current_item, current_passage_buffer)
|
| 1709 |
+
# current_item['text'] = ' '.join(current_text_buffer).strip()
|
| 1710 |
+
# structured_data.append(current_item)
|
| 1711 |
+
|
| 1712 |
+
# for item in structured_data:
|
| 1713 |
+
# item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
|
| 1714 |
+
# if 'new_passage' in item:
|
| 1715 |
+
# item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
|
| 1716 |
+
|
| 1717 |
+
# try:
|
| 1718 |
+
# with open(output_path, 'w', encoding='utf-8') as f:
|
| 1719 |
+
# json.dump(structured_data, f, indent=2, ensure_ascii=False)
|
| 1720 |
+
# except Exception:
|
| 1721 |
+
# pass
|
| 1722 |
+
|
| 1723 |
+
# return structured_data
|
| 1724 |
+
|
| 1725 |
+
|
| 1726 |
+
|
| 1727 |
+
|
| 1728 |
+
|
| 1729 |
+
|
| 1730 |
def convert_bio_to_structured_json_relaxed(input_path: str, output_path: str) -> Optional[List[Dict[str, Any]]]:
|
| 1731 |
print("\n" + "=" * 80)
|
| 1732 |
print("--- 3. STARTING BIO TO STRUCTURED JSON DECODING ---")
|
| 1733 |
+
print(f"Source: {input_path}")
|
| 1734 |
print("=" * 80)
|
| 1735 |
+
|
| 1736 |
+
start_time = time.time()
|
| 1737 |
+
|
| 1738 |
try:
|
| 1739 |
with open(input_path, 'r', encoding='utf-8') as f:
|
| 1740 |
predictions_by_page = json.load(f)
|
| 1741 |
+
print(f"✅ Successfully loaded raw predictions ({len(predictions_by_page)} pages found)")
|
| 1742 |
except Exception as e:
|
| 1743 |
print(f"β Error loading raw prediction file: {e}")
|
| 1744 |
return None
|
|
|
|
| 1747 |
for page_item in predictions_by_page:
|
| 1748 |
if isinstance(page_item, dict) and 'data' in page_item:
|
| 1749 |
predictions.extend(page_item['data'])
|
| 1750 |
+
|
| 1751 |
+
total_words = len(predictions)
|
| 1752 |
+
print(f"π Total words to process: {total_words}")
|
| 1753 |
|
| 1754 |
structured_data = []
|
| 1755 |
current_item = None
|
|
|
|
| 1764 |
def finalize_passage_to_item(item, passage_buffer):
|
| 1765 |
if passage_buffer:
|
| 1766 |
passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
|
| 1767 |
+
print(f" ↳ [Buffer] Finalizing passage ({len(passage_buffer)} words) into current item")
|
| 1768 |
if item.get('passage'):
|
| 1769 |
item['passage'] += ' ' + passage_text
|
| 1770 |
else:
|
| 1771 |
item['passage'] = passage_text
|
| 1772 |
passage_buffer.clear()
|
| 1773 |
|
| 1774 |
+
# Iterate through every predicted word
|
| 1775 |
+
for idx, item in enumerate(predictions):
|
| 1776 |
word = item['word']
|
| 1777 |
label = item['predicted_label']
|
| 1778 |
entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
|
| 1779 |
current_text_buffer.append(word)
|
| 1780 |
+
|
| 1781 |
previous_entity_type = last_entity_type
|
| 1782 |
is_passage_label = (entity_type == 'PASSAGE')
|
| 1783 |
|
| 1784 |
+
# --- LOGGING: Report every B- label (start of a new entity) as it is encountered ---
|
| 1785 |
+
if label.startswith('B-'):
|
| 1786 |
+
print(f"[Word {idx}/{total_words}] Found Label: {label} | Word: '{word}'")
|
| 1787 |
+
|
| 1788 |
if not first_question_started:
|
| 1789 |
if label != 'B-QUESTION' and not is_passage_label:
|
| 1790 |
just_finished_i_option = False
|
|
|
|
| 1798 |
continue
|
| 1799 |
|
| 1800 |
if label == 'B-QUESTION':
|
| 1801 |
+
print(f"π Detection: New Question Started at word {idx}")
|
| 1802 |
if not first_question_started:
|
| 1803 |
header_text = ' '.join(current_text_buffer[:-1]).strip()
|
| 1804 |
if header_text or current_passage_buffer:
|
| 1805 |
+
print(f" -> Creating METADATA item for text found before first question")
|
| 1806 |
metadata_item = {'type': 'METADATA', 'passage': ''}
|
| 1807 |
finalize_passage_to_item(metadata_item, current_passage_buffer)
|
| 1808 |
if header_text: metadata_item['text'] = header_text
|
|
|
|
| 1814 |
finalize_passage_to_item(current_item, current_passage_buffer)
|
| 1815 |
current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
|
| 1816 |
structured_data.append(current_item)
|
| 1817 |
+
print(f" -> Saved Question. Total structured items so far: {len(structured_data)}")
|
| 1818 |
current_text_buffer = [word]
|
| 1819 |
|
| 1820 |
current_item = {
|
|
|
|
| 1828 |
|
| 1829 |
if current_item is not None:
|
| 1830 |
if is_in_new_passage:
|
|
|
|
| 1831 |
if 'new_passage' not in current_item:
|
| 1832 |
current_item['new_passage'] = word
|
| 1833 |
else:
|
| 1834 |
current_item['new_passage'] += f' {word}'
|
| 1835 |
+
|
| 1836 |
if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
|
| 1837 |
+
print(f" ↳ [State] Exiting new_passage mode at label {label}")
|
| 1838 |
is_in_new_passage = False
|
| 1839 |
+
|
| 1840 |
+
if label.startswith(('B-', 'I-')):
|
| 1841 |
+
last_entity_type = entity_type
|
| 1842 |
continue
|
| 1843 |
+
|
| 1844 |
is_in_new_passage = False
|
| 1845 |
|
| 1846 |
if label.startswith('B-'):
|
| 1847 |
if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
|
| 1848 |
finalize_passage_to_item(current_item, current_passage_buffer)
|
| 1849 |
current_passage_buffer = []
|
| 1850 |
+
|
| 1851 |
last_entity_type = entity_type
|
| 1852 |
+
|
| 1853 |
if entity_type == 'PASSAGE':
|
| 1854 |
if previous_entity_type == 'OPTION' and just_finished_i_option:
|
| 1855 |
+
print(f" ↳ [State] Transitioning to new_passage (Option -> Passage boundary)")
|
| 1856 |
+
current_item['new_passage'] = word
|
| 1857 |
is_in_new_passage = True
|
| 1858 |
else:
|
| 1859 |
current_passage_buffer.append(word)
|
| 1860 |
+
|
| 1861 |
elif entity_type == 'OPTION':
|
| 1862 |
current_option_key = word
|
| 1863 |
current_item['options'][current_option_key] = word
|
| 1864 |
just_finished_i_option = False
|
| 1865 |
+
|
| 1866 |
elif entity_type == 'ANSWER':
|
| 1867 |
current_item['answer'] = word
|
| 1868 |
current_option_key = None
|
| 1869 |
just_finished_i_option = False
|
| 1870 |
+
|
| 1871 |
elif entity_type == 'QUESTION':
|
| 1872 |
current_item['question'] += f' {word}'
|
| 1873 |
just_finished_i_option = False
|
|
|
|
| 1877 |
current_item['question'] += f' {word}'
|
| 1878 |
elif entity_type == 'PASSAGE':
|
| 1879 |
if previous_entity_type == 'OPTION' and just_finished_i_option:
|
| 1880 |
+
current_item['new_passage'] = word
|
| 1881 |
is_in_new_passage = True
|
| 1882 |
else:
|
| 1883 |
if not current_passage_buffer: last_entity_type = 'PASSAGE'
|
|
|
|
| 1887 |
just_finished_i_option = True
|
| 1888 |
elif entity_type == 'ANSWER':
|
| 1889 |
current_item['answer'] += f' {word}'
|
| 1890 |
+
|
| 1891 |
just_finished_i_option = (entity_type == 'OPTION')
|
| 1892 |
|
| 1893 |
elif label == 'O':
|
|
|
|
| 1895 |
current_item['question'] += f' {word}'
|
| 1896 |
just_finished_i_option = False
|
| 1897 |
|
| 1898 |
+
# Final wrap up
|
| 1899 |
if current_item is not None:
|
| 1900 |
+
print(f"π Finalizing the very last item...")
|
| 1901 |
finalize_passage_to_item(current_item, current_passage_buffer)
|
| 1902 |
current_item['text'] = ' '.join(current_text_buffer).strip()
|
| 1903 |
structured_data.append(current_item)
|
| 1904 |
|
| 1905 |
+
# Clean up and regex replacement
|
| 1906 |
for item in structured_data:
|
| 1907 |
item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
|
| 1908 |
if 'new_passage' in item:
|
| 1909 |
item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
|
| 1910 |
|
| 1911 |
+
print(f"💾 Saving {len(structured_data)} items to {output_path}")
|
| 1912 |
try:
|
| 1913 |
with open(output_path, 'w', encoding='utf-8') as f:
|
| 1914 |
json.dump(structured_data, f, indent=2, ensure_ascii=False)
|
| 1915 |
+
print(f"✅ Decoding Complete. Total time: {time.time() - start_time:.2f}s")
|
| 1916 |
+
except Exception as e:
|
| 1917 |
+
print(f"⚠️ Error saving final JSON: {e}")
|
| 1918 |
|
| 1919 |
return structured_data
|
| 1920 |
|
|
|
|
| 1921 |
def create_query_text(entry: Dict[str, Any]) -> str:
|
| 1922 |
"""Combines question and options into a single string for similarity matching."""
|
| 1923 |
query_parts = []
|