Spaces:
Running
Running
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +211 -0
working_yolo_pipeline.py
CHANGED
|
@@ -605,6 +605,191 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
|
|
| 605 |
|
| 606 |
|
| 607 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
|
| 609 |
|
| 610 |
|
|
@@ -1863,6 +2048,32 @@ def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, stru
|
|
| 1863 |
# Phase 4: Embedding / Equation to LaTeX Conversion
|
| 1864 |
final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
|
| 1865 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1866 |
except Exception as e:
|
| 1867 |
print(f"❌ FATAL ERROR: {e}")
|
| 1868 |
import traceback
|
|
|
|
| 605 |
|
| 606 |
|
| 607 |
|
| 608 |
+
|
| 609 |
+
#===================================================================================================
|
| 610 |
+
#===================================================================================================
|
| 611 |
+
#===================================================================================================
|
| 612 |
+
|
| 613 |
+
|
| 614 |
+
|
| 615 |
+
import pandas as pd
|
| 616 |
+
import pickle
|
| 617 |
+
import os
|
| 618 |
+
import time
|
| 619 |
+
import json
|
| 620 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 621 |
+
import numpy as np
|
| 622 |
+
from collections import defaultdict
|
| 623 |
+
|
| 624 |
+
# --- Model File Paths (Required for the Classifier to load) ---
# Pickled artifacts produced by the (separate) conditional training run.
# NOTE(review): paths are relative to the current working directory — confirm
# the pipeline is always launched from the directory holding these files.
VECTORIZER_FILE = 'tfidf_vectorizer_conditional.pkl'  # shared TF-IDF vectorizer
SUBJECT_MODEL_FILE = 'subject_classifier_model_conditional.pkl'  # Level 1 (subject) classifier
CONDITIONAL_CONCEPT_MODELS_FILE = 'conditional_concept_models.pkl'  # dict: subject -> {'model': ...}
|
| 628 |
+
|
| 629 |
+
|
| 630 |
+
# --- Hierarchical Classifier Class (Dependency for the helper function) ---

class HierarchicalClassifier:
    """
    A two-stage classification system based on conditional training.

    Stage 1 predicts the Subject; Stage 2 predicts the Concept using a
    specialized model trained only on that subject's data. Loads the shared
    TF-IDF vectorizer, the subject classifier, and the per-subject
    conditional concept models from pickle files.
    """

    def __init__(self):
        self.vectorizer = None                # shared TF-IDF vectorizer
        self.subject_model = None             # Level 1 (subject) classifier
        self.conditional_concept_models = {}  # subject -> Level 2 (concept) model
        self.is_ready = False                 # True only after load_models() succeeds

    def load_models(self):
        """Loads the vectorizer, subject model, and conditional concept models.

        Returns:
            bool: True if every artifact loaded successfully, False otherwise.
        """
        try:
            start_time = time.time()

            # NOTE(security): pickle.load executes arbitrary code on load —
            # these model files must come from a trusted source.
            # 1. Load the TF-IDF Vectorizer
            with open(VECTORIZER_FILE, 'rb') as f:
                self.vectorizer = pickle.load(f)

            # 2. Load the Level 1 (Subject) Classifier
            with open(SUBJECT_MODEL_FILE, 'rb') as f:
                self.subject_model = pickle.load(f)

            # 3. Load the dictionary of conditional Level 2 (Concept) Models
            with open(CONDITIONAL_CONCEPT_MODELS_FILE, 'rb') as f:
                conditional_data = pickle.load(f)

            # Keep only the model objects for fast lookup at predict time.
            for subject, data in conditional_data.items():
                self.conditional_concept_models[subject] = data['model']

            print(f"Loaded models successfully in {time.time() - start_time:.2f} seconds.")
            self.is_ready = True

        except FileNotFoundError as e:
            print(f"Error: Required model file not found: {e.filename}.")
            self.is_ready = False
        except Exception as e:
            print(f"An error occurred while loading models: {e}")
            self.is_ready = False

        return self.is_ready

    def _top_prediction(self, model, text_chunk):
        """Vectorize *text_chunk* and return (top_label, confidence) from *model*.

        Shared by both prediction stages (previously duplicated inline).
        Falls back to confidence 1.0 when the model lacks predict_proba.
        """
        # Dense float64 is required by some sklearn estimators on sparse input.
        text_vector = self.vectorizer.transform([text_chunk]).astype(np.float64)

        if hasattr(model, 'predict_proba'):
            probabilities = model.predict_proba(text_vector)[0]
            top_index = np.argmax(probabilities)
            return model.classes_[top_index], probabilities[top_index]
        return model.predict(text_vector)[0], 1.0

    def predict_subject(self, text_chunk):
        """Predicts the top Subject (Level 1) as (label, confidence)."""
        if not self.is_ready:
            return "Unknown", 0.0
        return self._top_prediction(self.subject_model, text_chunk)

    def predict_concept_hierarchical(self, text_chunk, predicted_subject):
        """
        Predicts the top Concept (Level 2) using the specialized conditional
        model for *predicted_subject*, as (label, confidence).
        """
        if not self.is_ready:
            return "Unknown", 0.0

        concept_model = self.conditional_concept_models.get(predicted_subject)

        # A missing model, or one trained on a single class, cannot discriminate.
        if concept_model is None or len(getattr(concept_model, 'classes_', [])) <= 1:
            return "No_Conditional_Model_Found", 0.0

        return self._top_prediction(concept_model, text_chunk)
|
| 716 |
+
|
| 717 |
+
|
| 718 |
+
# --------------------------------------------------------------------------------------
# --- The Requested Helper Function ---

def post_process_json_with_inference(json_data, classifier):
    """
    Run hierarchical inference over every question entry in *json_data* and
    attach 'predicted_subject' and 'predicted_concept' tags to each one.

    Args:
        json_data (list): The list of dictionaries containing question entries.
        classifier (HierarchicalClassifier): An initialized and loaded classifier object.

    Returns:
        list: The same list of dictionaries, modified in place with tags added.
    """
    if not classifier.is_ready:
        print("Classifier not ready. Skipping inference.")
        return json_data

    # This print statement can be removed for silent pipeline integration
    print("\n--- Starting Subject/Concept Detection ---")

    for entry in json_data:
        # Entries without a 'question' field are left untouched.
        if 'question' not in entry:
            continue

        # Build one combined string from the question plus every option's text
        # so the classifier sees the richest possible feature set.
        pieces = [entry.get('question', '')]
        for opt_key, opt_value in entry.get('options', {}).items():
            # String option values carry the text; otherwise fall back to the key.
            opt_text = opt_value if isinstance(opt_value, str) else opt_key
            pieces.append(opt_text.replace('\n', ' '))

        # Collapse all whitespace runs and trim the ends.
        full_text = ' '.join(' '.join(pieces).split()).strip()

        if not full_text:
            # Nothing classifiable — tag explicitly so downstream code can tell.
            entry['predicted_subject'] = {'label': 'Empty_Text', 'confidence': 0.0}
            entry['predicted_concept'] = {'label': 'Empty_Text', 'confidence': 0.0}
            continue

        # Stage 1: subject; Stage 2: concept conditioned on that subject.
        subj_label, subj_conf = classifier.predict_subject(full_text)
        conc_label, conc_conf = classifier.predict_concept_hierarchical(full_text, subj_label)

        entry['predicted_subject'] = {
            'label': subj_label,
            'confidence': round(subj_conf, 4),
        }
        entry['predicted_concept'] = {
            'label': conc_label,
            'confidence': round(conc_conf, 4),
        }

    # This print statement can be removed for silent pipeline integration
    # print("--- JSON Post-Processing Complete ---")

    return json_data
|
| 783 |
+
|
| 784 |
+
|
| 785 |
+
|
| 786 |
+
|
| 787 |
+
|
| 788 |
+
#===================================================================================================
|
| 789 |
+
#===================================================================================================
|
| 790 |
+
#===================================================================================================
|
| 791 |
+
|
| 792 |
+
|
| 793 |
|
| 794 |
|
| 795 |
|
|
|
|
| 2048 |
# Phase 4: Embedding / Equation to LaTeX Conversion
|
| 2049 |
final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
|
| 2050 |
|
| 2051 |
+
|
| 2052 |
+
|
| 2053 |
+
|
| 2054 |
+
#================================================================================
|
| 2055 |
+
# --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
|
| 2056 |
+
#================================================================================
|
| 2057 |
+
|
| 2058 |
+
print("\n" + "=" * 80)
|
| 2059 |
+
print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
|
| 2060 |
+
print("=" * 80)
|
| 2061 |
+
|
| 2062 |
+
# 1. Initialize and Load the Classifier
|
| 2063 |
+
classifier = HierarchicalClassifier()
|
| 2064 |
+
if classifier.load_models():
|
| 2065 |
+
# 2. Run Classification on the *Final* Result
|
| 2066 |
+
# The function modifies the list in place and returns it
|
| 2067 |
+
final_result = post_process_json_with_inference(
|
| 2068 |
+
final_result, classifier
|
| 2069 |
+
)
|
| 2070 |
+
print("✅ Classification complete. Tags added to final output.")
|
| 2071 |
+
else:
|
| 2072 |
+
print("❌ Classification model loading failed. Outputting un-tagged data.")
|
| 2073 |
+
|
| 2074 |
+
# ====================================================================
|
| 2075 |
+
|
| 2076 |
+
|
| 2077 |
except Exception as e:
|
| 2078 |
print(f"❌ FATAL ERROR: {e}")
|
| 2079 |
import traceback
|