Spaces:
Running
Running
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +211 -0
working_yolo_pipeline.py
CHANGED
|
@@ -605,6 +605,191 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
|
|
| 605 |
|
| 606 |
|
| 607 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
|
| 609 |
|
| 610 |
|
|
@@ -1863,6 +2048,32 @@ def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, stru
|
|
| 1863 |
# Phase 4: Embedding / Equation to LaTeX Conversion
|
| 1864 |
final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
|
| 1865 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1866 |
except Exception as e:
|
| 1867 |
print(f"❌ FATAL ERROR: {e}")
|
| 1868 |
import traceback
|
|
|
|
| 605 |
|
| 606 |
|
| 607 |
|
| 608 |
+
|
| 609 |
+
#===================================================================================================
|
| 610 |
+
#===================================================================================================
|
| 611 |
+
#===================================================================================================
|
| 612 |
+
|
| 613 |
+
|
| 614 |
+
|
| 615 |
+
import pandas as pd
|
| 616 |
+
import pickle
|
| 617 |
+
import os
|
| 618 |
+
import time
|
| 619 |
+
import json
|
| 620 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 621 |
+
import numpy as np
|
| 622 |
+
from collections import defaultdict
|
| 623 |
+
|
| 624 |
+
# --- Model File Paths (Required for the Classifier to load) ---
# Pickled artifacts produced by the (separate) conditional training run.
# NOTE(review): paths are relative to the current working directory — confirm
# the pipeline is always launched from the directory holding these files.
VECTORIZER_FILE = 'tfidf_vectorizer_conditional.pkl'  # shared TF-IDF vectorizer
SUBJECT_MODEL_FILE = 'subject_classifier_model_conditional.pkl'  # Level 1 (subject) classifier
CONDITIONAL_CONCEPT_MODELS_FILE = 'conditional_concept_models.pkl'  # dict: subject -> {'model': ...}
|
| 628 |
+
|
| 629 |
+
|
| 630 |
+
# --- Hierarchical Classifier Class (Dependency for the helper function) ---

class HierarchicalClassifier:
    """
    A two-stage classification system based on conditional training.

    Stage 1 predicts the Subject; Stage 2 predicts the Concept using a
    specialized model trained only on that subject's data. Loads the shared
    TF-IDF vectorizer, the subject classifier, and the per-subject
    conditional concept models from pickle files.
    """

    def __init__(self):
        self.vectorizer = None                # shared TF-IDF vectorizer
        self.subject_model = None             # Level 1 (subject) classifier
        self.conditional_concept_models = {}  # subject -> Level 2 (concept) model
        self.is_ready = False                 # True only after load_models() succeeds

    def load_models(self):
        """Loads the vectorizer, subject model, and conditional concept models.

        Returns:
            bool: True if every artifact loaded successfully, False otherwise.
        """
        try:
            start_time = time.time()

            # NOTE(security): pickle.load executes arbitrary code on load —
            # these model files must come from a trusted source.
            # 1. Load the TF-IDF Vectorizer
            with open(VECTORIZER_FILE, 'rb') as f:
                self.vectorizer = pickle.load(f)

            # 2. Load the Level 1 (Subject) Classifier
            with open(SUBJECT_MODEL_FILE, 'rb') as f:
                self.subject_model = pickle.load(f)

            # 3. Load the dictionary of conditional Level 2 (Concept) Models
            with open(CONDITIONAL_CONCEPT_MODELS_FILE, 'rb') as f:
                conditional_data = pickle.load(f)

            # Keep only the model objects for fast lookup at predict time.
            for subject, data in conditional_data.items():
                self.conditional_concept_models[subject] = data['model']

            print(f"Loaded models successfully in {time.time() - start_time:.2f} seconds.")
            self.is_ready = True

        except FileNotFoundError as e:
            print(f"Error: Required model file not found: {e.filename}.")
            self.is_ready = False
        except Exception as e:
            print(f"An error occurred while loading models: {e}")
            self.is_ready = False

        return self.is_ready

    def _top_prediction(self, model, text_chunk):
        """Vectorize *text_chunk* and return (top_label, confidence) from *model*.

        Shared by both prediction stages (previously duplicated inline).
        Falls back to confidence 1.0 when the model lacks predict_proba.
        """
        # Dense float64 is required by some sklearn estimators on sparse input.
        text_vector = self.vectorizer.transform([text_chunk]).astype(np.float64)

        if hasattr(model, 'predict_proba'):
            probabilities = model.predict_proba(text_vector)[0]
            top_index = np.argmax(probabilities)
            return model.classes_[top_index], probabilities[top_index]
        return model.predict(text_vector)[0], 1.0

    def predict_subject(self, text_chunk):
        """Predicts the top Subject (Level 1) as (label, confidence)."""
        if not self.is_ready:
            return "Unknown", 0.0
        return self._top_prediction(self.subject_model, text_chunk)

    def predict_concept_hierarchical(self, text_chunk, predicted_subject):
        """
        Predicts the top Concept (Level 2) using the specialized conditional
        model for *predicted_subject*, as (label, confidence).
        """
        if not self.is_ready:
            return "Unknown", 0.0

        concept_model = self.conditional_concept_models.get(predicted_subject)

        # A missing model, or one trained on a single class, cannot discriminate.
        if concept_model is None or len(getattr(concept_model, 'classes_', [])) <= 1:
            return "No_Conditional_Model_Found", 0.0

        return self._top_prediction(concept_model, text_chunk)
|
| 716 |
+
|
| 717 |
+
|
| 718 |
+
# --------------------------------------------------------------------------------------
# --- The Requested Helper Function ---

def post_process_json_with_inference(json_data, classifier):
    """
    Run hierarchical inference over every question entry in *json_data* and
    attach 'predicted_subject' and 'predicted_concept' tags to each one.

    Args:
        json_data (list): The list of dictionaries containing question entries.
        classifier (HierarchicalClassifier): An initialized and loaded classifier object.

    Returns:
        list: The same list of dictionaries, modified in place with tags added.
    """
    if not classifier.is_ready:
        print("Classifier not ready. Skipping inference.")
        return json_data

    # This print statement can be removed for silent pipeline integration
    print("\n--- Starting Subject/Concept Detection ---")

    for entry in json_data:
        # Entries without a 'question' field are left untouched.
        if 'question' not in entry:
            continue

        # Build one combined string from the question plus every option's text
        # so the classifier sees the richest possible feature set.
        pieces = [entry.get('question', '')]
        for opt_key, opt_value in entry.get('options', {}).items():
            # String option values carry the text; otherwise fall back to the key.
            opt_text = opt_value if isinstance(opt_value, str) else opt_key
            pieces.append(opt_text.replace('\n', ' '))

        # Collapse all whitespace runs and trim the ends.
        full_text = ' '.join(' '.join(pieces).split()).strip()

        if not full_text:
            # Nothing classifiable — tag explicitly so downstream code can tell.
            entry['predicted_subject'] = {'label': 'Empty_Text', 'confidence': 0.0}
            entry['predicted_concept'] = {'label': 'Empty_Text', 'confidence': 0.0}
            continue

        # Stage 1: subject; Stage 2: concept conditioned on that subject.
        subj_label, subj_conf = classifier.predict_subject(full_text)
        conc_label, conc_conf = classifier.predict_concept_hierarchical(full_text, subj_label)

        entry['predicted_subject'] = {
            'label': subj_label,
            'confidence': round(subj_conf, 4),
        }
        entry['predicted_concept'] = {
            'label': conc_label,
            'confidence': round(conc_conf, 4),
        }

    # This print statement can be removed for silent pipeline integration
    # print("--- JSON Post-Processing Complete ---")

    return json_data
|
| 783 |
+
|
| 784 |
+
|
| 785 |
+
|
| 786 |
+
|
| 787 |
+
|
| 788 |
+
#===================================================================================================
|
| 789 |
+
#===================================================================================================
|
| 790 |
+
#===================================================================================================
|
| 791 |
+
|
| 792 |
+
|
| 793 |
|
| 794 |
|
| 795 |
|
|
|
|
| 2048 |
# Phase 4: Embedding / Equation to LaTeX Conversion
|
| 2049 |
final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
|
| 2050 |
|
| 2051 |
+
|
| 2052 |
+
|
| 2053 |
+
|
| 2054 |
+
#================================================================================
|
| 2055 |
+
# --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
|
| 2056 |
+
#================================================================================
|
| 2057 |
+
|
| 2058 |
+
print("\n" + "=" * 80)
|
| 2059 |
+
print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
|
| 2060 |
+
print("=" * 80)
|
| 2061 |
+
|
| 2062 |
+
# 1. Initialize and Load the Classifier
|
| 2063 |
+
classifier = HierarchicalClassifier()
|
| 2064 |
+
if classifier.load_models():
|
| 2065 |
+
# 2. Run Classification on the *Final* Result
|
| 2066 |
+
# The function modifies the list in place and returns it
|
| 2067 |
+
final_result = post_process_json_with_inference(
|
| 2068 |
+
final_result, classifier
|
| 2069 |
+
)
|
| 2070 |
+
print("✅ Classification complete. Tags added to final output.")
|
| 2071 |
+
else:
|
| 2072 |
+
print("❌ Classification model loading failed. Outputting un-tagged data.")
|
| 2073 |
+
|
| 2074 |
+
# ====================================================================
|
| 2075 |
+
|
| 2076 |
+
|
| 2077 |
except Exception as e:
|
| 2078 |
print(f"❌ FATAL ERROR: {e}")
|
| 2079 |
import traceback
|