heerjtdev committed on
Commit
c7448f6
·
verified ·
1 Parent(s): b8ab755

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +211 -0
working_yolo_pipeline.py CHANGED
@@ -605,6 +605,191 @@ def extract_native_words_and_convert(fitz_page, scale_factor: float = 2.0) -> li
605
 
606
 
607
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
608
 
609
 
610
 
@@ -1863,6 +2048,32 @@ def run_document_pipeline( input_pdf_path: str, layoutlmv3_model_path: str, stru
1863
  # Phase 4: Embedding / Equation to LaTeX Conversion
1864
  final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
1865
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1866
  except Exception as e:
1867
  print(f"❌ FATAL ERROR: {e}")
1868
  import traceback
 
605
 
606
 
607
 
608
+
609
+ #===================================================================================================
610
+ #===================================================================================================
611
+ #===================================================================================================
612
+
613
+
614
+
615
+ import pandas as pd
616
+ import pickle
617
+ import os
618
+ import time
619
+ import json
620
+ from sklearn.feature_extraction.text import TfidfVectorizer
621
+ import numpy as np
622
+ from collections import defaultdict
623
+
624
# --- Model File Paths (Required for the Classifier to load) ---
# Pickled artifacts produced by an offline training step; resolved relative
# to the current working directory by HierarchicalClassifier.load_models().
VECTORIZER_FILE = 'tfidf_vectorizer_conditional.pkl'
SUBJECT_MODEL_FILE = 'subject_classifier_model_conditional.pkl'
CONDITIONAL_CONCEPT_MODELS_FILE = 'conditional_concept_models.pkl'
628
+
629
+
630
+ # --- Hierarchical Classifier Class (Dependency for the helper function) ---
631
+
632
class HierarchicalClassifier:
    """
    A two-stage classification system based on conditional training.

    Stage 1 predicts the Subject from TF-IDF features; Stage 2 predicts the
    Concept with a subject-specific ("conditional") model selected by the
    Stage 1 result. All artifacts are loaded from pickle files on disk.
    """

    def __init__(self):
        # All attributes stay unset until load_models() succeeds.
        self.vectorizer = None                 # fitted TF-IDF vectorizer (shared by both stages)
        self.subject_model = None              # Level 1 (Subject) classifier
        self.conditional_concept_models = {}   # subject name -> Level 2 (Concept) model
        self.is_ready = False

    def load_models(self):
        """Load the vectorizer, subject model, and conditional concept models.

        Returns:
            bool: True if every artifact loaded successfully, False otherwise
            (a message is printed describing the failure).
        """
        try:
            start_time = time.time()
            # 1. Load the TF-IDF Vectorizer
            with open(VECTORIZER_FILE, 'rb') as f:
                self.vectorizer = pickle.load(f)

            # 2. Load the Level 1 (Subject) Classifier
            with open(SUBJECT_MODEL_FILE, 'rb') as f:
                self.subject_model = pickle.load(f)

            # 3. Load the dictionary of conditional Level 2 (Concept) Models
            #    (stored on disk as {subject: {'model': ..., ...}})
            with open(CONDITIONAL_CONCEPT_MODELS_FILE, 'rb') as f:
                conditional_data = pickle.load(f)

            # Extract just the models for easy per-subject lookup
            for subject, data in conditional_data.items():
                self.conditional_concept_models[subject] = data['model']

            print(f"Loaded models successfully in {time.time() - start_time:.2f} seconds.")
            self.is_ready = True

        except FileNotFoundError as e:
            print(f"Error: Required model file not found: {e.filename}.")
            self.is_ready = False
        except Exception as e:
            print(f"An error occurred while loading models: {e}")
            self.is_ready = False

        return self.is_ready

    def _predict_top(self, model, text_chunk):
        """Vectorize *text_chunk* and return (top_label, confidence) for *model*.

        Shared by both prediction stages. Falls back to confidence 1.0 when
        the model exposes no predict_proba (e.g. a hard-margin classifier).
        """
        # float64 cast keeps sparse TF-IDF features compatible with models
        # trained on double-precision input.
        text_vector = self.vectorizer.transform([text_chunk]).astype(np.float64)

        if hasattr(model, 'predict_proba'):
            probabilities = model.predict_proba(text_vector)[0]
            top_index = np.argmax(probabilities)
            return model.classes_[top_index], probabilities[top_index]
        return model.predict(text_vector)[0], 1.0

    def predict_subject(self, text_chunk):
        """Predicts the top Subject (Level 1). Returns (label, confidence)."""
        if not self.is_ready:
            return "Unknown", 0.0
        return self._predict_top(self.subject_model, text_chunk)

    def predict_concept_hierarchical(self, text_chunk, predicted_subject):
        """
        Predicts the top Concept (Level 2) using the specialized conditional
        model for *predicted_subject*. Returns (label, confidence).
        """
        if not self.is_ready:
            return "Unknown", 0.0

        concept_model = self.conditional_concept_models.get(predicted_subject)

        # A missing or degenerate (single-class) model cannot discriminate.
        if concept_model is None or len(getattr(concept_model, 'classes_', [])) <= 1:
            return "No_Conditional_Model_Found", 0.0

        return self._predict_top(concept_model, text_chunk)
716
+
717
+
718
+ # --------------------------------------------------------------------------------------
719
+ # --- The Requested Helper Function ---
720
+
721
def post_process_json_with_inference(json_data, classifier):
    """
    Takes JSON data, runs hierarchical inference on all question/option text,
    and adds 'predicted_subject' and 'predicted_concept' tags to each entry.

    Args:
        json_data (list): The list of dictionaries containing question entries.
        classifier (HierarchicalClassifier): An initialized and loaded classifier object.

    Returns:
        list: The same list (modified in place) with classification tags added.
    """
    if not classifier.is_ready:
        print("Classifier not ready. Skipping inference.")
        return json_data

    # This print statement can be removed for silent pipeline integration
    print("\n--- Starting Subject/Concept Detection ---")

    for entry in json_data:
        # Only process entries that have a 'question' field
        if 'question' not in entry:
            continue

        # 1. Combine question text and option text for robust feature extraction
        full_text = entry.get('question', '')

        options = entry.get('options', {})
        for option_key, option_value in options.items():
            # Use the text component of the option if available; otherwise
            # fall back to the option key itself.
            option_text = option_value if isinstance(option_value, str) else option_key
            full_text += " " + option_text.replace('\n', ' ')

        # Collapse runs of whitespace and trim the ends
        full_text = ' '.join(full_text.split()).strip()

        # Handle entries with no usable text at all
        if not full_text:
            entry['predicted_subject'] = {'label': 'Empty_Text', 'confidence': 0.0}
            entry['predicted_concept'] = {'label': 'Empty_Text', 'confidence': 0.0}
            continue

        # 2. STAGE 1: Predict Subject
        subj_label, subj_conf = classifier.predict_subject(full_text)

        # 3. STAGE 2: Predict Concept (conditional on predicted subject)
        conc_label, conc_conf = classifier.predict_concept_hierarchical(full_text, subj_label)

        # 4. Add results to the JSON entry. Confidences are cast to plain
        #    float first so numpy scalars (from predict_proba) never leak
        #    into the output — json.dump cannot serialize np.float64.
        entry['predicted_subject'] = {
            'label': subj_label,
            'confidence': round(float(subj_conf), 4)
        }
        entry['predicted_concept'] = {
            'label': conc_label,
            'confidence': round(float(conc_conf), 4)
        }

    return json_data
783
+
784
+
785
+
786
+
787
+
788
+ #===================================================================================================
789
+ #===================================================================================================
790
+ #===================================================================================================
791
+
792
+
793
 
794
 
795
 
 
2048
  # Phase 4: Embedding / Equation to LaTeX Conversion
2049
  final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
2050
 
2051
+
2052
+
2053
+
2054
+ #================================================================================
2055
+ # --- NEW FINAL STEP: HIERARCHICAL CLASSIFICATION TAGGING ---
2056
+ #================================================================================
2057
+
2058
+ print("\n" + "=" * 80)
2059
+ print("--- FINAL STEP: HIERARCHICAL SUBJECT/CONCEPT TAGGING ---")
2060
+ print("=" * 80)
2061
+
2062
+ # 1. Initialize and Load the Classifier
2063
+ classifier = HierarchicalClassifier()
2064
+ if classifier.load_models():
2065
+ # 2. Run Classification on the *Final* Result
2066
+ # The function modifies the list in place and returns it
2067
+ final_result = post_process_json_with_inference(
2068
+ final_result, classifier
2069
+ )
2070
+ print("✅ Classification complete. Tags added to final output.")
2071
+ else:
2072
+ print("❌ Classification model loading failed. Outputting un-tagged data.")
2073
+
2074
+ # ====================================================================
2075
+
2076
+
2077
  except Exception as e:
2078
  print(f"❌ FATAL ERROR: {e}")
2079
  import traceback