Spaces:
Running
Running
Update working_yolo_pipeline.py
Browse files- working_yolo_pipeline.py +446 -4
working_yolo_pipeline.py
CHANGED
|
@@ -96,8 +96,190 @@ except Exception as e:
|
|
| 96 |
|
| 97 |
|
| 98 |
|
|
|
|
|
|
|
| 99 |
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
from typing import Optional
|
| 102 |
|
| 103 |
def sanitize_text(text: Optional[str]) -> str:
|
|
@@ -1634,6 +1816,130 @@ def run_inference_and_get_raw_words(pdf_path: str, model_path: str,
|
|
| 1634 |
|
| 1635 |
|
| 1636 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1637 |
# ============================================================================
|
| 1638 |
# --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
|
| 1639 |
# ============================================================================
|
|
@@ -2250,6 +2556,129 @@ import glob
|
|
| 2250 |
|
| 2251 |
|
| 2252 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2253 |
def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
|
| 2254 |
if not os.path.exists(input_pdf_path):
|
| 2255 |
print(f"β ERROR: File not found: {input_pdf_path}")
|
|
@@ -2284,12 +2713,25 @@ def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, struc
|
|
| 2284 |
return None
|
| 2285 |
print(f"β
Step 1 Complete ({time.time() - p1_start:.2f}s)")
|
| 2286 |
|
| 2287 |
-
# --- Phase 2: Inference ---
|
| 2288 |
-
print(f"\n[Step 2/5] Inference (
|
| 2289 |
p2_start = time.time()
|
| 2290 |
-
|
| 2291 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2292 |
)
|
|
|
|
|
|
|
| 2293 |
if not page_raw_predictions_list:
|
| 2294 |
print("β FAILED at Step 2: Inference returned no data.")
|
| 2295 |
return None
|
|
|
|
| 96 |
|
| 97 |
|
| 98 |
|
| 99 |
#=====================================================================================================================
#=====================================================================================================================

# ============================================================================
# --- CUSTOM MODEL DEFINITIONS (ADD THIS BLOCK) ---
# ============================================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter
import pickle

# --- CONSTANTS FOR CUSTOM MODEL ---
MODEL_FILE = "model_enhanced.pt"   # Ensure this file is in your directory
VOCAB_FILE = "vocabs_enhanced.pkl" # Ensure this file is in your directory
DEVICE = torch.device("cpu")       # Use "cuda" if available
MAX_CHAR_LEN = 16                  # characters per word fed to the char-CNN
EMBED_DIM = 128                    # word-embedding size
CHAR_EMBED_DIM = 50                # char-embedding size
CHAR_CNN_OUT = 50                  # per-kernel output channels of the char-CNN
BBOX_DIM = 128                     # projected bounding-box feature size
HIDDEN_SIZE = 768                  # BiLSTM output size (both directions combined)
SPATIAL_FEATURE_DIM = 64           # projected size of the 11 spatial features
POSITIONAL_DIM = 128               # learned positional-embedding size
INFERENCE_CHUNK_SIZE = 450         # words per forward pass at inference time

# BIO tag set for the sequence tagger; index order must match training.
LABELS = [
    "O", "B-QUESTION", "I-QUESTION", "B-OPTION", "I-OPTION",
    "B-ANSWER", "I-ANSWER", "B-IMAGE", "I-IMAGE",
    "B-SECTION HEADING", "I-SECTION HEADING", "B-PASSAGE", "I-PASSAGE"
]
IDX2LABEL = {i: l for i, l in enumerate(LABELS)}

# --- CRF DEPENDENCY ---
# Two package names ship a compatible CRF implementation; try both.
try:
    from torch_crf import CRF
except ImportError:
    try:
        from TorchCRF import CRF
    except ImportError:
        # Fallback if no CRF library is installed.  The original stub only
        # defined __init__, so a missing library surfaced much later as an
        # opaque AttributeError inside inference; fail loudly instead.
        class CRF(nn.Module):
            def __init__(self, *args, **kwargs):
                super().__init__()

            def viterbi_decode(self, *args, **kwargs):
                raise ImportError(
                    "No CRF implementation available: install 'torch_crf' or 'TorchCRF'."
                )
# --- MODEL CLASSES ---
class Vocab:
    """Minimal token-to-index vocabulary (pickle-compatible with training)."""

    def __init__(self, min_freq=1, unk_token="<UNK>", pad_token="<PAD>"):
        self.min_freq = min_freq
        self.unk_token = unk_token
        self.pad_token = pad_token
        self.freq = Counter()
        self.itos = []
        self.stoi = {}

    def __len__(self):
        return len(self.itos)

    def __getitem__(self, token):
        # Unknown tokens map to the <UNK> index; if even <UNK> is absent, 0.
        unk_idx = self.stoi.get(self.unk_token, 0)
        return self.stoi.get(token, unk_idx)
class CharCNNEncoder(nn.Module):
    """Character-level CNN producing one fixed-size vector per word."""

    def __init__(self, char_vocab_size, char_emb_dim, out_dim, kernel_sizes=(2, 3, 4, 5)):
        super().__init__()
        self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0)
        self.convs = nn.ModuleList(
            nn.Conv1d(char_emb_dim, out_dim, kernel_size=k) for k in kernel_sizes
        )
        # Concatenation of all kernel outputs.
        self.out_dim = out_dim * len(kernel_sizes)

    def forward(self, char_ids):
        batch, seq_len, n_chars = char_ids.size()
        # Fold words into the batch axis so Conv1d runs per word.
        flat = char_ids.view(batch * seq_len, n_chars)
        emb = self.char_emb(flat).transpose(1, 2)  # (B*L, emb_dim, n_chars)
        pooled = []
        for conv in self.convs:
            activated = torch.relu(conv(emb))
            pooled.append(activated.max(dim=2)[0])  # max-over-time pooling
        return torch.cat(pooled, dim=1).view(batch, seq_len, -1)
class SpatialAttention(nn.Module):
    """Single-head scaled dot-product self-attention over the sequence."""

    def __init__(self, hidden_dim):
        super().__init__()
        self.query = nn.Linear(hidden_dim, hidden_dim)
        self.key = nn.Linear(hidden_dim, hidden_dim)
        self.value = nn.Linear(hidden_dim, hidden_dim)
        self.scale = hidden_dim ** 0.5

    def forward(self, x, mask):
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)
        scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale
        # Block attention onto padded positions.
        pad = ~mask.unsqueeze(1).expand_as(scores)
        scores = scores.masked_fill(pad, float('-inf'))
        weights = F.softmax(scores, dim=-1)
        # Zero weights wherever the raw scores themselves were NaN
        # (matches the original guard; -inf rows are not touched).
        weights = weights.masked_fill(torch.isnan(scores), 0.0)
        return torch.matmul(weights, v)
class MCQTagger(nn.Module):
    """BiLSTM-CRF sequence tagger over word, character, layout and context features.

    Concatenates word embeddings, a char-CNN encoding, projected bounding
    boxes, hand-crafted spatial/context features and learned positional
    embeddings; encodes with a 3-layer BiLSTM plus spatial self-attention,
    and decodes BIO label paths with a CRF.
    """
    def __init__(self, vocab_size, char_vocab_size, n_labels):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, EMBED_DIM, padding_idx=0)
        self.char_enc = CharCNNEncoder(char_vocab_size, CHAR_EMBED_DIM, CHAR_CNN_OUT)
        # 4 raw bbox coordinates -> BBOX_DIM
        self.bbox_proj = nn.Sequential(nn.Linear(4, BBOX_DIM), nn.ReLU(), nn.Dropout(0.1), nn.Linear(BBOX_DIM, BBOX_DIM))
        # 11 hand-crafted layout features (see extract_spatial_features)
        self.spatial_proj = nn.Sequential(nn.Linear(11, SPATIAL_FEATURE_DIM), nn.ReLU(), nn.Dropout(0.1))
        # 8 neighbourhood features (see extract_context_features)
        self.context_proj = nn.Sequential(nn.Linear(8, 32), nn.ReLU(), nn.Dropout(0.1))
        self.positional_encoding = nn.Embedding(512, POSITIONAL_DIM)
        in_dim = (EMBED_DIM + self.char_enc.out_dim + BBOX_DIM + SPATIAL_FEATURE_DIM + 32 + POSITIONAL_DIM)
        self.bilstm = nn.LSTM(in_dim, HIDDEN_SIZE // 2, num_layers=3, batch_first=True, bidirectional=True, dropout=0.3)
        self.spatial_attention = SpatialAttention(HIDDEN_SIZE)
        self.ff = nn.Sequential(nn.Linear(HIDDEN_SIZE * 2, HIDDEN_SIZE), nn.ReLU(), nn.Dropout(0.3), nn.Linear(HIDDEN_SIZE, n_labels))
        self.crf = CRF(n_labels)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, words, chars, bboxes, spatial_feats, context_feats, mask):
        """Return CRF-decoded label index sequences for each batch row."""
        B, L = words.size()
        wemb = self.word_emb(words)
        cenc = self.char_enc(chars)
        benc = self.bbox_proj(bboxes)
        senc = self.spatial_proj(spatial_feats)
        cxt_enc = self.context_proj(context_feats)
        # Absolute positions, clamped to the 512-entry embedding table.
        pos = torch.arange(L, device=words.device).unsqueeze(0).expand(B, -1)
        pos_enc = self.positional_encoding(pos.clamp(max=511))
        enc_in = self.dropout(torch.cat([wemb, cenc, benc, senc, cxt_enc, pos_enc], dim=-1))
        # Pack so the LSTM skips padded timesteps (lengths must live on CPU).
        lengths = mask.sum(dim=1).cpu()
        packed_in = nn.utils.rnn.pack_padded_sequence(enc_in, lengths, batch_first=True, enforce_sorted=False)
        packed_out, _ = self.bilstm(packed_in)
        lstm_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        attn_out = self.spatial_attention(lstm_out, mask)
        emissions = self.ff(torch.cat([lstm_out, attn_out], dim=-1))
        # NOTE(review): returns decoded paths, not emissions -- any training
        # loss is presumably computed elsewhere; confirm before reusing.
        return self.crf.viterbi_decode(emissions, mask=mask)
# --- INJECT DEPENDENCIES FOR PICKLE LOADING ---
# The vocab/model pickles were created in a module named "train_model", so
# unpickling resolves class references against that module name.  Register
# a synthetic module exposing the classes defined above so pickle.load()
# in the inference step can find them.
import sys
from types import ModuleType
train_mod = ModuleType("train_model")
sys.modules["train_model"] = train_mod
train_mod.Vocab = Vocab
train_mod.MCQTagger = MCQTagger
train_mod.CharCNNEncoder = CharCNNEncoder
train_mod.SpatialAttention = SpatialAttention
|
| 232 |
+
# ============================================================================
|
| 233 |
+
# --- CUSTOM FEATURE EXTRACTORS ---
|
| 234 |
+
# ============================================================================
|
| 235 |
+
def extract_spatial_features(tokens, idx):
|
| 236 |
+
curr = tokens[idx]
|
| 237 |
+
f = []
|
| 238 |
+
# Vertical distance to next
|
| 239 |
+
if idx < len(tokens)-1: f.append(min((tokens[idx+1]['y0'] - curr['y1'])/100.0, 1.0))
|
| 240 |
+
else: f.append(0.0)
|
| 241 |
+
# Vertical distance from prev
|
| 242 |
+
if idx > 0: f.append(min((curr['y0'] - tokens[idx-1]['y1'])/100.0, 1.0))
|
| 243 |
+
else: f.append(0.0)
|
| 244 |
+
# Geometry
|
| 245 |
+
f.extend([curr['x0']/1000.0, (curr['x1']-curr['x0'])/1000.0, (curr['y1']-curr['y0'])/1000.0])
|
| 246 |
+
f.extend([(curr['x0']+curr['x1'])/2000.0, (curr['y0']+curr['y1'])/2000.0, curr['x0']/1000.0])
|
| 247 |
+
# Aspect ratio
|
| 248 |
+
f.append(min(((curr['x1']-curr['x0'])/max((curr['y1']-curr['y0']),1.0))/10.0, 1.0))
|
| 249 |
+
# Alignment check
|
| 250 |
+
if idx > 0: f.append(float(abs(curr['x0'] - tokens[idx-1]['x0']) < 5))
|
| 251 |
+
else: f.append(0.0)
|
| 252 |
+
# Area
|
| 253 |
+
f.append(min(((curr['x1']-curr['x0'])*(curr['y1']-curr['y0']))/(1000.0**2), 1.0))
|
| 254 |
+
return f
|
| 255 |
def extract_context_features(tokens, idx, window=3):
    """Build 8 context features describing question/option markers near tokens[idx]."""
    question_re = r'^q?\.?\d+[.:]'
    option_re = r'^[a-dA-D][.)]'

    def marker_flags(i):
        # [looks like "Q1."/"12:", looks like "a)"/"b.", ALL-CAPS word]
        # NOTE: t is lowercased first, so the isupper() flag is always 0;
        # preserved as-is for compatibility with the trained model.
        t = str(tokens[i]['word']).lower().strip()  # 'word' key matches pipeline data
        return [
            float(bool(re.match(question_re, t))),
            float(bool(re.match(option_re, t))),
            float(t.isupper() and len(t) > 2),
        ]

    feats = []
    # Any marker within the preceding window (element-wise OR via max).
    before = [0.0, 0.0, 0.0]
    for i in range(max(0, idx - window), idx):
        before = [max(a, b) for a, b in zip(before, marker_flags(i))]
    feats.extend(before)
    # Any marker within the following window.
    after = [0.0, 0.0, 0.0]
    for i in range(idx + 1, min(len(tokens), idx + window + 1)):
        after = [max(a, b) for a, b in zip(after, marker_flags(i))]
    feats.extend(after)
    # Normalised distance to the next question / option marker (1.0 = none found).
    dist_q = dist_opt = 1.0
    for i in range(idx + 1, min(len(tokens), idx + window + 1)):
        t = str(tokens[i]['word']).lower().strip()
        if re.match(question_re, t):
            dist_q = min(dist_q, (i - idx) / window)
        if re.match(option_re, t):
            dist_opt = min(dist_opt, (i - idx) / window)
    feats.extend([dist_q, dist_opt])
    return feats
+
|
| 280 |
+
#======================================================================================================================================================
|
| 281 |
+
#======================================================================================================================================================
|
| 282 |
+
|
| 283 |
from typing import Optional
|
| 284 |
|
| 285 |
def sanitize_text(text: Optional[str]) -> str:
|
|
|
|
| 1816 |
|
| 1817 |
|
| 1818 |
|
| 1819 |
+
|
| 1820 |
+
|
| 1821 |
+
|
| 1822 |
+
|
| 1823 |
+
|
# ============================================================================
# --- PHASE 2 REPLACEMENT: CUSTOM INFERENCE PIPELINE ---
# ============================================================================
def run_custom_inference_and_get_raw_words(preprocessed_json_path: str) -> List[Dict[str, Any]]:
    """Run the custom BiLSTM-CRF tagger over preprocessed per-page word data.

    Drop-in replacement for the LayoutLMv3 inference step: loads the model
    and vocab pickles, reads the preprocessed JSON (one entry per page with
    words and 2x-scaled bounding boxes), rescales boxes back to PDF points,
    extracts spatial/context features, predicts BIO labels in fixed-size
    chunks, and returns per-page word/label dictionaries in the same shape
    the downstream decoder expects.

    Returns an empty list on any model/data load failure.
    """
    print("\n" + "=" * 80)
    print("--- 2. STARTING CUSTOM MODEL INFERENCE PIPELINE ---")
    print("=" * 80)

    # 1. Load Resources
    if not os.path.exists(MODEL_FILE) or not os.path.exists(VOCAB_FILE):
        print("β Error: Missing custom model or vocab files.")
        return []

    try:
        print(" -> Loading Vocab and Model...")
        # NOTE(review): pickle.load and torch.load can execute arbitrary
        # code -- only load model/vocab files from a trusted source.
        with open(VOCAB_FILE, "rb") as f:
            word_vocab, char_vocab = pickle.load(f)

        model = MCQTagger(len(word_vocab), len(char_vocab), len(LABELS)).to(DEVICE)

        # Load state dict safe: the checkpoint may be either a plain
        # state_dict or a pickled module object.
        state_dict = torch.load(MODEL_FILE, map_location=DEVICE)
        model.load_state_dict(state_dict if isinstance(state_dict, dict) else state_dict.state_dict())
        model.eval()
        print("β Custom Model loaded successfully.")
    except Exception as e:
        print(f"β Error loading custom model: {e}")
        return []

    # 2. Load Preprocessed Data
    try:
        with open(preprocessed_json_path, 'r', encoding='utf-8') as f:
            preprocessed_data = json.load(f)
        print(f"β Loaded preprocessed data for {len(preprocessed_data)} pages.")
    except Exception:
        print("β Error loading preprocessed JSON.")
        return []

    final_page_predictions = []
    scale_factor = 2.0  # The pipeline scales PDF points to 2.0 for YOLO. We need to reverse this.

    for page_data in preprocessed_data:
        page_num = page_data['page_number']
        raw_items = page_data['data']

        if not raw_items: continue

        # --- A. ADAPTER: Convert Pipeline Data format to Custom Model format ---
        # Pipeline Data: {'word': 'Text', 'bbox': [x1, y1, x2, y2]} (scaled by 2.0)
        # Custom Model Needed: {'word': 'Text', 'x0': x, 'y0': y, 'x1': x, 'y1': y} (PDF points)

        tokens_for_inference = []
        for item in raw_items:
            bbox = item['bbox']
            # Revert scale to get native PDF coordinates
            x0 = bbox[0] / scale_factor
            y0 = bbox[1] / scale_factor
            x1 = bbox[2] / scale_factor
            y1 = bbox[3] / scale_factor

            tokens_for_inference.append({
                'word': str(item['word']),  # Ensure string
                'x0': x0, 'y0': y0, 'x1': x1, 'y1': y1,
                'original_bbox': bbox  # Keep for output
            })

        # --- B. FEATURE EXTRACTION ---
        # Features must be computed after the full token list exists,
        # since both extractors look at neighbouring tokens.
        for i in range(len(tokens_for_inference)):
            tokens_for_inference[i]['spatial_features'] = extract_spatial_features(tokens_for_inference, i)
            tokens_for_inference[i]['context_features'] = extract_context_features(tokens_for_inference, i)

        # --- C. BATCH INFERENCE ---
        page_raw_predictions = []

        # Process in chunks (each chunk becomes a batch of size 1).
        for i in range(0, len(tokens_for_inference), INFERENCE_CHUNK_SIZE):
            chunk = tokens_for_inference[i : i + INFERENCE_CHUNK_SIZE]

            # Prepare Tensors
            w_ids = torch.LongTensor([[word_vocab[t['word']] for t in chunk]]).to(DEVICE)

            c_ids_list = []
            for t in chunk:
                chars = [char_vocab[c] for c in t['word'][:MAX_CHAR_LEN]]
                chars += [0] * (MAX_CHAR_LEN - len(chars))  # right-pad to fixed width
                c_ids_list.append(chars)
            c_ids = torch.LongTensor([c_ids_list]).to(DEVICE)

            # Coordinates are normalised by 1000 to match training.
            bboxes = torch.FloatTensor([[[t['x0']/1000.0, t['y0']/1000.0, t['x1']/1000.0, t['y1']/1000.0] for t in chunk]]).to(DEVICE)
            s_feats = torch.FloatTensor([[t['spatial_features'] for t in chunk]]).to(DEVICE)
            c_feats = torch.FloatTensor([[t['context_features'] for t in chunk]]).to(DEVICE)
            mask = torch.ones(w_ids.size(), dtype=torch.bool).to(DEVICE)

            # Predict
            with torch.no_grad():
                preds = model(w_ids, c_ids, bboxes, s_feats, c_feats, mask)[0]

            # --- D. FORMAT OUTPUT ---
            for t, p in zip(chunk, preds):
                label = IDX2LABEL[p]
                # Create the exact dictionary structure expected by the rest of the pipeline
                page_raw_predictions.append({
                    "word": t['word'],
                    "bbox": t['original_bbox'],  # Pass back the scaled bbox the pipeline uses
                    "predicted_label": label,
                    "page_number": page_num
                })

        if page_raw_predictions:
            final_page_predictions.append({
                "page_number": page_num,
                "data": page_raw_predictions
            })
            print(f" -> Page {page_num} Inference Complete: {len(page_raw_predictions)} labeled words.")

    return final_page_predictions
+
|
| 1941 |
+
|
| 1942 |
+
|
| 1943 |
# ============================================================================
|
| 1944 |
# --- PHASE 3: BIO TO STRUCTURED JSON DECODER ---
|
| 1945 |
# ============================================================================
|
|
|
|
| 2556 |
|
| 2557 |
|
| 2558 |
|
| 2559 |
+
# def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
|
| 2560 |
+
# if not os.path.exists(input_pdf_path):
|
| 2561 |
+
# print(f"β ERROR: File not found: {input_pdf_path}")
|
| 2562 |
+
# return None
|
| 2563 |
+
|
| 2564 |
+
# print("\n" + "#" * 80)
|
| 2565 |
+
# print("### STARTING OPTIMIZED FULL DOCUMENT ANALYSIS PIPELINE ###")
|
| 2566 |
+
# print(f"Input: {input_pdf_path}")
|
| 2567 |
+
# print("#" * 80)
|
| 2568 |
+
|
| 2569 |
+
# overall_start = time.time()
|
| 2570 |
+
# pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]
|
| 2571 |
+
# temp_pipeline_dir = os.path.join(tempfile.gettempdir(), f"pipeline_run_{pdf_name}_{os.getpid()}")
|
| 2572 |
+
# os.makedirs(temp_pipeline_dir, exist_ok=True)
|
| 2573 |
+
|
| 2574 |
+
# preprocessed_json_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_preprocessed.json")
|
| 2575 |
+
# raw_output_path = os.path.join(temp_pipeline_dir, f"{pdf_name}_raw_predictions.json")
|
| 2576 |
+
|
| 2577 |
+
# if structured_intermediate_output_path is None:
|
| 2578 |
+
# structured_intermediate_output_path = os.path.join(
|
| 2579 |
+
# temp_pipeline_dir, f"{pdf_name}_structured_intermediate.json"
|
| 2580 |
+
# )
|
| 2581 |
+
|
| 2582 |
+
# final_result = None
|
| 2583 |
+
# try:
|
| 2584 |
+
# # --- Phase 1: Preprocessing ---
|
| 2585 |
+
# print(f"\n[Step 1/5] Preprocessing (YOLO + Masking)...")
|
| 2586 |
+
# p1_start = time.time()
|
| 2587 |
+
# preprocessed_json_path_out = run_single_pdf_preprocessing(input_pdf_path, preprocessed_json_path)
|
| 2588 |
+
# if not preprocessed_json_path_out:
|
| 2589 |
+
# print("β FAILED at Step 1: Preprocessing returned None.")
|
| 2590 |
+
# return None
|
| 2591 |
+
# print(f"β
Step 1 Complete ({time.time() - p1_start:.2f}s)")
|
| 2592 |
+
|
| 2593 |
+
# # --- Phase 2: Inference ---
|
| 2594 |
+
# print(f"\n[Step 2/5] Inference (LayoutLMv3)...")
|
| 2595 |
+
# p2_start = time.time()
|
| 2596 |
+
# page_raw_predictions_list = run_inference_and_get_raw_words(
|
| 2597 |
+
# input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
|
| 2598 |
+
# )
|
| 2599 |
+
# if not page_raw_predictions_list:
|
| 2600 |
+
# print("β FAILED at Step 2: Inference returned no data.")
|
| 2601 |
+
# return None
|
| 2602 |
+
|
| 2603 |
+
# with open(raw_output_path, 'w', encoding='utf-8') as f:
|
| 2604 |
+
# json.dump(page_raw_predictions_list, f, indent=4)
|
| 2605 |
+
# print(f"β
Step 2 Complete ({time.time() - p2_start:.2f}s)")
|
| 2606 |
+
|
| 2607 |
+
# # --- Phase 3: Decoding ---
|
| 2608 |
+
# print(f"\n[Step 3/5] Decoding (BIO to Structured JSON)...")
|
| 2609 |
+
# p3_start = time.time()
|
| 2610 |
+
# structured_data_list = convert_bio_to_structured_json_relaxed(
|
| 2611 |
+
# raw_output_path, structured_intermediate_output_path
|
| 2612 |
+
# )
|
| 2613 |
+
# if not structured_data_list:
|
| 2614 |
+
# print("β FAILED at Step 3: BIO conversion failed.")
|
| 2615 |
+
# return None
|
| 2616 |
+
|
| 2617 |
+
# print("... Correcting misalignments and linking context ...")
|
| 2618 |
+
# structured_data_list = correct_misaligned_options(structured_data_list)
|
| 2619 |
+
# structured_data_list = process_context_linking(structured_data_list)
|
| 2620 |
+
# print(f"β
Step 3 Complete ({time.time() - p3_start:.2f}s)")
|
| 2621 |
+
|
| 2622 |
+
# # --- Phase 4: Base64 & LaTeX ---
|
| 2623 |
+
# print(f"\n[Step 4/5] Finalizing Layout (Base64 Images & LaTeX)...")
|
| 2624 |
+
# p4_start = time.time()
|
| 2625 |
+
# final_result = embed_images_as_base64_in_memory(structured_data_list, FIGURE_EXTRACTION_DIR)
|
| 2626 |
+
# if not final_result:
|
| 2627 |
+
# print("β FAILED at Step 4: Final formatting failed.")
|
| 2628 |
+
# return None
|
| 2629 |
+
# print(f"β
Step 4 Complete ({time.time() - p4_start:.2f}s)")
|
| 2630 |
+
|
| 2631 |
+
# # --- Phase 4.5: Question Type Classification ---
|
| 2632 |
+
# print(f"\n[Step 4.5/5] Adding Question Type Classification...")
|
| 2633 |
+
# p4_5_start = time.time()
|
| 2634 |
+
# final_result = add_question_type_validation(final_result)
|
| 2635 |
+
# print(f"β
Step 4.5 Complete ({time.time() - p4_5_start:.2f}s)")
|
| 2636 |
+
|
| 2637 |
+
# # --- Phase 5: Hierarchical Tagging ---
|
| 2638 |
+
# print(f"\n[Step 5/5] AI Classification (Subject/Concept Tagging)...")
|
| 2639 |
+
# p5_start = time.time()
|
| 2640 |
+
# classifier = HierarchicalClassifier()
|
| 2641 |
+
# if classifier.load_models():
|
| 2642 |
+
# final_result = post_process_json_with_inference(final_result, classifier)
|
| 2643 |
+
# print(f"β
Step 5 Complete: Tags added ({time.time() - p5_start:.2f}s)")
|
| 2644 |
+
# else:
|
| 2645 |
+
# print("β οΈ WARNING: Classifier models failed to load. Skipping tags.")
|
| 2646 |
+
|
| 2647 |
+
# # ============================================================
|
| 2648 |
+
# # π§ NEW STEP: FILTER OUT METADATA ENTRIES
|
| 2649 |
+
# # ============================================================
|
| 2650 |
+
# print(f"\n[Post-Processing] Removing METADATA entries...")
|
| 2651 |
+
# initial_count = len(final_result)
|
| 2652 |
+
# final_result = [item for item in final_result if item.get('type') != 'METADATA']
|
| 2653 |
+
# removed_count = initial_count - len(final_result)
|
| 2654 |
+
# print(f"β
Removed {removed_count} METADATA entries. {len(final_result)} questions remain.")
|
| 2655 |
+
# # ============================================================
|
| 2656 |
+
|
| 2657 |
+
# except Exception as e:
|
| 2658 |
+
# print(f"\nβΌοΈ FATAL PIPELINE EXCEPTION:")
|
| 2659 |
+
# print(f"Error Message: {str(e)}")
|
| 2660 |
+
# traceback.print_exc()
|
| 2661 |
+
# return None
|
| 2662 |
+
|
| 2663 |
+
# # finally:
|
| 2664 |
+
# # print(f"\nCleaning up temporary files in {temp_pipeline_dir}...")
|
| 2665 |
+
# # try:
|
| 2666 |
+
# # for f in glob.glob(os.path.join(temp_pipeline_dir, '*')):
|
| 2667 |
+
# # os.remove(f)
|
| 2668 |
+
# # os.rmdir(temp_pipeline_dir)
|
| 2669 |
+
# # print("π§Ή Cleanup successful.")
|
| 2670 |
+
# # except Exception as e:
|
| 2671 |
+
# # print(f"β οΈ Cleanup failed: {e}")
|
| 2672 |
+
|
| 2673 |
+
# total_time = time.time() - overall_start
|
| 2674 |
+
# print("\n" + "#" * 80)
|
| 2675 |
+
# print(f"### PIPELINE COMPLETE | Total Time: {total_time:.2f}s ###")
|
| 2676 |
+
# print("#" * 80)
|
| 2677 |
+
|
| 2678 |
+
# return final_result
|
| 2679 |
+
|
| 2680 |
+
|
| 2681 |
+
|
| 2682 |
def run_document_pipeline(input_pdf_path: str, layoutlmv3_model_path: str, structured_intermediate_output_path: Optional[str] = None) -> Optional[List[Dict[str, Any]]]:
|
| 2683 |
if not os.path.exists(input_pdf_path):
|
| 2684 |
print(f"β ERROR: File not found: {input_pdf_path}")
|
|
|
|
| 2713 |
return None
|
| 2714 |
print(f"β
Step 1 Complete ({time.time() - p1_start:.2f}s)")
|
| 2715 |
|
| 2716 |
+
# --- Phase 2: Inference (MODIFIED) ---
|
| 2717 |
+
print(f"\n[Step 2/5] Inference (Custom Model)...")
|
| 2718 |
p2_start = time.time()
|
| 2719 |
+
|
| 2720 |
+
# -------------------------------------------------------------------------
|
| 2721 |
+
# --- COMMENTED OUT OLD LAYOUTLMV3 CALL FOR REVERSION ---
|
| 2722 |
+
# page_raw_predictions_list = run_inference_and_get_raw_words(
|
| 2723 |
+
# input_pdf_path, layoutlmv3_model_path, preprocessed_json_path_out
|
| 2724 |
+
# )
|
| 2725 |
+
# -------------------------------------------------------------------------
|
| 2726 |
+
|
| 2727 |
+
# --- NEW CUSTOM MODEL CALL ---
|
| 2728 |
+
# Note: We only pass the JSON path because the custom function
|
| 2729 |
+
# doesn't need to re-read the PDF or use the layoutlmv3 model path.
|
| 2730 |
+
page_raw_predictions_list = run_custom_inference_and_get_raw_words(
|
| 2731 |
+
preprocessed_json_path_out
|
| 2732 |
)
|
| 2733 |
+
# -----------------------------
|
| 2734 |
+
|
| 2735 |
if not page_raw_predictions_list:
|
| 2736 |
print("β FAILED at Step 2: Inference returned no data.")
|
| 2737 |
return None
|