Spaces:

heerjtdev
/

hybrid_inference

Sleeping

App Files Files Community

heerjtdev committed on Jan 22

Commit

f1c0953

verified ·

1 Parent(s): 02f7b52

Update app.py

Browse files

Files changed (1) hide show

app.py +417 -66

app.py CHANGED Viewed

@@ -1,3 +1,375 @@
 import gradio as gr
 import torch
 import torch.nn as nn
@@ -11,33 +383,28 @@ from TorchCRF import CRF
 # ---------------------------------------------------------
 # 1. CONFIGURATION
 # ---------------------------------------------------------
-# Ensure this filename matches exactly what you uploaded to the Space
 MODEL_FILENAME = "layoutlmv3_bilstm_crf_hybrid.pth"
 BASE_MODEL_ID = "microsoft/layoutlmv3-base"
-# Define your labels exactly as they were during training
 LABELS = [
     "O",
     "B-QUESTION", "I-QUESTION",
     "B-OPTION", "I-OPTION",
     "B-ANSWER", "I-ANSWER",
     "B-SECTION_HEADING", "I-SECTION_HEADING",
-    "B-PASSAGE", "I-PASSAGE"
 ]
 LABEL2ID = {l: i for i, l in enumerate(LABELS)}
 ID2LABEL = {i: l for l, i in LABEL2ID.items()}
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 tokenizer = LayoutLMv3TokenizerFast.from_pretrained(BASE_MODEL_ID)
-# ---------------------------------------------------------
-# 2. MODEL ARCHITECTURE
-# ---------------------------------------------------------
-# ⚠️ ACTION REQUIRED:
-# Replace this class with the exact class definition of your
-# NEW HYBRID MODEL. The class name and structure must match
-# what was used when you saved 'layoutlmv3_nonlinear_scratch.pth'.
-# ---------------------------------------------------------
 # ---------------------------------------------------------
 # 2. MODEL ARCHITECTURE (LayoutLMv3 + BiLSTM + CRF)
 # ---------------------------------------------------------
@@ -46,52 +413,47 @@ class HybridModel(nn.Module):
         super().__init__()
         self.layoutlm = LayoutLMv3Model.from_pretrained(BASE_MODEL_ID)
-        # Config for BiLSTM
-        hidden_size = self.layoutlm.config.hidden_size # Usually 768
-        lstm_hidden_size = hidden_size // 2  # 384, so bidirectional output is 768
-        # BiLSTM Layer
-        # input_size=768, hidden=384, bidir=True -> output_dim = 384 * 2 = 768
         self.lstm = nn.LSTM(
-            input_size=hidden_size,
-            hidden_size=lstm_hidden_size,
-            num_layers=1,
             batch_first=True,
             bidirectional=True
         )
-        # Dropout (Optional, check if you used this in training)
         self.dropout = nn.Dropout(0.1)
-        # Classifier: Maps BiLSTM output (768) to Label count
         self.classifier = nn.Linear(lstm_hidden_size * 2, num_labels)
-        # CRF Layer
         self.crf = CRF(num_labels)
     def forward(self, input_ids, bbox, attention_mask, labels=None):
-        # 1. LayoutLMv3 Base
         outputs = self.layoutlm(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask)
-        sequence_output = outputs.last_hidden_state  # [Batch, Seq, 768]
-        # 2. BiLSTM
-        # LSTM returns (output, (h_n, c_n)). We only need output.
-        lstm_output, _ = self.lstm(sequence_output)  # [Batch, Seq, 768]
-        # 3. Dropout & Classifier
         lstm_output = self.dropout(lstm_output)
-        emissions = self.classifier(lstm_output)     # [Batch, Seq, Num_Labels]
-        # 4. CRF
         if labels is not None:
-            # Training/Eval (Loss)
             log_likelihood = self.crf(emissions, labels, mask=attention_mask.bool())
             return -log_likelihood.mean()
         else:
-            # Inference (Prediction Tags)
             return self.crf.viterbi_decode(emissions, mask=attention_mask.bool())
 # ---------------------------------------------------------
-# 3. MODEL LOADING LOGIC
 # ---------------------------------------------------------
 model = None
@@ -100,17 +462,18 @@ def load_model():
     if model is None:
         print(f"🔄 Loading model from {MODEL_FILENAME}...")
         if not os.path.exists(MODEL_FILENAME):
-            raise FileNotFoundError(f"❌ Model file '{MODEL_FILENAME}' not found. Please upload it to the Files tab of your Space.")
-        # Initialize the model structure
         model = HybridModel(num_labels=len(LABELS))
-        # Load weights
         try:
-            state_dict = torch.load(MODEL_FILENAME, map_location=device)
             model.load_state_dict(state_dict)
         except RuntimeError as e:
-            raise RuntimeError(f"❌ State dictionary mismatch. Ensure the 'HybridModel' class structure in app.py matches the model you trained.\nDetails: {e}")
         model.to(device)
         model.eval()
@@ -118,7 +481,7 @@ def load_model():
     return model
 # ---------------------------------------------------------
-# 4. JSON CONVERSION LOGIC (Your Custom Logic)
 # ---------------------------------------------------------
 def convert_bio_to_structured_json(predictions):
     structured_data = []
@@ -138,7 +501,6 @@ def convert_bio_to_structured_json(predictions):
             else: item['passage'] = passage_text
         passage_buffer.clear()
-    # Flatten predictions list if strictly page-separated
     flat_predictions = []
     for page in predictions:
         flat_predictions.extend(page['data'])
@@ -146,9 +508,16 @@ def convert_bio_to_structured_json(predictions):
     for idx, item in enumerate(flat_predictions):
         word = item['word']
         label = item['predicted_label']
         entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
-        current_text_buffer.append(word)
         previous_entity_type = last_entity_type
         is_passage_label = (entity_type == 'PASSAGE')
@@ -242,7 +611,6 @@ def convert_bio_to_structured_json(predictions):
         current_item['text'] = ' '.join(current_text_buffer).strip()
         structured_data.append(current_item)
-    # Final Cleanup
     for item in structured_data:
         if 'text' in item: item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
         if 'new_passage' in item: item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
@@ -250,7 +618,7 @@ def convert_bio_to_structured_json(predictions):
     return structured_data
 # ---------------------------------------------------------
-# 5. INFERENCE PIPELINE
 # ---------------------------------------------------------
 def process_pdf(pdf_file):
     if pdf_file is None:
@@ -259,7 +627,6 @@ def process_pdf(pdf_file):
     try:
         active_model = load_model()
-        # A. Extract Text and Boxes
         extracted_pages = []
         with pdfplumber.open(pdf_file.name) as pdf:
             for page_idx, page in enumerate(pdf.pages):
@@ -271,28 +638,22 @@ def process_pdf(pdf_file):
                 for w in words_data:
                     text = w['text']
-                    # Normalize bbox to 0-1000 scale
                     x0 = int((w['x0'] / width) * 1000)
                     top = int((w['top'] / height) * 1000)
                     x1 = int((w['x1'] / width) * 1000)
                     bottom = int((w['bottom'] / height) * 1000)
-                    # Safety clamp
                     box = [max(0, min(x0, 1000)), max(0, min(top, 1000)),
                            max(0, min(x1, 1000)), max(0, min(bottom, 1000))]
                     page_tokens.append(text)
                     page_bboxes.append(box)
                 extracted_pages.append({"page_id": page_idx, "tokens": page_tokens, "bboxes": page_bboxes})
-        # B. Run Inference
         raw_predictions = []
         for page in extracted_pages:
             tokens = page['tokens']
             bboxes = page['bboxes']
             if not tokens: continue
-            # Tokenize
             encoding = tokenizer(
                 tokens,
                 boxes=bboxes,
@@ -307,18 +668,12 @@ def process_pdf(pdf_file):
             bbox = encoding.bbox.to(device)
             attention_mask = encoding.attention_mask.to(device)
-            # Predict
             with torch.no_grad():
-                # NOTE: If your hybrid model requires 'pixel_values',
-                # you will need to add image extraction logic above and pass it here.
-                preds = active_model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask)
-                # Check if preds returns a tuple (loss, tags) or just tags
-                # The CRF implementation usually returns a list of lists of tags in viterbi_decode
-                pred_tags = preds[0] if isinstance(preds, tuple) else preds[0]
-                # Note: Standard CRF.viterbi_decode returns List[List[int]], so [0] gets the first batch item
-            # Alignment
             word_ids = encoding.word_ids()
             aligned_data = []
             prev_word_idx = None
@@ -326,20 +681,16 @@ def process_pdf(pdf_file):
             for i, word_idx in enumerate(word_ids):
                 if word_idx is None: continue
                 if word_idx != prev_word_idx:
-                    # pred_tags is likely a list of ints.
-                    # If pred_tags[i] fails, your max_length might be cutting off tags,
-                    # or the model output shape differs from the token length.
                     if i < len(pred_tags):
                         label_id = pred_tags[i]
                         label_str = ID2LABEL.get(label_id, "O")
                         aligned_data.append({"word": tokens[word_idx], "predicted_label": label_str})
                 prev_word_idx = word_idx
             raw_predictions.append({"data": aligned_data})
-        # C. Convert to Structured JSON
         final_json = convert_bio_to_structured_json(raw_predictions)
-        # Save output
         output_filename = "structured_output.json"
         with open(output_filename, "w", encoding="utf-8") as f:
             json.dump(final_json, f, indent=2, ensure_ascii=False)
@@ -360,7 +711,7 @@ iface = gr.Interface(
         gr.File(label="Download JSON Output"),
         gr.Textbox(label="Status Log", lines=10)
     ],
-    title="Hybrid Model Inference: PDF to JSON",
     description="Upload a document to extract structured data using the custom Hybrid LayoutLMv3 model.",
     flagging_mode="never"
 )

+# import gradio as gr
+# import torch
+# import torch.nn as nn
+# import pdfplumber
+# import json
+# import os
+# import re
+# from transformers import LayoutLMv3TokenizerFast, LayoutLMv3Model
+# from TorchCRF import CRF
+# # ---------------------------------------------------------
+# # 1. CONFIGURATION
+# # ---------------------------------------------------------
+# # Ensure this filename matches exactly what you uploaded to the Space
+# MODEL_FILENAME = "layoutlmv3_bilstm_crf_hybrid.pth"
+# BASE_MODEL_ID = "microsoft/layoutlmv3-base"
+# # Define your labels exactly as they were during training
+# LABELS = [
+#     "O",
+#     "B-QUESTION", "I-QUESTION",
+#     "B-OPTION", "I-OPTION",
+#     "B-ANSWER", "I-ANSWER",
+#     "B-SECTION_HEADING", "I-SECTION_HEADING",
+#     "B-PASSAGE", "I-PASSAGE"
+# ]
+# LABEL2ID = {l: i for i, l in enumerate(LABELS)}
+# ID2LABEL = {i: l for l, i in LABEL2ID.items()}
+# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# tokenizer = LayoutLMv3TokenizerFast.from_pretrained(BASE_MODEL_ID)
+# # ---------------------------------------------------------
+# # 2. MODEL ARCHITECTURE
+# # ---------------------------------------------------------
+# # ⚠️ ACTION REQUIRED:
+# # Replace this class with the exact class definition of your
+# # NEW HYBRID MODEL. The class name and structure must match
+# # what was used when you saved 'layoutlmv3_bilstm_crf_hybrid.pth'.
+# # ---------------------------------------------------------
+# # ---------------------------------------------------------
+# # 2. MODEL ARCHITECTURE (LayoutLMv3 + BiLSTM + CRF)
+# # ---------------------------------------------------------
+# class HybridModel(nn.Module):
+#     def __init__(self, num_labels):
+#         super().__init__()
+#         self.layoutlm = LayoutLMv3Model.from_pretrained(BASE_MODEL_ID)
+#         # Config for BiLSTM
+#         hidden_size = self.layoutlm.config.hidden_size # Usually 768
+#         lstm_hidden_size = hidden_size // 2  # 384, so bidirectional output is 768
+#         # BiLSTM Layer
+#         # input_size=768, hidden=384, bidir=True -> output_dim = 384 * 2 = 768
+#         self.lstm = nn.LSTM(
+#             input_size=hidden_size,
+#             hidden_size=lstm_hidden_size,
+#             num_layers=1,
+#             batch_first=True,
+#             bidirectional=True
+#         )
+#         # Dropout (Optional, check if you used this in training)
+#         self.dropout = nn.Dropout(0.1)
+#         # Classifier: Maps BiLSTM output (768) to Label count
+#         self.classifier = nn.Linear(lstm_hidden_size * 2, num_labels)
+#         # CRF Layer
+#         self.crf = CRF(num_labels)
+#     def forward(self, input_ids, bbox, attention_mask, labels=None):
+#         # 1. LayoutLMv3 Base
+#         outputs = self.layoutlm(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask)
+#         sequence_output = outputs.last_hidden_state  # [Batch, Seq, 768]
+#         # 2. BiLSTM
+#         # LSTM returns (output, (h_n, c_n)). We only need output.
+#         lstm_output, _ = self.lstm(sequence_output)  # [Batch, Seq, 768]
+#         # 3. Dropout & Classifier
+#         lstm_output = self.dropout(lstm_output)
+#         emissions = self.classifier(lstm_output)     # [Batch, Seq, Num_Labels]
+#         # 4. CRF
+#         if labels is not None:
+#             # Training/Eval (Loss)
+#             log_likelihood = self.crf(emissions, labels, mask=attention_mask.bool())
+#             return -log_likelihood.mean()
+#         else:
+#             # Inference (Prediction Tags)
+#             return self.crf.viterbi_decode(emissions, mask=attention_mask.bool())
+# # ---------------------------------------------------------
+# # 3. MODEL LOADING LOGIC
+# # ---------------------------------------------------------
+# model = None
+# def load_model():
+#     global model
+#     if model is None:
+#         print(f"🔄 Loading model from {MODEL_FILENAME}...")
+#         if not os.path.exists(MODEL_FILENAME):
+#             raise FileNotFoundError(f"❌ Model file '{MODEL_FILENAME}' not found. Please upload it to the Files tab of your Space.")
+#         # Initialize the model structure
+#         model = HybridModel(num_labels=len(LABELS))
+#         # Load weights
+#         try:
+#             state_dict = torch.load(MODEL_FILENAME, map_location=device)
+#             model.load_state_dict(state_dict)
+#         except RuntimeError as e:
+#             raise RuntimeError(f"❌ State dictionary mismatch. Ensure the 'HybridModel' class structure in app.py matches the model you trained.\nDetails: {e}")
+#         model.to(device)
+#         model.eval()
+#         print("✅ Model loaded successfully.")
+#     return model
+# # ---------------------------------------------------------
+# # 4. JSON CONVERSION LOGIC (Your Custom Logic)
+# # ---------------------------------------------------------
+# def convert_bio_to_structured_json(predictions):
+#     structured_data = []
+#     current_item = None
+#     current_option_key = None
+#     current_passage_buffer = []
+#     current_text_buffer = []
+#     first_question_started = False
+#     last_entity_type = None
+#     just_finished_i_option = False
+#     is_in_new_passage = False
+#     def finalize_passage_to_item(item, passage_buffer):
+#         if passage_buffer:
+#             passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
+#             if item.get('passage'): item['passage'] += ' ' + passage_text
+#             else: item['passage'] = passage_text
+#         passage_buffer.clear()
+#     # Flatten predictions list if strictly page-separated
+#     flat_predictions = []
+#     for page in predictions:
+#         flat_predictions.extend(page['data'])
+#     for idx, item in enumerate(flat_predictions):
+#         word = item['word']
+#         label = item['predicted_label']
+#         entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
+#         current_text_buffer.append(word)
+#         previous_entity_type = last_entity_type
+#         is_passage_label = (entity_type == 'PASSAGE')
+#         if not first_question_started:
+#             if label != 'B-QUESTION' and not is_passage_label:
+#                 just_finished_i_option = False
+#                 is_in_new_passage = False
+#                 continue
+#             if is_passage_label:
+#                 current_passage_buffer.append(word)
+#                 last_entity_type = 'PASSAGE'
+#                 just_finished_i_option = False
+#                 is_in_new_passage = False
+#                 continue
+#         if label == 'B-QUESTION':
+#             if not first_question_started:
+#                 header_text = ' '.join(current_text_buffer[:-1]).strip()
+#                 if header_text or current_passage_buffer:
+#                     metadata_item = {'type': 'METADATA', 'passage': ''}
+#                     finalize_passage_to_item(metadata_item, current_passage_buffer)
+#                     if header_text: metadata_item['text'] = header_text
+#                     structured_data.append(metadata_item)
+#                 first_question_started = True
+#                 current_text_buffer = [word]
+#             if current_item is not None:
+#                 finalize_passage_to_item(current_item, current_passage_buffer)
+#                 current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
+#                 structured_data.append(current_item)
+#                 current_text_buffer = [word]
+#             current_item = {
+#                 'question': word, 'options': {}, 'answer': '', 'passage': '', 'text': ''
+#             }
+#             current_option_key = None
+#             last_entity_type = 'QUESTION'
+#             just_finished_i_option = False
+#             is_in_new_passage = False
+#             continue
+#         if current_item is not None:
+#             if is_in_new_passage:
+#                 if 'new_passage' not in current_item: current_item['new_passage'] = word
+#                 else: current_item['new_passage'] += f' {word}'
+#                 if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
+#                     is_in_new_passage = False
+#                 if label.startswith(('B-', 'I-')): last_entity_type = entity_type
+#                 continue
+#             is_in_new_passage = False
+#             if label.startswith('B-'):
+#                 if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
+#                     finalize_passage_to_item(current_item, current_passage_buffer)
+#                     current_passage_buffer = []
+#                 last_entity_type = entity_type
+#                 if entity_type == 'PASSAGE':
+#                     if previous_entity_type == 'OPTION' and just_finished_i_option:
+#                         current_item['new_passage'] = word
+#                         is_in_new_passage = True
+#                     else: current_passage_buffer.append(word)
+#                 elif entity_type == 'OPTION':
+#                     current_option_key = word
+#                     current_item['options'][current_option_key] = word
+#                     just_finished_i_option = False
+#                 elif entity_type == 'ANSWER':
+#                     current_item['answer'] = word
+#                     current_option_key = None
+#                     just_finished_i_option = False
+#                 elif entity_type == 'QUESTION':
+#                     current_item['question'] += f' {word}'
+#                     just_finished_i_option = False
+#             elif label.startswith('I-'):
+#                 if entity_type == 'QUESTION': current_item['question'] += f' {word}'
+#                 elif entity_type == 'PASSAGE':
+#                     if previous_entity_type == 'OPTION' and just_finished_i_option:
+#                         current_item['new_passage'] = word
+#                         is_in_new_passage = True
+#                     else:
+#                         if not current_passage_buffer: last_entity_type = 'PASSAGE'
+#                         current_passage_buffer.append(word)
+#                 elif entity_type == 'OPTION' and current_option_key is not None:
+#                     current_item['options'][current_option_key] += f' {word}'
+#                     just_finished_i_option = True
+#                 elif entity_type == 'ANSWER': current_item['answer'] += f' {word}'
+#                 just_finished_i_option = (entity_type == 'OPTION')
+#     if current_item is not None:
+#         finalize_passage_to_item(current_item, current_passage_buffer)
+#         current_item['text'] = ' '.join(current_text_buffer).strip()
+#         structured_data.append(current_item)
+#     # Final Cleanup
+#     for item in structured_data:
+#         if 'text' in item: item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
+#         if 'new_passage' in item: item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
+#     return structured_data
+# # ---------------------------------------------------------
+# # 5. INFERENCE PIPELINE
+# # ---------------------------------------------------------
+# def process_pdf(pdf_file):
+#     if pdf_file is None:
+#         return None, "⚠️ Please upload a PDF file."
+#     try:
+#         active_model = load_model()
+#         # A. Extract Text and Boxes
+#         extracted_pages = []
+#         with pdfplumber.open(pdf_file.name) as pdf:
+#             for page_idx, page in enumerate(pdf.pages):
+#                 width, height = page.width, page.height
+#                 words_data = page.extract_words()
+#                 page_tokens = []
+#                 page_bboxes = []
+#                 for w in words_data:
+#                     text = w['text']
+#                     # Normalize bbox to 0-1000 scale
+#                     x0 = int((w['x0'] / width) * 1000)
+#                     top = int((w['top'] / height) * 1000)
+#                     x1 = int((w['x1'] / width) * 1000)
+#                     bottom = int((w['bottom'] / height) * 1000)
+#                     # Safety clamp
+#                     box = [max(0, min(x0, 1000)), max(0, min(top, 1000)),
+#                            max(0, min(x1, 1000)), max(0, min(bottom, 1000))]
+#                     page_tokens.append(text)
+#                     page_bboxes.append(box)
+#                 extracted_pages.append({"page_id": page_idx, "tokens": page_tokens, "bboxes": page_bboxes})
+#         # B. Run Inference
+#         raw_predictions = []
+#         for page in extracted_pages:
+#             tokens = page['tokens']
+#             bboxes = page['bboxes']
+#             if not tokens: continue
+#             # Tokenize
+#             encoding = tokenizer(
+#                 tokens,
+#                 boxes=bboxes,
+#                 return_tensors="pt",
+#                 padding="max_length",
+#                 truncation=True,
+#                 max_length=512,
+#                 return_offsets_mapping=True
+#             )
+#             input_ids = encoding.input_ids.to(device)
+#             bbox = encoding.bbox.to(device)
+#             attention_mask = encoding.attention_mask.to(device)
+#             # Predict
+#             with torch.no_grad():
+#                 # NOTE: If your hybrid model requires 'pixel_values',
+#                 # you will need to add image extraction logic above and pass it here.
+#                 preds = active_model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask)
+#                 # Check if preds returns a tuple (loss, tags) or just tags
+#                 # The CRF implementation usually returns a list of lists of tags in viterbi_decode
+#                 pred_tags = preds[0] if isinstance(preds, tuple) else preds[0]
+#                 # Note: Standard CRF.viterbi_decode returns List[List[int]], so [0] gets the first batch item
+#             # Alignment
+#             word_ids = encoding.word_ids()
+#             aligned_data = []
+#             prev_word_idx = None
+#             for i, word_idx in enumerate(word_ids):
+#                 if word_idx is None: continue
+#                 if word_idx != prev_word_idx:
+#                     # pred_tags is likely a list of ints.
+#                     # If pred_tags[i] fails, your max_length might be cutting off tags,
+#                     # or the model output shape differs from the token length.
+#                     if i < len(pred_tags):
+#                         label_id = pred_tags[i]
+#                         label_str = ID2LABEL.get(label_id, "O")
+#                         aligned_data.append({"word": tokens[word_idx], "predicted_label": label_str})
+#                 prev_word_idx = word_idx
+#             raw_predictions.append({"data": aligned_data})
+#         # C. Convert to Structured JSON
+#         final_json = convert_bio_to_structured_json(raw_predictions)
+#         # Save output
+#         output_filename = "structured_output.json"
+#         with open(output_filename, "w", encoding="utf-8") as f:
+#             json.dump(final_json, f, indent=2, ensure_ascii=False)
+#         return output_filename, f"✅ Success! Processed {len(extracted_pages)} pages. Extracted {len(final_json)} items."
+#     except Exception as e:
+#         import traceback
+#         return None, f"❌ Error:\n{str(e)}\n\nTraceback:\n{traceback.format_exc()}"
+# # ---------------------------------------------------------
+# # 6. GRADIO INTERFACE
+# # ---------------------------------------------------------
+# iface = gr.Interface(
+#     fn=process_pdf,
+#     inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
+#     outputs=[
+#         gr.File(label="Download JSON Output"),
+#         gr.Textbox(label="Status Log", lines=10)
+#     ],
+#     title="Hybrid Model Inference: PDF to JSON",
+#     description="Upload a document to extract structured data using the custom Hybrid LayoutLMv3 model.",
+#     flagging_mode="never"
+# )
+# if __name__ == "__main__":
+#     iface.launch()
 import gradio as gr
 import torch
 import torch.nn as nn
 # ---------------------------------------------------------
 # 1. CONFIGURATION
 # ---------------------------------------------------------
 MODEL_FILENAME = "layoutlmv3_bilstm_crf_hybrid.pth"
 BASE_MODEL_ID = "microsoft/layoutlmv3-base"
+# Labels: 11 Standard BIO tags + 2 Special tokens = 13 Total
+# NOTE: If your output labels look "scrambled" (e.g., Questions detected as Options),
+# try moving "UNK" and "PAD" to the BEGINNING of this list (indices 0 and 1).
 LABELS = [
     "O",
     "B-QUESTION", "I-QUESTION",
     "B-OPTION", "I-OPTION",
     "B-ANSWER", "I-ANSWER",
     "B-SECTION_HEADING", "I-SECTION_HEADING",
+    "B-PASSAGE", "I-PASSAGE",
+    "UNK", "PAD"  # Added to match the 13-label count in your weights
 ]
 LABEL2ID = {l: i for i, l in enumerate(LABELS)}
 ID2LABEL = {i: l for l, i in LABEL2ID.items()}
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 tokenizer = LayoutLMv3TokenizerFast.from_pretrained(BASE_MODEL_ID)
 # ---------------------------------------------------------
 # 2. MODEL ARCHITECTURE (LayoutLMv3 + BiLSTM + CRF)
 # ---------------------------------------------------------
         super().__init__()
         self.layoutlm = LayoutLMv3Model.from_pretrained(BASE_MODEL_ID)
+        # Structure derived from your error log:
+        # an LSTM input weight has shape [4 * hidden_size, input_size] (four gates stacked),
+        # so the checkpoint's [1024, 768] implies hidden_size = 1024 / 4 = 256
+        lstm_hidden_size = 256
         self.lstm = nn.LSTM(
+            input_size=768,        # LayoutLMv3 output size
+            hidden_size=lstm_hidden_size,
+            num_layers=2,          # Error log showed 'l1' weights, meaning 2 layers
             batch_first=True,
             bidirectional=True
         )
         self.dropout = nn.Dropout(0.1)
+        # Classifier input = lstm_hidden * 2 (bidirectional) = 256 * 2 = 512
+        # This matches your error log shape [13, 512]
         self.classifier = nn.Linear(lstm_hidden_size * 2, num_labels)
         self.crf = CRF(num_labels)
     def forward(self, input_ids, bbox, attention_mask, labels=None):
         outputs = self.layoutlm(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask)
+        sequence_output = outputs.last_hidden_state
+        # BiLSTM
+        lstm_output, _ = self.lstm(sequence_output)
+        # Classifier
         lstm_output = self.dropout(lstm_output)
+        emissions = self.classifier(lstm_output)
         if labels is not None:
+            # Training/Eval loss
             log_likelihood = self.crf(emissions, labels, mask=attention_mask.bool())
             return -log_likelihood.mean()
         else:
+            # Inference prediction
             return self.crf.viterbi_decode(emissions, mask=attention_mask.bool())
 # ---------------------------------------------------------
+# 3. MODEL LOADING
 # ---------------------------------------------------------
 model = None
     if model is None:
         print(f"🔄 Loading model from {MODEL_FILENAME}...")
         if not os.path.exists(MODEL_FILENAME):
+            raise FileNotFoundError(f"❌ Model file '{MODEL_FILENAME}' not found.")
         model = HybridModel(num_labels=len(LABELS))
+        # Load state dictionary
+        state_dict = torch.load(MODEL_FILENAME, map_location=device)
+        # Try loading. A wrong label *count* still raises a shape error here;
+        # a wrong label *order* loads without error and only shows up as scrambled predictions.
         try:
             model.load_state_dict(state_dict)
         except RuntimeError as e:
+            raise RuntimeError(f"❌ Weight mismatch! \nYour model has {len(LABELS)} labels defined in script.\nCheck if 'LABELS' list needs reordering or resizing.\nDetailed Error: {e}")
         model.to(device)
         model.eval()
     return model
 # ---------------------------------------------------------
+# 4. JSON CONVERSION LOGIC
 # ---------------------------------------------------------
 def convert_bio_to_structured_json(predictions):
     structured_data = []
             else: item['passage'] = passage_text
         passage_buffer.clear()
     flat_predictions = []
     for page in predictions:
         flat_predictions.extend(page['data'])
     for idx, item in enumerate(flat_predictions):
         word = item['word']
         label = item['predicted_label']
+        # Clean label (remove B- / I-)
         entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
+        # Non-entity labels ("O" and the special "UNK"/"PAD" tags) only add the word to the running text buffer
+        if label in ["UNK", "PAD", "O"]:
+            current_text_buffer.append(word)
+            continue
+        current_text_buffer.append(word)
         previous_entity_type = last_entity_type
         is_passage_label = (entity_type == 'PASSAGE')
         current_item['text'] = ' '.join(current_text_buffer).strip()
         structured_data.append(current_item)
     for item in structured_data:
         if 'text' in item: item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
         if 'new_passage' in item: item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()
     return structured_data
 # ---------------------------------------------------------
+# 5. PROCESSING PIPELINE
 # ---------------------------------------------------------
 def process_pdf(pdf_file):
     if pdf_file is None:
     try:
         active_model = load_model()
         extracted_pages = []
         with pdfplumber.open(pdf_file.name) as pdf:
             for page_idx, page in enumerate(pdf.pages):
                 for w in words_data:
                     text = w['text']
                     x0 = int((w['x0'] / width) * 1000)
                     top = int((w['top'] / height) * 1000)
                     x1 = int((w['x1'] / width) * 1000)
                     bottom = int((w['bottom'] / height) * 1000)
                     box = [max(0, min(x0, 1000)), max(0, min(top, 1000)),
                            max(0, min(x1, 1000)), max(0, min(bottom, 1000))]
                     page_tokens.append(text)
                     page_bboxes.append(box)
                 extracted_pages.append({"page_id": page_idx, "tokens": page_tokens, "bboxes": page_bboxes})
         raw_predictions = []
         for page in extracted_pages:
             tokens = page['tokens']
             bboxes = page['bboxes']
             if not tokens: continue
             encoding = tokenizer(
                 tokens,
                 boxes=bboxes,
             bbox = encoding.bbox.to(device)
             attention_mask = encoding.attention_mask.to(device)
             with torch.no_grad():
+                # Get the tag indices from the CRF layer
+                pred_tags = active_model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask)
+                # viterbi_decode returns one tag list per sequence ([[tags...]]); a page is presumably a batch of one, so take the first
+                pred_tags = pred_tags[0]
             word_ids = encoding.word_ids()
             aligned_data = []
             prev_word_idx = None
             for i, word_idx in enumerate(word_ids):
                 if word_idx is None: continue
                 if word_idx != prev_word_idx:
                     if i < len(pred_tags):
                         label_id = pred_tags[i]
+                        # Safe retrieval of label string
                         label_str = ID2LABEL.get(label_id, "O")
                         aligned_data.append({"word": tokens[word_idx], "predicted_label": label_str})
                 prev_word_idx = word_idx
             raw_predictions.append({"data": aligned_data})
         final_json = convert_bio_to_structured_json(raw_predictions)
         output_filename = "structured_output.json"
         with open(output_filename, "w", encoding="utf-8") as f:
             json.dump(final_json, f, indent=2, ensure_ascii=False)
         gr.File(label="Download JSON Output"),
         gr.Textbox(label="Status Log", lines=10)
     ],
+    title="LayoutLMv3 + BiLSTM Hybrid Model Inference",
     description="Upload a document to extract structured data using the custom Hybrid LayoutLMv3 model.",
     flagging_mode="never"
 )