Spaces:

heerjtdev
/

edugenius

Sleeping

App Files Files Community

aagamjtdev commited on Oct 14, 2025

Commit

5b8a1d1

1 Parent(s): 4acd43e

Fix: Full Script, structured data

Browse files

Files changed (1) hide show

app.py +1195 -107

app.py CHANGED Viewed

@@ -1,3 +1,977 @@
 import os
 import json
 import pickle
@@ -6,29 +980,29 @@ from collections import Counter
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from tqdm import tqdm
 # === GRADIO AND DEPENDENCIES ===
 import gradio as gr
 import fitz  # PyMuPDF
-import re
 from PIL import Image, ImageEnhance
 import pytesseract
 try:
     from TorchCRF import CRF
 except ImportError:
-    # This should be handled in requirements.txt for the Space
-    print("CRF module not found. Assuming deployment environment will install it.")
     class CRF:
-        def __init__(self, *args, **kwargs): pass
-        def viterbi_decode(self, emissions, mask): return [list(torch.argmax(emissions[0], dim=-1).cpu().numpy())]
 # ========== CONFIG (Must match Training Script) ==========
-# NOTE: In a Space, we typically don't use DATA_DIR paths if the files are alongside app.py
 MODEL_FILE = "model_CAT.pt"
 VOCAB_FILE = "vocabs_CAT.pkl"
@@ -43,24 +1017,24 @@ BBOX_NORM_CONSTANT = 1000.0
 INFERENCE_CHUNK_SIZE = 256
 # ========== LABELS (Must match Training Script) ==========
-LABELS = ["O", "B-QUESTION", "I-QUESTION", "B-OPTION", "I-OPTION", "B-ANSWER", "I-ANSWER", "B-IMAGE", "I-IMAGE"]
 LABEL2IDX = {l: i for i, l in enumerate(LABELS)}
 IDX2LABEL = {i: l for i, l in enumerate(LABELS)}
 # =========================================================
-# 1. Vocab, CharCNNEncoder, and MCQTagger Classes (Copied from your script)
 # =========================================================
 class Vocab:
-    # ... (Your Vocab class implementation)
     def __init__(self, min_freq=1, unk_token="<UNK>", pad_token="<PAD>"):
         self.min_freq = min_freq
         self.unk_token = unk_token
         self.pad_token = pad_token
         self.freq = Counter()
-        self.itos = []  # Index to String
-        self.stoi = {}  # String to Index
     def add_sentence(self, toks):
         self.freq.update(toks)
@@ -75,7 +1049,6 @@ class Vocab:
         return len(self.itos)
     def __getitem__(self, token: str) -> int:
-        """Allows lookup using word_vocab[token]. Returns UNK index if token is not found."""
         return self.stoi.get(token, self.stoi[self.unk_token])
     def __getstate__(self):
@@ -97,18 +1070,14 @@ class Vocab:
 def load_vocabs(path: str) -> Tuple[Vocab, Vocab]:
-    """Loads word and character vocabularies from a pickle file and verifies size."""
     try:
         absolute_path = os.path.abspath(path)
-        if not os.path.exists(absolute_path):
-            raise FileNotFoundError(f"Vocab file NOT FOUND at: {absolute_path}")
         with open(absolute_path, "rb") as f:
             word_vocab, char_vocab = pickle.load(f)
         if len(word_vocab) <= 2:
-            raise IndexError("CRITICAL: Word vocabulary size is too small. Vocab file is invalid.")
         return word_vocab, char_vocab
-    except FileNotFoundError:
-        raise FileNotFoundError(f"Vocab file not found at {path}. Please run the training script first.")
     except Exception as e:
         raise RuntimeError(f"Error loading vocabs from {path}: {e}")
@@ -152,6 +1121,7 @@ class MCQTagger(nn.Module):
         if lengths.max().item() == 0:
             B, L = enc_in.size(0), enc_in.size(1)
             return torch.zeros((B, L, len(LABELS)), device=enc_in.device)
         packed_in = nn.utils.rnn.pack_padded_sequence(enc_in, lengths, batch_first=True, enforce_sorted=False)
@@ -162,60 +1132,42 @@ class MCQTagger(nn.Module):
     def forward(self, words, chars, bboxes, mask, labels=None, class_weights=None, alpha=0.7):
         emissions = self.forward_emissions(words, chars, bboxes, mask)
-        # We only decode for inference, not calculate loss
         return self.crf.viterbi_decode(emissions, mask=mask)
 # =========================================================
-# 2. PDF Processing Functions (Copied from your script)
 # =========================================================
 def ocr_fallback_page(page: fitz.Page, page_width: float, page_height: float) -> List[Dict[str, Any]]:
-    # ... (Your ocr_fallback_page implementation)
-    """
-    Renders a PyMuPDF page, runs Tesseract OCR, and tokenizes the result.
-    """
     try:
-        # Render page at high resolution (300 DPI equivalent)
         pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
-        if pix.n - pix.alpha > 3:  # Handle CMYK
             pix = fitz.Pixmap(fitz.csRGB, pix)
         img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-        # Preprocessing for Tesseract (as was in the original code)
         img_pil = img_pil.convert('L')
         img_pil = ImageEnhance.Contrast(img_pil).enhance(2.0)
         img_pil = ImageEnhance.Sharpness(img_pil).enhance(2.0)
-        # Run Tesseract
         ocr_data = pytesseract.image_to_data(img_pil, output_type=pytesseract.Output.DICT)
         ocr_tokens = []
         for i in range(len(ocr_data['text'])):
             word = ocr_data['text'][i]
             conf = ocr_data['conf'][i]
-            conf = ocr_data['conf'][i]
-            # Use only words with reasonable confidence
             if word.strip() and int(conf) > 50:
-                # Get Tesseract's raw pixel bounding box
-                left = ocr_data['left'][i]
-                top = ocr_data['top'][i]
-                width = ocr_data['width'][i]
-                height = ocr_data['height'][i]
-                # Convert pixel bbox back to original PDF coordinate system
                 scale = page_width / pix.width
                 raw_bbox = [
-                    left * scale,
-                    top * scale,
-                    (left + width) * scale,
-                    (top + height) * scale
                 ]
-                # Normalize bbox
                 normalized_bbox = [
                     (raw_bbox[0] / page_width) * BBOX_NORM_CONSTANT,
                     (raw_bbox[1] / page_height) * BBOX_NORM_CONSTANT,
@@ -232,17 +1184,12 @@ def ocr_fallback_page(page: fitz.Page, page_width: float, page_height: float) ->
         return ocr_tokens
     except Exception as e:
-        # Note: 'page.number' might not be available if not running in a loop context
         print(f"OCR fallback failed: {e}")
         return []
 def extract_tokens_from_pdf_fitz_with_ocr(pdf_path: str) -> List[Dict[str, Any]]:
-    # ... (Your extract_tokens_from_pdf_fitz_with_ocr implementation)
-    """
-    Extracts words and their raw bounding boxes using PyMuPDF (fitz) text layer
-    and falls back to OCR if no text is found.
-    """
     all_tokens = []
     try:
         doc = fitz.open(pdf_path)
@@ -251,8 +1198,7 @@ def extract_tokens_from_pdf_fitz_with_ocr(pdf_path: str) -> List[Dict[str, Any]]
             page_width, page_height = page.rect.width, page.rect.height
             page_tokens = []
-            # 1. Primary Extraction: Use PyMuPDF's word structure (fitz.Page.get_text("words"))
-            # word_list format: (x0, y0, x1, y1, word, ...)
             word_list = page.get_text("words", sort=True)
             if word_list:
@@ -260,7 +1206,6 @@ def extract_tokens_from_pdf_fitz_with_ocr(pdf_path: str) -> List[Dict[str, Any]]
                     word = word_data[4]
                     raw_bbox = word_data[:4]
-                    # Normalize bboxes
                     normalized_bbox = [
                         (raw_bbox[0] / page_width) * BBOX_NORM_CONSTANT,
                         (raw_bbox[1] / page_height) * BBOX_NORM_CONSTANT,
@@ -293,10 +1238,7 @@ extract_tokens_from_pdf = extract_tokens_from_pdf_fitz_with_ocr
 def preprocess_and_collate_tokens(all_tokens: List[Dict[str, Any]], word_vocab: Vocab, char_vocab: Vocab,
                                   chunk_size: int) -> List[Dict[str, Any]]:
-    # ... (Your preprocess_and_collate_tokens implementation)
-    """
-    Chunks the token list, converts to IDs, and prepares batches for inference. (Unchanged)
-    """
     all_batches = []
     for i in range(0, len(all_tokens), chunk_size):
@@ -330,18 +1272,21 @@ def preprocess_and_collate_tokens(all_tokens: List[Dict[str, Any]], word_vocab:
             "chars": char_pad,
             "bboxes": bbox_pad,
             "mask": mask,
-            "original_tokens": chunk  # Keep the original data for output formatting
         })
     return all_batches
 # =========================================================
-# 3. Model Loading and Caching (Crucial for Gradio performance)
 # =========================================================
-# Cache the model and vocabs globally so they are loaded only ONCE when the app starts.
-# This avoids reloading the model on every user request, which is vital for speed.
 try:
     WORD_VOCAB, CHAR_VOCAB = load_vocabs(VOCAB_FILE)
     MODEL = MCQTagger(len(WORD_VOCAB), len(CHAR_VOCAB), len(LABELS)).to(DEVICE)
@@ -349,89 +1294,232 @@ try:
     MODEL.eval()
     print("✅ Model and Vocabs loaded successfully (Cached).")
 except Exception as e:
-    MODEL = None
     print(f"❌ Initial Model/Vocab Load Failure: {e}")
-    print("The Gradio demo will not function until model_CAT.pt and vocabs_CAT.pkl are in the root directory.")
 # =========================================================
-# 4. The Gradio Inference Wrapper Function
 # =========================================================
-def gradio_inference_wrapper(pdf_file: str) -> Tuple[str, List[Dict[str, Any]]]:
     """
-    Wraps the entire inference pipeline for the Gradio Interface.
-    Args:
-        pdf_file: The path to the temporary PDF file uploaded by the user (a string).
-    Returns:
-        A tuple of (str, List[Dict[str, Any]]): A status message and the raw predictions.
     """
     if MODEL is None:
         return "❌ ERROR: Model failed to load on startup. Check 'model_CAT.pt' and 'vocabs_CAT.pkl'.", []
     pdf_path = pdf_file
     try:
-        # 1. Extract Tokens
         all_tokens = extract_tokens_from_pdf(pdf_path)
-    except RuntimeError as e:
-        return f"❌ PDF Processing Error: {e}", []
-    if not all_tokens:
-        return "❌ ERROR: No tokens were extracted from the PDF, even after OCR fallback.", []
-    # 2. Preprocess and Batch
-    batches = preprocess_and_collate_tokens(all_tokens, WORD_VOCAB, CHAR_VOCAB, chunk_size=INFERENCE_CHUNK_SIZE)
-    # 3. Run Inference
-    all_predictions = []
-    with torch.no_grad():
-        for batch in batches:
-            words, chars, bboxes, mask = (batch[k] for k in ["words", "chars", "bboxes", "mask"])
-            preds_batch = MODEL(words, chars, bboxes, mask)
-            predictions = preds_batch[0]
-            original_tokens = batch["original_tokens"]
-            for token_data, pred_idx in zip(original_tokens, predictions):
-                all_predictions.append({
-                    "word": token_data["word"],
-                    "bbox": token_data["raw_bbox"],
-                    "predicted_label": IDX2LABEL[pred_idx]
-                })
-    status_message = f"✅ Inference complete. Total tokens predicted: {len(all_predictions)}"
-    # Gradio will display the JSON output prettified
-    return status_message, all_predictions
 # =========================================================
-# 5. Define and Launch the Gradio Interface
 # =========================================================
 if __name__ == "__main__":
-    title = "MCQ Document Structure Tagger (Bi-LSTM-CRF)"
-    description = "Upload a PDF document (e.g., an MCQ paper). The model will tokenize the text, run inference to predict BIO-tags (B-QUESTION, I-OPTION, B-ANSWER, etc.) for each word, and return the raw JSON predictions."
-    # Define the Gradio Interface
     demo = gr.Interface(
         fn=gradio_inference_wrapper,
-        # inputs=gr.File(label="Upload PDF Document", file_types=['.pdf'], type='filepath'),
-        inputs=gr.File(label="Upload PDF Document"),
         outputs=[
             gr.Textbox(label="Status Message", interactive=False),
-            gr.JSON(label="Raw BIO Tagging Predictions (JSON)", show_label=True)
         ],
         title=title,
         description=description,
         allow_flagging="never",
-        # Set a reasonable concurrency limit (number of simultaneous users) for a CPU/small GPU Space
         concurrency_limit=2
     )
-    # Launch the demo (Hugging Face Spaces automatically calls launch() internally)
-    demo.launch()

+# import os
+# import json
+# import pickle
+# from typing import List, Dict, Any, Tuple
+# from collections import Counter
+# import torch
+# import torch.nn as nn
+# import torch.nn.functional as F
+# from tqdm import tqdm
+#
+# # === GRADIO AND DEPENDENCIES ===
+# import gradio as gr
+# import fitz  # PyMuPDF
+# import re
+# from PIL import Image, ImageEnhance
+# import pytesseract
+#
+# try:
+#     from TorchCRF import CRF
+# except ImportError:
+#     # This should be handled in requirements.txt for the Space
+#     print("CRF module not found. Assuming deployment environment will install it.")
+#
+#
+#     class CRF:
+#         def __init__(self, *args, **kwargs): pass
+#
+#         def viterbi_decode(self, emissions, mask): return [list(torch.argmax(emissions[0], dim=-1).cpu().numpy())]
+#
+# # ========== CONFIG (Must match Training Script) ==========
+# # NOTE: In a Space, we typically don't use DATA_DIR paths if the files are alongside app.py
+# MODEL_FILE = "model_CAT.pt"
+# VOCAB_FILE = "vocabs_CAT.pkl"
+#
+# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# MAX_CHAR_LEN = 16
+# EMBED_DIM = 100
+# CHAR_EMBED_DIM = 30
+# CHAR_CNN_OUT = 30
+# BBOX_DIM = 100
+# HIDDEN_SIZE = 512
+# BBOX_NORM_CONSTANT = 1000.0
+# INFERENCE_CHUNK_SIZE = 256
+#
+# # ========== LABELS (Must match Training Script) ==========
+# LABELS = ["O", "B-QUESTION", "I-QUESTION", "B-OPTION", "I-OPTION", "B-ANSWER", "I-ANSWER", "B-IMAGE", "I-IMAGE"]
+# LABEL2IDX = {l: i for i, l in enumerate(LABELS)}
+# IDX2LABEL = {i: l for i, l in enumerate(LABELS)}
+#
+#
+# # =========================================================
+# # 1. Vocab, CharCNNEncoder, and MCQTagger Classes (Copied from your script)
+# # =========================================================
+#
+# class Vocab:
+#     # ... (Your Vocab class implementation)
+#     def __init__(self, min_freq=1, unk_token="<UNK>", pad_token="<PAD>"):
+#         self.min_freq = min_freq
+#         self.unk_token = unk_token
+#         self.pad_token = pad_token
+#         self.freq = Counter()
+#         self.itos = []  # Index to String
+#         self.stoi = {}  # String to Index
+#
+#     def add_sentence(self, toks):
+#         self.freq.update(toks)
+#
+#     def build(self):
+#         items = [tok for tok, c in self.freq.items() if c >= self.min_freq]
+#         items = [self.pad_token, self.unk_token] + sorted(items)
+#         self.itos = items
+#         self.stoi = {s: i for i, s in enumerate(self.itos)}
+#
+#     def __len__(self):
+#         return len(self.itos)
+#
+#     def __getitem__(self, token: str) -> int:
+#         """Allows lookup using word_vocab[token]. Returns UNK index if token is not found."""
+#         return self.stoi.get(token, self.stoi[self.unk_token])
+#
+#     def __getstate__(self):
+#         return {
+#             'min_freq': self.min_freq,
+#             'unk_token': self.unk_token,
+#             'pad_token': self.pad_token,
+#             'itos': self.itos,
+#             'stoi': self.stoi,
+#         }
+#
+#     def __setstate__(self, state):
+#         self.min_freq = state['min_freq']
+#         self.unk_token = state['unk_token']
+#         self.pad_token = state['pad_token']
+#         self.itos = state['itos']
+#         self.stoi = state['stoi']
+#         self.freq = Counter()
+#
+#
+# def load_vocabs(path: str) -> Tuple[Vocab, Vocab]:
+#     """Loads word and character vocabularies from a pickle file and verifies size."""
+#     try:
+#         absolute_path = os.path.abspath(path)
+#         if not os.path.exists(absolute_path):
+#             raise FileNotFoundError(f"Vocab file NOT FOUND at: {absolute_path}")
+#         with open(absolute_path, "rb") as f:
+#             word_vocab, char_vocab = pickle.load(f)
+#         if len(word_vocab) <= 2:
+#             raise IndexError("CRITICAL: Word vocabulary size is too small. Vocab file is invalid.")
+#         return word_vocab, char_vocab
+#     except FileNotFoundError:
+#         raise FileNotFoundError(f"Vocab file not found at {path}. Please run the training script first.")
+#     except Exception as e:
+#         raise RuntimeError(f"Error loading vocabs from {path}: {e}")
+#
+#
+# class CharCNNEncoder(nn.Module):
+#     def __init__(self, char_vocab_size, char_emb_dim, out_dim, kernel_sizes=(3, 4, 5)):
+#         super().__init__()
+#         self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0)
+#         convs = [nn.Conv1d(char_emb_dim, out_dim, kernel_size=k) for k in kernel_sizes]
+#         self.convs = nn.ModuleList(convs)
+#         self.out_dim = out_dim * len(convs)
+#
+#     def forward(self, char_ids):
+#         B, L, C = char_ids.size()
+#         emb = self.char_emb(char_ids.view(B * L, C)).transpose(1, 2)
+#         outs = [torch.max(torch.relu(conv(emb)), dim=2)[0] for conv in self.convs]
+#         res = torch.cat(outs, dim=1)
+#         return res.view(B, L, -1)
+#
+#
+# class MCQTagger(nn.Module):
+#     def __init__(self, vocab_size, char_vocab_size, n_labels, bbox_dim=BBOX_DIM):
+#         super().__init__()
+#         self.word_emb = nn.Embedding(vocab_size, EMBED_DIM, padding_idx=0)
+#         self.char_enc = CharCNNEncoder(char_vocab_size, CHAR_EMBED_DIM, CHAR_CNN_OUT)
+#         self.bbox_proj = nn.Linear(4, bbox_dim)
+#         in_dim = EMBED_DIM + self.char_enc.out_dim + bbox_dim
+#
+#         self.bilstm = nn.LSTM(in_dim, HIDDEN_SIZE // 2, num_layers=2, batch_first=True, bidirectional=True, dropout=0.3)
+#         self.ff = nn.Linear(HIDDEN_SIZE, n_labels)
+#         self.crf = CRF(n_labels)
+#         self.dropout = nn.Dropout(p=0.5)
+#
+#     def forward_emissions(self, words, chars, bboxes, mask):
+#         wemb = self.word_emb(words)
+#         cenc = self.char_enc(chars)
+#         benc = self.bbox_proj(bboxes)
+#         enc_in = torch.cat([wemb, cenc, benc], dim=-1)
+#         enc_in = self.dropout(enc_in)
+#         lengths = mask.sum(dim=1).cpu()
+#
+#         if lengths.max().item() == 0:
+#             B, L = enc_in.size(0), enc_in.size(1)
+#             return torch.zeros((B, L, len(LABELS)), device=enc_in.device)
+#
+#         packed_in = nn.utils.rnn.pack_padded_sequence(enc_in, lengths, batch_first=True, enforce_sorted=False)
+#         packed_out, _ = self.bilstm(packed_in)
+#         padded_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
+#
+#         return self.ff(padded_out)
+#
+#     def forward(self, words, chars, bboxes, mask, labels=None, class_weights=None, alpha=0.7):
+#         emissions = self.forward_emissions(words, chars, bboxes, mask)
+#         # We only decode for inference, not calculate loss
+#         return self.crf.viterbi_decode(emissions, mask=mask)
+#
+#
+# # =========================================================
+# # 2. PDF Processing Functions (Copied from your script)
+# # =========================================================
+#
+# def ocr_fallback_page(page: fitz.Page, page_width: float, page_height: float) -> List[Dict[str, Any]]:
+#     # ... (Your ocr_fallback_page implementation)
+#     """
+#     Renders a PyMuPDF page, runs Tesseract OCR, and tokenizes the result.
+#     """
+#     try:
+#         # Render page at high resolution (300 DPI equivalent)
+#         pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
+#         if pix.n - pix.alpha > 3:  # Handle CMYK
+#             pix = fitz.Pixmap(fitz.csRGB, pix)
+#
+#         img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+#
+#         # Preprocessing for Tesseract (as was in the original code)
+#         img_pil = img_pil.convert('L')
+#         img_pil = ImageEnhance.Contrast(img_pil).enhance(2.0)
+#         img_pil = ImageEnhance.Sharpness(img_pil).enhance(2.0)
+#
+#         # Run Tesseract
+#         ocr_data = pytesseract.image_to_data(img_pil, output_type=pytesseract.Output.DICT)
+#
+#         ocr_tokens = []
+#         for i in range(len(ocr_data['text'])):
+#             word = ocr_data['text'][i]
+#             conf = ocr_data['conf'][i]
+#             conf = ocr_data['conf'][i]
+#
+#             # Use only words with reasonable confidence
+#             if word.strip() and int(conf) > 50:
+#                 # Get Tesseract's raw pixel bounding box
+#                 left = ocr_data['left'][i]
+#                 top = ocr_data['top'][i]
+#                 width = ocr_data['width'][i]
+#                 height = ocr_data['height'][i]
+#
+#                 # Convert pixel bbox back to original PDF coordinate system
+#                 scale = page_width / pix.width
+#
+#                 raw_bbox = [
+#                     left * scale,
+#                     top * scale,
+#                     (left + width) * scale,
+#                     (top + height) * scale
+#                 ]
+#
+#                 # Normalize bbox
+#                 normalized_bbox = [
+#                     (raw_bbox[0] / page_width) * BBOX_NORM_CONSTANT,
+#                     (raw_bbox[1] / page_height) * BBOX_NORM_CONSTANT,
+#                     (raw_bbox[2] / page_width) * BBOX_NORM_CONSTANT,
+#                     (raw_bbox[3] / page_height) * BBOX_NORM_CONSTANT
+#                 ]
+#
+#                 ocr_tokens.append({
+#                     "word": word,
+#                     "raw_bbox": [int(b) for b in raw_bbox],
+#                     "normalized_bbox": [int(b) for b in normalized_bbox]
+#                 })
+#
+#         return ocr_tokens
+#
+#     except Exception as e:
+#         # Note: 'page.number' might not be available if not running in a loop context
+#         print(f"OCR fallback failed: {e}")
+#         return []
+#
+#
+# def extract_tokens_from_pdf_fitz_with_ocr(pdf_path: str) -> List[Dict[str, Any]]:
+#     # ... (Your extract_tokens_from_pdf_fitz_with_ocr implementation)
+#     """
+#     Extracts words and their raw bounding boxes using PyMuPDF (fitz) text layer
+#     and falls back to OCR if no text is found.
+#     """
+#     all_tokens = []
+#     try:
+#         doc = fitz.open(pdf_path)
+#         for page_num in tqdm(range(len(doc)), desc="PDF Page Processing"):
+#             page = doc.load_page(page_num)
+#             page_width, page_height = page.rect.width, page.rect.height
+#             page_tokens = []
+#
+#             # 1. Primary Extraction: Use PyMuPDF's word structure (fitz.Page.get_text("words"))
+#             # word_list format: (x0, y0, x1, y1, word, ...)
+#             word_list = page.get_text("words", sort=True)
+#
+#             if word_list:
+#                 for word_data in word_list:
+#                     word = word_data[4]
+#                     raw_bbox = word_data[:4]
+#
+#                     # Normalize bboxes
+#                     normalized_bbox = [
+#                         (raw_bbox[0] / page_width) * BBOX_NORM_CONSTANT,
+#                         (raw_bbox[1] / page_height) * BBOX_NORM_CONSTANT,
+#                         (raw_bbox[2] / page_width) * BBOX_NORM_CONSTANT,
+#                         (raw_bbox[3] / page_height) * BBOX_NORM_CONSTANT
+#                     ]
+#
+#                     page_tokens.append({
+#                         "word": word,
+#                         "raw_bbox": [int(b) for b in raw_bbox],
+#                         "normalized_bbox": [int(b) for b in normalized_bbox]
+#                     })
+#
+#             # 2. OCR Fallback
+#             if not page_tokens:
+#                 print(f" (Page {page_num + 1}) No text layer found. Running OCR...")
+#                 page_tokens = ocr_fallback_page(page, page_width, page_height)
+#
+#             all_tokens.extend(page_tokens)
+#
+#         doc.close()
+#     except Exception as e:
+#         raise RuntimeError(f"Error opening or processing PDF with fitz/OCR: {e}")
+#
+#     return all_tokens
+#
+#
+# extract_tokens_from_pdf = extract_tokens_from_pdf_fitz_with_ocr
+#
+#
+# def preprocess_and_collate_tokens(all_tokens: List[Dict[str, Any]], word_vocab: Vocab, char_vocab: Vocab,
+#                                   chunk_size: int) -> List[Dict[str, Any]]:
+#     # ... (Your preprocess_and_collate_tokens implementation)
+#     """
+#     Chunks the token list, converts to IDs, and prepares batches for inference. (Unchanged)
+#     """
+#     all_batches = []
+#
+#     for i in range(0, len(all_tokens), chunk_size):
+#         chunk = all_tokens[i:i + chunk_size]
+#         if not chunk: continue
+#
+#         words = [t["word"] for t in chunk]
+#         bboxes_norm = [t["normalized_bbox"] for t in chunk]
+#
+#         # Convert to IDs
+#         word_ids = [word_vocab[w] for w in words]
+#
+#         char_ids = []
+#         for w in words:
+#             chs = [char_vocab[ch] for ch in w[:MAX_CHAR_LEN]]
+#             if len(chs) < MAX_CHAR_LEN:
+#                 pad_index = char_vocab.stoi.get(char_vocab.pad_token, 0)
+#                 chs += [pad_index] * (MAX_CHAR_LEN - len(chs))
+#             char_ids.append(chs)
+#
+#         # Create padded tensors (using single-sample batches)
+#         word_pad = torch.LongTensor([word_ids]).to(DEVICE)
+#         char_pad = torch.LongTensor([char_ids]).to(DEVICE)
+#
+#         # Final normalization to [0, 1] range before feeding to the model
+#         bbox_pad = torch.FloatTensor([bboxes_norm]).to(DEVICE) / BBOX_NORM_CONSTANT
+#         mask = torch.ones(word_pad.size(), dtype=torch.bool).to(DEVICE)
+#
+#         all_batches.append({
+#             "words": word_pad,
+#             "chars": char_pad,
+#             "bboxes": bbox_pad,
+#             "mask": mask,
+#             "original_tokens": chunk  # Keep the original data for output formatting
+#         })
+#
+#     return all_batches
+#
+#
+# # =========================================================
+# # 3. Model Loading and Caching (Crucial for Gradio performance)
+# # =========================================================
+#
+# # Cache the model and vocabs globally so they are loaded only ONCE when the app starts.
+# # This avoids reloading the model on every user request, which is vital for speed.
+# try:
+#     WORD_VOCAB, CHAR_VOCAB = load_vocabs(VOCAB_FILE)
+#     MODEL = MCQTagger(len(WORD_VOCAB), len(CHAR_VOCAB), len(LABELS)).to(DEVICE)
+#     MODEL.load_state_dict(torch.load(MODEL_FILE, map_location=DEVICE))
+#     MODEL.eval()
+#     print("✅ Model and Vocabs loaded successfully (Cached).")
+# except Exception as e:
+#     MODEL = None
+#     print(f"❌ Initial Model/Vocab Load Failure: {e}")
+#     print("The Gradio demo will not function until model_CAT.pt and vocabs_CAT.pkl are in the root directory.")
+#
+#
+# # =========================================================
+# # 4. The Gradio Inference Wrapper Function
+# # =========================================================
+#
+# def gradio_inference_wrapper(pdf_file: str) -> Tuple[str, List[Dict[str, Any]]]:
+#     """
+#     Wraps the entire inference pipeline for the Gradio Interface.
+#
+#     Args:
+#         pdf_file: The path to the temporary PDF file uploaded by the user (a string).
+#
+#     Returns:
+#         A tuple of (str, List[Dict[str, Any]]): A status message and the raw predictions.
+#     """
+#     if MODEL is None:
+#         return "❌ ERROR: Model failed to load on startup. Check 'model_CAT.pt' and 'vocabs_CAT.pkl'.", []
+#
+#     pdf_path = pdf_file
+#
+#     try:
+#         # 1. Extract Tokens
+#         all_tokens = extract_tokens_from_pdf(pdf_path)
+#     except RuntimeError as e:
+#         return f"❌ PDF Processing Error: {e}", []
+#
+#     if not all_tokens:
+#         return "❌ ERROR: No tokens were extracted from the PDF, even after OCR fallback.", []
+#
+#     # 2. Preprocess and Batch
+#     batches = preprocess_and_collate_tokens(all_tokens, WORD_VOCAB, CHAR_VOCAB, chunk_size=INFERENCE_CHUNK_SIZE)
+#
+#     # 3. Run Inference
+#     all_predictions = []
+#     with torch.no_grad():
+#         for batch in batches:
+#             words, chars, bboxes, mask = (batch[k] for k in ["words", "chars", "bboxes", "mask"])
+#
+#             preds_batch = MODEL(words, chars, bboxes, mask)
+#             predictions = preds_batch[0]
+#
+#             original_tokens = batch["original_tokens"]
+#
+#             for token_data, pred_idx in zip(original_tokens, predictions):
+#                 all_predictions.append({
+#                     "word": token_data["word"],
+#                     "bbox": token_data["raw_bbox"],
+#                     "predicted_label": IDX2LABEL[pred_idx]
+#                 })
+#
+#     status_message = f"✅ Inference complete. Total tokens predicted: {len(all_predictions)}"
+#
+#     # Gradio will display the JSON output prettified
+#     return status_message, all_predictions
+#
+#
+# # =========================================================
+# # 5. Define and Launch the Gradio Interface
+# # =========================================================
+#
+# if __name__ == "__main__":
+#     title = "MCQ Document Structure Tagger (Bi-LSTM-CRF)"
+#     description = "Upload a PDF document (e.g., an MCQ paper). The model will tokenize the text, run inference to predict BIO-tags (B-QUESTION, I-OPTION, B-ANSWER, etc.) for each word, and return the raw JSON predictions."
+#
+#     # Define the Gradio Interface
+#     demo = gr.Interface(
+#         fn=gradio_inference_wrapper,
+#         # inputs=gr.File(label="Upload PDF Document", file_types=['.pdf'], type='filepath'),
+#         inputs=gr.File(label="Upload PDF Document"),
+#         outputs=[
+#             gr.Textbox(label="Status Message", interactive=False),
+#             gr.JSON(label="Raw BIO Tagging Predictions (JSON)", show_label=True)
+#         ],
+#         title=title,
+#         description=description,
+#         allow_flagging="never",
+#         # Set a reasonable concurrency limit (number of simultaneous users) for a CPU/small GPU Space
+#         concurrency_limit=2
+#     )
+#
+#     # Launch the demo (Hugging Face Spaces automatically calls launch() internally)
+#     demo.launch()
+#
+# import os
+# import json
+# import pickle
+# from typing import List, Dict, Any, Tuple
+# from collections import Counter
+# import torch
+# import torch.nn as nn
+# import torch.nn.functional as F
+# import re
+# from tqdm import tqdm
+#
+# # === GRADIO AND DEPENDENCIES ===
+# import gradio as gr
+# import fitz  # PyMuPDF
+# from PIL import Image, ImageEnhance
+# import pytesseract
+#
+# try:
+#     from TorchCRF import CRF
+# except ImportError:
+#     # Placeholder for environments where it's not yet installed
+#     class CRF:
+#         def __init__(self, *args, **kwargs): pass
+#
+#         def viterbi_decode(self, emissions, mask): return [list(torch.argmax(emissions[0], dim=-1).cpu().numpy())]
+#
+# # ========== CONFIG (Must match Training Script) ==========
+# MODEL_FILE = "model_CAT.pt"
+# VOCAB_FILE = "vocabs_CAT.pkl"
+#
+# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# MAX_CHAR_LEN = 16
+# EMBED_DIM = 100
+# CHAR_EMBED_DIM = 30
+# CHAR_CNN_OUT = 30
+# BBOX_DIM = 100
+# HIDDEN_SIZE = 512
+# BBOX_NORM_CONSTANT = 1000.0
+# INFERENCE_CHUNK_SIZE = 256
+#
+# # ========== LABELS (Must match Training Script) ==========
+# # NOTE: Added B/I-PASSAGE for the new structuring function
+# LABELS = ["O", "B-QUESTION", "I-QUESTION", "B-OPTION", "I-OPTION", "B-ANSWER", "I-ANSWER", "B-IMAGE", "I-IMAGE",
+#           "B-PASSAGE", "I-PASSAGE"]
+# LABEL2IDX = {l: i for i, l in enumerate(LABELS)}
+# IDX2LABEL = {i: l for i, l in enumerate(LABELS)}
+#
+#
+# # =========================================================
+# # 1. Core Classes (Vocab, CharCNNEncoder, MCQTagger)
+# # (Your classes are retained here)
+# # =========================================================
+#
+# class Vocab:
+#     def __init__(self, min_freq=1, unk_token="<UNK>", pad_token="<PAD>"):
+#         self.min_freq = min_freq
+#         self.unk_token = unk_token
+#         self.pad_token = pad_token
+#         self.freq = Counter()
+#         self.itos = []
+#         self.stoi = {}
+#
+#     def add_sentence(self, toks):
+#         self.freq.update(toks)
+#
+#     def build(self):
+#         items = [tok for tok, c in self.freq.items() if c >= self.min_freq]
+#         items = [self.pad_token, self.unk_token] + sorted(items)
+#         self.itos = items
+#         self.stoi = {s: i for i, s in enumerate(self.itos)}
+#
+#     def __len__(self):
+#         return len(self.itos)
+#
+#     def __getitem__(self, token: str) -> int:
+#         return self.stoi.get(token, self.stoi[self.unk_token])
+#
+#     def __getstate__(self):
+#         return {
+#             'min_freq': self.min_freq,
+#             'unk_token': self.unk_token,
+#             'pad_token': self.pad_token,
+#             'itos': self.itos,
+#             'stoi': self.stoi,
+#         }
+#
+#     def __setstate__(self, state):
+#         self.min_freq = state['min_freq']
+#         self.unk_token = state['unk_token']
+#         self.pad_token = state['pad_token']
+#         self.itos = state['itos']
+#         self.stoi = state['stoi']
+#         self.freq = Counter()
+#
+#
+# def load_vocabs(path: str) -> Tuple[Vocab, Vocab]:
+#     """Loads word and character vocabularies."""
+#     try:
+#         absolute_path = os.path.abspath(path)
+#         with open(absolute_path, "rb") as f:
+#             word_vocab, char_vocab = pickle.load(f)
+#         if len(word_vocab) <= 2:
+#             raise IndexError("CRITICAL: Word vocabulary size is too small.")
+#         return word_vocab, char_vocab
+#     except Exception as e:
+#         raise RuntimeError(f"Error loading vocabs from {path}: {e}")
+#
+#
+# class CharCNNEncoder(nn.Module):
+#     def __init__(self, char_vocab_size, char_emb_dim, out_dim, kernel_sizes=(3, 4, 5)):
+#         super().__init__()
+#         self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0)
+#         convs = [nn.Conv1d(char_emb_dim, out_dim, kernel_size=k) for k in kernel_sizes]
+#         self.convs = nn.ModuleList(convs)
+#         self.out_dim = out_dim * len(convs)
+#
+#     def forward(self, char_ids):
+#         B, L, C = char_ids.size()
+#         emb = self.char_emb(char_ids.view(B * L, C)).transpose(1, 2)
+#         outs = [torch.max(torch.relu(conv(emb)), dim=2)[0] for conv in self.convs]
+#         res = torch.cat(outs, dim=1)
+#         return res.view(B, L, -1)
+#
+#
+# class MCQTagger(nn.Module):
+#     def __init__(self, vocab_size, char_vocab_size, n_labels, bbox_dim=BBOX_DIM):
+#         super().__init__()
+#         self.word_emb = nn.Embedding(vocab_size, EMBED_DIM, padding_idx=0)
+#         self.char_enc = CharCNNEncoder(char_vocab_size, CHAR_EMBED_DIM, CHAR_CNN_OUT)
+#         self.bbox_proj = nn.Linear(4, bbox_dim)
+#         in_dim = EMBED_DIM + self.char_enc.out_dim + bbox_dim
+#
+#         self.bilstm = nn.LSTM(in_dim, HIDDEN_SIZE // 2, num_layers=2, batch_first=True, bidirectional=True, dropout=0.3)
+#         self.ff = nn.Linear(HIDDEN_SIZE, n_labels)
+#         self.crf = CRF(n_labels)
+#         self.dropout = nn.Dropout(p=0.5)
+#
+#     def forward_emissions(self, words, chars, bboxes, mask):
+#         wemb = self.word_emb(words)
+#         cenc = self.char_enc(chars)
+#         benc = self.bbox_proj(bboxes)
+#         enc_in = torch.cat([wemb, cenc, benc], dim=-1)
+#         enc_in = self.dropout(enc_in)
+#         lengths = mask.sum(dim=1).cpu()
+#
+#         if lengths.max().item() == 0:
+#             B, L = enc_in.size(0), enc_in.size(1)
+#             return torch.zeros((B, L, len(LABELS)), device=enc_in.device)
+#
+#         packed_in = nn.utils.rnn.pack_padded_sequence(enc_in, lengths, batch_first=True, enforce_sorted=False)
+#         packed_out, _ = self.bilstm(packed_in)
+#         padded_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
+#
+#         return self.ff(padded_out)
+#
+#     def forward(self, words, chars, bboxes, mask, labels=None, class_weights=None, alpha=0.7):
+#         emissions = self.forward_emissions(words, chars, bboxes, mask)
+#         return self.crf.viterbi_decode(emissions, mask=mask)
+#
+#
+# # =========================================================
+# # 2. PDF Processing Functions
+# # (Your PDF functions are retained here)
+# # =========================================================
+#
+# def ocr_fallback_page(page: fitz.Page, page_width: float, page_height: float) -> List[Dict[str, Any]]:
+#     """Renders a PyMuPDF page, runs Tesseract OCR, and tokenizes the result."""
+#     try:
+#         pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
+#         if pix.n - pix.alpha > 3:
+#             pix = fitz.Pixmap(fitz.csRGB, pix)
+#
+#         img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+#
+#         # Preprocessing
+#         img_pil = img_pil.convert('L')
+#         img_pil = ImageEnhance.Contrast(img_pil).enhance(2.0)
+#         img_pil = ImageEnhance.Sharpness(img_pil).enhance(2.0)
+#
+#         ocr_data = pytesseract.image_to_data(img_pil, output_type=pytesseract.Output.DICT)
+#
+#         ocr_tokens = []
+#         for i in range(len(ocr_data['text'])):
+#             word = ocr_data['text'][i]
+#             conf = ocr_data['conf'][i]
+#
+#             if word.strip() and int(conf) > 50:
+#                 left, top, width, height = (ocr_data[k][i] for k in ['left', 'top', 'width', 'height'])
+#                 scale = page_width / pix.width
+#
+#                 raw_bbox = [
+#                     left * scale, top * scale, (left + width) * scale, (top + height) * scale
+#                 ]
+#
+#                 normalized_bbox = [
+#                     (raw_bbox[0] / page_width) * BBOX_NORM_CONSTANT,
+#                     (raw_bbox[1] / page_height) * BBOX_NORM_CONSTANT,
+#                     (raw_bbox[2] / page_width) * BBOX_NORM_CONSTANT,
+#                     (raw_bbox[3] / page_height) * BBOX_NORM_CONSTANT
+#                 ]
+#
+#                 ocr_tokens.append({
+#                     "word": word,
+#                     "raw_bbox": [int(b) for b in raw_bbox],
+#                     "normalized_bbox": [int(b) for b in normalized_bbox]
+#                 })
+#
+#         return ocr_tokens
+#
+#     except Exception as e:
+#         print(f"OCR fallback failed: {e}")
+#         return []
+#
+#
+# def extract_tokens_from_pdf_fitz_with_ocr(pdf_path: str) -> List[Dict[str, Any]]:
+#     """Extracts words and bboxes using PyMuPDF text layer and falls back to OCR."""
+#     all_tokens = []
+#     try:
+#         doc = fitz.open(pdf_path)
+#         for page_num in tqdm(range(len(doc)), desc="PDF Page Processing"):
+#             page = doc.load_page(page_num)
+#             page_width, page_height = page.rect.width, page.rect.height
+#             page_tokens = []
+#
+#             # 1. Primary Extraction: PyMuPDF's word structure
+#             word_list = page.get_text("words", sort=True)
+#
+#             if word_list:
+#                 for word_data in word_list:
+#                     word = word_data[4]
+#                     raw_bbox = word_data[:4]
+#
+#                     normalized_bbox = [
+#                         (raw_bbox[0] / page_width) * BBOX_NORM_CONSTANT,
+#                         (raw_bbox[1] / page_height) * BBOX_NORM_CONSTANT,
+#                         (raw_bbox[2] / page_width) * BBOX_NORM_CONSTANT,
+#                         (raw_bbox[3] / page_height) * BBOX_NORM_CONSTANT
+#                     ]
+#
+#                     page_tokens.append({
+#                         "word": word,
+#                         "raw_bbox": [int(b) for b in raw_bbox],
+#                         "normalized_bbox": [int(b) for b in normalized_bbox]
+#                     })
+#
+#             # 2. OCR Fallback
+#             if not page_tokens:
+#                 print(f" (Page {page_num + 1}) No text layer found. Running OCR...")
+#                 page_tokens = ocr_fallback_page(page, page_width, page_height)
+#
+#             all_tokens.extend(page_tokens)
+#
+#         doc.close()
+#     except Exception as e:
+#         raise RuntimeError(f"Error opening or processing PDF with fitz/OCR: {e}")
+#
+#     return all_tokens
+#
+#
+# extract_tokens_from_pdf = extract_tokens_from_pdf_fitz_with_ocr
+#
+#
+# def preprocess_and_collate_tokens(all_tokens: List[Dict[str, Any]], word_vocab: Vocab, char_vocab: Vocab,
+#                                   chunk_size: int) -> List[Dict[str, Any]]:
+#     """Chunks the token list, converts to IDs, and prepares batches for inference."""
+#     all_batches = []
+#
+#     for i in range(0, len(all_tokens), chunk_size):
+#         chunk = all_tokens[i:i + chunk_size]
+#         if not chunk: continue
+#
+#         words = [t["word"] for t in chunk]
+#         bboxes_norm = [t["normalized_bbox"] for t in chunk]
+#
+#         # Convert to IDs
+#         word_ids = [word_vocab[w] for w in words]
+#
+#         char_ids = []
+#         for w in words:
+#             chs = [char_vocab[ch] for ch in w[:MAX_CHAR_LEN]]
+#             if len(chs) < MAX_CHAR_LEN:
+#                 pad_index = char_vocab.stoi.get(char_vocab.pad_token, 0)
+#                 chs += [pad_index] * (MAX_CHAR_LEN - len(chs))
+#             char_ids.append(chs)
+#
+#         # Create padded tensors (using single-sample batches)
+#         word_pad = torch.LongTensor([word_ids]).to(DEVICE)
+#         char_pad = torch.LongTensor([char_ids]).to(DEVICE)
+#
+#         # Final normalization to [0, 1] range before feeding to the model
+#         bbox_pad = torch.FloatTensor([bboxes_norm]).to(DEVICE) / BBOX_NORM_CONSTANT
+#         mask = torch.ones(word_pad.size(), dtype=torch.bool).to(DEVICE)
+#
+#         all_batches.append({
+#             "words": word_pad,
+#             "chars": char_pad,
+#             "bboxes": bbox_pad,
+#             "mask": mask,
+#             "original_tokens": chunk
+#         })
+#
+#     return all_batches
+#
+#
+# # =========================================================
+# # 3. Structuring Logic (Adapted from your second script)
+# # =========================================================
+#
+# def finalize_passage_to_item(item, passage_buffer):
+#     """Adds passage text to the current item and clears the buffer."""
+#     if passage_buffer:
+#         # Use a more careful cleaning, focusing on space reduction
+#         passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
+#         if item.get('passage'):
+#             item['passage'] += ' ' + passage_text
+#         else:
+#             item['passage'] = passage_text
+#     passage_buffer.clear()
+#     return item
+#
+#
+# def convert_bio_to_structured_json_strict(predictions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+#     """
+#     Converts a list of {word, predicted_label} tokens into structured MCQ JSON format.
+#     This function is adapted to work directly with the list of predictions (in-memory).
+#     """
+#     structured_data = []
+#     current_item = None
+#     current_option_key = None
+#     current_passage_buffer = []
+#     current_text_buffer = []
+#
+#     first_question_started = False
+#     last_entity_type = None
+#
+#     for item in predictions:
+#         word = item['word']
+#         label = item['predicted_label']
+#         entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
+#
+#         # Always append word to the total text buffer
+#         current_text_buffer.append(word)
+#
+#         is_passage_label = (label == 'B-PASSAGE' or label == 'I-PASSAGE')
+#
+#         # --- BEFORE FIRST QUESTION/METADATA HANDLING ---
+#         if not first_question_started and label != 'B-QUESTION' and not is_passage_label:
+#             continue
+#
+#         # --- PASSAGE HANDLING (Before question start) ---
+#         if not first_question_started and is_passage_label:
+#             if label == 'B-PASSAGE' or (label == 'I-PASSAGE' and last_entity_type == 'PASSAGE'):
+#                 current_passage_buffer.append(word)
+#                 last_entity_type = 'PASSAGE'
+#             continue
+#
+#         # --- NEW QUESTION START (B-QUESTION) ---
+#         if label == 'B-QUESTION':
+#
+#             # 1. Capture leading text/passage as METADATA (for the very first block)
+#             if not first_question_started:
+#                 header_text = ' '.join(current_text_buffer[:-1]).strip()
+#                 if header_text or current_passage_buffer:
+#                     metadata_item = {'type': 'METADATA'}
+#                     metadata_item = finalize_passage_to_item(metadata_item, current_passage_buffer)
+#                     if header_text:
+#                         metadata_item['text'] = header_text
+#                     structured_data.append(metadata_item)
+#
+#                 first_question_started = True
+#                 current_text_buffer = [word]
+#
+#             # 2. Save previous question block (for subsequent questions)
+#             elif current_item is not None:
+#                 current_item = finalize_passage_to_item(current_item, current_passage_buffer)
+#                 current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
+#                 structured_data.append(current_item)
+#                 current_text_buffer = [word]
+#
+#             # 3. Initialize new question
+#             current_item = {
+#                 'type': 'MCQ',  # Explicitly define the type for the final output
+#                 'question': word,
+#                 'options_text': {},
+#                 'answer': '',
+#                 'text': ''  # The raw text span of the item
+#             }
+#             current_option_key = None
+#             last_entity_type = 'QUESTION'
+#             continue
+#
+#         # --- IF INSIDE A QUESTION BLOCK ---
+#         if current_item is not None:
+#
+#             if label.startswith('B-'):
+#                 last_entity_type = entity_type
+#
+#                 if entity_type == 'PASSAGE':
+#                     finalize_passage_to_item(current_item, current_passage_buffer)
+#                     current_passage_buffer.append(word)
+#                 elif entity_type == 'OPTION':
+#                     current_option_key = word
+#                     current_item['options_text'][current_option_key] = word
+#                     current_passage_buffer = []
+#                 elif entity_type == 'ANSWER':
+#                     current_item['answer'] = word
+#                     current_option_key = None
+#                     current_passage_buffer = []
+#                 elif entity_type == 'QUESTION':
+#                     current_item['question'] += f' {word}'
+#                     current_passage_buffer = []
+#
+#             elif label.startswith('I-'):
+#                 if entity_type == 'QUESTION' and last_entity_type == 'QUESTION':
+#                     current_item['question'] += f' {word}'
+#                 elif entity_type == 'OPTION' and last_entity_type == 'OPTION' and current_option_key is not None:
+#                     current_item['options_text'][current_option_key] += f' {word}'
+#                 elif entity_type == 'ANSWER' and last_entity_type == 'ANSWER':
+#                     current_item['answer'] += f' {word}'
+#                 elif entity_type == 'PASSAGE' and last_entity_type == 'PASSAGE':
+#                     current_passage_buffer.append(word)
+#
+#             # O-tokens are ignored for entity building but collected in current_text_buffer.
+#             elif label == 'O':
+#                 pass
+#
+#     # --- Finalize last item ---
+#     if current_item is not None:
+#         current_item = finalize_passage_to_item(current_item, current_passage_buffer)
+#         current_item['text'] = re.sub(r'\s{2,}', ' ', ' '.join(current_text_buffer)).strip()
+#         structured_data.append(current_item)
+#     elif not structured_data and current_passage_buffer:
+#         # Case: Only passage/metadata was present in the whole document
+#         metadata_item = {'type': 'METADATA'}
+#         metadata_item = finalize_passage_to_item(metadata_item, current_passage_buffer)
+#         metadata_item['text'] = re.sub(r'\s{2,}', ' ', ' '.join(current_text_buffer)).strip()
+#         structured_data.append(metadata_item)
+#
+#     # --- FINAL CLEANUP ---
+#     for item in structured_data:
+#         # Final cleanup for all text fields
+#         item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
+#         if 'passage' in item:
+#             item['passage'] = re.sub(r'\s{2,}', ' ', item['passage']).strip()
+#             if not item['passage']:
+#                 del item['passage']
+#         if 'question' in item:
+#             item['question'] = re.sub(r'\s{2,}', ' ', item['question']).strip()
+#         if 'answer' in item:
+#             item['answer'] = re.sub(r'\s{2,}', ' ', item['answer']).strip()
+#         if 'options_text' in item:
+#             for k, v in item['options_text'].items():
+#                 item['options_text'][k] = re.sub(r'\s{2,}', ' ', v).strip()
+#
+#     return structured_data
+#
+#
+# # =========================================================
+# # 4. Updated Gradio Inference Wrapper Function
+# # =========================================================
+#
+# def gradio_inference_wrapper(pdf_file: str) -> Tuple[str, List[Dict[str, Any]]]:
+#     """
+#     Wraps the entire two-stage pipeline: (1) Tagging -> (2) Structuring.
+#     """
+#     if MODEL is None:
+#         return "❌ ERROR: Model failed to load on startup.", []
+#
+#     pdf_path = pdf_file
+#     raw_predictions = []
+#
+#     try:
+#         # 1. Stage 1: PDF Processing and BIO Tagging (Unchanged from before)
+#         all_tokens = extract_tokens_from_pdf(pdf_path)
+#
+#         if not all_tokens:
+#             return "❌ ERROR: No tokens were extracted from the PDF, even after OCR fallback.", []
+#
+#         batches = preprocess_and_collate_tokens(all_tokens, WORD_VOCAB, CHAR_VOCAB, chunk_size=INFERENCE_CHUNK_SIZE)
+#
+#         with torch.no_grad():
+#             for batch in batches:
+#                 words, chars, bboxes, mask = (batch[k] for k in ["words", "chars", "bboxes", "mask"])
+#                 preds_batch = MODEL(words, chars, bboxes, mask)
+#                 predictions = preds_batch[0]
+#                 original_tokens = batch["original_tokens"]
+#
+#                 for token_data, pred_idx in zip(original_tokens, predictions):
+#                     raw_predictions.append({
+#                         "word": token_data["word"],
+#                         "bbox": token_data["raw_bbox"],
+#                         "predicted_label": IDX2LABEL[pred_idx]
+#                     })
+#
+#         # 2. Stage 2: Structured JSON Conversion (The NEW step)
+#         structured_output = convert_bio_to_structured_json_strict(raw_predictions)
+#
+#         status_message = f"✅ Conversion complete. Found {len([i for i in structured_output if i.get('type') == 'MCQ'])} MCQ items."
+#
+#         # Return the final structured output
+#         return status_message, structured_output
+#
+#     except RuntimeError as e:
+#         return f"❌ PDF Processing Error: {e}", []
+#     except Exception as e:
+#         # Catch any unexpected errors during inference or structuring
+#         return f"❌ An unexpected processing error occurred: {e}", []
+#
+#
+# # =========================================================
+# # 5. Define and Launch the Gradio Interface
+# # (Output changed to only show the final structured JSON)
+# # =========================================================
+#
+# if __name__ == "__main__":
+#     title = "MCQ Document Structure Tagger (Bi-LSTM-CRF) - Structured Output"
+#     description = "Upload a PDF document. The system processes it in two stages: 1) BIO-Tagging for structural elements (Question, Option, Answer, Passage) and 2) Converting those tags into a clean, structured JSON list of MCQ items."
+#
+#     demo = gr.Interface(
+#         fn=gradio_inference_wrapper,
+#         inputs=gr.File(label="Upload PDF Document", file_types=['pdf']),
+#         outputs=[
+#             gr.Textbox(label="Status Message", interactive=False),
+#             gr.JSON(label="Structured MCQ JSON Output", show_label=True)
+#         ],
+#         title=title,
+#         description=description,
+#         allow_flagging="never",
+#         concurrency_limit=2
+#     )
+#
+#     demo.launch()
 import os
 import json
 import pickle
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import re
 from tqdm import tqdm
 # === GRADIO AND DEPENDENCIES ===
 import gradio as gr
 import fitz  # PyMuPDF
 from PIL import Image, ImageEnhance
 import pytesseract
 try:
+    # Attempt to import the actual CRF layer for correct Viterbi decoding
     from TorchCRF import CRF
 except ImportError:
+    # Placeholder for environments where it's not yet installed, enabling model definition
     class CRF:
+        def __init__(self, *args, **kwargs):
+            pass
+        # Fallback to simple argmax decoding if the CRF module is missing
+        def viterbi_decode(self, emissions, mask):
+            return [list(torch.argmax(emissions[0], dim=-1).cpu().numpy())]
 # ========== CONFIG (Must match Training Script) ==========
 MODEL_FILE = "model_CAT.pt"
 VOCAB_FILE = "vocabs_CAT.pkl"
 INFERENCE_CHUNK_SIZE = 256
 # ========== LABELS (Must match Training Script) ==========
+# Including PASSAGE for the new structuring logic
+LABELS = ["O", "B-QUESTION", "I-QUESTION", "B-OPTION", "I-OPTION", "B-ANSWER", "I-ANSWER", "B-IMAGE", "I-IMAGE", "B-PASSAGE", "I-PASSAGE"]
 LABEL2IDX = {l: i for i, l in enumerate(LABELS)}
 IDX2LABEL = {i: l for i, l in enumerate(LABELS)}
 # =========================================================
+# 1. Core Classes (Vocab, CharCNNEncoder, MCQTagger)
 # =========================================================
 class Vocab:
     def __init__(self, min_freq=1, unk_token="<UNK>", pad_token="<PAD>"):
         self.min_freq = min_freq
         self.unk_token = unk_token
         self.pad_token = pad_token
         self.freq = Counter()
+        self.itos = []
+        self.stoi = {}
     def add_sentence(self, toks):
         self.freq.update(toks)
         return len(self.itos)
     def __getitem__(self, token: str) -> int:
         return self.stoi.get(token, self.stoi[self.unk_token])
     def __getstate__(self):
 def load_vocabs(path: str) -> Tuple[Vocab, Vocab]:
+    """Loads word and character vocabularies."""
     try:
         absolute_path = os.path.abspath(path)
         with open(absolute_path, "rb") as f:
             word_vocab, char_vocab = pickle.load(f)
         if len(word_vocab) <= 2:
+            raise IndexError("CRITICAL: Word vocabulary size is too small.")
         return word_vocab, char_vocab
     except Exception as e:
         raise RuntimeError(f"Error loading vocabs from {path}: {e}")
         if lengths.max().item() == 0:
             B, L = enc_in.size(0), enc_in.size(1)
+            # Return zero tensor if batch is empty
             return torch.zeros((B, L, len(LABELS)), device=enc_in.device)
         packed_in = nn.utils.rnn.pack_padded_sequence(enc_in, lengths, batch_first=True, enforce_sorted=False)
     def forward(self, words, chars, bboxes, mask, labels=None, class_weights=None, alpha=0.7):
         emissions = self.forward_emissions(words, chars, bboxes, mask)
         return self.crf.viterbi_decode(emissions, mask=mask)
 # =========================================================
+# 2. PDF Processing Functions
 # =========================================================
 def ocr_fallback_page(page: fitz.Page, page_width: float, page_height: float) -> List[Dict[str, Any]]:
+    """Renders a PyMuPDF page, runs Tesseract OCR, and tokenizes the result."""
     try:
         pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
+        if pix.n - pix.alpha > 3:
             pix = fitz.Pixmap(fitz.csRGB, pix)
         img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+        # Preprocessing for Tesseract
         img_pil = img_pil.convert('L')
         img_pil = ImageEnhance.Contrast(img_pil).enhance(2.0)
         img_pil = ImageEnhance.Sharpness(img_pil).enhance(2.0)
         ocr_data = pytesseract.image_to_data(img_pil, output_type=pytesseract.Output.DICT)
         ocr_tokens = []
         for i in range(len(ocr_data['text'])):
             word = ocr_data['text'][i]
             conf = ocr_data['conf'][i]
             if word.strip() and int(conf) > 50:
+                left, top, width, height = (ocr_data[k][i] for k in ['left', 'top', 'width', 'height'])
                 scale = page_width / pix.width
                 raw_bbox = [
+                    left * scale, top * scale, (left + width) * scale, (top + height) * scale
                 ]
                 normalized_bbox = [
                     (raw_bbox[0] / page_width) * BBOX_NORM_CONSTANT,
                     (raw_bbox[1] / page_height) * BBOX_NORM_CONSTANT,
         return ocr_tokens
     except Exception as e:
         print(f"OCR fallback failed: {e}")
         return []
 def extract_tokens_from_pdf_fitz_with_ocr(pdf_path: str) -> List[Dict[str, Any]]:
+    """Extracts words and bboxes using PyMuPDF text layer and falls back to OCR."""
     all_tokens = []
     try:
         doc = fitz.open(pdf_path)
             page_width, page_height = page.rect.width, page.rect.height
             page_tokens = []
+            # 1. Primary Extraction: PyMuPDF's word structure
             word_list = page.get_text("words", sort=True)
             if word_list:
                     word = word_data[4]
                     raw_bbox = word_data[:4]
                     normalized_bbox = [
                         (raw_bbox[0] / page_width) * BBOX_NORM_CONSTANT,
                         (raw_bbox[1] / page_height) * BBOX_NORM_CONSTANT,
 def preprocess_and_collate_tokens(all_tokens: List[Dict[str, Any]], word_vocab: Vocab, char_vocab: Vocab,
                                   chunk_size: int) -> List[Dict[str, Any]]:
+    """Chunks the token list, converts to IDs, and prepares batches for inference."""
     all_batches = []
     for i in range(0, len(all_tokens), chunk_size):
             "chars": char_pad,
             "bboxes": bbox_pad,
             "mask": mask,
+            "original_tokens": chunk
         })
     return all_batches
 # =========================================================
+# 3. Model Loading and Caching (Global Variables Defined Here!)
 # =========================================================
+# Global variables (MODEL, VOCABS) are defined here for use in the wrapper function
+WORD_VOCAB = None
+CHAR_VOCAB = None
+MODEL = None
 try:
     WORD_VOCAB, CHAR_VOCAB = load_vocabs(VOCAB_FILE)
     MODEL = MCQTagger(len(WORD_VOCAB), len(CHAR_VOCAB), len(LABELS)).to(DEVICE)
     MODEL.eval()
     print("✅ Model and Vocabs loaded successfully (Cached).")
 except Exception as e:
+    # This prevents the app from crashing if the model files are missing on startup
     print(f"❌ Initial Model/Vocab Load Failure: {e}")
+    print("The Gradio demo will not function until model_CAT.pt and vocabs_CAT.pkl are found.")
 # =========================================================
+# 4. Structuring Logic (Converts BIO to clean JSON)
 # =========================================================
+def finalize_passage_to_item(item, passage_buffer):
+    """Adds passage text to the current item and clears the buffer."""
+    if passage_buffer:
+        passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
+        if item.get('passage'):
+            item['passage'] += ' ' + passage_text
+        else:
+            item['passage'] = passage_text
+    passage_buffer.clear()
+    return item
+def convert_bio_to_structured_json_strict(predictions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
     """
+    Converts a list of {word, predicted_label} tokens into structured MCQ JSON format.
+    """
+    structured_data = []
+    current_item = None
+    current_option_key = None
+    current_passage_buffer = []
+    current_text_buffer = []
+    first_question_started = False
+    last_entity_type = None
+    for item in predictions:
+        word = item['word']
+        label = item['predicted_label']
+        entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
+        current_text_buffer.append(word)
+        is_passage_label = (label == 'B-PASSAGE' or label == 'I-PASSAGE')
+        # --- BEFORE FIRST QUESTION/METADATA HANDLING ---
+        if not first_question_started and label != 'B-QUESTION' and not is_passage_label:
+            continue
+        # --- PASSAGE HANDLING (Before question start) ---
+        if not first_question_started and is_passage_label:
+            if label == 'B-PASSAGE' or (label == 'I-PASSAGE' and last_entity_type == 'PASSAGE'):
+                current_passage_buffer.append(word)
+                last_entity_type = 'PASSAGE'
+            continue
+        # --- NEW QUESTION START (B-QUESTION) ---
+        if label == 'B-QUESTION':
+            # 1. Capture leading text/passage as METADATA
+            if not first_question_started:
+                header_text = ' '.join(current_text_buffer[:-1]).strip()
+                if header_text or current_passage_buffer:
+                    metadata_item = {'type': 'METADATA'}
+                    metadata_item = finalize_passage_to_item(metadata_item, current_passage_buffer)
+                    if header_text:
+                        metadata_item['text'] = header_text
+                    structured_data.append(metadata_item)
+                first_question_started = True
+                current_text_buffer = [word]
+            # 2. Save previous question block
+            elif current_item is not None:
+                current_item = finalize_passage_to_item(current_item, current_passage_buffer)
+                current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
+                structured_data.append(current_item)
+                current_text_buffer = [word]
+            # 3. Initialize new question
+            current_item = {
+                'type': 'MCQ',
+                'question': word,
+                'options_text': {},
+                'answer': '',
+                'text': ''
+            }
+            current_option_key = None
+            last_entity_type = 'QUESTION'
+            continue
+        # --- IF INSIDE A QUESTION BLOCK ---
+        if current_item is not None:
+            if label.startswith('B-'):
+                last_entity_type = entity_type
+                if entity_type == 'PASSAGE':
+                    finalize_passage_to_item(current_item, current_passage_buffer)
+                    current_passage_buffer.append(word)
+                elif entity_type == 'OPTION':
+                    current_option_key = word
+                    current_item['options_text'][current_option_key] = word
+                    current_passage_buffer = []
+                elif entity_type == 'ANSWER':
+                    current_item['answer'] = word
+                    current_option_key = None
+                    current_passage_buffer = []
+                elif entity_type == 'QUESTION':
+                    current_item['question'] += f' {word}'
+                    current_passage_buffer = []
+            elif label.startswith('I-'):
+                if entity_type == 'QUESTION' and last_entity_type == 'QUESTION':
+                    current_item['question'] += f' {word}'
+                elif entity_type == 'OPTION' and last_entity_type == 'OPTION' and current_option_key is not None:
+                    current_item['options_text'][current_option_key] += f' {word}'
+                elif entity_type == 'ANSWER' and last_entity_type == 'ANSWER':
+                    current_item['answer'] += f' {word}'
+                elif entity_type == 'PASSAGE' and last_entity_type == 'PASSAGE':
+                    current_passage_buffer.append(word)
+            elif label == 'O':
+                pass
+    # --- Finalize last item ---
+    if current_item is not None:
+        current_item = finalize_passage_to_item(current_item, current_passage_buffer)
+        current_item['text'] = re.sub(r'\s{2,}', ' ', ' '.join(current_text_buffer)).strip()
+        structured_data.append(current_item)
+    elif not structured_data and current_passage_buffer:
+        # Case: Only passage/metadata was present in the whole document
+        metadata_item = {'type': 'METADATA'}
+        metadata_item = finalize_passage_to_item(metadata_item, current_passage_buffer)
+        metadata_item['text'] = re.sub(r'\s{2,}', ' ', ' '.join(current_text_buffer)).strip()
+        structured_data.append(metadata_item)
+    # --- FINAL CLEANUP ---
+    for item in structured_data:
+        # Clean up all text fields for excessive whitespace
+        item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
+        if 'passage' in item:
+            item['passage'] = re.sub(r'\s{2,}', ' ', item['passage']).strip()
+            if not item['passage']:
+                del item['passage']
+        for field in ['question', 'answer']:
+            if field in item:
+                item[field] = re.sub(r'\s{2,}', ' ', item[field]).strip()
+        if 'options_text' in item:
+            for k, v in item['options_text'].items():
+                 item['options_text'][k] = re.sub(r'\s{2,}', ' ', v).strip()
+    return structured_data
+# =========================================================
+# 5. The Gradio Inference Wrapper Function (Main Entry Point)
+# =========================================================
+def gradio_inference_wrapper(pdf_file: str) -> Tuple[str, List[Dict[str, Any]]]:
+    """
+    Wraps the entire two-stage pipeline: (1) Tagging -> (2) Structuring.
     """
+    # Uses global variables defined in Section 3
     if MODEL is None:
         return "❌ ERROR: Model failed to load on startup. Check 'model_CAT.pt' and 'vocabs_CAT.pkl'.", []
     pdf_path = pdf_file
+    raw_predictions = []
     try:
+        # 1. Stage 1: PDF Processing and BIO Tagging
         all_tokens = extract_tokens_from_pdf(pdf_path)
+        if not all_tokens:
+            return "❌ ERROR: No tokens were extracted from the PDF, even after OCR fallback.", []
+        # Uses global variables WORD_VOCAB, CHAR_VOCAB, INFERENCE_CHUNK_SIZE
+        batches = preprocess_and_collate_tokens(all_tokens, WORD_VOCAB, CHAR_VOCAB, chunk_size=INFERENCE_CHUNK_SIZE)
+        with torch.no_grad():
+            for batch in batches:
+                words, chars, bboxes, mask = (batch[k] for k in ["words", "chars", "bboxes", "mask"])
+                preds_batch = MODEL(words, chars, bboxes, mask)
+                predictions = preds_batch[0]
+                original_tokens = batch["original_tokens"]
+                for token_data, pred_idx in zip(original_tokens, predictions):
+                    # Uses global variable IDX2LABEL
+                    raw_predictions.append({
+                        "word": token_data["word"],
+                        "bbox": token_data["raw_bbox"],
+                        "predicted_label": IDX2LABEL[pred_idx]
+                    })
+        # 2. Stage 2: Structured JSON Conversion
+        structured_output = convert_bio_to_structured_json_strict(raw_predictions)
+        mcq_count = len([i for i in structured_output if i.get('type') == 'MCQ'])
+        status_message = f"✅ Conversion complete. Found {mcq_count} MCQ items and {len(structured_output) - mcq_count} Metadata blocks."
+        return status_message, structured_output
+    except RuntimeError as e:
+        return f"❌ PDF Processing Error: {e}", []
+    except Exception as e:
+        return f"❌ An unexpected processing error occurred: {e}", []
 # =========================================================
+# 6. Define and Launch the Gradio Interface
 # =========================================================
 if __name__ == "__main__":
+    title = "MCQ Document Structure Tagger (Bi-LSTM-CRF) - Structured Output"
+    description = "Upload a PDF document. The system processes it in two stages: 1) BIO-Tagging for structural elements (Question, Option, Answer, Passage) and 2) Converting those tags into a clean, structured JSON list of MCQ items."
     demo = gr.Interface(
         fn=gradio_inference_wrapper,
+        # Ensure only PDF files are accepted
+        inputs=gr.File(label="Upload PDF Document", file_types=['pdf']),
         outputs=[
             gr.Textbox(label="Status Message", interactive=False),
+            gr.JSON(label="Structured MCQ JSON Output", show_label=True)
         ],
         title=title,
         description=description,
         allow_flagging="never",
         concurrency_limit=2
     )
+    demo.launch()