aagamjtdev committed on
Commit 0dc2968 · 1 Parent(s): 9c882b5

Initial deployment with LFS-tracked model

Files changed (4)
  1. app.py +435 -0
  2. model_CAT.pt +3 -0
  3. requirements.txt +6 -0
  4. vocabs_CAT.pkl +3 -0
app.py ADDED
@@ -0,0 +1,435 @@
+ import os
+ import json
+ import pickle
+ from typing import List, Dict, Any, Tuple
+ from collections import Counter
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from tqdm import tqdm
+
+ # === GRADIO AND DEPENDENCIES ===
+ import gradio as gr
+ import fitz  # PyMuPDF
+ import re
+ from PIL import Image, ImageEnhance
+ import pytesseract
+
+ try:
+     from TorchCRF import CRF
+ except ImportError:
+     # requirements.txt installs TorchCRF on the Space; this local stub only keeps
+     # the module importable and falls back to greedy argmax decoding.
+     print("CRF module not found. Assuming deployment environment will install it.")
+
+     class CRF:
+         def __init__(self, *args, **kwargs): pass
+
+         def viterbi_decode(self, emissions, mask):
+             return [list(torch.argmax(emissions[0], dim=-1).cpu().numpy())]
+
+ # ========== CONFIG (Must match Training Script) ==========
+ # NOTE: In a Space, we typically don't use DATA_DIR paths if the files are alongside app.py
+ MODEL_FILE = "model_CAT.pt"
+ VOCAB_FILE = "vocabs_CAT.pkl"
+
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ MAX_CHAR_LEN = 16
+ EMBED_DIM = 100
+ CHAR_EMBED_DIM = 30
+ CHAR_CNN_OUT = 30
+ BBOX_DIM = 100
+ HIDDEN_SIZE = 512
+ BBOX_NORM_CONSTANT = 1000.0
+ INFERENCE_CHUNK_SIZE = 256
+
+ # ========== LABELS (Must match Training Script) ==========
+ LABELS = ["O", "B-QUESTION", "I-QUESTION", "B-OPTION", "I-OPTION", "B-ANSWER", "I-ANSWER", "B-IMAGE", "I-IMAGE"]
+ LABEL2IDX = {l: i for i, l in enumerate(LABELS)}
+ IDX2LABEL = {i: l for i, l in enumerate(LABELS)}
+
+
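+ # Illustration of the BIO scheme over a token stream (hypothetical example):
+ #   "Q1."  "What" "is" "2+2?"  ->  B-QUESTION I-QUESTION I-QUESTION I-QUESTION
+ #   "(A)"  "3"                 ->  B-OPTION   I-OPTION
+ #   "Ans:" "4"                 ->  B-ANSWER   I-ANSWER
+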
+ # =========================================================
+ # 1. Vocab, CharCNNEncoder, and MCQTagger Classes (must match the training script)
+ # =========================================================
+
+ class Vocab:
+     def __init__(self, min_freq=1, unk_token="<UNK>", pad_token="<PAD>"):
+         self.min_freq = min_freq
+         self.unk_token = unk_token
+         self.pad_token = pad_token
+         self.freq = Counter()
+         self.itos = []  # Index to String
+         self.stoi = {}  # String to Index
+
+     def add_sentence(self, toks):
+         self.freq.update(toks)
+
+     def build(self):
+         items = [tok for tok, c in self.freq.items() if c >= self.min_freq]
+         items = [self.pad_token, self.unk_token] + sorted(items)
+         self.itos = items
+         self.stoi = {s: i for i, s in enumerate(self.itos)}
+
+     def __len__(self):
+         return len(self.itos)
+
+     def __getitem__(self, token: str) -> int:
+         """Allows lookup via word_vocab[token]. Returns the UNK index if the token is not found."""
+         return self.stoi.get(token, self.stoi[self.unk_token])
+
+     def __getstate__(self):
+         return {
+             'min_freq': self.min_freq,
+             'unk_token': self.unk_token,
+             'pad_token': self.pad_token,
+             'itos': self.itos,
+             'stoi': self.stoi,
+         }
+
+     def __setstate__(self, state):
+         self.min_freq = state['min_freq']
+         self.unk_token = state['unk_token']
+         self.pad_token = state['pad_token']
+         self.itos = state['itos']
+         self.stoi = state['stoi']
+         self.freq = Counter()
+
+
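+ # Usage sketch (illustrative only):
+ #   v = Vocab(); v.add_sentence(["What", "is", "What"]); v.build()
+ #   v["What"]   -> an index >= 2 (indices 0 and 1 are <PAD> and <UNK>)
+ #   v["unseen"] -> the <UNK> index, via __getitem__'s fallback
+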
+ def load_vocabs(path: str) -> Tuple[Vocab, Vocab]:
+     """Loads word and character vocabularies from a pickle file and verifies size."""
+     try:
+         absolute_path = os.path.abspath(path)
+         if not os.path.exists(absolute_path):
+             raise FileNotFoundError(f"Vocab file NOT FOUND at: {absolute_path}")
+         with open(absolute_path, "rb") as f:
+             word_vocab, char_vocab = pickle.load(f)
+         if len(word_vocab) <= 2:
+             raise IndexError("CRITICAL: Word vocabulary size is too small. Vocab file is invalid.")
+         return word_vocab, char_vocab
+     except FileNotFoundError:
+         raise FileNotFoundError(f"Vocab file not found at {path}. Please run the training script first.")
+     except Exception as e:
+         raise RuntimeError(f"Error loading vocabs from {path}: {e}")
+
+
+ class CharCNNEncoder(nn.Module):
+     def __init__(self, char_vocab_size, char_emb_dim, out_dim, kernel_sizes=(3, 4, 5)):
+         super().__init__()
+         self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0)
+         convs = [nn.Conv1d(char_emb_dim, out_dim, kernel_size=k) for k in kernel_sizes]
+         self.convs = nn.ModuleList(convs)
+         self.out_dim = out_dim * len(convs)
+
+     def forward(self, char_ids):
+         B, L, C = char_ids.size()
+         emb = self.char_emb(char_ids.view(B * L, C)).transpose(1, 2)
+         outs = [torch.max(torch.relu(conv(emb)), dim=2)[0] for conv in self.convs]
+         res = torch.cat(outs, dim=1)
+         return res.view(B, L, -1)
+
+
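+ # Shape walk-through for forward(): char_ids (B, L, 16) is flattened to (B*L, 16),
+ # embedded to (B*L, 16, 30) and transposed to (B*L, 30, 16); each Conv1d + max-pool
+ # yields (B*L, 30), and concatenating the three kernel sizes gives (B, L, 90).
+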
+ class MCQTagger(nn.Module):
+     def __init__(self, vocab_size, char_vocab_size, n_labels, bbox_dim=BBOX_DIM):
+         super().__init__()
+         self.word_emb = nn.Embedding(vocab_size, EMBED_DIM, padding_idx=0)
+         self.char_enc = CharCNNEncoder(char_vocab_size, CHAR_EMBED_DIM, CHAR_CNN_OUT)
+         self.bbox_proj = nn.Linear(4, bbox_dim)
+         in_dim = EMBED_DIM + self.char_enc.out_dim + bbox_dim
+
+         self.bilstm = nn.LSTM(in_dim, HIDDEN_SIZE // 2, num_layers=2, batch_first=True, bidirectional=True, dropout=0.3)
+         self.ff = nn.Linear(HIDDEN_SIZE, n_labels)
+         self.crf = CRF(n_labels)
+         self.dropout = nn.Dropout(p=0.5)
+
+     def forward_emissions(self, words, chars, bboxes, mask):
+         wemb = self.word_emb(words)
+         cenc = self.char_enc(chars)
+         benc = self.bbox_proj(bboxes)
+         enc_in = torch.cat([wemb, cenc, benc], dim=-1)
+         enc_in = self.dropout(enc_in)
+         lengths = mask.sum(dim=1).cpu()
+
+         if lengths.max().item() == 0:
+             B, L = enc_in.size(0), enc_in.size(1)
+             return torch.zeros((B, L, len(LABELS)), device=enc_in.device)
+
+         packed_in = nn.utils.rnn.pack_padded_sequence(enc_in, lengths, batch_first=True, enforce_sorted=False)
+         packed_out, _ = self.bilstm(packed_in)
+         padded_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
+
+         return self.ff(padded_out)
+
+     def forward(self, words, chars, bboxes, mask, labels=None, class_weights=None, alpha=0.7):
+         emissions = self.forward_emissions(words, chars, bboxes, mask)
+         # Inference only: decode the best tag sequence with the CRF; no loss is computed here.
+         return self.crf.viterbi_decode(emissions, mask=mask)
+
+
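+ # Feature dimensions at a glance: word embedding (100) + char-CNN (90) + bbox
+ # projection (100) give a 290-dim input; the 2-layer BiLSTM emits 512 features
+ # (256 per direction), and the linear head scores the 9 BIO labels before CRF decoding.
+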
+ # =========================================================
+ # 2. PDF Processing Functions
+ # =========================================================
+
+ def ocr_fallback_page(page: fitz.Page, page_width: float, page_height: float) -> List[Dict[str, Any]]:
+     """
+     Renders a PyMuPDF page, runs Tesseract OCR, and tokenizes the result.
+     """
+     try:
+         # Render the page at high resolution (300 DPI equivalent)
+         pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
+         if pix.n - pix.alpha > 3:  # Handle CMYK
+             pix = fitz.Pixmap(fitz.csRGB, pix)
+
+         img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+
+         # Preprocessing for Tesseract: grayscale, then boost contrast and sharpness
+         img_pil = img_pil.convert('L')
+         img_pil = ImageEnhance.Contrast(img_pil).enhance(2.0)
+         img_pil = ImageEnhance.Sharpness(img_pil).enhance(2.0)
+
+         # Run Tesseract
+         ocr_data = pytesseract.image_to_data(img_pil, output_type=pytesseract.Output.DICT)
+
+         ocr_tokens = []
+         for i in range(len(ocr_data['text'])):
+             word = ocr_data['text'][i]
+             conf = ocr_data['conf'][i]
+
+             # Use only words with reasonable confidence
+             if word.strip() and int(conf) > 50:
+                 # Get Tesseract's raw pixel bounding box
+                 left = ocr_data['left'][i]
+                 top = ocr_data['top'][i]
+                 width = ocr_data['width'][i]
+                 height = ocr_data['height'][i]
+
+                 # Convert the pixel bbox back to the original PDF coordinate system
+                 scale = page_width / pix.width
+
+                 raw_bbox = [
+                     left * scale,
+                     top * scale,
+                     (left + width) * scale,
+                     (top + height) * scale
+                 ]
+
+                 # Normalize the bbox to the model's 0-1000 coordinate space
+                 normalized_bbox = [
+                     (raw_bbox[0] / page_width) * BBOX_NORM_CONSTANT,
+                     (raw_bbox[1] / page_height) * BBOX_NORM_CONSTANT,
+                     (raw_bbox[2] / page_width) * BBOX_NORM_CONSTANT,
+                     (raw_bbox[3] / page_height) * BBOX_NORM_CONSTANT
+                 ]
+
+                 ocr_tokens.append({
+                     "word": word,
+                     "raw_bbox": [int(b) for b in raw_bbox],
+                     "normalized_bbox": [int(b) for b in normalized_bbox]
+                 })
+
+         return ocr_tokens
+
+     except Exception as e:
+         print(f"OCR fallback failed: {e}")
+         return []
+
+
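+ # Coordinate example (hypothetical page size): at the 3x zoom above, a US-Letter
+ # page (612 pt wide) renders to pix.width = 1836, so a Tesseract box at pixel
+ # x = 300 maps back to 300 * (612 / 1836) = 100 pt, and then normalizes to
+ # (100 / 612) * 1000 ≈ 163 in the model's 0-1000 bbox space.
+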
+ def extract_tokens_from_pdf_fitz_with_ocr(pdf_path: str) -> List[Dict[str, Any]]:
+     """
+     Extracts words and their raw bounding boxes from the PyMuPDF (fitz) text layer,
+     falling back to OCR on pages where no text is found.
+     """
+     all_tokens = []
+     try:
+         doc = fitz.open(pdf_path)
+         for page_num in tqdm(range(len(doc)), desc="PDF Page Processing"):
+             page = doc.load_page(page_num)
+             page_width, page_height = page.rect.width, page.rect.height
+             page_tokens = []
+
+             # 1. Primary Extraction: PyMuPDF's word structure (fitz.Page.get_text("words"))
+             # word_list format: (x0, y0, x1, y1, word, ...)
+             word_list = page.get_text("words", sort=True)
+
+             if word_list:
+                 for word_data in word_list:
+                     word = word_data[4]
+                     raw_bbox = word_data[:4]
+
+                     # Normalize bboxes
+                     normalized_bbox = [
+                         (raw_bbox[0] / page_width) * BBOX_NORM_CONSTANT,
+                         (raw_bbox[1] / page_height) * BBOX_NORM_CONSTANT,
+                         (raw_bbox[2] / page_width) * BBOX_NORM_CONSTANT,
+                         (raw_bbox[3] / page_height) * BBOX_NORM_CONSTANT
+                     ]
+
+                     page_tokens.append({
+                         "word": word,
+                         "raw_bbox": [int(b) for b in raw_bbox],
+                         "normalized_bbox": [int(b) for b in normalized_bbox]
+                     })
+
+             # 2. OCR Fallback
+             if not page_tokens:
+                 print(f" (Page {page_num + 1}) No text layer found. Running OCR...")
+                 page_tokens = ocr_fallback_page(page, page_width, page_height)
+
+             all_tokens.extend(page_tokens)
+
+         doc.close()
+     except Exception as e:
+         raise RuntimeError(f"Error opening or processing PDF with fitz/OCR: {e}")
+
+     return all_tokens
+
+
+ extract_tokens_from_pdf = extract_tokens_from_pdf_fitz_with_ocr
+
+
+ def preprocess_and_collate_tokens(all_tokens: List[Dict[str, Any]], word_vocab: Vocab, char_vocab: Vocab,
+                                   chunk_size: int) -> List[Dict[str, Any]]:
+     """
+     Chunks the token list, converts tokens to IDs, and prepares batches for inference.
+     """
+     all_batches = []
+
+     for i in range(0, len(all_tokens), chunk_size):
+         chunk = all_tokens[i:i + chunk_size]
+         if not chunk:
+             continue
+
+         words = [t["word"] for t in chunk]
+         bboxes_norm = [t["normalized_bbox"] for t in chunk]
+
+         # Convert to IDs
+         word_ids = [word_vocab[w] for w in words]
+
+         char_ids = []
+         for w in words:
+             chs = [char_vocab[ch] for ch in w[:MAX_CHAR_LEN]]
+             if len(chs) < MAX_CHAR_LEN:
+                 pad_index = char_vocab.stoi.get(char_vocab.pad_token, 0)
+                 chs += [pad_index] * (MAX_CHAR_LEN - len(chs))
+             char_ids.append(chs)
+
+         # Create padded tensors (using single-sample batches)
+         word_pad = torch.LongTensor([word_ids]).to(DEVICE)
+         char_pad = torch.LongTensor([char_ids]).to(DEVICE)
+
+         # Final normalization to the [0, 1] range before feeding the model
+         bbox_pad = torch.FloatTensor([bboxes_norm]).to(DEVICE) / BBOX_NORM_CONSTANT
+         mask = torch.ones(word_pad.size(), dtype=torch.bool).to(DEVICE)
+
+         all_batches.append({
+             "words": word_pad,
+             "chars": char_pad,
+             "bboxes": bbox_pad,
+             "mask": mask,
+             "original_tokens": chunk  # Keep the original data for output formatting
+         })
+
+     return all_batches
+
+
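+ # Chunking example: 600 extracted tokens with INFERENCE_CHUNK_SIZE = 256 yield
+ # three single-sample batches of lengths 256, 256, and 88; there is no
+ # cross-sample padding, so the all-ones mask is always valid.
+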
+ # =========================================================
+ # 3. Model Loading and Caching (Crucial for Gradio performance)
+ # =========================================================
+
+ # Cache the model and vocabs globally so they are loaded only ONCE when the app starts.
+ # This avoids reloading the model on every user request, which is vital for speed.
+ try:
+     WORD_VOCAB, CHAR_VOCAB = load_vocabs(VOCAB_FILE)
+     MODEL = MCQTagger(len(WORD_VOCAB), len(CHAR_VOCAB), len(LABELS)).to(DEVICE)
+     MODEL.load_state_dict(torch.load(MODEL_FILE, map_location=DEVICE))
+     MODEL.eval()
+     print("✅ Model and Vocabs loaded successfully (Cached).")
+ except Exception as e:
+     MODEL = None
+     print(f"❌ Initial Model/Vocab Load Failure: {e}")
+     print("The Gradio demo will not function until model_CAT.pt and vocabs_CAT.pkl are in the root directory.")
+
+
+ # =========================================================
+ # 4. The Gradio Inference Wrapper Function
+ # =========================================================
+
+ def gradio_inference_wrapper(pdf_file: str) -> Tuple[str, List[Dict[str, Any]]]:
+     """
+     Wraps the entire inference pipeline for the Gradio Interface.
+
+     Args:
+         pdf_file: The path to the temporary PDF file uploaded by the user (a string).
+
+     Returns:
+         A tuple of (str, List[Dict[str, Any]]): a status message and the raw predictions.
+     """
+     if MODEL is None:
+         return "❌ ERROR: Model failed to load on startup. Check 'model_CAT.pt' and 'vocabs_CAT.pkl'.", []
+
+     pdf_path = pdf_file
+
+     try:
+         # 1. Extract Tokens
+         all_tokens = extract_tokens_from_pdf(pdf_path)
+     except RuntimeError as e:
+         return f"❌ PDF Processing Error: {e}", []
+
+     if not all_tokens:
+         return "❌ ERROR: No tokens were extracted from the PDF, even after OCR fallback.", []
+
+     # 2. Preprocess and Batch
+     batches = preprocess_and_collate_tokens(all_tokens, WORD_VOCAB, CHAR_VOCAB, chunk_size=INFERENCE_CHUNK_SIZE)
+
+     # 3. Run Inference
+     all_predictions = []
+     with torch.no_grad():
+         for batch in batches:
+             words, chars, bboxes, mask = (batch[k] for k in ["words", "chars", "bboxes", "mask"])
+
+             preds_batch = MODEL(words, chars, bboxes, mask)
+             predictions = preds_batch[0]
+
+             original_tokens = batch["original_tokens"]
+
+             for token_data, pred_idx in zip(original_tokens, predictions):
+                 all_predictions.append({
+                     "word": token_data["word"],
+                     "bbox": token_data["raw_bbox"],
+                     "predicted_label": IDX2LABEL[pred_idx]
+                 })
+
+     status_message = f"✅ Inference complete. Total tokens predicted: {len(all_predictions)}"
+
+     # Gradio will display the JSON output prettified
+     return status_message, all_predictions
+
+
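+ # Local smoke test (filename is illustrative; assumes a sample.pdf next to app.py):
+ #   status, preds = gradio_inference_wrapper("sample.pdf")
+ #   print(status); print(preds[:3])
+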
+ # =========================================================
+ # 5. Define and Launch the Gradio Interface
+ # =========================================================
+
+ if __name__ == "__main__":
+     title = "MCQ Document Structure Tagger (Bi-LSTM-CRF)"
+     description = "Upload a PDF document (e.g., an MCQ paper). The model will tokenize the text, run inference to predict BIO tags (B-QUESTION, I-OPTION, B-ANSWER, etc.) for each word, and return the raw JSON predictions."
+
+     # Define the Gradio Interface
+     demo = gr.Interface(
+         fn=gradio_inference_wrapper,
+         inputs=gr.File(label="Upload PDF Document", file_types=['.pdf']),
+         outputs=[
+             gr.Textbox(label="Status Message", interactive=False),
+             gr.JSON(label="Raw BIO Tagging Predictions (JSON)", show_label=True)
+         ],
+         title=title,
+         description=description,
+         allow_flagging="never",
+         # A modest concurrency limit (simultaneous users) suits a CPU/small-GPU Space
+         concurrency_limit=2
+     )
+
+     # Launch the demo
+     demo.launch()
model_CAT.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e7e571ec922de9e9d5095e3a2ef6b670895e1947c5be09db7c1112a49528ceda
+ size 15461951
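This is a Git LFS pointer, not the ~15 MB checkpoint itself; if a clone skips LFS, torch.load() will fail on the text stub. A minimal guard sketch (a hypothetical helper, not part of this commit) that app.py could run before loading:

    def looks_like_lfs_pointer(path: str) -> bool:
        # LFS pointers are tiny text files beginning with "version https://git-lfs..."
        try:
            with open(path, "rb") as f:
                return f.read(12) == b"version http"
        except OSError:
            return False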
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ gradio
+ torch
+ PyMuPDF
+ pytesseract
+ TorchCRF
+ Pillow
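A few packaging notes. The CRF requirement is spelled TorchCRF above so that it matches the "from TorchCRF import CRF" import in app.py; the original lowercase torch-crf spelling does not correspond to that import name. pytesseract only wraps the Tesseract binary, which pip does not install; on a Hugging Face Space the binary is conventionally supplied through an apt packages.txt file, e.g.:

    tesseract-ocr

Finally, tqdm (imported by app.py) arrives only as a transitive dependency of gradio; listing it explicitly would be safer.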
vocabs_CAT.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ace7379c6800c1f13f3859c7181b9be2a0d539debe762cf83739a93c20fb7f70
+ size 209360
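A quick way to sanity-check this artifact after the LFS fetch is to unpickle it and inspect the vocabulary sizes; a minimal sketch, assuming the Vocab class is resolvable under the same module path recorded when the file was pickled (app.py run as a script satisfies this):

    import pickle

    with open("vocabs_CAT.pkl", "rb") as f:
        word_vocab, char_vocab = pickle.load(f)
    print(len(word_vocab), len(char_vocab))  # load_vocabs() rejects word vocabs of size <= 2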