heerjtdev committed on
Commit
b67b6e8
·
verified ·
1 Parent(s): 0ff9ac0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -553
app.py CHANGED
@@ -1,557 +1,4 @@
1
 
2
- # import os
3
- # import json
4
- # import pickle
5
- # from typing import List, Dict, Any, Tuple
6
- # from collections import Counter
7
- # import torch
8
- # import torch.nn as nn
9
- # import torch.nn.functional as F
10
- # import re
11
- # from tqdm import tqdm
12
-
13
- # # === GRADIO AND DEPENDENCIES ===
14
- # import gradio as gr
15
- # import fitz # PyMuPDF
16
- # from PIL import Image, ImageEnhance
17
- # import pytesseract
18
-
19
- # try:
20
- # # Attempt to import the actual CRF layer for correct Viterbi decoding
21
- # from TorchCRF import CRF
22
- # except ImportError:
23
- # # Placeholder for environments where it's not yet installed, enabling model definition
24
- # class CRF:
25
- # def __init__(self, *args, **kwargs):
26
- # pass
27
- # # Fallback to simple argmax decoding if the CRF module is missing
28
- # def viterbi_decode(self, emissions, mask):
29
- # return [list(torch.argmax(emissions[0], dim=-1).cpu().numpy())]
30
-
31
-
32
- # # ========== CONFIG (Must match Training Script) ==========
33
- # MODEL_FILE = "model_enhanced.pt"
34
- # VOCAB_FILE = "vocabs_enhanced.pkl"
35
-
36
- # DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
37
- # MAX_CHAR_LEN = 16
38
- # EMBED_DIM = 100
39
- # CHAR_EMBED_DIM = 30
40
- # CHAR_CNN_OUT = 30
41
- # BBOX_DIM = 100
42
- # HIDDEN_SIZE = 512
43
- # BBOX_NORM_CONSTANT = 1000.0
44
- # INFERENCE_CHUNK_SIZE = 256
45
-
46
- # # ========== LABELS (Must match Training Script) ==========
47
- # # Including PASSAGE for the new structuring logic
48
- # # LABELS = ["O", "B-QUESTION", "I-QUESTION", "B-OPTION", "I-OPTION", "B-ANSWER", "I-ANSWER", "B-IMAGE", "I-IMAGE", "B-PASSAGE", "I-PASSAGE"]
49
- # # LABEL2IDX = {l: i for i, l in enumerate(LABELS)}
50
- # # IDX2LABEL = {i: l for i, l in enumerate(LABELS)}
51
- # LABELS = ["O", "B-QUESTION", "I-QUESTION", "B-OPTION", "I-OPTION", "B-ANSWER", "I-ANSWER", "B-PASSAGE", "I-PASSAGE", "B-SECTION HEADING", "I-SECTION HEADING"]
52
- # LABEL2IDX = {l: i for i, l in enumerate(LABELS)}
53
- # IDX2LABEL = {i: l for i, l in enumerate(LABELS)}
54
-
55
- # # =========================================================
56
- # # 1. Core Classes (Vocab, CharCNNEncoder, MCQTagger)
57
- # # =========================================================
58
-
59
- # class Vocab:
60
- # def __init__(self, min_freq=1, unk_token="<UNK>", pad_token="<PAD>"):
61
- # self.min_freq = min_freq
62
- # self.unk_token = unk_token
63
- # self.pad_token = pad_token
64
- # self.freq = Counter()
65
- # self.itos = []
66
- # self.stoi = {}
67
-
68
- # def add_sentence(self, toks):
69
- # self.freq.update(toks)
70
-
71
- # def build(self):
72
- # items = [tok for tok, c in self.freq.items() if c >= self.min_freq]
73
- # items = [self.pad_token, self.unk_token] + sorted(items)
74
- # self.itos = items
75
- # self.stoi = {s: i for i, s in enumerate(self.itos)}
76
-
77
- # def __len__(self):
78
- # return len(self.itos)
79
-
80
- # def __getitem__(self, token: str) -> int:
81
- # return self.stoi.get(token, self.stoi[self.unk_token])
82
-
83
- # def __getstate__(self):
84
- # return {
85
- # 'min_freq': self.min_freq,
86
- # 'unk_token': self.unk_token,
87
- # 'pad_token': self.pad_token,
88
- # 'itos': self.itos,
89
- # 'stoi': self.stoi,
90
- # }
91
-
92
- # def __setstate__(self, state):
93
- # self.min_freq = state['min_freq']
94
- # self.unk_token = state['unk_token']
95
- # self.pad_token = state['pad_token']
96
- # self.itos = state['itos']
97
- # self.stoi = state['stoi']
98
- # self.freq = Counter()
99
-
100
-
101
- # def load_vocabs(path: str) -> Tuple[Vocab, Vocab]:
102
- # """Loads word and character vocabularies."""
103
- # try:
104
- # absolute_path = os.path.abspath(path)
105
- # with open(absolute_path, "rb") as f:
106
- # word_vocab, char_vocab = pickle.load(f)
107
- # if len(word_vocab) <= 2:
108
- # raise IndexError("CRITICAL: Word vocabulary size is too small.")
109
- # return word_vocab, char_vocab
110
- # except Exception as e:
111
- # raise RuntimeError(f"Error loading vocabs from {path}: {e}")
112
-
113
-
114
- # class CharCNNEncoder(nn.Module):
115
- # def __init__(self, char_vocab_size, char_emb_dim, out_dim, kernel_sizes=(3, 4, 5)):
116
- # super().__init__()
117
- # self.char_emb = nn.Embedding(char_vocab_size, char_emb_dim, padding_idx=0)
118
- # convs = [nn.Conv1d(char_emb_dim, out_dim, kernel_size=k) for k in kernel_sizes]
119
- # self.convs = nn.ModuleList(convs)
120
- # self.out_dim = out_dim * len(convs)
121
-
122
- # def forward(self, char_ids):
123
- # B, L, C = char_ids.size()
124
- # emb = self.char_emb(char_ids.view(B * L, C)).transpose(1, 2)
125
- # outs = [torch.max(torch.relu(conv(emb)), dim=2)[0] for conv in self.convs]
126
- # res = torch.cat(outs, dim=1)
127
- # return res.view(B, L, -1)
128
-
129
-
130
- # class MCQTagger(nn.Module):
131
- # def __init__(self, vocab_size, char_vocab_size, n_labels, bbox_dim=BBOX_DIM):
132
- # super().__init__()
133
- # self.word_emb = nn.Embedding(vocab_size, EMBED_DIM, padding_idx=0)
134
- # self.char_enc = CharCNNEncoder(char_vocab_size, CHAR_EMBED_DIM, CHAR_CNN_OUT)
135
- # self.bbox_proj = nn.Linear(4, bbox_dim)
136
- # in_dim = EMBED_DIM + self.char_enc.out_dim + bbox_dim
137
-
138
- # self.bilstm = nn.LSTM(in_dim, HIDDEN_SIZE // 2, num_layers=2, batch_first=True, bidirectional=True, dropout=0.3)
139
- # self.ff = nn.Linear(HIDDEN_SIZE, n_labels)
140
- # self.crf = CRF(n_labels)
141
- # self.dropout = nn.Dropout(p=0.5)
142
-
143
- # def forward_emissions(self, words, chars, bboxes, mask):
144
- # wemb = self.word_emb(words)
145
- # cenc = self.char_enc(chars)
146
- # benc = self.bbox_proj(bboxes)
147
- # enc_in = torch.cat([wemb, cenc, benc], dim=-1)
148
- # enc_in = self.dropout(enc_in)
149
- # lengths = mask.sum(dim=1).cpu()
150
-
151
- # if lengths.max().item() == 0:
152
- # B, L = enc_in.size(0), enc_in.size(1)
153
- # # Return zero tensor if batch is empty
154
- # return torch.zeros((B, L, len(LABELS)), device=enc_in.device)
155
-
156
- # packed_in = nn.utils.rnn.pack_padded_sequence(enc_in, lengths, batch_first=True, enforce_sorted=False)
157
- # packed_out, _ = self.bilstm(packed_in)
158
- # padded_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
159
-
160
- # return self.ff(padded_out)
161
-
162
- # def forward(self, words, chars, bboxes, mask, labels=None, class_weights=None, alpha=0.7):
163
- # emissions = self.forward_emissions(words, chars, bboxes, mask)
164
- # return self.crf.viterbi_decode(emissions, mask=mask)
165
-
166
-
167
- # # =========================================================
168
- # # 2. PDF Processing Functions
169
- # # =========================================================
170
-
171
- # def ocr_fallback_page(page: fitz.Page, page_width: float, page_height: float) -> List[Dict[str, Any]]:
172
- # """Renders a PyMuPDF page, runs Tesseract OCR, and tokenizes the result."""
173
- # try:
174
- # pix = page.get_pixmap(matrix=fitz.Matrix(3, 3))
175
- # if pix.n - pix.alpha > 3:
176
- # pix = fitz.Pixmap(fitz.csRGB, pix)
177
-
178
- # img_pil = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
179
-
180
- # # Preprocessing for Tesseract
181
- # img_pil = img_pil.convert('L')
182
- # img_pil = ImageEnhance.Contrast(img_pil).enhance(2.0)
183
- # img_pil = ImageEnhance.Sharpness(img_pil).enhance(2.0)
184
-
185
- # ocr_data = pytesseract.image_to_data(img_pil, output_type=pytesseract.Output.DICT)
186
-
187
- # ocr_tokens = []
188
- # for i in range(len(ocr_data['text'])):
189
- # word = ocr_data['text'][i]
190
- # conf = ocr_data['conf'][i]
191
-
192
- # if word.strip() and int(conf) > 50:
193
- # left, top, width, height = (ocr_data[k][i] for k in ['left', 'top', 'width', 'height'])
194
- # scale = page_width / pix.width
195
-
196
- # raw_bbox = [
197
- # left * scale, top * scale, (left + width) * scale, (top + height) * scale
198
- # ]
199
-
200
- # normalized_bbox = [
201
- # (raw_bbox[0] / page_width) * BBOX_NORM_CONSTANT,
202
- # (raw_bbox[1] / page_height) * BBOX_NORM_CONSTANT,
203
- # (raw_bbox[2] / page_width) * BBOX_NORM_CONSTANT,
204
- # (raw_bbox[3] / page_height) * BBOX_NORM_CONSTANT
205
- # ]
206
-
207
- # ocr_tokens.append({
208
- # "word": word,
209
- # "raw_bbox": [int(b) for b in raw_bbox],
210
- # "normalized_bbox": [int(b) for b in normalized_bbox]
211
- # })
212
-
213
- # return ocr_tokens
214
-
215
- # except Exception as e:
216
- # print(f"OCR fallback failed: {e}")
217
- # return []
218
-
219
-
220
- # def extract_tokens_from_pdf_fitz_with_ocr(pdf_path: str) -> List[Dict[str, Any]]:
221
- # """Extracts words and bboxes using PyMuPDF text layer and falls back to OCR."""
222
- # all_tokens = []
223
- # try:
224
- # doc = fitz.open(pdf_path)
225
- # for page_num in tqdm(range(len(doc)), desc="PDF Page Processing"):
226
- # page = doc.load_page(page_num)
227
- # page_width, page_height = page.rect.width, page.rect.height
228
- # page_tokens = []
229
-
230
- # # 1. Primary Extraction: PyMuPDF's word structure
231
- # word_list = page.get_text("words", sort=True)
232
-
233
- # if word_list:
234
- # for word_data in word_list:
235
- # word = word_data[4]
236
- # raw_bbox = word_data[:4]
237
-
238
- # normalized_bbox = [
239
- # (raw_bbox[0] / page_width) * BBOX_NORM_CONSTANT,
240
- # (raw_bbox[1] / page_height) * BBOX_NORM_CONSTANT,
241
- # (raw_bbox[2] / page_width) * BBOX_NORM_CONSTANT,
242
- # (raw_bbox[3] / page_height) * BBOX_NORM_CONSTANT
243
- # ]
244
-
245
- # page_tokens.append({
246
- # "word": word,
247
- # "raw_bbox": [int(b) for b in raw_bbox],
248
- # "normalized_bbox": [int(b) for b in normalized_bbox]
249
- # })
250
-
251
- # # 2. OCR Fallback
252
- # if not page_tokens:
253
- # print(f" (Page {page_num + 1}) No text layer found. Running OCR...")
254
- # page_tokens = ocr_fallback_page(page, page_width, page_height)
255
-
256
- # all_tokens.extend(page_tokens)
257
-
258
- # doc.close()
259
- # except Exception as e:
260
- # raise RuntimeError(f"Error opening or processing PDF with fitz/OCR: {e}")
261
-
262
- # return all_tokens
263
-
264
-
265
- # extract_tokens_from_pdf = extract_tokens_from_pdf_fitz_with_ocr
266
-
267
-
268
- # def preprocess_and_collate_tokens(all_tokens: List[Dict[str, Any]], word_vocab: Vocab, char_vocab: Vocab,
269
- # chunk_size: int) -> List[Dict[str, Any]]:
270
- # """Chunks the token list, converts to IDs, and prepares batches for inference."""
271
- # all_batches = []
272
-
273
- # for i in range(0, len(all_tokens), chunk_size):
274
- # chunk = all_tokens[i:i + chunk_size]
275
- # if not chunk: continue
276
-
277
- # words = [t["word"] for t in chunk]
278
- # bboxes_norm = [t["normalized_bbox"] for t in chunk]
279
-
280
- # # Convert to IDs
281
- # word_ids = [word_vocab[w] for w in words]
282
-
283
- # char_ids = []
284
- # for w in words:
285
- # chs = [char_vocab[ch] for ch in w[:MAX_CHAR_LEN]]
286
- # if len(chs) < MAX_CHAR_LEN:
287
- # pad_index = char_vocab.stoi.get(char_vocab.pad_token, 0)
288
- # chs += [pad_index] * (MAX_CHAR_LEN - len(chs))
289
- # char_ids.append(chs)
290
-
291
- # # Create padded tensors (using single-sample batches)
292
- # word_pad = torch.LongTensor([word_ids]).to(DEVICE)
293
- # char_pad = torch.LongTensor([char_ids]).to(DEVICE)
294
-
295
- # # Final normalization to [0, 1] range before feeding to the model
296
- # bbox_pad = torch.FloatTensor([bboxes_norm]).to(DEVICE) / BBOX_NORM_CONSTANT
297
- # mask = torch.ones(word_pad.size(), dtype=torch.bool).to(DEVICE)
298
-
299
- # all_batches.append({
300
- # "words": word_pad,
301
- # "chars": char_pad,
302
- # "bboxes": bbox_pad,
303
- # "mask": mask,
304
- # "original_tokens": chunk
305
- # })
306
-
307
- # return all_batches
308
-
309
-
310
- # # =========================================================
311
- # # 3. Model Loading and Caching (Global Variables Defined Here!)
312
- # # =========================================================
313
-
314
- # # Global variables (MODEL, VOCABS) are defined here for use in the wrapper function
315
- # WORD_VOCAB = None
316
- # CHAR_VOCAB = None
317
- # MODEL = None
318
-
319
- # try:
320
- # WORD_VOCAB, CHAR_VOCAB = load_vocabs(VOCAB_FILE)
321
- # MODEL = MCQTagger(len(WORD_VOCAB), len(CHAR_VOCAB), len(LABELS)).to(DEVICE)
322
- # MODEL.load_state_dict(torch.load(MODEL_FILE, map_location=DEVICE))
323
- # MODEL.eval()
324
- # print("✅ Model and Vocabs loaded successfully (Cached).")
325
- # except Exception as e:
326
- # # This prevents the app from crashing if the model files are missing on startup
327
- # print(f"❌ Initial Model/Vocab Load Failure: {e}")
328
- # print("The Gradio demo will not function until model_CAT.pt and vocabs_CAT.pkl are found.")
329
-
330
-
331
- # # =========================================================
332
- # # 4. Structuring Logic (Converts BIO to clean JSON)
333
- # # =========================================================
334
-
335
- # def finalize_passage_to_item(item, passage_buffer):
336
- # """Adds passage text to the current item and clears the buffer."""
337
- # if passage_buffer:
338
- # passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
339
- # if item.get('passage'):
340
- # item['passage'] += ' ' + passage_text
341
- # else:
342
- # item['passage'] = passage_text
343
- # passage_buffer.clear()
344
- # return item
345
-
346
- # def convert_bio_to_structured_json_strict(predictions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
347
- # """
348
- # Converts a list of {word, predicted_label} tokens into structured MCQ JSON format.
349
- # """
350
- # structured_data = []
351
- # current_item = None
352
- # current_option_key = None
353
- # current_passage_buffer = []
354
- # current_text_buffer = []
355
-
356
- # first_question_started = False
357
- # last_entity_type = None
358
-
359
- # for item in predictions:
360
- # word = item['word']
361
- # label = item['predicted_label']
362
- # entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
363
-
364
- # current_text_buffer.append(word)
365
-
366
- # is_passage_label = (label == 'B-PASSAGE' or label == 'I-PASSAGE')
367
-
368
- # # --- BEFORE FIRST QUESTION/METADATA HANDLING ---
369
- # if not first_question_started and label != 'B-QUESTION' and not is_passage_label:
370
- # continue
371
-
372
- # # --- PASSAGE HANDLING (Before question start) ---
373
- # if not first_question_started and is_passage_label:
374
- # if label == 'B-PASSAGE' or (label == 'I-PASSAGE' and last_entity_type == 'PASSAGE'):
375
- # current_passage_buffer.append(word)
376
- # last_entity_type = 'PASSAGE'
377
- # continue
378
-
379
- # # --- NEW QUESTION START (B-QUESTION) ---
380
- # if label == 'B-QUESTION':
381
- # # 1. Capture leading text/passage as METADATA
382
- # if not first_question_started:
383
- # header_text = ' '.join(current_text_buffer[:-1]).strip()
384
- # if header_text or current_passage_buffer:
385
- # metadata_item = {'type': 'METADATA'}
386
- # metadata_item = finalize_passage_to_item(metadata_item, current_passage_buffer)
387
- # if header_text:
388
- # metadata_item['text'] = header_text
389
- # structured_data.append(metadata_item)
390
-
391
- # first_question_started = True
392
- # current_text_buffer = [word]
393
-
394
- # # 2. Save previous question block
395
- # elif current_item is not None:
396
- # current_item = finalize_passage_to_item(current_item, current_passage_buffer)
397
- # current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
398
- # structured_data.append(current_item)
399
- # current_text_buffer = [word]
400
-
401
- # # 3. Initialize new question
402
- # current_item = {
403
- # 'type': 'MCQ',
404
- # 'question': word,
405
- # 'options': {},
406
- # 'answer': '',
407
- # 'text': ''
408
- # }
409
- # current_option_key = None
410
- # last_entity_type = 'QUESTION'
411
- # continue
412
-
413
- # # --- IF INSIDE A QUESTION BLOCK ---
414
- # if current_item is not None:
415
-
416
- # if label.startswith('B-'):
417
- # last_entity_type = entity_type
418
-
419
- # if entity_type == 'PASSAGE':
420
- # finalize_passage_to_item(current_item, current_passage_buffer)
421
- # current_passage_buffer.append(word)
422
- # elif entity_type == 'OPTION':
423
- # current_option_key = word
424
- # current_item['options'][current_option_key] = word
425
- # current_passage_buffer = []
426
- # elif entity_type == 'ANSWER':
427
- # current_item['answer'] = word
428
- # current_option_key = None
429
- # current_passage_buffer = []
430
- # elif entity_type == 'QUESTION':
431
- # current_item['question'] += f' {word}'
432
- # current_passage_buffer = []
433
-
434
- # elif label.startswith('I-'):
435
- # if entity_type == 'QUESTION' and last_entity_type == 'QUESTION':
436
- # current_item['question'] += f' {word}'
437
- # elif entity_type == 'OPTION' and last_entity_type == 'OPTION' and current_option_key is not None:
438
- # current_item['options'][current_option_key] += f' {word}'
439
- # elif entity_type == 'ANSWER' and last_entity_type == 'ANSWER':
440
- # current_item['answer'] += f' {word}'
441
- # elif entity_type == 'PASSAGE' and last_entity_type == 'PASSAGE':
442
- # current_passage_buffer.append(word)
443
-
444
- # elif label == 'O':
445
- # pass
446
-
447
- # # --- Finalize last item ---
448
- # if current_item is not None:
449
- # current_item = finalize_passage_to_item(current_item, current_passage_buffer)
450
- # current_item['text'] = re.sub(r'\s{2,}', ' ', ' '.join(current_text_buffer)).strip()
451
- # structured_data.append(current_item)
452
- # elif not structured_data and current_passage_buffer:
453
- # # Case: Only passage/metadata was present in the whole document
454
- # metadata_item = {'type': 'METADATA'}
455
- # metadata_item = finalize_passage_to_item(metadata_item, current_passage_buffer)
456
- # metadata_item['text'] = re.sub(r'\s{2,}', ' ', ' '.join(current_text_buffer)).strip()
457
- # structured_data.append(metadata_item)
458
-
459
-
460
- # # --- FINAL CLEANUP ---
461
- # for item in structured_data:
462
- # # Clean up all text fields for excessive whitespace
463
- # item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
464
- # if 'passage' in item:
465
- # item['passage'] = re.sub(r'\s{2,}', ' ', item['passage']).strip()
466
- # if not item['passage']:
467
- # del item['passage']
468
- # for field in ['question', 'answer']:
469
- # if field in item:
470
- # item[field] = re.sub(r'\s{2,}', ' ', item[field]).strip()
471
- # if 'options' in item:
472
- # for k, v in item['options'].items():
473
- # item['options'][k] = re.sub(r'\s{2,}', ' ', v).strip()
474
-
475
- # return structured_data
476
-
477
-
478
- # # =========================================================
479
- # # 5. The Gradio Inference Wrapper Function (Main Entry Point)
480
- # # =========================================================
481
-
482
- # def gradio_inference_wrapper(pdf_file: str) -> Tuple[str, List[Dict[str, Any]]]:
483
- # """
484
- # Wraps the entire two-stage pipeline: (1) Tagging -> (2) Structuring.
485
- # """
486
- # # Uses global variables defined in Section 3
487
- # if MODEL is None:
488
- # return "❌ ERROR: Model failed to load on startup. Check 'model_CAT.pt' and 'vocabs_CAT.pkl'.", []
489
-
490
- # pdf_path = pdf_file
491
- # raw_predictions = []
492
-
493
- # try:
494
- # # 1. Stage 1: PDF Processing and BIO Tagging
495
- # all_tokens = extract_tokens_from_pdf(pdf_path)
496
-
497
- # if not all_tokens:
498
- # return "❌ ERROR: No tokens were extracted from the PDF, even after OCR fallback.", []
499
-
500
- # # Uses global variables WORD_VOCAB, CHAR_VOCAB, INFERENCE_CHUNK_SIZE
501
- # batches = preprocess_and_collate_tokens(all_tokens, WORD_VOCAB, CHAR_VOCAB, chunk_size=INFERENCE_CHUNK_SIZE)
502
-
503
- # with torch.no_grad():
504
- # for batch in batches:
505
- # words, chars, bboxes, mask = (batch[k] for k in ["words", "chars", "bboxes", "mask"])
506
- # preds_batch = MODEL(words, chars, bboxes, mask)
507
- # predictions = preds_batch[0]
508
- # original_tokens = batch["original_tokens"]
509
-
510
- # for token_data, pred_idx in zip(original_tokens, predictions):
511
- # # Uses global variable IDX2LABEL
512
- # raw_predictions.append({
513
- # "word": token_data["word"],
514
- # "bbox": token_data["raw_bbox"],
515
- # "predicted_label": IDX2LABEL[pred_idx]
516
- # })
517
-
518
- # # 2. Stage 2: Structured JSON Conversion
519
- # structured_output = convert_bio_to_structured_json_strict(raw_predictions)
520
-
521
- # mcq_count = len([i for i in structured_output if i.get('type') == 'MCQ'])
522
- # status_message = f"✅ Conversion complete. Found {mcq_count} MCQ items and {len(structured_output) - mcq_count} Metadata blocks."
523
-
524
- # return status_message, structured_output
525
-
526
- # except RuntimeError as e:
527
- # return f"❌ PDF Processing Error: {e}", []
528
- # except Exception as e:
529
- # return f"❌ An unexpected processing error occurred: {e}", []
530
-
531
-
532
- # # =========================================================
533
- # # 6. Define and Launch the Gradio Interface
534
- # # =========================================================
535
-
536
- # if __name__ == "__main__":
537
- # title = "MCQ Document Structure Tagger (Bi-LSTM-CRF) - Structured Output"
538
- # description = "Upload a PDF document. The system processes it in two stages: 1) BIO-Tagging for structural elements (Question, Option, Answer, Passage) and 2) Converting those tags into a clean, structured JSON list of MCQ items."
539
-
540
- # demo = gr.Interface(
541
- # fn=gradio_inference_wrapper,
542
- # # Ensure only PDF files are accepted
543
- # inputs=gr.File(label="Upload PDF Document"),
544
- # outputs=[
545
- # gr.Textbox(label="Status Message", interactive=False),
546
- # gr.JSON(label="Structured MCQ JSON Output", show_label=True)
547
- # ],
548
- # title=title,
549
- # description=description,
550
- # allow_flagging="never",
551
- # concurrency_limit=2
552
- # )
553
-
554
- # demo.launch(show_error=True)
555
 
556
  import os
557
  import json
 
1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
  import os
4
  import json