heerjtdev commited on
Commit
0ff9ac0
·
verified ·
1 Parent(s): 0f65208

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +173 -23
app.py CHANGED
@@ -553,16 +553,16 @@
553
 
554
  # demo.launch(show_error=True)
555
 
556
-
557
  import os
558
  import json
559
  import pickle
 
 
560
  from typing import List, Dict, Any, Tuple
561
  from collections import Counter
562
  import torch
563
  import torch.nn as nn
564
  import torch.nn.functional as F
565
- import re
566
  from tqdm import tqdm
567
  import gradio as gr
568
  import fitz # PyMuPDF
@@ -570,7 +570,6 @@ import sys
570
  from types import ModuleType
571
 
572
# --- 1. CRITICAL: MOCK THE TRAINING MODULE ---
# Register a fake "train_model" module so that pickle/torch.load can resolve
# classes that were pickled as "train_model.<ClassName>" at training time.
# (The real classes are attached to this module further down in the file.)
train_mod = ModuleType("train_model")
sys.modules["train_model"] = train_mod
576
 
@@ -581,7 +580,6 @@ except ImportError:
581
try:
    # Preferred: real CRF implementation (the checkpoint was trained with it).
    from TorchCRF import CRF
except ImportError:
    # Fallback stub so the module still imports without TorchCRF installed.
    # It defines no forward/decode logic, so a real model won't work with it.
    # NOTE(review): per the diff hunk context this inner try/except sits inside
    # an outer `except ImportError:` block not visible here — confirm nesting.
    class CRF(nn.Module):
        def __init__(self, *args, **kwargs): super().__init__()
587
 
@@ -608,7 +606,7 @@ LABELS = [
608
  ]
609
  IDX2LABEL = {i: l for i, l in enumerate(LABELS)}
610
 
611
- # --- 4. CLASSES (Re-defined to match training) ---
612
 
613
  class Vocab:
614
  def __init__(self, min_freq=1, unk_token="<UNK>", pad_token="<PAD>"):
@@ -696,26 +694,23 @@ class MCQTagger(nn.Module):
696
  emissions = self.ff(torch.cat([lstm_out, attn_out], dim=-1))
697
  return self.crf.viterbi_decode(emissions, mask=mask)
698
 
699
# --- 5. CRITICAL FIX: LINK CLASSES TO FAKE MODULE ---
# This tells pickle: "When you look for 'train_model.Vocab', look here instead."
train_mod.Vocab = Vocab
train_mod.MCQTagger = MCQTagger
train_mod.CharCNNEncoder = CharCNNEncoder
train_mod.SpatialAttention = SpatialAttention
705
 
706
- # --- 6. FEATURE EXTRACTORS ---
 
707
  def extract_spatial_features(tokens, idx):
708
  curr = tokens[idx]
709
  f = []
710
- # Vertical gaps
711
  if idx < len(tokens)-1: f.append(min((tokens[idx+1]['y0'] - curr['y1'])/100.0, 1.0))
712
  else: f.append(0.0)
713
  if idx > 0: f.append(min((curr['y0'] - tokens[idx-1]['y1'])/100.0, 1.0))
714
  else: f.append(0.0)
715
- # Positioning
716
  f.extend([curr['x0']/1000.0, (curr['x1']-curr['x0'])/1000.0, (curr['y1']-curr['y0'])/1000.0])
717
  f.extend([(curr['x0']+curr['x1'])/2000.0, (curr['y0']+curr['y1'])/2000.0, curr['x0']/1000.0])
718
- # Ratio & Alignment
719
  f.append(min(((curr['x1']-curr['x0'])/max((curr['y1']-curr['y0']),1.0))/10.0, 1.0))
720
  if idx > 0: f.append(float(abs(curr['x0'] - tokens[idx-1]['x0']) < 5))
721
  else: f.append(0.0)
@@ -733,14 +728,11 @@ def extract_context_features(tokens, idx, window=3):
733
  res = check_p(i)
734
  prev_res = [max(prev_res[j], res[j]) for j in range(3)]
735
  f.extend(prev_res)
736
-
737
  next_res = [0.0, 0.0, 0.0]
738
  for i in range(idx+1, min(len(tokens), idx+window+1)):
739
  res = check_p(i)
740
  next_res = [max(next_res[j], res[j]) for j in range(3)]
741
  f.extend(next_res)
742
-
743
- # Distances
744
  dq, dopt = 1.0, 1.0
745
  for i in range(idx+1, min(len(tokens), idx+window+1)):
746
  t = tokens[i]['text'].lower().strip()
@@ -749,17 +741,172 @@ def extract_context_features(tokens, idx, window=3):
749
  f.extend([dq, dopt])
750
  return f
751
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
752
  # --- 7. INFERENCE WRAPPER ---
 
753
  def gradio_inference(pdf_file):
754
  if not os.path.exists(MODEL_FILE) or not os.path.exists(VOCAB_FILE):
755
  return "❌ Missing model/vocab files.", []
756
 
757
  try:
758
- # Load Vocab
759
  with open(VOCAB_FILE, "rb") as f:
760
  word_vocab, char_vocab = pickle.load(f)
761
 
762
- # Load Model
763
  model = MCQTagger(len(word_vocab), len(char_vocab), len(LABELS)).to(DEVICE)
764
  state_dict = torch.load(MODEL_FILE, map_location=DEVICE)
765
  model.load_state_dict(state_dict if isinstance(state_dict, dict) else state_dict.state_dict())
@@ -776,19 +923,17 @@ def gradio_inference(pdf_file):
776
 
777
  if not all_tokens: return "❌ No text found.", []
778
 
779
- # Feature Extraction
780
  for i in range(len(all_tokens)):
781
  all_tokens[i]['spatial_features'] = extract_spatial_features(all_tokens, i)
782
  all_tokens[i]['context_features'] = extract_context_features(all_tokens, i)
783
 
784
  # Predict
785
- results = []
786
  for i in range(0, len(all_tokens), INFERENCE_CHUNK_SIZE):
787
  chunk = all_tokens[i : i + INFERENCE_CHUNK_SIZE]
788
 
789
- # Prepare Inputs
790
  w_ids = torch.LongTensor([[word_vocab[t['text']] for t in chunk]]).to(DEVICE)
791
-
792
  c_ids_list = []
793
  for t in chunk:
794
  chars = [char_vocab[c] for c in t['text'][:MAX_CHAR_LEN]]
@@ -804,19 +949,24 @@ def gradio_inference(pdf_file):
804
  with torch.no_grad():
805
  preds = model(w_ids, c_ids, bboxes, s_feats, c_feats, mask)[0]
806
  for t, p in zip(chunk, preds):
807
- results.append({"word": t['text'], "label": IDX2LABEL[p]})
 
808
 
809
- return "✅ Success", results
 
 
 
810
 
811
  except Exception as e:
812
  import traceback
 
813
  return f"❌ Error: {str(e)}", []
814
 
815
  # --- 8. UI ---
816
  demo = gr.Interface(
817
  fn=gradio_inference,
818
  inputs=gr.File(label="Upload PDF"),
819
- outputs=[gr.Textbox(label="Status"), gr.JSON(label="Predictions")],
820
  title="MCQ Enhanced Tagger"
821
  )
822
 
 
553
 
554
  # demo.launch(show_error=True)
555
 
 
556
  import os
557
  import json
558
  import pickle
559
+ import time
560
+ import re
561
  from typing import List, Dict, Any, Tuple
562
  from collections import Counter
563
  import torch
564
  import torch.nn as nn
565
  import torch.nn.functional as F
 
566
  from tqdm import tqdm
567
  import gradio as gr
568
  import fitz # PyMuPDF
 
570
  from types import ModuleType
571
 
572
  # --- 1. CRITICAL: MOCK THE TRAINING MODULE ---
 
573
  train_mod = ModuleType("train_model")
574
  sys.modules["train_model"] = train_mod
575
 
 
580
  try:
581
  from TorchCRF import CRF
582
  except ImportError:
 
583
  class CRF(nn.Module):
584
  def __init__(self, *args, **kwargs): super().__init__()
585
 
 
606
  ]
607
  IDX2LABEL = {i: l for i, l in enumerate(LABELS)}
608
 
609
+ # --- 4. CLASSES ---
610
 
611
  class Vocab:
612
  def __init__(self, min_freq=1, unk_token="<UNK>", pad_token="<PAD>"):
 
694
  emissions = self.ff(torch.cat([lstm_out, attn_out], dim=-1))
695
  return self.crf.viterbi_decode(emissions, mask=mask)
696
 
697
+ # Link classes to the fake module
 
698
  train_mod.Vocab = Vocab
699
  train_mod.MCQTagger = MCQTagger
700
  train_mod.CharCNNEncoder = CharCNNEncoder
701
  train_mod.SpatialAttention = SpatialAttention
702
 
703
+ # --- 5. FEATURE HELPERS ---
704
+
705
  def extract_spatial_features(tokens, idx):
706
  curr = tokens[idx]
707
  f = []
 
708
  if idx < len(tokens)-1: f.append(min((tokens[idx+1]['y0'] - curr['y1'])/100.0, 1.0))
709
  else: f.append(0.0)
710
  if idx > 0: f.append(min((curr['y0'] - tokens[idx-1]['y1'])/100.0, 1.0))
711
  else: f.append(0.0)
 
712
  f.extend([curr['x0']/1000.0, (curr['x1']-curr['x0'])/1000.0, (curr['y1']-curr['y0'])/1000.0])
713
  f.extend([(curr['x0']+curr['x1'])/2000.0, (curr['y0']+curr['y1'])/2000.0, curr['x0']/1000.0])
 
714
  f.append(min(((curr['x1']-curr['x0'])/max((curr['y1']-curr['y0']),1.0))/10.0, 1.0))
715
  if idx > 0: f.append(float(abs(curr['x0'] - tokens[idx-1]['x0']) < 5))
716
  else: f.append(0.0)
 
728
  res = check_p(i)
729
  prev_res = [max(prev_res[j], res[j]) for j in range(3)]
730
  f.extend(prev_res)
 
731
  next_res = [0.0, 0.0, 0.0]
732
  for i in range(idx+1, min(len(tokens), idx+window+1)):
733
  res = check_p(i)
734
  next_res = [max(next_res[j], res[j]) for j in range(3)]
735
  f.extend(next_res)
 
 
736
  dq, dopt = 1.0, 1.0
737
  for i in range(idx+1, min(len(tokens), idx+window+1)):
738
  t = tokens[i]['text'].lower().strip()
 
741
  f.extend([dq, dopt])
742
  return f
743
 
744
# --- 6. STRUCTURING LOGIC (Injected) ---

def convert_predictions_to_structured(predictions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Converts a flat list of predictions [{'word':..., 'predicted_label':...}]
    into structured JSON, implementing the specific logic provided.

    Single pass over the BIO-tagged word stream.  The output list contains:
      * optionally, one leading {'type': 'METADATA', ...} item holding any
        header text / passage seen before the first B-QUESTION, and
      * one item per question: {'question', 'options', 'answer', 'passage',
        'text'}, plus an optional 'new_passage' key for a passage that starts
        immediately after an option run.

    NOTE(review): this function's indentation was reconstructed from a diff
    view with flattened whitespace; the nesting of the `is_in_new_passage`
    branch in particular should be confirmed against the original file.
    """
    print("--- STARTING BIO TO STRUCTURED JSON DECODING ---")
    start_time = time.time()

    total_words = len(predictions)      # NOTE(review): currently unused.
    structured_data = []                # finished items, in document order
    current_item = None                 # question item being assembled
    current_option_key = None           # key of the option being extended
    current_passage_buffer = []         # words of the passage being collected
    current_text_buffer = []            # raw words since the current item started
    first_question_started = False      # becomes True at the first B-QUESTION
    last_entity_type = None             # entity type of the previous tagged word
    just_finished_i_option = False      # True right after an I-OPTION word
    is_in_new_passage = False           # collecting into item['new_passage']

    def finalize_passage_to_item(item, passage_buffer):
        # Flush buffered passage words into item['passage'] (appending if one
        # is already present), collapsing whitespace runs; empties the buffer
        # in place so the caller's list is reset too.
        if passage_buffer:
            passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
            if item.get('passage'):
                item['passage'] += ' ' + passage_text
            else:
                item['passage'] = passage_text
            passage_buffer.clear()

    for idx, item in enumerate(predictions):
        word = item['word']
        label = item['predicted_label']
        # 'B-QUESTION' -> 'QUESTION'; a plain 'O' label yields None.
        entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
        current_text_buffer.append(word)

        previous_entity_type = last_entity_type
        is_passage_label = (entity_type == 'PASSAGE')

        # Before the first question: only passage words are collected;
        # everything else just accumulates in the raw text buffer.
        if not first_question_started:
            if label != 'B-QUESTION' and not is_passage_label:
                just_finished_i_option = False
                is_in_new_passage = False
                continue
            if is_passage_label:
                current_passage_buffer.append(word)
                last_entity_type = 'PASSAGE'
                just_finished_i_option = False
                is_in_new_passage = False
                continue

        # A new question starts: emit header metadata (first time only) or
        # finalize the previous question item, then open a fresh one.
        if label == 'B-QUESTION':
            if not first_question_started:
                # Everything seen so far (minus this word) is header text.
                header_text = ' '.join(current_text_buffer[:-1]).strip()
                if header_text or current_passage_buffer:
                    metadata_item = {'type': 'METADATA', 'passage': ''}
                    finalize_passage_to_item(metadata_item, current_passage_buffer)
                    if header_text: metadata_item['text'] = header_text
                    structured_data.append(metadata_item)
                first_question_started = True
                current_text_buffer = [word]

            if current_item is not None:
                finalize_passage_to_item(current_item, current_passage_buffer)
                current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
                structured_data.append(current_item)
                current_text_buffer = [word]

            current_item = {
                'question': word, 'options': {}, 'answer': '', 'passage': '', 'text': ''
            }
            current_option_key = None
            last_entity_type = 'QUESTION'
            just_finished_i_option = False
            is_in_new_passage = False
            continue

        if current_item is not None:
            # Post-option passage ('new_passage') collection mode: the word is
            # appended first, then the mode may be ended by a non-passage tag.
            if is_in_new_passage:
                if 'new_passage' not in current_item:
                    current_item['new_passage'] = word
                else:
                    current_item['new_passage'] += f' {word}'

                # Any non-passage B-/I- entity terminates the new_passage run.
                if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
                    is_in_new_passage = False

                if label.startswith(('B-', 'I-')):
                    last_entity_type = entity_type
                continue

            is_in_new_passage = False

            if label.startswith('B-'):
                # Starting a non-passage entity flushes any buffered passage.
                # SECTION_HEADING only triggers this flush; its words are not
                # stored in a dedicated field.
                if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
                    finalize_passage_to_item(current_item, current_passage_buffer)
                    current_passage_buffer = []

                last_entity_type = entity_type

                if entity_type == 'PASSAGE':
                    # A passage right after an I-OPTION run becomes a
                    # 'new_passage' attached to the current question.
                    if previous_entity_type == 'OPTION' and just_finished_i_option:
                        current_item['new_passage'] = word
                        is_in_new_passage = True
                    else:
                        current_passage_buffer.append(word)

                elif entity_type == 'OPTION':
                    # The first word of an option doubles as its dict key.
                    current_option_key = word
                    current_item['options'][current_option_key] = word
                    just_finished_i_option = False

                elif entity_type == 'ANSWER':
                    current_item['answer'] = word
                    current_option_key = None
                    just_finished_i_option = False

                elif entity_type == 'QUESTION':
                    # Effectively unreachable: 'B-QUESTION' is intercepted and
                    # `continue`d above; kept as written (defensive append).
                    current_item['question'] += f' {word}'
                    just_finished_i_option = False

            elif label.startswith('I-'):
                if entity_type == 'QUESTION':
                    current_item['question'] += f' {word}'
                elif entity_type == 'PASSAGE':
                    # I-PASSAGE directly after an option run also opens a
                    # new_passage rather than extending the regular passage.
                    if previous_entity_type == 'OPTION' and just_finished_i_option:
                        current_item['new_passage'] = word
                        is_in_new_passage = True
                    else:
                        if not current_passage_buffer: last_entity_type = 'PASSAGE'
                        current_passage_buffer.append(word)
                elif entity_type == 'OPTION' and current_option_key is not None:
                    current_item['options'][current_option_key] += f' {word}'
                    just_finished_i_option = True  # re-set by the line below
                elif entity_type == 'ANSWER':
                    current_item['answer'] += f' {word}'

                # Unconditional recompute: True for any I-OPTION word, even
                # one dropped above because no option key was open.
                just_finished_i_option = (entity_type == 'OPTION')

            elif label == 'O':
                # 'O' words remain only in the raw text buffer.
                pass

    # Flush the final question item.
    if current_item is not None:
        finalize_passage_to_item(current_item, current_passage_buffer)
        current_item['text'] = ' '.join(current_text_buffer).strip()
        structured_data.append(current_item)

    # Normalise whitespace in the raw-text fields.
    for item in structured_data:
        item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
        if 'new_passage' in item:
            item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()

    print(f"✅ Decoding Complete. Total time: {time.time() - start_time:.2f}s")
    return structured_data
899
  # --- 7. INFERENCE WRAPPER ---
900
+
901
  def gradio_inference(pdf_file):
902
  if not os.path.exists(MODEL_FILE) or not os.path.exists(VOCAB_FILE):
903
  return "❌ Missing model/vocab files.", []
904
 
905
  try:
906
+ # Load Resources
907
  with open(VOCAB_FILE, "rb") as f:
908
  word_vocab, char_vocab = pickle.load(f)
909
 
 
910
  model = MCQTagger(len(word_vocab), len(char_vocab), len(LABELS)).to(DEVICE)
911
  state_dict = torch.load(MODEL_FILE, map_location=DEVICE)
912
  model.load_state_dict(state_dict if isinstance(state_dict, dict) else state_dict.state_dict())
 
923
 
924
  if not all_tokens: return "❌ No text found.", []
925
 
926
+ # Features
927
  for i in range(len(all_tokens)):
928
  all_tokens[i]['spatial_features'] = extract_spatial_features(all_tokens, i)
929
  all_tokens[i]['context_features'] = extract_context_features(all_tokens, i)
930
 
931
  # Predict
932
+ raw_predictions = []
933
  for i in range(0, len(all_tokens), INFERENCE_CHUNK_SIZE):
934
  chunk = all_tokens[i : i + INFERENCE_CHUNK_SIZE]
935
 
 
936
  w_ids = torch.LongTensor([[word_vocab[t['text']] for t in chunk]]).to(DEVICE)
 
937
  c_ids_list = []
938
  for t in chunk:
939
  chars = [char_vocab[c] for c in t['text'][:MAX_CHAR_LEN]]
 
949
  with torch.no_grad():
950
  preds = model(w_ids, c_ids, bboxes, s_feats, c_feats, mask)[0]
951
  for t, p in zip(chunk, preds):
952
+ # NOTE: Structuring logic uses 'predicted_label' key
953
+ raw_predictions.append({"word": t['text'], "predicted_label": IDX2LABEL[p]})
954
 
955
+ # Structure Output
956
+ structured_json = convert_predictions_to_structured(raw_predictions)
957
+
958
+ return "✅ Processing Complete", structured_json
959
 
960
  except Exception as e:
961
  import traceback
962
+ traceback.print_exc()
963
  return f"❌ Error: {str(e)}", []
964
 
965
  # --- 8. UI ---
966
  demo = gr.Interface(
967
  fn=gradio_inference,
968
  inputs=gr.File(label="Upload PDF"),
969
+ outputs=[gr.Textbox(label="Status"), gr.JSON(label="Structured Output")],
970
  title="MCQ Enhanced Tagger"
971
  )
972