Spaces:

heerjtdev
/

hybrid_inference

Sleeping

App Files Files Community

hybrid_inference / app.py

heerjtdev

Update app.py

f1c0953 verified 22 days ago

raw

history blame contribute delete

30.5 kB

	# import gradio as gr
	# import torch
	# import torch.nn as nn
	# import pdfplumber
	# import json
	# import os
	# import re
	# from transformers import LayoutLMv3TokenizerFast, LayoutLMv3Model
	# from TorchCRF import CRF

	# # ---------------------------------------------------------
	# # 1. CONFIGURATION
	# # ---------------------------------------------------------
	# # Ensure this filename matches exactly what you uploaded to the Space
	# MODEL_FILENAME = "layoutlmv3_bilstm_crf_hybrid.pth"
	# BASE_MODEL_ID = "microsoft/layoutlmv3-base"

	# # Define your labels exactly as they were during training
	# LABELS = [
	# "O",
	# "B-QUESTION", "I-QUESTION",
	# "B-OPTION", "I-OPTION",
	# "B-ANSWER", "I-ANSWER",
	# "B-SECTION_HEADING", "I-SECTION_HEADING",
	# "B-PASSAGE", "I-PASSAGE"
	# ]
	# LABEL2ID = {l: i for i, l in enumerate(LABELS)}
	# ID2LABEL = {i: l for l, i in LABEL2ID.items()}

	# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	# tokenizer = LayoutLMv3TokenizerFast.from_pretrained(BASE_MODEL_ID)

	# # ---------------------------------------------------------
	# # 2. MODEL ARCHITECTURE
	# # ---------------------------------------------------------
	# # ⚠️ ACTION REQUIRED:
	# # Replace this class with the exact class definition of your
	# # NEW HYBRID MODEL. The class name and structure must match
	# # what was used when you saved the checkpoint named in
	# # MODEL_FILENAME ('layoutlmv3_bilstm_crf_hybrid.pth').
	# # ---------------------------------------------------------
	# # ---------------------------------------------------------
	# # 2. MODEL ARCHITECTURE (LayoutLMv3 + BiLSTM + CRF)
	# # ---------------------------------------------------------
	# class HybridModel(nn.Module):
	# def __init__(self, num_labels):
	# super().__init__()
	# self.layoutlm = LayoutLMv3Model.from_pretrained(BASE_MODEL_ID)

	# # Config for BiLSTM
	# hidden_size = self.layoutlm.config.hidden_size # Usually 768
	# lstm_hidden_size = hidden_size // 2 # 384, so bidirectional output is 768

	# # BiLSTM Layer
	# # input_size=768, hidden=384, bidir=True -> output_dim = 384 * 2 = 768
	# self.lstm = nn.LSTM(
	# input_size=hidden_size,
	# hidden_size=lstm_hidden_size,
	# num_layers=1,
	# batch_first=True,
	# bidirectional=True
	# )

	# # Dropout (Optional, check if you used this in training)
	# self.dropout = nn.Dropout(0.1)

	# # Classifier: Maps BiLSTM output (768) to Label count
	# self.classifier = nn.Linear(lstm_hidden_size * 2, num_labels)

	# # CRF Layer
	# self.crf = CRF(num_labels)

	# def forward(self, input_ids, bbox, attention_mask, labels=None):
	# # 1. LayoutLMv3 Base
	# outputs = self.layoutlm(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask)
	# sequence_output = outputs.last_hidden_state # [Batch, Seq, 768]

	# # 2. BiLSTM
	# # LSTM returns (output, (h_n, c_n)). We only need output.
	# lstm_output, _ = self.lstm(sequence_output) # [Batch, Seq, 768]

	# # 3. Dropout & Classifier
	# lstm_output = self.dropout(lstm_output)
	# emissions = self.classifier(lstm_output) # [Batch, Seq, Num_Labels]

	# # 4. CRF
	# if labels is not None:
	# # Training/Eval (Loss)
	# log_likelihood = self.crf(emissions, labels, mask=attention_mask.bool())
	# return -log_likelihood.mean()
	# else:
	# # Inference (Prediction Tags)
	# return self.crf.viterbi_decode(emissions, mask=attention_mask.bool())
	# # ---------------------------------------------------------
	# # 3. MODEL LOADING LOGIC
	# # ---------------------------------------------------------
	# model = None

	# def load_model():
	# global model
	# if model is None:
	# print(f"🔄 Loading model from {MODEL_FILENAME}...")
	# if not os.path.exists(MODEL_FILENAME):
	# raise FileNotFoundError(f"❌ Model file '{MODEL_FILENAME}' not found. Please upload it to the Files tab of your Space.")

	# # Initialize the model structure
	# model = HybridModel(num_labels=len(LABELS))

	# # Load weights
	# try:
	# state_dict = torch.load(MODEL_FILENAME, map_location=device)
	# model.load_state_dict(state_dict)
	# except RuntimeError as e:
	# raise RuntimeError(f"❌ State dictionary mismatch. Ensure the 'HybridModel' class structure in app.py matches the model you trained.\nDetails: {e}")

	# model.to(device)
	# model.eval()
	# print("✅ Model loaded successfully.")
	# return model

	# # ---------------------------------------------------------
	# # 4. JSON CONVERSION LOGIC (Your Custom Logic)
	# # ---------------------------------------------------------
	# def convert_bio_to_structured_json(predictions):
	# structured_data = []
	# current_item = None
	# current_option_key = None
	# current_passage_buffer = []
	# current_text_buffer = []
	# first_question_started = False
	# last_entity_type = None
	# just_finished_i_option = False
	# is_in_new_passage = False

	# def finalize_passage_to_item(item, passage_buffer):
	# if passage_buffer:
	# passage_text = re.sub(r'\s{2,}', ' ', ' '.join(passage_buffer)).strip()
	# if item.get('passage'): item['passage'] += ' ' + passage_text
	# else: item['passage'] = passage_text
	# passage_buffer.clear()

	# # Flatten predictions list if strictly page-separated
	# flat_predictions = []
	# for page in predictions:
	# flat_predictions.extend(page['data'])

	# for idx, item in enumerate(flat_predictions):
	# word = item['word']
	# label = item['predicted_label']
	# entity_type = label[2:].strip() if label.startswith(('B-', 'I-')) else None
	# current_text_buffer.append(word)

	# previous_entity_type = last_entity_type
	# is_passage_label = (entity_type == 'PASSAGE')

	# if not first_question_started:
	# if label != 'B-QUESTION' and not is_passage_label:
	# just_finished_i_option = False
	# is_in_new_passage = False
	# continue
	# if is_passage_label:
	# current_passage_buffer.append(word)
	# last_entity_type = 'PASSAGE'
	# just_finished_i_option = False
	# is_in_new_passage = False
	# continue

	# if label == 'B-QUESTION':
	# if not first_question_started:
	# header_text = ' '.join(current_text_buffer[:-1]).strip()
	# if header_text or current_passage_buffer:
	# metadata_item = {'type': 'METADATA', 'passage': ''}
	# finalize_passage_to_item(metadata_item, current_passage_buffer)
	# if header_text: metadata_item['text'] = header_text
	# structured_data.append(metadata_item)
	# first_question_started = True
	# current_text_buffer = [word]

	# if current_item is not None:
	# finalize_passage_to_item(current_item, current_passage_buffer)
	# current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
	# structured_data.append(current_item)
	# current_text_buffer = [word]

	# current_item = {
	# 'question': word, 'options': {}, 'answer': '', 'passage': '', 'text': ''
	# }
	# current_option_key = None
	# last_entity_type = 'QUESTION'
	# just_finished_i_option = False
	# is_in_new_passage = False
	# continue

	# if current_item is not None:
	# if is_in_new_passage:
	# if 'new_passage' not in current_item: current_item['new_passage'] = word
	# else: current_item['new_passage'] += f' {word}'
	# if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
	# is_in_new_passage = False
	# if label.startswith(('B-', 'I-')): last_entity_type = entity_type
	# continue

	# is_in_new_passage = False
	# if label.startswith('B-'):
	# if entity_type in ['QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING']:
	# finalize_passage_to_item(current_item, current_passage_buffer)
	# current_passage_buffer = []
	# last_entity_type = entity_type
	# if entity_type == 'PASSAGE':
	# if previous_entity_type == 'OPTION' and just_finished_i_option:
	# current_item['new_passage'] = word
	# is_in_new_passage = True
	# else: current_passage_buffer.append(word)
	# elif entity_type == 'OPTION':
	# current_option_key = word
	# current_item['options'][current_option_key] = word
	# just_finished_i_option = False
	# elif entity_type == 'ANSWER':
	# current_item['answer'] = word
	# current_option_key = None
	# just_finished_i_option = False
	# elif entity_type == 'QUESTION':
	# current_item['question'] += f' {word}'
	# just_finished_i_option = False

	# elif label.startswith('I-'):
	# if entity_type == 'QUESTION': current_item['question'] += f' {word}'
	# elif entity_type == 'PASSAGE':
	# if previous_entity_type == 'OPTION' and just_finished_i_option:
	# current_item['new_passage'] = word
	# is_in_new_passage = True
	# else:
	# if not current_passage_buffer: last_entity_type = 'PASSAGE'
	# current_passage_buffer.append(word)
	# elif entity_type == 'OPTION' and current_option_key is not None:
	# current_item['options'][current_option_key] += f' {word}'
	# just_finished_i_option = True
	# elif entity_type == 'ANSWER': current_item['answer'] += f' {word}'
	# just_finished_i_option = (entity_type == 'OPTION')

	# if current_item is not None:
	# finalize_passage_to_item(current_item, current_passage_buffer)
	# current_item['text'] = ' '.join(current_text_buffer).strip()
	# structured_data.append(current_item)

	# # Final Cleanup
	# for item in structured_data:
	# if 'text' in item: item['text'] = re.sub(r'\s{2,}', ' ', item['text']).strip()
	# if 'new_passage' in item: item['new_passage'] = re.sub(r'\s{2,}', ' ', item['new_passage']).strip()

	# return structured_data

	# # ---------------------------------------------------------
	# # 5. INFERENCE PIPELINE
	# # ---------------------------------------------------------
	# def process_pdf(pdf_file):
	# if pdf_file is None:
	# return None, "⚠️ Please upload a PDF file."

	# try:
	# active_model = load_model()

	# # A. Extract Text and Boxes
	# extracted_pages = []
	# with pdfplumber.open(pdf_file.name) as pdf:
	# for page_idx, page in enumerate(pdf.pages):
	# width, height = page.width, page.height
	# words_data = page.extract_words()

	# page_tokens = []
	# page_bboxes = []

	# for w in words_data:
	# text = w['text']
	# # Normalize bbox to 0-1000 scale
	# x0 = int((w['x0'] / width) * 1000)
	# top = int((w['top'] / height) * 1000)
	# x1 = int((w['x1'] / width) * 1000)
	# bottom = int((w['bottom'] / height) * 1000)

	# # Safety clamp
	# box = [max(0, min(x0, 1000)), max(0, min(top, 1000)),
	# max(0, min(x1, 1000)), max(0, min(bottom, 1000))]

	# page_tokens.append(text)
	# page_bboxes.append(box)
	# extracted_pages.append({"page_id": page_idx, "tokens": page_tokens, "bboxes": page_bboxes})

	# # B. Run Inference
	# raw_predictions = []
	# for page in extracted_pages:
	# tokens = page['tokens']
	# bboxes = page['bboxes']
	# if not tokens: continue

	# # Tokenize
	# encoding = tokenizer(
	# tokens,
	# boxes=bboxes,
	# return_tensors="pt",
	# padding="max_length",
	# truncation=True,
	# max_length=512,
	# return_offsets_mapping=True
	# )

	# input_ids = encoding.input_ids.to(device)
	# bbox = encoding.bbox.to(device)
	# attention_mask = encoding.attention_mask.to(device)

	# # Predict
	# with torch.no_grad():
	# # NOTE: If your hybrid model requires 'pixel_values',
	# # you will need to add image extraction logic above and pass it here.
	# preds = active_model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask)

	# # Check if preds returns a tuple (loss, tags) or just tags
	# # The CRF implementation usually returns a list of lists of tags in viterbi_decode
	# pred_tags = preds[0] if isinstance(preds, tuple) else preds[0]
	# # Note: Standard CRF.viterbi_decode returns List[List[int]], so [0] gets the first batch item

	# # Alignment
	# word_ids = encoding.word_ids()
	# aligned_data = []
	# prev_word_idx = None

	# for i, word_idx in enumerate(word_ids):
	# if word_idx is None: continue
	# if word_idx != prev_word_idx:
	# # pred_tags is likely a list of ints.
	# # If pred_tags[i] fails, your max_length might be cutting off tags,
	# # or the model output shape differs from the token length.
	# if i < len(pred_tags):
	# label_id = pred_tags[i]
	# label_str = ID2LABEL.get(label_id, "O")
	# aligned_data.append({"word": tokens[word_idx], "predicted_label": label_str})
	# prev_word_idx = word_idx
	# raw_predictions.append({"data": aligned_data})

	# # C. Convert to Structured JSON
	# final_json = convert_bio_to_structured_json(raw_predictions)

	# # Save output
	# output_filename = "structured_output.json"
	# with open(output_filename, "w", encoding="utf-8") as f:
	# json.dump(final_json, f, indent=2, ensure_ascii=False)

	# return output_filename, f"✅ Success! Processed {len(extracted_pages)} pages. Extracted {len(final_json)} items."

	# except Exception as e:
	# import traceback
	# return None, f"❌ Error:\n{str(e)}\n\nTraceback:\n{traceback.format_exc()}"

	# # ---------------------------------------------------------
	# # 6. GRADIO INTERFACE
	# # ---------------------------------------------------------
	# iface = gr.Interface(
	# fn=process_pdf,
	# inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
	# outputs=[
	# gr.File(label="Download JSON Output"),
	# gr.Textbox(label="Status Log", lines=10)
	# ],
	# title="Hybrid Model Inference: PDF to JSON",
	# description="Upload a document to extract structured data using the custom Hybrid LayoutLMv3 model.",
	# flagging_mode="never"
	# )

	# if __name__ == "__main__":
	# iface.launch()



	import gradio as gr
	import torch
	import torch.nn as nn
	import pdfplumber
	import json
	import os
	import re
	from transformers import LayoutLMv3TokenizerFast, LayoutLMv3Model
	from TorchCRF import CRF

	# ---------------------------------------------------------
	# 1. CONFIGURATION
	# ---------------------------------------------------------
	# Checkpoint file to load, and the pretrained LayoutLMv3 backbone id.
	MODEL_FILENAME = "layoutlmv3_bilstm_crf_hybrid.pth"
	BASE_MODEL_ID = "microsoft/layoutlmv3-base"

	# Tag inventory: 11 BIO tags followed by 2 special entries, 13 in total.
	# The list position of each tag is its integer id.
	# NOTE: if predictions look "scrambled" (e.g. questions tagged as options),
	# the checkpoint may have been trained with "UNK" and "PAD" at ids 0 and 1;
	# try moving them to the front of this list.
	LABELS = [
	    "O",
	    "B-QUESTION",
	    "I-QUESTION",
	    "B-OPTION",
	    "I-OPTION",
	    "B-ANSWER",
	    "I-ANSWER",
	    "B-SECTION_HEADING",
	    "I-SECTION_HEADING",
	    "B-PASSAGE",
	    "I-PASSAGE",
	    # Present so the label count is 13, matching the saved weights.
	    "UNK",
	    "PAD",
	]

	# Lookups in both directions between tag strings and integer ids.
	LABEL2ID = {name: index for index, name in enumerate(LABELS)}
	ID2LABEL = dict(enumerate(LABELS))

	# Use CUDA when torch reports it as available, otherwise the CPU.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
	tokenizer = LayoutLMv3TokenizerFast.from_pretrained(BASE_MODEL_ID)

	# ---------------------------------------------------------
	# 2. MODEL ARCHITECTURE (LayoutLMv3 + BiLSTM + CRF)
	# ---------------------------------------------------------
	class HybridModel(nn.Module):
	    """LayoutLMv3 encoder followed by a BiLSTM, a linear tag classifier and a CRF.

	    The keyword defaults reproduce the architecture of the saved checkpoint:
	    an LSTM weight of shape [1024, 768] corresponds to a hidden size of 256
	    (1024 / 4), the presence of ``l1`` weights corresponds to two LSTM layers,
	    and the classifier weight of shape [13, 512] corresponds to 2 * 256 inputs.

	    Args:
	        num_labels: Number of tags scored by the classifier and the CRF.
	        lstm_hidden_size: Hidden size of each LSTM direction.
	        lstm_layers: Number of stacked LSTM layers.
	        dropout: Dropout probability applied to the BiLSTM output.
	    """

	    def __init__(self, num_labels, lstm_hidden_size=256, lstm_layers=2, dropout=0.1):
	        super().__init__()
	        self.layoutlm = LayoutLMv3Model.from_pretrained(BASE_MODEL_ID)

	        self.lstm = nn.LSTM(
	            # Read the width from the encoder rather than assuming 768, so the
	            # two layers cannot drift apart if BASE_MODEL_ID changes.
	            input_size=self.layoutlm.config.hidden_size,
	            hidden_size=lstm_hidden_size,
	            num_layers=lstm_layers,
	            batch_first=True,
	            bidirectional=True,
	        )

	        self.dropout = nn.Dropout(dropout)

	        # Bidirectional output concatenates both directions: 2 * hidden size.
	        self.classifier = nn.Linear(lstm_hidden_size * 2, num_labels)

	        self.crf = CRF(num_labels)

	    def forward(self, input_ids, bbox, attention_mask, labels=None):
	        """Score a batch and either compute the CRF loss or decode tags.

	        Args:
	            input_ids: Token ids forwarded to the LayoutLMv3 encoder.
	            bbox: Token boxes forwarded to the LayoutLMv3 encoder.
	            attention_mask: Mask forwarded to the encoder; its boolean form
	                is also used as the CRF mask.
	            labels: Gold tag ids. When given, the loss is returned.

	        Returns:
	            With ``labels``: the negated mean of the CRF log-likelihood.
	            Without ``labels``: the result of ``crf.viterbi_decode``
	            (presumably one list of tag ids per sequence -- confirm against
	            the installed TorchCRF version).
	        """
	        encoded = self.layoutlm(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask)

	        # Only the per-token outputs of the LSTM are needed, not its final state.
	        lstm_output, _ = self.lstm(encoded.last_hidden_state)
	        emissions = self.classifier(self.dropout(lstm_output))

	        crf_mask = attention_mask.bool()
	        if labels is None:
	            return self.crf.viterbi_decode(emissions, mask=crf_mask)

	        log_likelihood = self.crf(emissions, labels, mask=crf_mask)
	        return -log_likelihood.mean()

	# ---------------------------------------------------------
	# 3. MODEL LOADING
	# ---------------------------------------------------------
	# Process-wide cache for the loaded network; filled by load_model().
	model = None


	def load_model():
	    """Return the shared HybridModel, building and loading it on first use.

	    The network is published to the module-level ``model`` only after its
	    weights have loaded and it has been moved to ``device`` and switched to
	    eval mode. A failed attempt therefore leaves ``model`` as ``None`` and the
	    next call retries instead of handing back a half-initialised network.

	    Returns:
	        The cached HybridModel instance.

	    Raises:
	        FileNotFoundError: If MODEL_FILENAME does not exist.
	        RuntimeError: If the checkpoint does not fit the HybridModel
	            definition (the original error is chained as the cause).
	    """
	    global model
	    if model is not None:
	        return model

	    print(f"🔄 Loading model from {MODEL_FILENAME}...")
	    if not os.path.exists(MODEL_FILENAME):
	        raise FileNotFoundError(f"❌ Model file '{MODEL_FILENAME}' not found.")

	    # Build into a local first; see the docstring for why the global is
	    # assigned last.
	    candidate = HybridModel(num_labels=len(LABELS))

	    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
	    # code. Only load checkpoints from a trusted source; consider
	    # weights_only=True if the installed torch version supports it.
	    state_dict = torch.load(MODEL_FILENAME, map_location=device)

	    # A wrong LABELS length or a different architecture surfaces here as a
	    # shape/key mismatch.
	    try:
	        candidate.load_state_dict(state_dict)
	    except RuntimeError as e:
	        raise RuntimeError(f"❌ Weight mismatch! \nYour model has {len(LABELS)} labels defined in script.\nCheck if 'LABELS' list needs reordering or resizing.\nDetailed Error: {e}") from e

	    candidate.to(device)
	    candidate.eval()
	    model = candidate
	    print("✅ Model loaded successfully.")
	    return model

	# ---------------------------------------------------------
	# 4. JSON CONVERSION LOGIC
	# ---------------------------------------------------------
	def convert_bio_to_structured_json(predictions):
	    """Group BIO-tagged words into question records.

	    Args:
	        predictions: Iterable of page dicts. Each page has a ``'data'`` list
	            of ``{'word': str, 'predicted_label': str}`` entries; pages are
	            concatenated in order before processing.

	    Returns:
	        A list of dicts. If any words or passage text precede the first
	        ``B-QUESTION``, the list starts with a
	        ``{'type': 'METADATA', 'passage': ..., 'text': ...}`` entry (``'text'``
	        only when non-empty). Every ``B-QUESTION`` then opens a record with
	        keys ``'question'``, ``'options'``, ``'answer'``, ``'passage'`` and
	        ``'text'`` (all words seen while the record was open), plus
	        ``'new_passage'`` when a passage starts right after an ``I-OPTION``.
	        Nothing is returned for input that contains no ``B-QUESTION``.
	    """
	    structured_data = []
	    current_item = None
	    current_option_key = None
	    current_passage_buffer = []
	    current_text_buffer = []
	    first_question_started = False
	    last_entity_type = None
	    just_finished_i_option = False
	    is_in_new_passage = False

	    def collapse_spaces(text):
	        # Squeeze runs of whitespace to one space and trim the ends.
	        return re.sub(r'\s{2,}', ' ', text).strip()

	    def finalize_passage_to_item(item, passage_buffer):
	        # Move buffered passage words onto `item` and empty the buffer in place.
	        if not passage_buffer:
	            return
	        passage_text = collapse_spaces(' '.join(passage_buffer))
	        if item.get('passage'):
	            item['passage'] += ' ' + passage_text
	        else:
	            item['passage'] = passage_text
	        passage_buffer.clear()

	    flat_predictions = [token for page in predictions for token in page['data']]

	    for token in flat_predictions:
	        word = token['word']
	        label = token['predicted_label']
	        is_bio = label.startswith(('B-', 'I-'))
	        # Entity name without its B-/I- prefix; None for untagged labels.
	        entity_type = label[2:].strip() if is_bio else None

	        # Every word is kept in the running text, whatever its label.
	        current_text_buffer.append(word)

	        # Outside/special labels contribute text only and change no state.
	        if label in ("UNK", "PAD", "O"):
	            continue

	        # Snapshot taken before this token can overwrite last_entity_type.
	        previous_entity_type = last_entity_type
	        is_passage_label = entity_type == 'PASSAGE'

	        # --- Before the first question: collect passage text, skip the rest.
	        if not first_question_started:
	            if is_passage_label:
	                current_passage_buffer.append(word)
	                last_entity_type = 'PASSAGE'
	                just_finished_i_option = False
	                is_in_new_passage = False
	                continue
	            if label != 'B-QUESTION':
	                just_finished_i_option = False
	                is_in_new_passage = False
	                continue

	        # --- A new question closes whatever was open and starts a record.
	        if label == 'B-QUESTION':
	            if not first_question_started:
	                # Everything seen so far (minus this word) is header material.
	                header_text = ' '.join(current_text_buffer[:-1]).strip()
	                if header_text or current_passage_buffer:
	                    metadata_item = {'type': 'METADATA', 'passage': ''}
	                    finalize_passage_to_item(metadata_item, current_passage_buffer)
	                    if header_text:
	                        metadata_item['text'] = header_text
	                    structured_data.append(metadata_item)
	                first_question_started = True
	                current_text_buffer = [word]

	            if current_item is not None:
	                finalize_passage_to_item(current_item, current_passage_buffer)
	                current_item['text'] = ' '.join(current_text_buffer[:-1]).strip()
	                structured_data.append(current_item)
	                current_text_buffer = [word]

	            current_item = {
	                'question': word, 'options': {}, 'answer': '', 'passage': '', 'text': ''
	            }
	            current_option_key = None
	            last_entity_type = 'QUESTION'
	            just_finished_i_option = False
	            is_in_new_passage = False
	            continue

	        if current_item is None:
	            continue

	        # --- Inside a "new passage": the word is always appended to it; a
	        # B- tag or a non-passage I- tag ends the mode for following words.
	        if is_in_new_passage:
	            if 'new_passage' not in current_item:
	                current_item['new_passage'] = word
	            else:
	                current_item['new_passage'] += f' {word}'
	            if label.startswith('B-') or (label.startswith('I-') and entity_type != 'PASSAGE'):
	                is_in_new_passage = False
	            if is_bio:
	                last_entity_type = entity_type
	            continue

	        is_in_new_passage = False
	        if label.startswith('B-'):
	            # A non-passage entity ends any passage being buffered
	            # (finalize empties the buffer in place).
	            if entity_type in ('QUESTION', 'OPTION', 'ANSWER', 'SECTION_HEADING'):
	                finalize_passage_to_item(current_item, current_passage_buffer)
	            last_entity_type = entity_type

	            if entity_type == 'PASSAGE':
	                # A passage directly after option text is stored separately
	                # under 'new_passage' instead of the record's own passage.
	                if previous_entity_type == 'OPTION' and just_finished_i_option:
	                    current_item['new_passage'] = word
	                    is_in_new_passage = True
	                else:
	                    current_passage_buffer.append(word)
	            elif entity_type == 'OPTION':
	                # The first word of an option is both its key and its text.
	                current_option_key = word
	                current_item['options'][current_option_key] = word
	                just_finished_i_option = False
	            elif entity_type == 'ANSWER':
	                current_item['answer'] = word
	                current_option_key = None
	                just_finished_i_option = False
	            elif entity_type == 'QUESTION':
	                current_item['question'] += f' {word}'
	                just_finished_i_option = False

	        elif label.startswith('I-'):
	            if entity_type == 'QUESTION':
	                current_item['question'] += f' {word}'
	            elif entity_type == 'PASSAGE':
	                if previous_entity_type == 'OPTION' and just_finished_i_option:
	                    current_item['new_passage'] = word
	                    is_in_new_passage = True
	                else:
	                    # An I-PASSAGE with nothing buffered starts a passage.
	                    if not current_passage_buffer:
	                        last_entity_type = 'PASSAGE'
	                    current_passage_buffer.append(word)
	            elif entity_type == 'OPTION' and current_option_key is not None:
	                current_item['options'][current_option_key] += f' {word}'
	            elif entity_type == 'ANSWER':
	                current_item['answer'] += f' {word}'
	            # Set for every I- tag: true only when this word was an I-OPTION.
	            just_finished_i_option = (entity_type == 'OPTION')

	    # Flush the record that was still open when the input ended.
	    if current_item is not None:
	        finalize_passage_to_item(current_item, current_passage_buffer)
	        current_item['text'] = ' '.join(current_text_buffer).strip()
	        structured_data.append(current_item)

	    # Final whitespace clean-up of the free-text fields.
	    for entry in structured_data:
	        if 'text' in entry:
	            entry['text'] = collapse_spaces(entry['text'])
	        if 'new_passage' in entry:
	            entry['new_passage'] = collapse_spaces(entry['new_passage'])

	    return structured_data

	# ---------------------------------------------------------
	# 5. PROCESSING PIPELINE
	# ---------------------------------------------------------
	def _normalized_box(word, page_width, page_height):
	    """Scale one pdfplumber word box to the 0-1000 grid and clamp it to range."""
	    scaled = (
	        int((word['x0'] / page_width) * 1000),
	        int((word['top'] / page_height) * 1000),
	        int((word['x1'] / page_width) * 1000),
	        int((word['bottom'] / page_height) * 1000),
	    )
	    return [max(0, min(coord, 1000)) for coord in scaled]


	def _extract_pages(pdf_path):
	    """Read every page of the PDF into {'page_id', 'tokens', 'bboxes'} dicts."""
	    extracted = []
	    with pdfplumber.open(pdf_path) as pdf:
	        for page_idx, page in enumerate(pdf.pages):
	            width, height = page.width, page.height
	            words_data = page.extract_words()
	            extracted.append({
	                "page_id": page_idx,
	                "tokens": [w['text'] for w in words_data],
	                "bboxes": [_normalized_box(w, width, height) for w in words_data],
	            })
	    return extracted


	def _predict_page(active_model, tokens, bboxes):
	    """Tag the words of one page and return [{'word', 'predicted_label'}, ...]."""
	    # return_overflowing_tokens makes the tokenizer emit one 512-token window
	    # per row instead of discarding everything past the first window, so
	    # long pages are tagged in full.
	    encoding = tokenizer(
	        tokens,
	        boxes=bboxes,
	        return_tensors="pt",
	        padding="max_length",
	        truncation=True,
	        max_length=512,
	        return_overflowing_tokens=True,
	    )

	    input_ids = encoding["input_ids"].to(device)
	    bbox = encoding["bbox"].to(device)
	    attention_mask = encoding["attention_mask"].to(device)

	    with torch.no_grad():
	        # Without labels the model returns the CRF-decoded tags, indexed
	        # here as one tag sequence per window.
	        window_tags = active_model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask)

	    aligned_data = []
	    # Carried across windows so a word split over a window boundary is
	    # emitted only once.
	    prev_word_idx = None
	    for window_idx, pred_tags in enumerate(window_tags):
	        for pos, word_idx in enumerate(encoding.word_ids(batch_index=window_idx)):
	            # None marks special/padding tokens; a repeated index marks a
	            # continuation sub-token. Only a word's first sub-token is used.
	            if word_idx is None or word_idx == prev_word_idx:
	                continue
	            if pos < len(pred_tags):
	                label_str = ID2LABEL.get(pred_tags[pos], "O")
	                aligned_data.append({"word": tokens[word_idx], "predicted_label": label_str})
	            prev_word_idx = word_idx
	    return aligned_data


	def process_pdf(pdf_file):
	    """Run the full PDF -> tagged words -> structured JSON pipeline.

	    Args:
	        pdf_file: The uploaded file, either an object whose ``.name`` is the
	            path on disk or the path itself; ``None`` when nothing was
	            uploaded.

	    Returns:
	        ``(output_path, status_message)``. ``output_path`` is the JSON file
	        written to the working directory, or ``None`` when nothing was
	        uploaded or any step failed (the message then carries the error and
	        its traceback).
	    """
	    if pdf_file is None:
	        return None, "⚠️ Please upload a PDF file."

	    try:
	        active_model = load_model()

	        # Accept both a file-like object exposing .name and a bare path.
	        extracted_pages = _extract_pages(getattr(pdf_file, "name", pdf_file))

	        # Pages without any extractable words are skipped.
	        raw_predictions = [
	            {"data": _predict_page(active_model, page['tokens'], page['bboxes'])}
	            for page in extracted_pages
	            if page['tokens']
	        ]

	        final_json = convert_bio_to_structured_json(raw_predictions)

	        output_filename = "structured_output.json"
	        with open(output_filename, "w", encoding="utf-8") as f:
	            json.dump(final_json, f, indent=2, ensure_ascii=False)

	        return output_filename, f"✅ Success! Processed {len(extracted_pages)} pages. Extracted {len(final_json)} items."

	    except Exception as e:
	        # UI boundary: report the failure in the status box instead of raising.
	        import traceback
	        return None, f"❌ Error:\n{str(e)}\n\nTraceback:\n{traceback.format_exc()}"

	# ---------------------------------------------------------
	# 6. GRADIO INTERFACE
	# ---------------------------------------------------------
	# One PDF upload in; process_pdf's two results out: the JSON file for
	# download and the status text.
	iface = gr.Interface(
	    fn=process_pdf,
	    title="LayoutLMv3 + BiLSTM Hybrid Model Inference",
	    description="Upload a document to extract structured data using the custom Hybrid LayoutLMv3 model.",
	    inputs=gr.File(label="Upload PDF", file_types=[".pdf"]),
	    outputs=[
	        gr.File(label="Download JSON Output"),
	        gr.Textbox(label="Status Log", lines=10),
	    ],
	    flagging_mode="never",
	)

	# Start the web app only when this file is run as a script.
	if __name__ == "__main__":
	    iface.launch()