Spaces:

heerjtdev
/

LayoutLM_train

Running

App Files Community

aagamjtdev committed on Oct 27

Commit

6deed2e

1 Parent(s): 44ea3cf

correction

Browse files

Files changed (1) hide show

HF_LayoutLM_with_Passage.py +1 -120

HF_LayoutLM_with_Passage.py CHANGED Viewed

@@ -8,14 +8,10 @@ import torch.nn as nn
 from torch.utils.data import Dataset, DataLoader, random_split
 from transformers import LayoutLMv3TokenizerFast, LayoutLMv3Model
 from TorchCRF import CRF
 from torch.optim import AdamW
 from tqdm import tqdm
 from sklearn.metrics import precision_recall_fscore_support
-import fitz  # PyMuPDF
-import pytesseract
-from PIL import Image
-from pdf2image import convert_from_path
 # --- Configuration for Augmentation ---
 MAX_BBOX_DIMENSION = 999
@@ -347,117 +343,6 @@ def main(args):
         print(f"💾 Model saved at {ckpt_path}")
-def run_inference(pdf_path, model_path, output_path):
-    # LABELS UPDATED: Added SECTION_HEADING and PASSAGE (Must match main)
-    labels = [
-        "O",
-        "B-QUESTION", "I-QUESTION",
-        "B-OPTION", "I-OPTION",
-        "B-ANSWER", "I-ANSWER",
-        "B-SECTION_HEADING", "I-SECTION_HEADING",
-        "B-PASSAGE", "I-PASSAGE"
-    ]
-    label2id = {l: i for i, l in enumerate(labels)}
-    id2label = {i: l for l, i in label2id.items()}
-    tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
-    # Load the trained model
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model = LayoutLMv3CRF("microsoft/layoutlmv3-base", num_labels=len(labels)).to(device)
-    try:
-        model.load_state_dict(torch.load(model_path, map_location=device))
-    except Exception as e:
-        print(
-            f"❌ Error loading model state: {e}. Ensure the model at {model_path} has been successfully trained with the new labels.")
-        return
-    model.eval()
-    # Process PDF with OCR
-    try:
-        doc = fitz.open(pdf_path)
-    except Exception as e:
-        print(f"❌ Error opening PDF: {e}")
-        return
-    all_predictions = []
-    tesseract_config = '--psm 6'
-    for page_num in range(len(doc)):
-        page = doc.load_page(page_num)
-        # Get a high-resolution image of the page for Tesseract
-        pix = page.get_pixmap(dpi=300)
-        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-        # Get page dimensions from PyMuPDF
-        page_width, page_height = page.bound().width, page.bound().height
-        # Get OCR data (words and bboxes)
-        ocr_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT, config=tesseract_config)
-        words = [word for word in ocr_data['text'] if word.strip()]
-        # Skip empty pages
-        if not words:
-            continue
-        # Get the scaling factors from the image resolution to the PDF's native resolution
-        x_scale = page_width / pix.width
-        y_scale = page_height / pix.height
-        # Create original pixel bboxes
-        bboxes_raw = [[
-            ocr_data['left'][i],
-            ocr_data['top'][i],
-            ocr_data['left'][i] + ocr_data['width'][i],
-            ocr_data['top'][i] + ocr_data['height'][i]
-        ] for i in range(len(ocr_data['text'])) if ocr_data['text'][i].strip()]
-        # Normalize bboxes to 0-1000 scale using the correct scaling factors
-        normalized_bboxes = [[
-            int(1000 * (b[0] * x_scale) / page_width),
-            int(1000 * (b[1] * y_scale) / page_height),
-            int(1000 * (b[2] * x_scale) / page_width),
-            int(1000 * (b[3] * y_scale) / page_height)
-        ] for b in bboxes_raw]
-        # Tokenize and run inference
-        inputs = tokenizer(words, boxes=normalized_bboxes, return_tensors="pt", truncation=True).to(device)
-        with torch.no_grad():
-            # The model is run on the normalized bboxes
-            preds = model(**inputs)
-        # Align predictions back to words
-        word_ids = inputs.word_ids(batch_index=0)
-        final_preds = []
-        previous_word_idx = None
-        for idx, word_id in enumerate(word_ids):
-            if word_id is not None and word_id != previous_word_idx:
-                # The model returns a list of predicted classes for each token
-                final_preds.append(id2label[preds[0][idx]])
-            previous_word_idx = word_id
-        # Prepare structured output
-        page_results = []
-        # Tesseract returns word list that is shorter than ocr_data if it contains empty strings.
-        # We need to use the cleaned 'words' list and its corresponding filtered bboxes.
-        # Note: We must ensure that the word and bbox lists passed to tokenizer and the filtered
-        # final_preds list are all correctly aligned with the original ocr_data indices.
-        # Since 'words' and 'bboxes_raw' are filtered exactly the same way (by word.strip()),
-        # and 'final_preds' is aligned back to 'words', we can zip them.
-        for word, bbox, label in zip(words, bboxes_raw, final_preds):
-            page_results.append({
-                "word": word,
-                "bbox": bbox,
-                "predicted_label": label
-            })
-        all_predictions.extend(page_results)
-    doc.close()
-    with open(output_path, "w") as f:
-        json.dump(all_predictions, f, indent=2, ensure_ascii=False)
-    print(f"✅ Inference complete. Predictions saved to {output_path}")
 # -------------------------
@@ -478,7 +363,3 @@ if __name__ == "__main__":
         if not args.input:
             parser.error("--input is required for 'train' mode.")
         main(args)
-    elif args.mode == "infer":
-        if not args.input:
-            parser.error("--input is required for 'infer' mode.")
-        run_inference(args.input, "checkpoints/layoutlmv3_crf_new_passage.pth", "inference_predictions.json")

 from torch.utils.data import Dataset, DataLoader, random_split
 from transformers import LayoutLMv3TokenizerFast, LayoutLMv3Model
 from TorchCRF import CRF
 from torch.optim import AdamW
 from tqdm import tqdm
 from sklearn.metrics import precision_recall_fscore_support
 # --- Configuration for Augmentation ---
 MAX_BBOX_DIMENSION = 999
         print(f"💾 Model saved at {ckpt_path}")
 # -------------------------
         if not args.input:
             parser.error("--input is required for 'train' mode.")
         main(args)