PranavRatnalikar's picture
Update app.py
7004824 verified
import os
import io
import cv2
import numpy as np
import gradio as gr
import pdf2image
import layoutparser as lp
import pytesseract
import spacy
from docx import Document
from docx.shared import Inches
from transformers import pipeline
from spacy.cli.download import download as spacy_download
# Install Detectron2 from the pinned v0.5 git tag at startup; the layout
# detector built in detect() goes through lp.Detectron2LayoutModel.
os.system('pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.5"')

# spaCy pipeline used for named-entity extraction on the OCR'd text.
# NOTE(review): confirm that spaCy's downloader can resolve this package
# name; if not, it has to be installed from its own wheel/URL instead.
MODEL_NAME = "en_legal_ner_trf"

# Fetch the pipeline only when it is missing. A single download is enough:
# spacy.cli.download does the same work as `python -m spacy download`, so the
# former second invocation through os.system was redundant.
if not spacy.util.is_package(MODEL_NAME):
    spacy_download(MODEL_NAME)

# Load the NER pipeline once at import time; extract_important_texts() uses it.
nlp = spacy.load(MODEL_NAME)

# Zero-shot classifier used to score each sentence against LEGAL_LABELS.
classifier = pipeline("zero-shot-classification", model="nlpaueb/legal-bert-base-uncased")
LEGAL_LABELS = ["Contract Clause", "Statute", "Legal Reference", "Obligation", "Liability"]
def to_image(filename):
    """Render the first page of a PDF and save it as ``doc/page_1.jpg``.

    Args:
        filename: Path of the PDF file to convert.

    Returns:
        The list produced by ``pdf2image.convert_from_path``; only the first
        page is requested (``last_page=1``), rendered at 350 dpi.

    Raises:
        ValueError: If the conversion yields no page at all.
    """
    doc = pdf2image.convert_from_path(filename, dpi=350, last_page=1)
    if not doc:
        raise ValueError(f"No page could be rendered from {filename!r}")

    folder = "doc"
    # exist_ok=True replaces the listdir() membership test and removes the
    # window between "check" and "create".
    os.makedirs(folder, exist_ok=True)

    image_name = "page_1.jpg"
    doc[0].save(os.path.join(folder, image_name), "JPEG")
    return doc
_LAYOUT_MODEL = None  # built on first use by _get_layout_model()


def _get_layout_model():
    """Build the PubLayNet Mask R-CNN layout detector once and reuse it."""
    global _LAYOUT_MODEL
    if _LAYOUT_MODEL is None:
        _LAYOUT_MODEL = lp.Detectron2LayoutModel(
            "lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
            label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
        )
    return _LAYOUT_MODEL


def detect(doc):
    """Run layout detection on the first page of a converted document.

    Args:
        doc: Sequence of page images; only ``doc[0]`` is analysed.

    Returns:
        A ``(img, detected)`` tuple: the page as a NumPy array and the layout
        returned by the detector for that array.

    Raises:
        ValueError: If ``doc`` contains no page.
    """
    if not doc:
        raise ValueError("detect() needs at least one page image")

    # np.array (rather than np.asarray) always yields a writable copy, so the
    # page can later be drawn on in place by OpenCV.
    img = np.array(doc[0])
    # The model is created lazily and shared instead of being rebuilt per call.
    detected = _get_layout_model().detect(img)
    return img, detected
def highlight_text(img, detected, important_texts):
    """Outline every detected block whose OCR text contains an important term.

    Args:
        img: Page image as a NumPy array. It is not modified.
        detected: Iterable of layout blocks exposing ``pad``, ``crop_image``
            and ``coordinates``.
        important_texts: Strings to look for inside each block's OCR text.

    Returns:
        A copy of ``img`` with a green rectangle (3 px) around each matching
        block.
    """
    # Drawing happens on a copy so that (a) crops for OCR are always taken
    # from the untouched page and never contain previously drawn rectangles,
    # and (b) a read-only input array does not break cv2.rectangle.
    annotated = img.copy()

    # "" is a substring of every string and would flag every block.
    terms = [term for term in important_texts if term and term.strip()]
    if not terms:
        return annotated

    for block in detected:
        segment = block.pad(left=5, right=5, top=5, bottom=5).crop_image(img)
        # Flatten line breaks the same way predict_elements() does, so that
        # terms taken from the flattened text can match multi-line blocks.
        extracted = pytesseract.image_to_string(segment).replace('\n', ' ').strip()
        if any(term in extracted for term in terms):
            x1, y1, x2, y2 = map(int, block.coordinates)
            cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 3)
    return annotated
def extract_important_texts(text):
    """Collect named entities and high-confidence legal sentences from text.

    Named entities come from the module-level ``nlp`` pipeline run on the
    whole text. Sentences are scored with the module-level zero-shot
    ``classifier`` against ``LEGAL_LABELS`` and kept when any label scores
    above 0.8.

    Args:
        text: Text to analyse; line breaks are treated as hard boundaries
            between sentences.

    Returns:
        A list of unique strings (entities first, then sentences) in
        first-seen order.
    """
    doc = nlp(text)
    candidates = [ent.text for ent in doc.ents]

    # Split on line breaks first so a "sentence" never straddles two lines,
    # then on ". " within each line as before.
    for line in text.splitlines():
        for sentence in line.split(". "):
            if not sentence.strip():
                # Nothing to classify in an empty or whitespace-only fragment.
                continue
            result = classifier(sentence, LEGAL_LABELS)
            if any(score > 0.8 for score in result['scores']):
                candidates.append(sentence)

    # dict.fromkeys removes duplicates while keeping a deterministic order,
    # unlike list(set(...)).
    return list(dict.fromkeys(candidates))
def predict_elements(img, detected):
    """OCR every detected block, then find and highlight the important text.

    Args:
        img: Page image as a NumPy array.
        detected: Iterable of layout blocks for that page.

    Returns:
        A ``(dic_predicted, highlighted_img, important_texts)`` tuple:
        a dict mapping each block's key to its OCR text, the image returned
        by ``highlight_text``, and the list returned by
        ``extract_important_texts``.
    """
    ocr_agent = lp.TesseractAgent(languages='eng')
    dic_predicted = {}
    block_texts = []

    for index, block in enumerate(detected):
        segmented = block.pad(left=5, right=5, top=5, bottom=5).crop_image(img)
        extracted = ocr_agent.detect(segmented).replace('\n', ' ').strip()
        block_texts.append(extracted)
        # Blocks without an explicit id would all map to the key "None" and
        # overwrite each other; fall back to the block's position instead.
        key = block.id if block.id is not None else index
        dic_predicted[str(key)] = extracted

    # One line per block, each terminated by "\n" (same layout as before,
    # built with a single join instead of repeated string concatenation).
    full_text = "".join(block_text + "\n" for block_text in block_texts)

    important_texts = extract_important_texts(full_text)
    highlighted_img = highlight_text(img, detected, important_texts)
    return dic_predicted, highlighted_img, important_texts
def gen_doc(important_texts):
    """Write the given texts to a Word document and return its path.

    The document starts with a level-1 heading followed by one paragraph per
    entry of ``important_texts``. It is saved as ``important_legal_texts.docx``
    in the current working directory.
    """
    output_path = "important_legal_texts.docx"

    report = Document()
    report.add_heading('Extracted Important Legal Text', level=1)
    for entry in important_texts:
        report.add_paragraph(entry)

    report.save(output_path)
    return output_path
def main_convert(filename):
    """Run the full pipeline on an uploaded PDF.

    Args:
        filename: Either a file-like object exposing the path as ``.name``
            or the path itself as a string.

    Returns:
        A ``(doc_path, highlighted_img, important_texts)`` tuple: the path of
        the generated Word document, the annotated page image, and the list
        of extracted texts.

    Raises:
        ValueError: If no file was provided.
    """
    if filename is None:
        raise ValueError("No PDF file was provided.")

    # Accept both an upload wrapper (with .name) and a plain path string.
    pdf_path = getattr(filename, "name", filename)

    doc = to_image(pdf_path)
    img, detected = detect(doc)
    # The per-block OCR dict is not part of this function's output.
    _, highlighted_img, important_texts = predict_elements(img, detected)
    doc_path = gen_doc(important_texts)
    return doc_path, highlighted_img, important_texts
# Gradio UI: one PDF upload in; generated DOCX, annotated page image and the
# list of extracted texts out (same order as main_convert's return tuple).
# NOTE(review): type='file' is kept as-is; confirm it is accepted by the
# installed Gradio version.
inputs = [gr.File(type='file', label="Upload Legal PDF")]
outputs = [
    gr.File(label="Extracted Important Text DOC"),
    gr.Image(type="numpy", label="Highlighted PDF Image"),
    gr.JSON(label="Extracted Important Texts"),
]
title = "Legal Document Parser"
description = "This parser detects important legal text, highlights it in the document, and extracts key legal clauses separately."

# Bound to `demo` rather than `io` so the standard-library `io` module
# imported at the top of the file is no longer shadowed.
demo = gr.Interface(fn=main_convert, inputs=inputs, outputs=outputs, title=title, description=description)
demo.launch()