Spaces:
Runtime error
Runtime error
import functools
import io
import os
import subprocess
import sys

import cv2
import gradio as gr
import layoutparser as lp
import numpy as np
import pdf2image
import pytesseract
import spacy
from docx import Document
from docx.shared import Inches
from spacy.cli.download import download as spacy_download
from transformers import pipeline
# Install detectron2 from its GitHub v0.5 tag at startup.  Running pip through
# ``sys.executable -m pip`` (argument list, no shell) installs into the same
# interpreter that runs this app.  The install stays best-effort: a failure is
# not raised here, matching the original ``os.system`` call.
subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.5",
    ],
    check=False,
)

# Name of the spaCy legal NER pipeline used for entity extraction.
MODEL_NAME = "en_legal_ner_trf"

# Download the model once, and only when it is not already installed as a
# package (the original triggered a second, redundant download via the shell).
if not spacy.util.is_package(MODEL_NAME):
    spacy_download(MODEL_NAME)

# Shared spaCy pipeline used by extract_important_texts().
nlp = spacy.load(MODEL_NAME)

# Zero-shot classifier used to score sentences against LEGAL_LABELS.
# NOTE(review): confirm this checkpoint is suitable for zero-shot (NLI-style)
# classification; otherwise the 0.8 score threshold may not be meaningful.
classifier = pipeline("zero-shot-classification", model="nlpaueb/legal-bert-base-uncased")

# Candidate labels passed to the zero-shot classifier.
LEGAL_LABELS = ["Contract Clause", "Statute", "Legal Reference", "Obligation", "Liability"]
def to_image(filename):
    """Render the first page of a PDF and save it as ``doc/page_1.jpg``.

    Args:
        filename: Path of the PDF file to convert.

    Returns:
        The list of page images produced by ``pdf2image.convert_from_path``
        (limited to the first page via ``last_page=1``).
    """
    doc = pdf2image.convert_from_path(filename, dpi=350, last_page=1)
    folder = "doc"
    # exist_ok avoids the check-then-create race of listing the directory first.
    os.makedirs(folder, exist_ok=True)
    image_name = "page_1.jpg"
    doc[0].save(os.path.join(folder, image_name), "JPEG")
    return doc
@functools.lru_cache(maxsize=1)
def _load_layout_model():
    """Build the PubLayNet Detectron2 layout model once and reuse it."""
    return lp.Detectron2LayoutModel(
        "lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
        extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
        label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
    )


def detect(doc):
    """Run layout detection on the first page image.

    Args:
        doc: Sequence of page images; only ``doc[0]`` is used.

    Returns:
        A ``(img, detected)`` tuple: the page as a NumPy array and the layout
        returned by the model's ``detect`` call.
    """
    # np.array makes a writable copy.  np.asarray on a PIL image may hand back
    # a read-only array, and highlight_text() later draws on this array in
    # place with cv2.rectangle.
    img = np.array(doc[0])
    # The model is cached so it is not rebuilt for every uploaded document.
    detected = _load_layout_model().detect(img)
    return img, detected
def highlight_text(img, detected, important_texts):
    """Draw a green rectangle around every block containing an important text.

    Each block is cropped (with 5 px padding), OCR'd with pytesseract, and
    outlined on ``img`` when any of ``important_texts`` occurs in its text.

    Args:
        img: Page image array; it is drawn on in place.
        detected: Iterable of layout blocks exposing ``pad``, ``crop_image``
            and ``coordinates``.
        important_texts: Strings to look for inside each block's OCR text.

    Returns:
        The same ``img`` object, with rectangles drawn on matching blocks.
    """
    # Blank terms would match every block (``"" in s`` is always True).
    terms = [term for term in important_texts if term and term.strip()]
    if not terms:
        # Nothing to search for: skip the OCR pass entirely.
        return img

    for block in detected:
        segment = block.pad(left=5, right=5, top=5, bottom=5).crop_image(img)
        # Flatten newlines the same way predict_elements() does, so terms that
        # were extracted from newline-free text can match multi-line blocks.
        extracted = pytesseract.image_to_string(segment).replace("\n", " ").strip()
        if any(term in extracted for term in terms):
            x1, y1, x2, y2 = map(int, block.coordinates)
            cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 3)
    return img
def extract_important_texts(text):
    """Collect named entities and high-scoring sentences from ``text``.

    Entities come from the module-level spaCy pipeline ``nlp``.  Sentences are
    obtained by splitting on ``". "`` and kept when the zero-shot
    ``classifier`` gives any of ``LEGAL_LABELS`` a score above 0.8.

    Args:
        text: The text to analyse.

    Returns:
        A list of unique strings (entities first, then sentences), in
        first-seen order.
    """
    entity_texts = [ent.text for ent in nlp(text).ents]

    important_sections = []
    for sentence in text.split(". "):
        sentence = sentence.strip()
        # Skip blank fragments: the zero-shot pipeline rejects empty input,
        # and an empty "important text" would match every block later on.
        if not sentence:
            continue
        result = classifier(sentence, LEGAL_LABELS)
        if any(score > 0.8 for score in result["scores"]):
            important_sections.append(sentence)

    # dict.fromkeys de-duplicates while keeping a deterministic order,
    # unlike list(set(...)).
    return list(dict.fromkeys(entity_texts + important_sections))
def predict_elements(img, detected):
    """OCR every detected block, find the important texts and highlight them.

    Args:
        img: Page image array.
        detected: Iterable of layout blocks exposing ``pad``, ``crop_image``
            and ``id``.

    Returns:
        A ``(dic_predicted, highlighted_img, important_texts)`` tuple: the OCR
        text per block key, the image returned by ``highlight_text`` and the
        list returned by ``extract_important_texts``.
    """
    ocr_agent = lp.TesseractAgent(languages='eng')
    dic_predicted = {}
    block_texts = []
    for index, block in enumerate(detected):
        segmented = block.pad(left=5, right=5, top=5, bottom=5).crop_image(img)
        extracted = ocr_agent.detect(segmented).replace('\n', ' ').strip()
        block_texts.append(extracted)
        # Fall back to the position in the layout when the block has no id;
        # otherwise every id-less block would overwrite the single key "None".
        key = block.id if block.id is not None else index
        dic_predicted[str(key)] = extracted

    # One newline-terminated line per block, as before, built in a single join.
    full_text = "".join(block_text + "\n" for block_text in block_texts)
    important_texts = extract_important_texts(full_text)
    highlighted_img = highlight_text(img, detected, important_texts)
    return dic_predicted, highlighted_img, important_texts
def gen_doc(important_texts):
    """Write the given texts to a Word document and return its path.

    The document gets a level-1 heading followed by one paragraph per entry
    of ``important_texts``; it is saved as ``important_legal_texts.docx``.
    """
    output_path = "important_legal_texts.docx"

    report = Document()
    report.add_heading('Extracted Important Legal Text', level=1)
    for passage in important_texts:
        report.add_paragraph(passage)

    report.save(output_path)
    return output_path
def main_convert(filename):
    """Run the whole pipeline for one uploaded PDF.

    Args:
        filename: The uploaded file, either as an object with a ``name``
            attribute holding the path or as the path string itself.

    Returns:
        A ``(doc_path, highlighted_img, important_texts)`` tuple matching the
        three Gradio outputs: the generated .docx path, the annotated page
        image and the list of extracted texts.
    """
    # Accept both a file object (has ``.name``) and a plain path string.
    pdf_path = getattr(filename, "name", filename)
    pages = to_image(pdf_path)
    img, detected = detect(pages)
    # The per-block OCR dictionary is not part of the UI outputs.
    _, highlighted_img, important_texts = predict_elements(img, detected)
    doc_path = gen_doc(important_texts)
    return doc_path, highlighted_img, important_texts
# --- Gradio UI wiring -------------------------------------------------------
title = "Legal Document Parser"
description = "This parser detects important legal text, highlights it in the document, and extracts key legal clauses separately."

# Single input: the uploaded PDF.
# NOTE(review): confirm that type='file' is accepted by the installed Gradio
# version.
inputs = [gr.File(type='file', label="Upload Legal PDF")]

# Outputs, in the order returned by main_convert().
outputs = [
    gr.File(label="Extracted Important Text DOC"),
    gr.Image(type="numpy", label="Highlighted PDF Image"),
    gr.JSON(label="Extracted Important Texts"),
]

# NOTE(review): this rebinds the name ``io``, shadowing the imported stdlib
# module from here on.
io = gr.Interface(
    fn=main_convert,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
)
io.launch()