jolly_llm_document_parser

Runtime error

App Files Files Community

jolly_llm_document_parser / app.py

PranavRatnalikar

Update app.py

7004824 verified about 1 year ago

raw

history blame contribute delete

4.31 kB

	import io
	import os
	import subprocess
	import sys

	import cv2
	import gradio as gr
	import layoutparser as lp
	import numpy as np
	import pdf2image
	import pytesseract
	import spacy
	from docx import Document
	from docx.shared import Inches
	from spacy.cli.download import download as spacy_download
	from transformers import pipeline

	# Install detectron2 from its Git tag at start-up. pip is invoked through the
	# running interpreter so the package lands in the environment this process
	# imports from (a bare `pip` on PATH may belong to another Python).
	# The return code is deliberately ignored, as before: a failed install
	# surfaces later when the layout model is constructed.
	subprocess.run(
	    [
	        sys.executable,
	        "-m",
	        "pip",
	        "install",
	        "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.5",
	    ],
	    check=False,
	)

	# Name of the spaCy pipeline used for legal named-entity recognition.
	MODEL_NAME = "en_legal_ner_trf"

	# Download the pipeline only when it is not already installed as a package.
	# A single in-process download is enough; the former second download through
	# a shell command fetched the same model again.
	# NOTE(review): confirm this model name is resolvable by `spacy download`.
	if not spacy.util.is_package(MODEL_NAME):
	    spacy_download(MODEL_NAME)

	# Load the model
	nlp = spacy.load(MODEL_NAME)

	# Load Legal Text Classification Model
	classifier = pipeline("zero-shot-classification", model="nlpaueb/legal-bert-base-uncased")
	# Candidate labels handed to the zero-shot classifier for every sentence.
	LEGAL_LABELS = ["Contract Clause", "Statute", "Legal Reference", "Obligation", "Liability"]

	def to_image(filename):
	    """Render the first page of a PDF and save it as ``doc/page_1.jpg``.

	    Args:
	        filename: Path of the PDF file to convert.

	    Returns:
	        The list produced by ``pdf2image.convert_from_path``; conversion is
	        limited to the first page (``last_page=1``) at 350 dpi.
	    """
	    doc = pdf2image.convert_from_path(filename, dpi=350, last_page=1)
	    folder = "doc"
	    # exist_ok=True replaces the old "list the cwd, then create" check, which
	    # could race with a concurrent request creating the same folder.
	    os.makedirs(folder, exist_ok=True)

	    image_name = "page_1.jpg"
	    doc[0].save(os.path.join(folder, image_name), "JPEG")
	    return doc

	def detect(doc):
	    """Run layout detection on the first page image.

	    Args:
	        doc: Sequence of page images; only ``doc[0]`` is used.

	    Returns:
	        A tuple ``(img, detected)`` where ``img`` is a NumPy array copy of
	        ``doc[0]`` and ``detected`` is whatever ``model.detect`` returns for
	        that array.
	    """
	    model = lp.Detectron2LayoutModel(
	        "lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
	        # Keep only detections scoring at least 0.8.
	        extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
	        label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
	    )
	    # np.array (rather than np.asarray) forces an owned, writable copy.
	    # The page is later drawn on in place with cv2.rectangle, which rejects
	    # the read-only view np.asarray can hand back for an image object.
	    img = np.array(doc[0])
	    detected = model.detect(img)
	    return img, detected

	def highlight_text(img, detected, important_texts):
	    """Outline every detected block whose OCR text contains an important term.

	    Args:
	        img: Page image array; rectangles are drawn on it in place.
	        detected: Iterable of layout blocks exposing ``pad``, ``crop_image``
	            and ``coordinates``.
	        important_texts: Iterable of strings to search for in each block.

	    Returns:
	        The same ``img`` object, with a rectangle of colour ``(0, 255, 0)``
	        and thickness 3 around each matching block.
	    """
	    # Collapse whitespace in the search terms and drop blank ones: an empty
	    # string is a substring of every text and would highlight all blocks.
	    terms = {" ".join(term.split()) for term in important_texts}
	    terms.discard("")
	    if not terms:
	        # Nothing can match, so skip the per-block OCR entirely.
	        return img

	    for block in detected:
	        segment = block.pad(left=5, right=5, top=5, bottom=5).crop_image(img)
	        # Flatten line breaks like predict_elements does before the terms are
	        # derived; raw OCR newlines would stop multi-line terms from matching.
	        extracted = " ".join(pytesseract.image_to_string(segment).split())

	        if any(term in extracted for term in terms):
	            x1, y1, x2, y2 = map(int, block.coordinates)
	            cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 3)

	    return img

	def extract_important_texts(text):
	    """Collect named entities and confidently classified sentences from text.

	    Args:
	        text: Plain text to analyse.

	    Returns:
	        A de-duplicated list holding the entity texts found by ``nlp``
	        followed by every ``". "``-separated sentence for which the
	        classifier reports a score above 0.8 for any label in
	        ``LEGAL_LABELS``. First-seen order is preserved.
	    """
	    doc = nlp(text)
	    extracted_texts = [ent.text for ent in doc.ents]

	    important_sections = []
	    for sentence in text.split(". "):
	        # Splitting can yield empty or whitespace-only fragments (e.g. for
	        # empty input); classifying them is wasted work and a blank
	        # "important" text would match every block downstream.
	        if not sentence.strip():
	            continue
	        result = classifier(sentence, LEGAL_LABELS)
	        if any(score > 0.8 for score in result['scores']):
	            important_sections.append(sentence)

	    # dict.fromkeys de-duplicates while keeping a stable order, unlike a set.
	    return list(dict.fromkeys(extracted_texts + important_sections))

	def predict_elements(img, detected):
	    """OCR every detected block, then find and highlight the important texts.

	    Args:
	        img: Page image array that the blocks are cropped from.
	        detected: Iterable of layout blocks exposing ``pad``, ``crop_image``
	            and ``id``.

	    Returns:
	        A tuple ``(dic_predicted, highlighted_img, important_texts)``:
	        a dict mapping a block key (as a string) to its OCR text, the image
	        returned by ``highlight_text``, and the list returned by
	        ``extract_important_texts``.
	    """
	    model = lp.TesseractAgent(languages='eng')
	    dic_predicted = {}

	    block_texts = []
	    for index, block in enumerate(detected):
	        segmented = block.pad(left=5, right=5, top=5, bottom=5).crop_image(img)
	        extracted = model.detect(segmented).replace('\n', ' ').strip()
	        block_texts.append(extracted)
	        # Blocks without an id would all collapse onto the key "None";
	        # fall back to the block's position so every block keeps its text.
	        # NOTE(review): presumably raw detector output carries no ids - confirm.
	        key = block.id if block.id is not None else index
	        dic_predicted[str(key)] = extracted

	    # One newline-terminated line per block, built in a single join.
	    full_text = "".join(f"{block_text}\n" for block_text in block_texts)

	    important_texts = extract_important_texts(full_text)
	    highlighted_img = highlight_text(img, detected, important_texts)

	    return dic_predicted, highlighted_img, important_texts

	def gen_doc(important_texts):
	    """Write the important texts to a Word document and return its path.

	    Args:
	        important_texts: Iterable of strings; each becomes one paragraph.

	    Returns:
	        The path of the saved file, ``"important_legal_texts.docx"``.
	    """
	    output_path = "important_legal_texts.docx"

	    report = Document()
	    report.add_heading('Extracted Important Legal Text', level=1)
	    for paragraph_text in important_texts:
	        report.add_paragraph(paragraph_text)

	    report.save(output_path)
	    return output_path

	def main_convert(filename):
	    """Run the full pipeline on an uploaded PDF.

	    Args:
	        filename: The uploaded file, either an object exposing the path as
	            ``.name`` or the path string itself.

	    Returns:
	        A tuple ``(doc_path, highlighted_img, important_texts)``: the path of
	        the generated Word document, the highlighted page image and the list
	        of important texts.
	    """
	    # Accept both a file wrapper with a `.name` path and a bare path string.
	    # NOTE(review): which one Gradio passes presumably depends on its version
	    # and the File component's `type` - confirm against the installed release.
	    pdf_path = getattr(filename, "name", filename)
	    doc = to_image(pdf_path)
	    img, detected = detect(doc)
	    # The per-block OCR dict is not part of the UI outputs.
	    _, highlighted_img, important_texts = predict_elements(img, detected)
	    doc_path = gen_doc(important_texts)
	    return doc_path, highlighted_img, important_texts

	# UI wiring: one PDF upload in; the Word document, the highlighted page image
	# and the list of important texts out (same order main_convert returns them).
	inputs = [gr.File(type='file', label="Upload Legal PDF")]
	outputs = [
	    gr.File(label="Extracted Important Text DOC"),
	    gr.Image(type="numpy", label="Highlighted PDF Image"),
	    gr.JSON(label="Extracted Important Texts"),
	]

	title = "Legal Document Parser"
	description = "This parser detects important legal text, highlights it in the document, and extracts key legal clauses separately."

	# Named `demo` rather than `io` so the standard-library `io` module imported
	# at the top of the file is no longer shadowed by the interface object.
	demo = gr.Interface(fn=main_convert, inputs=inputs, outputs=outputs, title=title, description=description)
	demo.launch()