import os
import cv2
import numpy as np
import gradio as gr
import pdf2image
import layoutparser as lp
import pytesseract
import spacy
from docx import Document
from transformers import pipeline
from spacy.cli.download import download as spacy_download

# Install detectron2, required by layoutparser's Detectron2 backend
os.system('pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.5"')

# Download the SpaCy legal NER model if it is not already installed
MODEL_NAME = "en_legal_ner_trf"
if not spacy.util.is_package(MODEL_NAME):
    spacy_download(MODEL_NAME)

# Load the SpaCy legal NER model
nlp = spacy.load(MODEL_NAME)

# Load the legal text classification model
classifier = pipeline("zero-shot-classification", model="nlpaueb/legal-bert-base-uncased")
LEGAL_LABELS = ["Contract Clause", "Statute", "Legal Reference", "Obligation", "Liability"]


def to_image(filename):
    """Convert the first page of a PDF to a JPEG image and return the page list."""
    doc = pdf2image.convert_from_path(filename, dpi=350, last_page=1)
    folder = "doc"
    os.makedirs(folder, exist_ok=True)
    doc[0].save(os.path.join(folder, "page_1.jpg"), "JPEG")
    return doc


def detect(doc):
    """Run PubLayNet layout detection on the first page image."""
    model = lp.Detectron2LayoutModel(
        "lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
        extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
        label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
    )
    img = np.asarray(doc[0])
    detected = model.detect(img)
    return img, detected


def highlight_text(img, detected, important_texts):
    """Draw a green box around any layout block whose OCR text contains an important phrase."""
    for block in detected:
        segment = block.pad(left=5, right=5, top=5, bottom=5).crop_image(img)
        extracted = pytesseract.image_to_string(segment).strip()
        if any(term in extracted for term in important_texts):
            x1, y1, x2, y2 = map(int, block.coordinates)
            cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 3)
    return img


def extract_important_texts(text):
    """Collect named entities plus sentences the classifier scores highly on any legal label."""
    doc = nlp(text)
    extracted_texts = [ent.text for ent in doc.ents]
    important_sections = []
    for sentence in text.split(". "):
        result = classifier(sentence, LEGAL_LABELS)
        if any(score > 0.8 for score in result["scores"]):
            important_sections.append(sentence)
    return list(set(extracted_texts + important_sections))


def predict_elements(img, detected):
    """OCR each layout block, then identify and highlight the important passages."""
    model = lp.TesseractAgent(languages="eng")
    dic_predicted = {}
    full_text = ""
    for block in detected:
        segment = block.pad(left=5, right=5, top=5, bottom=5).crop_image(img)
        extracted = model.detect(segment).replace("\n", " ").strip()
        full_text += extracted + "\n"
        dic_predicted[str(block.id)] = extracted
    important_texts = extract_important_texts(full_text)
    highlighted_img = highlight_text(img, detected, important_texts)
    return dic_predicted, highlighted_img, important_texts


def gen_doc(important_texts):
    """Write the extracted passages to a .docx file and return its path."""
    document = Document()
    document.add_heading("Extracted Important Legal Text", level=1)
    for text in important_texts:
        document.add_paragraph(text)
    doc_path = "important_legal_texts.docx"
    document.save(doc_path)
    return doc_path


def main_convert(filename):
    doc = to_image(filename)
    img, detected = detect(doc)
    dic_predicted, highlighted_img, important_texts = predict_elements(img, detected)
    doc_path = gen_doc(important_texts)
    return doc_path, highlighted_img, important_texts


# `type="filepath"` passes the uploaded file's path straight to `main_convert`
inputs = [gr.File(type="filepath", label="Upload Legal PDF")]
outputs = [
    gr.File(label="Extracted Important Text DOC"),
    gr.Image(type="numpy", label="Highlighted PDF Image"),
    gr.JSON(label="Extracted Important Texts"),
]
title = "Legal Document Parser"
description = (
    "This parser detects important legal text, highlights it in the document, "
    "and extracts key legal clauses separately."
)

# Named `iface` rather than `io` to avoid shadowing the stdlib `io` module
iface = gr.Interface(fn=main_convert, inputs=inputs, outputs=outputs, title=title, description=description)
iface.launch()