| import streamlit as st
|
| import torch
|
| from transformers import AutoModelForQuestionAnswering, AutoTokenizer
|
| import pdfplumber
|
| import json
|
| import difflib
|
| from sklearn.feature_extraction.text import TfidfVectorizer
|
| from sklearn.metrics.pairwise import cosine_similarity
|
| from predict import run_prediction
|
|
|
|
|
# Identifier handed to ``from_pretrained`` in ``load_model`` and to
# ``run_prediction`` when answers are compared.
MODEL_PATH = "ludigija/contract-roberta"

# JSON file that ``load_questions`` reads the predefined questions from.
DATA_PATH = "test.json"

# Use the wide page layout for the side-by-side document columns.
st.set_page_config(layout="wide")
|
|
|
|
|
@st.cache_resource
def load_model():
    """Load the question-answering model and tokenizer named by ``MODEL_PATH``.

    The function is wrapped in ``st.cache_resource`` so Streamlit can reuse
    the loaded objects rather than rebuilding them.

    Returns:
        A ``(model, tokenizer)`` tuple; the tokenizer is requested with
        ``use_fast=False``.
    """
    # Tuple elements are evaluated left to right: model first, then tokenizer.
    return (
        AutoModelForQuestionAnswering.from_pretrained(MODEL_PATH),
        AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False),
    )
|
|
|
|
|
@st.cache_data
def load_questions():
    """Return the question strings stored in the JSON file at ``DATA_PATH``.

    Only the ``qas`` list of the first paragraph of the first ``data`` entry
    is read; each item's ``"question"`` value is collected in file order.

    Returns:
        A list with one question per ``qas`` item.

    Raises:
        OSError: If the file cannot be opened.
        KeyError, IndexError: If the JSON does not have the expected nesting.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(DATA_PATH, encoding="utf-8") as json_file:
        dataset = json.load(json_file)

    qas = dataset["data"][0]["paragraphs"][0]["qas"]
    return [qa["question"] for qa in qas]
|
|
|
|
|
def extract_text_from_pdf(uploaded_file):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        uploaded_file: Whatever ``pdfplumber.open`` accepts for the PDF.

    Returns:
        One string with the pages' text separated by ``"\\n"``. A page whose
        ``extract_text()`` result is falsy contributes an empty string, so it
        still occupies a position in the joined output.
    """
    with pdfplumber.open(uploaded_file) as document:
        per_page = [pdf_page.extract_text() or "" for pdf_page in document.pages]
    return "\n".join(per_page)
|
|
|
|
|
def highlight_differences(text1, text2):
    """Build an HTML fragment marking word-level differences between two texts.

    Both texts are split on whitespace and compared with ``difflib.Differ``.
    Words only in ``text1`` get a red background, words only in ``text2`` a
    green one, and shared words are emitted unstyled.

    Args:
        text1: The original text.
        text2: The modified text.

    Returns:
        A string of HTML in which every emitted word (styled or not) is
        followed by a single space; ``""`` when both texts have no words.
    """
    # Local import keeps this block self-contained.
    from html import escape

    # Background colour for each Differ prefix that marks a changed word.
    colours = {"- ": "#ffcccc", "+ ": "#ccffcc"}

    pieces = []
    for entry in difflib.Differ().compare(text1.split(), text2.split()):
        prefix, word = entry[:2], entry[2:]
        if prefix == "? ":
            # Differ's "? " entries are intraline guide markers (^, +, -),
            # not words from either document, so they are not rendered.
            continue
        # The words come from uploaded documents and the result is rendered
        # as raw HTML, so escape them to keep markup in the text inert.
        safe_word = escape(word)
        colour = colours.get(prefix)
        if colour is None:
            pieces.append(safe_word)
        else:
            pieces.append(
                f'<span style="background-color:{colour}">{safe_word}</span>'
            )

    # Single join instead of repeated string concatenation; the trailing
    # space after each piece matches the previous output format.
    return "".join(f"{piece} " for piece in pieces)
|
|
|
|
|
def calculate_similarity(text1, text2):
    """Return the TF-IDF cosine similarity of two texts as a percentage.

    Args:
        text1: First document text.
        text2: Second document text.

    Returns:
        A float between 0 and 100. Texts with the same whitespace-separated
        word sequence (including two blank texts) score exactly ``100.0``.
        Texts that differ but contain no terms the vectorizer can index
        score ``0.0``.
    """
    # Same word sequence means the same TF-IDF vector, i.e. cosine 1. Returning
    # early avoids float rounding (99.999...) and the vectorizer's
    # "empty vocabulary" error when both texts are blank.
    if text1.split() == text2.split():
        return 100.0

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError:
        # Raised when neither text yields a single indexable term (for
        # example punctuation only); there is nothing in common to measure.
        return 0.0

    score = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return float(score) * 100
|
|
|
|
|
def extract_text(uploaded_file):
    """Return the text of an uploaded PDF or plain-text file.

    Args:
        uploaded_file: The value returned by ``st.file_uploader`` (may be
            ``None`` when nothing has been uploaded).

    Returns:
        The extracted text, or ``""`` when no file was given or its MIME type
        is neither ``application/pdf`` nor ``text/plain``.
    """
    if not uploaded_file:
        return ""

    if uploaded_file.type == "application/pdf":
        with pdfplumber.open(uploaded_file) as pdf:
            # Extract each page once; extraction is the expensive step.
            page_texts = [pdf_page.extract_text() for pdf_page in pdf.pages]
        # Pages without extractable text are skipped.
        return "\n".join(text for text in page_texts if text)

    if uploaded_file.type == "text/plain":
        # getvalue() returns the whole buffer regardless of the current read
        # position; undecodable bytes are replaced instead of raising.
        return uploaded_file.getvalue().decode("utf-8", errors="replace")

    return ""


# Sidebar switch between the two tools rendered below.
page = st.sidebar.radio("Select Task", ("Contract Comparison", "PDF Comparison"))

if page == "Contract Comparison":
    st.title("CUAD Document Comparison Demo")
    st.write("Upload two contracts (original & fake), select a question, and compare answers.")

    # NOTE(review): model and tokenizer are not used below; run_prediction is
    # handed MODEL_PATH instead. Presumably this call only warms the cache;
    # confirm against predict.run_prediction.
    model, tokenizer = load_model()
    questions = load_questions()

    # Upload widgets, side by side.
    col1, col2 = st.columns(2)
    with col1:
        original_file = st.file_uploader("Upload Original Contract (PDF or TXT)", type=["pdf", "txt"], key="original")
    with col2:
        fake_file = st.file_uploader("Upload Fake Contract (PDF or TXT)", type=["pdf", "txt"], key="fake")

    original_text = extract_text(original_file)
    fake_text = extract_text(fake_file)

    # Show the extracted text of both contracts.
    col1, col2 = st.columns(2)
    with col1:
        st.text_area("Original Contract Text", original_text, height=200)
    with col2:
        st.text_area("Fake Contract Text", fake_text, height=200)

    question = st.selectbox("Choose a predefined question:", questions)

    if st.button("Compare Answers"):
        if original_text and fake_text:
            original_answer = run_prediction(question, original_text, MODEL_PATH)
            fake_answer = run_prediction(question, fake_text, MODEL_PATH)

            st.write("### Comparison Results:")
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("Original Document Answer")
                st.write(original_answer.strip())
            with col2:
                st.subheader("Fake Document Answer")
                st.write(fake_answer.strip())
        else:
            # Tell the user why nothing happened instead of staying silent.
            st.warning("Both contracts must be uploaded and contain extractable text before answers can be compared.")

elif page == "PDF Comparison":
    st.title("📄 PDF Comparison Tool")

    col1, col2 = st.columns(2)
    with col1:
        original_pdf = st.file_uploader("Upload Original PDF", type="pdf")
    with col2:
        modified_pdf = st.file_uploader("Upload Modified PDF", type="pdf")

    if original_pdf and modified_pdf:
        text1 = extract_text_from_pdf(original_pdf)
        text2 = extract_text_from_pdf(modified_pdf)

        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Original PDF")
            st.text_area("Original Text", text1, height=300)
        with col2:
            st.subheader("Modified PDF")
            st.text_area("Modified Text", text2, height=300)

        if st.button("Compare PDFs"):
            # Decide "identical" from the words themselves rather than from an
            # exact float comparison on the cosine score: rounding can leave
            # the score just below 100, and reordered text can reach 100
            # without being identical.
            words_match = text1.split() == text2.split()
            similarity_score = 100.0 if words_match else calculate_similarity(text1, text2)
            st.write(f"**Similarity Score:** {similarity_score:.2f}%")

            if words_match:
                st.success("The documents are identical. No changes to compare.")
            elif similarity_score < 50:
                st.warning("The documents are significantly different. No detailed comparison will be performed.")
            else:
                highlighted_diff = highlight_differences(text1, text2)
                st.markdown("### Differences Highlighted")
                st.markdown(f'<div style="border:1px solid #ccc; padding:10px;">{highlighted_diff}</div>', unsafe_allow_html=True)
|
|
|