# ludigija's picture
# Upload 4 files
# 028e11d verified
import difflib
import html
import json

import pdfplumber
import streamlit as st
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

from predict import run_prediction  # Ensure you have this function implemented
# Configure the page to use the wide layout. This must stay the first
# Streamlit command executed by the script.
st.set_page_config(layout="wide")
# Model identifier handed to ``from_pretrained`` in ``load_model`` and to
# ``run_prediction`` on the Contract Comparison page.
# NOTE(review): presumably a Hugging Face Hub repo id - confirm.
MODEL_PATH = "ludigija/contract-roberta"
# JSON file, resolved relative to the working directory, from which
# ``load_questions`` reads the predefined question list.
DATA_PATH = "test.json"
# Load the question-answering model once; the result is cached as a resource
@st.cache_resource
def load_model():
    """Return the ``(model, tokenizer)`` pair loaded from ``MODEL_PATH``.

    The tokenizer is requested with ``use_fast=False``.
    """
    qa_model = AutoModelForQuestionAnswering.from_pretrained(MODEL_PATH)
    qa_tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
    return qa_model, qa_tokenizer
# Cache questions loading for contract comparison
@st.cache_data
def load_questions():
    """Return the predefined question strings stored in ``DATA_PATH``.

    Only the ``qas`` list of the first paragraph of the first entry under
    the top-level ``"data"`` key is read; each item's ``"question"`` value
    is collected in file order.

    Returns:
        A list with one question string per ``qas`` item.

    Raises:
        OSError: if ``DATA_PATH`` cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError, IndexError: if the expected key path is missing.
    """
    # Read explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which is not UTF-8 everywhere (e.g. legacy Windows locales).
    with open(DATA_PATH, encoding="utf-8") as json_file:
        data = json.load(json_file)
    first_paragraph = data["data"][0]["paragraphs"][0]
    return [qa["question"] for qa in first_paragraph["qas"]]
# PDF text extraction used by the PDF Comparison page
def extract_text_from_pdf(uploaded_file):
    """Return the text of every page of a PDF, joined by newlines.

    A page for which ``extract_text()`` yields nothing contributes an
    empty string, so the number of joined segments equals the page count.
    """
    with pdfplumber.open(uploaded_file) as document:
        page_texts = [page.extract_text() or "" for page in document.pages]
    return "\n".join(page_texts)
# Function for text comparison
def highlight_differences(text1, text2):
    """Return an HTML fragment showing the word-level diff of two texts.

    Both texts are split on whitespace and the word sequences are compared
    with ``difflib.Differ``. Words present only in ``text1`` are wrapped in
    a red-background ``<span>``, words present only in ``text2`` in a
    green-background ``<span>``, and unchanged words are emitted as-is.
    Every emitted word is followed by a single space.

    Args:
        text1: Original text.
        text2: Modified text.

    Returns:
        A string of HTML. All words are HTML-escaped, so markup contained
        in the input documents is displayed literally instead of being
        interpreted when the fragment is rendered as raw HTML.
    """
    fragments = []
    for entry in difflib.Differ().compare(text1.split(), text2.split()):
        marker = entry[:2]
        if marker == "? ":
            # "? " entries are Differ's intraline hint lines (carets, plus
            # and minus guides ending in a newline), not words from either
            # text. The changed words themselves already appear as "- " and
            # "+ " entries, so the hints are dropped. A real word can never
            # start an entry with "? " because every word is prefixed with
            # a two-character marker of its own.
            continue
        # Escape before embedding: the words come from uploaded documents.
        word = html.escape(entry[2:])
        if marker == "- ":
            # Red background: word removed from the original.
            fragments.append(
                f'<span style="background-color:#ffcccc">{word}</span>'
            )
        elif marker == "+ ":
            # Green background: word added in the modified text.
            fragments.append(
                f'<span style="background-color:#ccffcc">{word}</span>'
            )
        else:
            fragments.append(word)
    # Single join instead of repeated string concatenation in the loop.
    return "".join(f"{fragment} " for fragment in fragments)
# Function to calculate similarity score
def calculate_similarity(text1, text2):
    """Return the TF-IDF cosine similarity of two texts as a percentage.

    A ``TfidfVectorizer`` with default settings is fitted on the two texts
    and the cosine similarity of the resulting vectors is scaled to 0-100.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float in ``[0.0, 100.0]``. When the vectorizer cannot build a
        vocabulary (for example both texts are empty), the result is
        ``100.0`` if the two texts are equal and ``0.0`` otherwise.
    """
    vectorizer = TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # fit_transform raises ValueError when no token survives
        # tokenization (empty vocabulary), e.g. for PDFs with no
        # extractable text. Report a defined score instead of crashing.
        return 100.0 if text1 == text2 else 0.0
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    percentage = float(similarity[0][0]) * 100  # Convert to percentage
    # Floating-point rounding can push the value marginally outside the
    # valid range (e.g. 100.00000000000003); clamp it.
    return min(100.0, max(0.0, percentage))
# Text extraction for the Contract Comparison uploads (PDF or plain text)
def extract_text(uploaded_file):
    """Return the text content of an uploaded PDF or TXT file.

    Args:
        uploaded_file: The value returned by ``st.file_uploader``; may be
            ``None`` when nothing has been uploaded yet.

    Returns:
        For ``application/pdf`` uploads, the text of all pages that yield
        any text, joined by newlines. For ``text/plain`` uploads, the file
        content decoded as UTF-8. An empty string when no file is given or
        the MIME type is neither of the two.
    """
    if not uploaded_file:
        return ""
    if uploaded_file.type == "application/pdf":
        with pdfplumber.open(uploaded_file) as pdf:
            # Extract each page once; extraction is the expensive step.
            page_texts = [page.extract_text() for page in pdf.pages]
        return "\n".join(text for text in page_texts if text)
    if uploaded_file.type == "text/plain":
        # getvalue() returns the whole buffer regardless of the current
        # read position; undecodable bytes are replaced instead of raising
        # UnicodeDecodeError on files that are not valid UTF-8.
        return uploaded_file.getvalue().decode("utf-8", errors="replace")
    return ""


# Streamlit UI - sidebar radio used to switch between the two tools
page = st.sidebar.radio("Select Task", ("Contract Comparison", "PDF Comparison"))

if page == "Contract Comparison":
    # Contract comparison: run the same question against two contracts
    st.title("CUAD Document Comparison Demo")
    st.write("Upload two contracts (original & fake), select a question, and compare answers.")

    # Load model and questions.
    # NOTE(review): the loaded model/tokenizer are not used below, since
    # run_prediction is given MODEL_PATH instead. Presumably this call is
    # kept to warm the cache; confirm before removing.
    model, tokenizer = load_model()
    questions = load_questions()

    # File uploaders, side by side
    col1, col2 = st.columns(2)
    with col1:
        original_file = st.file_uploader("Upload Original Contract (PDF or TXT)", type=["pdf", "txt"], key="original")
    with col2:
        fake_file = st.file_uploader("Upload Fake Contract (PDF or TXT)", type=["pdf", "txt"], key="fake")

    original_text = extract_text(original_file)
    fake_text = extract_text(fake_file)

    # Show the extracted text of both contracts
    col1, col2 = st.columns(2)
    with col1:
        st.text_area("Original Contract Text", original_text, height=200)
    with col2:
        st.text_area("Fake Contract Text", fake_text, height=200)

    # Question selection
    question = st.selectbox("Choose a predefined question:", questions)

    if st.button("Compare Answers"):
        if original_text and fake_text:
            original_answer = run_prediction(question, original_text, MODEL_PATH)
            fake_answer = run_prediction(question, fake_text, MODEL_PATH)

            st.write("### Comparison Results:")
            col1, col2 = st.columns(2)
            with col1:
                st.subheader("Original Document Answer")
                st.write(original_answer.strip())
            with col2:
                st.subheader("Fake Document Answer")
                st.write(fake_answer.strip())
        else:
            # Tell the user why nothing happened instead of ignoring the click.
            st.warning("Please upload both contracts (with extractable text) before comparing.")

elif page == "PDF Comparison":
    # PDF comparison: similarity score plus word-level diff
    st.title("📄 PDF Comparison Tool")

    # File upload widgets, side by side
    col1, col2 = st.columns(2)
    with col1:
        original_pdf = st.file_uploader("Upload Original PDF", type="pdf")
    with col2:
        modified_pdf = st.file_uploader("Upload Modified PDF", type="pdf")

    if original_pdf and modified_pdf:
        text1 = extract_text_from_pdf(original_pdf)
        text2 = extract_text_from_pdf(modified_pdf)

        # Display original and modified text in two columns
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Original PDF")
            st.text_area("Original Text", text1, height=300)
        with col2:
            st.subheader("Modified PDF")
            st.text_area("Modified Text", text2, height=300)

        # Comparison runs only when the button is pressed
        if st.button("Compare PDFs"):
            if not text1.strip() or not text2.strip():
                # E.g. scanned PDFs: there is nothing to vectorize or diff.
                st.warning("No text could be extracted from one or both PDFs; nothing to compare.")
            else:
                similarity_score = calculate_similarity(text1, text2)
                st.write(f"**Similarity Score:** {similarity_score:.2f}%")

                # Decide "identical" from the word sequences themselves, not
                # from the score: exact float equality with 100 is unreliable,
                # and the score is computed from term weights, so it may not
                # distinguish texts that differ only in word order.
                if text1.split() == text2.split():
                    st.success("The documents are identical. No changes to compare.")
                elif similarity_score < 50:
                    st.warning("The documents are significantly different. No detailed comparison will be performed.")
                else:
                    # Proceed with detailed comparison using difflib
                    highlighted_diff = highlight_differences(text1, text2)
                    st.markdown("### Differences Highlighted")
                    st.markdown(f'<div style="border:1px solid #ccc; padding:10px;">{highlighted_diff}</div>', unsafe_allow_html=True)