Upload 4 files

028e11d verified about 1 year ago

6.51 kB

	import difflib
	import html
	import json

	import pdfplumber
	import streamlit as st
	import torch
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity
	from transformers import AutoModelForQuestionAnswering, AutoTokenizer

	from predict import run_prediction # Ensure you have this function implemented

	# Set Streamlit page config (must be the first command)
	# Keep this call ahead of every other st.* call in the script; the
	# definitions below only declare functions, so they do not count.
	st.set_page_config(layout="wide")

	# Define model and data paths for contract comparison
	# MODEL_PATH is passed to from_pretrained() in load_model() and to
	# run_prediction(); presumably a Hugging Face Hub model id - TODO confirm.
	MODEL_PATH = "ludigija/contract-roberta"
	# DATA_PATH is the JSON file load_questions() reads its question list from;
	# it is opened relative to the current working directory.
	DATA_PATH = "test.json"

	# Cache model loading for contract comparison
	@st.cache_resource
	def load_model():
	    """Load the question-answering model and its tokenizer from ``MODEL_PATH``.

	    Decorated with ``st.cache_resource``, so the returned objects are shared
	    instead of being rebuilt on every script rerun.

	    Returns:
	        A ``(model, tokenizer)`` tuple. The tokenizer is requested with
	        ``use_fast=False``.
	    """
	    # The tuple is evaluated left to right: model first, then tokenizer.
	    return (
	        AutoModelForQuestionAnswering.from_pretrained(MODEL_PATH),
	        AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False),
	    )

	# Cache questions loading for contract comparison
	@st.cache_data
	def load_questions():
	    """Return the question strings listed in the JSON file at ``DATA_PATH``.

	    Only the ``"qas"`` entries of the first paragraph of the first item under
	    ``"data"`` are read (presumably a SQuAD/CUAD-style file in which that
	    paragraph carries the full question list - TODO confirm).

	    Returns:
	        The ``"question"`` value of each entry, in file order.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If the file is not valid JSON.
	        KeyError, IndexError: If the expected structure is missing.
	    """
	    # Decode explicitly as UTF-8 so the result does not depend on the
	    # platform's default locale encoding.
	    with open(DATA_PATH, encoding="utf-8") as json_file:
	        data = json.load(json_file)
	    first_paragraph = data["data"][0]["paragraphs"][0]
	    return [qa["question"] for qa in first_paragraph["qas"]]

	# Extract text from PDF (common for both pages)
	def extract_text_from_pdf(uploaded_file):
	    """Return the text of every page of a PDF, joined with newlines.

	    Args:
	        uploaded_file: Whatever ``pdfplumber.open`` accepts; the callers in
	            this file pass Streamlit uploaded-file objects.

	    Returns:
	        One string with a newline between consecutive pages. A page for which
	        ``extract_text()`` yields nothing contributes an empty string, so the
	        page count is still reflected in the number of separators.
	    """
	    page_texts = []
	    with pdfplumber.open(uploaded_file) as document:
	        for page in document.pages:
	            content = page.extract_text()
	            page_texts.append(content if content else "")
	    return "\n".join(page_texts)

	# Function for text comparison
	def highlight_differences(text1, text2):
	    """Return an HTML fragment showing the word-level diff of two texts.

	    Both texts are split on whitespace and compared with ``difflib.Differ``.
	    Words present only in ``text1`` are wrapped in a red ``<span>``, words
	    present only in ``text2`` in a green ``<span>``, and unchanged words are
	    emitted as plain text. Tokens are joined with single spaces, so the
	    original line breaks are not preserved.

	    Args:
	        text1: The original text.
	        text2: The modified text.

	    Returns:
	        An HTML string. Every word is HTML-escaped, so the result is safe to
	        render with ``unsafe_allow_html=True`` even when the input contains
	        markup characters.
	    """
	    removed_style = "background-color:#ffcccc"  # Red for removed
	    added_style = "background-color:#ccffcc"  # Green for added

	    fragments = []
	    for entry in difflib.Differ().compare(text1.split(), text2.split()):
	        marker = entry[:2]
	        if marker == "? ":
	            # Differ emits "? " lines as positional hints (runs of ^, + and -)
	            # for the neighbouring "-"/"+" lines. They are not words from
	            # either text, so they must not be rendered.
	            continue
	        # The text comes from uploaded documents: escape it before embedding
	        # it in HTML.
	        token = html.escape(entry[2:])
	        if marker == "- ":
	            fragments.append(f'<span style="{removed_style}">{token}</span>')
	        elif marker == "+ ":
	            fragments.append(f'<span style="{added_style}">{token}</span>')
	        else:
	            fragments.append(token)
	    return " ".join(fragments)

	# Function to calculate similarity score
	def calculate_similarity(text1, text2):
	    """Return the TF-IDF cosine similarity of two texts as a percentage.

	    The two texts are vectorised together with a default ``TfidfVectorizer``
	    and the cosine similarity of the two resulting rows is scaled to 0-100.
	    This is a bag-of-words measure: word order does not affect the score.

	    Args:
	        text1: First document text.
	        text2: Second document text.

	    Returns:
	        A plain ``float`` between 0.0 and 100.0.
	    """
	    try:
	        tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
	    except ValueError:
	        # scikit-learn raises ValueError ("empty vocabulary") when neither
	        # text yields a token, e.g. two PDFs with no extractable text. Fall
	        # back to comparing the whitespace-separated words directly.
	        return 100.0 if text1.split() == text2.split() else 0.0

	    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
	    percentage = float(similarity[0][0]) * 100  # Convert to percentage
	    # Floating-point rounding can push the cosine marginally outside [0, 1].
	    return min(100.0, max(0.0, percentage))

	# Streamlit UI - Navbar for Navigation
	page = st.sidebar.radio("Select Task", ("Contract Comparison", "PDF Comparison"))

	if page == "Contract Comparison":
	    # Contract comparison: ask the same question of two contracts and show
	    # both answers side by side.
	    st.title("CUAD Document Comparison Demo")
	    st.write("Upload two contracts (original & fake), select a question, and compare answers.")

	    # Load model and questions
	    # NOTE(review): model and tokenizer are not passed to run_prediction()
	    # below, which receives MODEL_PATH instead. The call is kept so the cached
	    # load still happens here - confirm whether it is needed.
	    model, tokenizer = load_model()
	    questions = load_questions()

	    # File uploaders
	    col1, col2 = st.columns(2)
	    with col1:
	        original_file = st.file_uploader("Upload Original Contract (PDF or TXT)", type=["pdf", "txt"], key="original")
	    with col2:
	        fake_file = st.file_uploader("Upload Fake Contract (PDF or TXT)", type=["pdf", "txt"], key="fake")

	    def extract_text(uploaded_file):
	        """Return the text of an uploaded PDF or TXT file.

	        Returns "" when nothing was uploaded or the MIME type is neither
	        "application/pdf" nor "text/plain".
	        """
	        if not uploaded_file:
	            return ""
	        if uploaded_file.type == "application/pdf":
	            with pdfplumber.open(uploaded_file) as pdf:
	                # Run extraction once per page (not once to filter and again
	                # to collect) and drop pages that yield no text.
	                page_texts = (page.extract_text() for page in pdf.pages)
	                return "\n".join(text for text in page_texts if text)
	        if uploaded_file.type == "text/plain":
	            # getvalue() returns the whole buffer regardless of the current
	            # read position; errors="replace" keeps a non-UTF-8 file from
	            # crashing the page.
	            return uploaded_file.getvalue().decode("utf-8", errors="replace")
	        return ""

	    original_text = extract_text(original_file)
	    fake_text = extract_text(fake_file)

	    col1, col2 = st.columns(2)
	    with col1:
	        st.text_area("Original Contract Text", original_text, height=200)
	    with col2:
	        st.text_area("Fake Contract Text", fake_text, height=200)

	    # Question selection
	    question = st.selectbox("Choose a predefined question:", questions)

	    if st.button("Compare Answers"):
	        if original_text and fake_text:
	            original_answer = run_prediction(question, original_text, MODEL_PATH)
	            fake_answer = run_prediction(question, fake_text, MODEL_PATH)

	            st.write("### Comparison Results:")
	            col1, col2 = st.columns(2)
	            with col1:
	                st.subheader("Original Document Answer")
	                st.write(original_answer.strip())
	            with col2:
	                st.subheader("Fake Document Answer")
	                st.write(fake_answer.strip())
	        else:
	            # Tell the user why the click did nothing instead of staying silent.
	            st.warning("Please upload both contracts (with extractable text) before comparing.")

	elif page == "PDF Comparison":
	    # PDF comparison: similarity score plus a word-level highlighted diff.
	    st.title("📄 PDF Comparison Tool")

	    # File upload widgets, one per column
	    col1, col2 = st.columns(2)
	    with col1:
	        original_pdf = st.file_uploader("Upload Original PDF", type="pdf")
	    with col2:
	        modified_pdf = st.file_uploader("Upload Modified PDF", type="pdf")

	    # Display uploaded PDFs' text
	    if original_pdf and modified_pdf:
	        text1 = extract_text_from_pdf(original_pdf)
	        text2 = extract_text_from_pdf(modified_pdf)

	        # Display original and modified text in two columns
	        col1, col2 = st.columns(2)
	        with col1:
	            st.subheader("Original PDF")
	            st.text_area("Original Text", text1, height=300)
	        with col2:
	            st.subheader("Modified PDF")
	            st.text_area("Modified Text", text2, height=300)

	        # Add a button to trigger comparison
	        if st.button("Compare PDFs"):
	            # "Identical" is decided on the word sequences themselves. The
	            # similarity score is a float and ignores word order, so testing
	            # it for exact equality with 100 can both miss identical
	            # documents and misreport reordered ones.
	            identical = text1.split() == text2.split()

	            # Identical word sequences are 100% similar by definition;
	            # skipping the vectoriser also sidesteps its failure on texts
	            # that contain no tokens at all.
	            similarity_score = 100.0 if identical else calculate_similarity(text1, text2)
	            st.write(f"Similarity Score: {similarity_score:.2f}%")

	            if identical:
	                st.success("The documents are identical. No changes to compare.")
	            elif similarity_score < 50:
	                st.warning("The documents are significantly different. No detailed comparison will be performed.")
	            else:
	                # Proceed with detailed comparison using difflib
	                highlighted_diff = highlight_differences(text1, text2)
	                st.markdown("### Differences Highlighted")
	                st.markdown(f'<div style="border:1px solid #ccc; padding:10px;">{highlighted_diff}</div>', unsafe_allow_html=True)