chiichann committed on
Commit
24e672b
·
verified ·
1 Parent(s): bde0716

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +112 -0
  2. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import pandas as pd
4
+ import seaborn as sns
5
+ import matplotlib.pyplot as plt
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+ from sentence_transformers import SentenceTransformer
9
+ import string
10
+ import PyPDF2
11
+ import docx
12
+
13
+ # App title with improved styling
14
+ st.set_page_config(page_title="Unsupervised Text Similarity Analysis", layout="wide")
15
+ st.title("πŸ“– Unsupervised Text Similarity Analysis")
16
+ st.markdown("### Compare and analyze text similarity effortlessly! πŸ”")
17
+
18
+ # 🎯 Streamlit Tabs
19
+ tab1, tab2 = st.tabs(["πŸ“– About", "πŸ“Š Similarity Analysis"])
20
+
21
+ # Load Sentence Transformer model
22
@st.cache_resource
def load_model():
    """Build the sentence-embedding model used for the "BERT Embeddings" method.

    Decorated with ``st.cache_resource`` so Streamlit can reuse the returned
    object instead of constructing it again.

    Returns:
        SentenceTransformer: the ``all-MiniLM-L6-v2`` model.
    """
    model_name = 'all-MiniLM-L6-v2'
    return SentenceTransformer(model_name)
25
+
26
+ model = load_model()
27
+
28
def preprocess_text(text):
    """Normalize text for comparison: lowercase it and drop ASCII punctuation.

    Args:
        text: The string to normalize.

    Returns:
        str: ``text`` lowercased with every character in
        ``string.punctuation`` removed. Whitespace and non-ASCII
        characters are left untouched.
    """
    # Mapping a code point to None tells str.translate to delete it.
    strip_punctuation = dict.fromkeys(map(ord, string.punctuation))
    return text.lower().translate(strip_punctuation)
32
+
33
def extract_text_from_pdf(file):
    """Return the text of all pages of a PDF joined by single spaces.

    Args:
        file: The PDF source, passed straight to ``PyPDF2.PdfReader``.

    Returns:
        str: The extracted page texts in page order, separated by ``" "``.
        Pages whose extraction yields nothing (``None`` or ``""``) are
        skipped, so a PDF without any extractable text gives ``""``.
    """
    reader = PyPDF2.PdfReader(file)
    # Extraction is the expensive step: run it exactly once per page and
    # filter the results afterwards instead of extracting twice.
    page_texts = (page.extract_text() for page in reader.pages)
    return " ".join(page_text for page_text in page_texts if page_text)
37
+
38
def extract_text_from_docx(file):
    """Return the paragraph text of a Word document joined by single spaces.

    Args:
        file: The document source, passed straight to ``docx.Document``.

    Returns:
        str: The ``text`` of each entry in the document's ``paragraphs``,
        in order, separated by ``" "``.
    """
    paragraphs = docx.Document(file).paragraphs
    return " ".join(paragraph.text for paragraph in paragraphs)
42
+
43
def compute_tfidf_similarity(texts):
    """Compute pairwise cosine similarity between TF-IDF vectors of ``texts``.

    Args:
        texts: The documents to compare, one string per document.

    Returns:
        The result of ``cosine_similarity`` on the fitted TF-IDF matrix:
        entry ``[i, j]`` is the similarity of ``texts[i]`` and ``texts[j]``.
    """
    # A fresh vectorizer is fitted on every call, so the vocabulary always
    # comes from the documents being compared.
    weighted_terms = TfidfVectorizer().fit_transform(texts)
    return cosine_similarity(weighted_terms)
47
+
48
def compute_bert_similarity(texts):
    """Compute pairwise cosine similarity between sentence embeddings of ``texts``.

    Uses the module-level ``model`` to encode the documents.

    Args:
        texts: The documents to compare, one string per document.

    Returns:
        The result of ``cosine_similarity`` on the embeddings: entry
        ``[i, j]`` is the similarity of ``texts[i]`` and ``texts[j]``.
    """
    encoded = model.encode(texts, convert_to_tensor=True)
    # Move the tensor to host memory and convert it to NumPy before
    # passing it to scikit-learn.
    vectors = encoded.cpu().numpy()
    return cosine_similarity(vectors)
51
+
52
def plot_similarity_matrix(similarity_matrix, labels):
    """Render an annotated heatmap of ``similarity_matrix`` in the Streamlit page.

    Args:
        similarity_matrix: The pairwise similarity scores to display; each
            cell is annotated with its value to two decimals.
        labels: Tick labels used for both the x and y axes.

    Returns:
        None. The figure is sent to the page with ``st.pyplot``.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    # Draw on this figure's axes explicitly rather than through pyplot's
    # implicit "current axes", which is shared global state.
    sns.heatmap(
        similarity_matrix,
        annot=True,
        fmt='.2f',
        xticklabels=labels,
        yticklabels=labels,
        cmap='coolwarm',
        ax=ax,
    )
    ax.set_title("πŸ”— Text Similarity Matrix", fontsize=14)
    st.pyplot(fig)
    # pyplot keeps every figure it created until it is closed; without this
    # each rerun of the script would leave another figure in memory.
    plt.close(fig)
57
+
58
+ # About Tab
59
+ with tab1:
60
+ st.write("""
61
+ Welcome to the **Unsupervised Text Similarity Analysis** app! πŸš€
62
+ This app allows you to compare the similarity between multiple text documents.
63
+
64
+ ### How It Works:
65
+ 1. **Upload text documents** (TXT, PDF, DOCX).
66
+ 2. **Choose a similarity method** (TF-IDF or BERT Embeddings).
67
+ 3. **Compute similarity** to generate a similarity matrix.
68
+ 4. **Visualize results** with a heatmap and similarity percentages.
69
+
70
+ πŸ“Œ **Use Cases:** Plagiarism detection, document comparison, research analysis, and more!
71
+ """)
72
+
73
+ # Similarity Analysis Tab
74
with tab2:
    # Flow: collect uploads -> extract and normalize text -> require at least
    # two usable documents -> compute and display the similarity matrix.
    st.subheader("πŸ“‚ Upload Text Documents")
    uploaded_files = st.file_uploader(
        "Upload text documents",
        type=["txt", "pdf", "docx"],
        accept_multiple_files=True,
    )

    documents = []
    doc_names = []

    # `uploaded_files` is falsy until the user has uploaded something.
    for file in uploaded_files or []:
        if file.type == "text/plain":
            try:
                # getvalue() returns the full payload regardless of the
                # current read position; utf-8-sig also drops a leading BOM
                # so it does not end up glued to the first word.
                raw_text = file.getvalue().decode("utf-8-sig")
            except UnicodeDecodeError:
                st.warning(f"Skipped **{file.name}**: the file is not valid UTF-8 text.")
                continue
        elif file.type == "application/pdf":
            raw_text = extract_text_from_pdf(file)
        elif file.type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
            raw_text = extract_text_from_docx(file)
        else:
            # Tell the user instead of silently dropping the file.
            st.warning(f"Skipped **{file.name}**: unsupported file type ({file.type or 'unknown'}).")
            continue

        text = preprocess_text(raw_text)
        if not text.strip():
            # Nothing to compare (e.g. a PDF without extractable text).
            st.warning(f"Skipped **{file.name}**: no text could be extracted.")
            continue

        documents.append(text)
        doc_names.append(file.name)

    if len(documents) < 2:
        # A similarity comparison needs a pair of documents.
        st.info("πŸ“Œ Please upload at least two text documents to start the analysis.")
    else:
        similarity_method = st.selectbox("πŸ”Ž Choose Similarity Method", ["TF-IDF", "BERT Embeddings"], index=0)

        if st.button("πŸš€ Compute Similarity"):
            try:
                if similarity_method == "TF-IDF":
                    similarity_matrix = compute_tfidf_similarity(documents)
                else:
                    similarity_matrix = compute_bert_similarity(documents)
            except ValueError as error:
                # E.g. TF-IDF finds no usable tokens in any document.
                st.error(f"Could not compute similarity: {error}")
            else:
                st.subheader("πŸ“Š Similarity Matrix")
                plot_similarity_matrix(similarity_matrix, doc_names)

                st.subheader("πŸ”— Document Similarity Scores")
                # Report each unordered pair once (upper triangle only).
                for i, first_name in enumerate(doc_names):
                    for j in range(i + 1, len(doc_names)):
                        similarity_percentage = similarity_matrix[i, j] * 100
                        st.write(f"βœ… **{first_name}** and **{doc_names[j]}** have a similarity of **{similarity_percentage:.2f}%**")
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
streamlit
numpy
pandas
seaborn
matplotlib
scikit-learn
sentence-transformers
# app.py also does `import PyPDF2` and `import docx`; without these two
# packages the app fails at import time on a fresh install.
PyPDF2
python-docx