# Streamlit document plagiarism checker.
# (Removed pasted build-log residue — "Spaces:" / "Build error" lines — that
# was not part of the program.)
# --- Standard library ---
import base64
import io
import re
from collections import Counter
from math import sqrt

# --- Third-party ---
import matplotlib.pyplot as plt
import nltk
import streamlit as st
from docx import Document
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize

# Fetch the NLTK corpora/models the app relies on; each call is a no-op
# when the resource is already installed locally.
for _resource in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(_resource, quiet=True)
def read_file_content(uploaded_file):
    """Extract the text of an uploaded file.

    Supports plain-text (decoded as UTF-8) and .docx uploads (paragraphs
    joined with single spaces). The MIME type reported by the upload widget
    selects the parser.

    Raises:
        ValueError: if the file's MIME type is neither of the two supported.
    """
    mime = uploaded_file.type
    if mime == "text/plain":
        return uploaded_file.getvalue().decode("utf-8")
    if mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        document = Document(uploaded_file)
        return " ".join(para.text for para in document.paragraphs)
    raise ValueError("Unsupported file type")
def preprocess_text(text):
    """Normalize *text* into a list of content tokens.

    Lowercases, strips punctuation (anything that is not a word character or
    whitespace), tokenizes with NLTK, and drops English stopwords.
    """
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    stop_words = set(stopwords.words('english'))
    return [token for token in word_tokenize(cleaned) if token not in stop_words]
def cosine_similarity(vec1, vec2):
    """Cosine similarity between two sparse vectors given as term->count dicts.

    Returns a float in [0, 1]; 0.0 when either vector has zero magnitude
    (e.g. an empty dict), avoiding division by zero.
    """
    shared_keys = set(vec1) & set(vec2)
    dot_product = sum(vec1[key] * vec2[key] for key in shared_keys)
    magnitude = sqrt(sum(v ** 2 for v in vec1.values())) * \
        sqrt(sum(v ** 2 for v in vec2.values()))
    if not magnitude:
        return 0.0
    return float(dot_product) / magnitude
def calculate_word_similarity(text1, text2):
    """Word-level cosine similarity between two texts, as a percentage (0-100).

    Both texts are preprocessed (lowercased, stopwords removed) and turned
    into term-frequency vectors before comparison.
    """
    counts1 = Counter(preprocess_text(text1))
    counts2 = Counter(preprocess_text(text2))
    return 100 * cosine_similarity(counts1, counts2)
def calculate_sentence_similarity(text1, text2):
    """Average best-match similarity of each sentence of *text1* vs *text2*.

    For every sentence in text1, find its highest word-level similarity
    against any sentence of text2, then average those maxima. Returns 0.0
    when text1 yields no sentences (and a 0 score per sentence when text2
    yields none).
    """
    source_sentences = sent_tokenize(text1)
    target_sentences = sent_tokenize(text2)
    best_scores = [
        max((calculate_word_similarity(src, tgt) for tgt in target_sentences),
            default=0)
        for src in source_sentences
    ]
    return sum(best_scores) / len(best_scores) if best_scores else 0.0
def longest_common_subsequence(text1, text2):
    """Sentence-level LCS: sentences occurring in both texts in the same order.

    Standard dynamic-programming LCS over the sentence lists (exact string
    equality), followed by a backtrack to recover the matched sentences.
    Returns them as a list in document order.
    """
    sents_a = sent_tokenize(text1)
    sents_b = sent_tokenize(text2)
    rows, cols = len(sents_a), len(sents_b)
    # table[i][j] = LCS length of the first i sentences of A vs first j of B.
    table = [[0] * (cols + 1) for _ in range(rows + 1)]
    for i in range(1, rows + 1):
        for j in range(1, cols + 1):
            if sents_a[i - 1] == sents_b[j - 1]:
                table[i][j] = table[i - 1][j - 1] + 1
            else:
                table[i][j] = max(table[i - 1][j], table[i][j - 1])
    # Walk back from the bottom-right corner to collect the matched sentences.
    matches = []
    i, j = rows, cols
    while i and j:
        if sents_a[i - 1] == sents_b[j - 1]:
            matches.append(sents_a[i - 1])
            i -= 1
            j -= 1
        elif table[i - 1][j] > table[i][j - 1]:
            i -= 1
        else:
            j -= 1
    matches.reverse()
    return matches
def suggest_rewrites(sentence):
    """Suggest a rewrite of *sentence* using WordNet synonyms.

    Content words (nouns, verbs, adjectives, adverbs by POS tag) are replaced
    with the first lemma of their first WordNet synset when it differs from
    the original word; all other tokens pass through unchanged. Tokens are
    re-joined with single spaces.
    """
    rewritten = []
    for word, tag in nltk.pos_tag(word_tokenize(sentence)):
        replacement = word
        # Only swap content words: N* (nouns), V* (verbs), J* (adjectives),
        # R* (adverbs).
        if tag[:1] in ('N', 'V', 'J', 'R'):
            synsets = wordnet.synsets(word)
            if synsets:
                candidate = synsets[0].lemmas()[0].name()
                if candidate != word:
                    replacement = candidate
        rewritten.append(replacement)
    return " ".join(rewritten)
def calculate_plagiarism_percentage(word_similarity, sentence_similarity):
    """Overall plagiarism score: the mean of the two similarity percentages."""
    return 0.5 * (word_similarity + sentence_similarity)
def create_bar_chart(word_similarity, sentence_similarity, plagiarism_percentage):
    """Render a three-bar summary chart of the similarity metrics in Streamlit."""
    labels = ["Word-Level Similarity", "Sentence-Level Similarity",
              "Plagiarism Percentage"]
    values = [word_similarity, sentence_similarity, plagiarism_percentage]
    fig, axes = plt.subplots()
    axes.bar(labels, values, color=["blue", "green", "red"])
    axes.set_ylabel("Percentage")
    axes.set_ylim(0, 100)  # scores are percentages, so fix the y-axis range
    axes.set_title("Document Similarity and Plagiarism")
    st.pyplot(fig)
def download_report(word_similarity, sentence_similarity, plagiarism_percentage, matched_sequences, reworded_matches):
    """Offer the results as a downloadable plain-text report.

    Builds the report body, base64-encodes it, and renders a data-URI
    download link into the Streamlit page.
    """
    lines = [
        f"Word-Level Similarity: {word_similarity:.2f}%",
        f"Sentence-Level Similarity: {sentence_similarity:.2f}%",
        f"Plagiarism Percentage: {plagiarism_percentage:.2f}%",
        "",
        "Matched Sequences from the Created Document:",
    ]
    lines.extend(f"{i}. {match}" for i, match in enumerate(matched_sequences, 1))
    lines.append("")
    lines.append("Rewritten Suggestions to Avoid Plagiarism:")
    lines.extend(f"{i}. {reworded}" for i, reworded in enumerate(reworded_matches, 1))
    report = "\n".join(lines) + "\n"
    b64 = base64.b64encode(report.encode("utf-8")).decode()
    href = f'<a href="data:text/plain;base64,{b64}" download="plagiarism_report.txt">Download Report</a>'
    st.markdown(href, unsafe_allow_html=True)
def main():
    """Streamlit entry point: upload two documents, score them, and report.

    Waits until both the original and the created document are uploaded,
    then computes word/sentence similarity, an overall plagiarism score,
    the matched sentences, and rewrite suggestions, rendering everything
    to the page. Unsupported file types surface as an error banner.
    """
    st.title("High-Accuracy Document Plagiarism Checker")
    doc1 = st.file_uploader("Upload Original Document", type=["txt", "docx"])
    doc2 = st.file_uploader("Upload Created Document", type=["txt", "docx"])
    if doc1 is None or doc2 is None:
        return  # nothing to do until both uploads are present
    try:
        text1 = read_file_content(doc1)  # original document
        text2 = read_file_content(doc2)  # created document

        word_similarity = calculate_word_similarity(text1, text2)
        sentence_similarity = calculate_sentence_similarity(text1, text2)
        plagiarism_percentage = calculate_plagiarism_percentage(
            word_similarity, sentence_similarity)
        # Sentences shared (in order) between the two documents.
        matched_sequences = longest_common_subsequence(text1, text2)

        st.write(f"Word-Level Cosine Similarity: {word_similarity:.2f}%")
        st.write(f"Sentence-Level Similarity: {sentence_similarity:.2f}%")
        st.write(f"Plagiarism Percentage: {plagiarism_percentage:.2f}%")
        create_bar_chart(word_similarity, sentence_similarity, plagiarism_percentage)

        # Human-readable verdict, bucketed by overall score.
        if plagiarism_percentage < 20:
            st.write("The created document is mostly original.")
        elif plagiarism_percentage < 50:
            st.write("There are some similarities between the created and original documents.")
        else:
            st.write("The created document has significant similarities with the original and may contain plagiarism.")

        if not matched_sequences:
            st.write("No significant matched content found from the created document.")
        else:
            st.subheader("Matched Content from the Created Document:")
            for i, match in enumerate(matched_sequences, 1):
                st.write(f"{i}. {match}")
            # Offer synonym-based rewrites for each matched sentence.
            reworded_matches = [suggest_rewrites(match) for match in matched_sequences]
            st.subheader("Rewritten Suggestions to Avoid Plagiarism:")
            for i, reworded in enumerate(reworded_matches, 1):
                st.write(f"{i}. {reworded}")
            download_report(word_similarity, sentence_similarity,
                            plagiarism_percentage, matched_sequences,
                            reworded_matches)
    except ValueError as e:
        st.error(f"Error: {str(e)}")


if __name__ == "__main__":
    main()