Spaces:
Runtime error
Runtime error
| import nltk | |
| import numpy as np | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| nltk.download('punkt') | |
| nltk.download('stopwords') | |
| from nltk.tokenize import word_tokenize | |
| from nltk.corpus import stopwords | |
| # Preprocess text | |
def preprocess_text(text):
    """Normalize text for comparison.

    Lowercases ``text``, tokenizes it with NLTK's ``word_tokenize``, keeps
    only tokens that are purely alphanumeric (``str.isalnum``) and are not
    English stopwords, and joins the survivors with single spaces.

    Args:
        text: The raw string to normalize.

    Returns:
        A single string of the remaining lowercase tokens, space-separated.
    """
    # Build the stopword lookup once per call. Evaluating
    # stopwords.words('english') inside the comprehension would repeat the
    # call for every token and test membership against a list.
    english_stopwords = set(stopwords.words('english'))
    tokens = word_tokenize(text.lower())
    kept = [
        token
        for token in tokens
        if token.isalnum() and token not in english_stopwords
    ]
    return ' '.join(kept)
| # Calculate text similarity using TF-IDF and cosine similarity | |
def calculate_similarity(text1, text2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    Both inputs are first normalized with ``preprocess_text``; a fresh
    ``TfidfVectorizer`` is then fitted on the pair, and the two resulting
    rows are compared with ``cosine_similarity``.

    Args:
        text1: The first raw text.
        text2: The second raw text.

    Returns:
        The single similarity score taken from position ``[0][0]`` of the
        pairwise similarity result.
    """
    documents = [preprocess_text(text1), preprocess_text(text2)]
    vectors = TfidfVectorizer().fit_transform(documents)
    # Row 0 is text1, row 1 is text2 (same order as `documents`).
    pairwise = cosine_similarity(vectors[0], vectors[1])
    return pairwise[0][0]
| # Replace 'text1' and 'text2' with the text you want to compare | |
| text1 = "This is the original text." | |
| text2 = "π£ Exciting news! π The Falcon 180B has landed, revolutionizing the world of open LLMs. π¦ Want to know how to deploy it on Amazon SageMaker? Check out this informative blog post by Philipp Schmid, Technical Lead at Hugging Face and AWS ML HERO. π€ Get insights on setting up your dev environment, hardware requirements, running inferences, and more. Don't miss out! Read the full article here π Deploy Falcon 180B on Amazon SageMaker. Stay tuned for more Falcon 180B updates! π #AI #MachineLearning #AmazonSageMaker." | |
# Score the two sample texts against each other
similarity = calculate_similarity(text1, text2)
# A similarity at or above this value is reported as plagiarism (adjust as needed)
threshold = 0.8
# Report the verdict
print("Plagiarism detected!" if similarity >= threshold else "No plagiarism detected.")