fine_tuned_model / ensemble.py
shubham-t's picture
Deploy backend
b4151cf
raw
history blame contribute delete
834 Bytes
from model import semantic_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def lexical_similarity(text1, text2):
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([text1, text2])
score = cosine_similarity(vectors[0:1], vectors[1:2])[0][0]
return round(score, 4)
def compute_scores(text1, text2):
sem_score = semantic_similarity(text1, text2)
lex_score = lexical_similarity(text1, text2)
final_score = (0.75 * sem_score) + (0.25 * lex_score)
return {
"semantic_score": round(sem_score, 4),
"lexical_score": round(lex_score, 4),
"final_score": round(final_score, 4)
}
def classify(score, threshold=0.75):
return "Plagiarized" if score >= threshold else "Not Plagiarized"