"""Flask web application that flags plagiarised sentences.

A submitted sentence is looked up in ``cleaned_data.csv``; when a labelled
plagiarism pair is found, both sentences are embedded with Word2Vec and the
concatenated sequence is scored by a pre-trained LSTM model.
"""
import logging
import pickle
import re
from typing import List

import contractions
import nltk
import numpy as np
import pandas as pd
import spacy
from flask import Flask, render_template, request
from keras.models import load_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

app = Flask(__name__)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Populated by load_models(); remain None until the first successful load.
lstm_model = None
word2vec_model = None
nlp = None
stop_words = None
models_loaded = False

# Sentinel returned by find_closest_source() when no labelled pair matches.
NO_SOURCE_FOUND_MESSAGE = "No corresponding original sentence found."

# Pre-compiled patterns used by preprocess_text().
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
_WHITESPACE_RE = re.compile(r'\s+')

data = pd.read_csv('cleaned_data.csv')
# The original (source) sentences.
source_texts = data['source_txt'].tolist()


def find_closest_source(input_text: str) -> str:
    """Return the source sentence paired with a known plagiarised sentence.

    The input is stripped and lower-cased, then compared for exact equality
    against the equally normalised ``plagiarism_txt`` column, restricted to
    rows whose ``label`` equals 1.

    Args:
        input_text: Sentence submitted by the user.

    Returns:
        The ``source_txt`` of the first matching row, or
        ``NO_SOURCE_FOUND_MESSAGE`` when no row matches.

    Raises:
        Exception: Any error raised during the lookup is logged and re-raised.
    """
    try:
        # Normalise the input the same way as the dataset column.
        input_cleaned = input_text.strip().lower()

        # Rows whose plagiarised text equals the input and that are labelled 1.
        normalised_column = data['plagiarism_txt'].str.strip().str.lower()
        matched_rows = data[
            (normalised_column == input_cleaned) & (data['label'] == 1)
        ]

        if matched_rows.empty:
            return NO_SOURCE_FOUND_MESSAGE
        return matched_rows.iloc[0]['source_txt']
    except Exception as e:
        logger.error(f"Phát hiện lỗi trong tìm câu gốc: {str(e)}")
        raise


def load_models():
    """Load the LSTM model, the Word2Vec model and the NLP tools.

    Sets the module-level ``lstm_model``, ``word2vec_model``, ``stop_words``
    and ``nlp`` globals and flips ``models_loaded`` to True on success.

    Raises:
        Exception: Any loading error is logged and re-raised.
    """
    global lstm_model, word2vec_model, nlp, stop_words, models_loaded
    try:
        # LSTM model
        lstm_model = load_model('best_model.h5')
        logger.info(f"LSTM input shape: {lstm_model.input_shape}")

        # Word2Vec model
        # SECURITY NOTE(review): pickle.load executes arbitrary code embedded
        # in the file; only ship a word2vec_model.pkl from a trusted source.
        with open('word2vec_model.pkl', 'rb') as f:
            word2vec_model = pickle.load(f)
        logger.info(f"Word2Vec vector size: {word2vec_model.vector_size}")

        # NLP tools
        nltk.download('stopwords', quiet=True)
        stop_words = set(nltk.corpus.stopwords.words('english'))
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

        models_loaded = True
        logger.info("Tải xong model")
    except Exception as e:
        logger.error(f"Phát hiện lỗi: {str(e)}")
        raise


@app.before_request
def before_first_request():
    """Load the models if they have not been loaded yet.

    Registered with ``app.before_request``, so it runs ahead of every
    request; the ``models_loaded`` flag makes later calls a no-op.
    """
    if not models_loaded:
        load_models()


@app.route('/')
def home():
    """Render the landing page."""
    return render_template('index.html')


@app.route('/detect', methods=['POST'])
def detect():
    """Handle a detection request and render the verdict.

    Reads the ``text`` form field, looks up its paired source sentence and,
    when one exists, scores the pair with the LSTM model. Any processing
    error is logged and reported to the user as a generic error message.
    """
    if not models_loaded:
        return render_template(
            'index.html',
            error="Models are not loaded yet. Please try again.",
            input_text="",
        )

    try:
        input_text = request.form['text'].strip()

        # Look up the source sentence paired with the input.
        closest_source = find_closest_source(input_text)
        if closest_source == NO_SOURCE_FOUND_MESSAGE:
            return render_template(
                'index.html',
                result="Not plagiarism",
                confidence="N/A",
                input_text=input_text,
                closest_source=closest_source,
            )

        # Pre-process both sentences for the deep-learning model.
        tokens_input = preprocess_text(input_text)
        tokens_source = preprocess_text(closest_source)

        # Embed each token list as a fixed-size sequence.
        vec_source = text_to_sequence(tokens_source)
        vec_input = text_to_sequence(tokens_input)

        # Concatenate the two sequences along the feature axis.
        combined_vec = np.concatenate((vec_source, vec_input), axis=2)
        logger.info(f"Combined vector shape: {combined_vec.shape}")

        prediction = lstm_model.predict(combined_vec, verbose=0)[0][0]
        is_plagiarism = prediction > 0.5
        # Report the probability of whichever class was chosen.
        confidence = prediction if is_plagiarism else 1 - prediction

        result = {
            'text': "Plagiarism Detected" if is_plagiarism else "No Plagiarism Detected",
            'confidence': f"{confidence * 100:.1f}%",
        }

        return render_template(
            'index.html',
            result=result['text'],
            confidence=result['confidence'],
            input_text=input_text,
            closest_source=closest_source,
        )
    except Exception as e:
        # The error is swallowed here, so keep the traceback in the log.
        logger.exception(f"Detection error: {str(e)}")
        return render_template(
            'index.html',
            error="An error occurred during processing.",
            input_text=request.form.get('text', ''),
        )


def preprocess_text(text: str) -> List[str]:
    """Normalise a sentence and return its lemmatised tokens.

    Steps: lower-case, expand contractions, drop every character that is not
    an ASCII letter or whitespace, collapse whitespace, remove stop words,
    then lemmatise with spaCy.

    Args:
        text: Raw sentence.

    Returns:
        The non-blank lemmas of the remaining tokens.

    Raises:
        Exception: Any preprocessing error is logged and re-raised.
    """
    try:
        text = text.lower()
        text = contractions.fix(text)
        # Keeping only ASCII letters and whitespace also removes punctuation,
        # so no separate punctuation pass is needed.
        text = _NON_LETTER_RE.sub('', text)
        text = _WHITESPACE_RE.sub(' ', text).strip()

        tokens = [word for word in text.split() if word not in stop_words]
        doc = nlp(" ".join(tokens))
        return [token.lemma_ for token in doc if token.lemma_.strip()]
    except Exception as e:
        logger.error(f"Preprocessing error: {str(e)}")
        raise


def text_to_sequence(tokens: List[str]) -> np.ndarray:
    """Convert tokens into a fixed-size embedding sequence for the LSTM.

    Tokens missing from the Word2Vec vocabulary are skipped. Each vector is
    truncated or zero-padded to half of the LSTM feature dimension (two
    sequences are concatenated later), and the sequence is truncated or
    zero-padded to the LSTM's number of timesteps.

    Args:
        tokens: Pre-processed tokens of one sentence.

    Returns:
        Array of shape ``(1, max_timesteps, lstm_embedding_dim)``.

    Raises:
        Exception: Any conversion error is logged and re-raised.
    """
    try:
        max_timesteps = lstm_model.input_shape[1]
        # Half of the feature axis, because two vectors are concatenated.
        lstm_embedding_dim = lstm_model.input_shape[2] // 2
        w2v_embedding_dim = word2vec_model.vector_size

        word_vectors = []
        for word in tokens:
            if word not in word2vec_model.wv:
                continue
            word_vector = word2vec_model.wv[word]

            # Fit the vector to the dimension expected by the LSTM.
            if w2v_embedding_dim > lstm_embedding_dim:
                word_vector = word_vector[:lstm_embedding_dim]
            elif w2v_embedding_dim < lstm_embedding_dim:
                padded_vector = np.zeros(lstm_embedding_dim)
                padded_vector[:w2v_embedding_dim] = word_vector
                word_vector = padded_vector

            word_vectors.append(word_vector)
            if len(word_vectors) >= max_timesteps:
                break

        # Zero-pad the sequence when it is shorter than required.
        if len(word_vectors) < max_timesteps:
            padding = [np.zeros(lstm_embedding_dim)] * (max_timesteps - len(word_vectors))
            word_vectors.extend(padding)

        return np.array(word_vectors).reshape(1, max_timesteps, lstm_embedding_dim)
    except Exception as e:
        logger.error(f"Sequence conversion error: {str(e)}")
        raise


if __name__ == '__main__':
    try:
        load_models()
        app.run(host='0.0.0.0', port=7860)
    except Exception as e:
        logger.critical(f"Application failed to start: {str(e)}")
        # Exit non-zero so supervisors can detect the failed start.
        raise SystemExit(1) from e