# Spaces: Build error
| from flask import Flask, render_template, request | |
| import numpy as np | |
| import re | |
| import contractions | |
| import nltk | |
| import spacy | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import pandas as pd | |
| from keras.models import load_model | |
| import pickle | |
| import logging | |
| from typing import List | |
# Flask application object for this module.
app = Flask(__name__)

# Module-wide logging at INFO level, plus this module's own logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model and NLP handles; they stay None until load_models() fills them in.
lstm_model = word2vec_model = nlp = stop_words = None
models_loaded = False

# Reference dataset, read once at import time. The code in this file uses
# its 'source_txt', 'plagiarism_txt' and 'label' columns.
data = pd.read_csv('cleaned_data.csv')
source_texts = data['source_txt'].tolist()  # the original (source) sentences
# Look up the original sentence paired with a plagiarised input.
def find_closest_source(input_text: str) -> str:
    """Return the source sentence recorded for ``input_text``.

    The input is stripped and lower-cased, then compared for exact equality
    against the equally normalised ``plagiarism_txt`` column of the
    module-level ``data`` frame; only rows with ``label == 1`` count.

    Args:
        input_text: Text submitted for checking.

    Returns:
        The ``source_txt`` value of the first matching row, or the literal
        message ``"No corresponding original sentence found."`` when no row
        matches.

    Raises:
        Exception: Any error raised during the lookup is logged and
            re-raised unchanged.
    """
    try:
        needle = input_text.strip().lower()
        normalised_column = data['plagiarism_txt'].str.strip().str.lower()
        is_hit = (normalised_column == needle) & (data['label'] == 1)
        hits = data[is_hit]
        if hits.empty:
            return "No corresponding original sentence found."
        # The first match wins when several rows share the same text.
        return hits.iloc[0]['source_txt']
    except Exception as e:
        logger.error(f"Phát hiện lỗi trong tìm câu gốc: {str(e)}")
        raise
# Load the LSTM classifier, the Word2Vec embeddings and the NLP helpers.
def load_models():
    """Populate the module-level model and NLP globals.

    Loads the Keras model from ``best_model.h5``, unpickles the Word2Vec
    model from ``word2vec_model.pkl``, downloads the NLTK stop-word corpus,
    builds the English stop-word set and loads spaCy's ``en_core_web_sm``
    pipeline with the parser and NER components disabled.
    ``models_loaded`` is set to True only after every step has succeeded.

    Raises:
        Exception: Any loading error is logged and re-raised unchanged.
    """
    global lstm_model, word2vec_model, nlp, stop_words, models_loaded
    try:
        lstm_model = load_model('best_model.h5')
        logger.info(f"LSTM input shape: {lstm_model.input_shape}")

        # NOTE(review): pickle.load can execute code embedded in the file,
        # so word2vec_model.pkl must come from a trusted source.
        with open('word2vec_model.pkl', 'rb') as pickle_file:
            word2vec_model = pickle.load(pickle_file)
        logger.info(f"Word2Vec vector size: {word2vec_model.vector_size}")

        nltk.download('stopwords', quiet=True)
        english_stop_words = nltk.corpus.stopwords.words('english')
        stop_words = set(english_stop_words)
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

        models_loaded = True
        logger.info("Tải xong model")
    except Exception as e:
        logger.error(f"Phát hiện lỗi: {str(e)}")
        raise
# Load the models on first use if they have not been loaded yet.
# NOTE(review): no Flask hook decorator is visible on this function in the
# source as captured - confirm how (or whether) it gets registered.
def before_first_request():
    """Call ``load_models()`` unless ``models_loaded`` is already true."""
    if models_loaded:
        return
    load_models()
# NOTE(review): no route decorator is visible on this view in the source as
# captured - confirm which URL it is bound to.
def home():
    """Render the ``index.html`` template with no context variables."""
    page = render_template('index.html')
    return page
# NOTE(review): no route decorator is visible on this view in the source as
# captured - confirm which URL and HTTP methods it is bound to.
def detect():
    """Classify the submitted form text and render the outcome.

    Reads ``request.form['text']``, looks up its paired source sentence and,
    when one exists, preprocesses and embeds both sentences, concatenates
    the two embeddings along the last axis and passes them to the LSTM
    model. A score above 0.5 is reported as plagiarism.

    Returns:
        The rendered ``index.html`` page carrying either the verdict with
        its confidence, or an error message when the models are not loaded
        or processing fails.
    """
    if not models_loaded:
        return render_template(
            'index.html',
            error="Models are not loaded yet. Please try again.",
            input_text="",
        )

    try:
        input_text = request.form['text'].strip()

        # find_closest_source signals "no match" with this exact message.
        closest_source = find_closest_source(input_text)
        if closest_source == "No corresponding original sentence found.":
            return render_template(
                'index.html',
                result="Not plagiarism",
                confidence="N/A",
                input_text=input_text,
                closest_source=closest_source,
            )

        # Clean and lemmatise both sentences.
        input_tokens = preprocess_text(input_text)
        source_tokens = preprocess_text(closest_source)

        # Embed each sentence, then join source and input on the last axis.
        source_sequence = text_to_sequence(source_tokens)
        input_sequence = text_to_sequence(input_tokens)
        combined_vec = np.concatenate(
            (source_sequence, input_sequence), axis=2
        )
        logger.info(f"Combined vector shape: {combined_vec.shape}")

        prediction = lstm_model.predict(combined_vec, verbose=0)[0][0]
        # Confidence is the score of whichever side of 0.5 was chosen.
        if prediction > 0.5:
            verdict = "Plagiarism Detected"
            confidence = prediction
        else:
            verdict = "No Plagiarism Detected"
            confidence = 1 - prediction

        return render_template(
            'index.html',
            result=verdict,
            confidence=f"{confidence * 100:.1f}%",
            input_text=input_text,
            closest_source=closest_source,
        )
    except Exception as e:
        logger.error(f"Detection error: {str(e)}")
        return render_template(
            'index.html',
            error="An error occurred during processing.",
            input_text=request.form.get('text', ''),
        )
def preprocess_text(text: str) -> List[str]:
    """Normalise ``text`` and return its lemmas.

    Steps, in order: lower-case, expand contractions, drop punctuation,
    drop every character that is not an ASCII letter or whitespace,
    collapse whitespace runs, remove words found in the module-level
    ``stop_words`` set, then lemmatise with the module-level ``nlp``
    pipeline.

    Args:
        text: Raw sentence.

    Returns:
        The lemmas of the remaining words, skipping blank lemmas.

    Raises:
        Exception: Any processing error is logged and re-raised unchanged.
    """
    try:
        cleaned = contractions.fix(text.lower())

        # Applied top to bottom: punctuation, non-letters, whitespace runs.
        substitutions = (
            (r'[^\w\s]', ''),
            (r'[^a-zA-Z\s]', ''),
            (r'\s+', ' '),
        )
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        cleaned = cleaned.strip()

        kept_words = [w for w in cleaned.split() if w not in stop_words]
        parsed = nlp(" ".join(kept_words))

        lemmas = []
        for token in parsed:
            lemma = token.lemma_
            if lemma.strip():
                lemmas.append(lemma)
        return lemmas
    except Exception as e:
        logger.error(f"Preprocessing error: {str(e)}")
        raise
def _fit_vector(vector, source_dim, target_dim):
    """Truncate or zero-pad ``vector`` from ``source_dim`` to ``target_dim``."""
    if source_dim == target_dim:
        return vector
    if source_dim > target_dim:
        return vector[:target_dim]
    padded = np.zeros(target_dim)
    padded[:source_dim] = vector
    return padded


def text_to_sequence(tokens: List[str]) -> np.ndarray:
    """Turn tokens into one fixed-size embedding sequence for the LSTM.

    Tokens missing from the Word2Vec vocabulary are skipped. Each remaining
    word vector is truncated or zero-padded to half of the LSTM's last input
    dimension (half, because two sequences are concatenated before
    prediction). At most ``lstm_model.input_shape[1]`` vectors are kept, and
    shorter sequences are filled up with zero vectors.

    Args:
        tokens: Preprocessed words of one sentence.

    Returns:
        An array of shape ``(1, max_timesteps, embedding_dim)``.

    Raises:
        Exception: Any conversion error is logged and re-raised unchanged.
    """
    try:
        max_timesteps = lstm_model.input_shape[1]
        lstm_embedding_dim = lstm_model.input_shape[2] // 2
        w2v_embedding_dim = word2vec_model.vector_size

        rows = []
        for token in tokens:
            if token not in word2vec_model.wv:
                continue
            rows.append(
                _fit_vector(
                    word2vec_model.wv[token],
                    w2v_embedding_dim,
                    lstm_embedding_dim,
                )
            )
            # Stop as soon as the sequence is full.
            if len(rows) >= max_timesteps:
                break

        # Fill the remaining timesteps with zero vectors.
        while len(rows) < max_timesteps:
            rows.append(np.zeros(lstm_embedding_dim))

        return np.array(rows).reshape(1, max_timesteps, lstm_embedding_dim)
    except Exception as e:
        logger.error(f"Sequence conversion error: {str(e)}")
        raise
if __name__ == '__main__':
    # Load every model first, then serve on all interfaces, port 7860.
    try:
        load_models()
        app.run(host='0.0.0.0', port=7860)
    except Exception as e:
        # Log with the traceback and exit non-zero, so whatever launched the
        # process can tell that startup failed (previously it exited 0).
        logger.critical(
            f"Application failed to start: {str(e)}", exc_info=True
        )
        raise SystemExit(1) from e