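"""AI Text Detection (Streamlit app).

Classifies input text as Human-Written or as generated by ChatGPT-3, ChatGPT-4,
Google Bard, or HuggingChat, combining a CNN with Ridge and Extra Trees models
selected by word count.
"""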

import streamlit as st
import zipfile
import os
import requests
import re
import pickle
from keras.models import load_model
from keras.preprocessing.text import Tokenizer  # class stored in tokenizer.pkl
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder  # class stored in label_encoder.pkl
from PIL import Image

# Custom headers for the HTTP requests (some hosts reject the default
# python-requests User-Agent)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
}

# Maps internal class labels to human-readable names (shared by all models)
AUTHOR_MAP = {
    "googlebard": "Google Bard",
    "gpt3": "ChatGPT-3",
    "gpt4": "ChatGPT-4",
    "huggingface": "HuggingChat",
    "human": "Human-Written"
}

def get_author_display_name(predicted_author, ridge_prediction, extra_trees_prediction):
    """Return display names for the CNN, Ridge, and Extra Trees predictions."""
    cnn_display_name = AUTHOR_MAP.get(predicted_author, predicted_author)
    ridge_display_name = AUTHOR_MAP.get(ridge_prediction[0], ridge_prediction[0])
    extra_trees_display_name = AUTHOR_MAP.get(extra_trees_prediction[0], extra_trees_prediction[0])
    return cnn_display_name, ridge_display_name, extra_trees_display_name
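
# Example (hypothetical inputs):
#   get_author_display_name("gpt4", ["human"], ["googlebard"])
#   -> ("ChatGPT-4", "Human-Written", "Google Bard")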

############# Download or check that files/folders exist ##############

# Download and extract the CNN model folder if it is not already present
zip_file_path = "my_authorship_model_zip.zip"
if not os.path.exists('my_authorship_model'):
    try:
        # Download the model
        model_url = 'https://jaifar.net/ADS/my_authorship_model_zip.zip'
        r = requests.get(model_url, headers=headers)
        r.raise_for_status()

        # Save the downloaded content
        with open(zip_file_path, "wb") as f:
            f.write(r.content)

        if not os.path.exists(zip_file_path):
            raise FileNotFoundError("Zip file does not exist")

        # Extract the model using zipfile
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall('my_authorship_model')
    except Exception as e:
        st.write(f"Failed to download or extract the model: {e}")
        st.stop()

# Show the app title on every run, whether or not the model was just downloaded
st.title("AI Text Detection")

# Download the tokenizer and label encoder if missing
file_urls = {
    'tokenizer.pkl': 'https://jaifar.net/ADS/tokenizer.pkl',
    'label_encoder.pkl': 'https://jaifar.net/ADS/label_encoder.pkl'
}

for filename, url in file_urls.items():
    if not os.path.exists(filename):  # Skip files that already exist
        try:
            r = requests.get(url, headers=headers)
            r.raise_for_status()
            with open(filename, 'wb') as f:
                f.write(r.content)
        except Exception as e:
            st.write(f"Failed to download {filename}: {e}")
            st.stop()

############ Download Ridge and Extra Trees files ############

def is_zip_file(file_path):
    return zipfile.is_zipfile(file_path)
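
# zipfile.is_zipfile() checks the file signature, so it catches the common
# failure mode where the server returns an HTML error page instead of a ZIP.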

def are_files_extracted(extracted_files, missing_files):
    return all(file in extracted_files for file in missing_files)

def check_and_download_files():
    """Download and extract content.zip if any per-bucket model file is missing."""
    file_names = [
        "truncated_260_to_284.xlsx_vectorizer.pkl",
        "not_trancated_full_paragraph.xlsx_extra_trees_model.pkl",
        "not_trancated_full_paragraph.xlsx_ridge_model.pkl",
        "not_trancated_full_paragraph.xlsx_vectorizer.pkl",
        "truncated_10_to_34.xlsx_extra_trees_model.pkl",
        "truncated_10_to_34.xlsx_ridge_model.pkl",
        "truncated_10_to_34.xlsx_vectorizer.pkl",
        "truncated_35_to_59.xlsx_extra_trees_model.pkl",
        "truncated_35_to_59.xlsx_ridge_model.pkl",
        "truncated_35_to_59.xlsx_vectorizer.pkl",
        "truncated_60_to_84.xlsx_extra_trees_model.pkl",
        "truncated_60_to_84.xlsx_ridge_model.pkl",
        "truncated_60_to_84.xlsx_vectorizer.pkl",
        "truncated_85_to_109.xlsx_extra_trees_model.pkl",
        "truncated_85_to_109.xlsx_ridge_model.pkl",
        "truncated_85_to_109.xlsx_vectorizer.pkl",
        "truncated_110_to_134.xlsx_extra_trees_model.pkl",
        "truncated_110_to_134.xlsx_ridge_model.pkl",
        "truncated_110_to_134.xlsx_vectorizer.pkl",
        "truncated_135_to_159.xlsx_extra_trees_model.pkl",
        "truncated_135_to_159.xlsx_ridge_model.pkl",
        "truncated_135_to_159.xlsx_vectorizer.pkl",
        "truncated_160_to_184.xlsx_extra_trees_model.pkl",
        "truncated_160_to_184.xlsx_ridge_model.pkl",
        "truncated_160_to_184.xlsx_vectorizer.pkl",
        "truncated_185_to_209.xlsx_extra_trees_model.pkl",
        "truncated_185_to_209.xlsx_ridge_model.pkl",
        "truncated_185_to_209.xlsx_vectorizer.pkl",
        "truncated_210_to_234.xlsx_extra_trees_model.pkl",
        "truncated_210_to_234.xlsx_ridge_model.pkl",
        "truncated_210_to_234.xlsx_vectorizer.pkl",
        "truncated_235_to_259.xlsx_extra_trees_model.pkl",
        "truncated_235_to_259.xlsx_ridge_model.pkl",
        "truncated_235_to_259.xlsx_vectorizer.pkl",
        "truncated_260_to_284.xlsx_extra_trees_model.pkl",
        "truncated_260_to_284.xlsx_ridge_model.pkl"
    ]

    missing_files = [f for f in file_names if not os.path.exists(f)]

    if missing_files:
        st.write("Some files are missing")
        try:
            url = 'https://jaifar.net/ADS/content.zip'
            response = requests.get(url, headers=headers)
            response.raise_for_status()

            with open('content.zip', 'wb') as zip_file:
                zip_file.write(response.content)

            if not is_zip_file('content.zip'):
                st.write("Downloaded content is not a ZIP file.")
                return

            with zipfile.ZipFile('content.zip', 'r') as zip_ref:
                zip_ref.extractall()

            extracted_files = os.listdir()
            if not are_files_extracted(extracted_files, missing_files):
                st.write("Not all missing files were extracted.")
                return

            st.write("content.zip downloaded and extracted successfully.")
        except Exception as e:
            st.write(f"Error downloading or extracting content.zip: {e}")

check_and_download_files()

############### Load CNN Model ############

# Load the saved model
loaded_model = load_model("my_authorship_model")

# Load the saved tokenizer and label encoder
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

with open('label_encoder.pkl', 'rb') as handle:
    label_encoder = pickle.load(handle)

max_length = 300
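# NOTE: max_length must match the sequence length the CNN was trained with;
# 300 is assumed here to be the training-time padding length.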

############### End Load CNN Model ############

# Predict the author of a new text with the CNN and return per-author probabilities
def predict_author(new_text, model, tokenizer, label_encoder):
    # Convert the text to a padded integer sequence, exactly as during training
    sequence = tokenizer.texts_to_sequences([new_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')

    prediction = model.predict(padded_sequence)
    predicted_label = label_encoder.inverse_transform([prediction.argmax()])[0]

    # Map each class index back to its author label
    probabilities = prediction[0]
    author_probabilities = {}
    for idx, prob in enumerate(probabilities):
        author = label_encoder.inverse_transform([idx])[0]
        author_probabilities[author] = prob

    return predicted_label, author_probabilities
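
# Example (hypothetical output, probabilities are illustrative only):
#   predict_author("Some sample text", loaded_model, tokenizer, label_encoder)
#   -> ("gpt4", {"googlebard": 0.01, "gpt3": 0.03, "gpt4": 0.91, ...})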

new_text = st.text_area("Input Your Text Here:")

# Button that triggers the prediction
press_me_button = st.button("Human or Robot?")

if press_me_button:
    ########## ML ##########
    word_count = len(re.findall(r'\w+', new_text))
    st.write(f"Word Count: {word_count}")

    # Choose the appropriate model files based on word count
    if 10 <= word_count <= 34:
        file_prefix = 'truncated_10_to_34.xlsx'
    elif 35 <= word_count <= 59:
        file_prefix = 'truncated_35_to_59.xlsx'
    elif 60 <= word_count <= 84:
        file_prefix = 'truncated_60_to_84.xlsx'
    elif 85 <= word_count <= 109:
        file_prefix = 'truncated_85_to_109.xlsx'
    elif 110 <= word_count <= 134:
        file_prefix = 'truncated_110_to_134.xlsx'
    elif 135 <= word_count <= 159:
        file_prefix = 'truncated_135_to_159.xlsx'
    elif 160 <= word_count <= 184:
        file_prefix = 'truncated_160_to_184.xlsx'
    elif 185 <= word_count <= 209:
        file_prefix = 'truncated_185_to_209.xlsx'
    elif 210 <= word_count <= 234:
        file_prefix = 'truncated_210_to_234.xlsx'
    elif 235 <= word_count <= 259:
        file_prefix = 'truncated_235_to_259.xlsx'
    elif 260 <= word_count <= 284:
        file_prefix = 'truncated_260_to_284.xlsx'
    else:
        file_prefix = 'not_trancated_full_paragraph.xlsx'
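
    # Each 25-word bucket has its own Ridge/Extra Trees models and vectorizer
    # (downloaded in content.zip above); anything outside 10-284 words falls
    # back to the models trained on full, untruncated paragraphs.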

    # Load the Ridge and Extra Trees models and their matching vectorizer
    with open(f"{file_prefix}_ridge_model.pkl", 'rb') as file:
        ridge_model = pickle.load(file)

    with open(f"{file_prefix}_extra_trees_model.pkl", 'rb') as file:
        extra_trees_model = pickle.load(file)

    with open(f"{file_prefix}_vectorizer.pkl", 'rb') as file:
        vectorizer = pickle.load(file)
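
    # The vectorizer must be the one fitted alongside these two models, which
    # is why all three files share the same prefix.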

    # Vectorize the input for the classical ML models
    user_input_transformed = vectorizer.transform([new_text])

    # ML predictions (each returns an array with a single label)
    ridge_prediction = ridge_model.predict(user_input_transformed)
    extra_trees_prediction = extra_trees_model.predict(user_input_transformed)

    # CNN prediction (tokenizes and pads the input internally)
    predicted_author, author_probabilities = predict_author(new_text, loaded_model, tokenizer, label_encoder)
    sorted_probabilities = sorted(author_probabilities.items(), key=lambda x: x[1], reverse=True)

    cnn_name, ridge_name, extra_trees_name = get_author_display_name(predicted_author, ridge_prediction, extra_trees_prediction)

    with st.expander("Prediction Details (Click Here)..."):
        st.write(f"Ridge: {ridge_name}")
        st.write(f"ExtraTrees: {extra_trees_name}")
        st.write(f"CNN: {cnn_name}")
        st.write("_" * 10)
        st.write("CNN Prediction Probabilities:")
        for author, prob in sorted_probabilities:
            display_name = AUTHOR_MAP.get(author, author)
            st.write(f"{display_name}: {prob * 100:.2f}%")
            st.progress(float(prob))  # st.progress expects a float in [0.0, 1.0]

    # Highest-probability CNN prediction (not used further below)
    max_cnn_prob_name = sorted_probabilities[0][0]
    max_cnn_prob = float(sorted_probabilities[0][1])
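
    # Decision logic, as implemented below: for texts under 256 words the CNN
    # prediction is reported first; for longer texts Ridge leads. Agreement
    # between models is treated as a sign of higher confidence.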
    if word_count < 10 or word_count > 1081:
        st.warning("For better prediction, input a text between 10 and 1081 words.", icon="ℹ️")
    elif word_count < 256:
        if ridge_prediction[0] == extra_trees_prediction[0] == predicted_author:
            st.success(f"Most likely written by: **{cnn_name}**", icon="✅")
            st.info("We are quite confident in the accuracy of this result.", icon="ℹ️")
        elif ridge_prediction[0] == predicted_author:
            st.success(f"Most likely written by: **{cnn_name}**", icon="✅")
            st.success(f"2nd most likely written by: **{extra_trees_name}**", icon="✅")
            st.write("_" * 30)
        elif extra_trees_prediction[0] == predicted_author:
            st.success(f"Most likely written by: **{cnn_name}**", icon="✅")
            st.success(f"2nd most likely written by: **{ridge_name}**", icon="✅")
            st.write("_" * 30)
        else:
            st.warning("Notice: this text is difficult to classify; it most likely falls into one of the following:", icon="⚠️")
            st.success(f"1- **{cnn_name}**", icon="✅")
            st.success(f"2- **{ridge_name}**", icon="✅")
            st.success(f"3- **{extra_trees_name}**", icon="✅")
    else:
        if ridge_prediction[0] == extra_trees_prediction[0] == predicted_author:
            st.success(f"Most likely written by: **{ridge_name}**", icon="✅")
            st.info("We are quite confident in the accuracy of this result.", icon="ℹ️")
        elif ridge_prediction[0] == predicted_author:
            st.success(f"Most likely written by: **{ridge_name}**", icon="✅")
            st.success(f"2nd most likely written by: **{extra_trees_name}**", icon="✅")
            st.write("_" * 30)
        elif ridge_prediction[0] == extra_trees_prediction[0]:
            st.success(f"Most likely written by: **{ridge_name}**", icon="✅")
            st.success(f"2nd most likely written by: **{cnn_name}**", icon="✅")
            st.write("_" * 30)
        else:
            st.warning("Notice: this text is difficult to classify; it most likely falls into one of the following:", icon="⚠️")
            st.success(f"1- **{ridge_name}**", icon="✅")
            st.success(f"2- **{cnn_name}**", icon="✅")
            st.success(f"3- **{extra_trees_name}**", icon="✅")

# FAQ sections built with expanders
st.subheader("More about the AI Text Detector Project:")

# Short description
with st.expander("What is this project about?"):
    st.write("""
    This AI Text Detector tells whether a text was written by a human or by a specific
    Large Language Model (LLM) such as ChatGPT-3, ChatGPT-4, Google Bard, or HuggingChat.
    Ridge, Extra Trees, and a CNN are the machine learning algorithms used to build it.
    """)

# System details
with st.expander("How does the AI Text Detector work?"):
    st.write("""
    The system was trained with a deep learning model on a dataset of 140,546 paragraphs,
    varying in length from 10 to 1090 words.
    It achieves an accuracy of 0.9964 with a validation loss of 0.094.
    """)
    # Fetch the accuracy chart from the URL
    accuracy_image_request = requests.get("https://jaifar.net/ADS/best_accuracy.png", headers=headers)
    accuracy_image_request.raise_for_status()

    # Save the downloaded content
    image_path = "best_accuracy.png"
    with open(image_path, "wb") as f:
        f.write(accuracy_image_request.content)

    # Open the image and display it with Streamlit
    accuracy_image = Image.open(image_path)
    st.image(accuracy_image, caption='Best Accuracy', use_column_width=True)
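
# To run locally (assuming this file is saved as app.py):
#   streamlit run app.py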