ziadmostafa commited on
Commit
640b4b2
·
1 Parent(s): 3bccccd

added app files

Browse files
Files changed (8) hide show
  1. .gitattributes +0 -35
  2. README.md +87 -1
  3. app.py +118 -0
  4. pipeline/dataset.py +38 -0
  5. pipeline/main.py +35 -0
  6. pipeline/model_pipeline.py +169 -0
  7. requirements.txt +7 -0
  8. samples.json +0 -0
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -11,4 +11,90 @@ license: apache-2.0
11
  short_description: MGT-Detection
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  short_description: MGT-Detection
12
  ---
13
 
14
+ # MGT-Detection
15
+
16
+ ## Overview
17
+ MGT-Detection (Machine-Generated Text Detection) is a project designed to classify and detect whether a given text is human-written or machine-generated. The project leverages state-of-the-art natural language processing (NLP) models and pipelines to achieve accurate classification results. It includes tools for training, evaluating, and deploying models for text classification tasks.
18
+
19
+ ## Features
20
+ - **Text Classification**: Detects whether a text is human-written or machine-generated.
21
+ - **Model Training Pipeline**: Includes hyperparameter optimization, dataset preparation, and model training.
22
+ - **Evaluation**: Provides metrics such as accuracy, precision, recall, and F1 score.
23
+ - **Dataset Management**: Tools for preparing and tokenizing datasets.
24
+ - **Model Deployment**: Save and load fine-tuned models for deployment.
25
+
26
+ ## Project Structure
27
+ ```
28
+ MGT-Detection/
29
+ ├── app.py # Main application for text classification
30
+ ├── pipeline/
31
+ │ ├── dataset.py # Dataset preparation and management
32
+ │ ├── model_pipeline.py # Model training and evaluation pipeline
33
+ │ ├── main.py # Entry point for running the training pipeline
34
+ ├── samples.json # Sample dataset for testing
35
+ ```
36
+
37
+
38
+ ## Usage
39
+ ### Running the Application
40
+ To launch the text classification application:
41
+ ```bash
42
+ python app.py
43
+ ```
44
+
45
+ ### Training a Model
46
+ To train a model using the pipeline:
47
+ ```bash
48
+ python pipeline/main.py \
49
+ --file_path <path_to_dataset> \
50
+ --out_path <output_directory> \
51
+ --model_name <model_name> \
52
+ --num_labels 2 \
53
+ --sample_frac 1.0 \
54
+ --num_trials 5 \
55
+ --num_epochs 5
56
+ ```
57
+
58
+ ### Dataset Preparation
59
+ Ensure your dataset is in JSON format with the following structure:
60
+ ```json
61
+ [
62
+ {
63
+ "text": "<text_sample>",
64
+ "label": "<human_text or machine_text>"
65
+ },
66
+ ...
67
+ ]
68
+ ```
69
+
70
+ ## Key Components
71
+ ### `app.py`
72
+ - Provides a user interface for classifying text as human-written or machine-generated.
73
+
74
+ ### `pipeline/model_pipeline.py`
75
+ - Contains functions for model training, hyperparameter optimization, and evaluation.
76
+
77
+ ### `pipeline/dataset.py`
78
+ - Handles dataset preparation, tokenization, and saving/loading datasets.
79
+
80
+ ### `samples.json`
81
+ - A sample dataset for testing the application.
82
+
83
+ ## Requirements
84
+ - Python 3.8+
85
+ - Transformers
86
+ - Datasets
87
+ - Optuna
88
+ - Gradio
89
+ - Scikit-learn
90
+
91
+ ## Contributing
92
+ Contributions are welcome! Please fork the repository and submit a pull request with your changes.
93
+
94
+ ## License
95
+ This project is licensed under the Apache License 2.0, matching the `license: apache-2.0` metadata at the top of this README. See the LICENSE file for details.
96
+
97
+ ## Acknowledgments
98
+ - Hugging Face Transformers
99
+ - Optuna for hyperparameter optimization
100
+ - Gradio for building the user interface
app.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+ from pathlib import Path
4
+ import gradio as gr
5
+ import numpy as np
6
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
7
+
8
+ # Constants
9
+ MIN_WORDS = 50
10
+ MAX_WORDS = 500
11
+ SAMPLE_JSON_PATH = Path('samples.json')
12
+
13
+ # Load models
14
def load_model(model_name):
    """Build a text-classification pipeline for a pretrained checkpoint.

    Loads the tokenizer and the sequence-classification model identified by
    ``model_name`` and wraps them in a ``transformers`` pipeline configured
    with ``truncation=True``, ``max_length=512`` and ``top_k=4``.
    """
    # Tokenizer first, then model, mirroring the load order callers rely on.
    components = {
        'tokenizer': AutoTokenizer.from_pretrained(model_name),
        'model': AutoModelForSequenceClassification.from_pretrained(model_name),
    }
    return pipeline(
        'text-classification',
        truncation=True,
        max_length=512,
        top_k=4,
        **components,
    )
18
+
19
+ classifier = load_model("ziadmostafa/MGT-Detection_deberta-base")
20
+
21
+ # Load sample essays
22
# The encoding is given explicitly so decoding the sample file does not
# depend on the platform's default locale.
with open(SAMPLE_JSON_PATH, 'r', encoding='utf-8') as f:
    demo_essays = json.load(f)
24
+
25
+ # Global variable to store the current essay index
26
+ current_essay_index = None
27
+
28
+ TEXT_CLASS_MAPPING = {
29
+ 'LABEL_0': 'Human-Written',
30
+ 'LABEL_2': 'Machine-Generated'
31
+ }
32
+
33
def process_result(text):
    """Classify ``text`` and return the display name of the most likely class.

    Runs the module-level ``classifier`` on ``text``, keeps only the predicted
    labels that appear in ``TEXT_CLASS_MAPPING`` and returns the mapped name
    with the highest score.

    Returns an empty string when none of the predicted labels is mapped,
    rather than raising ``ValueError`` from ``max()`` on an empty mapping.
    """
    # The first element of the classifier output is the list of
    # {'label': ..., 'score': ...} entries for this single input.
    predictions = classifier(text)[0]

    scores_by_class = {
        TEXT_CLASS_MAPPING[entry['label']]: entry['score']
        for entry in predictions
        if entry['label'] in TEXT_CLASS_MAPPING
    }
    if not scores_by_class:
        return ""

    # Return only the label with the highest score
    return max(scores_by_class, key=scores_by_class.get)
43
+
44
def update_result(name):
    """Return the predicted class name for ``name``, or ``""`` for empty input."""
    # An empty textbox value is returned as-is without calling the classifier.
    return "" if name == '' else process_result(name)
48
+
49
def active_button(input_text):
    """Return the "Check Origin" button, enabled only for valid-length text.

    The button is interactive when the whitespace-separated word count of
    ``input_text`` lies between ``MIN_WORDS`` and ``MAX_WORDS`` inclusive.
    A ``None`` value is treated as empty text.
    """
    # Use the module-level limits instead of repeating the literals here.
    word_count = len((input_text or "").split())
    return gr.Button(
        "Check Origin",
        variant="primary",
        interactive=MIN_WORDS <= word_count <= MAX_WORDS,
    )
53
+
54
def clear_inputs():
    """Return an empty text value and a disabled "Check Origin" button."""
    disabled_button = gr.Button("Check Origin", variant="primary", interactive=False)
    return "", disabled_button
56
+
57
def count_words(text):
    """Return the word-count status line for ``text``.

    Words are counted by splitting on whitespace. The limits shown come from
    the module-level ``MIN_WORDS`` and ``MAX_WORDS`` constants so the message
    cannot drift from the values they define.
    """
    return f'{len(text.split())}/{MAX_WORDS} words (Minimum {MIN_WORDS} words)'
59
+
60
+ css = """
61
+ body, .gradio-container {
62
+ font-family: Arial, sans-serif;
63
+ }
64
+ .gr-input, .gr-textarea {
65
+ }
66
+ .class-intro {
67
+ padding: 15px;
68
+ margin-bottom: 20px;
69
+ border-radius: 5px;
70
+ }
71
+ .class-intro h2 {
72
+ margin-top: 0;
73
+ }
74
+ .class-intro p {
75
+ margin-bottom: 5px;
76
+ }
77
+ """
78
+
79
+ class_intro_html = """
80
+ <div class="class-intro">
81
+ <h2>Text Classes</h2>
82
+ <p><strong>Human-Written:</strong> Original text created by humans.</p>
83
+ <p><strong>Machine-Generated:</strong> Text created by AI from basic prompts, without style instructions.</p>
84
+ </div>
85
+ """
86
+
87
# Gradio UI: a textbox with a live word counter, a "Check Origin" button that
# runs the classifier, and a clear button that resets the text, the result
# label and the button state.
with gr.Blocks(css=css) as demo:
    # The opening tag must be <center> to match the closing </center>.
    gr.Markdown("""<h1><center>Machine Generated Text Detection</center></h1>""")
    gr.HTML(class_intro_html)

    with gr.Row():
        input_text = gr.Textbox(placeholder="Paste your text here...", label="Text", lines=10, max_lines=15)

    with gr.Row():
        wc = gr.Markdown("0/500 words (Minimum 50 words)")
    with gr.Row():
        check_button = gr.Button("Check Origin", variant="primary", interactive=False)
        clear_button = gr.ClearButton([input_text], variant="stop")

    out = gr.Label(label='Result')
    # Also reset the result label when the clear button is pressed.
    clear_button.add(out)

    check_button.click(fn=update_result, inputs=[input_text], outputs=out)

    # Keep the word counter in sync with every change to the textbox.
    input_text.change(count_words, input_text, wc, show_progress=False)
    # Enable or disable the check button as the user types.
    input_text.input(
        active_button,
        [input_text],
        [check_button],
    )

    # Clearing also returns the check button to its disabled state.
    clear_button.click(
        clear_inputs,
        inputs=[],
        outputs=[input_text, check_button],
    )
117
+
118
+ demo.launch(share=False)
pipeline/dataset.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+ from datasets import Dataset, DatasetDict
4
+ from sklearn.model_selection import train_test_split
5
+
6
+
7
def read_json(file_name):
    """Read a dataset file and return its records as a list.

    Two layouts are accepted:

    * JSON Lines - one JSON value per line; blank lines are skipped.
    * A single JSON array spanning the whole file.

    Raises ``json.JSONDecodeError`` when the content matches neither layout.
    """
    with open(file_name, 'r', encoding='utf-8') as file:
        content = file.read()

    if content.lstrip().startswith('['):
        try:
            return json.loads(content)
        except json.JSONDecodeError:
            # Not one array after all (e.g. JSON Lines whose rows are
            # themselves arrays); fall back to line-by-line parsing.
            pass

    return [json.loads(line) for line in content.splitlines() if line.strip()]
10
+
11
def json_dataset_parser(jsons_list, labels_dict):
    """Convert parsed records into a ``text``/``labels`` DataFrame.

    Each record contributes its ``"text"`` value and the integer that
    ``labels_dict`` maps its ``"label"`` value to.
    """
    # A single pass over the input keeps text and label lookups paired per record.
    rows = [(record["text"], labels_dict[record["label"]]) for record in jsons_list]
    return pd.DataFrame({
        "text": [text for text, _ in rows],
        "labels": [label_id for _, label_id in rows],
    })
17
+
18
def prepare_dataset(file_path, labels_dict, test_size=0.15, val_size=0.15, sample_frac=1.0, random_state=None):
    """Load a labelled text file and split it into train/val/test datasets.

    Args:
        file_path: Path of the dataset file passed to ``read_json``.
        labels_dict: Mapping from each record's ``"label"`` value to an id.
        test_size: Fraction of all rows held out for the test split.
        val_size: Fraction of all rows held out for the validation split.
        sample_frac: Fraction of rows kept (after shuffling) before splitting.
        random_state: Optional seed applied to the sampling and to both
            splits; ``None`` keeps the splits non-deterministic.

    Returns:
        A ``DatasetDict`` with ``'train'``, ``'val'`` and ``'test'`` entries.
    """
    jsons_list = read_json(file_path)
    df = json_dataset_parser(jsons_list, labels_dict)
    df = df.sample(frac=sample_frac, random_state=random_state).reset_index(drop=True)

    train_val, test = train_test_split(
        df, test_size=test_size, stratify=df['labels'], random_state=random_state
    )
    # val_size is expressed relative to the full data, so rescale it to the
    # portion that remains once the test split has been removed.
    train, val = train_test_split(
        train_val,
        test_size=val_size / (1 - test_size),
        stratify=train_val['labels'],
        random_state=random_state,
    )

    # The splits carry a shuffled pandas index; preserve_index=False keeps it
    # from being stored as an extra "__index_level_0__" column.
    dataset = DatasetDict({
        'train': Dataset.from_pandas(train, preserve_index=False),
        'val': Dataset.from_pandas(val, preserve_index=False),
        'test': Dataset.from_pandas(test, preserve_index=False),
    })
    return dataset
32
+
33
+
34
def save_tokenized_dataset(tokenized_datasets, save_path):
    """Write ``tokenized_datasets`` to ``save_path`` using ``save_to_disk``."""
    destination = save_path
    tokenized_datasets.save_to_disk(destination)
36
+
37
def load_tokenized_dataset(load_path):
    """Return the ``DatasetDict`` stored at ``load_path``."""
    restored = DatasetDict.load_from_disk(load_path)
    return restored
pipeline/main.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from model_pipeline import run_training_pipeline
3
+
4
+
5
def main():
    """Parse the command-line options and run the training pipeline.

    Exits with an argparse error when ``--num_labels`` is too small to cover
    every label id defined in ``labels_dict``.
    """
    parser = argparse.ArgumentParser(description="Run the training pipeline for the model.")

    parser.add_argument('--file_path', type=str, required=True, help='Path to the input dataset file.')
    parser.add_argument('--out_path', type=str, default=".", help='Path to the saving model, tokenizer and dataset.')
    parser.add_argument('--model_name', type=str, required=True, help='Name of the model to be trained.')
    parser.add_argument('--num_labels', type=int, default=4, help='Number of labels for the classification task.')
    parser.add_argument('--sample_frac', type=float, default=1.0, help='Fraction of the dataset to sample for training.')
    parser.add_argument('--num_trials', type=int, default=5, help='Number of trials for hyperparameter search.')
    parser.add_argument('--num_epochs', type=int, default=5, help='Number of epochs for training.')

    args = parser.parse_args()

    labels_dict = {
        "human_text": 0,
        "machine_text": 1,
    }

    # Every label id must be a valid class index for the classification head;
    # reject a too-small --num_labels here rather than failing during training.
    required_labels = max(labels_dict.values()) + 1
    if args.num_labels < required_labels:
        parser.error(
            f"--num_labels must be at least {required_labels} to cover the configured labels"
        )

    run_training_pipeline(
        file_path=args.file_path,
        labels_dict=labels_dict,
        model_name=args.model_name,
        num_labels=args.num_labels,
        sample_frac=args.sample_frac,
        num_trials=args.num_trials,
        num_epochs=args.num_epochs,
        save_dir=args.out_path
    )
33
+
34
+ if __name__ == "__main__":
35
+ main()
pipeline/model_pipeline.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import optuna
3
+ import numpy as np
4
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
5
+ from transformers import (
6
+ AutoTokenizer, AutoModelForSequenceClassification,
7
+ Trainer, TrainingArguments, EarlyStoppingCallback
8
+ )
9
+
10
+ from dataset import prepare_dataset, save_tokenized_dataset
11
+
12
+ '''
13
+ tokenization functions
14
+ '''
15
def tokenize_function(examples, tokenizer):
    """Tokenize ``examples['text']`` padded and truncated to 512 tokens."""
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(examples['text'], **tokenizer_options)
17
+
18
def tokenize_and_prepare_dataset(dataset, tokenizer):
    """Tokenize ``dataset`` in batches and return it in torch format.

    The raw ``text`` column is removed after tokenization.
    """
    def _tokenize_batch(batch):
        return tokenize_function(batch, tokenizer)

    encoded = dataset.map(_tokenize_batch, batched=True).remove_columns(["text"])
    encoded.set_format("torch")
    return encoded
23
+
24
+ '''
25
+ training & hyperparameter optimization functions
26
+ '''
27
def get_model_and_tokenizer(model_name, num_labels):
    """Load the tokenizer and classification model for ``model_name``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model is created with
        ``num_labels`` output classes.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    classifier_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )
    return classifier_model, loaded_tokenizer
31
+
32
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``; the predicted class
    is the argmax of the predictions along axis 1.
    """
    scores, labels = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels,
        predicted_classes,
        average='weighted',
        zero_division=0,
    )
    return dict(
        accuracy=accuracy_score(labels, predicted_classes),
        f1=f1,
        precision=precision,
        recall=recall,
    )
43
+
44
def objective(trial, model, tokenized_datasets, max_epochs, metric='eval_f1'):
    """Optuna objective: train with sampled hyperparameters and score on ``val``.

    Samples a learning rate, a weight decay and an epoch count (between 1 and
    ``max_epochs``) from ``trial``, trains ``model`` in place on the
    ``'train'`` split and returns the ``metric`` entry of the evaluation
    results computed on the ``'val'`` split.
    """
    # Sampling order (learning_rate, weight_decay, epoch) is kept as-is.
    sampled = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "weight_decay": trial.suggest_float("weight_decay", 1e-7, 1e-1, log=True),
        "num_train_epochs": trial.suggest_int("epoch", 1, max_epochs),
    }

    trial_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        load_best_model_at_end=True,
        **sampled,
    )

    trial_trainer = Trainer(
        model=model,
        args=trial_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['val'],
        compute_metrics=compute_metrics,
    )
    trial_trainer.train()

    return trial_trainer.evaluate()[metric]
72
+
73
def train_model(model_name, tokenized_datasets, num_labels, num_trials=5, max_epochs=5):
    """Search hyperparameters with Optuna, then train a final model.

    Every Optuna trial and the final training run start from a freshly loaded
    pretrained checkpoint. Sharing one model instance would let each run
    continue from the weights left behind by the previous one, which biases
    the trial scores and the final model.

    Args:
        model_name: Checkpoint name or path to fine-tune.
        tokenized_datasets: Mapping with tokenized ``'train'`` and ``'val'`` splits.
        num_labels: Number of output classes for the classification head.
        num_trials: Number of Optuna trials to run.
        max_epochs: Upper bound for the per-trial epoch count.

    Returns:
        A ``(trainer, model)`` tuple for the final training run.
    """
    def _fresh_model():
        # Reload the pretrained weights so no state carries over between runs.
        return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    study = optuna.create_study(direction='maximize')
    study.optimize(
        lambda trial: objective(trial, _fresh_model(), tokenized_datasets, max_epochs),
        n_trials=num_trials,
    )

    print(f"Best hyperparameters for {model_name}:", study.best_params)
    print(f"Best F1 score for {model_name}:", study.best_value)

    # Train with best hyperparameters
    best_params = study.best_params
    model = _fresh_model()
    best_training_args = TrainingArguments(
        output_dir=f"./results_{model_name}",
        num_train_epochs=best_params["epoch"],
        eval_strategy="steps",
        save_strategy="steps",
        logging_strategy="steps",
        learning_rate=best_params["learning_rate"],
        weight_decay=best_params["weight_decay"],
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        load_best_model_at_end=True,
        eval_steps=500,
        logging_steps=500,
    )

    trainer = Trainer(
        model=model,
        args=best_training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['val'],
        compute_metrics=compute_metrics,
    )

    trainer.train()
    return trainer, model
108
+
109
+
110
+
111
+ '''
112
+ Evaluation
113
+ '''
114
+
115
def evaluate_model(trainer, tokenized_datasets):
    """Evaluate ``trainer`` on the ``'test'`` split and print the metrics.

    A single ``trainer.predict`` pass supplies both the trainer-reported
    metrics and the raw predictions, so the test set is not run through the
    model twice. Precision, recall and F1 are weighted averages and use
    ``zero_division=0``, matching ``compute_metrics``.

    Returns:
        A dict with ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    # metric_key_prefix="eval" keeps the "eval_*" key names in the printout.
    prediction_output = trainer.predict(tokenized_datasets['test'], metric_key_prefix="eval")
    print("Test set results:", prediction_output.metrics)

    preds = np.argmax(prediction_output.predictions, axis=-1)
    true_labels = prediction_output.label_ids

    accuracy = accuracy_score(true_labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, preds, average='weighted', zero_division=0
    )

    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
130
+
131
+ '''
132
+ model and tokenizer save and load
133
+ '''
134
def save_model_and_tokenizer(model, tokenizer, save_path):
    """Save ``model`` and then ``tokenizer`` into ``save_path``."""
    for artifact in (model, tokenizer):
        artifact.save_pretrained(save_path)
137
+
138
def load_model_and_tokenizer(model_name, load_path, num_labels):
    """Load a saved model and tokenizer from ``load_path``.

    ``model_name`` is accepted but not used by the loading code.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    restored_model = AutoModelForSequenceClassification.from_pretrained(
        load_path,
        num_labels=num_labels,
    )
    restored_tokenizer = AutoTokenizer.from_pretrained(load_path)
    return restored_model, restored_tokenizer
142
+
143
+
144
+ '''
145
+ Integrated pipeline
146
+ '''
147
def run_training_pipeline(file_path, labels_dict, model_name, num_labels, sample_frac=1.0, num_trials=5, num_epochs=5, save_dir = "."):
    """Run dataset preparation, training, evaluation and saving end to end.

    Args:
        file_path: Path of the labelled dataset file.
        labels_dict: Mapping from label names in the file to integer ids.
        model_name: Checkpoint name or path to fine-tune.
        num_labels: Number of output classes for the classification head.
        sample_frac: Fraction of the dataset to use.
        num_trials: Number of hyperparameter-search trials.
        num_epochs: Upper bound on epochs per trial.
        save_dir: Directory under which the tokenized dataset and the
            fine-tuned model/tokenizer are written.
    """
    # Prepare dataset
    dataset = prepare_dataset(file_path, labels_dict, sample_frac=sample_frac)

    print(f"Training {model_name}...")

    # Only the tokenizer is needed at this stage; train_model loads the model
    # itself, so loading it here as well would just be discarded work.
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Tokenize dataset
    tokenized_datasets = tokenize_and_prepare_dataset(dataset, tokenizer)

    # Save tokenized dataset
    save_tokenized_dataset(tokenized_datasets, f"{save_dir}/tokenized_{model_name}")

    # Train model
    trainer, trained_model = train_model(model_name, tokenized_datasets, num_labels, num_trials, num_epochs)

    # Evaluate model
    evaluate_model(trainer, tokenized_datasets)

    # Save model and tokenizer
    save_model_and_tokenizer(trained_model, tokenizer, f"{save_dir}/fine_tuned_{model_name}")
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ transformers==4.30.2
2
+ safetensors==0.3.1
3
+ gradio==3.6.0
4
+ numpy==1.24.3
5
+ httpx==0.23.0
6
+ httpcore==0.15.0
7
+ torch==2.0.1
samples.json ADDED
The diff for this file is too large to render. See raw diff