Upload 7 files

Browse files

Files changed (7) hide show

README_paraphrase_detection.md +124 -0
config (1).json +26 -0
model (2).safetensors +3 -0
special_tokens_map (1).json +37 -0
tokenizer (1).json +0 -0
tokenizer_config (1).json +58 -0
vocab (1).txt +0 -0

README_paraphrase_detection.md ADDED Viewed

	@@ -0,0 +1,124 @@

+# Paraphrase Detection Pipeline using Transformers
+This repository provides a complete pipeline to fine-tune a transformer model for **Paraphrase Detection** using the PAWS dataset.
+---
+## Steps
+### 1. Load Dataset
+Load the PAWS dataset, which contains pairs of sentences with labels indicating whether or not they are paraphrases.
+```python
+from datasets import load_dataset
+dataset = load_dataset("paws", "labeled_final")
+```
+### 2. Preprocess and Tokenize
+Tokenize each sentence pair together, padding or truncating every example to a fixed length of 128 tokens.
+```python
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/paraphrase-MiniLM-L6-v2")
+def preprocess_function(examples):
+    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, padding="max_length", max_length=128)
+tokenized_datasets = dataset.map(preprocess_function, batched=True)
+```
+### 3. Load Model
+Load the pre-trained MiniLM encoder with a two-label sequence classification head for paraphrase detection; the head is fine-tuned in the next step.
+```python
+from transformers import AutoModelForSequenceClassification
+model = AutoModelForSequenceClassification.from_pretrained("sentence-transformers/paraphrase-MiniLM-L6-v2", num_labels=2)
+```
+### 4. Fine-tune the Model
+Set up the training arguments and fine-tune the model using the Trainer API.
+```python
+from transformers import TrainingArguments, Trainer
+import evaluate
+training_args = TrainingArguments(
+    output_dir="./paraphrase-detector",
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    learning_rate=2e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=64,
+    num_train_epochs=3,
+    weight_decay=0.01,
+    load_best_model_at_end=True,
+    metric_for_best_model="accuracy"
+)
+accuracy = evaluate.load("accuracy")
+def compute_metrics(eval_preds):
+    logits, labels = eval_preds
+    predictions = logits.argmax(axis=-1)
+    return accuracy.compute(predictions=predictions, references=labels)
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=tokenized_datasets["train"],
+    eval_dataset=tokenized_datasets["validation"],
+    tokenizer=tokenizer,
+    compute_metrics=compute_metrics,
+)
+trainer.train()
+trainer.save_model("paraphrase-detector")
+```
+### 5. Evaluate
+Evaluate the fine-tuned model on the validation split.
+```python
+eval_results = trainer.evaluate()
+print(eval_results)
+```
+### 6. Inference
+Use the fine-tuned model for paraphrase detection inference.
+```python
+from transformers import pipeline
+paraphrase_pipeline = pipeline("text-classification", model="paraphrase-detector", tokenizer=tokenizer)
+example = paraphrase_pipeline({
+    "text": "How old are you?",
+    "text_pair": "What is your age?"
+})
+print(example)
+```
+---
+## Requirements
+- `datasets`
+- `transformers`
+- `evaluate`
+Install dependencies with:
+```bash
+pip install datasets transformers evaluate
+```
+---
+## Author
+Your Name - your.email@example.com
+---
+## License
+MIT License

config (1).json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "architectures": [
+    "BertForSequenceClassification"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "problem_type": "single_label_classification",
+  "torch_dtype": "float16",
+  "transformers_version": "4.51.3",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}

model (2).safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:082b6a4554b030aa6f347938550c83b72a796483f2e9c2a68a3220dfa9eb25fd
+size 45439980

special_tokens_map (1).json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "cls_token": {
+    "content": "[CLS]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "[MASK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "[PAD]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "[SEP]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer (1).json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config (1).json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

vocab (1).txt ADDED Viewed

The diff for this file is too large to render. See raw diff