Upload latest checkpoint with model card
Browse files
- README.md +44 -0
- test/dataset_dict.json +1 -0
- test/test/data-00000-of-00001.arrow +3 -0
- test/test/dataset_info.json +22 -0
- test/test/state.json +13 -0
- train/dataset_dict.json +1 -0
- train/train/data-00000-of-00001.arrow +3 -0
- train/train/dataset_info.json +16 -0
- train/train/state.json +13 -0
README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# Dataset Card for Custom Text Dataset
|
| 3 |
+
|
| 4 |
+
## Dataset Name
|
| 5 |
+
Custom Text Dataset
|
| 6 |
+
|
| 7 |
+
## Overview
|
| 8 |
+
This dataset contains text data for training language models.
|
| 9 |
+
The data was collected from various sources, including books, articles,
|
| 10 |
+
and web pages.
|
| 11 |
+
|
| 12 |
+
## Composition
|
| 13 |
+
- **Number of records**: 101
|
| 14 |
+
- **Fields**: `sentence`, `labels`
|
| 15 |
+
- **Size**: 510 KB
|
| 16 |
+
|
| 17 |
+
## Collection Process
|
| 18 |
+
The data was collected using web scraping and manual extraction
|
| 19 |
+
from public domain sources.
|
| 20 |
+
|
| 21 |
+
## Preprocessing
|
| 22 |
+
- Removed HTML tags and special characters
|
| 23 |
+
- Tokenized text into sentences
|
| 24 |
+
|
| 25 |
+
## How to Use
|
| 26 |
+
```python
|
| 27 |
+
from datasets import load_from_disk
|
| 28 |
+
dataset = load_from_disk("path_to_dataset/train")  # directory written by save_to_disk
|
| 29 |
+
|
| 30 |
+
for example in dataset["train"]:
|
| 31 |
+
    print(example["sentence"])
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
## Evaluation
|
| 35 |
+
In addition to training, this dataset can be used to evaluate text generation models (a `test` split is provided).
|
| 36 |
+
Common evaluation metrics include ROUGE and BLEU.
|
| 37 |
+
|
| 38 |
+
## Limitations
|
| 39 |
+
The dataset may contain outdated or biased information.
|
| 40 |
+
Users should be aware of these limitations when using the data.
|
| 41 |
+
|
| 42 |
+
## Ethical Considerations
|
| 43 |
+
Privacy: Ensure that the data does not contain personal information.
|
| 44 |
+
Bias: Be aware of potential biases in the data.
|
test/dataset_dict.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"splits": ["test"]}
|
test/test/data-00000-of-00001.arrow
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1e6aa13a3e10a33624931f6c220c9618528323886bd7b7ac334af681b8dc0646
|
| 3 |
+
size 346576
|
test/test/dataset_info.json
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"citation": "",
|
| 3 |
+
"description": "",
|
| 4 |
+
"features": {
|
| 5 |
+
"sentence": {
|
| 6 |
+
"feature": {
|
| 7 |
+
"dtype": "string",
|
| 8 |
+
"_type": "Value"
|
| 9 |
+
},
|
| 10 |
+
"_type": "Sequence"
|
| 11 |
+
},
|
| 12 |
+
"labels": {
|
| 13 |
+
"feature": {
|
| 14 |
+
"dtype": "string",
|
| 15 |
+
"_type": "Value"
|
| 16 |
+
},
|
| 17 |
+
"_type": "Sequence"
|
| 18 |
+
}
|
| 19 |
+
},
|
| 20 |
+
"homepage": "",
|
| 21 |
+
"license": ""
|
| 22 |
+
}
|
test/test/state.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_data_files": [
|
| 3 |
+
{
|
| 4 |
+
"filename": "data-00000-of-00001.arrow"
|
| 5 |
+
}
|
| 6 |
+
],
|
| 7 |
+
"_fingerprint": "a966e5e39a3a551f",
|
| 8 |
+
"_format_columns": null,
|
| 9 |
+
"_format_kwargs": {},
|
| 10 |
+
"_format_type": null,
|
| 11 |
+
"_output_all_columns": false,
|
| 12 |
+
"_split": null
|
| 13 |
+
}
|
train/dataset_dict.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"splits": ["train"]}
|
train/train/data-00000-of-00001.arrow
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c3b84a293ed7afd9641f578c760558feab774e12174775ffef3bd6d130873903
|
| 3 |
+
size 1400
|
train/train/dataset_info.json
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"citation": "",
|
| 3 |
+
"description": "",
|
| 4 |
+
"features": {
|
| 5 |
+
"sentence": {
|
| 6 |
+
"dtype": "string",
|
| 7 |
+
"_type": "Value"
|
| 8 |
+
},
|
| 9 |
+
"labels": {
|
| 10 |
+
"dtype": "string",
|
| 11 |
+
"_type": "Value"
|
| 12 |
+
}
|
| 13 |
+
},
|
| 14 |
+
"homepage": "",
|
| 15 |
+
"license": ""
|
| 16 |
+
}
|
train/train/state.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_data_files": [
|
| 3 |
+
{
|
| 4 |
+
"filename": "data-00000-of-00001.arrow"
|
| 5 |
+
}
|
| 6 |
+
],
|
| 7 |
+
"_fingerprint": "a1df46296853828f",
|
| 8 |
+
"_format_columns": null,
|
| 9 |
+
"_format_kwargs": {},
|
| 10 |
+
"_format_type": null,
|
| 11 |
+
"_output_all_columns": false,
|
| 12 |
+
"_split": null
|
| 13 |
+
}
|