RahulGanapathy committed on
Commit
9500e4e
·
verified ·
1 Parent(s): e4fc48c

Upload 8 files

Browse files
Files changed (8) hide show
  1. .gitattributes +35 -35
  2. README.md +165 -1
  3. config.json +25 -0
  4. model.safetensors +3 -0
  5. special_tokens_map.json +37 -0
  6. tokenizer.json +0 -0
  7. tokenizer_config.json +63 -0
  8. vocab.txt +0 -0
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,167 @@
1
  ---
2
- license: cc-by-4.0
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ library_name: transformers
3
+ tags: [fake-news-detection, NLP, classification, transformers, DistilBERT]
4
  ---
5
+
6
+ # Model Card for Fake News Detection Model
7
+
8
+ ## Model Summary
9
+
10
+ This is a fine-tuned DistilBERT model for **fake news detection**. It classifies news articles as either **real** or **fake** based on textual content. The model has been trained on a labeled dataset consisting of true and false news articles collected from various sources.
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ - **Developed by:** Dhruv Pal
17
+ - **Finetuned from:** `distilbert-base-uncased`
18
+ - **Language:** English
19
+ - **Model type:** Transformer-based text classification model
20
+ - **License:** MIT
21
+ - **Intended Use:** Fake news detection on social media and news websites
22
+
23
+ ### Model Sources
24
+
25
+ - **Repository:** [Hugging Face Model Hub](https://huggingface.co/your-model-id)
26
+ - **Paper (if applicable):** N/A
27
+ - **Demo (if applicable):** N/A
28
+
29
+ ## Uses
30
+
31
+ ### Direct Use
32
+
33
+ - This model can be used to detect whether a given news article is **real or fake**.
34
+ - It can be integrated into fact-checking platforms, misinformation detection systems, and social media moderation tools.
35
+
36
+ ### Downstream Use
37
+
38
+ - Can be further fine-tuned on domain-specific fake news datasets.
39
+ - Useful for media companies, journalists, and researchers studying misinformation.
40
+
41
+ ### Out-of-Scope Use
42
+
43
+ - This model is **not designed for generating news content**.
44
+ - It may not work well for languages other than English.
45
+ - Not suitable for fact-checking complex claims requiring external knowledge.
46
+
47
+ ## Bias, Risks, and Limitations
48
+
49
+ ### Risks
50
+
51
+ - The model may be biased towards certain topics, sources, or writing styles based on the dataset used for training.
52
+ - There is a possibility of **false positives (real news misclassified as fake)** or **false negatives (fake news misclassified as real)**.
53
+ - Model performance can degrade on out-of-distribution samples.
54
+
55
+ ### Recommendations
56
+
57
+ - Users should **not rely solely** on this model for determining truthfulness.
58
+ - It is recommended to **use human verification** and **cross-check information** from multiple sources.
59
+
60
+ ## How to Use the Model
61
+
62
+ You can load the model using `transformers` and use it for inference as shown below:
63
+
64
+ ```python
65
+ from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
66
+ import torch
67
+
68
+ tokenizer = DistilBertTokenizerFast.from_pretrained("your-model-id")
69
+ model = DistilBertForSequenceClassification.from_pretrained("your-model-id")
70
+
71
+ def predict(text):
72
+     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
73
+     outputs = model(**inputs)
74
+     probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
75
+     return "Fake News" if torch.argmax(probs) == 1 else "Real News"
76
+
77
+ text = "Breaking: Scientists discover a new element!"
78
+ print(predict(text))
79
+ ```
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ The model was trained on a dataset consisting of **news articles labeled as real or fake**. The dataset includes information from reputable sources and misinformation websites.
86
+
87
+ ### Training Procedure
88
+
89
+ - **Preprocessing:**
90
+   - Tokenization using `DistilBertTokenizerFast`
91
+   - Removal of stop words and punctuation
92
+   - Converting text to lowercase
93
+
94
+ - **Training Configuration:**
95
+   - **Model:** `distilbert-base-uncased`
96
+   - **Optimizer:** AdamW
97
+   - **Batch size:** 16
98
+   - **Epochs:** 3
99
+   - **Learning rate:** 2e-5
100
+
101
+ ### Compute Resources
102
+
103
+ - **Hardware:** NVIDIA Tesla T4 (Google Colab)
104
+ - **Training Time:** ~2 hours
105
+
106
+ ## Evaluation
107
+
108
+ ### Testing Data
109
+
110
+ - The model was evaluated on a held-out test set of **10,000 news articles**.
111
+
112
+ ### Metrics
113
+
114
+ - **Accuracy:** 92%
115
+ - **F1 Score:** 90%
116
+ - **Precision:** 91%
117
+ - **Recall:** 89%
118
+
119
+ ### Results
120
+
121
+ | Metric | Score |
122
+ |----------|-------|
123
+ | Accuracy | 92% |
124
+ | F1 Score | 90% |
125
+ | Precision | 91% |
126
+ | Recall | 89% |
127
+
128
+ ## Environmental Impact
129
+
130
+ - **Hardware Used:** NVIDIA Tesla T4
131
+ - **Total Compute Time:** ~2 hours
132
+ - **Carbon Emissions:** Not reported here; they can be estimated from the hardware and compute time above using the [ML Impact Calculator](https://mlco2.github.io/impact#compute)
133
+
134
+ ## Technical Specifications
135
+
136
+ ### Model Architecture
137
+
138
+ - The model is based on **DistilBERT**, a lightweight transformer architecture that reduces computation while retaining accuracy.
139
+
140
+ ### Dependencies
141
+
142
+ - `transformers`
143
+ - `torch`
144
+ - `datasets`
145
+ - `scikit-learn`
146
+
147
+ ## Citation
148
+
149
+ If you use this model, please cite it as:
150
+
151
+ ```bibtex
152
+ @misc{DhruvPal2025FakeNewsDetection,
153
+ title={Fake News Detection with DistilBERT},
154
+ author={Dhruv Pal},
155
+ year={2025},
156
+ howpublished={\url{https://huggingface.co/your-model-id}}
157
+ }
158
+ ```
159
+
160
+ ## Contact
161
+
162
+ For any queries, feel free to reach out:
163
+ - **Author:** Dhruv Pal
164
+ - **Email:** dhruv416pal@gmail.com
165
+ - **GitHub:** [dhruvpal05](https://github.com/dhruvpal05)
166
+ - **LinkedIn:** [idhruvpal](https://linkedin.com/in/idhruvpal)
167
+
config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./fakenews_bert_model",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "initializer_range": 0.02,
12
+ "max_position_embeddings": 512,
13
+ "model_type": "distilbert",
14
+ "n_heads": 12,
15
+ "n_layers": 6,
16
+ "pad_token_id": 0,
17
+ "problem_type": "single_label_classification",
18
+ "qa_dropout": 0.1,
19
+ "seq_classif_dropout": 0.2,
20
+ "sinusoidal_pos_embds": false,
21
+ "tie_weights_": true,
22
+ "torch_dtype": "float32",
23
+ "transformers_version": "4.48.3",
24
+ "vocab_size": 30522
25
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc7192f5ca90c42dd9bbb84cc17b04b3b4782b01da7ead1e761db421397bf3f0
3
+ size 267832560
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "max_length": 512,
50
+ "model_max_length": 512,
51
+ "pad_to_multiple_of": null,
52
+ "pad_token": "[PAD]",
53
+ "pad_token_type_id": 0,
54
+ "padding_side": "right",
55
+ "sep_token": "[SEP]",
56
+ "stride": 0,
57
+ "strip_accents": null,
58
+ "tokenize_chinese_chars": true,
59
+ "tokenizer_class": "DistilBertTokenizer",
60
+ "truncation_side": "right",
61
+ "truncation_strategy": "longest_first",
62
+ "unk_token": "[UNK]"
63
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff