chore: update code structure for improved readability and maintainability
Browse files- notebook/ai_vs_human/final_archi.md +1 -1
- notebook/ai_vs_human_nepali/notebook/Nepali_Ai_vs_Human.ipynb +1429 -0
- notebook/ai_vs_human_nepali/notebook/final_main.ipynb +0 -0
- notebook/ai_vs_human_nepali/notebook/main.ipynb +0 -0
- notebook/ai_vs_human_nepali/notebook/working model.ipynb +0 -0
- notebook/ai_vs_human_nepali/topic_scrapper.ipynb +542 -0
notebook/ai_vs_human/final_archi.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
# AI vs Human Text Detector V3 - Final Architecture Summary
|
| 2 |
-
|
| 3 |
**Model Version**: V3
|
| 4 |
**Type**: Hybrid Feature Engineering + TF-IDF Classifier
|
| 5 |
**Output Directory**: `./v3_model/`
|
|
|
|
| 1 |
# AI vs Human Text Detector V3 - Final Architecture Summary
|
| 2 |
+
dataset = "Pujan-Dev/english_aivshuman"
|
| 3 |
**Model Version**: V3
|
| 4 |
**Type**: Hybrid Feature Engineering + TF-IDF Classifier
|
| 5 |
**Output Directory**: `./v3_model/`
|
notebook/ai_vs_human_nepali/notebook/Nepali_Ai_vs_Human.ipynb
ADDED
|
@@ -0,0 +1,1429 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "901fc22d",
|
| 7 |
+
"metadata": {
|
| 8 |
+
"id": "901fc22d"
|
| 9 |
+
},
|
| 10 |
+
"outputs": [
|
| 11 |
+
{
|
| 12 |
+
"name": "stderr",
|
| 13 |
+
"output_type": "stream",
|
| 14 |
+
"text": [
|
| 15 |
+
"/home/pujan/miniconda3/envs/ml/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
|
| 16 |
+
" from .autonotebook import tqdm as notebook_tqdm\n"
|
| 17 |
+
]
|
| 18 |
+
}
|
| 19 |
+
],
|
| 20 |
+
"source": [
|
| 21 |
+
"import os\n",
|
| 22 |
+
"os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'\n",
|
| 23 |
+
"\n",
|
| 24 |
+
"import math\n",
|
| 25 |
+
"import pandas as pd\n",
|
| 26 |
+
"import torch\n",
|
| 27 |
+
"from torch.utils.data import Dataset, DataLoader\n",
|
| 28 |
+
"from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup\n",
|
| 29 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 30 |
+
"from sklearn.metrics import classification_report, f1_score, accuracy_score\n",
|
| 31 |
+
"import torch.nn as nn\n",
|
| 32 |
+
"from torch.optim import AdamW"
|
| 33 |
+
]
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"cell_type": "code",
|
| 37 |
+
"execution_count": 2,
|
| 38 |
+
"id": "70d3c048",
|
| 39 |
+
"metadata": {},
|
| 40 |
+
"outputs": [
|
| 41 |
+
{
|
| 42 |
+
"name": "stdout",
|
| 43 |
+
"output_type": "stream",
|
| 44 |
+
"text": [
|
| 45 |
+
"Columns: ['human_text', 'ai_generated_text']\n",
|
| 46 |
+
"Prepared dataset shape: (1986, 2)\n",
|
| 47 |
+
"label\n",
|
| 48 |
+
"1 996\n",
|
| 49 |
+
"0 990\n",
|
| 50 |
+
"Name: count, dtype: int64\n"
|
| 51 |
+
]
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"data": {
|
| 55 |
+
"text/html": [
|
| 56 |
+
"<div>\n",
|
| 57 |
+
"<style scoped>\n",
|
| 58 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 59 |
+
" vertical-align: middle;\n",
|
| 60 |
+
" }\n",
|
| 61 |
+
"\n",
|
| 62 |
+
" .dataframe tbody tr th {\n",
|
| 63 |
+
" vertical-align: top;\n",
|
| 64 |
+
" }\n",
|
| 65 |
+
"\n",
|
| 66 |
+
" .dataframe thead th {\n",
|
| 67 |
+
" text-align: right;\n",
|
| 68 |
+
" }\n",
|
| 69 |
+
"</style>\n",
|
| 70 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 71 |
+
" <thead>\n",
|
| 72 |
+
" <tr style=\"text-align: right;\">\n",
|
| 73 |
+
" <th></th>\n",
|
| 74 |
+
" <th>text</th>\n",
|
| 75 |
+
" <th>label</th>\n",
|
| 76 |
+
" </tr>\n",
|
| 77 |
+
" </thead>\n",
|
| 78 |
+
" <tbody>\n",
|
| 79 |
+
" <tr>\n",
|
| 80 |
+
" <th>0</th>\n",
|
| 81 |
+
" <td>हामीले पार्टी एकतापछि कि दुबै पार्टीको सिद्धान...</td>\n",
|
| 82 |
+
" <td>0</td>\n",
|
| 83 |
+
" </tr>\n",
|
| 84 |
+
" <tr>\n",
|
| 85 |
+
" <th>1</th>\n",
|
| 86 |
+
" <td>एमाले प्रतिनिधिसभाको प्रत्यक्षतर्फ ८० समानुपात...</td>\n",
|
| 87 |
+
" <td>0</td>\n",
|
| 88 |
+
" </tr>\n",
|
| 89 |
+
" <tr>\n",
|
| 90 |
+
" <th>2</th>\n",
|
| 91 |
+
" <td>नेकपा माओवादी केन्द्रका नेता रामनारायण विडारील...</td>\n",
|
| 92 |
+
" <td>1</td>\n",
|
| 93 |
+
" </tr>\n",
|
| 94 |
+
" <tr>\n",
|
| 95 |
+
" <th>3</th>\n",
|
| 96 |
+
" <td>प्रदेश नं २ का मुख्यमन्त्रीको रूपमा संघीय समाज...</td>\n",
|
| 97 |
+
" <td>1</td>\n",
|
| 98 |
+
" </tr>\n",
|
| 99 |
+
" <tr>\n",
|
| 100 |
+
" <th>4</th>\n",
|
| 101 |
+
" <td>बिहीबार एमालेका अध्यक्ष केपी शर्मा ओली र माओवा...</td>\n",
|
| 102 |
+
" <td>0</td>\n",
|
| 103 |
+
" </tr>\n",
|
| 104 |
+
" </tbody>\n",
|
| 105 |
+
"</table>\n",
|
| 106 |
+
"</div>"
|
| 107 |
+
],
|
| 108 |
+
"text/plain": [
|
| 109 |
+
" text label\n",
|
| 110 |
+
"0 हामीले पार्टी एकतापछि कि दुबै पार्टीको सिद्धान... 0\n",
|
| 111 |
+
"1 एमाले प्रतिनिधिसभाको प्रत्यक्षतर्फ ८० समानुपात... 0\n",
|
| 112 |
+
"2 नेकपा माओवादी केन्द्रका नेता रामनारायण विडारील... 1\n",
|
| 113 |
+
"3 प्रदेश नं २ का मुख्यमन्त्रीको रूपमा संघीय समाज... 1\n",
|
| 114 |
+
"4 बिहीबार एमालेका अध्यक्ष केपी शर्मा ओली र माओवा... 0"
|
| 115 |
+
]
|
| 116 |
+
},
|
| 117 |
+
"execution_count": 2,
|
| 118 |
+
"metadata": {},
|
| 119 |
+
"output_type": "execute_result"
|
| 120 |
+
}
|
| 121 |
+
],
|
| 122 |
+
"source": [
|
| 123 |
+
"# Load Dataset and convert to binary classification format\n",
|
| 124 |
+
"DATA_PATH = '../DATASET/new_data.csv'\n",
|
| 125 |
+
"raw_df = pd.read_csv(DATA_PATH)\n",
|
| 126 |
+
"print('Columns:', raw_df.columns.tolist())\n",
|
| 127 |
+
"\n",
|
| 128 |
+
"required_cols = ['human_text', 'ai_generated_text']\n",
|
| 129 |
+
"missing = [c for c in required_cols if c not in raw_df.columns]\n",
|
| 130 |
+
"if missing:\n",
|
| 131 |
+
" raise ValueError(f'Missing required columns: {missing}')\n",
|
| 132 |
+
"\n",
|
| 133 |
+
"# Build unified training dataframe: text + label (0=Human, 1=AI)\n",
|
| 134 |
+
"df_human = raw_df[['human_text']].dropna().rename(columns={'human_text': 'text'})\n",
|
| 135 |
+
"df_human['label'] = 0\n",
|
| 136 |
+
"\n",
|
| 137 |
+
"df_ai = raw_df[['ai_generated_text']].dropna().rename(columns={'ai_generated_text': 'text'})\n",
|
| 138 |
+
"df_ai['label'] = 1\n",
|
| 139 |
+
"\n",
|
| 140 |
+
"df = pd.concat([df_human, df_ai], ignore_index=True)\n",
|
| 141 |
+
"df['text'] = df['text'].astype(str).str.strip()\n",
|
| 142 |
+
"df = df[df['text'].str.len() > 10].drop_duplicates(subset=['text']).sample(frac=1, random_state=42).reset_index(drop=True)\n",
|
| 143 |
+
"\n",
|
| 144 |
+
"print('Prepared dataset shape:', df.shape)\n",
|
| 145 |
+
"print(df['label'].value_counts())\n",
|
| 146 |
+
"df.head()"
|
| 147 |
+
]
|
| 148 |
+
},
|
| 149 |
+
{
|
| 150 |
+
"cell_type": "code",
|
| 151 |
+
"execution_count": 3,
|
| 152 |
+
"id": "f93d4c7a",
|
| 153 |
+
"metadata": {
|
| 154 |
+
"id": "f93d4c7a"
|
| 155 |
+
},
|
| 156 |
+
"outputs": [
|
| 157 |
+
{
|
| 158 |
+
"name": "stdout",
|
| 159 |
+
"output_type": "stream",
|
| 160 |
+
"text": [
|
| 161 |
+
"Nulls in text: 0\n",
|
| 162 |
+
"Nulls in label: 0\n",
|
| 163 |
+
"Example text sample:\n",
|
| 164 |
+
"हामीले पार्टी एकतापछि कि दुबै पार्टीको सिद्धान्त राख्ने कि राख्ने माओवाद र जबज दुबै नराख्ने भन्दा उहाँहरु मान्नु भएन । एमालेका साथीहरुले जवजको विषय उठाउन चाहनुभएन । सिद्धान्तको विषय नै नमिलेपछि पार्टी एकता संयोजन समितिको बैठक रोकियो कार्यदलका एक सदस्\n"
|
| 165 |
+
]
|
| 166 |
+
}
|
| 167 |
+
],
|
| 168 |
+
"source": [
|
| 169 |
+
"# Quick sanity checks\n",
|
| 170 |
+
"print('Nulls in text:', int(df['text'].isnull().sum()))\n",
|
| 171 |
+
"print('Nulls in label:', int(df['label'].isnull().sum()))\n",
|
| 172 |
+
"print('Example text sample:')\n",
|
| 173 |
+
"print(df.loc[0, 'text'][:250])"
|
| 174 |
+
]
|
| 175 |
+
},
|
| 176 |
+
{
|
| 177 |
+
"cell_type": "code",
|
| 178 |
+
"execution_count": 4,
|
| 179 |
+
"id": "ba4a933f",
|
| 180 |
+
"metadata": {
|
| 181 |
+
"colab": {
|
| 182 |
+
"base_uri": "https://localhost:8080/",
|
| 183 |
+
"height": 206
|
| 184 |
+
},
|
| 185 |
+
"id": "ba4a933f",
|
| 186 |
+
"outputId": "9bf5f0a5-c547-43f1-b8f2-a580024d74a9"
|
| 187 |
+
},
|
| 188 |
+
"outputs": [
|
| 189 |
+
{
|
| 190 |
+
"name": "stdout",
|
| 191 |
+
"output_type": "stream",
|
| 192 |
+
"text": [
|
| 193 |
+
"label\n",
|
| 194 |
+
"AI 0.501511\n",
|
| 195 |
+
"Human 0.498489\n",
|
| 196 |
+
"Name: proportion, dtype: float64\n"
|
| 197 |
+
]
|
| 198 |
+
},
|
| 199 |
+
{
|
| 200 |
+
"data": {
|
| 201 |
+
"text/plain": [
|
| 202 |
+
"label \n",
|
| 203 |
+
"0 count 990.000000\n",
|
| 204 |
+
" mean 455.551515\n",
|
| 205 |
+
" std 56.825837\n",
|
| 206 |
+
" min 299.000000\n",
|
| 207 |
+
" 25% 418.000000\n",
|
| 208 |
+
" 50% 458.000000\n",
|
| 209 |
+
" 75% 494.000000\n",
|
| 210 |
+
" max 629.000000\n",
|
| 211 |
+
"1 count 996.000000\n",
|
| 212 |
+
" mean 284.231928\n",
|
| 213 |
+
" std 67.165254\n",
|
| 214 |
+
" min 103.000000\n",
|
| 215 |
+
" 25% 238.000000\n",
|
| 216 |
+
" 50% 282.000000\n",
|
| 217 |
+
" 75% 331.000000\n",
|
| 218 |
+
" max 433.000000\n",
|
| 219 |
+
"Name: text, dtype: float64"
|
| 220 |
+
]
|
| 221 |
+
},
|
| 222 |
+
"execution_count": 4,
|
| 223 |
+
"metadata": {},
|
| 224 |
+
"output_type": "execute_result"
|
| 225 |
+
}
|
| 226 |
+
],
|
| 227 |
+
"source": [
|
| 228 |
+
"# Class balance\n",
|
| 229 |
+
"print(df['label'].value_counts(normalize=True).rename({0: 'Human', 1: 'AI'}))\n",
|
| 230 |
+
"df.groupby('label')['text'].apply(lambda s: s.str.len().describe())"
|
| 231 |
+
]
|
| 232 |
+
},
|
| 233 |
+
{
|
| 234 |
+
"cell_type": "code",
|
| 235 |
+
"execution_count": 5,
|
| 236 |
+
"id": "d7b48175",
|
| 237 |
+
"metadata": {
|
| 238 |
+
"colab": {
|
| 239 |
+
"base_uri": "https://localhost:8080/",
|
| 240 |
+
"height": 206
|
| 241 |
+
},
|
| 242 |
+
"id": "d7b48175",
|
| 243 |
+
"outputId": "08bc4562-874c-40c1-d554-1d809a6d0e31"
|
| 244 |
+
},
|
| 245 |
+
"outputs": [
|
| 246 |
+
{
|
| 247 |
+
"data": {
|
| 248 |
+
"text/plain": [
|
| 249 |
+
"<matplotlib.legend.Legend at 0x7fef748b5290>"
|
| 250 |
+
]
|
| 251 |
+
},
|
| 252 |
+
"execution_count": 5,
|
| 253 |
+
"metadata": {},
|
| 254 |
+
"output_type": "execute_result"
|
| 255 |
+
},
|
| 256 |
+
{
|
| 257 |
+
"data": {
|
| 258 |
+
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAvwAAAGHCAYAAADMVYYQAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjgsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvwVt1zgAAAAlwSFlzAAAPYQAAD2EBqD+naQAARoNJREFUeJzt3X1cVGX+//H3gMMIAt7HTaJiIeVduVomVmgFu5aurtXWWq1Wa5bdaO5+7WtWDrsFZuVSa9nPttS2yG03c7Wvd6SJlbmp5epqmbspmkpkoaAoDMz1+6OYHAEdYGCY4+v5eMxDz3Wuuc7nzGdGPlxec47NGGMEAAAAwJJCAh0AAAAAgMZDwQ8AAABYGAU/AAAAYGEU/AAAAICFUfADAAAAFkbBDwAAAFgYBT8AAABgYRT8AAAAgIVR8AMAAAAWRsEPIOjZbDafHmvXrvXL8Q4cOCCn06ktW7b41H/t2rWy2Wz6+9//7pfj+1tpaamcTmeNr4/T6ZTNZtOhQ4fqNfbYsWO9ctCqVSt17dpVP//5zzVv3jyVlZVVe87gwYM1ePDgOh1nx44dcjqd2rNnT52ed+qx9uzZI5vNpqeffrpO45xJZmamFi9eXK296r3hr/cmANSkRaADAICG+uijj7y2//CHP+i9997TmjVrvNp79Ojhl+MdOHBAGRkZ6tq1qy6++GK/jBlIpaWlysjIkKQ6F9q+CA8P9+Ti+PHj2rdvn5YvX65x48bpmWee0YoVK9SpUydP/xdeeKHOx9ixY4cyMjI0ePBgde3a1efn1edY9ZGZmakbbrhBI0eO9Gr/yU9+oo8++shv700AqAkFP4Cgd9lll3ltd+zYUSEhIdXaERg15eLXv/61br/9dg0bNkw33HCDNmzY4NnXFMVvaWmpIiIiAl5oR0dH8z4F0OhY0gPgrFBeXq7HH39cF1xwgRwOhzp27Kjbb79d33zzjafPjBkzFBISoqVLl3o9d+zYsYqIiNC2bdu0du1aXXLJJZKk22+/3bNUxel0NjjGgoICjR8/Xp06dVJYWJgSExOVkZGhiooKT5+Tl5zMmjVLiYmJioyM1MCBA72K5iovvfSSunfvLofDoR49eignJ0djx471zILv2bNHHTt2lCRlZGR4zmfs2LFe43z99df61a9+pdatWysmJkZ33HGHjhw50qDzTU9P17hx4/TPf/5T69at87TXtKRnzpw5uuiiixQZGamoqChdcMEFevjhhyVJ8+fP14033ihJGjJkiOcc5s+f7xmvV69eWrdunVJSUhQREaE77rij1mNJktvt1hNPPKHOnTurZcuW6t+/v1avXu3V5+TX8WRVy6Cq2Gw2HTt2TAsWLPDEVnXM2pb0LFmyRAMHDlRERISioqKUlpZW7X+yqo6zfft2v+cGgLVQ8AOwPLfbrREjRmjGjBkaPXq0/u///k8zZsxQbm6uBg8erOPHj0uSHnroIQ0dOlRjxoxRfn6+JGnevHlasGCB/vSnP6l37976yU9+onnz5kmSHnnkEX300Uf66KOP9Jvf/KZBMRYUFOjSSy/VypUr9dhjj2n58uW68847lZWVpXHjxlXr//zzzys3N1fZ2dl6/fXXdezYMV177bVehd7cuXN11113qU+fPlq0aJEeeeQRZWRkeBWXcXFxWrFihSTpzjvv9JzPo48+6nW866+/Xt27d9dbb72l//3f/1VOTo4efPDBBp2zJP385z+XJK+C/1QLFy7UhAkTlJqaqrfffluLFy/Wgw8+qGPHjkmSrrvuOmVmZnpel6pzuO666zxjHDx4ULfeeqtGjx6tZcuWacKECaeNa/bs2VqxYoWys7P12muvKSQkREOHDq1WdPvio48+Unh4uK699lpPbKdbSpSTk6MRI0YoOjpab7zxhl5++WUVFRVp8ODB+uCDD6r1b6zcALAQAwAWM2bMGNOqVSvP9htvvGEkmbfeesur38aNG40k88ILL3jaDh06ZD
p16mQuvfRS88knn5iIiAhz66231vi8efPm+RTPe++9ZySZv/3tb7X2GT9+vImMjDT5+fle7U8//bSRZLZv326MMWb37t1Gkundu7epqKjw9Pv444+NJPPGG28YY4yprKw0sbGxZsCAAV7j5efnG7vdbrp06eJp++abb4wkM3369GpxTZ8+3UgyM2fO9GqfMGGCadmypXG73ac991NzcarPPvvMSDL33HOPpy01NdWkpqZ6tu+77z7Tpk2b0x7nb3/7m5Fk3nvvvWr7UlNTjSSzevXqGvedfKyq1zc+Pt4cP37c015cXGzatWtnrrnmGq9zO/l1rFL1mp2sVatWZsyYMdX6Vr03quKurKw08fHxpnfv3qaystLTr6SkxJxzzjkmJSWl2nHqmxsAZw9m+AFY3jvvvKM2bdpo+PDhqqio8DwuvvhixcbGes14t2/fXn/961/1ySefKCUlRZ07d9aLL77YJDEOGTJE8fHxXjEOHTpUkpSXl+fV/7rrrlNoaKhnu0+fPpLk+Z+JnTt3qqCgQL/85S+9nte5c2cNGjSozvFVzcSffLwTJ06osLCwzmOdzBhzxj6XXnqpDh8+rF/96lf6xz/+Ua8rBrVt21ZXXXWVz/1HjRqlli1berajoqI0fPhwrVu3TpWVlXU+vq927typAwcO6LbbblNIyI8/oiMjI3X99ddrw4YNKi0t9XpOY+UGgHVQ8AOwvK+//lqHDx9WWFiY7Ha716OgoKBaATlgwAD17NlTJ06c0D333KNWrVo1SYxLly6tFl/Pnj0lqVqM7du399p2OByS5Fme9O2330qSYmJiqh2rprYzOdPx6qvqF5T4+Pha+9x222165ZVXlJ+fr+uvv17nnHOOBgwYoNzcXJ+PExcXV6e4YmNja2wrLy/X0aNH6zRWXVTlraZ44+Pj5Xa7VVRU5NXeWLkBYB1cpQeA5XXo0EHt27f3rFU/VVRUlNf29OnTtW3bNvXr10+PPfaYhg0bpm7dujV6jH369NETTzxR4/7TFcQ1qSoCv/7662r7CgoK6h5gI1myZImkM18O9Pbbb9ftt9+uY8eOad26dZo+fbqGDRumL774Ql26dDnjcU7+Eq0vanqNCgoKFBYWpsjISElSy5Yta7yPQH3vWSD9mLeDBw9W23fgwAGFhISobdu29R4fwNmJGX4Aljds2DB9++23qqysVP/+/as9kpOTPX1zc3OVlZWlRx55RLm5uWrdurVuuukmlZeXe/o0xgzqsGHD9O9//1vnnXdejTHWteBPTk5WbGys3nzzTa/2vXv3av369V5tgZoRzs3N1Z///GelpKTo8ssv9+k5rVq10tChQzVt2jSVl5dr+/btkvx/DosWLdKJEyc82yUlJVq6dKmuuOIKz1Kqrl27qrCw0OuXqvLycq1cubLaeA6Hw6fYkpOTde655yonJ8drudOxY8f01ltvea7cAwB1wQw/AMu7+eab9frrr+vaa6/VxIkTdemll8put+urr77Se++9pxEjRugXv/iF50ouqampmj59ukJCQvTXv/5VV155paZMmaLs7GxJ0nnnnafw8HC9/vrruvDCCxUZGan4+PgzFuU1XTZTklJTU/X73/9eubm5SklJ0QMPPKDk5GSdOHFCe/bs0bJly/Tiiy963ZzqTEJCQpSRkaHx48frhhtu0B133KHDhw8rIyNDcXFxXuvDo6Ki1KVLF/3jH//Q1VdfrXbt2qlDhw51uoHV6bjdbs+5l5WVae/evVq+fLnefPNNXXjhhdV+KTnVuHHjFB4erkGDBikuLk4FBQXKyspS69atPZdI7dWrl6Tvr0wUFRWlli1bKjExsdpyF1+FhoYqLS1NkydPltvt1pNPPqni4mLPDcok6aabbtJjjz2mm2++Wf/zP/+jEydO6LnnnqtxjX/v3r21du1aLV26VHFxcYqKivL6RbNKSEiIZs6cqVtuuUXDhg3T+PHjVVZWpqeeekqHDx/WjBkz6nU+AM5ygf
7WMAD4W01XhnG5XObpp582F110kWnZsqWJjIw0F1xwgRk/frzZtWuXqaioMKmpqSYmJsYcPHjQ67lPPfWUkWTefvttT9sbb7xhLrjgAmO322u9wk2Vqiux1PaoukLLN998Yx544AGTmJho7Ha7adeunenXr5+ZNm2aOXr0qDHmx6vIPPXUU9WOU1Mcc+fONeeff74JCwsz3bt3N6+88ooZMWKE6du3r1e/d9991/Tt29c4HA4jyXNFmaorwXzzzTde/efNm2ckmd27d9d63sZ8n4uTzzU8PNx07tzZDB8+3LzyyiumrKys2nNOvXLOggULzJAhQ0xMTIwJCwsz8fHx5pe//KXZunWr1/Oys7NNYmKiCQ0N9bqKUmpqqunZs2eN8dV2lZ4nn3zSZGRkmE6dOpmwsDDTt29fs3LlymrPX7Zsmbn44otNeHi46datm5k9e3aNV+nZsmWLGTRokImIiDCSPMc89So9VRYvXmwGDBhgWrZsaVq1amWuvvpq8+GHH3r1aWhuAJw9bMb4cIkEAIAlHD58WN27d9fIkSM1d+7cQIcDAGgCLOkBAIsqKCjQE088oSFDhqh9+/bKz8/XH//4R5WUlGjixImBDg8A0EQo+AHAohwOh/bs2aMJEybou+++U0REhC677DK9+OKLnst9AgCsjyU9AAAAgIVxWU4AAADAwij4AQAAAAuj4AcAAAAszPJf2nW73Tpw4ICioqLqfGt1AAAAoLkyxqikpETx8fFeN1Q8leUL/gMHDighISHQYQAAAACNYt++fae9G7vlC/6oqChJ378Q0dHRjX48l8ulVatWKT09XXa7vdGPh6ZHjq2N/FofObY28mt95PhHxcXFSkhI8NS7tbF8wV+1jCc6OrrJCv6IiAhFR0ef9W9CqyLH1kZ+rY8cWxv5tT5yXN2Zlq3zpV0AAADAwij4AQAAAAuj4AcAAAAszPJr+AEAAFB3lZWVcrlcgQ6jGpfLpRYtWujEiROqrKwMdDiNKjQ0VC1atGjwpeUp+AEAAODl6NGj+uqrr2SMCXQo1RhjFBsbq3379p0V91iKiIhQXFycwsLC6j0GBT8AAAA8Kisr9dVXXykiIkIdO3ZsdkW12+3W0aNHFRkZedqbTQU7Y4zKy8v1zTffaPfu3UpKSqr3+VLwAwAAwMPlcskYo44dOyo8PDzQ4VTjdrtVXl6uli1bWrrgl6Tw8HDZ7Xbl5+d7zrk+rP0qAQAAoF6a28z+2cofv9RQ8AMAAAAWRsEPAAAAWBgFPwAAAGBhfGkXAACc0dRF23zqlzWqdyNHgkDx9T3gL3V9L40dO1aHDx/W4sWLvdrXrl2rIUOGqKioSG3atPFfgEGEGX4AAADAwij4AQAAcFZwOp26+OKLvdqys7PVtWtXz/bYsWM1cuRIZWZmKiYmRm3atFFGRoYqKir0P//zP2rXrp06deqkV155xWuchx56SN27d1dERIS6deumRx991OtOxVXH/stf/qKuXbuqdevWuvnmm1VSUtKYpyyJgh8AAADwsmbNGh04cEDr1q3TrFmz5HQ6NWzYMLVt21b//Oc/dffdd+vuu+/Wvn37PM+JiorS/PnztWPHDj377LN66aWX9Mc//tFr3P/+979avHix3nnnHb3zzjvKy8vTjBkzGv18KPgBAABgCe+8844iIyO9HkOHDq3zOO3atdNzzz2n5ORk3XHHHUpOTlZpaakefvhhJSUlaerUqQoLC9OHH37oec4jjzyilJQUde3aVcOHD9dvf/tbvfnmm17jut1uzZ8/X7169dIVV1yh2267TatXr27weZ9JQAv+rl27ymazVXvce++9kr6/pbDT6VR8fLzCw8M1ePBgbd++PZAhAwAAoJkaMmSItmzZ4vX485//XOdxevbs6XXDq5iYGPXu/eOXiENDQ9W+fXsVFhZ62v7+97/r8ssvV2xsrCIjI/Xoo49q7969XuN27dpVUVFRnu24uDivMRpLQAv+jRs36uDBg5
5Hbm6uJOnGG2+UJM2cOVOzZs3S7NmztXHjRsXGxiotLa1J1joBAAAguLRq1Urnn3++1+Pcc8/17A8JCZExxus5J6+zr2K32722bTZbjW1ut1uStGHDBt18880aOnSo3nnnHX366aeaNm2aysvLzzhu1RiNKaCX5ezYsaPX9owZM3TeeecpNTVVxhhlZ2dr2rRpGjVqlCRpwYIFiomJUU5OjsaPHx+IkAEAABCkOnbsqIKCAhljZLPZJElbtmxp8LgffvihunTpomnTpnna8vPzGzyuvzSb6/CXl5frtdde0+TJk2Wz2fTll1+qoKBA6enpnj4Oh0Opqalav359rQV/WVmZysrKPNvFxcWSvv/trabf4Pyt6hhNcSwEBjm2NvJrfeS4fkLl2yxkoF9X8ttwLpdLxhi53W7v2edTZsYbW20z31Uz9FUxntx+atvJ47jdbl155ZX65ptv9OSTT+r666/XypUrtXz5ckVHR3v61TbO6dq6deumvXv3KicnR5dccomWLVumt99+2+v4VXGfGvPpzrVqnzFGLpdLoaGhXvt8fZ83m4J/8eLFOnz4sMaOHStJKigokPT9mqmTxcTEnPY3pqysLGVkZFRrX7VqlSIiIvwX8BlULU+CdZFjayO/1keO6+aS0DP3kaRly/Y0ahy+Ir/116JFC8XGxuro0aNeS1Ieurpzk8ZRNWlbm1OXeLtcLlVUVFR7Xmlpqaf/ueeeq6efflqzZs3S448/ruHDh+vee+/VggULvCaJTx2noqJC5eXlXm1ut1snTpxQcXGxhgwZonvuuUf333+/ysvLlZaWpt/97neaMWOG5zllZWWqrKz0GuPEiRNyu92nPdfy8nIdP35c69atU0VFRY3ndiY2c+pCpgD56U9/qrCwMC1dulSStH79eg0aNEgHDhxQXFycp9+4ceO0b98+rVixosZxaprhT0hI0KFDhxQdHd24J6Hv3yS5ublKS0urtk4L1kCOrY38Wh85rp+MpTt86jd9eI9GjuT0yG/DnThxQvv27VPXrl3VsmXLQIdTjTFGJSUlioqK8izLsbITJ05oz549SkhIqJaP4uJidejQQUeOHDltndssZvjz8/P17rvvatGiRZ622NhYSd/P9J9c8BcWFlab9T+Zw+GQw+Go1m6325v0g9/Ux0PTI8fWRn6tjxzXTaWP1/loLq8p+a2/yspK2Ww2hYSEeF2pprmoWv5SFaPVhYSEeL40fOp72tf3eLN4lebNm6dzzjlH1113nactMTFRsbGxXv8lV15erry8PKWkpAQiTAAAACDoBHyG3+12a968eRozZoxatPgxHJvNpkmTJikzM1NJSUlKSkpSZmamIiIiNHr06ABGDAAAAASPgBf87777rvbu3as77rij2r4pU6bo+PHjmjBhgoqKijRgwACtWrXK64YFAAAAAGoX8II/PT292g0QqthsNjmdTjmdzqYNCgAAALCIZrGGHwAAAEDjoOAHAAAALIyCHwAAALAwCn4AAADAwgL+pV0AAAAEgaUTm/Z4w59t2uNZGDP8AAAAsIz169crNDRUP/vZz7za9+zZI5vNpi1btgQmsACi4AcAAIBlvPLKK7r//vv1wQcfaO/evYEOp1mg4AcAAIAlHDt2TG+++abuueceDRs2TPPnzw90SM0CBT8AAAAs4a9//auSk5OVnJysW2+9VfPmzav1Bq9nEwp+AAAAWMLLL7+sW2+9VZL0s5/9TEePHtXq1asDHFXgUfADAAAg6O3cuVMff/yxbr75ZklSixYtdNNNN+mVV14JcGSBx2U5AQAAEPRefvllVVRU6Nxzz/W0GWNkt9tVVFQUwMgCjxl+AAAABLWKigq9+uqreuaZZ7RlyxbP41//+pe6dOmi119/PdAhBhQz/AAAAAhq77zzjoqKinTnnXeqdevWXvtuuOEGvfzyyxo2bFiAogs8Cn4AAACcWTO+8+3LL7+sa665plqxL0nXX3+9MjMz9d133wUgsu
aBgh8AAABBbenSpbXu+8lPfuK5NOfZeolO1vADAAAAFkbBDwAAAFgYBT8AAABgYRT8AAAAgIVR8AMAAKCas/ULrs2NP/JAwQ8AAACP0NBQSVJ5eXmAI4EklZaWSpLsdnu9x+CynAAAAPBo0aKFIiIi9M0338hutyskpHnND7vdbpWXl+vEiRPNLjZ/MsaotLRUhYWFatOmjecXsfqg4AcAAICHzWZTXFycdu/erfz8/ECHU40xRsePH1d4eLhsNlugw2l0bdq0UWxsbIPGoOAHAACAl7CwMCUlJTXLZT0ul0vr1q3TlVde2aBlLsHAbrc3aGa/CgU/AABnsamLtgU6BDRTISEhatmyZaDDqCY0NFQVFRVq2bKl5Qt+f7HuwicAAAAAFPwAAACAlVHwAwAAABYW8IJ///79uvXWW9W+fXtFRETo4osv1ubNmz37jTFyOp2Kj49XeHi4Bg8erO3btwcwYgAAACB4BLTgLyoq0qBBg2S327V8+XLt2LFDzzzzjNq0aePpM3PmTM2aNUuzZ8/Wxo0bFRsbq7S0NJWUlAQucAAAACBIBPQqPU8++aQSEhI0b948T1vXrl09fzfGKDs7W9OmTdOoUaMkSQsWLFBMTIxycnI0fvz4pg4ZAAAACCoBLfiXLFmin/70p7rxxhuVl5enc889VxMmTNC4ceMkSbt371ZBQYHS09M9z3E4HEpNTdX69etrLPjLyspUVlbm2S4uLpb0/TVbXS5XI5+RPMdoimMhMMixtZFf6yPH3kLl9ut4gX5dya/1keMf+foa2IwxppFjqVXVtV0nT56sG2+8UR9//LEmTZqk//f//p9+/etfa/369Ro0aJD279+v+Ph4z/Puuusu5efna+XKldXGdDqdysjIqNaek5OjiIiIxjsZAAAAoAmVlpZq9OjROnLkiKKjo2vtF9AZfrfbrf79+yszM1OS1LdvX23fvl1z5szRr3/9a0+/U2+bbIyp9VbKU6dO1eTJkz3bxcXFSkhIUHp6+mlfCH9xuVzKzc1VWloaN4OwqKDK8fKHfOs39MnGjSOIBFV+US/k2FvG0h1+HW/68B5+Ha+uyK/1keMfVa1kOZOAFvxxcXHq0cP7H4YLL7xQb731liQpNjZWklRQUKC4uDhPn8LCQsXExNQ4psPhkMPhqNZut9ub9E3R1MdD0wuKHNsqfevX3M8jAIIiv2gQcvy9Sj9fv6O5vKbk1/rIse+ft4BepWfQoEHauXOnV9sXX3yhLl26SJISExMVGxur3Nxcz/7y8nLl5eUpJSWlSWMFAAAAglFAZ/gffPBBpaSkKDMzU7/85S/18ccfa+7cuZo7d66k75fyTJo0SZmZmUpKSlJSUpIyMzMVERGh0aNHBzJ0AAAAICgEtOC/5JJL9Pbbb2vq1Kn6/e9/r8TERGVnZ+uWW27x9JkyZYqOHz+uCRMmqKioSAMGDNCqVasUFRUVwMgBAACA4BDQgl+Shg0bpmHDhtW632azyel0yul0Nl1QAAAAgEUEdA0/AAAAgMZFwQ8AAABYGAU/AAAAYGEU/AAAAICFUfADAAAAFkbBDwAAAFhYwC/LCeAstnSib/2GP9u4cQBBZOqibT71yxrVu5EjARAsmOEHAAAALIyCHwAAALAwCn4AAADAwij4AQAAAAuj4AcAAAAsjIIfAAAAsDAKfgAAAMDCuA4/AABoctxPAGg6zPADAAAAFkbBDwAAAFgYBT8AAABgYazhB4CGWDrRt37Dn23cOAAAqAUz/AAAAICFUfADAAAAFkbBDwAAAFgYBT8AAABgYRT8AAAAgIVR8AMAAAAWRsEPAAAAWBjX4QcAoBmYumhboEMAYFHM8AMAAAAWRsEPAAAAWBgFPwAAAGBhAS34nU6nbDab1yM2Ntaz3xgjp9Op+Ph4hYeHa/Dgwdq+fXsAIwYAAACCS8Bn+Hv27KmDBw96Htu2/filpZkzZ2rWrFmaPXu2Nm7cqNjYWKWlpamkpCSAEQMAAA
DBI+AFf4sWLRQbG+t5dOzYUdL3s/vZ2dmaNm2aRo0apV69emnBggUqLS1VTk5OgKMGAAAAgkPAL8u5a9cuxcfHy+FwaMCAAcrMzFS3bt20e/duFRQUKD093dPX4XAoNTVV69ev1/jx42scr6ysTGVlZZ7t4uJiSZLL5ZLL5Wrck/nhOCf/CesJqhybUN/6BepcmmF8dc5vMzwHnF5z/QyHyu3X8Xw9v+Z+3LrmqbnmF/5Djn/k62tgM8aYRo6lVsuXL1dpaam6d++ur7/+Wo8//rg+//xzbd++XTt37tSgQYO0f/9+xcfHe55z1113KT8/XytXrqxxTKfTqYyMjGrtOTk5ioiIaLRzAQAAAJpSaWmpRo8erSNHjig6OrrWfgEt+E917NgxnXfeeZoyZYouu+wyDRo0SAcOHFBcXJynz7hx47Rv3z6tWLGixjFqmuFPSEjQoUOHTvtC+IvL5VJubq7S0tJkt9sb/XhoekGV4+UP+dZv6JONG0dtmmF8dc5vMzwHnF5z/QxnLN3h1/GmD+9hieP6Ol6V5ppf+A85/lFxcbE6dOhwxoI/4Et6TtaqVSv17t1bu3bt0siRIyVJBQUFXgV/YWGhYmJiah3D4XDI4XBUa7fb7U36pmjq46HpBUWObZW+9QvUeTTj+HzObzM+B5xec/sMV/r5a3W+nltzP259c9Tc8gv/I8e+fz4C/qXdk5WVlemzzz5TXFycEhMTFRsbq9zcXM/+8vJy5eXlKSUlJYBRAgAAAMEjoDP8v/vd7zR8+HB17txZhYWFevzxx1VcXKwxY8bIZrNp0qRJyszMVFJSkpKSkpSZmamIiAiNHj06kGEDOBssf8j32XvgNKYu2nbmTgDQiAJa8H/11Vf61a9+pUOHDqljx4667LLLtGHDBnXp0kWSNGXKFB0/flwTJkxQUVGRBgwYoFWrVikqKiqQYQMAAABBI6AF/8KFC0+732azyel0yul0Nk1AAAAAgMU0qzX8AAAAAPyrWV2lB0Azt3Sib/2GP9u4cQAAAJ8xww8AAABYGAU/AAAAYGEU/AAAAICFsYYfAAA0W3W9j0Go3LokVMpYuqPGu/lmjertr9CAoMEMPwAAAGBhFPwAAACAhVHwAwAAABbGGn4AACyormvfAVgXM/wAAACAhVHwAwAAABZGwQ8AAABYGGv4ATR/Syf61m/4s40bBwAAQYgZfgAAAMDCKPgBAAAAC6PgBwAAACyMNfwAzi5n+j6ACZWU2iShAFbE9f+B5ocZfgAAAMDCKPgBAAAAC6PgBwAAACyMgh8AAACwML60C8D/fL1RllWOCwBAM8YMPwAAAGBhFPwAAACAhdWr4O/WrZu+/fbbau2HDx9Wt27dGhwUAAAAAP+o1xr+PXv2qLKyslp7WVmZ9u/f3+CgAABo7rjBFIBgUaeCf8mSJZ6/r1y5Uq1bt/ZsV1ZWavXq1eratavfggMAAADQMHUq+EeOHClJstlsGjNmjNc+u92url276plnnvFbcAAAAAAapk4Fv9vtliQlJiZq48aN6tChQ6MEBQAAAMA/6rWGf/fu3f6OQ1lZWXr44Yc1ceJEZWdnS5KMMcrIyNDcuXNVVFSkAQMG6Pnnn1fPnj39fnwAAGB9vn73ImtU70aOBGg69b7x1urVq7V69WoVFhZ6Zv6rvPLKK3Uaa+PGjZo7d6769Onj1T5z5kzNmjVL8+fPV/fu3fX4448rLS1NO3fuVFRUVH1DBwAAAM4a9bosZ0ZGhtLT07V69WodOnRIRUVFXo+6OHr0qG655Ra99NJLatu2rafdGKPs7GxNmzZNo0aNUq9evbRgwQKVlpYqJyenPmEDAAAAZ516zfC/+OKLmj9/vm677bYGB3Dvvffquuuu0zXXXKPHH3/c0757924VFBQoPT3d0+ZwOJSamqr169dr/PjxNY5XVlamsrIyz3ZxcbEkyeVyyeVyNTjeM6k6RlMcC4ERVDk2ob718/VcfB0viLl+OEeXv8
81GN4vZwl/fYZD5T5zJzS5kB/yEtLA/ATFv/FnqaD6OdzIfH0N6lXwl5eXKyUlpT5P9bJw4UJ98skn2rhxY7V9BQUFkqSYmBiv9piYGOXn59c6ZlZWljIyMqq1r1q1ShEREQ2M2He5ublNdiwERnDkONW3bsuW+Xc8C8jV5ZLx44A+v8ZoKg39DF9i/d9/g1q/0L0Nev6yZXv8EwgaTXD8HG5cpaWlPvWrV8H/m9/8Rjk5OXr00Ufr83RJ0r59+zRx4kStWrVKLVu2rLWfzWbz2jbGVGs72dSpUzV58mTPdnFxsRISEpSenq7o6Oh6x+srl8ul3NxcpaWlyW63N/rx0PSCKsfLH/Kt39An/TteEHOZUOXqcqXpA9lt1W8wWG++vsZodP76DGcs3eHHqOAvIXKrX+heba7sLHf9Vi5LkqYP7+HHqOBPQfVzuJFVrWQ5k3oV/CdOnNDcuXP17rvvqk+fPtVe7FmzZp1xjM2bN6uwsFD9+vXztFVWVmrdunWaPXu2du7cKen7mf64uDhPn8LCwmqz/idzOBxyOBzV2u12e5O+KZr6eGh6QZFjXwtWX8/DnwVwc2Yku63SvwV/c3+vnIUa+hmubEAxicbnVkiDctTs/31HcPwcbmS+nn+9Cv6tW7fq4osvliT9+9//9tp3utn3k1199dXats370li33367LrjgAj300EPq1q2bYmNjlZubq759+0r6filRXl6ennySmTIAAADAF/Uq+N97770GHzgqKkq9evXyamvVqpXat2/vaZ80aZIyMzOVlJSkpKQkZWZmKiIiQqNHj27w8QEAAICzQb2vw98UpkyZouPHj2vChAmeG2+tWrWKa/ADAAAAPqpXwT9kyJDTLt1Zs2ZNvYJZu3at17bNZpPT6ZTT6azXeAAAAMDZrl4Ff9X6/Soul0tbtmzRv//9b40ZM8YfcQEAAADwg3oV/H/84x9rbHc6nTp69GiDAgIAAADgP369ptitt96qV155xZ9DAgAAAGgAvxb8H3300WlvogUAAACgadVrSc+oUaO8to0xOnjwoDZt2tSgu+8CAAAA8K96FfytW7f22g4JCVFycrJ+//vfKz093S+BAQAQCFMXbTtzJwAIIvUq+OfNm+fvOAAAAAA0ggbdeGvz5s367LPPZLPZ1KNHD/Xt29dfcQEAAADwg3oV/IWFhbr55pu1du1atWnTRsYYHTlyREOGDNHChQvVsWNHf8cJAAAAoB7qVfDff//9Ki4u1vbt23XhhRdKknbs2KExY8bogQce0BtvvOHXIAGcYunEQEcAAACCRL0K/hUrVujdd9/1FPuS1KNHDz3//PN8aRcAAABoRup1HX632y273V6t3W63y+12NzgoAAAAAP5Rr4L/qquu0sSJE3XgwAFP2/79+/Xggw/q6quv9ltwAAAAABqmXkt6Zs+erREjRqhr165KSEiQzWbT3r171bt3b7322mv+jhFAY+M7AQAAWFa9Cv6EhAR98sknys3N1eeffy5jjHr06KFrrrnG3/EBAAAAaIA6LelZs2aNevTooeLiYklSWlqa7r//fj3wwAO65JJL1LNnT73//vuNEigAAACAuqtTwZ+dna1x48YpOjq62r7WrVtr/PjxmjVrlt+CAwAAANAwdVrS869//UtPPvlkrfvT09P19NNPNzgoAMAZ+Pq9i+HPNm4cAIBmr04z/F9//XWNl+Os0qJFC33zzTcNDgoAAACAf9Sp4D/33HO1bdu2Wvdv3bpVcXFxDQ4KAAAAgH/UqeC/9tpr9dhjj+nEiRPV9h0/flzTp0/XsGHD/BYcAAAAgIap0xr+Rx55RIsWLVL37t113333KTk5WTabTZ999pmef/55VVZWatq0aY0VKxC8WG8N3gMBN3XR9/9DHSq3LgmVMpbuUGX97j8JAEGlTgV/TEyM1q9fr3vuuUdTp06VMUaSZLPZ9NOf/lQvvPCCYmJiGiVQAAAAAHVX5xtvdenSRcuWLVNRUZ
H+85//yBijpKQktW3btjHiAwAAANAA9brTriS1bdtWl1xyiT9jAQAAAOBnLF4EAAAALIyCHwAAALAwCn4AAADAwij4AQAAAAuj4AcAAAAsLKAF/5w5c9SnTx9FR0crOjpaAwcO1PLlyz37jTFyOp2Kj49XeHi4Bg8erO3btwcwYgAAACC4BLTg79Spk2bMmKFNmzZp06ZNuuqqqzRixAhPUT9z5kzNmjVLs2fP1saNGxUbG6u0tDSVlJQEMmwAAAAgaAS04B8+fLiuvfZade/eXd27d9cTTzyhyMhIbdiwQcYYZWdna9q0aRo1apR69eqlBQsWqLS0VDk5OYEMGwAAAAga9b7xlr9VVlbqb3/7m44dO6aBAwdq9+7dKigoUHp6uqePw+FQamqq1q9fr/Hjx9c4TllZmcrKyjzbxcXFkiSXyyWXy9W4J/HDcU7+E9ZTrxybUF8H9+94qDPXD6+tK1Cvsb/fA/xb5BEqtyQp5JQ/YS3+yi8/x5svaq0f+foa2IwxppFjOa1t27Zp4MCBOnHihCIjI5WTk6Nrr71W69ev16BBg7R//37Fx8d7+t91113Kz8/XypUraxzP6XQqIyOjWntOTo4iIiIa7TwAAACAplRaWqrRo0fryJEjio6OrrVfwGf4k5OTtWXLFh0+fFhvvfWWxowZo7y8PM9+m83m1d8YU63tZFOnTtXkyZM928XFxUpISFB6evppXwh/cblcys3NVVpamux2e6MfD02vXjle/pBv/YY+6d/xUGcuE6pcXa40fSC7rbLpA/D3e8DX8c4CGUt3SPp+5rdf6F5truwsNxersxx/5Xf68B5+jAr+RK31o6qVLGcS8II/LCxM559/viSpf//+2rhxo5599lk99ND3P8wKCgoUFxfn6V9YWKiYmJhax3M4HHI4HNXa7XZ7k74pmvp4aHp1yrGvhaO/x0P9GMluqwxMwR+o99RZoPKU4s+tkGptsI6G5pef4c0ftZbv79Nm9y+dMUZlZWVKTExUbGyscnNzPfvKy8uVl5enlJSUAEYIAAAABI+AzvA//PDDGjp0qBISElRSUqKFCxdq7dq1WrFihWw2myZNmqTMzEwlJSUpKSlJmZmZioiI0OjRowMZNgCguVo6sdZdI7/6TpLktrVQYeeRTRQQrG7qom1n7JM1qncTRALULqAF/9dff63bbrtNBw8eVOvWrdWnTx+tWLFCaWlpkqQpU6bo+PHjmjBhgoqKijRgwACtWrVKUVFRgQwbAAAACBoBLfhffvnl0+632WxyOp1yOp1NExAAAABgMc1uDT8AAAAA/wn4VXoAAI3oNGvavQx/tnHjqE1zjw8ALIAZfgAAAMDCKPgBAAAAC6PgBwAAACyMNfwAgKB28nXQq661DwD4ETP8AAAAgIVR8AMAAAAWRsEPAAAAWBhr+AEAAE5x8ndDgGDHDD8AAABgYRT8AAAAgIVR8AMAAAAWxhp+AIDf+LruOWtU70aOBABQhRl+AAAAwMIo+AEAAAALo+AHAAAALIw1/EBzsnRioCMAmkTVWv+RX31Xa58Bie1+3DjNZ+N0YwAAmOEHAAAALI2CHwAAALAwCn4AAADAwij4AQAAAAuj4AcAAAAsjIIfAAAAsDAKfgAAAMDCuA4/UJPTXQ/fhEpKlZY/JP18VpOFBASTkV/NDHQIAIAfMMMPAAAAWBgFPwAAAGBhFPwAAACAhQV0DX9WVpYWLVqkzz//XOHh4UpJSdGTTz6p5ORkTx9jjDIyMjR37lwVFRVpwIABev7559WzZ88ARg4AjeR03x85y/xz93eBDgFoUlMXbfOpX9ao3o0cCawmoDP8eXl5uvfee7Vhwwbl5uaqoqJC6enpOnbsmKfPzJkzNWvWLM2ePVsbN25UbGys0tLSVFJSEsDIAQAAgOAQ0Bn+FStWeG3PmzdP55xzjjZv3qwrr7xSxhhlZ2dr2rRpGjVqlCRpwYIFiomJUU5OjsaPHx+IsAEAAICg0awuy3
nkyBFJUrt27SRJu3fvVkFBgdLT0z19HA6HUlNTtX79+hoL/rKyMpWVlXm2i4uLJUkul0sul6sxw/cc5+Q/EaRMaK27XD/sc5lQydc8n2Y8NC9e+T2b+OnfrFC5JUluW7P68eKlKraQH2KFtVTltTnl19eaINTHmM/2GoNa60e+vgY2Y4xp5Fh8YozRiBEjVFRUpPfff1+StH79eg0aNEj79+9XfHy8p+9dd92l/Px8rVy5sto4TqdTGRkZ1dpzcnIUERHReCcAAAAANKHS0lKNHj1aR44cUXR0dK39ms0UzH333aetW7fqgw8+qLbPZrN5bRtjqrVVmTp1qiZPnuzZLi4uVkJCgtLT00/7QviLy+VSbm6u0tLSZLfbG/14aCTLH6p1l8uEKleXK00fyH5tZoPHQ/PilV9bZaDDaTpDn/TLMBlLd0iSrtuf7ZfxGoPb1kKHEoapw753FGIqTtv3/86d1DRBwW9C5Fa/0L3aXNlZbotejHD68B6BDiGgqLV+VLWS5UyaRcF///33a8mSJVq3bp06derkaY+NjZUkFRQUKC4uztNeWFiomJiYGsdyOBxyOBzV2u12e5O+KZr6ePCzMxV6RrLbKn3P8dlUOFpBVX7Pprz56d+ryh8KrDMV0s1BiKk4Y5yVFi0YzwZuhVg2f9QX36PW8v29ENBPgjFG9913nxYtWqQ1a9YoMTHRa39iYqJiY2OVm5vraSsvL1deXp5SUlKaOlwAAAAg6AR0hv/ee+9VTk6O/vGPfygqKkoFBQWSpNatWys8PFw2m02TJk1SZmamkpKSlJSUpMzMTEVERGj06NGBDB1Nxddrkg9/tnHjAKyOzxoAWFZAC/45c+ZIkgYPHuzVPm/ePI0dO1aSNGXKFB0/flwTJkzw3Hhr1apVioqKauJoAQAAgOAT0ILflwsE2Ww2OZ1OOZ3Oxg8IAAAAsBhrfpsFAAAAgKRmcpUeoMn4uk4ZAADAIpjhBwAAACyMgh8AAACwMAp+AAAAwMJYww8A8N0Zvgcz8qvvmigQAICvmOEHAAAALIyCHwAAALAwCn4AAADAwij4AQAAAAvjS7uwBm6oBdTLP3fzJVsAsDpm+AEAAAALo+AHAAAALIyCHwAAALAw1vADDcF3BwAATWzqom0+9csa1buRI0GwYIYfAAAAsDAKfgAAAMDCKPgBAAAAC6PgBwAAACyMgh8AAACwMAp+AAAAwMIo+AEAAAAL4zr8AADUYuRXM33qt7jTlEaOBKg7rtePKszwAwAAABZGwQ8AAABYGAU/AAAAYGEU/AAAAICFUfADAAAAFkbBDwAAAFgYBT8AAABgYQEt+NetW6fhw4crPj5eNptNixcv9tpvjJHT6VR8fLzCw8M1ePBgbd++PTDBAgAAAEEooAX/sWPHdNFFF2n27Nk17p85c6ZmzZql2bNna+PGjYqNjVVaWppKSkqaOFIAAAAgOAX0TrtDhw7V0KFDa9xnjFF2dramTZumUaNGSZIWLFigmJgY5eTkaPz48U0ZKgAAABCUAlrwn87u3btVUFCg9PR0T5vD4VBqaqrWr19fa8FfVlamsrIyz3ZxcbEkyeVyyeVyNW7QPxzn5D/RQCY00BFU4/ohJlczjA0Nd7bl121rtj8GGk3VOfvz3EPl9ttYaJiQH3IRQk58Fmw1C7XWj3x9DZrtv/QFBQWSpJiYGK/2mJgY5efn1/q8rKwsZWRkVGtftWqVIiIi/BvkaeTm5jbZsawtNdAB1CpXl0sm0FGgsZw1+e0c6AAC51DCML+NdYn2+G0s+Ee/0L2BDiFoLFu2J9Ah1Au1llRaWupTv2Zb8Fex2Wxe28aYam0nmzp1qiZPnuzZLi4uVkJCgtLT0xUdHd1ocVZxuVzKzc1VWlqa7HZ7ox/P8pY/FOgIqnGZUOXqcqXpA9ltlYEOB352tuV3U35RoENocm5bCx1KGKYO+95RiKnwy5j/d+4kv4yDhguRW/1C92pzZWe5uRihT6YP7x
HoEOqEWutHVStZzqTZFvyxsbGSvp/pj4uL87QXFhZWm/U/mcPhkMPhqNZut9ub9E3R1MezrOZacBnJbqs8KwrCs9JZlF9/FbzBKMRU+O38Kyksmx23QsiLj4K1XqHW8j13zfaTkJiYqNjYWK//rikvL1deXp5SUlICGBkAAAAQPAI6w3/06FH95z//8Wzv3r1bW7ZsUbt27dS5c2dNmjRJmZmZSkpKUlJSkjIzMxUREaHRo0cHMGr4xdKJgY4AAADgrBDQgn/Tpk0aMmSIZ7tq7f2YMWM0f/58TZkyRcePH9eECRNUVFSkAQMGaNWqVYqKigpUyAAAAEBQCWjBP3jwYBlT+2UwbDabnE6nnE5n0wUFAAAAWEizXcMPAAAAoOGa7VV6AAAA0PimLtrmU7+sUb0bORI0Fmb4AQAAAAuj4AcAAAAsjIIfAAAAsDDW8MO/uL4+UC//3P2dT/0GJLZr5EgAAFbDDD8AAABgYRT8AAAAgIVR8AMAAAAWxhp++LbufvizjR8HgDNirT+AQOF6/cGLGX4AAADAwij4AQAAAAuj4AcAAAAsjDX8TcHXa9OzTh6An/i61h/+MfKrmT71W9xpSiNHAgQPvhPQdJjhBwAAACyMgh8AAACwMAp+AAAAwMJYw9+c+Hutv6/jNfVYgAWwRh4AECyY4QcAAAAsjIIfAAAAsDAKfgAAAMDCKPgBAAAAC+NLuwBwkk35RVLn7/8MMRWBDgcAgo6vN9RC02GGHwAAALAwCn4AAADAwij4AQAAAAtjDT8AAE1k5Fczfeq3uNOURo4EwNmEGX4AAADAwij4AQAAAAuj4AcAAAAsLCjW8L/wwgt66qmndPDgQfXs2VPZ2dm64oorAh1W4CydGOgIAAAAmhVfr/+fNap3QMYLpGY/w//Xv/5VkyZN0rRp0/Tpp5/qiiuu0NChQ7V3795AhwYAAAA0e82+4J81a5buvPNO/eY3v9GFF16o7OxsJSQkaM6cOYEODQAAAGj2mvWSnvLycm3evFn/+7//69Wenp6u9evX1/icsrIylZWVebaPHDkiSfruu+/kcrkaL9gfuFwulZaW6ttvv5Xdbv++8VhFox8XTcdljEpVqm9VIbutMtDhwM9KTrhVWlqqkhNuhRh3oMNBI3Dbmn+OK0qLAx1C0HLLrdLQUrkqi+Vu/vOa8MG3337rtV1jrSXfPzenjlcbf4/XGEpKSiRJxpjT9mvWBf+hQ4dUWVmpmJgYr/aYmBgVFBTU+JysrCxlZGRUa09MTGyUGHG2+lOgA0CjejXQAaDRNfcc5wQ6AKDZePosG68+SkpK1Lp161r3N+uCv4rNZvPaNsZUa6sydepUTZ482bPtdrv13XffqX379rU+x5+Ki4uVkJCgffv2KTo6utGPh6ZHjq2N/FofObY28mt95PhHxhiVlJQoPj7+tP2adcHfoUMHhYaGVpvNLywsrDbrX8XhcMjhcHi1tWnTprFCrFV0dPRZ/ya0OnJsbeTX+sixtZFf6yPH3zvdzH6VZr24LSwsTP369VNubq5Xe25urlJSUgIUFQAAABA8mvUMvyRNnjxZt912m/r376+BAwdq7ty52rt3r+6+++5AhwYAAAA0e82+4L/pppv07bff6ve//70OHjyoXr16admyZerSpUugQ6uRw+HQ9OnTqy0rgnWQY2sjv9ZHjq2N/FofOa47mznTdXwAAAAABK1mvYYfAAAAQMNQ8AMAAAAWRsEPAAAAWBgFPwAAAGBhFPw+WLdunYYPH674+HjZbDYtXrzYa78xRk6nU/Hx8QoPD9fgwYO1fft2rz5lZWW6//771aFDB7Vq1Uo///nP9dVXXzXhWaA2WVlZuuSSSxQVFaVzzjlHI0eO1M6dO736kOPgNmfOHPXp08dzk5aBAwdq+fLlnv3k11qysrJks9k0adIkTxs5Dm5Op1M2m83rERsb69lPfq1h//79uvXWW9W+fXtFRETo4osv1ubNmz37yXP9UfD74N
ixY7rooos0e/bsGvfPnDlTs2bN0uzZs7Vx40bFxsYqLS1NJSUlnj6TJk3S22+/rYULF+qDDz7Q0aNHNWzYMFVWVjbVaaAWeXl5uvfee7Vhwwbl5uaqoqJC6enpOnbsmKcPOQ5unTp10owZM7Rp0yZt2rRJV111lUaMGOH5QUF+rWPjxo2aO3eu+vTp49VOjoNfz549dfDgQc9j27Ztnn3kN/gVFRVp0KBBstvtWr58uXbs2KFnnnlGbdq08fQhzw1gUCeSzNtvv+3ZdrvdJjY21syYMcPTduLECdO6dWvz4osvGmOMOXz4sLHb7WbhwoWePvv37zchISFmxYoVTRY7fFNYWGgkmby8PGMMObaqtm3bmj//+c/k10JKSkpMUlKSyc3NNampqWbixInGGD7DVjB9+nRz0UUX1biP/FrDQw89ZC6//PJa95PnhmGGv4F2796tgoICpaene9ocDodSU1O1fv16SdLmzZvlcrm8+sTHx6tXr16ePmg+jhw5Iklq166dJHJsNZWVlVq4cKGOHTumgQMHkl8Luffee3Xdddfpmmuu8Wonx9awa9cuxcfHKzExUTfffLO+/PJLSeTXKpYsWaL+/fvrxhtv1DnnnKO+ffvqpZde8uwnzw1Dwd9ABQUFkqSYmBiv9piYGM++goIChYWFqW3btrX2QfNgjNHkyZN1+eWXq1evXpLIsVVs27ZNkZGRcjgcuvvuu/X222+rR48e5NciFi5cqE8++URZWVnV9pHj4DdgwAC9+uqrWrlypV566SUVFBQoJSVF3377Lfm1iC+//FJz5sxRUlKSVq5cqbvvvlsPPPCAXn31VUl8jhuqRaADsAqbzea1bYyp1nYqX/qgad13333aunWrPvjgg2r7yHFwS05O1pYtW3T48GG99dZbGjNmjPLy8jz7yW/w2rdvnyZOnKhVq1apZcuWtfYjx8Fr6NChnr/37t1bAwcO1HnnnacFCxbosssuk0R+g53b7Vb//v2VmZkpSerbt6+2b9+uOXPm6Ne//rWnH3muH2b4G6jqKgGn/uZYWFjo+S00NjZW5eXlKioqqrUPAu/+++/XkiVL9N5776lTp06ednJsDWFhYTr//PPVv39/ZWVl6aKLLtKzzz5Lfi1g8+bNKiwsVL9+/dSiRQu1aNFCeXl5eu6559SiRQtPjsixdbRq1Uq9e/fWrl27+AxbRFxcnHr06OHVduGFF2rv3r2S+FncUBT8DZSYmKjY2Fjl5uZ62srLy5WXl6eUlBRJUr9+/WS32736HDx4UP/+9789fRA4xhjdd999WrRokdasWaPExESv/eTYmowxKisrI78WcPXVV2vbtm3asmWL59G/f3/dcsst2rJli7p160aOLaasrEyfffaZ4uLi+AxbxKBBg6pdEvuLL75Qly5dJPGzuMEC8EXhoFNSUmI+/fRT8+mnnxpJZtasWebTTz81+fn5xhhjZsyYYVq3bm0WLVpktm3bZn71q1+ZuLg4U1xc7Bnj7rvvNp06dTLvvvuu+eSTT8xVV11lLrroIlNRURGo08IP7rnnHtO6dWuzdu1ac/DgQc+jtLTU04ccB7epU6eadevWmd27d5utW7eahx9+2ISEhJhVq1YZY8ivFZ18lR5jyHGw++1vf2vWrl1rvvzyS7NhwwYzbNgwExUVZfbs2WOMIb9W8PHHH5sWLVqYJ554wuzatcu8/vrrJiIiwrz22muePuS5/ij4ffDee+8ZSdUeY8aMMcZ8f6mo6dOnm9jYWONwOMyVV15ptm3b5jXG8ePHzX333WfatWtnwsPDzbBhw8zevXsDcDY4VU25lWTmzZvn6UOOg9sdd9xhunTpYsLCwkzHjh3N1Vdf7Sn2jSG/VnRqwU+Og9tNN91k4uLijN1uN/Hx8WbUqFFm+/btnv3k1xqWLl1qevXqZRwOh7ngggvM3LlzvfaT5/qzGWNMYP5vAQAAAEBjYw0/AAAAYGEU/AAAAICFUfADAAAAFkbBDw
AAAFgYBT8AAABgYRT8AAAAgIVR8AMAAAAWRsEPAAAAWBgFPwCcBWw2mxYvXhzoMAAAAUDBDwAWUFBQoPvvv1/dunWTw+FQQkKChg8frtWrVwc6tDMaO3asRo4cGegwAMCyWgQ6AABAw+zZs0eDBg1SmzZtNHPmTPXp00cul0srV67Uvffeq88//7xRjlteXq6wsLBGGbs+mls8ANBcMMMPAEFuwoQJstls+vjjj3XDDTeoe/fu6tmzpyZPnqwNGzZ4+h06dEi/+MUvFBERoaSkJC1ZssSzr7KyUnfeeacSExMVHh6u5ORkPfvss17HqZqJz8rKUnx8vLp37y5Jeu2119S/f39FRUUpNjZWo0ePVmFhoddzt2/fruuuu07R0dGKiorSFVdcof/+979yOp1asGCB/vGPf8hms8lms2nt2rWSpP379+umm25S27Zt1b59e40YMUJ79uw5YzwvvPCCkpKS1LJlS8XExOiGG27w58sNAEGHGX4ACGLfffedVqxYoSeeeEKtWrWqtr9Nmzaev2dkZGjmzJl66qmn9Kc//Um33HKL8vPz1a5dO7ndbnXq1ElvvvmmOnTooPXr1+uuu+5SXFycfvnLX3rGWL16taKjo5WbmytjjKTvZ9b/8Ic/KDk5WYWFhXrwwQc1duxYLVu2TNL3hfuVV16pwYMHa82aNYqOjtaHH36oiooK/e53v9Nnn32m4uJizZs3T5LUrl07lZaWasiQIbriiiu0bt06tWjRQo8//rh+9rOfaevWrZ6Z/FPj2bRpkx544AH95S9/UUpKir777ju9//77jfXyA0BwMACAoPXPf/7TSDKLFi06bT9J5pFHHvFsHz161NhsNrN8+fJanzNhwgRz/fXXe7bHjBljYmJiTFlZ2WmP9fHHHxtJpqSkxBhjzNSpU01iYqIpLy+vsf+YMWPMiBEjvNpefvllk5ycbNxut6etrKzMhIeHm5UrV9Yaz1tvvWWio6NNcXHxaWMEgLMJS3oAIIiZH2bZbTbbGfv26dPH8/dWrVopKirKa+nNiy++qP79+6tjx46KjIzUSy+9pL1793qN0bt372rr5D/99FONGDFCXbp0UVRUlAYPHixJnudu2bJFV1xxhex2u8/ntXnzZv3nP/9RVFSUIiMjFRkZqXbt2unEiRP673//W2s8aWlp6tKli7p166bbbrtNr7/+ukpLS30+LgBYEQU/AASxpKQk2Ww2ffbZZ2fse2rBbbPZ5Ha7JUlvvvmmHnzwQd1xxx1atWqVtmzZottvv13l5eVezzl12dCxY8eUnp6uyMhIvfbaa9q4caPefvttSfI8Nzw8vM7n5Xa71a9fP23ZssXr8cUXX2j06NG1xhMVFaVPPvlEb7zxhuLi4vTYY4/poosu0uHDh+scAwBYBQU/AASxdu3a6ac//amef/55HTt2rNp+Xwvd999/XykpKZowYYL69u2r888/32smvTaff/65Dh06pBkzZuiKK67QBRdcUO0Lu3369NH7778vl8tV4xhhYWGqrKz0avvJT36iXbt26ZxzztH555/v9WjduvVpY2rRooWuueYazZw5U1u3btWePXu0Zs2aM54LAFgVBT8ABLkXXnhBlZWVuvTSS/XWW29p165d+uyzz/Tcc89p4MCBPo1x/vnna9OmTVq5cqW++OILPfroo9q4ceMZn9e5c2eFhYXpT3/6k7788kstWbJEf/jDH7z63HfffSouLtbNN9+sTZs2adeuXfrLX/6inTt3SpK6du2qrVu3aufOnTp06JBcLpduueUWdejQQSNGjND777+v3bt3Ky8vTxMnTtRXX31VazzvvPOOnnvuOW3ZskX5+fl69dVX5Xa7lZyc7NPrAABWRMEPAEEuMTFRn3zyiYYMGaLf/va36tWrl9LS0rR69WrNmTPHpzHuvvtujRo1SjfddJMGDBigb7/9VhMmTDjj8zp27Kj58+frb3/7m3r06KEZM2bo6aef9urTvn17rVmzRk
ePHlVqaqr69eunl156ybPEaNy4cUpOTvZ8f+DDDz9URESE1q1bp86dO2vUqFG68MILdccdd+j48eOKjo6uNZ42bdpo0aJFuuqqq3ThhRfqxRdf1BtvvKGePXv69DoAgBXZTNU3vgAAAABYDjP8AAAAgIVR8AMAAAAWRsEPAAAAWBgFPwAAAGBhFPwAAACAhVHwAwAAABZGwQ8AAABYGAU/AAAAYGEU/AAAAICFUfADAAAAFkbBDwAAAFjY/wfznNHMialmyAAAAABJRU5ErkJggg==",
|
| 259 |
+
"text/plain": [
|
| 260 |
+
"<Figure size 900x400 with 1 Axes>"
|
| 261 |
+
]
|
| 262 |
+
},
|
| 263 |
+
"metadata": {},
|
| 264 |
+
"output_type": "display_data"
|
| 265 |
+
}
|
| 266 |
+
],
|
| 267 |
+
"source": [
|
| 268 |
+
"# Visualize text-length distributions by class\n",
|
| 269 |
+
"df['text_len'] = df['text'].str.len()\n",
|
| 270 |
+
"ax = df[df['label'] == 0]['text_len'].hist(bins=40, alpha=0.6, label='Human', figsize=(9, 4))\n",
|
| 271 |
+
"df[df['label'] == 1]['text_len'].hist(bins=40, alpha=0.6, label='AI', ax=ax)\n",
|
| 272 |
+
"ax.set_title('Text Length Distribution')\n",
|
| 273 |
+
"ax.set_xlabel('Characters')\n",
|
| 274 |
+
"ax.set_ylabel('Count')\n",
|
| 275 |
+
"ax.legend()"
|
| 276 |
+
]
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"cell_type": "code",
|
| 280 |
+
"execution_count": 6,
|
| 281 |
+
"id": "59fe88ce",
|
| 282 |
+
"metadata": {
|
| 283 |
+
"id": "59fe88ce"
|
| 284 |
+
},
|
| 285 |
+
"outputs": [
|
| 286 |
+
{
|
| 287 |
+
"data": {
|
| 288 |
+
"text/html": [
|
| 289 |
+
"<div>\n",
|
| 290 |
+
"<style scoped>\n",
|
| 291 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 292 |
+
" vertical-align: middle;\n",
|
| 293 |
+
" }\n",
|
| 294 |
+
"\n",
|
| 295 |
+
" .dataframe tbody tr th {\n",
|
| 296 |
+
" vertical-align: top;\n",
|
| 297 |
+
" }\n",
|
| 298 |
+
"\n",
|
| 299 |
+
" .dataframe thead th {\n",
|
| 300 |
+
" text-align: right;\n",
|
| 301 |
+
" }\n",
|
| 302 |
+
"</style>\n",
|
| 303 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 304 |
+
" <thead>\n",
|
| 305 |
+
" <tr style=\"text-align: right;\">\n",
|
| 306 |
+
" <th></th>\n",
|
| 307 |
+
" <th>text</th>\n",
|
| 308 |
+
" <th>label</th>\n",
|
| 309 |
+
" </tr>\n",
|
| 310 |
+
" </thead>\n",
|
| 311 |
+
" <tbody>\n",
|
| 312 |
+
" <tr>\n",
|
| 313 |
+
" <th>0</th>\n",
|
| 314 |
+
" <td>हामीले पार्टी एकतापछि कि दुबै पार्टीको सिद्धान...</td>\n",
|
| 315 |
+
" <td>0</td>\n",
|
| 316 |
+
" </tr>\n",
|
| 317 |
+
" <tr>\n",
|
| 318 |
+
" <th>1</th>\n",
|
| 319 |
+
" <td>एमाले प्रतिनिधिसभाको प्रत्यक्षतर्फ ८० समानुपात...</td>\n",
|
| 320 |
+
" <td>0</td>\n",
|
| 321 |
+
" </tr>\n",
|
| 322 |
+
" <tr>\n",
|
| 323 |
+
" <th>2</th>\n",
|
| 324 |
+
" <td>नेकपा माओवादी केन्द्रका नेता रामनारायण विडारील...</td>\n",
|
| 325 |
+
" <td>1</td>\n",
|
| 326 |
+
" </tr>\n",
|
| 327 |
+
" <tr>\n",
|
| 328 |
+
" <th>3</th>\n",
|
| 329 |
+
" <td>प्रदेश नं २ का मुख्यमन्त्रीको रूपमा संघीय समाज...</td>\n",
|
| 330 |
+
" <td>1</td>\n",
|
| 331 |
+
" </tr>\n",
|
| 332 |
+
" <tr>\n",
|
| 333 |
+
" <th>4</th>\n",
|
| 334 |
+
" <td>बिहीबार एमालेका अध्यक्ष केपी शर्मा ओली र माओवा...</td>\n",
|
| 335 |
+
" <td>0</td>\n",
|
| 336 |
+
" </tr>\n",
|
| 337 |
+
" </tbody>\n",
|
| 338 |
+
"</table>\n",
|
| 339 |
+
"</div>"
|
| 340 |
+
],
|
| 341 |
+
"text/plain": [
|
| 342 |
+
" text label\n",
|
| 343 |
+
"0 हामीले पार्टी एकतापछि कि दुबै पार्टीको सिद्धान... 0\n",
|
| 344 |
+
"1 एमाले प्रतिनिधिसभाको प्रत्यक्षतर्फ ८० समानुपात... 0\n",
|
| 345 |
+
"2 नेकपा माओवादी केन्द्रका नेता रामनारायण विडारील... 1\n",
|
| 346 |
+
"3 प्रदेश नं २ का मुख्यमन्त्रीको रूपमा संघीय समाज... 1\n",
|
| 347 |
+
"4 बिहीबार एमालेका अध्यक्ष केपी शर्मा ओली र माओवा... 0"
|
| 348 |
+
]
|
| 349 |
+
},
|
| 350 |
+
"execution_count": 6,
|
| 351 |
+
"metadata": {},
|
| 352 |
+
"output_type": "execute_result"
|
| 353 |
+
}
|
| 354 |
+
],
|
| 355 |
+
"source": [
|
| 356 |
+
"# Keep only columns needed for training\n",
|
| 357 |
+
"df = df[['text', 'label']].copy()\n",
|
| 358 |
+
"df.head()"
|
| 359 |
+
]
|
| 360 |
+
},
|
| 361 |
+
{
|
| 362 |
+
"cell_type": "code",
|
| 363 |
+
"execution_count": 7,
|
| 364 |
+
"id": "434df9a2",
|
| 365 |
+
"metadata": {
|
| 366 |
+
"id": "434df9a2"
|
| 367 |
+
},
|
| 368 |
+
"outputs": [
|
| 369 |
+
{
|
| 370 |
+
"name": "stdout",
|
| 371 |
+
"output_type": "stream",
|
| 372 |
+
"text": [
|
| 373 |
+
"Using model: distilbert-base-multilingual-cased\n"
|
| 374 |
+
]
|
| 375 |
+
}
|
| 376 |
+
],
|
| 377 |
+
"source": [
|
| 378 |
+
"# Model/tokenizer config (smaller multilingual model for low-VRAM GPU)\n",
|
| 379 |
+
"MODEL_NAME = 'distilbert-base-multilingual-cased'\n",
|
| 380 |
+
"MAX_LEN = 96\n",
|
| 381 |
+
"\n",
|
| 382 |
+
"tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
|
| 383 |
+
"print('Using model:', MODEL_NAME)"
|
| 384 |
+
]
|
| 385 |
+
},
|
| 386 |
+
{
|
| 387 |
+
"cell_type": "code",
|
| 388 |
+
"execution_count": 8,
|
| 389 |
+
"id": "ef7d53f9",
|
| 390 |
+
"metadata": {
|
| 391 |
+
"id": "ef7d53f9"
|
| 392 |
+
},
|
| 393 |
+
"outputs": [],
|
| 394 |
+
"source": [
|
| 395 |
+
"class NepaliDataset(Dataset):\n",
|
| 396 |
+
" def __init__(self, texts, labels):\n",
|
| 397 |
+
" self.texts = texts\n",
|
| 398 |
+
" self.labels = labels\n",
|
| 399 |
+
"\n",
|
| 400 |
+
" def __len__(self):\n",
|
| 401 |
+
" return len(self.texts)\n",
|
| 402 |
+
"\n",
|
| 403 |
+
" def __getitem__(self, idx):\n",
|
| 404 |
+
" return {\n",
|
| 405 |
+
" 'text': self.texts[idx],\n",
|
| 406 |
+
" 'label': int(self.labels[idx]),\n",
|
| 407 |
+
" }"
|
| 408 |
+
]
|
| 409 |
+
},
|
| 410 |
+
{
|
| 411 |
+
"cell_type": "code",
|
| 412 |
+
"execution_count": 9,
|
| 413 |
+
"id": "134a3fc1",
|
| 414 |
+
"metadata": {
|
| 415 |
+
"id": "134a3fc1"
|
| 416 |
+
},
|
| 417 |
+
"outputs": [
|
| 418 |
+
{
|
| 419 |
+
"name": "stdout",
|
| 420 |
+
"output_type": "stream",
|
| 421 |
+
"text": [
|
| 422 |
+
"Train: 1588 | Val: 398\n"
|
| 423 |
+
]
|
| 424 |
+
}
|
| 425 |
+
],
|
| 426 |
+
"source": [
|
| 427 |
+
"# Train/Validation Split\n",
|
| 428 |
+
"train_texts, val_texts, train_labels, val_labels = train_test_split(\n",
|
| 429 |
+
" df['text'].tolist(),\n",
|
| 430 |
+
" df['label'].tolist(),\n",
|
| 431 |
+
" test_size=0.2,\n",
|
| 432 |
+
" random_state=42,\n",
|
| 433 |
+
" stratify=df['label'].tolist(),\n",
|
| 434 |
+
")\n",
|
| 435 |
+
"print(f'Train: {len(train_texts)} | Val: {len(val_texts)}')"
|
| 436 |
+
]
|
| 437 |
+
},
|
| 438 |
+
{
|
| 439 |
+
"cell_type": "code",
|
| 440 |
+
"execution_count": 10,
|
| 441 |
+
"id": "dd226ed1",
|
| 442 |
+
"metadata": {
|
| 443 |
+
"id": "dd226ed1"
|
| 444 |
+
},
|
| 445 |
+
"outputs": [
|
| 446 |
+
{
|
| 447 |
+
"name": "stdout",
|
| 448 |
+
"output_type": "stream",
|
| 449 |
+
"text": [
|
| 450 |
+
"Batch size: 2 | Max length: 96\n"
|
| 451 |
+
]
|
| 452 |
+
}
|
| 453 |
+
],
|
| 454 |
+
"source": [
|
| 455 |
+
"train_dataset = NepaliDataset(train_texts, train_labels)\n",
|
| 456 |
+
"val_dataset = NepaliDataset(val_texts, val_labels)\n",
|
| 457 |
+
"\n",
|
| 458 |
+
"def collate_batch(batch):\n",
|
| 459 |
+
" texts = [item['text'] for item in batch]\n",
|
| 460 |
+
" labels = torch.tensor([item['label'] for item in batch], dtype=torch.long)\n",
|
| 461 |
+
" enc = tokenizer(\n",
|
| 462 |
+
" texts,\n",
|
| 463 |
+
" padding=True,\n",
|
| 464 |
+
" truncation=True,\n",
|
| 465 |
+
" max_length=MAX_LEN,\n",
|
| 466 |
+
" return_tensors='pt',\n",
|
| 467 |
+
" )\n",
|
| 468 |
+
" return {\n",
|
| 469 |
+
" 'input_ids': enc['input_ids'],\n",
|
| 470 |
+
" 'attention_mask': enc['attention_mask'],\n",
|
| 471 |
+
" 'labels': labels,\n",
|
| 472 |
+
" }\n",
|
| 473 |
+
"\n",
|
| 474 |
+
"BATCH_SIZE = 2\n",
|
| 475 |
+
"train_loader = DataLoader(\n",
|
| 476 |
+
" train_dataset,\n",
|
| 477 |
+
" batch_size=BATCH_SIZE,\n",
|
| 478 |
+
" shuffle=True,\n",
|
| 479 |
+
" collate_fn=collate_batch,\n",
|
| 480 |
+
" pin_memory=(torch.cuda.is_available()),\n",
|
| 481 |
+
")\n",
|
| 482 |
+
"val_loader = DataLoader(\n",
|
| 483 |
+
" val_dataset,\n",
|
| 484 |
+
" batch_size=BATCH_SIZE,\n",
|
| 485 |
+
" shuffle=False,\n",
|
| 486 |
+
" collate_fn=collate_batch,\n",
|
| 487 |
+
" pin_memory=(torch.cuda.is_available()),\n",
|
| 488 |
+
")\n",
|
| 489 |
+
"print('Batch size:', BATCH_SIZE, '| Max length:', MAX_LEN)"
|
| 490 |
+
]
|
| 491 |
+
},
|
| 492 |
+
{
|
| 493 |
+
"cell_type": "code",
|
| 494 |
+
"execution_count": 11,
|
| 495 |
+
"id": "51320951",
|
| 496 |
+
"metadata": {
|
| 497 |
+
"id": "51320951"
|
| 498 |
+
},
|
| 499 |
+
"outputs": [],
|
| 500 |
+
"source": [
|
| 501 |
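+
"# === Model: dropout + 2-way linear head over the first-token (CLS position) hidden state of the MODEL_NAME encoder ===\n",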
|
| 502 |
+
"class IndicBERTClassifier(nn.Module):\n",
|
| 503 |
+
" def __init__(self, dropout=0.2):\n",
|
| 504 |
+
" super(IndicBERTClassifier, self).__init__()\n",
|
| 505 |
+
" self.bert = AutoModel.from_pretrained(MODEL_NAME)\n",
|
| 506 |
+
" if hasattr(self.bert, 'gradient_checkpointing_enable'):\n",
|
| 507 |
+
" self.bert.gradient_checkpointing_enable()\n",
|
| 508 |
+
" self.dropout = nn.Dropout(dropout)\n",
|
| 509 |
+
" self.classifier = nn.Linear(self.bert.config.hidden_size, 2)\n",
|
| 510 |
+
"\n",
|
| 511 |
+
" def forward(self, input_ids, attention_mask):\n",
|
| 512 |
+
" outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n",
|
| 513 |
+
" cls_output = outputs.last_hidden_state[:, 0, :]\n",
|
| 514 |
+
" cls_output = self.dropout(cls_output)\n",
|
| 515 |
+
" return self.classifier(cls_output)"
|
| 516 |
+
]
|
| 517 |
+
},
|
| 518 |
+
{
|
| 519 |
+
"cell_type": "code",
|
| 520 |
+
"execution_count": 12,
|
| 521 |
+
"id": "944f918e",
|
| 522 |
+
"metadata": {
|
| 523 |
+
"id": "944f918e"
|
| 524 |
+
},
|
| 525 |
+
"outputs": [],
|
| 526 |
+
"source": [
|
| 527 |
+
"# Dataset over pre-tokenized inputs: returns input_ids, attention_mask and labels as tensors\n",
|
| 528 |
+
"class NepaliTextDataset(Dataset):\n",
|
| 529 |
+
" def __init__(self, input_ids, attention_mask, labels):\n",
|
| 530 |
+
" self.input_ids = input_ids\n",
|
| 531 |
+
" self.attention_mask = attention_mask\n",
|
| 532 |
+
" self.labels = labels\n",
|
| 533 |
+
"\n",
|
| 534 |
+
" def __len__(self):\n",
|
| 535 |
+
" return len(self.labels)\n",
|
| 536 |
+
"\n",
|
| 537 |
+
" def __getitem__(self, idx):\n",
|
| 538 |
+
" return {\n",
|
| 539 |
+
" 'input_ids': torch.tensor(self.input_ids[idx]),\n",
|
| 540 |
+
" 'attention_mask': torch.tensor(self.attention_mask[idx]),\n",
|
| 541 |
+
" 'labels': torch.tensor(self.labels[idx])\n",
|
| 542 |
+
" }"
|
| 543 |
+
]
|
| 544 |
+
},
|
| 545 |
+
{
|
| 546 |
+
"cell_type": "code",
|
| 547 |
+
"execution_count": 13,
|
| 548 |
+
"id": "a9d426e1",
|
| 549 |
+
"metadata": {
|
| 550 |
+
"id": "a9d426e1"
|
| 551 |
+
},
|
| 552 |
+
"outputs": [
|
| 553 |
+
{
|
| 554 |
+
"name": "stderr",
|
| 555 |
+
"output_type": "stream",
|
| 556 |
+
"text": [
|
| 557 |
+
"Loading weights: 100%|██████████| 100/100 [00:00<00:00, 11666.08it/s]\n",
|
| 558 |
+
"\u001b[1mDistilBertModel LOAD REPORT\u001b[0m from: distilbert-base-multilingual-cased\n",
|
| 559 |
+
"Key | Status | | \n",
|
| 560 |
+
"------------------------+------------+--+-\n",
|
| 561 |
+
"vocab_layer_norm.bias | UNEXPECTED | | \n",
|
| 562 |
+
"vocab_transform.weight | UNEXPECTED | | \n",
|
| 563 |
+
"vocab_layer_norm.weight | UNEXPECTED | | \n",
|
| 564 |
+
"vocab_transform.bias | UNEXPECTED | | \n",
|
| 565 |
+
"vocab_projector.bias | UNEXPECTED | | \n",
|
| 566 |
+
"\n",
|
| 567 |
+
"\u001b[3mNotes:\n",
|
| 568 |
+
"- UNEXPECTED\u001b[3m\t:can be ignored when loading from different task/architecture; not ok if you expect identical arch.\u001b[0m\n"
|
| 569 |
+
]
|
| 570 |
+
}
|
| 571 |
+
],
|
| 572 |
+
"source": [
|
| 573 |
+
"\n",
|
| 574 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
| 575 |
+
"model = IndicBERTClassifier().to(device)"
|
| 576 |
+
]
|
| 577 |
+
},
|
| 578 |
+
{
|
| 579 |
+
"cell_type": "code",
|
| 580 |
+
"execution_count": 14,
|
| 581 |
+
"id": "2740c14a",
|
| 582 |
+
"metadata": {
|
| 583 |
+
"id": "2740c14a"
|
| 584 |
+
},
|
| 585 |
+
"outputs": [
|
| 586 |
+
{
|
| 587 |
+
"name": "stdout",
|
| 588 |
+
"output_type": "stream",
|
| 589 |
+
"text": [
|
| 590 |
+
"Grad accumulation steps: 4\n"
|
| 591 |
+
]
|
| 592 |
+
}
|
| 593 |
+
],
|
| 594 |
+
"source": [
|
| 595 |
+
"# === Optimizer, Scheduler & Loss ===\n",
|
| 596 |
+
"optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)\n",
|
| 597 |
+
"loss_fn = nn.CrossEntropyLoss()\n",
|
| 598 |
+
"\n",
|
| 599 |
+
"max_epochs = 6\n",
|
| 600 |
+
"grad_accum_steps = 4 # effective batch = BATCH_SIZE * grad_accum_steps\n",
|
| 601 |
+
"steps_per_epoch = math.ceil(len(train_loader) / grad_accum_steps)\n",
|
| 602 |
+
"total_steps = steps_per_epoch * max_epochs\n",
|
| 603 |
+
"warmup_steps = int(0.1 * total_steps)\n",
|
| 604 |
+
"scheduler = get_linear_schedule_with_warmup(\n",
|
| 605 |
+
" optimizer,\n",
|
| 606 |
+
" num_warmup_steps=warmup_steps,\n",
|
| 607 |
+
" num_training_steps=total_steps,\n",
|
| 608 |
+
")\n",
|
| 609 |
+
"print('Grad accumulation steps:', grad_accum_steps)"
|
| 610 |
+
]
|
| 611 |
+
},
|
| 612 |
+
{
|
| 613 |
+
"cell_type": "code",
|
| 614 |
+
"execution_count": 15,
|
| 615 |
+
"id": "14ce04bd",
|
| 616 |
+
"metadata": {
|
| 617 |
+
"id": "14ce04bd"
|
| 618 |
+
},
|
| 619 |
+
"outputs": [],
|
| 620 |
+
"source": [
|
| 621 |
+
"# === Training Loop (one optimizer step per batch; does not use scheduler or grad_accum_steps) ===\n",
|
| 622 |
+
"def train(model, loader):\n",
|
| 623 |
+
" model.train()\n",
|
| 624 |
+
" total_loss = 0\n",
|
| 625 |
+
" for batch in loader:\n",
|
| 626 |
+
" input_ids = batch['input_ids'].to(device)\n",
|
| 627 |
+
" attention_mask = batch['attention_mask'].to(device)\n",
|
| 628 |
+
" labels = batch['labels'].to(device)\n",
|
| 629 |
+
"\n",
|
| 630 |
+
" optimizer.zero_grad()\n",
|
| 631 |
+
" outputs = model(input_ids, attention_mask)\n",
|
| 632 |
+
" loss = loss_fn(outputs, labels)\n",
|
| 633 |
+
" loss.backward()\n",
|
| 634 |
+
" optimizer.step()\n",
|
| 635 |
+
" total_loss += loss.item()\n",
|
| 636 |
+
" return total_loss / len(loader)\n",
|
| 637 |
+
"\n",
|
| 638 |
+
"# === Evaluation ===\n",
|
| 639 |
+
"def evaluate(model, loader):\n",
|
| 640 |
+
" model.eval()\n",
|
| 641 |
+
" preds, true = [], []\n",
|
| 642 |
+
" with torch.no_grad():\n",
|
| 643 |
+
" for batch in loader:\n",
|
| 644 |
+
" input_ids = batch['input_ids'].to(device)\n",
|
| 645 |
+
" attention_mask = batch['attention_mask'].to(device)\n",
|
| 646 |
+
" labels = batch['labels'].to(device)\n",
|
| 647 |
+
"\n",
|
| 648 |
+
" outputs = model(input_ids, attention_mask)\n",
|
| 649 |
+
" pred_labels = torch.argmax(outputs, dim=1)\n",
|
| 650 |
+
" preds.extend(pred_labels.cpu().numpy())\n",
|
| 651 |
+
" true.extend(labels.cpu().numpy())\n",
|
| 652 |
+
"\n",
|
| 653 |
+
" print(classification_report(true, preds, target_names=[\"Human\", \"AI\"]))\n"
|
| 654 |
+
]
|
| 655 |
+
},
|
| 656 |
+
{
|
| 657 |
+
"cell_type": "code",
|
| 658 |
+
"execution_count": null,
|
| 659 |
+
"id": "d24e91b7",
|
| 660 |
+
"metadata": {
|
| 661 |
+
"colab": {
|
| 662 |
+
"base_uri": "https://localhost:8080/"
|
| 663 |
+
},
|
| 664 |
+
"id": "d24e91b7",
|
| 665 |
+
"outputId": "33ef8227-5c71-4c0d-88e7-b1a9e30b45f4"
|
| 666 |
+
},
|
| 667 |
+
"outputs": [
|
| 668 |
+
{
|
| 669 |
+
"name": "stdout",
|
| 670 |
+
"output_type": "stream",
|
| 671 |
+
"text": [
|
| 672 |
+
"\n",
|
| 673 |
+
"Epoch 1/6\n"
|
| 674 |
+
]
|
| 675 |
+
},
|
| 676 |
+
{
|
| 677 |
+
"name": "stderr",
|
| 678 |
+
"output_type": "stream",
|
| 679 |
+
"text": [
|
| 680 |
+
"/tmp/ipykernel_155548/4183901742.py:4: FutureWarning: `torch.cuda.amp.GradScaler(args...)` is deprecated. Please use `torch.amp.GradScaler('cuda', args...)` instead.\n",
|
| 681 |
+
" scaler = GradScaler(enabled=use_amp)\n",
|
| 682 |
+
"/tmp/ipykernel_155548/4183901742.py:17: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
|
| 683 |
+
" with autocast(enabled=use_amp):\n"
|
| 684 |
+
]
|
| 685 |
+
},
|
| 686 |
+
{
|
| 687 |
+
"name": "stdout",
|
| 688 |
+
"output_type": "stream",
|
| 689 |
+
"text": [
|
| 690 |
+
"Batch 0 | Loss: 0.8206\n",
|
| 691 |
+
"Batch 50 | Loss: 0.8677\n",
|
| 692 |
+
"Batch 100 | Loss: 0.8435\n",
|
| 693 |
+
"Batch 150 | Loss: 0.6523\n",
|
| 694 |
+
"Batch 200 | Loss: 0.7219\n",
|
| 695 |
+
"Batch 250 | Loss: 0.5793\n",
|
| 696 |
+
"Batch 300 | Loss: 0.6833\n",
|
| 697 |
+
"Batch 350 | Loss: 0.5742\n",
|
| 698 |
+
"Batch 400 | Loss: 0.4844\n",
|
| 699 |
+
"Batch 450 | Loss: 0.5671\n",
|
| 700 |
+
"Batch 500 | Loss: 0.5363\n",
|
| 701 |
+
"Batch 550 | Loss: 0.5386\n",
|
| 702 |
+
"Batch 600 | Loss: 0.5520\n",
|
| 703 |
+
"Batch 650 | Loss: 0.7692\n",
|
| 704 |
+
"Batch 700 | Loss: 0.4680\n",
|
| 705 |
+
"Batch 750 | Loss: 0.6353\n",
|
| 706 |
+
"Train | Loss: 0.6600 | Acc: 0.5913 | F1: 0.5895\n"
|
| 707 |
+
]
|
| 708 |
+
},
|
| 709 |
+
{
|
| 710 |
+
"name": "stderr",
|
| 711 |
+
"output_type": "stream",
|
| 712 |
+
"text": [
|
| 713 |
+
"/tmp/ipykernel_155548/4183901742.py:55: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
|
| 714 |
+
" with autocast(enabled=use_amp):\n"
|
| 715 |
+
]
|
| 716 |
+
},
|
| 717 |
+
{
|
| 718 |
+
"name": "stdout",
|
| 719 |
+
"output_type": "stream",
|
| 720 |
+
"text": [
|
| 721 |
+
"Validation | Loss: 0.5192 | Acc: 0.8015 | F1: 0.7812\n",
|
| 722 |
+
" precision recall f1-score support\n",
|
| 723 |
+
"\n",
|
| 724 |
+
" Human 0.75 0.90 0.82 198\n",
|
| 725 |
+
" AI 0.88 0.70 0.78 200\n",
|
| 726 |
+
"\n",
|
| 727 |
+
" accuracy 0.80 398\n",
|
| 728 |
+
" macro avg 0.81 0.80 0.80 398\n",
|
| 729 |
+
"weighted avg 0.81 0.80 0.80 398\n",
|
| 730 |
+
"\n",
|
| 731 |
+
"Saved improved checkpoint: model_best.pth\n",
|
| 732 |
+
"\n",
|
| 733 |
+
"Epoch 2/6\n"
|
| 734 |
+
]
|
| 735 |
+
},
|
| 736 |
+
{
|
| 737 |
+
"name": "stderr",
|
| 738 |
+
"output_type": "stream",
|
| 739 |
+
"text": [
|
| 740 |
+
"/tmp/ipykernel_155548/4183901742.py:17: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
|
| 741 |
+
" with autocast(enabled=use_amp):\n"
|
| 742 |
+
]
|
| 743 |
+
},
|
| 744 |
+
{
|
| 745 |
+
"name": "stdout",
|
| 746 |
+
"output_type": "stream",
|
| 747 |
+
"text": [
|
| 748 |
+
"Batch 0 | Loss: 0.6078\n",
|
| 749 |
+
"Batch 50 | Loss: 1.1135\n",
|
| 750 |
+
"Batch 100 | Loss: 0.3297\n",
|
| 751 |
+
"Batch 150 | Loss: 0.8473\n",
|
| 752 |
+
"Batch 200 | Loss: 0.9326\n",
|
| 753 |
+
"Batch 250 | Loss: 0.5112\n",
|
| 754 |
+
"Batch 300 | Loss: 0.1645\n",
|
| 755 |
+
"Batch 350 | Loss: 0.2250\n",
|
| 756 |
+
"Batch 400 | Loss: 0.7142\n",
|
| 757 |
+
"Batch 450 | Loss: 0.3741\n",
|
| 758 |
+
"Batch 500 | Loss: 0.3084\n",
|
| 759 |
+
"Batch 550 | Loss: 0.1472\n",
|
| 760 |
+
"Batch 600 | Loss: 0.0679\n",
|
| 761 |
+
"Batch 650 | Loss: 0.1234\n",
|
| 762 |
+
"Batch 700 | Loss: 1.1370\n",
|
| 763 |
+
"Batch 750 | Loss: 0.8843\n",
|
| 764 |
+
"Train | Loss: 0.4817 | Acc: 0.7720 | F1: 0.7665\n"
|
| 765 |
+
]
|
| 766 |
+
},
|
| 767 |
+
{
|
| 768 |
+
"name": "stderr",
|
| 769 |
+
"output_type": "stream",
|
| 770 |
+
"text": [
|
| 771 |
+
"/tmp/ipykernel_155548/4183901742.py:55: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
|
| 772 |
+
" with autocast(enabled=use_amp):\n"
|
| 773 |
+
]
|
| 774 |
+
},
|
| 775 |
+
{
|
| 776 |
+
"name": "stdout",
|
| 777 |
+
"output_type": "stream",
|
| 778 |
+
"text": [
|
| 779 |
+
"Validation | Loss: 0.3708 | Acc: 0.8417 | F1: 0.8225\n",
|
| 780 |
+
" precision recall f1-score support\n",
|
| 781 |
+
"\n",
|
| 782 |
+
" Human 0.78 0.95 0.86 198\n",
|
| 783 |
+
" AI 0.94 0.73 0.82 200\n",
|
| 784 |
+
"\n",
|
| 785 |
+
" accuracy 0.84 398\n",
|
| 786 |
+
" macro avg 0.86 0.84 0.84 398\n",
|
| 787 |
+
"weighted avg 0.86 0.84 0.84 398\n",
|
| 788 |
+
"\n",
|
| 789 |
+
"Saved improved checkpoint: model_best.pth\n",
|
| 790 |
+
"\n",
|
| 791 |
+
"Epoch 3/6\n"
|
| 792 |
+
]
|
| 793 |
+
},
|
| 794 |
+
{
|
| 795 |
+
"name": "stderr",
|
| 796 |
+
"output_type": "stream",
|
| 797 |
+
"text": [
|
| 798 |
+
"/tmp/ipykernel_155548/4183901742.py:17: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
|
| 799 |
+
" with autocast(enabled=use_amp):\n"
|
| 800 |
+
]
|
| 801 |
+
},
|
| 802 |
+
{
|
| 803 |
+
"name": "stdout",
|
| 804 |
+
"output_type": "stream",
|
| 805 |
+
"text": [
|
| 806 |
+
"Batch 0 | Loss: 0.0415\n",
|
| 807 |
+
"Batch 50 | Loss: 0.0845\n",
|
| 808 |
+
"Batch 100 | Loss: 0.0336\n",
|
| 809 |
+
"Batch 150 | Loss: 0.6389\n",
|
| 810 |
+
"Batch 200 | Loss: 1.6021\n",
|
| 811 |
+
"Batch 250 | Loss: 0.0696\n",
|
| 812 |
+
"Batch 300 | Loss: 0.5184\n",
|
| 813 |
+
"Batch 350 | Loss: 0.0569\n",
|
| 814 |
+
"Batch 400 | Loss: 0.8119\n",
|
| 815 |
+
"Batch 450 | Loss: 1.5121\n",
|
| 816 |
+
"Batch 500 | Loss: 0.0330\n",
|
| 817 |
+
"Batch 550 | Loss: 0.0208\n",
|
| 818 |
+
"Batch 600 | Loss: 1.1329\n",
|
| 819 |
+
"Batch 650 | Loss: 0.7745\n",
|
| 820 |
+
"Batch 700 | Loss: 0.0740\n",
|
| 821 |
+
"Batch 750 | Loss: 1.4907\n",
|
| 822 |
+
"Train | Loss: 0.3830 | Acc: 0.8495 | F1: 0.8488\n"
|
| 823 |
+
]
|
| 824 |
+
},
|
| 825 |
+
{
|
| 826 |
+
"name": "stderr",
|
| 827 |
+
"output_type": "stream",
|
| 828 |
+
"text": [
|
| 829 |
+
"/tmp/ipykernel_155548/4183901742.py:55: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
|
| 830 |
+
" with autocast(enabled=use_amp):\n"
|
| 831 |
+
]
|
| 832 |
+
},
|
| 833 |
+
{
|
| 834 |
+
"name": "stdout",
|
| 835 |
+
"output_type": "stream",
|
| 836 |
+
"text": [
|
| 837 |
+
"Validation | Loss: 0.3527 | Acc: 0.8668 | F1: 0.8515\n",
|
| 838 |
+
" precision recall f1-score support\n",
|
| 839 |
+
"\n",
|
| 840 |
+
" Human 0.80 0.97 0.88 198\n",
|
| 841 |
+
" AI 0.97 0.76 0.85 200\n",
|
| 842 |
+
"\n",
|
| 843 |
+
" accuracy 0.87 398\n",
|
| 844 |
+
" macro avg 0.88 0.87 0.87 398\n",
|
| 845 |
+
"weighted avg 0.88 0.87 0.87 398\n",
|
| 846 |
+
"\n",
|
| 847 |
+
"Saved improved checkpoint: model_best.pth\n",
|
| 848 |
+
"\n",
|
| 849 |
+
"Epoch 4/6\n"
|
| 850 |
+
]
|
| 851 |
+
},
|
| 852 |
+
{
|
| 853 |
+
"name": "stderr",
|
| 854 |
+
"output_type": "stream",
|
| 855 |
+
"text": [
|
| 856 |
+
"/tmp/ipykernel_155548/4183901742.py:17: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
|
| 857 |
+
" with autocast(enabled=use_amp):\n"
|
| 858 |
+
]
|
| 859 |
+
},
|
| 860 |
+
{
|
| 861 |
+
"name": "stdout",
|
| 862 |
+
"output_type": "stream",
|
| 863 |
+
"text": [
|
| 864 |
+
"Batch 0 | Loss: 1.2321\n",
|
| 865 |
+
"Batch 50 | Loss: 0.0369\n",
|
| 866 |
+
"Batch 100 | Loss: 0.0161\n",
|
| 867 |
+
"Batch 150 | Loss: 0.2000\n",
|
| 868 |
+
"Batch 200 | Loss: 0.0035\n",
|
| 869 |
+
"Batch 250 | Loss: 2.3207\n",
|
| 870 |
+
"Batch 300 | Loss: 0.0022\n",
|
| 871 |
+
"Batch 350 | Loss: 2.2738\n",
|
| 872 |
+
"Batch 400 | Loss: 0.0011\n",
|
| 873 |
+
"Batch 450 | Loss: 0.0075\n",
|
| 874 |
+
"Batch 500 | Loss: 2.4454\n",
|
| 875 |
+
"Batch 550 | Loss: 0.3863\n",
|
| 876 |
+
"Batch 600 | Loss: 0.0038\n",
|
| 877 |
+
"Batch 650 | Loss: 0.0061\n",
|
| 878 |
+
"Batch 700 | Loss: 0.0005\n",
|
| 879 |
+
"Batch 750 | Loss: 0.0182\n",
|
| 880 |
+
"Train | Loss: 0.4209 | Acc: 0.8923 | F1: 0.8903\n"
|
| 881 |
+
]
|
| 882 |
+
},
|
| 883 |
+
{
|
| 884 |
+
"name": "stderr",
|
| 885 |
+
"output_type": "stream",
|
| 886 |
+
"text": [
|
| 887 |
+
"/tmp/ipykernel_155548/4183901742.py:55: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
|
| 888 |
+
" with autocast(enabled=use_amp):\n"
|
| 889 |
+
]
|
| 890 |
+
},
|
| 891 |
+
{
|
| 892 |
+
"name": "stdout",
|
| 893 |
+
"output_type": "stream",
|
| 894 |
+
"text": [
|
| 895 |
+
"Validation | Loss: 0.4601 | Acc: 0.8769 | F1: 0.8831\n",
|
| 896 |
+
" precision recall f1-score support\n",
|
| 897 |
+
"\n",
|
| 898 |
+
" Human 0.92 0.83 0.87 198\n",
|
| 899 |
+
" AI 0.84 0.93 0.88 200\n",
|
| 900 |
+
"\n",
|
| 901 |
+
" accuracy 0.88 398\n",
|
| 902 |
+
" macro avg 0.88 0.88 0.88 398\n",
|
| 903 |
+
"weighted avg 0.88 0.88 0.88 398\n",
|
| 904 |
+
"\n",
|
| 905 |
+
"Saved improved checkpoint: model_best.pth\n",
|
| 906 |
+
"\n",
|
| 907 |
+
"Epoch 5/6\n"
|
| 908 |
+
]
|
| 909 |
+
},
|
| 910 |
+
{
|
| 911 |
+
"name": "stderr",
|
| 912 |
+
"output_type": "stream",
|
| 913 |
+
"text": [
|
| 914 |
+
"/tmp/ipykernel_155548/4183901742.py:17: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
|
| 915 |
+
" with autocast(enabled=use_amp):\n"
|
| 916 |
+
]
|
| 917 |
+
},
|
| 918 |
+
{
|
| 919 |
+
"name": "stdout",
|
| 920 |
+
"output_type": "stream",
|
| 921 |
+
"text": [
|
| 922 |
+
"Batch 0 | Loss: 0.0010\n",
|
| 923 |
+
"Batch 50 | Loss: 0.0061\n",
|
| 924 |
+
"Batch 100 | Loss: 0.0047\n",
|
| 925 |
+
"Batch 150 | Loss: 0.0201\n",
|
| 926 |
+
"Batch 200 | Loss: 0.0023\n",
|
| 927 |
+
"Batch 250 | Loss: 0.0395\n",
|
| 928 |
+
"Batch 300 | Loss: 0.0011\n",
|
| 929 |
+
"Batch 350 | Loss: 0.0002\n",
|
| 930 |
+
"Batch 400 | Loss: 3.2169\n",
|
| 931 |
+
"Batch 450 | Loss: 4.4883\n",
|
| 932 |
+
"Batch 500 | Loss: 0.0002\n",
|
| 933 |
+
"Batch 550 | Loss: 0.0003\n",
|
| 934 |
+
"Batch 600 | Loss: 0.0000\n",
|
| 935 |
+
"Batch 650 | Loss: 0.0002\n",
|
| 936 |
+
"Batch 700 | Loss: 0.0000\n",
|
| 937 |
+
"Batch 750 | Loss: 4.6367\n",
|
| 938 |
+
"Train | Loss: 0.5447 | Acc: 0.9011 | F1: 0.8990\n"
|
| 939 |
+
]
|
| 940 |
+
},
|
| 941 |
+
{
|
| 942 |
+
"name": "stderr",
|
| 943 |
+
"output_type": "stream",
|
| 944 |
+
"text": [
|
| 945 |
+
"/tmp/ipykernel_155548/4183901742.py:55: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
|
| 946 |
+
" with autocast(enabled=use_amp):\n"
|
| 947 |
+
]
|
| 948 |
+
},
|
| 949 |
+
{
|
| 950 |
+
"name": "stdout",
|
| 951 |
+
"output_type": "stream",
|
| 952 |
+
"text": [
|
| 953 |
+
"Validation | Loss: 0.5331 | Acc: 0.9271 | F1: 0.9266\n",
|
| 954 |
+
" precision recall f1-score support\n",
|
| 955 |
+
"\n",
|
| 956 |
+
" Human 0.92 0.94 0.93 198\n",
|
| 957 |
+
" AI 0.94 0.92 0.93 200\n",
|
| 958 |
+
"\n",
|
| 959 |
+
" accuracy 0.93 398\n",
|
| 960 |
+
" macro avg 0.93 0.93 0.93 398\n",
|
| 961 |
+
"weighted avg 0.93 0.93 0.93 398\n",
|
| 962 |
+
"\n",
|
| 963 |
+
"Saved improved checkpoint: model_best.pth\n",
|
| 964 |
+
"\n",
|
| 965 |
+
"Epoch 6/6\n"
|
| 966 |
+
]
|
| 967 |
+
},
|
| 968 |
+
{
|
| 969 |
+
"name": "stderr",
|
| 970 |
+
"output_type": "stream",
|
| 971 |
+
"text": [
|
| 972 |
+
"/tmp/ipykernel_155548/4183901742.py:17: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
|
| 973 |
+
" with autocast(enabled=use_amp):\n"
|
| 974 |
+
]
|
| 975 |
+
},
|
| 976 |
+
{
|
| 977 |
+
"name": "stdout",
|
| 978 |
+
"output_type": "stream",
|
| 979 |
+
"text": [
|
| 980 |
+
"Batch 0 | Loss: 0.0000\n"
|
| 981 |
+
]
|
| 982 |
+
}
|
| 983 |
+
],
|
| 984 |
+
"source": [
|
| 985 |
+
"from torch.cuda.amp import autocast, GradScaler\n",
|
| 986 |
+
"\n",
|
| 987 |
+
"use_amp = device.type == 'cuda'\n",
|
| 988 |
+
"scaler = GradScaler(enabled=use_amp)\n",
|
| 989 |
+
"\n",
|
| 990 |
+
"def train_one_epoch(model, loader):\n",
|
| 991 |
+
" model.train()\n",
|
| 992 |
+
" total_loss = 0.0\n",
|
| 993 |
+
" all_preds, all_true = [], []\n",
|
| 994 |
+
"\n",
|
| 995 |
+
" optimizer.zero_grad(set_to_none=True)\n",
|
| 996 |
+
" for batch_idx, batch in enumerate(loader):\n",
|
| 997 |
+
" input_ids = batch['input_ids'].to(device, non_blocking=True)\n",
|
| 998 |
+
" attention_mask = batch['attention_mask'].to(device, non_blocking=True)\n",
|
| 999 |
+
" labels = batch['labels'].to(device, non_blocking=True)\n",
|
| 1000 |
+
"\n",
|
| 1001 |
+
" with autocast(enabled=use_amp):\n",
|
| 1002 |
+
" logits = model(input_ids, attention_mask=attention_mask)\n",
|
| 1003 |
+
" loss = loss_fn(logits, labels) / grad_accum_steps\n",
|
| 1004 |
+
"\n",
|
| 1005 |
+
" scaler.scale(loss).backward()\n",
|
| 1006 |
+
"\n",
|
| 1007 |
+
" if (batch_idx + 1) % grad_accum_steps == 0 or (batch_idx + 1) == len(loader):\n",
|
| 1008 |
+
" torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)\n",
|
| 1009 |
+
" scaler.step(optimizer)\n",
|
| 1010 |
+
" scaler.update()\n",
|
| 1011 |
+
" scheduler.step()\n",
|
| 1012 |
+
" optimizer.zero_grad(set_to_none=True)\n",
|
| 1013 |
+
"\n",
|
| 1014 |
+
" total_loss += loss.item() * grad_accum_steps\n",
|
| 1015 |
+
" preds = torch.argmax(logits, dim=1)\n",
|
| 1016 |
+
" all_preds.extend(preds.detach().cpu().numpy())\n",
|
| 1017 |
+
" all_true.extend(labels.detach().cpu().numpy())\n",
|
| 1018 |
+
"\n",
|
| 1019 |
+
" if batch_idx % 50 == 0:\n",
|
| 1020 |
+
" print(f'Batch {batch_idx} | Loss: {(loss.item() * grad_accum_steps):.4f}')\n",
|
| 1021 |
+
"\n",
|
| 1022 |
+
" avg_loss = total_loss / max(len(loader), 1)\n",
|
| 1023 |
+
" train_acc = accuracy_score(all_true, all_preds)\n",
|
| 1024 |
+
" train_f1 = f1_score(all_true, all_preds)\n",
|
| 1025 |
+
" return avg_loss, train_acc, train_f1\n",
|
| 1026 |
+
"\n",
|
| 1027 |
+
"\n",
|
| 1028 |
+
"def evaluate(model, loader):\n",
|
| 1029 |
+
" model.eval()\n",
|
| 1030 |
+
" all_preds, all_true = [], []\n",
|
| 1031 |
+
" total_loss = 0.0\n",
|
| 1032 |
+
"\n",
|
| 1033 |
+
" with torch.no_grad():\n",
|
| 1034 |
+
" for batch in loader:\n",
|
| 1035 |
+
" input_ids = batch['input_ids'].to(device, non_blocking=True)\n",
|
| 1036 |
+
" attention_mask = batch['attention_mask'].to(device, non_blocking=True)\n",
|
| 1037 |
+
" labels = batch['labels'].to(device, non_blocking=True)\n",
|
| 1038 |
+
"\n",
|
| 1039 |
+
" with autocast(enabled=use_amp):\n",
|
| 1040 |
+
" logits = model(input_ids, attention_mask=attention_mask)\n",
|
| 1041 |
+
" loss = loss_fn(logits, labels)\n",
|
| 1042 |
+
"\n",
|
| 1043 |
+
" total_loss += loss.item()\n",
|
| 1044 |
+
" preds = torch.argmax(logits, dim=1)\n",
|
| 1045 |
+
" all_preds.extend(preds.cpu().numpy())\n",
|
| 1046 |
+
" all_true.extend(labels.cpu().numpy())\n",
|
| 1047 |
+
"\n",
|
| 1048 |
+
" val_loss = total_loss / max(len(loader), 1)\n",
|
| 1049 |
+
" val_acc = accuracy_score(all_true, all_preds)\n",
|
| 1050 |
+
" val_f1 = f1_score(all_true, all_preds)\n",
|
| 1051 |
+
"\n",
|
| 1052 |
+
" print(f'Validation | Loss: {val_loss:.4f} | Acc: {val_acc:.4f} | F1: {val_f1:.4f}')\n",
|
| 1053 |
+
" print(classification_report(all_true, all_preds, target_names=['Human', 'AI']))\n",
|
| 1054 |
+
" return val_loss, val_acc, val_f1\n",
|
| 1055 |
+
"\n",
|
| 1056 |
+
"\n",
|
| 1057 |
+
"# Training with early stopping on validation F1\n",
|
| 1058 |
+
"patience = 2\n",
|
| 1059 |
+
"best_val_f1 = 0.0\n",
|
| 1060 |
+
"epochs_without_improve = 0\n",
|
| 1061 |
+
"\n",
|
| 1062 |
+
"for epoch in range(1, max_epochs + 1):\n",
|
| 1063 |
+
" print(f'\\nEpoch {epoch}/{max_epochs}')\n",
|
| 1064 |
+
" if device.type == 'cuda':\n",
|
| 1065 |
+
" torch.cuda.empty_cache()\n",
|
| 1066 |
+
"\n",
|
| 1067 |
+
" train_loss, train_acc, train_f1 = train_one_epoch(model, train_loader)\n",
|
| 1068 |
+
" print(f'Train | Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}')\n",
|
| 1069 |
+
"\n",
|
| 1070 |
+
" val_loss, val_acc, val_f1 = evaluate(model, val_loader)\n",
|
| 1071 |
+
"\n",
|
| 1072 |
+
" if val_f1 > best_val_f1:\n",
|
| 1073 |
+
" best_val_f1 = val_f1\n",
|
| 1074 |
+
" epochs_without_improve = 0\n",
|
| 1075 |
+
" torch.save(model.state_dict(), 'model_best.pth')\n",
|
| 1076 |
+
" print('Saved improved checkpoint: model_best.pth')\n",
|
| 1077 |
+
" else:\n",
|
| 1078 |
+
" epochs_without_improve += 1\n",
|
| 1079 |
+
" if epochs_without_improve >= patience:\n",
|
| 1080 |
+
" print('Early stopping triggered.')\n",
|
| 1081 |
+
" break\n",
|
| 1082 |
+
"\n",
|
| 1083 |
+
"print(f'Best validation F1: {best_val_f1:.4f}')"
|
| 1084 |
+
]
|
| 1085 |
+
},
|
| 1086 |
+
{
|
| 1087 |
+
"cell_type": "code",
|
| 1088 |
+
"execution_count": null,
|
| 1089 |
+
"id": "wBIT-kPaswqy",
|
| 1090 |
+
"metadata": {
|
| 1091 |
+
"id": "wBIT-kPaswqy"
|
| 1092 |
+
},
|
| 1093 |
+
"outputs": [],
|
| 1094 |
+
"source": [
|
| 1095 |
+
"# Optional: save current in-memory weights as latest checkpoint\n",
|
| 1096 |
+
"torch.save(model.state_dict(), 'model_latest.pth')\n",
|
| 1097 |
+
"print('Saved: model_latest.pth')"
|
| 1098 |
+
]
|
| 1099 |
+
},
|
| 1100 |
+
{
|
| 1101 |
+
"cell_type": "code",
|
| 1102 |
+
"execution_count": null,
|
| 1103 |
+
"id": "19b9652c",
|
| 1104 |
+
"metadata": {
|
| 1105 |
+
"colab": {
|
| 1106 |
+
"base_uri": "https://localhost:8080/"
|
| 1107 |
+
},
|
| 1108 |
+
"id": "19b9652c",
|
| 1109 |
+
"outputId": "e1b12835-b081-4d46-a909-c92cb3b6d230"
|
| 1110 |
+
},
|
| 1111 |
+
"outputs": [
|
| 1112 |
+
{
|
| 1113 |
+
"data": {
|
| 1114 |
+
"text/plain": [
|
| 1115 |
+
"('./nepali_xlmr_classifier/tokenizer_config.json',\n",
|
| 1116 |
+
" './nepali_xlmr_classifier/special_tokens_map.json',\n",
|
| 1117 |
+
" './nepali_xlmr_classifier/sentencepiece.bpe.model',\n",
|
| 1118 |
+
" './nepali_xlmr_classifier/added_tokens.json',\n",
|
| 1119 |
+
" './nepali_xlmr_classifier/tokenizer.json')"
|
| 1120 |
+
]
|
| 1121 |
+
},
|
| 1122 |
+
"execution_count": 41,
|
| 1123 |
+
"metadata": {},
|
| 1124 |
+
"output_type": "execute_result"
|
| 1125 |
+
}
|
| 1126 |
+
],
|
| 1127 |
+
"source": [
|
| 1128 |
+
"tokenizer.save_pretrained(\"./nepali_xlmr_classifier\")"
|
| 1129 |
+
]
|
| 1130 |
+
},
|
| 1131 |
+
{
|
| 1132 |
+
"cell_type": "code",
|
| 1133 |
+
"execution_count": null,
|
| 1134 |
+
"id": "eAnrw316iRw8",
|
| 1135 |
+
"metadata": {
|
| 1136 |
+
"colab": {
|
| 1137 |
+
"base_uri": "https://localhost:8080/"
|
| 1138 |
+
},
|
| 1139 |
+
"id": "eAnrw316iRw8",
|
| 1140 |
+
"outputId": "04885bb5-4f06-459b-a83c-40f5e00703fe"
|
| 1141 |
+
},
|
| 1142 |
+
"outputs": [
|
| 1143 |
+
{
|
| 1144 |
+
"name": "stdout",
|
| 1145 |
+
"output_type": "stream",
|
| 1146 |
+
"text": [
|
| 1147 |
+
"0\n"
|
| 1148 |
+
]
|
| 1149 |
+
}
|
| 1150 |
+
],
|
| 1151 |
+
"source": [
|
| 1152 |
+
"def predict(text):\n",
|
| 1153 |
+
" model.eval()\n",
|
| 1154 |
+
" inputs = tokenizer(\n",
|
| 1155 |
+
" text,\n",
|
| 1156 |
+
" return_tensors='pt',\n",
|
| 1157 |
+
" truncation=True,\n",
|
| 1158 |
+
" padding=True,\n",
|
| 1159 |
+
" max_length=MAX_LEN,\n",
|
| 1160 |
+
" )\n",
|
| 1161 |
+
" inputs = {k: v.to(device) for k, v in inputs.items()}\n",
|
| 1162 |
+
"\n",
|
| 1163 |
+
" with torch.no_grad():\n",
|
| 1164 |
+
" logits = model(inputs['input_ids'], inputs['attention_mask'])\n",
|
| 1165 |
+
" probs = torch.softmax(logits, dim=1)\n",
|
| 1166 |
+
" pred = torch.argmax(probs, dim=1).item()\n",
|
| 1167 |
+
" confidence = probs[0, pred].item()\n",
|
| 1168 |
+
"\n",
|
| 1169 |
+
" label = 'AI' if pred == 1 else 'Human'\n",
|
| 1170 |
+
" return label, confidence\n",
|
| 1171 |
+
"\n",
|
| 1172 |
+
"sample = 'अख्तियार दुरुपयोग अनुसन्धान आयोगले सिन्धुपाल्चोक–२ बाट प्रतिनिधिसभा सदस्य निर्वाचित सांसद तथा पूर्वमन्त्री बस्नेतसहित १६ जना र २ कम्पनी विरुद्ध ३ अर्ब २१ करोडभन्दा बढी बिगो कायम गरी बिहीबार विशेष अदालतमा भ्रष्टाचार मुद्दा दायर गरेको छ ।'\n",
|
| 1173 |
+
"label, conf = predict(sample)\n",
|
| 1174 |
+
"print(f'Prediction: {label} | Confidence: {conf:.4f}')"
|
| 1175 |
+
]
|
| 1176 |
+
},
|
| 1177 |
+
{
|
| 1178 |
+
"cell_type": "code",
|
| 1179 |
+
"execution_count": null,
|
| 1180 |
+
"id": "lqGrqG51NiQV",
|
| 1181 |
+
"metadata": {
|
| 1182 |
+
"colab": {
|
| 1183 |
+
"base_uri": "https://localhost:8080/"
|
| 1184 |
+
},
|
| 1185 |
+
"id": "lqGrqG51NiQV",
|
| 1186 |
+
"outputId": "6bdae59b-2684-4bd0-f804-d16ebd8272db"
|
| 1187 |
+
},
|
| 1188 |
+
"outputs": [
|
| 1189 |
+
{
|
| 1190 |
+
"name": "stdout",
|
| 1191 |
+
"output_type": "stream",
|
| 1192 |
+
"text": [
|
| 1193 |
+
"1\n",
|
| 1194 |
+
"1\n",
|
| 1195 |
+
"1\n",
|
| 1196 |
+
"1\n",
|
| 1197 |
+
"1\n",
|
| 1198 |
+
"1\n",
|
| 1199 |
+
"1\n",
|
| 1200 |
+
"1\n",
|
| 1201 |
+
"1\n",
|
| 1202 |
+
"0\n"
|
| 1203 |
+
]
|
| 1204 |
+
}
|
| 1205 |
+
],
|
| 1206 |
+
"source": [
|
| 1207 |
+
"print(predict(\"इन्टरनेटको सुरुवात सन् १९६९ मा अमेरिकी रक्षा मन्त्रालयले निर्माण गरेको ARPANET नामक प्रोजेक्टबाट भएको हो, जसको उद्देश्य आपसी संचारलाई सहज बनाउने थियो र जसले भविष्यमा इन्टरनेटको रूप लियो\"))\n",
|
| 1208 |
+
"\n",
|
| 1209 |
+
"print(predict(\"सुरुमा इन्टरनेट केही वैज्ञानिक तथा सरकारी संस्थाहरूमा सीमित रहेको भए पनि, समयक्रममा यसको पहुँच आम नागरिक, विद्यालय, र व्यवसायिक क्षेत्रमा विस्तार हुँदै गयो\"))\n",
|
| 1210 |
+
"\n",
|
| 1211 |
+
"print(predict(\"ARPANETले कम्प्युटरहरूलाई आपसमा जोड्ने सफल प्रयोग गरेपछि इन्टरनेटको सम्भावना प्रमाणित भयो, जसले गर्दा विश्वभरका अनुसन्धानकर्ताहरू यसप्रति आकर्षित हुन थाले\"))\n",
|
| 1212 |
+
"\n",
|
| 1213 |
+
"print(predict(\"सन् १९९० को दशकमा विश्वव्यापी रूपमा इन्टरनेट विस्तार हुन थालेपछि मानिसहरू सूचनाको आदान–प्रदान, इमेल, र वेबसाइटहरूको प्रयोगमार्फत डिजिटल संसारमा प्रवेश गर्न थाले।\"))\n",
|
| 1214 |
+
"\n",
|
| 1215 |
+
"print(predict(\"इन्टरनेटले शिक्षा, स्वास्थ्य, सञ्चार, मनोरञ्जन, तथा व्यापारजस्ता धेरै क्षेत्रहरूमा अभूतपूर्व परिवर्तन ल्याएको छ, जसले गर्दा मानव जीवन सरल, छरितो र प्रभावकारी बनेको छ।\"))\n",
|
| 1216 |
+
"\n",
|
| 1217 |
+
"print(predict(\"समयसँगै इन्टरनेट एक अत्यावश्यक सेवाको रूपमा विकास भएको छ, जसबिनाको आधुनिक जीवन लगभग असम्भवजस्तै लाग्ने अवस्था सिर्जना भएको छ।\"))\n",
|
| 1218 |
+
"\n",
|
| 1219 |
+
"print(predict(\"आजको युगमा इन्टरनेट केवल सूचना प्राप्तिको माध्यम मात्र नभई ज्ञानको भण्डार, रचनात्मकता प्रदर्शन गर्ने मंच, तथा रोजगार सृजनाको स्रोत पनि बनिसकेको छ।\"))\n",
|
| 1220 |
+
"\n",
|
| 1221 |
+
"print(predict(\"इन्टरनेटको प्रभाव त्यति गहिरो भएको छ कि विद्यालयका बालबालिकादेखि वृद्धसम्म यसको प्रयोगमा संलग्न छन्, जसले डिजिटल विभाजनको अवधारणा जन्माएको छ।\"))\n",
|
| 1222 |
+
"\n",
|
| 1223 |
+
"print(predict(\"इन्टरनेटले विश्वलाई एउटा सानो गाउँमा रूपान्तरण गरेको छ, जहाँ मानिसहरू हजारौं किलोमिटर टाढा भएर पनि एकअर्कासँग प्रत्यक्ष संवाद गर्न सक्छन्।\"))\n",
|
| 1224 |
+
"\n",
|
| 1225 |
+
"print(predict(\"संसदीय समितिले समन्वयकारी भूमिका निर्वाह गर्दै मनसुनजन्य विपद् जोखिम न्यूनीकरण, विपद् प्रतिकार्यका लागि तयारी गर्न तीन तहकै सरकारलाई निर्देशन दिएको छ।\"))\n"
|
| 1226 |
+
]
|
| 1227 |
+
},
|
| 1228 |
+
{
|
| 1229 |
+
"cell_type": "code",
|
| 1230 |
+
"execution_count": null,
|
| 1231 |
+
"id": "X2ePCc5Disrt",
|
| 1232 |
+
"metadata": {
|
| 1233 |
+
"colab": {
|
| 1234 |
+
"base_uri": "https://localhost:8080/",
|
| 1235 |
+
"height": 35
|
| 1236 |
+
},
|
| 1237 |
+
"id": "X2ePCc5Disrt",
|
| 1238 |
+
"outputId": "a4d27689-28cb-43c0-8333-67f2d3a6e097"
|
| 1239 |
+
},
|
| 1240 |
+
"outputs": [
|
| 1241 |
+
{
|
| 1242 |
+
"data": {
|
| 1243 |
+
"application/vnd.google.colaboratory.intrinsic+json": {
|
| 1244 |
+
"type": "string"
|
| 1245 |
+
},
|
| 1246 |
+
"text/plain": [
|
| 1247 |
+
"'/content/classifier.zip'"
|
| 1248 |
+
]
|
| 1249 |
+
},
|
| 1250 |
+
"execution_count": 42,
|
| 1251 |
+
"metadata": {},
|
| 1252 |
+
"output_type": "execute_result"
|
| 1253 |
+
}
|
| 1254 |
+
],
|
| 1255 |
+
"source": [
|
| 1256 |
+
"import shutil\n",
|
| 1257 |
+
"\n",
|
| 1258 |
+
"# Source folder to archive and the destination path of the resulting zip file\n",
|
| 1259 |
+
"folder_path = '/content/nepali_xlmr_classifier'\n",
|
| 1260 |
+
"zip_path = '/content/classifier.zip'\n",
|
| 1261 |
+
"\n",
|
| 1262 |
+
"shutil.make_archive(zip_path.replace('.zip', ''), 'zip', folder_path)\n"
|
| 1263 |
+
]
|
| 1264 |
+
},
|
| 1265 |
+
{
|
| 1266 |
+
"cell_type": "code",
|
| 1267 |
+
"execution_count": null,
|
| 1268 |
+
"id": "4BDzVg2gN7xi",
|
| 1269 |
+
"metadata": {
|
| 1270 |
+
"colab": {
|
| 1271 |
+
"base_uri": "https://localhost:8080/",
|
| 1272 |
+
"height": 17
|
| 1273 |
+
},
|
| 1274 |
+
"id": "4BDzVg2gN7xi",
|
| 1275 |
+
"outputId": "ef31798e-24f5-45ad-900f-7528b32ae39f"
|
| 1276 |
+
},
|
| 1277 |
+
"outputs": [
|
| 1278 |
+
{
|
| 1279 |
+
"data": {
|
| 1280 |
+
"application/javascript": "\n async function download(id, filename, size) {\n if (!google.colab.kernel.accessAllowed) {\n return;\n }\n const div = document.createElement('div');\n const label = document.createElement('label');\n label.textContent = `Downloading \"${filename}\": `;\n div.appendChild(label);\n const progress = document.createElement('progress');\n progress.max = size;\n div.appendChild(progress);\n document.body.appendChild(div);\n\n const buffers = [];\n let downloaded = 0;\n\n const channel = await google.colab.kernel.comms.open(id);\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n\n for await (const message of channel.messages) {\n // Send a message to notify the kernel that we're ready.\n channel.send({})\n if (message.buffers) {\n for (const buffer of message.buffers) {\n buffers.push(buffer);\n downloaded += buffer.byteLength;\n progress.value = downloaded;\n }\n }\n }\n const blob = new Blob(buffers, {type: 'application/binary'});\n const a = document.createElement('a');\n a.href = window.URL.createObjectURL(blob);\n a.download = filename;\n div.appendChild(a);\n a.click();\n div.remove();\n }\n ",
|
| 1281 |
+
"text/plain": [
|
| 1282 |
+
"<IPython.core.display.Javascript object>"
|
| 1283 |
+
]
|
| 1284 |
+
},
|
| 1285 |
+
"metadata": {},
|
| 1286 |
+
"output_type": "display_data"
|
| 1287 |
+
},
|
| 1288 |
+
{
|
| 1289 |
+
"data": {
|
| 1290 |
+
"application/javascript": "download(\"download_33034c8f-76d5-48d0-b7cd-3d066ac8e32f\", \"classifier.zip\", 6596694)",
|
| 1291 |
+
"text/plain": [
|
| 1292 |
+
"<IPython.core.display.Javascript object>"
|
| 1293 |
+
]
|
| 1294 |
+
},
|
| 1295 |
+
"metadata": {},
|
| 1296 |
+
"output_type": "display_data"
|
| 1297 |
+
}
|
| 1298 |
+
],
|
| 1299 |
+
"source": [
|
| 1300 |
+
"from google.colab import files\n",
|
| 1301 |
+
"\n",
|
| 1302 |
+
"files.download(zip_path)\n"
|
| 1303 |
+
]
|
| 1304 |
+
},
|
| 1305 |
+
{
|
| 1306 |
+
"cell_type": "code",
|
| 1307 |
+
"execution_count": null,
|
| 1308 |
+
"id": "2jJkcOlw_R1k",
|
| 1309 |
+
"metadata": {
|
| 1310 |
+
"id": "2jJkcOlw_R1k"
|
| 1311 |
+
},
|
| 1312 |
+
"outputs": [],
|
| 1313 |
+
"source": [
|
| 1314 |
+
"torch.save(model.state_dict(), \"final_model.pth\") # AFTER training with classification head\n"
|
| 1315 |
+
]
|
| 1316 |
+
},
|
| 1317 |
+
{
|
| 1318 |
+
"cell_type": "code",
|
| 1319 |
+
"execution_count": null,
|
| 1320 |
+
"id": "xnHr1IDABebZ",
|
| 1321 |
+
"metadata": {
|
| 1322 |
+
"colab": {
|
| 1323 |
+
"base_uri": "https://localhost:8080/"
|
| 1324 |
+
},
|
| 1325 |
+
"id": "xnHr1IDABebZ",
|
| 1326 |
+
"outputId": "95761a2d-56fa-418c-de03-d66d1ae662ee"
|
| 1327 |
+
},
|
| 1328 |
+
"outputs": [
|
| 1329 |
+
{
|
| 1330 |
+
"name": "stdout",
|
| 1331 |
+
"output_type": "stream",
|
| 1332 |
+
"text": [
|
| 1333 |
+
"The text is predicted to be: Human\n",
|
| 1334 |
+
"1\n",
|
| 1335 |
+
"0\n",
|
| 1336 |
+
"1\n"
|
| 1337 |
+
]
|
| 1338 |
+
}
|
| 1339 |
+
],
|
| 1340 |
+
"source": [
|
| 1341 |
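+
"# prompt: How do I load the model and classifier and use them if there is no other code above this cell? (Note: this cell still needs torch, AutoTokenizer and IndicBERTClassifier to be imported/defined first.)\n",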
|
| 1342 |
+
"\n",
|
| 1343 |
+
"# Define the device\n",
|
| 1344 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
| 1345 |
+
"\n",
|
| 1346 |
+
"# Instantiate the model\n",
|
| 1347 |
+
"model = IndicBERTClassifier().to(device)\n",
|
| 1348 |
+
"\n",
|
| 1349 |
+
"# Load the saved state dictionary\n",
|
| 1350 |
+
"# Make sure the path to your saved model file is correct\n",
|
| 1351 |
+
"model_path = \"final_model.pth\" # Or \"model_95_acc.pth\" if you saved that one last\n",
|
| 1352 |
+
"model.load_state_dict(torch.load(model_path, map_location=device))\n",
|
| 1353 |
+
"\n",
|
| 1354 |
+
"# Set the model to evaluation mode\n",
|
| 1355 |
+
"model.eval()\n",
|
| 1356 |
+
"\n",
|
| 1357 |
+
"# Load the tokenizer\n",
|
| 1358 |
+
"tokenizer_path = \"./nepali_xlmr_classifier\" # Make sure this path is correct\n",
|
| 1359 |
+
"tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)\n",
|
| 1360 |
+
"\n",
|
| 1361 |
+
"# Now the model and tokenizer are loaded and ready to be used for predictions.\n",
|
| 1362 |
+
"# You can use the existing `predict` function or write a new one.\n",
|
| 1363 |
+
"\n",
|
| 1364 |
+
"# Example of using the predict function with the loaded model and tokenizer\n",
|
| 1365 |
+
"def predict(text):\n",
|
| 1366 |
+
" model.eval() # Ensure model is in evaluation mode\n",
|
| 1367 |
+
" inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)\n",
|
| 1368 |
+
" inputs = {k: v.to(device) for k, v in inputs.items()}\n",
|
| 1369 |
+
" with torch.no_grad():\n",
|
| 1370 |
+
" outputs = model(**inputs)\n",
|
| 1371 |
+
"\n",
|
| 1372 |
+
" # Handle if output is tensor (some versions/models return logits directly)\n",
|
| 1373 |
+
" logits = outputs if isinstance(outputs, torch.Tensor) else outputs.logits\n",
|
| 1374 |
+
"\n",
|
| 1375 |
+
" pred = torch.argmax(logits, dim=1).item()\n",
|
| 1376 |
+
" return pred\n",
|
| 1377 |
+
"\n",
|
| 1378 |
+
"# Example usage with some text\n",
|
| 1379 |
+
"text_to_predict = \"This is a test sentence.\" # Replace with your Nepali text\n",
|
| 1380 |
+
"predicted_class = predict(text_to_predict)\n",
|
| 1381 |
+
"\n",
|
| 1382 |
+
"# Interpret the prediction (assuming 0 for Human, 1 for AI based on your previous code)\n",
|
| 1383 |
+
"class_label = \"Human\" if predicted_class == 0 else \"AI\"\n",
|
| 1384 |
+
"print(f\"The text is predicted to be: {class_label}\")\n",
|
| 1385 |
+
"\n",
|
| 1386 |
+
"# You can test with more examples as you did before\n",
|
| 1387 |
+
"print(predict(\"यी सबै वाक्यहरू इन्टरनेटको विकास, प्रभाव, र चुनौतीहरूको गहिरो सन्दर्भ समेटेर तयार पारिएका छन्। यदि तिमीलाई चाहिएको खण्डमा विशेष विषय (जस्तै शिक्षा, साइबर सुरक्षा, ग्रामीण प्रभाव आदि) चाहिएको हो भने, म त्यही विषयमा केन्द्रित लामो वाक्यहरू पनि दिन सक्छु।\"))\n",
|
| 1388 |
+
"print(predict(\"अख्तियार दुरुपयोग अनुसन्धान आयोगले सिन्धुपाल्चोक–२ बाट प्रतिनिधिसभा सदस्य निर्वाचित सांसद तथा पूर्वमन्त्री बस्नेतसहित १६ जना र २ कम्पनी विरुद्ध ३ अर्ब २१ करोडभन्दा बढी बिगो कायम गरी बिहीबार विशेष अदालतमा भ्रष्टाचार मुद्दा दायर गरेको छ । योसँगै बस्नेत सांसद पदबाट स्वतः निलम्बनमा परेका छन् ।\"))\n",
|
| 1389 |
+
"print(predict(\"इन्टरनेटको सुरुवात सन् १९६९ मा अमेरिकी रक्षा मन्त्रालयले निर्माण गरेको ARPANET नामक प्रोजेक्टबाट भएको हो, जसको उद्देश्य आपसी संचारलाई सहज बनाउने थियो र जसले भविष्यमा इन्टरनेटको रूप लियो\"))\n"
|
| 1390 |
+
]
|
| 1391 |
+
},
|
| 1392 |
+
{
|
| 1393 |
+
"cell_type": "code",
|
| 1394 |
+
"execution_count": null,
|
| 1395 |
+
"id": "gG8fnbqyDUpm",
|
| 1396 |
+
"metadata": {
|
| 1397 |
+
"id": "gG8fnbqyDUpm"
|
| 1398 |
+
},
|
| 1399 |
+
"outputs": [],
|
| 1400 |
+
"source": []
|
| 1401 |
+
}
|
| 1402 |
+
],
|
| 1403 |
+
"metadata": {
|
| 1404 |
+
"accelerator": "TPU",
|
| 1405 |
+
"colab": {
|
| 1406 |
+
"gpuType": "V28",
|
| 1407 |
+
"provenance": []
|
| 1408 |
+
},
|
| 1409 |
+
"kernelspec": {
|
| 1410 |
+
"display_name": "ml",
|
| 1411 |
+
"language": "python",
|
| 1412 |
+
"name": "python3"
|
| 1413 |
+
},
|
| 1414 |
+
"language_info": {
|
| 1415 |
+
"codemirror_mode": {
|
| 1416 |
+
"name": "ipython",
|
| 1417 |
+
"version": 3
|
| 1418 |
+
},
|
| 1419 |
+
"file_extension": ".py",
|
| 1420 |
+
"mimetype": "text/x-python",
|
| 1421 |
+
"name": "python",
|
| 1422 |
+
"nbconvert_exporter": "python",
|
| 1423 |
+
"pygments_lexer": "ipython3",
|
| 1424 |
+
"version": "3.11.14"
|
| 1425 |
+
}
|
| 1426 |
+
},
|
| 1427 |
+
"nbformat": 4,
|
| 1428 |
+
"nbformat_minor": 5
|
| 1429 |
+
}
|
notebook/ai_vs_human_nepali/notebook/final_main.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebook/ai_vs_human_nepali/notebook/main.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebook/ai_vs_human_nepali/notebook/working model.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebook/ai_vs_human_nepali/topic_scrapper.ipynb
ADDED
|
@@ -0,0 +1,542 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 15,
|
| 6 |
+
"id": "4b53d4bc",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"# # Groq Nepali Rewriter\n",
|
| 11 |
+
"\n",
|
| 12 |
+
"# This notebook loads the dataset, builds a Nepali rewrite prompt, tests one sample, and then saves a batch output CSV using the Groq API.\n",
|
| 13 |
+
"\n",
|
| 14 |
+
"# Requirements:\n",
|
| 15 |
+
"# - `GROQ_API_KEY2` must be available in `.env` or the environment\n",
|
| 16 |
+
"# - the input file must contain a `paragraph` column"
|
| 17 |
+
]
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"cell_type": "code",
|
| 21 |
+
"execution_count": 16,
|
| 22 |
+
"id": "6c8dc1cb",
|
| 23 |
+
"metadata": {},
|
| 24 |
+
"outputs": [
|
| 25 |
+
{
|
| 26 |
+
"data": {
|
| 27 |
+
"text/plain": [
|
| 28 |
+
"True"
|
| 29 |
+
]
|
| 30 |
+
},
|
| 31 |
+
"execution_count": 16,
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"output_type": "execute_result"
|
| 34 |
+
}
|
| 35 |
+
],
|
| 36 |
+
"source": [
|
| 37 |
+
"import os\n",
|
| 38 |
+
"import re\n",
|
| 39 |
+
"import time\n",
|
| 40 |
+
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
|
| 41 |
+
"\n",
|
| 42 |
+
"import pandas as pd\n",
|
| 43 |
+
"from dotenv import load_dotenv\n",
|
| 44 |
+
"from groq import Groq\n",
|
| 45 |
+
"\n",
|
| 46 |
+
"load_dotenv()"
|
| 47 |
+
]
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"cell_type": "code",
|
| 51 |
+
"execution_count": 17,
|
| 52 |
+
"id": "019adfa8",
|
| 53 |
+
"metadata": {},
|
| 54 |
+
"outputs": [],
|
| 55 |
+
"source": [
|
| 56 |
+
"api_key = os.getenv(\"GROQ_API_KEY2\")\n",
|
| 57 |
+
"if not api_key:\n",
|
| 58 |
+
"    raise ValueError(\"GROQ_API_KEY2 not found in .env or environment.\")\n",
|
| 59 |
+
"\n",
|
| 60 |
+
"client = Groq(api_key=api_key)\n",
|
| 61 |
+
"MODEL_NAME = \"llama-3.3-70b-versatile\""
|
| 62 |
+
]
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"cell_type": "code",
|
| 66 |
+
"execution_count": 18,
|
| 67 |
+
"id": "4b4d2bbe",
|
| 68 |
+
"metadata": {},
|
| 69 |
+
"outputs": [],
|
| 70 |
+
"source": [
|
| 71 |
+
"data =pd.read_csv(\"DATASET/topics_1000.csv\")"
|
| 72 |
+
]
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
"cell_type": "code",
|
| 76 |
+
"execution_count": 19,
|
| 77 |
+
"id": "c36cfbbf",
|
| 78 |
+
"metadata": {},
|
| 79 |
+
"outputs": [
|
| 80 |
+
{
|
| 81 |
+
"data": {
|
| 82 |
+
"text/html": [
|
| 83 |
+
"<div>\n",
|
| 84 |
+
"<style scoped>\n",
|
| 85 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 86 |
+
" vertical-align: middle;\n",
|
| 87 |
+
" }\n",
|
| 88 |
+
"\n",
|
| 89 |
+
" .dataframe tbody tr th {\n",
|
| 90 |
+
" vertical-align: top;\n",
|
| 91 |
+
" }\n",
|
| 92 |
+
"\n",
|
| 93 |
+
" .dataframe thead th {\n",
|
| 94 |
+
" text-align: right;\n",
|
| 95 |
+
" }\n",
|
| 96 |
+
"</style>\n",
|
| 97 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 98 |
+
" <thead>\n",
|
| 99 |
+
" <tr style=\"text-align: right;\">\n",
|
| 100 |
+
" <th></th>\n",
|
| 101 |
+
" <th>id</th>\n",
|
| 102 |
+
" <th>topic</th>\n",
|
| 103 |
+
" </tr>\n",
|
| 104 |
+
" </thead>\n",
|
| 105 |
+
" <tbody>\n",
|
| 106 |
+
" <tr>\n",
|
| 107 |
+
" <th>0</th>\n",
|
| 108 |
+
" <td>1</td>\n",
|
| 109 |
+
" <td>नेपालमा कृत्रिम बुद्धिमत्ता विकासको वर्तमान अव...</td>\n",
|
| 110 |
+
" </tr>\n",
|
| 111 |
+
" <tr>\n",
|
| 112 |
+
" <th>1</th>\n",
|
| 113 |
+
" <td>2</td>\n",
|
| 114 |
+
" <td>नेपालको शिक्षा प्रणालीमा डिजिटल प्रविधिको प्रभाव</td>\n",
|
| 115 |
+
" </tr>\n",
|
| 116 |
+
" <tr>\n",
|
| 117 |
+
" <th>2</th>\n",
|
| 118 |
+
" <td>3</td>\n",
|
| 119 |
+
" <td>काठमाडौँ उपत्यकाको वायु प्रदूषण समस्या</td>\n",
|
| 120 |
+
" </tr>\n",
|
| 121 |
+
" <tr>\n",
|
| 122 |
+
" <th>3</th>\n",
|
| 123 |
+
" <td>4</td>\n",
|
| 124 |
+
" <td>नेपालमा जलवायु परिवर्तनका असरहरू</td>\n",
|
| 125 |
+
" </tr>\n",
|
| 126 |
+
" <tr>\n",
|
| 127 |
+
" <th>4</th>\n",
|
| 128 |
+
" <td>5</td>\n",
|
| 129 |
+
" <td>ग्रामीण क्षेत्रमा इन्टरनेट पहुँचको विस्तार</td>\n",
|
| 130 |
+
" </tr>\n",
|
| 131 |
+
" </tbody>\n",
|
| 132 |
+
"</table>\n",
|
| 133 |
+
"</div>"
|
| 134 |
+
],
|
| 135 |
+
"text/plain": [
|
| 136 |
+
" id topic\n",
|
| 137 |
+
"0 1 नेपालमा कृत्रिम बुद्धिमत्ता विकासको वर्तमान अव...\n",
|
| 138 |
+
"1 2 नेपालको शिक्षा प्रणालीमा डिजिटल प्रविधिको प्रभाव\n",
|
| 139 |
+
"2 3 काठमाडौँ उपत्यकाको वायु प्रदूषण समस्या\n",
|
| 140 |
+
"3 4 नेपालमा जलवायु परिवर्तनका असरहरू\n",
|
| 141 |
+
"4 5 ग्रामीण क्षेत्रमा इन्टरनेट पहुँचको विस्तार"
|
| 142 |
+
]
|
| 143 |
+
},
|
| 144 |
+
"execution_count": 19,
|
| 145 |
+
"metadata": {},
|
| 146 |
+
"output_type": "execute_result"
|
| 147 |
+
}
|
| 148 |
+
],
|
| 149 |
+
"source": [
|
| 150 |
+
"data.head()"
|
| 151 |
+
]
|
| 152 |
+
},
|
| 153 |
+
{
|
| 154 |
+
"cell_type": "code",
|
| 155 |
+
"execution_count": 20,
|
| 156 |
+
"id": "b6e226b8",
|
| 157 |
+
"metadata": {},
|
| 158 |
+
"outputs": [],
|
| 159 |
+
"source": [
|
| 160 |
+
"import numpy as np\n",
|
| 161 |
+
"def build_prompt(paragraph):\n",
|
| 162 |
+
" style = [\n",
|
| 163 |
+
" \"Use simple and clear language.\",\n",
|
| 164 |
+
" \"Make it engaging and interesting to read.\",\n",
|
| 165 |
+
" \"Use a conversational tone.\",\n",
|
| 166 |
+
" \"Keep the original meaning intact.\",\n",
|
| 167 |
+
" \"Avoid complex jargon and technical terms.\",\n",
|
| 168 |
+
" \"Use short sentences and paragraphs.\",\n",
|
| 169 |
+
" \"Add examples or anecdotes to illustrate points.\",\n",
|
| 170 |
+
" \"Use active voice instead of passive voice.\",\n",
|
| 171 |
+
" \"Include a call to action or a thought-provoking question at the end.\",\n",
|
| 172 |
+
" ]\n",
|
| 173 |
+
" selected_style_random_single = np.random.choice(style, size=len(style), replace=False) # Shuffle all style guidelines into a random order (every guideline is kept)\n",
|
| 174 |
+
" prompt = f\"\"\"\n",
|
| 175 |
+
" give me an essay for the following topics puree nepali ok no enlgish language:\n",
|
| 176 |
+
" {paragraph}\n",
|
| 177 |
+
" Rewrite the above paragraph in Nepali, following these style guidelines:\n",
|
| 178 |
+
" {', '.join(selected_style_random_single)}\n",
|
| 179 |
+
" \"\"\"\n",
|
| 180 |
+
" return prompt.strip()"
|
| 181 |
+
]
|
| 182 |
+
},
|
| 183 |
+
{
|
| 184 |
+
"cell_type": "code",
|
| 185 |
+
"execution_count": 21,
|
| 186 |
+
"id": "cf16922b",
|
| 187 |
+
"metadata": {},
|
| 188 |
+
"outputs": [
|
| 189 |
+
{
|
| 190 |
+
"name": "stdout",
|
| 191 |
+
"output_type": "stream",
|
| 192 |
+
"text": [
|
| 193 |
+
"नेपालमा कृत्रिम बुद्धिमत्ता विकासको वर्तमान अवस्था\n",
|
| 194 |
+
"\n",
|
| 195 |
+
"कृत्रिम बुद्धिमत्ता विकास नेपालको लागि एक नयाँ युग हो । यो प्राविधिक क्षेत्र दिन-प्रतिदिन विकसित हुने क्रममा छ । नेपालमा कृत्रिम बुद्धिमत्ताले विभिन्न क्षेत्रमा परिवर्तन ल्याउने क्षमता राख्दछ । जस्तै: स्वास्थ्य सेवामा, शिक्षामा, वित्तीय सेवामा, तथा उत्पादन क्षेत्रमा ।\n",
|
| 196 |
+
"\n",
|
| 197 |
+
"नेपालमा कृत्रिम बुद्धिमत्ताको विकासले नयाँ अवस्था प्राप्त गरिरहेको छ । यो देशमा विभिन्न प्राविधिक कम्पनीहरुले कृत्रिम बुद्धिमत्ताको विकासमा लगनशील छन् । तसर्थ, यसले नेपालमा रोजगारीको अवसर पनि बढाउने छ । उदाहरणको लागि, कृत्रिम बुद्धिमत्ताले स्वास्थ्य सेवामा रोग निदान गर्ने, रोगको उपचार सुझाउने, तथा व्यक्तिको स्वास्थ्य जाँच गर्ने काम गर्नसक्ने छ ।\n",
|
| 198 |
+
"\n",
|
| 199 |
+
"कृत्रिम बुद्धिमत्ताको विकासले नेपालको अर्थतन्त्रमा पनि परिवर्तन ल्याउने छ । यसले व्यवसायिक क्षेत्रमा उत्पादनशीलता बढाउने, उत्पादन मुल्य कम गर्ने, तथा गुणस्तर मापन गर्ने काम गर्नसक्ने छ । उदाहरणको लागि, कृत्रिम बुद्धिमत्ताले वित्तीय सेवामा लेनदेनको निरीक्षण गर्ने, धोकाधोकाको मुल्यांकन गर्ने, तथा वित्तीय संस्थाहरुलाई सुझाव दिने काम गर्नसक्ने छ ।\n",
|
| 200 |
+
"\n",
|
| 201 |
+
"नेपालमा कृत्रिम बुद्धिमत्ता विकासको वर्तमान अवस्थाले देशलाई एक नयाँ दिशामा लम्बने क्षमता राख्दछ । तर, यसको विकासमा चुनौतिहरु पनि छन् । जस्तै: डाटा सुरक्षा, निजताको हनन, तथा श्रमिकहरुको प्रतिस्पर्धी क्षमता । तसर्थ, नेपालमा कृत्रिम बुद्धिमत्ताको विकासलाई प्रोत्साहित गर्नको लागि, हामीले यसको विकासमा लगनशील कम्पनीहरुलाई साथ दिनु पर्छ । हामीले पनि कृत्रिम बुद्धिमत्ताको विकासमा योगदान पुर्याउनुपर्छ ।\n",
|
| 202 |
+
"\n",
|
| 203 |
+
"आह, नेपालमा कृत्रिम बुद्धिमत्ता विकासको वर्तमान अवस्थाले देशलाई एक नयाँ दिशामा लम्बने क्षमता राख्दछ । तर, यसको विकासमा हामी के गरिरहेका छौ? हामीले कृत्रिम बुद्धिमत्ताको विकासमा योगदान पुर्याउने छौ कि? हामीले यसको विकासमा चुनौतिहरुलाई मात गर्ने छौ कि? यस प्रश्नको उत्तर हामीसँग छ । आउनうभ, हामी नेपालमा कृत्रिम बुद्धिमत्ताको विकासलाई प्रोत्साहित गरौं । आउनूभ, हामी देशलाई एक नयाँ दिशामा लम्बौं ।\n"
|
| 204 |
+
]
|
| 205 |
+
}
|
| 206 |
+
],
|
| 207 |
+
"source": [
|
| 208 |
+
"build_prompt = build_prompt\n",
|
| 209 |
+
"\n",
|
| 210 |
+
"sample_title = str(data.iloc[0][\"topic\"])\n",
|
| 211 |
+
"\n",
|
| 212 |
+
"sample_response = client.chat.completions.create(\n",
|
| 213 |
+
" model=MODEL_NAME,\n",
|
| 214 |
+
" messages=[{\"role\": \"user\", \"content\": build_prompt(sample_title)}],\n",
|
| 215 |
+
")\n",
|
| 216 |
+
"\n",
|
| 217 |
+
"generated_text = sample_response.choices[0].message.content.strip()\n",
|
| 218 |
+
"print(generated_text)"
|
| 219 |
+
]
|
| 220 |
+
},
|
| 221 |
+
{
|
| 222 |
+
"cell_type": "code",
|
| 223 |
+
"execution_count": null,
|
| 224 |
+
"id": "c709f126",
|
| 225 |
+
"metadata": {},
|
| 226 |
+
"outputs": [],
|
| 227 |
+
"source": [
|
| 228 |
+
"def grok_step3_5_scraper(\n",
|
| 229 |
+
" input_file,\n",
|
| 230 |
+
" output_file=\"step3_5_grok_nepali.csv\",\n",
|
| 231 |
+
" limit=100,\n",
|
| 232 |
+
" model=MODEL_NAME,\n",
|
| 233 |
+
" requests_per_second=2,\n",
|
| 234 |
+
" max_workers=2,\n",
|
| 235 |
+
" max_retries=3,\n",
|
| 236 |
+
"):\n",
|
| 237 |
+
" working_df = pd.read_csv(input_file)\n",
|
| 238 |
+
" if limit is not None:\n",
|
| 239 |
+
" working_df = working_df.head(limit)\n",
|
| 240 |
+
"\n",
|
| 241 |
+
" cols = set(working_df.columns)\n",
|
| 242 |
+
" if \"Title\" in cols or \"शीर्षक\" in cols:\n",
|
| 243 |
+
" title_col = \"Title\" if \"Title\" in cols else \"शीर्षक\"\n",
|
| 244 |
+
" prompt_col = title_col\n",
|
| 245 |
+
" if \"Paragraph\" in cols:\n",
|
| 246 |
+
" human_col = \"Paragraph\"\n",
|
| 247 |
+
" elif \"विवरण\" in cols:\n",
|
| 248 |
+
" human_col = \"विवरण\"\n",
|
| 249 |
+
" elif \"paragraph\" in cols:\n",
|
| 250 |
+
" human_col = \"paragraph\"\n",
|
| 251 |
+
" else:\n",
|
| 252 |
+
" human_col = prompt_col\n",
|
| 253 |
+
" elif \"paragraph\" in cols or \"Paragraph\" in cols or \"विवरण\" in cols:\n",
|
| 254 |
+
" prompt_col = (\n",
|
| 255 |
+
" \"paragraph\" if \"paragraph\" in cols\n",
|
| 256 |
+
" else (\"Paragraph\" if \"Paragraph\" in cols else \"विवरण\")\n",
|
| 257 |
+
" )\n",
|
| 258 |
+
" human_col = prompt_col\n",
|
| 259 |
+
" title_col = prompt_col\n",
|
| 260 |
+
" else:\n",
|
| 261 |
+
" raise ValueError(\n",
|
| 262 |
+
" \"No supported text columns found. Expected one of: Title/शीर्षक with Paragraph/विवरण, or paragraph.\"\n",
|
| 263 |
+
" )\n",
|
| 264 |
+
"\n",
|
| 265 |
+
" working_df = working_df.dropna(subset=[human_col]).copy()\n",
|
| 266 |
+
"\n",
|
| 267 |
+
" total_input_rows = len(working_df)\n",
|
| 268 |
+
" already_done = 0\n",
|
| 269 |
+
"\n",
|
| 270 |
+
" if os.path.exists(output_file):\n",
|
| 271 |
+
" try:\n",
|
| 272 |
+
" existing_df = pd.read_csv(output_file)\n",
|
| 273 |
+
" already_done = len(existing_df)\n",
|
| 274 |
+
" except pd.errors.EmptyDataError:\n",
|
| 275 |
+
" already_done = 0\n",
|
| 276 |
+
"\n",
|
| 277 |
+
" if already_done >= total_input_rows:\n",
|
| 278 |
+
" print(\n",
|
| 279 |
+
" f\"Nothing to do. {already_done} rows already exist in {output_file} (input rows: {total_input_rows}).\"\n",
|
| 280 |
+
" )\n",
|
| 281 |
+
" return\n",
|
| 282 |
+
"\n",
|
| 283 |
+
" if already_done > 0:\n",
|
| 284 |
+
" working_df = working_df.iloc[already_done:].copy()\n",
|
| 285 |
+
" print(\n",
|
| 286 |
+
" f\"Resuming from row {already_done}. Processing remaining {len(working_df)} rows out of {total_input_rows}.\"\n",
|
| 287 |
+
" )\n",
|
| 288 |
+
" else:\n",
|
| 289 |
+
" print(f\"Loaded {total_input_rows} rows from {input_file}\")\n",
|
| 290 |
+
" print(\n",
|
| 291 |
+
" f\"Using title column: {title_col} | prompt column: {prompt_col} | human column: {human_col}\"\n",
|
| 292 |
+
" )\n",
|
| 293 |
+
"\n",
|
| 294 |
+
" results = []\n",
|
| 295 |
+
"\n",
|
| 296 |
+
" bad_markers = [\n",
|
| 297 |
+
" \"error\",\n",
|
| 298 |
+
" \"invalid\",\n",
|
| 299 |
+
" \"not found\",\n",
|
| 300 |
+
" \"decommissioned\",\n",
|
| 301 |
+
" \"rate limit\",\n",
|
| 302 |
+
" \"api key\",\n",
|
| 303 |
+
" ]\n",
|
| 304 |
+
"\n",
|
| 305 |
+
" def is_valid_ai_text(text: str) -> bool:\n",
|
| 306 |
+
" if not text:\n",
|
| 307 |
+
" return False\n",
|
| 308 |
+
" clean_text = text.strip()\n",
|
| 309 |
+
" if len(clean_text) < 20:\n",
|
| 310 |
+
" return False\n",
|
| 311 |
+
" lower_text = clean_text.lower()\n",
|
| 312 |
+
" return not any(marker in lower_text for marker in bad_markers)\n",
|
| 313 |
+
"\n",
|
| 314 |
+
" def extract_retry_wait_seconds(error_text: str) -> float:\n",
|
| 315 |
+
" match = re.search(r\"try again in\\s*(\\d+)ms\", error_text, re.IGNORECASE)\n",
|
| 316 |
+
" if match:\n",
|
| 317 |
+
" return int(match.group(1)) / 1000.0 + 0.2\n",
|
| 318 |
+
" return 1.5\n",
|
| 319 |
+
"\n",
|
| 320 |
+
" def process_one(idx, title_text, prompt_text, human_text):\n",
|
| 321 |
+
" local_client = Groq(api_key=api_key)\n",
|
| 322 |
+
"\n",
|
| 323 |
+
" for attempt in range(max_retries + 1):\n",
|
| 324 |
+
" try:\n",
|
| 325 |
+
" completion = local_client.chat.completions.create(\n",
|
| 326 |
+
" model=model,\n",
|
| 327 |
+
" messages=[{\"role\": \"user\", \"content\": build_prompt(str(prompt_text))}],\n",
|
| 328 |
+
" temperature=0.2,\n",
|
| 329 |
+
" max_tokens=500,\n",
|
| 330 |
+
" )\n",
|
| 331 |
+
" ai_text = completion.choices[0].message.content.strip()\n",
|
| 332 |
+
"\n",
|
| 333 |
+
" if not is_valid_ai_text(ai_text):\n",
|
| 334 |
+
" if attempt < max_retries:\n",
|
| 335 |
+
" continue\n",
|
| 336 |
+
" return {\n",
|
| 337 |
+
" \"idx\": idx,\n",
|
| 338 |
+
" \"ok\": False,\n",
|
| 339 |
+
" \"reason\": \"invalid_or_error_text\",\n",
|
| 340 |
+
" \"ai_text\": ai_text,\n",
|
| 341 |
+
" }\n",
|
| 342 |
+
"\n",
|
| 343 |
+
" return {\n",
|
| 344 |
+
" \"idx\": idx,\n",
|
| 345 |
+
" \"ok\": True,\n",
|
| 346 |
+
" \"title\": str(title_text),\n",
|
| 347 |
+
" \"human_text\": str(human_text),\n",
|
| 348 |
+
" \"ai_generated_text\": ai_text,\n",
|
| 349 |
+
" }\n",
|
| 350 |
+
" except Exception as error:\n",
|
| 351 |
+
" error_text = str(error)\n",
|
| 352 |
+
" is_rate_limited = (\n",
|
| 353 |
+
" \"rate_limit_exceeded\" in error_text.lower()\n",
|
| 354 |
+
" or \"rate limit reached\" in error_text.lower()\n",
|
| 355 |
+
" or \"429\" in error_text\n",
|
| 356 |
+
" )\n",
|
| 357 |
+
"\n",
|
| 358 |
+
" if is_rate_limited and attempt < max_retries:\n",
|
| 359 |
+
" wait_seconds = extract_retry_wait_seconds(error_text)\n",
|
| 360 |
+
" print(\n",
|
| 361 |
+
" f\"Row {idx} rate-limited, retry {attempt + 1}/{max_retries} after {wait_seconds:.2f}s\"\n",
|
| 362 |
+
" )\n",
|
| 363 |
+
" time.sleep(wait_seconds)\n",
|
| 364 |
+
" continue\n",
|
| 365 |
+
"\n",
|
| 366 |
+
" return {\n",
|
| 367 |
+
" \"idx\": idx,\n",
|
| 368 |
+
" \"ok\": False,\n",
|
| 369 |
+
" \"reason\": error_text,\n",
|
| 370 |
+
" \"ai_text\": \"\",\n",
|
| 371 |
+
" }\n",
|
| 372 |
+
"\n",
|
| 373 |
+
" rows = list(working_df[[title_col, prompt_col, human_col]].itertuples(index=True, name=None))\n",
|
| 374 |
+
" total = len(rows)\n",
|
| 375 |
+
"\n",
|
| 376 |
+
" for start in range(0, total, requests_per_second):\n",
|
| 377 |
+
" window = rows[start : start + requests_per_second]\n",
|
| 378 |
+
" tick_start = time.time()\n",
|
| 379 |
+
"\n",
|
| 380 |
+
" with ThreadPoolExecutor(max_workers=max_workers) as executor:\n",
|
| 381 |
+
" futures = {\n",
|
| 382 |
+
" executor.submit(process_one, idx, title_text, prompt_text, human_text): idx\n",
|
| 383 |
+
" for idx, title_text, prompt_text, human_text in window\n",
|
| 384 |
+
" }\n",
|
| 385 |
+
"\n",
|
| 386 |
+
" for future in as_completed(futures):\n",
|
| 387 |
+
" out = future.result()\n",
|
| 388 |
+
" if out[\"ok\"]:\n",
|
| 389 |
+
" # Save as id + ai_gen only\n",
|
| 390 |
+
" results.append({\n",
|
| 391 |
+
" \"id\": out[\"idx\"],\n",
|
| 392 |
+
" \"ai_gen\": out[\"ai_generated_text\"]\n",
|
| 393 |
+
" })\n",
|
| 394 |
+
" print(\n",
|
| 395 |
+
" f\"Row {out['idx']}: generated {len(out['ai_generated_text'].split())} words\"\n",
|
| 396 |
+
" )\n",
|
| 397 |
+
" else:\n",
|
| 398 |
+
" print(f\"Row {out['idx']} skipped: {out['reason']}\")\n",
|
| 399 |
+
"\n",
|
| 400 |
+
" if len(results) >= 10:\n",
|
| 401 |
+
" pd.DataFrame(results)[[\"id\", \"ai_gen\"]].to_csv(\n",
|
| 402 |
+
" output_file,\n",
|
| 403 |
+
" index=False,\n",
|
| 404 |
+
" mode=\"a\",\n",
|
| 405 |
+
" header=not os.path.exists(output_file),\n",
|
| 406 |
+
" )\n",
|
| 407 |
+
" print(f\"Saved {len(results)} valid rows to {output_file}\")\n",
|
| 408 |
+
" results = []\n",
|
| 409 |
+
"\n",
|
| 410 |
+
" elapsed = time.time() - tick_start\n",
|
| 411 |
+
" if elapsed < 1:\n",
|
| 412 |
+
" time.sleep(1 - elapsed)\n",
|
| 413 |
+
"\n",
|
| 414 |
+
" if results:\n",
|
| 415 |
+
" pd.DataFrame(results)[[\"id\", \"ai_gen\"]].to_csv(\n",
|
| 416 |
+
" output_file,\n",
|
| 417 |
+
" index=False,\n",
|
| 418 |
+
" mode=\"a\",\n",
|
| 419 |
+
" header=not os.path.exists(output_file),\n",
|
| 420 |
+
" )\n",
|
| 421 |
+
"\n",
|
| 422 |
+
" print(f\"Finished. Output saved to {output_file}\")"
|
| 423 |
+
]
|
| 424 |
+
},
|
| 425 |
+
{
|
| 426 |
+
"cell_type": "code",
|
| 427 |
+
"execution_count": 23,
|
| 428 |
+
"id": "29c3627c",
|
| 429 |
+
"metadata": {},
|
| 430 |
+
"outputs": [
|
| 431 |
+
{
|
| 432 |
+
"ename": "ParserError",
|
| 433 |
+
"evalue": "Error tokenizing data. C error: Expected 8 fields in line 33, saw 16\n",
|
| 434 |
+
"output_type": "error",
|
| 435 |
+
"traceback": [
|
| 436 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
| 437 |
+
"\u001b[0;31mParserError\u001b[0m Traceback (most recent call last)",
|
| 438 |
+
"Cell \u001b[0;32mIn[23], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m output_file \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnews_scrap_new21223123.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2\u001b[0m prepared_input \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDATASET/News_csv/ai_vs_human_input_all.csv\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 4\u001b[0m \u001b[43mgrok_step3_5_scraper\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 5\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_file\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepared_input\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 6\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_file\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 7\u001b[0m \u001b[43m \u001b[49m\u001b[43mlimit\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 8\u001b[0m \u001b[43m \u001b[49m\u001b[43mmodel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mMODEL_NAME\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 9\u001b[0m \u001b[43m \u001b[49m\u001b[43mrequests_per_second\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_workers\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m2\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 11\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_retries\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m3\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m)\u001b[49m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(output_file):\n\u001b[1;32m 15\u001b[0m pd\u001b[38;5;241m.\u001b[39mread_csv(output_file)\u001b[38;5;241m.\u001b[39mtail()\n",
|
| 439 |
+
"Cell \u001b[0;32mIn[22], line 45\u001b[0m, in \u001b[0;36mgrok_step3_5_scraper\u001b[0;34m(input_file, output_file, limit, model, requests_per_second, max_workers, max_retries)\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mexists(output_file):\n\u001b[1;32m 44\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 45\u001b[0m existing_df \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_file\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 46\u001b[0m already_done \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlen\u001b[39m(existing_df)\n\u001b[1;32m 47\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m pd\u001b[38;5;241m.\u001b[39merrors\u001b[38;5;241m.\u001b[39mEmptyDataError:\n",
|
| 440 |
+
"File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/readers.py:873\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, skip_blank_lines, parse_dates, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 861\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 862\u001b[0m dialect,\n\u001b[1;32m 863\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 869\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 870\u001b[0m )\n\u001b[1;32m 871\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 873\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 441 |
+
"File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/readers.py:306\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n\u001b[1;32m 305\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m parser:\n\u001b[0;32m--> 306\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mparser\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnrows\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 442 |
+
"File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1947\u001b[0m, in \u001b[0;36mTextFileReader.read\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1940\u001b[0m nrows \u001b[38;5;241m=\u001b[39m validate_integer(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnrows\u001b[39m\u001b[38;5;124m\"\u001b[39m, nrows)\n\u001b[1;32m 1941\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1942\u001b[0m \u001b[38;5;66;03m# error: \"ParserBase\" has no attribute \"read\"\u001b[39;00m\n\u001b[1;32m 1943\u001b[0m (\n\u001b[1;32m 1944\u001b[0m index,\n\u001b[1;32m 1945\u001b[0m columns,\n\u001b[1;32m 1946\u001b[0m col_dict,\n\u001b[0;32m-> 1947\u001b[0m ) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[attr-defined]\u001b[39;49;00m\n\u001b[1;32m 1948\u001b[0m \u001b[43m \u001b[49m\u001b[43mnrows\u001b[49m\n\u001b[1;32m 1949\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1950\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 1951\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n",
|
| 443 |
+
"File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/c_parser_wrapper.py:215\u001b[0m, in \u001b[0;36mCParserWrapper.read\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlow_memory:\n\u001b[0;32m--> 215\u001b[0m chunks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_reader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_low_memory\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnrows\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;66;03m# destructive to chunks\u001b[39;00m\n\u001b[1;32m 217\u001b[0m data \u001b[38;5;241m=\u001b[39m _concatenate_chunks(chunks, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnames)\n",
|
| 444 |
+
"File \u001b[0;32mpandas/_libs/parsers.pyx:832\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader.read_low_memory\u001b[0;34m()\u001b[0m\n",
|
| 445 |
+
"File \u001b[0;32mpandas/_libs/parsers.pyx:897\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n",
|
| 446 |
+
"File \u001b[0;32mpandas/_libs/parsers.pyx:868\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._tokenize_rows\u001b[0;34m()\u001b[0m\n",
|
| 447 |
+
"File \u001b[0;32mpandas/_libs/parsers.pyx:885\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._check_tokenize_status\u001b[0;34m()\u001b[0m\n",
|
| 448 |
+
"File \u001b[0;32mpandas/_libs/parsers.pyx:2084\u001b[0m, in \u001b[0;36mpandas._libs.parsers.raise_parser_error\u001b[0;34m()\u001b[0m\n",
|
| 449 |
+
"\u001b[0;31mParserError\u001b[0m: Error tokenizing data. C error: Expected 8 fields in line 33, saw 16\n"
|
| 450 |
+
]
|
| 451 |
+
}
|
| 452 |
+
],
|
| 453 |
+
"source": [
|
| 454 |
+
"output_file = \"news_scrap_new21223123.csv\"\n",
|
| 455 |
+
"prepared_input = \"DATASET/News_csv/ai_vs_human_input_all.csv\"\n",
|
| 456 |
+
"\n",
|
| 457 |
+
"grok_step3_5_scraper(\n",
|
| 458 |
+
" input_file=prepared_input,\n",
|
| 459 |
+
" output_file=output_file,\n",
|
| 460 |
+
" limit=10,\n",
|
| 461 |
+
" model=MODEL_NAME,\n",
|
| 462 |
+
" requests_per_second=2,\n",
|
| 463 |
+
" max_workers=2,\n",
|
| 464 |
+
" max_retries=3,\n",
|
| 465 |
+
")\n",
|
| 466 |
+
"\n",
|
| 467 |
+
"if os.path.exists(output_file):\n",
|
| 468 |
+
" pd.read_csv(output_file).tail()\n",
|
| 469 |
+
"else:\n",
|
| 470 |
+
" print(f\"No output file found: {output_file}\")"
|
| 471 |
+
]
|
| 472 |
+
},
|
| 473 |
+
{
|
| 474 |
+
"cell_type": "code",
|
| 475 |
+
"execution_count": null,
|
| 476 |
+
"id": "3c3777e8",
|
| 477 |
+
"metadata": {},
|
| 478 |
+
"outputs": [
|
| 479 |
+
{
|
| 480 |
+
"ename": "ParserError",
|
| 481 |
+
"evalue": "Error tokenizing data. C error: Expected 8 fields in line 33, saw 16\n",
|
| 482 |
+
"output_type": "error",
|
| 483 |
+
"traceback": [
|
| 484 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
| 485 |
+
"\u001b[0;31mParserError\u001b[0m Traceback (most recent call last)",
|
| 486 |
+
"Cell \u001b[0;32mIn[14], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m teststes \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutput_file\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mtail()\n",
|
| 487 |
+
"File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/readers.py:873\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, skip_blank_lines, parse_dates, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 861\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 862\u001b[0m dialect,\n\u001b[1;32m 863\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 869\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 870\u001b[0m )\n\u001b[1;32m 871\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 873\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 488 |
+
"File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/readers.py:306\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 303\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n\u001b[1;32m 305\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m parser:\n\u001b[0;32m--> 306\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mparser\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnrows\u001b[49m\u001b[43m)\u001b[49m\n",
|
| 489 |
+
"File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/readers.py:1947\u001b[0m, in \u001b[0;36mTextFileReader.read\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 1940\u001b[0m nrows \u001b[38;5;241m=\u001b[39m validate_integer(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnrows\u001b[39m\u001b[38;5;124m\"\u001b[39m, nrows)\n\u001b[1;32m 1941\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1942\u001b[0m \u001b[38;5;66;03m# error: \"ParserBase\" has no attribute \"read\"\u001b[39;00m\n\u001b[1;32m 1943\u001b[0m (\n\u001b[1;32m 1944\u001b[0m index,\n\u001b[1;32m 1945\u001b[0m columns,\n\u001b[1;32m 1946\u001b[0m col_dict,\n\u001b[0;32m-> 1947\u001b[0m ) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_engine\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# type: ignore[attr-defined]\u001b[39;49;00m\n\u001b[1;32m 1948\u001b[0m \u001b[43m \u001b[49m\u001b[43mnrows\u001b[49m\n\u001b[1;32m 1949\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1950\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m:\n\u001b[1;32m 1951\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mclose()\n",
|
| 490 |
+
"File \u001b[0;32m~/miniconda3/envs/ml/lib/python3.11/site-packages/pandas/io/parsers/c_parser_wrapper.py:215\u001b[0m, in \u001b[0;36mCParserWrapper.read\u001b[0;34m(self, nrows)\u001b[0m\n\u001b[1;32m 213\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 214\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlow_memory:\n\u001b[0;32m--> 215\u001b[0m chunks \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_reader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_low_memory\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnrows\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;66;03m# destructive to chunks\u001b[39;00m\n\u001b[1;32m 217\u001b[0m data \u001b[38;5;241m=\u001b[39m _concatenate_chunks(chunks, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnames)\n",
|
| 491 |
+
"File \u001b[0;32mpandas/_libs/parsers.pyx:832\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader.read_low_memory\u001b[0;34m()\u001b[0m\n",
|
| 492 |
+
"File \u001b[0;32mpandas/_libs/parsers.pyx:897\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._read_rows\u001b[0;34m()\u001b[0m\n",
|
| 493 |
+
"File \u001b[0;32mpandas/_libs/parsers.pyx:868\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._tokenize_rows\u001b[0;34m()\u001b[0m\n",
|
| 494 |
+
"File \u001b[0;32mpandas/_libs/parsers.pyx:885\u001b[0m, in \u001b[0;36mpandas._libs.parsers.TextReader._check_tokenize_status\u001b[0;34m()\u001b[0m\n",
|
| 495 |
+
"File \u001b[0;32mpandas/_libs/parsers.pyx:2084\u001b[0m, in \u001b[0;36mpandas._libs.parsers.raise_parser_error\u001b[0;34m()\u001b[0m\n",
|
| 496 |
+
"\u001b[0;31mParserError\u001b[0m: Error tokenizing data. C error: Expected 8 fields in line 33, saw 16\n"
|
| 497 |
+
]
|
| 498 |
+
}
|
| 499 |
+
],
|
| 500 |
+
"source": [
|
| 501 |
+
"teststes = pd.read_csv(output_file).tail()"
|
| 502 |
+
]
|
| 503 |
+
},
|
| 504 |
+
{
|
| 505 |
+
"cell_type": "code",
|
| 506 |
+
"execution_count": null,
|
| 507 |
+
"id": "89c46554",
|
| 508 |
+
"metadata": {},
|
| 509 |
+
"outputs": [],
|
| 510 |
+
"source": []
|
| 511 |
+
},
|
| 512 |
+
{
|
| 513 |
+
"cell_type": "code",
|
| 514 |
+
"execution_count": null,
|
| 515 |
+
"id": "357ccb81",
|
| 516 |
+
"metadata": {},
|
| 517 |
+
"outputs": [],
|
| 518 |
+
"source": []
|
| 519 |
+
}
|
| 520 |
+
],
|
| 521 |
+
"metadata": {
|
| 522 |
+
"kernelspec": {
|
| 523 |
+
"display_name": "ml",
|
| 524 |
+
"language": "python",
|
| 525 |
+
"name": "python3"
|
| 526 |
+
},
|
| 527 |
+
"language_info": {
|
| 528 |
+
"codemirror_mode": {
|
| 529 |
+
"name": "ipython",
|
| 530 |
+
"version": 3
|
| 531 |
+
},
|
| 532 |
+
"file_extension": ".py",
|
| 533 |
+
"mimetype": "text/x-python",
|
| 534 |
+
"name": "python",
|
| 535 |
+
"nbconvert_exporter": "python",
|
| 536 |
+
"pygments_lexer": "ipython3",
|
| 537 |
+
"version": "3.11.14"
|
| 538 |
+
}
|
| 539 |
+
},
|
| 540 |
+
"nbformat": 4,
|
| 541 |
+
"nbformat_minor": 5
|
| 542 |
+
}
|