{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "18f92b4471414c7ba7858ef546ca9694": { "model_module": "@jupyter-widgets/controls", "model_name": "VBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "VBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "VBoxView", "box_style": "", "children": [], "layout": "IPY_MODEL_25b575728e7a4c36aacfec2fb6a07bd8" } }, "5c5290ace6f4415287a629ba34a40801": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_15d83d95d920490f9db87469aa0467cf", "placeholder": "​", "style": "IPY_MODEL_0f9a2004d6fe420394aa6182db4622e4", "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" } }, "9bbcc97cb57e408793ebb52f2b3a76af": { "model_module": "@jupyter-widgets/controls", "model_name": "PasswordModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "PasswordModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "PasswordView", "continuous_update": true, "description": "Token:", "description_tooltip": null, "disabled": false, "layout": "IPY_MODEL_93aa1893d1cd47e4a21dbba2a3b08fca", "placeholder": "​", "style": "IPY_MODEL_c7addcdadc0648b8b1e49832fed6317e", "value": "" } }, "5239c01a8f25467cbde8321328543ae8": { "model_module": "@jupyter-widgets/controls", "model_name": "CheckboxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "CheckboxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "CheckboxView", "description": "Add token as git credential?", "description_tooltip": null, "disabled": false, "indent": true, "layout": "IPY_MODEL_e18fc1b2d5204ff19582de058526ad8e", "style": "IPY_MODEL_014e7145967f4ef789312b93ad55a58b", "value": true } }, "651f18226c694fb08d31a9df029def31": { "model_module": "@jupyter-widgets/controls", "model_name": "ButtonModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ButtonModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ButtonView", "button_style": "", "description": "Login", "disabled": false, "icon": "", "layout": "IPY_MODEL_f35a9ca56c914af4af9a0957b869edfd", "style": "IPY_MODEL_b0a2da29ab7e4faeadb742377ecb541d", "tooltip": "" } }, "24e495cdc07d4cba99e99f7116bd3564": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b7fe020ca03543e7a90969b931b04ab9", "placeholder": "​", "style": "IPY_MODEL_49d75544a5174653b1fccf64940b65bc", "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. " } }, "25b575728e7a4c36aacfec2fb6a07bd8": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": "center", "align_self": null, "border": null, "bottom": null, "display": "flex", "flex": null, "flex_flow": "column", "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": "50%" } }, "15d83d95d920490f9db87469aa0467cf": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "0f9a2004d6fe420394aa6182db4622e4": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "93aa1893d1cd47e4a21dbba2a3b08fca": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "c7addcdadc0648b8b1e49832fed6317e": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "e18fc1b2d5204ff19582de058526ad8e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "014e7145967f4ef789312b93ad55a58b": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "f35a9ca56c914af4af9a0957b869edfd": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b0a2da29ab7e4faeadb742377ecb541d": { "model_module": "@jupyter-widgets/controls", "model_name": "ButtonStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ButtonStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "button_color": null, "font_weight": "" } }, "b7fe020ca03543e7a90969b931b04ab9": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "49d75544a5174653b1fccf64940b65bc": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "de7d001d076141089c018ec1ff1dbbbc": { "model_module": "@jupyter-widgets/controls", "model_name": "LabelModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "LabelModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "LabelView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_52aee30fd47140c0b64411ed82a1c24c", "placeholder": "​", "style": "IPY_MODEL_4c85e92bbac74c29b023f9fc6accaf4f", "value": "Connecting..." } }, "52aee30fd47140c0b64411ed82a1c24c": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "4c85e92bbac74c29b023f9fc6accaf4f": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "30770abab18645c9a698aec06159ea58": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_2f953a7971d2478cb6c238ce8e738235", "IPY_MODEL_ba84d0eead6042edbc5c3a32be4e1d2f", "IPY_MODEL_cfc3e719458b482fb0c5fb77bece4178" ], "layout": "IPY_MODEL_1510df7b9a5843e4bbc2a2c694843d7e" } }, "2f953a7971d2478cb6c238ce8e738235": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_b29b3ab3d2d34ef6ba93616e75453606", "placeholder": "​", "style": "IPY_MODEL_e971a5dec3a44b06810b0106eaf923bf", "value": "README.md: 100%" } }, "ba84d0eead6042edbc5c3a32be4e1d2f": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_3d22489fd4234b3f8defe2cc8ce83161", "max": 30, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_6bb33383ee1a41ce972675333bba8070", "value": 30 } }, "cfc3e719458b482fb0c5fb77bece4178": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_738b9ae8baa1457fb3dbc3b51df1110d", "placeholder": "​", "style": "IPY_MODEL_31b98097ac464ee78b74c68928d6d171", "value": " 30.0/30.0 [00:00<00:00, 3.13kB/s]" } }, "1510df7b9a5843e4bbc2a2c694843d7e": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "b29b3ab3d2d34ef6ba93616e75453606": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e971a5dec3a44b06810b0106eaf923bf": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "3d22489fd4234b3f8defe2cc8ce83161": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "6bb33383ee1a41ce972675333bba8070": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "738b9ae8baa1457fb3dbc3b51df1110d": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "31b98097ac464ee78b74c68928d6d171": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "da0959189c1e4408af4fef60cc1a69fc": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_8beb703726c747a3a7fcbb8a74a2ff5d", "IPY_MODEL_79911e3f198844fc82ce62a5194d4e80", "IPY_MODEL_a18126a395d94c1585dc686d88d76bb4" ], "layout": "IPY_MODEL_a2373620b57e41b8b38bb2cb143ac42b" } }, "8beb703726c747a3a7fcbb8a74a2ff5d": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_e244474b51e84996aebd1cafc269bf3f", "placeholder": "​", "style": "IPY_MODEL_2cfc12c3b3644e52a0533b6213edffc9", "value": "dataset.json: 100%" } }, "79911e3f198844fc82ce62a5194d4e80": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_95463f72e82d4336826615b6de356213", "max": 11582708, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_9f8bc73d5bb44f40b20c3e6b6c8dac1d", "value": 11582708 } }, "a18126a395d94c1585dc686d88d76bb4": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_6eb160ab03254dc4923d5a15b1786115", "placeholder": "​", "style": "IPY_MODEL_34017a6d3cfd424fba949cf91add7e9e", "value": " 11.6M/11.6M [00:01<00:00, 57.7MB/s]" } }, "a2373620b57e41b8b38bb2cb143ac42b": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "e244474b51e84996aebd1cafc269bf3f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "2cfc12c3b3644e52a0533b6213edffc9": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "95463f72e82d4336826615b6de356213": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "9f8bc73d5bb44f40b20c3e6b6c8dac1d": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "6eb160ab03254dc4923d5a15b1786115": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "34017a6d3cfd424fba949cf91add7e9e": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "02e208dd03624308a4f93df0189ba7fd": { "model_module": "@jupyter-widgets/controls", "model_name": "HBoxModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HBoxModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HBoxView", "box_style": "", "children": [ "IPY_MODEL_a37b8252f3654f8286d19d70029bce5b", "IPY_MODEL_4f6fc6e8ad8543fab23a2db5c459bba5", "IPY_MODEL_48538ec25ece4f11af71d2e886825e43" ], "layout": "IPY_MODEL_452619760ba548dabf932e58d948c31f" } }, "a37b8252f3654f8286d19d70029bce5b": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_30b59116166c46a2a4e71b93e51e67d6", "placeholder": "​", "style": "IPY_MODEL_19d14f2901a1481099b983a9b416c80b", "value": "Generating train split: " } }, "4f6fc6e8ad8543fab23a2db5c459bba5": { "model_module": "@jupyter-widgets/controls", "model_name": "FloatProgressModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "FloatProgressModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "ProgressView", "bar_style": "success", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_bbf03e75175c4e69ba27b9eb5fdbaeef", "max": 1, "min": 0, "orientation": "horizontal", "style": "IPY_MODEL_28f4437d656d4631ab12985a37509be4", "value": 1 } }, "48538ec25ece4f11af71d2e886825e43": { "model_module": "@jupyter-widgets/controls", "model_name": "HTMLModel", "model_module_version": "1.5.0", "state": { "_dom_classes": [], "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "HTMLModel", "_view_count": null, "_view_module": "@jupyter-widgets/controls", "_view_module_version": "1.5.0", "_view_name": "HTMLView", "description": "", "description_tooltip": null, "layout": "IPY_MODEL_3fded25665e9473dae3867bb69ce8a35", "placeholder": "​", "style": "IPY_MODEL_fa2dcad26a724ddf8dbb0e179e6f3e24", "value": " 10642/0 [00:00<00:00, 18988.51 examples/s]" } }, "452619760ba548dabf932e58d948c31f": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "30b59116166c46a2a4e71b93e51e67d6": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "19d14f2901a1481099b983a9b416c80b": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } }, "bbf03e75175c4e69ba27b9eb5fdbaeef": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": "20px" } }, "28f4437d656d4631ab12985a37509be4": { "model_module": "@jupyter-widgets/controls", "model_name": "ProgressStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "ProgressStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "bar_color": null, "description_width": "" } }, "3fded25665e9473dae3867bb69ce8a35": { "model_module": "@jupyter-widgets/base", "model_name": "LayoutModel", "model_module_version": "1.2.0", "state": { "_model_module": "@jupyter-widgets/base", "_model_module_version": "1.2.0", "_model_name": "LayoutModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "LayoutView", "align_content": null, "align_items": null, "align_self": null, "border": null, "bottom": null, "display": null, "flex": null, "flex_flow": null, "grid_area": null, "grid_auto_columns": null, "grid_auto_flow": null, "grid_auto_rows": null, "grid_column": null, "grid_gap": null, "grid_row": null, "grid_template_areas": null, "grid_template_columns": null, "grid_template_rows": null, "height": null, "justify_content": null, "justify_items": null, "left": null, "margin": null, "max_height": null, "max_width": null, "min_height": null, "min_width": null, "object_fit": null, "object_position": null, "order": null, "overflow": null, "overflow_x": null, "overflow_y": null, "padding": null, "right": null, "top": null, "visibility": null, "width": null } }, "fa2dcad26a724ddf8dbb0e179e6f3e24": { "model_module": "@jupyter-widgets/controls", "model_name": "DescriptionStyleModel", "model_module_version": "1.5.0", "state": { "_model_module": "@jupyter-widgets/controls", "_model_module_version": "1.5.0", "_model_name": "DescriptionStyleModel", "_view_count": null, "_view_module": "@jupyter-widgets/base", "_view_module_version": "1.2.0", "_view_name": "StyleView", "description_width": "" } } } }, "accelerator": "GPU" }, "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "id": "VR8UXa7wKT7b", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "3655a54e-c6c2-443f-c3c2-48369f516a2e" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Collecting underthesea\n", " Downloading underthesea-9.4.0-py3-none-any.whl.metadata (10 kB)\n", "Requirement already satisfied: datasets in /usr/local/lib/python3.12/dist-packages (4.0.0)\n", "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.12/dist-packages (1.6.1)\n", "Requirement already satisfied: huggingface_hub in /usr/local/lib/python3.12/dist-packages (1.11.0)\n", "Requirement already satisfied: Click>=6.0 in /usr/local/lib/python3.12/dist-packages (from underthesea) (8.3.3)\n", "Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from underthesea) (4.67.3)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.12/dist-packages (from underthesea) (2.32.4)\n", "Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from underthesea) (1.5.3)\n", "Requirement already satisfied: PyYAML in /usr/local/lib/python3.12/dist-packages (from underthesea) (6.0.3)\n", "Collecting underthesea_core>=3.3.0 (from underthesea)\n", " Downloading underthesea_core-3.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.7 kB)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.12/dist-packages (from datasets) (3.29.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.12/dist-packages (from datasets) (2.0.2)\n", "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (18.1.0)\n", "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.12/dist-packages (from datasets) (0.3.8)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.12/dist-packages (from datasets) (2.2.2)\n", "Requirement already satisfied: xxhash in /usr/local/lib/python3.12/dist-packages (from datasets) (3.6.0)\n", "Requirement already satisfied: multiprocess<0.70.17 in /usr/local/lib/python3.12/dist-packages (from datasets) (0.70.16)\n", "Requirement already satisfied: fsspec<=2025.3.0,>=2023.1.0 in /usr/local/lib/python3.12/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2025.3.0)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.12/dist-packages (from datasets) (26.1)\n", "Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (1.16.3)\n", "Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.12/dist-packages (from scikit-learn) (3.6.0)\n", "Requirement already satisfied: hf-xet<2.0.0,>=1.4.3 in /usr/local/lib/python3.12/dist-packages (from huggingface_hub) (1.4.3)\n", "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.12/dist-packages (from huggingface_hub) (0.28.1)\n", "Requirement already satisfied: typer in /usr/local/lib/python3.12/dist-packages (from huggingface_hub) (0.24.2)\n", "Requirement already satisfied: typing-extensions>=4.1.0 in /usr/local/lib/python3.12/dist-packages (from huggingface_hub) (4.15.0)\n", "Requirement already satisfied: aiohttp!=4.0.0a0,!=4.0.0a1 in /usr/local/lib/python3.12/dist-packages (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (3.13.5)\n", "Requirement already satisfied: anyio in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface_hub) (4.13.0)\n", "Requirement already satisfied: certifi in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface_hub) (2026.4.22)\n", "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface_hub) (1.0.9)\n", "Requirement already satisfied: idna in /usr/local/lib/python3.12/dist-packages (from httpx<1,>=0.23.0->huggingface_hub) (3.13)\n", "Requirement already satisfied: h11>=0.16 in /usr/local/lib/python3.12/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->huggingface_hub) (0.16.0)\n", "Requirement already satisfied: charset_normalizer<4,>=2 in /usr/local/lib/python3.12/dist-packages (from requests->underthesea) (3.4.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.12/dist-packages (from requests->underthesea) (2.5.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2.9.0.post0)\n", "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2025.2)\n", "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.12/dist-packages (from pandas->datasets) (2026.1)\n", "Requirement already satisfied: shellingham>=1.3.0 in /usr/local/lib/python3.12/dist-packages (from typer->huggingface_hub) (1.5.4)\n", "Requirement already satisfied: rich>=12.3.0 in /usr/local/lib/python3.12/dist-packages (from typer->huggingface_hub) (13.9.4)\n", "Requirement already satisfied: annotated-doc>=0.0.2 in /usr/local/lib/python3.12/dist-packages (from typer->huggingface_hub) (0.0.4)\n", "Requirement already satisfied: aiohappyeyeballs>=2.5.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (2.6.1)\n", "Requirement already satisfied: aiosignal>=1.4.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.4.0)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (26.1.0)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.8.0)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (6.7.1)\n", "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (0.4.1)\n", "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.12/dist-packages (from aiohttp!=4.0.0a0,!=4.0.0a1->fsspec[http]<=2025.3.0,>=2023.1.0->datasets) (1.23.0)\n", "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.12/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.17.0)\n", "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.12/dist-packages (from rich>=12.3.0->typer->huggingface_hub) (4.0.0)\n", "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.12/dist-packages (from rich>=12.3.0->typer->huggingface_hub) (2.20.0)\n", "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.12/dist-packages (from markdown-it-py>=2.2.0->rich>=12.3.0->typer->huggingface_hub) (0.1.2)\n", "Downloading underthesea-9.4.0-py3-none-any.whl (7.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.3/7.3 MB\u001b[0m \u001b[31m88.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading underthesea_core-3.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m91.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: underthesea_core, underthesea\n", "Successfully installed underthesea-9.4.0 underthesea_core-3.3.0\n" ] } ], "source": [ "!pip install underthesea datasets scikit-learn huggingface_hub" ] }, { "cell_type": "code", "source": [ "from huggingface_hub import login\n", "\n", "login()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 141, "referenced_widgets": [ "18f92b4471414c7ba7858ef546ca9694", "5c5290ace6f4415287a629ba34a40801", "9bbcc97cb57e408793ebb52f2b3a76af", "5239c01a8f25467cbde8321328543ae8", "651f18226c694fb08d31a9df029def31", "24e495cdc07d4cba99e99f7116bd3564", "25b575728e7a4c36aacfec2fb6a07bd8", "15d83d95d920490f9db87469aa0467cf", "0f9a2004d6fe420394aa6182db4622e4", "93aa1893d1cd47e4a21dbba2a3b08fca", "c7addcdadc0648b8b1e49832fed6317e", "e18fc1b2d5204ff19582de058526ad8e", "014e7145967f4ef789312b93ad55a58b", "f35a9ca56c914af4af9a0957b869edfd", "b0a2da29ab7e4faeadb742377ecb541d", "b7fe020ca03543e7a90969b931b04ab9", "49d75544a5174653b1fccf64940b65bc", "de7d001d076141089c018ec1ff1dbbbc", "52aee30fd47140c0b64411ed82a1c24c", "4c85e92bbac74c29b023f9fc6accaf4f" ] }, "id": "_NC3VCthA9uV", "outputId": "873b8d6a-d70f-4619-f86f-d4cb1302d211" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:93: UserWarning: \n", "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", "You will be able to reuse this secret in all of your notebooks.\n", "Please note that authentication is recommended but still optional to access public models or datasets.\n", " warnings.warn(\n" ] }, { "output_type": "display_data", "data": { "text/plain": [ "VBox(children=(HTML(value='
\"] = 0\n", "vocab[\"\"] = 1\n", "\n", "print(\"Vocab size:\", len(vocab))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OhnAF0iuC3yu", "outputId": "e01f9d16-e5bb-44d1-a977-9f479d47e6a3" }, "execution_count": 9, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Vocab size: 37282\n" ] } ] }, { "cell_type": "code", "source": [ "MAX_LEN = 200\n", "\n", "def encode_tokens(tokens):\n", " ids = [vocab.get(t, 1) for t in tokens]\n", "\n", " ids = ids[:MAX_LEN]\n", "\n", " ids += [0] * (MAX_LEN - len(ids))\n", "\n", " return ids" ], "metadata": { "id": "ArRdlEEze7f6" }, "execution_count": 10, "outputs": [] }, { "cell_type": "code", "source": [ "for item in train_data:\n", " item[\"input_ids\"] = encode_tokens(item[\"tokens\"])\n", "\n", "for item in val_data:\n", " item[\"input_ids\"] = encode_tokens(item[\"tokens\"])\n", "\n", "for item in test_data:\n", " item[\"input_ids\"] = encode_tokens(item[\"tokens\"])" ], "metadata": { "id": "JH-OZUj4EsrL" }, "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [ "def extract_features(data):\n", " return np.array([\n", " [\n", " item[\"num_words\"],\n", " item[\"avg_sentence_length\"],\n", " item[\"type_token_ratio\"]\n", " ]\n", " for item in data\n", " ])" ], "metadata": { "id": "dEnkRjdaEvuz" }, "execution_count": 12, "outputs": [] }, { "cell_type": "code", "source": [ "scaler = StandardScaler()\n", "\n", "train_features = scaler.fit_transform(\n", " extract_features(train_data)\n", ")\n", "\n", "val_features = scaler.transform(\n", " extract_features(val_data)\n", ")\n", "\n", "test_features = scaler.transform(\n", " extract_features(test_data)\n", ")" ], "metadata": { "id": "m1Ue0TyuE0Xk" }, "execution_count": 13, "outputs": [] }, { "cell_type": "code", "source": [ "class TextDataset(Dataset):\n", " def __init__(self, data, features):\n", " self.data = data\n", " self.features = features\n", "\n", " def __len__(self):\n", " return len(self.data)\n", "\n", " def __getitem__(self, idx):\n", " item = self.data[idx]\n", "\n", " input_ids = torch.tensor(\n", " item[\"input_ids\"],\n", " dtype=torch.long\n", " )\n", "\n", " feature = torch.tensor(\n", " self.features[idx],\n", " dtype=torch.float\n", " )\n", "\n", " label = torch.tensor(\n", " item[\"level\"],\n", " dtype=torch.long\n", " )\n", "\n", " return {\n", " \"input_ids\": input_ids,\n", " \"features\": feature,\n", " \"label\": label\n", " }" ], "metadata": { "id": "eAsDGbXnE2w3" }, "execution_count": 14, "outputs": [] }, { "cell_type": "code", "source": [ "train_dataset = TextDataset(\n", " train_data,\n", " train_features\n", ")\n", "\n", "val_dataset = TextDataset(\n", " val_data,\n", " val_features\n", ")\n", "\n", "test_dataset = TextDataset(\n", " test_data,\n", " test_features\n", ")\n", "\n", "train_loader = DataLoader(\n", " train_dataset,\n", " batch_size=32,\n", " shuffle=True\n", ")\n", "\n", "val_loader = DataLoader(\n", " val_dataset,\n", " batch_size=32\n", ")\n", "\n", "test_loader = DataLoader(\n", " test_dataset,\n", " batch_size=32\n", ")" ], "metadata": { "id": "c4yMahWlE6IC" }, "execution_count": 15, "outputs": [] }, { "cell_type": "code", "source": [ "class Attention(nn.Module):\n", " def __init__(self, hidden_dim):\n", " super().__init__()\n", "\n", " self.attention = nn.Linear(\n", " hidden_dim * 2,\n", " 1\n", " )\n", "\n", " def forward(self, lstm_output):\n", " scores = self.attention(\n", " lstm_output\n", " ).squeeze(-1)\n", "\n", " weights = torch.softmax(\n", " scores,\n", " dim=1\n", " )\n", "\n", " context = torch.sum(\n", " lstm_output * weights.unsqueeze(-1),\n", " dim=1\n", " )\n", "\n", " return context" ], "metadata": { "id": "7GspSTaFE7K3" }, "execution_count": 16, "outputs": [] }, { "cell_type": "code", "source": [ "class BiLSTMReadability(nn.Module):\n", " def __init__(\n", " self,\n", " vocab_size,\n", " embed_dim=128,\n", " hidden_dim=128,\n", " num_features=3,\n", " num_classes=3,\n", " dropout=0.3\n", " ):\n", " super().__init__()\n", "\n", " self.embedding = nn.Embedding(\n", " vocab_size,\n", " embed_dim,\n", " padding_idx=0\n", " )\n", "\n", " self.lstm = nn.LSTM(\n", " embed_dim,\n", " hidden_dim,\n", " batch_first=True,\n", " bidirectional=True\n", " )\n", "\n", " self.attention = Attention(\n", " hidden_dim\n", " )\n", "\n", " self.dropout = nn.Dropout(\n", " dropout\n", " )\n", "\n", " self.classifier = nn.Sequential(\n", " nn.Linear(\n", " hidden_dim * 2 + num_features,\n", " 128\n", " ),\n", "\n", " nn.ReLU(),\n", "\n", " nn.Dropout(dropout),\n", "\n", " nn.Linear(128, num_classes)\n", " )\n", "\n", " def forward(self, input_ids, features):\n", " x = self.embedding(input_ids)\n", "\n", " lstm_out, _ = self.lstm(x)\n", "\n", " context = self.attention(lstm_out)\n", "\n", " context = self.dropout(context)\n", "\n", " combined = torch.cat(\n", " [context, features],\n", " dim=1\n", " )\n", "\n", " logits = self.classifier(combined)\n", "\n", " return logits" ], "metadata": { "id": "C7cYo2coE9fr" }, "execution_count": 17, "outputs": [] }, { "cell_type": "code", "source": [ "device = torch.device(\n", " \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", ")\n", "\n", "model = BiLSTMReadability(\n", " vocab_size=len(vocab)\n", ").to(device)\n", "\n", "train_labels = [\n", " item[\"level\"]\n", " for item in train_data\n", "]\n", "\n", "weights = compute_class_weight(\n", " class_weight=\"balanced\",\n", " classes=np.array([0, 1, 2]),\n", " y=train_labels\n", ")\n", "\n", "weights = torch.tensor(\n", " weights,\n", " dtype=torch.float\n", ")\n", "\n", "criterion = nn.CrossEntropyLoss(\n", " weight=weights.to(device)\n", ")\n", "\n", "optimizer = torch.optim.Adam(\n", " model.parameters(),\n", " lr=5e-4\n", ")" ], "metadata": { "id": "Weq7VvbBFBbT" }, "execution_count": 18, "outputs": [] }, { "cell_type": "code", "source": [ "best_val_f1 = 0\n", "\n", "patience = 3\n", "counter = 0\n", "\n", "EPOCHS = 10\n", "\n", "for epoch in range(EPOCHS):\n", "\n", " # TRAIN\n", " model.train()\n", "\n", " total_loss = 0\n", "\n", " for batch in train_loader:\n", "\n", " input_ids = batch[\"input_ids\"].to(device)\n", " features = batch[\"features\"].to(device)\n", " labels = batch[\"label\"].to(device)\n", "\n", " logits = model(\n", " input_ids,\n", " features\n", " )\n", "\n", " loss = criterion(\n", " logits,\n", " labels\n", " )\n", "\n", " optimizer.zero_grad()\n", "\n", " loss.backward()\n", "\n", " optimizer.step()\n", "\n", " total_loss += loss.item()\n", "\n", " # VALIDATION\n", " model.eval()\n", "\n", " all_preds = []\n", " all_labels = []\n", "\n", " with torch.no_grad():\n", "\n", " for batch in val_loader:\n", "\n", " input_ids = batch[\"input_ids\"].to(device)\n", " features = batch[\"features\"].to(device)\n", " labels = batch[\"label\"].to(device)\n", "\n", " logits = model(\n", " input_ids,\n", " features\n", " )\n", "\n", " preds = torch.argmax(\n", " logits,\n", " dim=1\n", " )\n", "\n", " all_preds.extend(\n", " preds.cpu().numpy()\n", " )\n", "\n", " all_labels.extend(\n", " labels.cpu().numpy()\n", " )\n", "\n", " val_f1 = f1_score(\n", " all_labels,\n", " all_preds,\n", " average=\"macro\"\n", " )\n", "\n", " print(\n", " f\"Epoch {epoch+1} \"\n", " f\"| Loss: {total_loss:.4f} \"\n", " f\"| Val F1: {val_f1:.4f}\"\n", " )\n", "\n", " # EARLY STOPPING\n", " if val_f1 > best_val_f1:\n", "\n", " best_val_f1 = val_f1\n", "\n", " counter = 0\n", "\n", " torch.save(\n", " model.state_dict(),\n", " \"pytorch_model.bin\"\n", " )\n", "\n", " else:\n", " counter += 1\n", "\n", " if counter >= patience:\n", " print(\"Early stopping!\")\n", " break" ], "metadata": { "id": "8g1G3Kjde961", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "cb42e986-40f5-4140-8232-8283819b7242" }, "execution_count": 19, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Epoch 1 | Loss: 183.0716 | Val F1: 0.6738\n", "Epoch 2 | Loss: 130.3617 | Val F1: 0.7389\n", "Epoch 3 | Loss: 98.9159 | Val F1: 0.7918\n", "Epoch 4 | Loss: 70.6794 | Val F1: 0.7764\n", "Epoch 5 | Loss: 52.0055 | Val F1: 0.8059\n", "Epoch 6 | Loss: 36.5027 | Val F1: 0.8126\n", "Epoch 7 | Loss: 30.9510 | Val F1: 0.8055\n", "Epoch 8 | Loss: 18.1933 | Val F1: 0.8118\n", "Epoch 9 | Loss: 10.3218 | Val F1: 0.8262\n", "Epoch 10 | Loss: 5.9807 | Val F1: 0.8226\n" ] } ] }, { "cell_type": "code", "source": [ "model.load_state_dict(\n", " torch.load(\"pytorch_model.bin\")\n", ")\n", "\n", "model.eval()\n", "\n", "all_preds = []\n", "all_labels = []\n", "\n", "with torch.no_grad():\n", "\n", " for batch in test_loader:\n", "\n", " input_ids = batch[\"input_ids\"].to(device)\n", " features = batch[\"features\"].to(device)\n", " labels = batch[\"label\"].to(device)\n", "\n", " logits = model(\n", " input_ids,\n", " features\n", " )\n", "\n", " preds = torch.argmax(\n", " logits,\n", " dim=1\n", " )\n", "\n", " all_preds.extend(\n", " preds.cpu().numpy()\n", " )\n", "\n", " all_labels.extend(\n", " labels.cpu().numpy()\n", " )\n", "\n", "print(\n", " classification_report(\n", " all_labels,\n", " all_preds,\n", " digits=4\n", " )\n", ")" ], "metadata": { "id": "ayCu8T3wfAiZ", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "e1fda29c-2bd4-4ec7-f713-169a7bca6840" }, "execution_count": 20, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " precision recall f1-score support\n", "\n", " 0 0.8862 0.8132 0.8481 182\n", " 1 0.7956 0.8898 0.8400 844\n", " 2 0.8333 0.7093 0.7663 571\n", "\n", " accuracy 0.8165 1597\n", " macro avg 0.8384 0.8041 0.8182 1597\n", "weighted avg 0.8194 0.8165 0.8146 1597\n", "\n" ] } ] }, { "cell_type": "code", "source": [ "# SAVE VOCAB\n", "with open(\"vocab.pkl\", \"wb\") as f:\n", " pickle.dump(vocab, f)\n", "\n", "# SAVE SCALER\n", "with open(\"scaler.pkl\", \"wb\") as f:\n", " pickle.dump(scaler, f)\n", "\n", "# SAVE CONFIG\n", "config = {\n", " \"model_type\": \"BiLSTM\",\n", " \"embedding_dim\": 128,\n", " \"hidden_dim\": 128,\n", " \"max_length\": 200,\n", " \"num_classes\": 3\n", "}\n", "\n", "with open(\"config.json\", \"w\") as f:\n", " json.dump(config, f, indent=4)" ], "metadata": { "id": "LforjqZ_fCzm" }, "execution_count": 21, "outputs": [] }, { "cell_type": "code", "source": [ "label_map = {\n", " 0: \"Primary\",\n", " 1: \"Middle School\",\n", " 2: \"High School\"\n", "}\n", "\n", "def predict(text):\n", "\n", " model.eval()\n", "\n", " tokens = tokenize(text)\n", "\n", " ids = encode_tokens(tokens)\n", "\n", " input_ids = torch.tensor(\n", " [ids],\n", " dtype=torch.long\n", " ).to(device)\n", "\n", " feature_vector = np.array([[\n", " len(tokens),\n", " max(1, len(tokens) // 5),\n", " len(set(tokens)) / max(1, len(tokens))\n", " ]])\n", "\n", " feature_vector = scaler.transform(\n", " feature_vector\n", " )\n", "\n", " features = torch.tensor(\n", " feature_vector,\n", " dtype=torch.float\n", " ).to(device)\n", "\n", " with torch.no_grad():\n", "\n", " logits = model(\n", " input_ids,\n", " features\n", " )\n", "\n", " pred = torch.argmax(\n", " logits,\n", " dim=1\n", " ).item()\n", "\n", " return label_map[pred]" ], "metadata": { "id": "U9IHkUr2fFIR" }, "execution_count": 22, "outputs": [] }, { "cell_type": "code", "source": [ "sample = \"\"\"\n", "Mặt trời là ngôi sao nằm ở trung tâm của hệ Mặt Trời.\n", "\"\"\"\n", "\n", "print(predict(sample))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RvM6d0T3fHVH", "outputId": "75b12f20-f0c7-4c07-fea4-ace5580664d6" }, "execution_count": 23, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Middle School\n" ] } ] } ] }