vietnamese_hate_speech_detection

Sleeping

App Files Community

jesse-tong committed on Apr 6, 2025

Commit

f292cd1

1 Parent(s): 8a6c918

First commit

Browse files

Files changed (13) hide show

LICENSE +674 -0
dataset.py +90 -36
distill_bert_to_lstm.py +7 -4
example_uses.md +4 -4
inference_example.py +24 -7
inference_lstm.py +16 -5
knowledge_distillation.py +29 -2
model.py +7 -5
requirements.txt +6 -2
run.py +0 -86
train.py +9 -5
trainer.py +111 -71
utils/word_segmentation_vi.py +23 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,674 @@

+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+ Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+                            Preamble
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+  The precise terms and conditions for copying, distribution and
+modification follow.
+                       TERMS AND CONDITIONS
+  0. Definitions.
+  "This License" refers to version 3 of the GNU General Public License.
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+  1. Source Code.
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+  The Corresponding Source for a work in source code form is that
+same work.
+  2. Basic Permissions.
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+  4. Conveying Verbatim Copies.
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+  5. Conveying Modified Source Versions.
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+  6. Conveying Non-Source Forms.
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+  7. Additional Terms.
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+  8. Termination.
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+  9. Acceptance Not Required for Having Copies.
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+  10. Automatic Licensing of Downstream Recipients.
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+  11. Patents.
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+  12. No Surrender of Others' Freedom.
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+  13. Use with the GNU Affero General Public License.
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+  14. Revised Versions of this License.
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+  15. Disclaimer of Warranty.
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+  16. Limitation of Liability.
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+  17. Interpretation of Sections 15 and 16.
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+                     END OF TERMS AND CONDITIONS
+            How to Apply These Terms to Your New Programs
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+Also add information on how to contact you by electronic and paper mail.
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.

dataset.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import torch
 from torch.utils.data import Dataset, DataLoader
-from transformers import BertTokenizer
 import pandas as pd
 import numpy as np
 import logging
@@ -15,26 +15,51 @@ class DocumentDataset(Dataset):
     def __init__(self, texts, labels, tokenizer_name='bert-base-uncased', max_length=512, num_classes=None):
         self.texts = texts
         self.labels = labels
-        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
         self.max_length = max_length
-        # Validate labels
-        unique_labels = set(labels)
-        min_label = min(unique_labels) if unique_labels else 0
-        max_label = max(unique_labels) if unique_labels else 0
-        # Log warning if labels might be out of range
-        if num_classes is not None and (min_label < 0 or max_label >= num_classes):
-            logger.warning(f"LABEL RANGE ERROR: Labels must be between 0 and {num_classes-1}, "
-                          f"but found range [{min_label}, {max_label}]")
-            logger.warning(f"Unique label values: {sorted(unique_labels)}")
-            # Fix labels by remapping them to start from 0 (some datasets might have labels starting from 1)
-            if min_label != 0:
-                logger.warning(f"Auto-correcting labels to be zero-indexed...")
-                label_map = {original: idx for idx, original in enumerate(sorted(unique_labels))}
-                self.labels = np.array([label_map[label] for label in labels])
-                logger.warning(f"New unique label values: {sorted(set(self.labels))}")
     def __len__(self):
         return len(self.texts)
@@ -68,7 +93,8 @@ class DocumentDataset(Dataset):
             'text': self.texts[idx],
             'label': self.labels[idx]
         }
-def load_data(data_path, text_col='text', label_col='label', validation_split=0.1, test_split=0.1, seed=42):
     """
     Load data from CSV/TSV and split into train, validation and test sets
     """
@@ -80,23 +106,51 @@ def load_data(data_path, text_col='text', label_col='label', validation_split=0.
     else:
         raise ValueError("Unsupported file format. Please provide CSV or TSV file.")
-    # Convert labels to numeric if they aren't already
-    if not np.issubdtype(df[label_col].dtype, np.number):
-        label_map = {label: idx for idx, label in enumerate(sorted(df[label_col].unique()))}
-        df['label_numeric'] = df[label_col].map(label_map)
-        labels = df['label_numeric'].values
-        # Log the mapping for reference
-        logger.info(f"Label mapping: {label_map}")
-    else:
-        labels = df[label_col].values
-        # Check if labels start from 0
-        min_label = labels.min()
-        if min_label != 0:
-            logger.warning(f"Labels don't start from 0 (min={min_label}). Converting to zero-indexed...")
-            label_map = {label: idx for idx, label in enumerate(sorted(set(labels)))}
-            labels = np.array([label_map[label] for label in labels])
     # Create a DataFrame with text and numeric labels
     texts = df[text_col].values

 import torch
 from torch.utils.data import Dataset, DataLoader
+from transformers import BertTokenizer, AutoTokenizer
 import pandas as pd
 import numpy as np
 import logging
     def __init__(self, texts, labels, tokenizer_name='bert-base-uncased', max_length=512, num_classes=None):
         """Store the samples, load the tokenizer and validate the label range.

         Args:
             texts: Sequence of input documents.
             labels: Either a 1-D sequence with one label per document, or a
                 2-D array-like with one row per document and one column per
                 label category.
             tokenizer_name: Name or path passed to ``AutoTokenizer.from_pretrained``.
             max_length: Stored on the instance as ``self.max_length``.
             num_classes: Number of classes per label column. When given,
                 columns whose values fall outside ``[0, num_classes - 1]`` are
                 reported and, if their minimum is not 0, remapped to
                 consecutive zero-based indices.
         """
         self.texts = texts
         self.labels = labels
         self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
         self.max_length = max_length

         def _zero_indexed(column):
             """Check one label column; return the remapped array, or None if it is left as is."""
             unique_labels = set(column)
             min_label = min(unique_labels) if unique_labels else 0
             max_label = max(unique_labels) if unique_labels else 0
             # Nothing to report when no class count was given or the range is valid.
             if num_classes is None or not (min_label < 0 or max_label >= num_classes):
                 return None
             logger.warning(f"Label Range Error: Labels must be between 0 and {num_classes-1}, "
                            f"but found range [{min_label}, {max_label}]")
             logger.warning(f"Unique label values: {sorted(unique_labels)}")
             # Only remap when the labels do not start at 0 (some datasets start from 1).
             if min_label == 0:
                 return None
             logger.warning("Auto-correcting labels to be zero-indexed...")
             label_map = {original: idx for idx, original in enumerate(sorted(unique_labels))}
             remapped = np.array([label_map[label] for label in column])
             logger.warning(f"New unique label values: {sorted(set(remapped))}")
             return remapped

         # Dispatch on dimensionality rather than on the container type: the
         # previous `type(...) is not ndarray or type(...) is not list` test was
         # always true, so 2-D labels never reached the per-column branch.
         label_array = np.array(labels)
         if label_array.ndim < 2:
             remapped = _zero_indexed(labels)
             if remapped is not None:
                 self.labels = remapped
         else:
             # label_array is a copy, so the caller's array is not modified in place.
             for i in range(label_array.shape[1]):
                 remapped = _zero_indexed(label_array[:, i])
                 if remapped is not None:
                     label_array[:, i] = remapped
             self.labels = label_array
     def __len__(self):
         return len(self.texts)
             'text': self.texts[idx],
             'label': self.labels[idx]
         }
+def load_data(data_path, text_col='text', label_col: str | list ='label', validation_split=0.1, test_split=0.1, seed=42):
     """
     Load data from CSV/TSV and split into train, validation and test sets
     """
     else:
         raise ValueError("Unsupported file format. Please provide CSV or TSV file.")
+    # If label_col is a list of columns, do the below but for each column
+    if isinstance(label_col, list):
+        labels = None
+        for idx, label in enumerate(label_col):
+            if label not in df.columns:
+                raise ValueError(f"Label column '{label}' not found in the dataset.")
+            # Convert labels to numeric if they aren't already
+            if not np.issubdtype(df[label].dtype, np.number):
+                label_map = {label: idx for idx, label in enumerate(sorted(df[label].unique()))}
+                df[f'label_numeric_{idx}'] = df[label].map(label_map)
+                if labels is None:
+                    labels = df[f'label_numeric_{idx}'].values
+                else:
+                    # Extend the labels array to dim 1
+                    labels = np.column_stack((labels, df[f'label_numeric_{idx}'].values))
+                # Log the mapping for reference
+                logger.info(f"Label mapping for column '{label}': {label_map}")
+            else:
+                # Check if labels start from 0
+                labels = df[label].values
+                min_label = labels.min()
+                if min_label != 0:
+                    logger.warning(f"Labels don't start from 0 (min={min_label}). Converting to zero-indexed...")
+                    label_map = {label: idx for idx, label in enumerate(sorted(set(labels)))}
+                    labels = np.array([label_map[label] for label in labels])
+    else: # In case there is only one label column
+        # Convert labels to numeric if they aren't already
+        if not np.issubdtype(df[label_col].dtype, np.number):
+            label_map = {label: idx for idx, label in enumerate(sorted(df[label_col].unique()))}
+            df['label_numeric'] = df[label_col].map(label_map)
+            labels = df['label_numeric'].values
+            # Log the mapping for reference
+            logger.info(f"Label mapping: {label_map}")
+        else:
+            labels = df[label_col].values
+            # Check if labels start from 0
+            min_label = labels.min()
+            if min_label != 0:
+                logger.warning(f"Labels don't start from 0 (min={min_label}). Converting to zero-indexed...")
+                label_map = {label: idx for idx, label in enumerate(sorted(set(labels)))}
+                labels = np.array([label_map[label] for label in labels])
     # Create a DataFrame with text and numeric labels
     texts = df[text_col].values

distill_bert_to_lstm.py CHANGED Viewed

@@ -37,7 +37,7 @@ def main():
     # Data arguments
     parser.add_argument("--data_path", type=str, required=True, help="Path to the dataset file (CSV or TSV)")
     parser.add_argument("--text_column", type=str, default="text", help="Name of the text column")
-    parser.add_argument("--label_column", type=str, default="label", help="Name of the label column")
     parser.add_argument("--val_split", type=float, default=0.1, help="Validation set split ratio")
     parser.add_argument("--test_split", type=float, default=0.1, help="Test set split ratio")
@@ -79,10 +79,12 @@ def main():
     logger.info("Loading and preparing data...")
     # Load data first
     train_data, val_data, test_data = load_data(
         args.data_path,
         text_col=args.text_column,
-        label_col=args.label_column,
         validation_split=args.val_split,
         test_split=args.test_split,
         seed=args.seed
@@ -115,7 +117,8 @@ def main():
     bert_model = DocBERT(
         num_classes=args.num_classes,
         bert_model_name=args.bert_model,
-        dropout_prob=0.1
     )
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     # Load saved BERT weights
@@ -128,7 +131,7 @@ def main():
         vocab_size=vocab_size,
         embedding_dim=args.embedding_dim,
         hidden_dim=args.hidden_dim,
-        output_dim=args.num_classes,
         n_layers=args.num_layers,
         dropout=args.dropout
     )

     # Data arguments
     parser.add_argument("--data_path", type=str, required=True, help="Path to the dataset file (CSV or TSV)")
     parser.add_argument("--text_column", type=str, default="text", help="Name of the text column")
+    parser.add_argument("--label_column", type=str, nargs="+", help="Name of the label column")
     parser.add_argument("--val_split", type=float, default=0.1, help="Validation set split ratio")
     parser.add_argument("--test_split", type=float, default=0.1, help="Test set split ratio")
     logger.info("Loading and preparing data...")
     # Load data first
+    label_column = args.label_column[0] if isinstance(args.label_column, list) and len(args.label_column) == 1 else args.label_column
+    num_categories = len(args.label_column) if isinstance(args.label_column, list) else 1
     train_data, val_data, test_data = load_data(
         args.data_path,
         text_col=args.text_column,
+        label_col=label_column,
         validation_split=args.val_split,
         test_split=args.test_split,
         seed=args.seed
     bert_model = DocBERT(
         num_classes=args.num_classes,
         bert_model_name=args.bert_model,
+        dropout_prob=0.1,
+        num_categories=num_categories
     )
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     # Load saved BERT weights
         vocab_size=vocab_size,
         embedding_dim=args.embedding_dim,
         hidden_dim=args.hidden_dim,
+        output_dim=args.num_classes * num_categories,
         n_layers=args.num_layers,
         dropout=args.dropout
     )

example_uses.md CHANGED Viewed

@@ -3,19 +3,19 @@
 - Train with BERT model (train.csv is ag_news dataset with 4 classes)
 ```
-python ./train.py --bert_model bert-base-uncased --data_path train.csv --label_column "Class Index" --text_column "Description" --epochs 4 --num_classes 4
 ```
 - Inference with BERT model (test_data.csv is test dataset with 4 classes like ag_news)
 ```
-python ./inference_example.py --bert_model bert-base-uncased --model_path "./bert_base_uncased/best_model.pth" --num_classes 4 --class_names "World" "Sports" "Business" "Science" --text_column "Description" --label_column "Class Index" --data_path "./test_data.csv" --inference_batch_limit 10
 ```
 - Train LSTM model from BERT model using distillation (train dataset should be the same as distillation training dataset)
 ```
-python ./distill_bert_to_lstm.py --bert_model bert-base-uncased --bert_model_path "./bert_base_uncased/best_model.pth" --output_dir "./docbert_lstm" --batch_size 32 --epochs 10 --data_path "./train.csv" --text_column "Description" --label_column "Class Index" --num_classes 4
 ```
 - Inference with distilled LSTM model (test_data.csv is test dataset with 4 classes like ag_news)
 ```
-python ./inference_lstm.py --model_path "./docbert_lstm/distilled_lstm_model.pth" --num_classes 4 --class_names "World" "Sports" "Business" "Science" --text_column "Description" --label_column "Class Index" --data_path "./test_data.csv" --inference_batch_limit 10 --tokenizer_path "./docbert_lstm/tokenizer.json"
 ```

 - Train with BERT model (train.csv is a multi-label dataset: five label columns with 4 classes each)
 ```
+python ./train.py --bert_model "vinai/phobert-base-v2" --data_path "./datasets/train.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --epochs 7 --num_classes 4
 ```
 - Inference with BERT model (test.csv is the test dataset with the same five label columns, 4 classes each)
 ```
+python ./inference_example.py --bert_model "vinai/phobert-base-v2" --model_path "./vinai_phobert-base-v2_finetuned/best_model.pth" --num_classes 4  --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --data_path "./datasets/test.csv" --inference_batch_limit 10
 ```
 - Train LSTM model from BERT model using distillation (the distillation dataset should be the same one used to train the BERT model)
 ```
+python ./distill_bert_to_lstm.py --bert_model "vinai/phobert-base-v2" --bert_model_path "./vinai_phobert-base-v2_finetuned/best_model.pth" --output_dir "./docbert_lstm" --batch_size 32 --epochs 10 --data_path "./datasets/train.csv" --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --num_classes 4
 ```
 - Inference with distilled LSTM model (test.csv is the test dataset with the same five label columns, 4 classes each)
 ```
+python ./inference_lstm.py --model_path "./docbert_lstm/distilled_lstm_model.pth" --num_classes 4  --label_column "individual" "groups" "religion/creed" "race/ethnicity" "politics" --text_column "content" --data_path "./datasets/test.csv" --inference_batch_limit 10
 ```

inference_example.py CHANGED Viewed

@@ -15,8 +15,8 @@ if __name__ == "__main__":
     parser.add_argument("--batch_size", type=int, default=32, help="Batch size for training and evaluation")
     parser.add_argument("--num_classes", type=int, required=True, help="Number of classes for classification")
     parser.add_argument("--text_column", type=str, default="text", help="Column name for text data")
-    parser.add_argument("--label_column", type=str, default="label", help="Column name for labels")
-    parser.add_argument("--class_names", type=str, nargs='+', required=True, help="List of class names for classification")
     parser.add_argument("--inference_batch_limit", type=int, default=-1, help="Limit for inference batch counts")
     parser.add_argument("--print_predictions", type=bool, default=False, help="Print predictions to console")
     args = parser.parse_args()
@@ -25,9 +25,13 @@ if __name__ == "__main__":
     # Set device
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     train_data, val_data, test_data = load_data(args.data_path,
                                                 text_col=args.text_column,
-                                                label_col=args.label_column,
                                                 validation_split=0.0,
                                                 test_split=1.0)
     train_loader, val_loader, test_loader = create_data_loaders(train_data=train_data,
@@ -35,9 +39,10 @@ if __name__ == "__main__":
                                                                 test_data=test_data,
                                                                 tokenizer_name=args.bert_model,
                                                                 batch_size=args.batch_size,
-                                                                max_length=args.max_seq_length)
-    model = DocBERT(bert_model_name=args.bert_model, num_classes=args.num_classes)
     model.load_state_dict(torch.load(args.model_path, map_location=device))
     model = model.to(device)
@@ -62,7 +67,20 @@ if __name__ == "__main__":
         with torch.no_grad():
             outputs = model(input_ids, attention_mask=attention_mask)
             logits = outputs
-            predictions = torch.argmax(logits, dim=-1)
             all_predictions = np.append(all_predictions, predictions.cpu().numpy())
         if args.print_predictions:
@@ -94,7 +112,6 @@ if __name__ == "__main__":
             idx = int(i)
             f.write(f"Text: {test_data[0][idx]}\n")
             f.write(f"True Label: {all_labels[idx]}, Predicted Label: {all_predictions[idx]}\n")
-            f.write(f"Predicted Class: {class_names[all_predictions[idx]] if len(class_names) > all_predictions[idx] else 'Unknown'}, True Class: {class_names[all_labels[idx]] if len(class_names) > all_labels[idx] else 'Unknown'}\n")
             f.write("-" * 50 + "\n")
     with open("metrics.txt", "w") as f:

     parser.add_argument("--batch_size", type=int, default=32, help="Batch size for training and evaluation")
     parser.add_argument("--num_classes", type=int, required=True, help="Number of classes for classification")
     parser.add_argument("--text_column", type=str, default="text", help="Column name for text data")
+    parser.add_argument("--label_column", type=str, nargs="+", help="Column name for labels")
+    parser.add_argument("--class_names", type=str, nargs='+', required=False, help="List of class names for classification")
     parser.add_argument("--inference_batch_limit", type=int, default=-1, help="Limit for inference batch counts")
     parser.add_argument("--print_predictions", type=bool, default=False, help="Print predictions to console")
     args = parser.parse_args()
     # Set device
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    # Load data first
+    label_column = args.label_column[0] if isinstance(args.label_column, list) and len(args.label_column) == 1 else args.label_column
+    num_categories = len(args.label_column) if isinstance(args.label_column, list) else 1
     train_data, val_data, test_data = load_data(args.data_path,
                                                 text_col=args.text_column,
+                                                label_col=label_column,
                                                 validation_split=0.0,
                                                 test_split=1.0)
     train_loader, val_loader, test_loader = create_data_loaders(train_data=train_data,
                                                                 test_data=test_data,
                                                                 tokenizer_name=args.bert_model,
                                                                 batch_size=args.batch_size,
+                                                                max_length=args.max_seq_length,
+                                                                num_classes=args.num_classes)
+    model = DocBERT(bert_model_name=args.bert_model, num_classes=args.num_classes, num_categories=num_categories)
     model.load_state_dict(torch.load(args.model_path, map_location=device))
     model = model.to(device)
         with torch.no_grad():
             outputs = model(input_ids, attention_mask=attention_mask)
             logits = outputs
+            if num_categories > 1:
+                batch_size, total_classes = outputs.shape
+                if total_classes % num_categories != 0:
+                    raise ValueError(f"Error: Number of total classes in the batch must of divisible by {num_categories}")
+                classes_per_group = total_classes // num_categories
+                # Group every classes_per_group values along dim=1
+                reshaped = outputs.view(outputs.size(0), -1, classes_per_group)  # shape: (batch, num_categories, classes_per_group)
+                # Argmax over each group of classes_per_group
+                predictions = reshaped.argmax(dim=-1)
+            else:
+                predictions = torch.argmax(logits, dim=-1)
             all_predictions = np.append(all_predictions, predictions.cpu().numpy())
         if args.print_predictions:
             idx = int(i)
             f.write(f"Text: {test_data[0][idx]}\n")
             f.write(f"True Label: {all_labels[idx]}, Predicted Label: {all_predictions[idx]}\n")
             f.write("-" * 50 + "\n")
     with open("metrics.txt", "w") as f:

inference_lstm.py CHANGED Viewed

@@ -20,7 +20,7 @@ if __name__ == "__main__":
     parser.add_argument("--batch_size", type=int, default=32, help="Batch size for training and evaluation")
     parser.add_argument("--num_classes", type=int, required=True, help="Number of classes for classification")
     parser.add_argument("--text_column", type=str, default="text", help="Column name for text data")
-    parser.add_argument("--label_column", type=str, default="label", help="Column name for labels")
     parser.add_argument("--class_names", type=str, nargs='+', required=True, help="List of class names for classification")
     parser.add_argument("--inference_batch_limit", type=int, default=-1, help="Limit for inference batch counts")
     parser.add_argument("--print_predictions", type=bool, default=False, help="Print predictions to console")
@@ -40,10 +40,12 @@ if __name__ == "__main__":
     model_state = torch.load(args.model_path, map_location=device)
     # Load data first
     train_data, val_data, test_data = load_data(
         args.data_path,
         text_col=args.text_column,
-        label_col=args.label_column,
         validation_split=0.0,
         test_split=1.0,
         seed=42
@@ -69,7 +71,7 @@ if __name__ == "__main__":
                            embedding_dim=args.embedding_dim,
                            hidden_dim=args.hidden_dim,
                            n_layers=args.num_layers,
-                           output_dim=args.num_classes)
     # I don't know why the model is trained with 30000 embedding size (maybe I forgot to update the distillation code before training)
     # so this is a temporary fix
@@ -101,13 +103,23 @@ if __name__ == "__main__":
             outputs = model(input_ids, attention_mask=attention_mask)
             probs = F.softmax(outputs, dim=1)
             predictions = torch.argmax(probs, dim=1)
             all_predictions = np.append(all_predictions, predictions.cpu().numpy())
             if args.print_predictions:
                 for i in range(len(predictions)):
-                    print(f"Text: {test_dataset.get_text_(batch_count * args.batch_size + i)}, Prediction: {class_names[predictions[i]]}, True Label: {class_names[labels[i]]}")
             if args.inference_batch_limit > 0 and batch_count >= args.inference_batch_limit:
                 break
@@ -131,7 +143,6 @@ if __name__ == "__main__":
             idx = int(i)
             f.write(f"Text: {test_dataset.get_text_(idx)}\n")
             f.write(f"True Label: {all_labels[idx]}, Predicted Label: {all_predictions[idx]}\n")
-            f.write(f"Predicted Class: {class_names[all_predictions[idx]] if len(class_names) > all_predictions[idx] else 'Unknown'}, True Class: {class_names[all_labels[idx]] if len(class_names) > all_labels[idx] else 'Unknown'}\n")
             f.write("\n")
     with open("metrics_lstm.txt", "w") as f:

     parser.add_argument("--batch_size", type=int, default=32, help="Batch size for training and evaluation")
     parser.add_argument("--num_classes", type=int, required=True, help="Number of classes for classification")
     parser.add_argument("--text_column", type=str, default="text", help="Column name for text data")
+    parser.add_argument("--label_column", type=str, nargs='+', help="Column name for labels")
     parser.add_argument("--class_names", type=str, nargs='+', required=True, help="List of class names for classification")
     parser.add_argument("--inference_batch_limit", type=int, default=-1, help="Limit for inference batch counts")
     parser.add_argument("--print_predictions", type=bool, default=False, help="Print predictions to console")
     model_state = torch.load(args.model_path, map_location=device)
     # Load data first
+    label_column = args.label_column[0] if isinstance(args.label_column, list) and len(args.label_column) == 1 else args.label_column
+    num_categories = len(args.label_column) if isinstance(args.label_column, list) else 1
     train_data, val_data, test_data = load_data(
         args.data_path,
         text_col=args.text_column,
+        label_col=label_column,
         validation_split=0.0,
         test_split=1.0,
         seed=42
                            embedding_dim=args.embedding_dim,
                            hidden_dim=args.hidden_dim,
                            n_layers=args.num_layers,
+                           output_dim=args.num_classes * num_categories)
     # I don't know why the model is trained with 30000 embedding size (maybe I forgot to update the distillation code before training)
     # so this is a temporary fix
             outputs = model(input_ids, attention_mask=attention_mask)
             probs = F.softmax(outputs, dim=1)
+            batch_size, total_classes = outputs.shape
+            if total_classes % num_categories != 0:
+                raise ValueError(f"Error: Number of total classes in the batch must of divisible by {num_categories}")
+            classes_per_group = total_classes // num_categories
+            # Group every classes_per_group values along dim=1
+            reshaped = outputs.view(outputs.size(0), -1, classes_per_group)  # shape: (batch, num_categories, classes_per_group)
+            # Argmax over each group of classes_per_group
+            preds = reshaped.argmax(dim=-1)
             predictions = torch.argmax(probs, dim=1)
             all_predictions = np.append(all_predictions, predictions.cpu().numpy())
             if args.print_predictions:
                 for i in range(len(predictions)):
+                    print(f"Text: {test_dataset.get_text_(batch_count * args.batch_size + i)}, Prediction: {predictions[i]}, True Label: {labels[i]}")
             if args.inference_batch_limit > 0 and batch_count >= args.inference_batch_limit:
                 break
             idx = int(i)
             f.write(f"Text: {test_dataset.get_text_(idx)}\n")
             f.write(f"True Label: {all_labels[idx]}, Predicted Label: {all_predictions[idx]}\n")
             f.write("\n")
     with open("metrics_lstm.txt", "w") as f:

knowledge_distillation.py CHANGED Viewed

@@ -25,6 +25,7 @@ class DistillationTrainer:
         weight_decay=1e-5,
         max_grad_norm=1.0,
         label_mapping=None,
         device=None
     ):
         self.teacher_model = teacher_model
@@ -35,6 +36,7 @@ class DistillationTrainer:
         self.temperature = temperature
         self.alpha = alpha
         self.max_grad_norm = max_grad_norm
         self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         logger.info(f"Using device: {self.device}")
@@ -65,6 +67,7 @@ class DistillationTrainer:
         self.best_val_f1 = 0.0
         self.best_model_state = None
         self.label_mapping = label_mapping
     def distillation_loss(self, student_logits, teacher_logits, labels, temperature, alpha):
         """
@@ -147,7 +150,19 @@ class DistillationTrainer:
                 train_loss += loss.item()
                 # Calculate accuracy for progress tracking
-                _, preds = torch.max(student_logits, 1)
                 all_preds.extend(preds.cpu().tolist())
                 all_labels.extend(labels.cpu().tolist())
@@ -217,7 +232,19 @@ class DistillationTrainer:
                 eval_loss += loss.item()
                 # Get predictions
-                _, preds = torch.max(student_logits, 1)
                 all_preds.extend(preds.cpu().tolist())
                 all_labels.extend(labels.cpu().tolist())

         weight_decay=1e-5,
         max_grad_norm=1.0,
         label_mapping=None,
+        num_categories=1,
         device=None
     ):
         self.teacher_model = teacher_model
         self.temperature = temperature
         self.alpha = alpha
         self.max_grad_norm = max_grad_norm
+        self.num_categories = num_categories
         self.device = device if device else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         logger.info(f"Using device: {self.device}")
         self.best_val_f1 = 0.0
         self.best_model_state = None
         self.label_mapping = label_mapping
     def distillation_loss(self, student_logits, teacher_logits, labels, temperature, alpha):
         """
                 train_loss += loss.item()
                 # Calculate accuracy for progress tracking
+                if self.num_categories > 1:
+                    batch_size, total_classes = student_logits.shape
+                    if total_classes % self.num_categories != 0:
+                        raise ValueError(f"Error: Number of total classes in the batch must of divisible by {self.num_categories}")
+                    classes_per_group = total_classes // self.num_categories
+                    # Group every classes_per_group values along dim=1
+                    reshaped = student_logits.view(student_logits.size(0), -1, classes_per_group)  # shape: (batch, num_categories, classes_per_group)
+                    # Argmax over each group of classes_per_group
+                    preds = reshaped.argmax(dim=-1)
+                else:
+                    _, preds = torch.max(student_logits, 1)
                 all_preds.extend(preds.cpu().tolist())
                 all_labels.extend(labels.cpu().tolist())
                 eval_loss += loss.item()
                 # Get predictions
+                if self.num_categories > 1:
+                    batch_size, total_classes = student_logits.shape
+                    if total_classes % self.num_categories != 0:
+                        raise ValueError(f"Error: Number of total classes in the batch must of divisible by {self.num_categories}")
+                    classes_per_group = total_classes // self.num_categories
+                    # Group every classes_per_group values along dim=1
+                    reshaped = student_logits.view(student_logits.size(0), -1, classes_per_group)  # shape: (batch, num_categories, classes_per_group)
+                    # Argmax over each group of classes_per_group
+                    preds = reshaped.argmax(dim=-1)
+                else:
+                    _, preds = torch.max(student_logits, 1)
                 all_preds.extend(preds.cpu().tolist())
                 all_labels.extend(labels.cpu().tolist())

model.py CHANGED Viewed

@@ -1,25 +1,27 @@
 import torch
 import torch.nn as nn
-from transformers import BertModel, BertConfig
 class DocBERT(nn.Module):
     """
     Document classification using BERT with improved architecture
     based on Hedwig implementation patterns.
     """
-    def __init__(self, num_classes, bert_model_name='bert-base-uncased', dropout_prob=0.1):
         super(DocBERT, self).__init__()
         # Load pre-trained BERT model or config
-        self.bert = BertModel.from_pretrained(bert_model_name)
-        self.config = self.bert.config
         # Dropout layer for regularization (helps prevent overfitting)
         self.dropout = nn.Dropout(dropout_prob)
         # Multiple classification heads approach (inspired by Hedwig)
         self.hidden_size = self.config.hidden_size
-        self.classifier = nn.Linear(self.hidden_size, num_classes)
         # Layer normalization before classification (helps stabilize training)
         self.layer_norm = nn.LayerNorm(self.hidden_size)

 import torch
 import torch.nn as nn
+from transformers import AutoConfig, AutoModel
 class DocBERT(nn.Module):
     """
     Document classification using BERT with improved architecture
     based on Hedwig implementation patterns.
     """
+    def __init__(self, num_classes, bert_model_name='bert-base-uncased', dropout_prob=0.1, num_categories=1):
         super(DocBERT, self).__init__()
         # Load pre-trained BERT model or config
+        self.bert = AutoModel.from_pretrained(bert_model_name)
+        self.config = AutoConfig.from_pretrained(bert_model_name)
         # Dropout layer for regularization (helps prevent overfitting)
         self.dropout = nn.Dropout(dropout_prob)
         # Multiple classification heads approach (inspired by Hedwig)
         self.hidden_size = self.config.hidden_size
+        self.num_categories = num_categories
+        self.classifier = nn.Linear(self.hidden_size, num_classes*num_categories)
         # Layer normalization before classification (helps stabilize training)
         self.layer_norm = nn.LayerNorm(self.hidden_size)

requirements.txt CHANGED Viewed

@@ -2,6 +2,10 @@ scikit-learn
 numpy
 pandas
 torch
-transformers
 datasets
-torchtext

 numpy
 pandas
 torch
+transformers>=4.28.0
+tokenizers
 datasets
+torchtext
+maturin
+underthesea --only-binary :all:
+accelerate

run.py DELETED Viewed

@@ -1,86 +0,0 @@
-"""
-Simple script to run the DocBERT model with predefined config presets
-"""
-import argparse
-import logging
-import os
-from config import get_config
-from model import DocBERT
-from dataset import load_data, create_data_loaders
-from trainer import Trainer
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-def main():
-    parser = argparse.ArgumentParser(description="Run DocBERT with a predefined config")
-    parser.add_argument("--data_path", type=str, required=True, help="Path to the dataset file (CSV or TSV)")
-    parser.add_argument("--text_column", type=str, default="text", help="Name of the text column")
-    parser.add_argument("--label_column", type=str, default="label", help="Name of the label column")
-    parser.add_argument("--num_classes", type=int, required=True, help="Number of classes to predict")
-    parser.add_argument("--config", type=str, default="default",
-                        choices=["default", "short_text", "long_document", "fine_tuning"],
-                        help="Configuration preset to use")
-    parser.add_argument("--output_dir", type=str, default="./output", help="Directory to save outputs")
-    args = parser.parse_args()
-    # Get config
-    config_class = get_config(args.config)
-    config = config_class()
-    logger.info(f"Using '{args.config}' config preset")
-    # Create output directory
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
-    # Load and prepare data
-    logger.info("Loading data...")
-    train_data, val_data, test_data = load_data(
-        args.data_path,
-        text_col=args.text_column,
-        label_col=args.label_column,
-        validation_split=config.val_split,
-        test_split=config.test_split,
-        seed=config.seed
-    )
-    train_loader, val_loader, test_loader = create_data_loaders(
-        train_data,
-        val_data,
-        test_data,
-        tokenizer_name=config.bert_model,
-        max_length=config.max_seq_length,
-        batch_size=config.batch_size
-    )
-    # Initialize model
-    logger.info(f"Initializing model with {config.bert_model}...")
-    model = DocBERT(
-        num_classes=args.num_classes,
-        bert_model_name=config.bert_model,
-        dropout_prob=config.dropout
-    )
-    # Initialize trainer
-    trainer = Trainer(
-        model=model,
-        train_loader=train_loader,
-        val_loader=val_loader,
-        test_loader=test_loader,
-        lr=config.learning_rate,
-        weight_decay=config.weight_decay,
-        gradient_accumulation_steps=config.grad_accum_steps
-    )
-    # Train model
-    logger.info("Starting training...")
-    save_path = os.path.join(args.output_dir, "best_model.pth")
-    trainer.train(epochs=config.epochs, save_path=save_path)
-    logger.info("Training completed!")
-if __name__ == "__main__":
-    main()

train.py CHANGED Viewed

@@ -32,7 +32,7 @@ def main():
     # Data arguments
     parser.add_argument("--data_path", type=str, required=True, help="Path to the dataset file (CSV or TSV)")
     parser.add_argument("--text_column", type=str, default="text", help="Name of the text column")
-    parser.add_argument("--label_column", type=str, default="label", help="Name of the label column")
     parser.add_argument("--val_split", type=float, default=0.1, help="Validation set split ratio")
     parser.add_argument("--test_split", type=float, default=0.1, help="Test set split ratio")
@@ -67,12 +67,14 @@ def main():
     # Log args for debugging
     logger.info(f"Running with arguments: {args}")
     # Load and prepare data
     logger.info("Loading and preparing data...")
     train_data, val_data, test_data = load_data(
         args.data_path,
         text_col=args.text_column,
-        label_col=args.label_column,
         validation_split=args.val_split,
         test_split=args.test_split,
         seed=args.seed
@@ -98,7 +100,8 @@ def main():
     model = DocBERT(
         num_classes=args.num_classes,
         bert_model_name=args.bert_model,
-        dropout_prob=args.dropout
     )
     # Count and log model parameters
@@ -116,12 +119,13 @@ def main():
         lr=args.learning_rate,
         weight_decay=args.weight_decay,
         warmup_proportion=args.warmup_proportion,
-        gradient_accumulation_steps=args.grad_accum_steps
     )
     # Train the model
     logger.info("Starting training...")
-    save_path = os.path.join(args.output_dir, "bert-base-uncased")
     trainer.train(epochs=args.epochs, save_path=save_path)
     logger.info("Training completed!")

     # Data arguments
     parser.add_argument("--data_path", type=str, required=True, help="Path to the dataset file (CSV or TSV)")
     parser.add_argument("--text_column", type=str, default="text", help="Name of the text column")
+    parser.add_argument("--label_column", type=str, nargs="+", help="Name of the label column")
     parser.add_argument("--val_split", type=float, default=0.1, help="Validation set split ratio")
     parser.add_argument("--test_split", type=float, default=0.1, help="Test set split ratio")
     # Log args for debugging
     logger.info(f"Running with arguments: {args}")
+    num_categories = len(args.label_column) if isinstance(args.label_column, list) else 1
+    label_column = args.label_column[0] if isinstance(args.label_column, list) and len(args.label_column) == 1 else args.label_column
     # Load and prepare data
     logger.info("Loading and preparing data...")
     train_data, val_data, test_data = load_data(
         args.data_path,
         text_col=args.text_column,
+        label_col=label_column,
         validation_split=args.val_split,
         test_split=args.test_split,
         seed=args.seed
     model = DocBERT(
         num_classes=args.num_classes,
         bert_model_name=args.bert_model,
+        dropout_prob=args.dropout,
+        num_categories=num_categories
     )
     # Count and log model parameters
         lr=args.learning_rate,
         weight_decay=args.weight_decay,
         warmup_proportion=args.warmup_proportion,
+        gradient_accumulation_steps=args.grad_accum_steps,
+        num_categories=num_categories,
     )
     # Train the model
     logger.info("Starting training...")
+    save_path = os.path.join(args.output_dir, args.bert_model.replace("/", "_") + "_finetuned")
     trainer.train(epochs=args.epochs, save_path=save_path)
     logger.info("Training completed!")

trainer.py CHANGED Viewed

@@ -28,6 +28,7 @@ class Trainer:
         warmup_proportion=0.1,
         gradient_accumulation_steps=1,
         max_grad_norm=1.0,
         device=None
     ):
         self.model = model
@@ -68,6 +69,9 @@ class Trainer:
         # For tracking metrics
         self.best_val_f1 = 0.0
         self.best_model_state = None
     def train(self, epochs, save_path='best_model.pth'):
         """
@@ -75,84 +79,106 @@ class Trainer:
         """
         logger.info(f"Starting training for {epochs} epochs")
-        for epoch in range(epochs):
-            start_time = time.time()
-            # Training phase
-            self.model.train()
-            train_loss = 0
-            all_predictions = []
-            all_labels = []
-            # Progress bar for training
-            train_iterator = tqdm(self.train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]")
-            for i, batch in enumerate(train_iterator):
-                # Move batch to device
-                input_ids = batch['input_ids'].to(self.device)
-                attention_mask = batch['attention_mask'].to(self.device)
-                token_type_ids = batch['token_type_ids'].to(self.device)
-                labels = batch['label'].to(self.device)
-                # Forward pass
-                outputs = self.model(
-                    input_ids=input_ids,
-                    attention_mask=attention_mask,
-                    token_type_ids=token_type_ids
-                )
-                # Calculate loss
-                loss = self.criterion(outputs, labels)
-                # Scale loss if using gradient accumulation
-                if self.gradient_accumulation_steps > 1:
-                    loss = loss / self.gradient_accumulation_steps
-                # Backward pass
-                loss.backward()
-                # Update weights if we've accumulated enough gradients
-                if (i + 1) % self.gradient_accumulation_steps == 0:
-                    # Gradient clipping to prevent exploding gradients
-                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
-                    self.optimizer.step()
-                    self.optimizer.zero_grad()
-                train_loss += loss.item() * self.gradient_accumulation_steps
-                # Get predictions for metrics
-                _, preds = torch.max(outputs, dim=1)
-                all_predictions.extend(preds.cpu().tolist())
-                all_labels.extend(labels.cpu().tolist())
-                # Update progress bar with current loss
-                train_iterator.set_postfix({'loss': f"{loss.item():.4f}"})
-            # Calculate training metrics
-            train_loss /= len(self.train_loader)
-            train_acc = accuracy_score(all_labels, all_predictions)
-            train_f1 = f1_score(all_labels, all_predictions, average='macro')
-            # Validation phase
-            val_loss, val_acc, val_f1, val_precision, val_recall = self.evaluate(self.val_loader, "Validation")
-            # Adjust learning rate based on validation performance
-            self.scheduler.step(val_f1)
-            # Save best model
-            if val_f1 > self.best_val_f1:
-                self.best_val_f1 = val_f1
-                self.best_model_state = self.model.state_dict().copy()
-                torch.save(self.model.state_dict(), save_path)
-                logger.info(f"New best model saved with validation F1: {val_f1:.4f}")
-            # Print epoch summary
-            epoch_time = time.time() - start_time
-            logger.info(f"Epoch {epoch+1}/{epochs} - "
-                       f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Train F1: {train_f1:.4f}, "
-                       f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}, "
-                       f"Time: {epoch_time:.2f}s")
         # Load best model for final evaluation
         if self.best_model_state is not None:
             self.model.load_state_dict(self.best_model_state)
@@ -197,7 +223,21 @@ class Trainer:
                 eval_loss += loss.item()
                 # Get predictions
-                _, preds = torch.max(outputs, dim=1)
                 all_predictions.extend(preds.cpu().tolist())
                 all_labels.extend(labels.cpu().tolist())

         warmup_proportion=0.1,
         gradient_accumulation_steps=1,
         max_grad_norm=1.0,
+        num_categories=1,
         device=None
     ):
         self.model = model
         # For tracking metrics
         self.best_val_f1 = 0.0
         self.best_model_state = None
+        # Number of label categories predicted per document (e.g., several sentiment categories, since one document can carry more than one sentiment)
+        self.num_categories = num_categories
     def train(self, epochs, save_path='best_model.pth'):
         """
         """
         logger.info(f"Starting training for {epochs} epochs")
+        try:
+            for epoch in range(epochs):
+                start_time = time.time()
+                # Training phase
+                self.model.train()
+                train_loss = 0
+                all_predictions = []
+                all_labels = []
+                # Progress bar for training
+                train_iterator = tqdm(self.train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]")
+                for i, batch in enumerate(train_iterator):
+                    # Move batch to device
+                    input_ids = batch['input_ids'].to(self.device)
+                    attention_mask = batch['attention_mask'].to(self.device)
+                    token_type_ids = batch['token_type_ids'].to(self.device)
+                    labels = batch['label'].to(self.device)
+                    # Forward pass
+                    outputs = self.model(
+                        input_ids=input_ids,
+                        attention_mask=attention_mask,
+                        token_type_ids=token_type_ids
+                    )
+                    # Calculate loss
+                    loss = self.criterion(outputs, labels)
+                    # Scale loss if using gradient accumulation
+                    if self.gradient_accumulation_steps > 1:
+                        loss = loss / self.gradient_accumulation_steps
+                    # Backward pass
+                    loss.backward()
+                    # Update weights if we've accumulated enough gradients
+                    if (i + 1) % self.gradient_accumulation_steps == 0:
+                        # Gradient clipping to prevent exploding gradients
+                        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
+                        self.optimizer.step()
+                        self.optimizer.zero_grad()
+                    train_loss += loss.item() * self.gradient_accumulation_steps
+                    # Get predictions for metrics
+                    if self.num_categories > 1:
+                        batch_size, total_classes = outputs.shape
+                        if total_classes % self.num_categories != 0:
+                            raise ValueError(f"Error: Number of total classes in the batch must of divisible by {self.num_categories}")
+                        classes_per_group = total_classes // self.num_categories
+                        # Group every classes_per_group values along dim=1
+                        reshaped = outputs.view(outputs.size(0), -1, classes_per_group)  # shape: (batch, num_categories, classes_per_group)
+                        # Argmax over each group of classes_per_group
+                        preds = reshaped.argmax(dim=-1)
+                    else:
+                        _, preds = torch.max(outputs, dim=1)
+                    all_predictions.extend(preds.cpu().tolist())
+                    all_labels.extend(labels.cpu().tolist())
+                    # Update progress bar with current loss
+                    train_iterator.set_postfix({'loss': f"{loss.item():.4f}"})
+                # Calculate training metrics
+                train_loss /= len(self.train_loader)
+                train_acc = accuracy_score(all_labels, all_predictions)
+                train_f1 = f1_score(all_labels, all_predictions, average='macro')
+                # Validation phase
+                val_loss, val_acc, val_f1, val_precision, val_recall = self.evaluate(self.val_loader, "Validation")
+                # Log validation metrics
+                logger.info(f"Validation - Loss: {val_loss:.4f}, Acc: {val_acc:.4f}, F1: {val_f1:.4f}, "
+                            f"Precision: {val_precision:.4f}, Recall: {val_recall:.4f}")
+                # Adjust learning rate based on validation performance
+                self.scheduler.step(val_f1)
+                # Save best model
+                if val_f1 > self.best_val_f1:
+                    self.best_val_f1 = val_f1
+                    self.best_model_state = self.model.state_dict().copy()
+                    torch.save(self.model.state_dict(), save_path)
+                    logger.info(f"New best model saved with validation F1: {val_f1:.4f}")
+                # Print epoch summary
+                epoch_time = time.time() - start_time
+                logger.info(f"Epoch {epoch+1}/{epochs} - "
+                        f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Train F1: {train_f1:.4f}, "
+                        f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}, "
+                        f"Time: {epoch_time:.2f}s")
+        except Exception as e:
+            logger.error(f"Error during training: {e}")
+            import traceback
+            logger.error(traceback.format_exc())
         # Load best model for final evaluation
         if self.best_model_state is not None:
             self.model.load_state_dict(self.best_model_state)
                 eval_loss += loss.item()
                 # Get predictions
+                # Get predictions for metrics
+                if self.num_categories > 1:
+                    batch_size, total_classes = outputs.shape
+                    if total_classes % self.num_categories != 0:
+                        raise ValueError(f"Error: Number of total classes in the batch must of divisible by {self.num_categories}")
+                    classes_per_group = total_classes // self.num_categories
+                    # Group every classes_per_group values along dim=1
+                    reshaped = outputs.view(outputs.size(0), -1, classes_per_group)  # shape: (batch, num_categories, classes_per_group)
+                    # Argmax over each group of classes_per_group
+                    preds = reshaped.argmax(dim=-1)
+                else:
+                    _, preds = torch.max(outputs, dim=1)
                 all_predictions.extend(preds.cpu().tolist())
                 all_labels.extend(labels.cpu().tolist())

utils/word_segmentation_vi.py ADDED Viewed

	@@ -0,0 +1,23 @@

+from underthesea import word_tokenize
+import os, pandas
+def word_segmentation_vi(text):
+    segmented_text = word_tokenize(text, format="text")
+    return segmented_text
+if __name__ == "__main__":
+    # This script word-segments the CSV and TSV files in the datasets directory for Vietnamese (PhoBERT requires input that is already segmented into words)
+    dataset_dir = "../datasets"
+    csv_files = [f for f in os.listdir(dataset_dir) if f.endswith('.csv')]
+    tsv_files = [f for f in os.listdir(dataset_dir) if f.endswith('.tsv')]
+    for file in csv_files:
+        file_path = os.path.join(dataset_dir, file)
+        df = pandas.read_csv(file_path)
+        if 'content' in df.columns:
+            df['content'] = df['content'].apply(lambda text: word_segmentation_vi(str(text)))
+            df.to_csv(file_path, index=False)
+            print(f"Processed {file}")
+        else:
+            print(f"'content' column not found in {file}")