Spaces:
Runtime error
Runtime error
Upload folder using huggingface_hub
Browse files- .gitattributes +2 -0
- LICENSE +339 -0
- README.md +2 -12
- datos_iniciales/female_names.csv +0 -0
- datos_iniciales/male_names.csv +0 -0
- datos_iniciales/nombres_apellido_nombre.json +0 -0
- datos_iniciales/nombres_apellido_nombres.json +3 -0
- datos_iniciales/nombres_separados_coma.json +0 -0
- modelo/modelo_prediccion_genero_ml_dt.py +197 -0
- modelo/modelo_prediccion_genero_ml_nb.py +233 -0
- prediccion/Prediccion_final_nombres_nombres_para_clasificar.csv +0 -0
- prediccion/Prediccion_nombres_apellido_nombre.xlsx +0 -0
- prediccion/Prediccion_nombres_apellido_nombres.xlsx +3 -0
- prediccion/Prediccion_nombres_separados_coma.xlsx +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
datos_iniciales/nombres_apellido_nombres.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
prediccion/Prediccion_nombres_apellido_nombres.xlsx filter=lfs diff=lfs merge=lfs -text
|
LICENSE
ADDED
|
@@ -0,0 +1,339 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
GNU GENERAL PUBLIC LICENSE
|
| 2 |
+
Version 2, June 1991
|
| 3 |
+
|
| 4 |
+
Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
|
| 5 |
+
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
| 6 |
+
Everyone is permitted to copy and distribute verbatim copies
|
| 7 |
+
of this license document, but changing it is not allowed.
|
| 8 |
+
|
| 9 |
+
Preamble
|
| 10 |
+
|
| 11 |
+
The licenses for most software are designed to take away your
|
| 12 |
+
freedom to share and change it. By contrast, the GNU General Public
|
| 13 |
+
License is intended to guarantee your freedom to share and change free
|
| 14 |
+
software--to make sure the software is free for all its users. This
|
| 15 |
+
General Public License applies to most of the Free Software
|
| 16 |
+
Foundation's software and to any other program whose authors commit to
|
| 17 |
+
using it. (Some other Free Software Foundation software is covered by
|
| 18 |
+
the GNU Lesser General Public License instead.) You can apply it to
|
| 19 |
+
your programs, too.
|
| 20 |
+
|
| 21 |
+
When we speak of free software, we are referring to freedom, not
|
| 22 |
+
price. Our General Public Licenses are designed to make sure that you
|
| 23 |
+
have the freedom to distribute copies of free software (and charge for
|
| 24 |
+
this service if you wish), that you receive source code or can get it
|
| 25 |
+
if you want it, that you can change the software or use pieces of it
|
| 26 |
+
in new free programs; and that you know you can do these things.
|
| 27 |
+
|
| 28 |
+
To protect your rights, we need to make restrictions that forbid
|
| 29 |
+
anyone to deny you these rights or to ask you to surrender the rights.
|
| 30 |
+
These restrictions translate to certain responsibilities for you if you
|
| 31 |
+
distribute copies of the software, or if you modify it.
|
| 32 |
+
|
| 33 |
+
For example, if you distribute copies of such a program, whether
|
| 34 |
+
gratis or for a fee, you must give the recipients all the rights that
|
| 35 |
+
you have. You must make sure that they, too, receive or can get the
|
| 36 |
+
source code. And you must show them these terms so they know their
|
| 37 |
+
rights.
|
| 38 |
+
|
| 39 |
+
We protect your rights with two steps: (1) copyright the software, and
|
| 40 |
+
(2) offer you this license which gives you legal permission to copy,
|
| 41 |
+
distribute and/or modify the software.
|
| 42 |
+
|
| 43 |
+
Also, for each author's protection and ours, we want to make certain
|
| 44 |
+
that everyone understands that there is no warranty for this free
|
| 45 |
+
software. If the software is modified by someone else and passed on, we
|
| 46 |
+
want its recipients to know that what they have is not the original, so
|
| 47 |
+
that any problems introduced by others will not reflect on the original
|
| 48 |
+
authors' reputations.
|
| 49 |
+
|
| 50 |
+
Finally, any free program is threatened constantly by software
|
| 51 |
+
patents. We wish to avoid the danger that redistributors of a free
|
| 52 |
+
program will individually obtain patent licenses, in effect making the
|
| 53 |
+
program proprietary. To prevent this, we have made it clear that any
|
| 54 |
+
patent must be licensed for everyone's free use or not licensed at all.
|
| 55 |
+
|
| 56 |
+
The precise terms and conditions for copying, distribution and
|
| 57 |
+
modification follow.
|
| 58 |
+
|
| 59 |
+
GNU GENERAL PUBLIC LICENSE
|
| 60 |
+
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
|
| 61 |
+
|
| 62 |
+
0. This License applies to any program or other work which contains
|
| 63 |
+
a notice placed by the copyright holder saying it may be distributed
|
| 64 |
+
under the terms of this General Public License. The "Program", below,
|
| 65 |
+
refers to any such program or work, and a "work based on the Program"
|
| 66 |
+
means either the Program or any derivative work under copyright law:
|
| 67 |
+
that is to say, a work containing the Program or a portion of it,
|
| 68 |
+
either verbatim or with modifications and/or translated into another
|
| 69 |
+
language. (Hereinafter, translation is included without limitation in
|
| 70 |
+
the term "modification".) Each licensee is addressed as "you".
|
| 71 |
+
|
| 72 |
+
Activities other than copying, distribution and modification are not
|
| 73 |
+
covered by this License; they are outside its scope. The act of
|
| 74 |
+
running the Program is not restricted, and the output from the Program
|
| 75 |
+
is covered only if its contents constitute a work based on the
|
| 76 |
+
Program (independent of having been made by running the Program).
|
| 77 |
+
Whether that is true depends on what the Program does.
|
| 78 |
+
|
| 79 |
+
1. You may copy and distribute verbatim copies of the Program's
|
| 80 |
+
source code as you receive it, in any medium, provided that you
|
| 81 |
+
conspicuously and appropriately publish on each copy an appropriate
|
| 82 |
+
copyright notice and disclaimer of warranty; keep intact all the
|
| 83 |
+
notices that refer to this License and to the absence of any warranty;
|
| 84 |
+
and give any other recipients of the Program a copy of this License
|
| 85 |
+
along with the Program.
|
| 86 |
+
|
| 87 |
+
You may charge a fee for the physical act of transferring a copy, and
|
| 88 |
+
you may at your option offer warranty protection in exchange for a fee.
|
| 89 |
+
|
| 90 |
+
2. You may modify your copy or copies of the Program or any portion
|
| 91 |
+
of it, thus forming a work based on the Program, and copy and
|
| 92 |
+
distribute such modifications or work under the terms of Section 1
|
| 93 |
+
above, provided that you also meet all of these conditions:
|
| 94 |
+
|
| 95 |
+
a) You must cause the modified files to carry prominent notices
|
| 96 |
+
stating that you changed the files and the date of any change.
|
| 97 |
+
|
| 98 |
+
b) You must cause any work that you distribute or publish, that in
|
| 99 |
+
whole or in part contains or is derived from the Program or any
|
| 100 |
+
part thereof, to be licensed as a whole at no charge to all third
|
| 101 |
+
parties under the terms of this License.
|
| 102 |
+
|
| 103 |
+
c) If the modified program normally reads commands interactively
|
| 104 |
+
when run, you must cause it, when started running for such
|
| 105 |
+
interactive use in the most ordinary way, to print or display an
|
| 106 |
+
announcement including an appropriate copyright notice and a
|
| 107 |
+
notice that there is no warranty (or else, saying that you provide
|
| 108 |
+
a warranty) and that users may redistribute the program under
|
| 109 |
+
these conditions, and telling the user how to view a copy of this
|
| 110 |
+
License. (Exception: if the Program itself is interactive but
|
| 111 |
+
does not normally print such an announcement, your work based on
|
| 112 |
+
the Program is not required to print an announcement.)
|
| 113 |
+
|
| 114 |
+
These requirements apply to the modified work as a whole. If
|
| 115 |
+
identifiable sections of that work are not derived from the Program,
|
| 116 |
+
and can be reasonably considered independent and separate works in
|
| 117 |
+
themselves, then this License, and its terms, do not apply to those
|
| 118 |
+
sections when you distribute them as separate works. But when you
|
| 119 |
+
distribute the same sections as part of a whole which is a work based
|
| 120 |
+
on the Program, the distribution of the whole must be on the terms of
|
| 121 |
+
this License, whose permissions for other licensees extend to the
|
| 122 |
+
entire whole, and thus to each and every part regardless of who wrote it.
|
| 123 |
+
|
| 124 |
+
Thus, it is not the intent of this section to claim rights or contest
|
| 125 |
+
your rights to work written entirely by you; rather, the intent is to
|
| 126 |
+
exercise the right to control the distribution of derivative or
|
| 127 |
+
collective works based on the Program.
|
| 128 |
+
|
| 129 |
+
In addition, mere aggregation of another work not based on the Program
|
| 130 |
+
with the Program (or with a work based on the Program) on a volume of
|
| 131 |
+
a storage or distribution medium does not bring the other work under
|
| 132 |
+
the scope of this License.
|
| 133 |
+
|
| 134 |
+
3. You may copy and distribute the Program (or a work based on it,
|
| 135 |
+
under Section 2) in object code or executable form under the terms of
|
| 136 |
+
Sections 1 and 2 above provided that you also do one of the following:
|
| 137 |
+
|
| 138 |
+
a) Accompany it with the complete corresponding machine-readable
|
| 139 |
+
source code, which must be distributed under the terms of Sections
|
| 140 |
+
1 and 2 above on a medium customarily used for software interchange; or,
|
| 141 |
+
|
| 142 |
+
b) Accompany it with a written offer, valid for at least three
|
| 143 |
+
years, to give any third party, for a charge no more than your
|
| 144 |
+
cost of physically performing source distribution, a complete
|
| 145 |
+
machine-readable copy of the corresponding source code, to be
|
| 146 |
+
distributed under the terms of Sections 1 and 2 above on a medium
|
| 147 |
+
customarily used for software interchange; or,
|
| 148 |
+
|
| 149 |
+
c) Accompany it with the information you received as to the offer
|
| 150 |
+
to distribute corresponding source code. (This alternative is
|
| 151 |
+
allowed only for noncommercial distribution and only if you
|
| 152 |
+
received the program in object code or executable form with such
|
| 153 |
+
an offer, in accord with Subsection b above.)
|
| 154 |
+
|
| 155 |
+
The source code for a work means the preferred form of the work for
|
| 156 |
+
making modifications to it. For an executable work, complete source
|
| 157 |
+
code means all the source code for all modules it contains, plus any
|
| 158 |
+
associated interface definition files, plus the scripts used to
|
| 159 |
+
control compilation and installation of the executable. However, as a
|
| 160 |
+
special exception, the source code distributed need not include
|
| 161 |
+
anything that is normally distributed (in either source or binary
|
| 162 |
+
form) with the major components (compiler, kernel, and so on) of the
|
| 163 |
+
operating system on which the executable runs, unless that component
|
| 164 |
+
itself accompanies the executable.
|
| 165 |
+
|
| 166 |
+
If distribution of executable or object code is made by offering
|
| 167 |
+
access to copy from a designated place, then offering equivalent
|
| 168 |
+
access to copy the source code from the same place counts as
|
| 169 |
+
distribution of the source code, even though third parties are not
|
| 170 |
+
compelled to copy the source along with the object code.
|
| 171 |
+
|
| 172 |
+
4. You may not copy, modify, sublicense, or distribute the Program
|
| 173 |
+
except as expressly provided under this License. Any attempt
|
| 174 |
+
otherwise to copy, modify, sublicense or distribute the Program is
|
| 175 |
+
void, and will automatically terminate your rights under this License.
|
| 176 |
+
However, parties who have received copies, or rights, from you under
|
| 177 |
+
this License will not have their licenses terminated so long as such
|
| 178 |
+
parties remain in full compliance.
|
| 179 |
+
|
| 180 |
+
5. You are not required to accept this License, since you have not
|
| 181 |
+
signed it. However, nothing else grants you permission to modify or
|
| 182 |
+
distribute the Program or its derivative works. These actions are
|
| 183 |
+
prohibited by law if you do not accept this License. Therefore, by
|
| 184 |
+
modifying or distributing the Program (or any work based on the
|
| 185 |
+
Program), you indicate your acceptance of this License to do so, and
|
| 186 |
+
all its terms and conditions for copying, distributing or modifying
|
| 187 |
+
the Program or works based on it.
|
| 188 |
+
|
| 189 |
+
6. Each time you redistribute the Program (or any work based on the
|
| 190 |
+
Program), the recipient automatically receives a license from the
|
| 191 |
+
original licensor to copy, distribute or modify the Program subject to
|
| 192 |
+
these terms and conditions. You may not impose any further
|
| 193 |
+
restrictions on the recipients' exercise of the rights granted herein.
|
| 194 |
+
You are not responsible for enforcing compliance by third parties to
|
| 195 |
+
this License.
|
| 196 |
+
|
| 197 |
+
7. If, as a consequence of a court judgment or allegation of patent
|
| 198 |
+
infringement or for any other reason (not limited to patent issues),
|
| 199 |
+
conditions are imposed on you (whether by court order, agreement or
|
| 200 |
+
otherwise) that contradict the conditions of this License, they do not
|
| 201 |
+
excuse you from the conditions of this License. If you cannot
|
| 202 |
+
distribute so as to satisfy simultaneously your obligations under this
|
| 203 |
+
License and any other pertinent obligations, then as a consequence you
|
| 204 |
+
may not distribute the Program at all. For example, if a patent
|
| 205 |
+
license would not permit royalty-free redistribution of the Program by
|
| 206 |
+
all those who receive copies directly or indirectly through you, then
|
| 207 |
+
the only way you could satisfy both it and this License would be to
|
| 208 |
+
refrain entirely from distribution of the Program.
|
| 209 |
+
|
| 210 |
+
If any portion of this section is held invalid or unenforceable under
|
| 211 |
+
any particular circumstance, the balance of the section is intended to
|
| 212 |
+
apply and the section as a whole is intended to apply in other
|
| 213 |
+
circumstances.
|
| 214 |
+
|
| 215 |
+
It is not the purpose of this section to induce you to infringe any
|
| 216 |
+
patents or other property right claims or to contest validity of any
|
| 217 |
+
such claims; this section has the sole purpose of protecting the
|
| 218 |
+
integrity of the free software distribution system, which is
|
| 219 |
+
implemented by public license practices. Many people have made
|
| 220 |
+
generous contributions to the wide range of software distributed
|
| 221 |
+
through that system in reliance on consistent application of that
|
| 222 |
+
system; it is up to the author/donor to decide if he or she is willing
|
| 223 |
+
to distribute software through any other system and a licensee cannot
|
| 224 |
+
impose that choice.
|
| 225 |
+
|
| 226 |
+
This section is intended to make thoroughly clear what is believed to
|
| 227 |
+
be a consequence of the rest of this License.
|
| 228 |
+
|
| 229 |
+
8. If the distribution and/or use of the Program is restricted in
|
| 230 |
+
certain countries either by patents or by copyrighted interfaces, the
|
| 231 |
+
original copyright holder who places the Program under this License
|
| 232 |
+
may add an explicit geographical distribution limitation excluding
|
| 233 |
+
those countries, so that distribution is permitted only in or among
|
| 234 |
+
countries not thus excluded. In such case, this License incorporates
|
| 235 |
+
the limitation as if written in the body of this License.
|
| 236 |
+
|
| 237 |
+
9. The Free Software Foundation may publish revised and/or new versions
|
| 238 |
+
of the General Public License from time to time. Such new versions will
|
| 239 |
+
be similar in spirit to the present version, but may differ in detail to
|
| 240 |
+
address new problems or concerns.
|
| 241 |
+
|
| 242 |
+
Each version is given a distinguishing version number. If the Program
|
| 243 |
+
specifies a version number of this License which applies to it and "any
|
| 244 |
+
later version", you have the option of following the terms and conditions
|
| 245 |
+
either of that version or of any later version published by the Free
|
| 246 |
+
Software Foundation. If the Program does not specify a version number of
|
| 247 |
+
this License, you may choose any version ever published by the Free Software
|
| 248 |
+
Foundation.
|
| 249 |
+
|
| 250 |
+
10. If you wish to incorporate parts of the Program into other free
|
| 251 |
+
programs whose distribution conditions are different, write to the author
|
| 252 |
+
to ask for permission. For software which is copyrighted by the Free
|
| 253 |
+
Software Foundation, write to the Free Software Foundation; we sometimes
|
| 254 |
+
make exceptions for this. Our decision will be guided by the two goals
|
| 255 |
+
of preserving the free status of all derivatives of our free software and
|
| 256 |
+
of promoting the sharing and reuse of software generally.
|
| 257 |
+
|
| 258 |
+
NO WARRANTY
|
| 259 |
+
|
| 260 |
+
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
|
| 261 |
+
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
|
| 262 |
+
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
|
| 263 |
+
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
|
| 264 |
+
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
| 265 |
+
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
|
| 266 |
+
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
|
| 267 |
+
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
|
| 268 |
+
REPAIR OR CORRECTION.
|
| 269 |
+
|
| 270 |
+
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
| 271 |
+
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
|
| 272 |
+
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
|
| 273 |
+
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
|
| 274 |
+
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
|
| 275 |
+
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
|
| 276 |
+
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
|
| 277 |
+
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
|
| 278 |
+
POSSIBILITY OF SUCH DAMAGES.
|
| 279 |
+
|
| 280 |
+
END OF TERMS AND CONDITIONS
|
| 281 |
+
|
| 282 |
+
How to Apply These Terms to Your New Programs
|
| 283 |
+
|
| 284 |
+
If you develop a new program, and you want it to be of the greatest
|
| 285 |
+
possible use to the public, the best way to achieve this is to make it
|
| 286 |
+
free software which everyone can redistribute and change under these terms.
|
| 287 |
+
|
| 288 |
+
To do so, attach the following notices to the program. It is safest
|
| 289 |
+
to attach them to the start of each source file to most effectively
|
| 290 |
+
convey the exclusion of warranty; and each file should have at least
|
| 291 |
+
the "copyright" line and a pointer to where the full notice is found.
|
| 292 |
+
|
| 293 |
+
<one line to give the program's name and a brief idea of what it does.>
|
| 294 |
+
Copyright (C) <year> <name of author>
|
| 295 |
+
|
| 296 |
+
This program is free software; you can redistribute it and/or modify
|
| 297 |
+
it under the terms of the GNU General Public License as published by
|
| 298 |
+
the Free Software Foundation; either version 2 of the License, or
|
| 299 |
+
(at your option) any later version.
|
| 300 |
+
|
| 301 |
+
This program is distributed in the hope that it will be useful,
|
| 302 |
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
| 303 |
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
| 304 |
+
GNU General Public License for more details.
|
| 305 |
+
|
| 306 |
+
You should have received a copy of the GNU General Public License along
|
| 307 |
+
with this program; if not, write to the Free Software Foundation, Inc.,
|
| 308 |
+
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
| 309 |
+
|
| 310 |
+
Also add information on how to contact you by electronic and paper mail.
|
| 311 |
+
|
| 312 |
+
If the program is interactive, make it output a short notice like this
|
| 313 |
+
when it starts in an interactive mode:
|
| 314 |
+
|
| 315 |
+
Gnomovision version 69, Copyright (C) year name of author
|
| 316 |
+
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
| 317 |
+
This is free software, and you are welcome to redistribute it
|
| 318 |
+
under certain conditions; type `show c' for details.
|
| 319 |
+
|
| 320 |
+
The hypothetical commands `show w' and `show c' should show the appropriate
|
| 321 |
+
parts of the General Public License. Of course, the commands you use may
|
| 322 |
+
be called something other than `show w' and `show c'; they could even be
|
| 323 |
+
mouse-clicks or menu items--whatever suits your program.
|
| 324 |
+
|
| 325 |
+
You should also get your employer (if you work as a programmer) or your
|
| 326 |
+
school, if any, to sign a "copyright disclaimer" for the program, if
|
| 327 |
+
necessary. Here is a sample; alter the names:
|
| 328 |
+
|
| 329 |
+
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
|
| 330 |
+
`Gnomovision' (which makes passes at compilers) written by James Hacker.
|
| 331 |
+
|
| 332 |
+
<signature of Ty Coon>, 1 April 1989
|
| 333 |
+
Ty Coon, President of Vice
|
| 334 |
+
|
| 335 |
+
This General Public License does not permit incorporating your program into
|
| 336 |
+
proprietary programs. If your program is a subroutine library, you may
|
| 337 |
+
consider it more useful to permit linking proprietary applications with the
|
| 338 |
+
library. If this is what you want to do, use the GNU Lesser General
|
| 339 |
+
Public License instead of this License.
|
README.md
CHANGED
|
@@ -1,12 +1,2 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
emoji: 🏆
|
| 4 |
-
colorFrom: yellow
|
| 5 |
-
colorTo: gray
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 4.38.1
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
+
# modelo_ML_Prediccion_Genero
|
| 2 |
+
Modelo inicial contemplado para realizar la predicción de género basado en el nombre de la persona
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
datos_iniciales/female_names.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datos_iniciales/male_names.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datos_iniciales/nombres_apellido_nombre.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
datos_iniciales/nombres_apellido_nombres.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:422202a6802fdc494be3538b9e2e01c9cd4a18a7fc85383ef2a456fc9de0a4e2
|
| 3 |
+
size 19701892
|
datos_iniciales/nombres_separados_coma.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
modelo/modelo_prediccion_genero_ml_dt.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Archivo Python adaptado desde el notebook de databricks dispuesto desde el servicio de Azure-Data Sandbox
|
| 4 |
+
Created on Mon Feb 15 22:14:16 2021
|
| 5 |
+
@author: Carlos Delgado
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
## Configurando el storage account key
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
storage_account_name = "Storage Account"
|
| 12 |
+
storage_account_key = "Storage Account Key"
|
| 13 |
+
container = "Storage Account Source Container"
|
| 14 |
+
container_raw = "Storage Account Source Container Raw"
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
dbutils.fs.ls("abfss://raw@stupramonitoreomercado.dfs.core.windows.net/OTROS/SNR/JSON/")
|
| 18 |
+
dbutils.fs.ls("abfss://sandbox@stupramonitoreomercado.dfs.core.windows.net/OTROS/SNR/nombres_entrenamiento_espanol_filtrados/")
|
| 19 |
+
|
| 20 |
+
# Algoritmo adaptado basado en https://nlpforhackers.io/introduction-machine-learning/ y adaptado del repositorio https://github.com/Jcharis/Python-Machine-Learning/tree/master/Gender%20Classification%20With%20%20Machine%20Learning
|
| 21 |
+
|
| 22 |
+
#importando librerias iniciales de administracion de datos
|
| 23 |
+
import pandas as pd
|
| 24 |
+
import numpy as np
|
| 25 |
+
|
| 26 |
+
#importando libreria y paquetes de ML desde Scikit-learn
|
| 27 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 28 |
+
from sklearn.feature_extraction import DictVectorizer
|
| 29 |
+
from sklearn.model_selection import train_test_split
|
| 30 |
+
from sklearn.tree import DecisionTreeClassifier
|
| 31 |
+
|
| 32 |
+
from pyspark.sql.functions import *
|
| 33 |
+
from pyspark.sql.types import *
|
| 34 |
+
|
| 35 |
+
df_original = spark.read.csv("abfss://sandbox@stupramonitoreomercado.dfs.core.windows.net/OTROS/SNR/nombres_entrenamiento_espanol_filtrados/nombres_entrenamiento_filtrados_v2.csv",header=True)
|
| 36 |
+
display(df_original)
|
| 37 |
+
|
| 38 |
+
#almacenando los nombres por genero a pandas dataframes
|
| 39 |
+
df_pd_nombres_mf_general = df_original.select("*").toPandas()
|
| 40 |
+
df_pd_nombres_mf_general.head()
|
| 41 |
+
|
| 42 |
+
# Limpieza de datos
|
| 43 |
+
# Verificando consistencia de columnas
|
| 44 |
+
print(df_pd_nombres_mf_general.columns)
|
| 45 |
+
|
| 46 |
+
# Verificando tipos de datos
|
| 47 |
+
print(df_pd_nombres_mf_general.dtypes)
|
| 48 |
+
|
| 49 |
+
# Verificando valores nulos
|
| 50 |
+
# Count missing values per column. A single isnull() is required: calling it
# twice tests the boolean mask itself for nulls, which always sums to zero.
print(df_pd_nombres_mf_general.isnull().sum())
|
| 51 |
+
|
| 52 |
+
# Numero de nombres femeninos
|
| 53 |
+
print("Numero de nombres femeninos: %s" %(len(df_pd_nombres_mf_general[df_pd_nombres_mf_general.SEXO == 'FEMENINO'])))
|
| 54 |
+
# Numero de nombres masculinos
|
| 55 |
+
print("Numero de nombres masculinos: %s" %(len(df_pd_nombres_mf_general[df_pd_nombres_mf_general.SEXO == 'MASCULINO'])))
|
| 56 |
+
|
| 57 |
+
df_names = df_pd_nombres_mf_general
|
| 58 |
+
|
| 59 |
+
# Remplazando con ceros y unos.
|
| 60 |
+
# Encode the target column: FEMENINO -> 0, MASCULINO -> 1.
# The result is assigned back explicitly; replace(..., inplace=True) on an
# attribute-accessed column may act on a temporary copy and leave the frame
# unchanged (deprecated chained in-place modification in recent pandas).
df_names['SEXO'] = df_names['SEXO'].replace({'FEMENINO':0,'MASCULINO':1})
|
| 61 |
+
df_names.SEXO.unique()
|
| 62 |
+
df_names.dtypes
|
| 63 |
+
|
| 64 |
+
Xfeatures = df_pd_nombres_mf_general['PRIMER_NOMBRE']
|
| 65 |
+
|
| 66 |
+
#Extraccion de las características del df vectorizando
|
| 67 |
+
cv = CountVectorizer()
|
| 68 |
+
X = cv.fit_transform(Xfeatures.values.astype('U')) #Con el fin de no generar problemas en nombres con determinados carcateres
|
| 69 |
+
cv.get_feature_names()
|
| 70 |
+
|
| 71 |
+
# Conformando el diccionario con la extraccion de las primeras y ultimas letras de cada uno de los nombres
|
| 72 |
+
def features(name):
    """Build the letter-based feature dictionary for a single name.

    The name is lower-cased, then its first 1, 2 and 3 characters and its
    last 1, 2 and 3 characters are returned under fixed string keys.

    Args:
        name: The name to describe; must provide a ``lower()`` method
            (e.g. ``str``).

    Returns:
        A dict with the keys ``first-letter``, ``first2-letters``,
        ``first3-letters``, ``last-letter``, ``last2-letters`` and
        ``last3-letters``. Names shorter than the requested length yield the
        characters that are available; an empty name yields empty strings.
    """
    name = name.lower()
    return {
        # Slices instead of name[0] / name[-1]: same result for non-empty
        # names, but an empty name gives '' instead of raising IndexError.
        'first-letter': name[:1],      # First letter
        'first2-letters': name[0:2],   # First 2 letters
        'first3-letters': name[0:3],   # First 3 letters
        'last-letter': name[-1:],      # Last letter
        'last2-letters': name[-2:],    # Last 2 letters
        'last3-letters': name[-3:],    # Last 3 letters
    }
|
| 82 |
+
|
| 83 |
+
# Vectorize the features function
|
| 84 |
+
features = np.vectorize(features)
|
| 85 |
+
#Ejemplo
|
| 86 |
+
print(features(["Anna", "Camilo", "Antonio","Margarita","Judith","Samuel"]))
|
| 87 |
+
|
| 88 |
+
#Extrayendo las características para el conjunto de datos vectorizado
|
| 89 |
+
df_X = features(df_names['PRIMER_NOMBRE'].values.astype('U'))
|
| 90 |
+
df_y = df_names['SEXO']
|
| 91 |
+
|
| 92 |
+
#Ejemplo
|
| 93 |
+
arreglo = features(["Mike", "Julia"])
|
| 94 |
+
dv = DictVectorizer()
|
| 95 |
+
dv.fit(arreglo)
|
| 96 |
+
transformed = dv.transform(arreglo)
|
| 97 |
+
print(transformed)
|
| 98 |
+
|
| 99 |
+
dv.get_feature_names()
|
| 100 |
+
|
| 101 |
+
# Split the data into training and test portions (test_size=0.2).
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X, df_y, test_size=0.2, random_state=42)
dfX_train

# Fit the DictVectorizer on the training dicts and keep the encoded matrix.
# The original fitted with fit_transform(), discarded the result and then
# encoded the same data a second time with transform().
dv = DictVectorizer()
my_xfeatures = dv.fit_transform(dfX_train)

# Decision tree classifier definition and training.
dclf = DecisionTreeClassifier()
dclf.fit(my_xfeatures, dfy_train)
|
| 111 |
+
|
| 112 |
+
# Helper function that wraps the prediction steps for convenience.
def prediccionGenero(a):
    """Predict the gender label for a single name.

    The name is turned into letter features with ``features``, encoded with
    the fitted ``dv`` vectorizer, converted to a dense array and classified
    with ``dclf``.

    Args:
        a: The name to classify.

    Returns:
        "FEMENINO" when the classifier predicts 0, otherwise "MASCULINO".
    """
    encoded_name = dv.transform(features([a])).toarray()
    predicted_label = dclf.predict(encoded_name)
    return "FEMENINO" if predicted_label == 0 else "MASCULINO"
|
| 124 |
+
|
| 125 |
+
# Accuracy of the model measured on the training data.
train_accuracy = dclf.score(dv.transform(dfX_train), dfy_train)
print("Accuracy sobre los datos de entrenamiento: %s" %(train_accuracy))

# Accuracy of the model measured on the test data.
test_accuracy = dclf.score(dv.transform(dfX_test), dfy_test)
print("Accuracy sobre los datos de testeo: %s" %(test_accuracy))
|
| 130 |
+
|
| 131 |
+
# Mount the "standarized" blob container at /mnt/auxiliar_2.
#
# SECURITY: the storage account key must never be hard-coded in source control.
# It is read from a Databricks secret scope instead. The key that was
# previously committed in this file is exposed and should be rotated.
# NOTE(review): the secret scope and key names below are placeholders - set
# them to the secret that actually holds this account's key.
mount_point_dt = "/mnt/auxiliar_2"

# dbutils.fs.mount raises if the mount point already exists, so only mount
# when it is not present; this keeps the script re-runnable.
if not any(m.mountPoint == mount_point_dt for m in dbutils.fs.mounts()):
    dbutils.fs.mount(
        source = "wasbs://standarized@stupramonitoreomercado.blob.core.windows.net",
        mount_point = mount_point_dt,
        extra_configs = {
            "fs.azure.account.key.stupramonitoreomercado.blob.core.windows.net":
                dbutils.secrets.get(scope="storage-secrets", key="stupramonitoreomercado-account-key")
        }
    )
|
| 136 |
+
|
| 137 |
+
import os
import pickle
import tempfile

# sklearn.externals.joblib was removed from scikit-learn (0.23); import the
# standalone joblib package, which this script already uses for dump/load.
import joblib
from joblib import dump, load
|
| 142 |
+
|
| 143 |
+
# Round-trip the trained tree through pickle in memory.
s = pickle.dumps(dclf)
classifier2 = pickle.loads(s)

# Dump the model to a named temporary file. The context manager closes the
# file handle, which the original left open; the file itself stays on disk
# because delete=False.
with tempfile.NamedTemporaryFile(delete=False) as tmpFile:
    dump(dclf, tmpFile)
    tmpFile.flush()

# Write the model as .pkl on the local disk and copy it to the data lake
# through the mount point.
dump(classifier2, '/tmp/modelo_gender_pred_dt.pkl')
dbutils.fs.cp('file:/tmp/modelo_gender_pred_dt.pkl', '/mnt/auxiliar_2/OTROS/SNR/modelo_gender_pred_dt_dl.pkl')

# Copy the stored model back out of the data lake and load it again.
# NOTE(review): the copy target '/tmp/...' is presumably resolved on DBFS,
# which is why it is loaded through the '/dbfs/tmp/...' path - confirm.
dbutils.fs.cp('/mnt/auxiliar_2/OTROS/SNR/modelo_gender_pred_dt_dl.pkl', '/tmp/modelo_gender_pred_dt_saved.pkl')
display(dbutils.fs.ls("/tmp/"))
classifier_Final = joblib.load('/dbfs/tmp/modelo_gender_pred_dt_saved.pkl')
|
| 157 |
+
|
| 158 |
+
# Read the JSON file from storage and keep it as a pandas DataFrame.
dbutils.fs.ls("abfss://sandbox@stupramonitoreomercado.dfs.core.windows.net/OTROS/SNR/")

# Inputs used previously (both under the sandbox container, OTROS/SNR/):
#   nombres_apellido_nombre.json
#   nombres_comunes_masculino_femenino.json
df_from_json = spark.read.json(
    "abfss://standarized@stupramonitoreomercado.dfs.core.windows.net/OTROS/SNR/intervinientes_clean_sexo_nombres_rurales_pendientes_modelo_similaridad.json"
)

display(df_from_json)

# Bring every column of the Spark DataFrame into pandas.
df_to_predict = df_from_json.toPandas()
df_to_predict.head()
|
| 168 |
+
|
| 169 |
+
# Predict the gender of the new names presented to the model. Rows whose
# PRIMER_NOMBRE is null or an empty string get an empty prediction.
final_gender = [
    "" if (pd.isnull(item) or item == '') else prediccionGenero(item)
    for item in df_to_predict.PRIMER_NOMBRE
]

# Column that stores the prediction for each name.
df_to_predict['PREDICCION'] = final_gender
display(df_to_predict)
|
| 183 |
+
|
| 184 |
+
df_final_Selection = df_to_predict[['id','PRIMER_NOMBRE','SEXO','PREDICCION', 'TIPO_CLASIFICACION']]

# Keep the rows whose TIPO_CLASIFICACION is null. An explicit copy is taken so
# the assignments below modify an independent frame instead of a slice of
# df_final_Selection (which triggers SettingWithCopyWarning and may not
# persist the writes).
df_no_similarity = df_final_Selection[df_final_Selection['TIPO_CLASIFICACION'].isnull()].copy()
df_no_similarity['TIPO_CLASIFICACION'] = 'MODELO_ML'

# Keep an existing SEXO value; use the model prediction only where SEXO is null.
df_no_similarity['SEXO'] = np.where(df_no_similarity.SEXO.isnull(), df_no_similarity.PREDICCION, df_no_similarity.SEXO)

final_table_to_export = df_no_similarity[['id','PRIMER_NOMBRE','SEXO','TIPO_CLASIFICACION']]
display(df_final_Selection)
display(final_table_to_export)
print("Numero de nombres que fueron predecidos: %s" %(len(final_table_to_export)))
|
| 193 |
+
|
| 194 |
+
# Save the prediction results as CSV on the local disk, then copy the file to
# the data lake through the mount point.
# NOTE(review): this exports df_to_predict, not final_table_to_export - confirm
# that the full table is the intended output.
df_to_predict.to_csv(
    '/tmp/prediccion_intervinientes_clean_sexo_nombres_rurales_pendientes_modelo_ml_v1.csv',
    index=False,
)
dbutils.fs.cp(
    'file:/tmp/prediccion_intervinientes_clean_sexo_nombres_rurales_pendientes_modelo_ml_v1.csv',
    '/mnt/auxiliar_2/OTROS/SNR/prediccion_intervinientes_clean_sexo_nombres_rurales_pendientes_modelo_ml_v1.csv',
)
|
modelo/modelo_prediccion_genero_ml_nb.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""
|
| 3 |
+
Archivo Python adaptado desde el notebook de databricks dispuesto desde el servicio de Azure-Data Sandbox
|
| 4 |
+
Created on Mon Dec 14 08:18:00 2020
|
| 5 |
+
@author: Carlos Delgado
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
## Configuring the storage account key

# Placeholder values: replace with the real account and container names.
storage_account_name = "Storage Account"
storage_account_key = "Storage Account Key"
container = "Storage Account Source Container"
container_raw = "Storage Account Source Container Raw"

# Register the account key with Spark for the dfs endpoint.
account_key_setting = "fs.azure.account.key.{0}.dfs.core.windows.net".format(storage_account_name)
spark.conf.set(account_key_setting, storage_account_key)

# The createRemoteFileSystemDuringInitialization flag is switched on only
# while the raw container is listed, then switched off again.
spark.conf.set("fs.azure.createRemoteFileSystemDuringInitialization", "true")
dbutils.fs.ls("abfss://{0}@{1}.dfs.core.windows.net/".format(container_raw, storage_account_name))
spark.conf.set("fs.azure.createRemoteFileSystemDuringInitialization", "false")


## Examining the file listings

dbutils.fs.ls("abfss://{0}@{1}.dfs.core.windows.net/OTROS/SNR/JSON/".format(container_raw, storage_account_name))
dbutils.fs.ls("abfss://{0}@{1}.dfs.core.windows.net/OTROS/KAGGLE/nombres_espanol_entrenamiento_test1/".format(container, storage_account_name))
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
## Importando las principales librerías utilizadas en el algoritmo de predicción
|
| 29 |
+
|
| 30 |
+
# Algoritmo adaptado de https://www.kaggle.com/migalpha/gender-predictor-by-spanish-names
|
| 31 |
+
#importando librerias iniciales
|
| 32 |
+
import pandas as pd
|
| 33 |
+
import numpy as np
|
| 34 |
+
|
| 35 |
+
#importando libreria de AI Scikit-learn
|
| 36 |
+
from sklearn.model_selection import train_test_split
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
## Loading the initial data from the CSV files with Spark

from pyspark.sql.functions import *
from pyspark.sql.types import *

df_fem_original = spark.read.csv(
    "abfss://{0}@{1}.dfs.core.windows.net/OTROS/KAGGLE/nombres_espanol_entrenamiento_test1/female_names.csv".format(container, storage_account_name),
    header=True,
)
df_mal_original = spark.read.csv(
    "abfss://{0}@{1}.dfs.core.windows.net/OTROS/KAGGLE/nombres_espanol_entrenamiento_test1/male_names.csv".format(container, storage_account_name),
    header=True,
)
display(df_fem_original)
display(df_mal_original)


## Moving from Spark DataFrames to pandas DataFrames

# Store the names of each gender as pandas DataFrames (all columns).
fem_nombres = df_fem_original.toPandas()
mal_nombres = df_mal_original.toPandas()
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
## Preparing the variables

# Gender indicator: 0 = female, 1 = male.
fem_nombres['genero'] = 0
mal_nombres['genero'] = 1

# Combine both data sets. DataFrame.append was removed in pandas 2.0;
# pd.concat with ignore_index=True builds the same combined frame.
data = pd.concat([fem_nombres, mal_nombres], ignore_index=True)
# keep=False drops every row whose name occurs more than once, i.e. all
# occurrences of a duplicated name are removed.
data = data.drop_duplicates(subset='name', keep=False)

# Parse the target and drop the attributes that are not needed.
# The target is taken as strings, so the labels are '0' and '1'.
target = data['genero'].astype(str)

data = data.drop(columns=['frequency', 'mean_age', 'genero'])

features = data['name'].astype(str)
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
## Configuring the model

# The data is split 80% training - 20% test.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=7,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
|
| 86 |
+
|
| 87 |
+
# Library used to extract features from text.
from sklearn.feature_extraction.text import CountVectorizer

# TF-IDF: transforms a count matrix of occurrences (term-frequency) into a
# normalized matrix (term-frequency times inverse document-frequency).
# Technical documentation:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html
from sklearn.feature_extraction.text import TfidfTransformer

# Naive Bayes (NB) classifier.
from sklearn.naive_bayes import MultinomialNB

# Pipeline: vectorizer -> tf-idf transformer -> classifier.
from sklearn.pipeline import Pipeline

pipeline_steps = [
    ('vect', CountVectorizer(ngram_range=(1,1))),
    ('tfidf', TfidfTransformer(norm='l2', sublinear_tf=True, use_idf=False)),
    ('clf', MultinomialNB(alpha=0.1)),
]
text_classifier = Pipeline(pipeline_steps)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
## Training the model and computing its accuracy

# Train the model on the training split.
text_classifier = text_classifier.fit(X_train, y_train)

# Compute the accuracy of the NB classifier on the test split.
from sklearn.metrics import accuracy_score

predicted = text_classifier.predict(X_test)
test_accuracy = accuracy_score(y_test, predicted)

print(test_accuracy)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
## Testing the model

# GridSearchCV is used to select the best hyper-parameters.
from sklearn.model_selection import GridSearchCV

# Candidate values per pipeline step. Grids that were tried earlier and are
# currently disabled: vect__max_df, vect__max_features, vect__min_df,
# vect__binary.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__sublinear_tf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001),
}

# Search with 2-fold cross-validation, using every available core (n_jobs=-1).
gs_classifier = GridSearchCV(text_classifier, parameters, n_jobs=-1, cv=2)
gs_classifier = gs_classifier.fit(X_train, y_train)

print(gs_classifier.best_score_)
print(gs_classifier.best_params_)
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
## Creating the mount point

# Mount the source container at /mnt/auxiliar.
# dbutils.fs.mount raises if the mount point already exists, so only mount
# when it is not present; this keeps the script re-runnable.
mount_point_nb = "/mnt/auxiliar"

if not any(m.mountPoint == mount_point_nb for m in dbutils.fs.mounts()):
    dbutils.fs.mount(
        source = "wasbs://{0}@{1}.blob.core.windows.net".format(container, storage_account_name),
        mount_point = mount_point_nb,
        extra_configs = {"fs.azure.account.key.{0}.blob.core.windows.net".format(storage_account_name):storage_account_key}
    )
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
## Saving the model as a pickle (.pkl) file

import os
import pickle
import tempfile

# sklearn.externals.joblib was removed from scikit-learn (0.23); import the
# standalone joblib package, which this script already uses for dump/load.
import joblib
from joblib import dump, load
|
| 159 |
+
|
| 160 |
+
# Round-trip the trained pipeline through pickle in memory.
s = pickle.dumps(text_classifier)
classifier2 = pickle.loads(s)

# Dump the model to a named temporary file. The context manager closes the
# file handle, which the original left open; the file itself stays on disk
# because delete=False.
with tempfile.NamedTemporaryFile(delete=False) as tmpFile:
    dump(text_classifier, tmpFile)
    tmpFile.flush()

# Write the model as .pkl on the local disk and copy it to the data lake
# through the mount point.
dump(classifier2, '/tmp/modelo_gender_pred.pkl')
dbutils.fs.cp('file:/tmp/modelo_gender_pred.pkl', '/mnt/auxiliar/OTROS/KAGGLE/nombres_espanol_entrenamiento_test1/modelo_gender_pred_data_lake.pkl')

# Copy the stored model back out of the data lake and load it again.
# NOTE(review): the copy target '/tmp/...' is presumably resolved on DBFS,
# which is why it is loaded through the '/dbfs/tmp/...' path - confirm.
dbutils.fs.cp('/mnt/auxiliar/OTROS/KAGGLE/nombres_espanol_entrenamiento_test1/modelo_gender_pred_data_lake.pkl', '/tmp/modelo_gender_saved.pkl')
display(dbutils.fs.ls("/tmp/"))
classifier_Final = joblib.load('/dbfs/tmp/modelo_gender_saved.pkl')
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# Try a few predictions with the model loaded back from storage.
sample_names = ("Valentina", "Miguel", "Maria", "Joe", "Santiago", "Magdalena")
for sample_name in sample_names:
    print(classifier_Final.predict((sample_name, )))
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
## Prediction - reading the input file

# Read the JSON file from storage and keep it as a pandas DataFrame.
dbutils.fs.ls("abfss://{0}@{1}.dfs.core.windows.net/OTROS/SNR/".format(container, storage_account_name))

# Input used previously (same folder): nombres_apellido_nombre.json
df_from_json = spark.read.json(
    "abfss://{0}@{1}.dfs.core.windows.net/OTROS/SNR/nombres_para_clasificar.json".format(container, storage_account_name)
)

display(df_from_json)

# Bring every column of the Spark DataFrame into pandas.
df_to_predict = df_from_json.toPandas()
df_to_predict.head()
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
## Running the gender prediction

# Predict the gender of the new names presented to the model.
# The original loop called `clf.predict`, but no `clf` is ever defined in this
# script, so it raised NameError on the first non-null name. The model
# reloaded from storage (classifier_Final) is used instead.
final_gender = []

for item in df_to_predict.PRIMER_NOMBRE:
    if pd.isnull(item):
        # No name available: leave the prediction empty.
        final_gender.append("")
    elif classifier_Final.predict((item, ))[0] == '0':
        # Labels are strings because the target was cast with astype(str);
        # '0' is the female indicator.
        final_gender.append("MUJER")
    else:
        final_gender.append("HOMBRE")

# Column that stores the prediction for each name.
df_to_predict['PREDICCION'] = final_gender

display(df_to_predict)
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
## Saving the prediction results as CSV to the data lake

# Write the CSV on the local disk, then copy it to the data lake through the
# mount point.
df_to_predict.to_csv(
    '/tmp/Prediccion_nombres_para_clasificar.csv',
    index=False,
)
dbutils.fs.cp(
    'file:/tmp/Prediccion_nombres_para_clasificar.csv',
    '/mnt/auxiliar/OTROS/KAGGLE/nombres_espanol_entrenamiento_test1/Prediccion_final_nombres_nombres_para_clasificar.csv',
)
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
|
prediccion/Prediccion_final_nombres_nombres_para_clasificar.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
prediccion/Prediccion_nombres_apellido_nombre.xlsx
ADDED
|
Binary file (101 kB). View file
|
|
|
prediccion/Prediccion_nombres_apellido_nombres.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0459c3fe75fa8a8ed65e6308ba96198921b5463ce0e341a819b692ea2ce07a89
|
| 3 |
+
size 5873079
|
prediccion/Prediccion_nombres_separados_coma.xlsx
ADDED
|
Binary file (38.4 kB). View file
|
|
|