Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
·
9716e1f
1
Parent(s):
dc4ad9e
fix blake3 (vaex dependency)
Browse files
- requirements.txt +1 -0
- tests/notebook.ipynb +144 -1
requirements.txt
CHANGED
|
@@ -6,6 +6,7 @@ xlrd==2.0.1
|
|
| 6 |
openpyxl==3.0.9
|
| 7 |
watchdog==2.1.6
|
| 8 |
vaex==4.7.0
|
|
|
|
| 9 |
|
| 10 |
# english
|
| 11 |
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
|
|
|
|
| 6 |
openpyxl==3.0.9
|
| 7 |
watchdog==2.1.6
|
| 8 |
vaex==4.7.0
|
| 9 |
+
blake3==0.2.1 # to make vaex work
|
| 10 |
|
| 11 |
# english
|
| 12 |
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz#egg=en_core_web_sm
|
tests/notebook.ipynb
CHANGED
|
@@ -12,7 +12,150 @@
|
|
| 12 |
"import sys\n",
|
| 13 |
"sys.path.append(\"..\")\n",
|
| 14 |
"from src.preprocessing import PreprocessingPipeline\n",
|
| 15 |
-
"import pandas as pd"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
]
|
| 17 |
},
|
| 18 |
{
|
|
|
|
| 12 |
"import sys\n",
|
| 13 |
"sys.path.append(\"..\")\n",
|
| 14 |
"from src.preprocessing import PreprocessingPipeline\n",
|
| 15 |
+
"import pandas as pd\n",
|
| 16 |
+
"import vaex"
|
| 17 |
+
]
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"cell_type": "markdown",
|
| 21 |
+
"metadata": {},
|
| 22 |
+
"source": [
|
| 23 |
+
"----\n",
|
| 24 |
+
"### Test vaex"
|
| 25 |
+
]
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"cell_type": "code",
|
| 29 |
+
"execution_count": 2,
|
| 30 |
+
"metadata": {},
|
| 31 |
+
"outputs": [],
|
| 32 |
+
"source": [
|
| 33 |
+
"df = pd.read_csv(\"../data/test_en.csv\")"
|
| 34 |
+
]
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"cell_type": "code",
|
| 38 |
+
"execution_count": 3,
|
| 39 |
+
"metadata": {},
|
| 40 |
+
"outputs": [
|
| 41 |
+
{
|
| 42 |
+
"data": {
|
| 43 |
+
"text/html": [
|
| 44 |
+
"<table>\n",
|
| 45 |
+
"<thead>\n",
|
| 46 |
+
"<tr><th># </th><th>label </th><th>text </th></tr>\n",
|
| 47 |
+
"</thead>\n",
|
| 48 |
+
"<tbody>\n",
|
| 49 |
+
"<tr><td><i style='opacity: 0.6'>0</i> </td><td>0 </td><td>"I think it's time John Rambo move on with his l...</td></tr>\n",
|
| 50 |
+
"<tr><td><i style='opacity: 0.6'>1</i> </td><td>1 </td><td>"I've just watch 2 films of Pang brothers, The E...</td></tr>\n",
|
| 51 |
+
"<tr><td><i style='opacity: 0.6'>2</i> </td><td>1 </td><td>'Jewel Thief is *THE* crime thriller of Bollywoo...</td></tr>\n",
|
| 52 |
+
"<tr><td><i style='opacity: 0.6'>3</i> </td><td>0 </td><td>'This so called remake is terrible. I went to se...</td></tr>\n",
|
| 53 |
+
"<tr><td><i style='opacity: 0.6'>4</i> </td><td>1 </td><td>'When Northfork debuted at the Cannes Film Festi...</td></tr>\n",
|
| 54 |
+
"<tr><td>... </td><td>... </td><td>... </td></tr>\n",
|
| 55 |
+
"<tr><td><i style='opacity: 0.6'>4,995</i></td><td>0 </td><td>'The title tells it all -- Ed Gein, the butcher ...</td></tr>\n",
|
| 56 |
+
"<tr><td><i style='opacity: 0.6'>4,996</i></td><td>0 </td><td>"This film makes about as much sense as an 'Ozzi...</td></tr>\n",
|
| 57 |
+
"<tr><td><i style='opacity: 0.6'>4,997</i></td><td>0 </td><td>'"Sex and the City" has some great things going ...</td></tr>\n",
|
| 58 |
+
"<tr><td><i style='opacity: 0.6'>4,998</i></td><td>0 </td><td>'Please...if anybody gets the chance to read thi...</td></tr>\n",
|
| 59 |
+
"<tr><td><i style='opacity: 0.6'>4,999</i></td><td>0 </td><td>'...a film comes along that manages to be absolu...</td></tr>\n",
|
| 60 |
+
"</tbody>\n",
|
| 61 |
+
"</table>"
|
| 62 |
+
],
|
| 63 |
+
"text/plain": [
|
| 64 |
+
"# label text\n",
|
| 65 |
+
"0 0 \"I think it's time John Rambo move on with his l...\n",
|
| 66 |
+
"1 1 \"I've just watch 2 films of Pang brothers, The E...\n",
|
| 67 |
+
"2 1 'Jewel Thief is *THE* crime thriller of Bollywoo...\n",
|
| 68 |
+
"3 0 'This so called remake is terrible. I went to se...\n",
|
| 69 |
+
"4 1 'When Northfork debuted at the Cannes Film Festi...\n",
|
| 70 |
+
"... ... ...\n",
|
| 71 |
+
"4,995 0 'The title tells it all -- Ed Gein, the butcher ...\n",
|
| 72 |
+
"4,996 0 \"This film makes about as much sense as an 'Ozzi...\n",
|
| 73 |
+
"4,997 0 '\"Sex and the City\" has some great things going ...\n",
|
| 74 |
+
"4,998 0 'Please...if anybody gets the chance to read thi...\n",
|
| 75 |
+
"4,999 0 '...a film comes along that manages to be absolu..."
|
| 76 |
+
]
|
| 77 |
+
},
|
| 78 |
+
"execution_count": 3,
|
| 79 |
+
"metadata": {},
|
| 80 |
+
"output_type": "execute_result"
|
| 81 |
+
}
|
| 82 |
+
],
|
| 83 |
+
"source": [
|
| 84 |
+
"vaex.from_pandas(df)"
|
| 85 |
+
]
|
| 86 |
+
},
|
| 87 |
+
{
|
| 88 |
+
"cell_type": "code",
|
| 89 |
+
"execution_count": 4,
|
| 90 |
+
"metadata": {},
|
| 91 |
+
"outputs": [],
|
| 92 |
+
"source": [
|
| 93 |
+
"df_small = df.iloc[:1000]"
|
| 94 |
+
]
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"cell_type": "code",
|
| 98 |
+
"execution_count": 5,
|
| 99 |
+
"metadata": {},
|
| 100 |
+
"outputs": [
|
| 101 |
+
{
|
| 102 |
+
"data": {
|
| 103 |
+
"text/html": [
|
| 104 |
+
"<table>\n",
|
| 105 |
+
"<thead>\n",
|
| 106 |
+
"<tr><th># </th><th>label </th><th>text </th></tr>\n",
|
| 107 |
+
"</thead>\n",
|
| 108 |
+
"<tbody>\n",
|
| 109 |
+
"<tr><td><i style='opacity: 0.6'>0</i> </td><td>0 </td><td>"I think it's time John Rambo move on with his l...</td></tr>\n",
|
| 110 |
+
"<tr><td><i style='opacity: 0.6'>1</i> </td><td>1 </td><td>"I've just watch 2 films of Pang brothers, The E...</td></tr>\n",
|
| 111 |
+
"<tr><td><i style='opacity: 0.6'>2</i> </td><td>1 </td><td>'Jewel Thief is *THE* crime thriller of Bollywoo...</td></tr>\n",
|
| 112 |
+
"<tr><td><i style='opacity: 0.6'>3</i> </td><td>0 </td><td>'This so called remake is terrible. I went to se...</td></tr>\n",
|
| 113 |
+
"<tr><td><i style='opacity: 0.6'>4</i> </td><td>1 </td><td>'When Northfork debuted at the Cannes Film Festi...</td></tr>\n",
|
| 114 |
+
"<tr><td>... </td><td>... </td><td>... </td></tr>\n",
|
| 115 |
+
"<tr><td><i style='opacity: 0.6'>995</i></td><td>1 </td><td>"It's a funny business, reviewing movies. These ...</td></tr>\n",
|
| 116 |
+
"<tr><td><i style='opacity: 0.6'>996</i></td><td>1 </td><td>'Right from the start you see that "Anchors Awei...</td></tr>\n",
|
| 117 |
+
"<tr><td><i style='opacity: 0.6'>997</i></td><td>0 </td><td>'I saw this movie in NEW York city. I was waitin...</td></tr>\n",
|
| 118 |
+
"<tr><td><i style='opacity: 0.6'>998</i></td><td>0 </td><td>'Firstly, this is NOT an adaptation of a Stephen...</td></tr>\n",
|
| 119 |
+
"<tr><td><i style='opacity: 0.6'>999</i></td><td>1 </td><td>"Barbra Streisand's debut television special is ...</td></tr>\n",
|
| 120 |
+
"</tbody>\n",
|
| 121 |
+
"</table>"
|
| 122 |
+
],
|
| 123 |
+
"text/plain": [
|
| 124 |
+
"# label text\n",
|
| 125 |
+
"0 0 \"I think it's time John Rambo move on with his l...\n",
|
| 126 |
+
"1 1 \"I've just watch 2 films of Pang brothers, The E...\n",
|
| 127 |
+
"2 1 'Jewel Thief is *THE* crime thriller of Bollywoo...\n",
|
| 128 |
+
"3 0 'This so called remake is terrible. I went to se...\n",
|
| 129 |
+
"4 1 'When Northfork debuted at the Cannes Film Festi...\n",
|
| 130 |
+
"... ... ...\n",
|
| 131 |
+
"995 1 \"It's a funny business, reviewing movies. These ...\n",
|
| 132 |
+
"996 1 'Right from the start you see that \"Anchors Awei...\n",
|
| 133 |
+
"997 0 'I saw this movie in NEW York city. I was waitin...\n",
|
| 134 |
+
"998 0 'Firstly, this is NOT an adaptation of a Stephen...\n",
|
| 135 |
+
"999 1 \"Barbra Streisand's debut television special is ..."
|
| 136 |
+
]
|
| 137 |
+
},
|
| 138 |
+
"execution_count": 5,
|
| 139 |
+
"metadata": {},
|
| 140 |
+
"output_type": "execute_result"
|
| 141 |
+
}
|
| 142 |
+
],
|
| 143 |
+
"source": [
|
| 144 |
+
"vaex.from_pandas(df_small)"
|
| 145 |
+
]
|
| 146 |
+
},
|
| 147 |
+
{
|
| 148 |
+
"cell_type": "code",
|
| 149 |
+
"execution_count": null,
|
| 150 |
+
"metadata": {},
|
| 151 |
+
"outputs": [],
|
| 152 |
+
"source": []
|
| 153 |
+
},
|
| 154 |
+
{
|
| 155 |
+
"cell_type": "markdown",
|
| 156 |
+
"metadata": {},
|
| 157 |
+
"source": [
|
| 158 |
+
"----"
|
| 159 |
]
|
| 160 |
},
|
| 161 |
{
|