Update main.py
main.py CHANGED
@@ -117,13 +117,20 @@ def main():
         ),
     )
 
+intro_text = P(
+    """Pretraining performant large language models (LLMs) requires trillions of tokens of high-quality data. Many prior works, including our previous pretraining projects Amber-7B, Crystal-7B, and K2-65B, have demonstrated how data curation is a ‘make-or-break’ decision for model quality and capability.""")
 
-
-
-
-
-
-            P("""We are excited to introduce TxT360, a
+intro_list = P("""We present TxT360, the Trillion eXtracted Text corpus, a 5.7T-token pretraining dataset that:""")
+
+intro_list1 = Ol(
+    Li("Curates commonly used pretraining datasets, including all of CommonCrawl"),
+    Li("Employs carefully selected filters designed for each data source"),
+    Li("Provides only unique data elements via global deduplication across all datasets"),
+    Li("Retains all deduplication metadata for custom upweighting"),
+    Li("Is production-ready! Download here [link to HF repo]")
+)
+
+previous_intro = P("""We are excited to introduce TxT360, a
 large-scale, comprehensive, and fully transparent
 dataset designed for Large Language Model (LLM)
 pre-training. TxT360 is engineered to strike a

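The intro list above makes two technical claims worth unpacking: only unique data survives a global deduplication across all sources, and the deduplication metadata is retained so users can do their own upweighting. As a rough sketch of that pipeline shape (not TxT360's actual implementation; the function name and document schema below are hypothetical), an exact-match pass can keep the first occurrence of each text and record a duplicate count that downstream users turn into sampling weights:

import hashlib

def global_dedup(docs):
    """Keep the first occurrence of each unique text across all sources,
    recording how many times it appeared (the retained dedup metadata)."""
    seen = {}      # content hash -> index into `unique`
    unique = []
    for doc in docs:
        h = hashlib.sha256(doc["text"].encode("utf-8")).hexdigest()
        if h in seen:
            unique[seen[h]]["dup_count"] += 1
        else:
            seen[h] = len(unique)
            unique.append({**doc, "dup_count": 1})
    return unique

docs = [
    {"source": "commoncrawl", "text": "the same page, crawled twice"},
    {"source": "papers", "text": "a unique document"},
    {"source": "commoncrawl", "text": "the same page, crawled twice"},
]
deduped = global_dedup(docs)                          # 2 unique documents
weights = [min(d["dup_count"], 5) for d in deduped]   # one possible upweighting

At web scale, near-duplicate methods such as MinHash are typically used alongside exact hashing, but the retained dup_count is the point: it lets users reweight or reconstruct duplicates without redoing the deduplication.
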
@@ -161,12 +168,9 @@ def intro():
 represents a significant step forward in the
 availability and transparency of large-scale
 training data for language models, setting a new
-standard for dataset quality and openness.""")
-
-
-        Section(
-            H2("Background"),
-            P(
+standard for dataset quality and openness.""")
+
+previous_background = P(
 """ The quality and size of a pre-training dataset
 play a crucial role in the performance of large
 language models (LLMs). The community has

@@ -197,11 +201,8 @@ def intro():
 rigorous standards required for state-of-the-art
 LLM pre-training. """
 ),
-
-
-        Section(
-            H2("Main Content"),
-            P("""The performance of a large language model (LLM)
+
+previous_content = P("""The performance of a large language model (LLM)
 depends heavily on the quality and size of its
 pretraining dataset. However, the pretraining
 datasets for state-of-the-art open LLMs like Llama

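A pitfall worth flagging in this refactor: the old closing lines kept as shared context (the "),", and later the FineWeb-Edu line ending in """),) now terminate the previous_background and previous_content assignments, leaving a trailing comma that makes each variable a 1-tuple rather than a component. A minimal standalone demonstration, with a stand-in P:

def P(*children):        # stand-in for the page component used in main.py
    return ("p", children)

a = P("some text"),      # trailing comma: `a` is the 1-tuple (P(...),)
b = P("some text")       # `b` is the component itself
assert isinstance(a, tuple) and not isinstance(b, tuple)

Ending the assignment cleanly with """) instead, as previous_conclusion does in the next hunk, avoids this.
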
@@ -246,13 +247,34 @@ def intro():
 (listing and explaining all of our design choices),
 and the process followed to create its 📚
 FineWeb-Edu subset."""),
+
+previous_conclusion = P("""This is the conclusion section where we
+summarize the key points discussed in the blog post
+and provide final thoughts.""")
+
+@app.get("/intro")
+def intro():
+    return Div(
+        Section(
+            H2("About TxT360"),
+            intro_text,
+            intro_list,
+            intro_list1,
+            id="section1",
+        ),
+        Section(
+            H2("Background"),
+
+            id="section2",
+        ),
+        Section(
+            H2("Main Content"),
+
             id="section3",
         ),
         Section(
             H2("Conclusion"),
-            P("""This is the conclusion section where we
-            summarize the key points discussed in the blog post
-            and provide final thoughts."""),
+
             id="section4",
         ),
         id="inner-text",
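
The new intro() route wires in the refactored variables but leaves the Background and Main Content sections empty. Assuming the app is built on FastHTML (which the Div/Section/H2/P/Ol/Li tags and the @app.get decorator suggest; treat the import and the stand-in variables below as illustrative), a finished version of the route could drop the saved previous_* paragraphs into those sections:

from fasthtml.common import *   # assumed framework: supplies the HTML tags, fast_app(), serve()

app, rt = fast_app()

# Stand-ins for the module-level variables this commit introduces.
intro_text = P("Pretraining performant LLMs requires trillions of tokens...")
intro_list = P("We present TxT360, a 5.7T-token pretraining dataset that:")
intro_list1 = Ol(Li("Curates commonly used pretraining datasets"))
previous_background = P("The quality and size of a pre-training dataset...")
previous_content = P("The performance of an LLM depends heavily on...")
previous_conclusion = P("Conclusion placeholder.")

@app.get("/intro")
def intro():
    # Fill the sections the diff leaves empty with the saved paragraphs.
    return Div(
        Section(H2("About TxT360"), intro_text, intro_list, intro_list1,
                id="section1"),
        Section(H2("Background"), previous_background, id="section2"),
        Section(H2("Main Content"), previous_content, id="section3"),
        Section(H2("Conclusion"), previous_conclusion, id="section4"),
        id="inner-text",
    )

serve()   # FastHTML's dev entry point; runs the app locally

Whether the previous_* text should survive in the final page or is only being kept for reference during the rewrite is a separate call; the sketch just shows where each piece plugs in.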