Spaces:
Build error
Build error
Adding sample datasets
Browse files- app.py +84 -68
- utils/prompts.py +11 -9
app.py
CHANGED
|
@@ -16,6 +16,8 @@ from utils.prompts import (
|
|
| 16 |
generate_eda_system_prompt,
|
| 17 |
generate_embedding_system_prompt,
|
| 18 |
)
|
|
|
|
|
|
|
| 19 |
|
| 20 |
"""
|
| 21 |
TODOs:
|
|
@@ -36,6 +38,17 @@ TODOs:
|
|
| 36 |
"""
|
| 37 |
|
| 38 |
# Configuration
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
|
| 40 |
HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
|
| 41 |
|
|
@@ -101,7 +114,7 @@ def get_txt_from_output(output):
|
|
| 101 |
return content
|
| 102 |
except Exception as e:
|
| 103 |
gr.Error("Error when parsing notebook, try again.")
|
| 104 |
-
logging.error(f"Failed to
|
| 105 |
raise
|
| 106 |
|
| 107 |
|
|
@@ -139,74 +152,64 @@ def content_from_output(output):
|
|
| 139 |
return match.group(1)
|
| 140 |
|
| 141 |
|
| 142 |
-
def generate_eda_cells(dataset_id
|
| 143 |
for messages in generate_cells(dataset_id, generate_eda_system_prompt, "eda"):
|
| 144 |
-
yield messages,
|
| 145 |
|
| 146 |
yield (
|
| 147 |
messages,
|
| 148 |
-
gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
|
| 149 |
f"{dataset_id.replace('/', '-')}-eda.ipynb",
|
| 150 |
)
|
| 151 |
|
| 152 |
|
| 153 |
-
def generate_rag_cells(dataset_id
|
| 154 |
for messages in generate_cells(dataset_id, generate_rag_system_prompt, "rag"):
|
| 155 |
-
yield messages,
|
| 156 |
|
| 157 |
yield (
|
| 158 |
messages,
|
| 159 |
-
gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
|
| 160 |
f"{dataset_id.replace('/', '-')}-rag.ipynb",
|
| 161 |
)
|
| 162 |
|
| 163 |
|
| 164 |
-
def generate_embedding_cells(dataset_id
|
| 165 |
for messages in generate_cells(
|
| 166 |
dataset_id, generate_embedding_system_prompt, "embedding"
|
| 167 |
):
|
| 168 |
-
yield messages,
|
| 169 |
|
| 170 |
yield (
|
| 171 |
messages,
|
| 172 |
-
gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
|
| 173 |
f"{dataset_id.replace('/', '-')}-embedding.ipynb",
|
| 174 |
)
|
| 175 |
|
| 176 |
|
| 177 |
-
def
|
| 178 |
history,
|
| 179 |
dataset_id,
|
| 180 |
notebook_file,
|
| 181 |
-
profile: gr.OAuthProfile | None,
|
| 182 |
-
oauth_token: gr.OAuthToken | None,
|
| 183 |
):
|
| 184 |
logging.info(f"Pushing notebook to hub: {dataset_id} on file {notebook_file}")
|
| 185 |
-
if not profile or not oauth_token:
|
| 186 |
-
yield history + [
|
| 187 |
-
gr.ChatMessage(role="assistant", content="⏳ _Login to push to hub..._")
|
| 188 |
-
]
|
| 189 |
-
return
|
| 190 |
-
logging.info(f"Profile: {profile}, token: {oauth_token.token}")
|
| 191 |
|
| 192 |
-
notebook_name =
|
| 193 |
-
api = HfApi(token=
|
| 194 |
try:
|
| 195 |
-
logging.info(f"About to push {notebook_file} - {
|
| 196 |
api.upload_file(
|
| 197 |
path_or_fileobj=notebook_file,
|
| 198 |
path_in_repo=notebook_name,
|
| 199 |
-
repo_id=
|
| 200 |
repo_type="dataset",
|
| 201 |
)
|
| 202 |
-
link = f"https://huggingface.co/datasets/{
|
| 203 |
logging.info(f"Notebook pushed to hub: {link}")
|
| 204 |
yield history + [
|
| 205 |
gr.ChatMessage(
|
| 206 |
role="user",
|
| 207 |
-
content=f"[
|
| 208 |
)
|
| 209 |
]
|
|
|
|
| 210 |
except Exception as e:
|
| 211 |
logging.info("Failed to push notebook", e)
|
| 212 |
yield history + [gr.ChatMessage(role="assistant", content=e)]
|
|
@@ -292,31 +295,50 @@ def generate_cells(dataset_id, prompt_fn, notebook_type="eda"):
|
|
| 292 |
notebook_name = f"{dataset_id.replace('/', '-')}-{notebook_type}.ipynb"
|
| 293 |
create_notebook_file(commands, notebook_name=notebook_name)
|
| 294 |
messages.append(
|
| 295 |
-
gr.ChatMessage(role="user", content="
|
| 296 |
-
)
|
| 297 |
-
yield messages
|
| 298 |
-
messages.append(
|
| 299 |
-
gr.ChatMessage(
|
| 300 |
-
role="user",
|
| 301 |
-
content=FileData(path=notebook_name, mime_type="application/x-ipynb+json"),
|
| 302 |
-
)
|
| 303 |
)
|
| 304 |
yield messages
|
|
|
|
| 305 |
|
| 306 |
|
| 307 |
def coming_soon_message():
|
| 308 |
return gr.Info("Coming soon")
|
| 309 |
|
| 310 |
|
| 311 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
|
| 313 |
-
with gr.Row():
|
| 314 |
-
with gr.Column(scale=
|
|
|
|
|
|
|
| 315 |
dataset_name = HuggingfaceHubSearch(
|
| 316 |
label="Hub Dataset ID",
|
| 317 |
placeholder="Search for dataset id on Huggingface",
|
| 318 |
search_type="dataset",
|
| 319 |
-
value="",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
)
|
| 321 |
|
| 322 |
@gr.render(inputs=dataset_name)
|
|
@@ -334,50 +356,44 @@ with gr.Blocks(fill_height=True) as demo:
|
|
| 334 |
return gr.HTML(value=html_code)
|
| 335 |
|
| 336 |
with gr.Row():
|
| 337 |
-
generate_eda_btn = gr.Button("
|
| 338 |
-
generate_embedding_btn = gr.Button("
|
| 339 |
-
generate_rag_btn = gr.Button("
|
| 340 |
-
generate_training_btn = gr.Button(
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
type="messages",
|
| 345 |
-
avatar_images=(
|
| 346 |
-
None,
|
| 347 |
-
None,
|
| 348 |
-
),
|
| 349 |
-
)
|
| 350 |
with gr.Row():
|
| 351 |
-
|
| 352 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
notebook_file = gr.File(visible=False)
|
| 354 |
generate_eda_btn.click(
|
| 355 |
generate_eda_cells,
|
| 356 |
inputs=[dataset_name],
|
| 357 |
-
outputs=[chatbot,
|
| 358 |
)
|
| 359 |
|
| 360 |
-
|
| 361 |
-
|
| 362 |
inputs=[dataset_name],
|
| 363 |
-
outputs=[chatbot,
|
| 364 |
)
|
| 365 |
|
| 366 |
-
|
| 367 |
-
|
| 368 |
inputs=[dataset_name],
|
| 369 |
-
outputs=[chatbot,
|
| 370 |
)
|
| 371 |
|
| 372 |
generate_training_btn.click(coming_soon_message, inputs=[], outputs=[])
|
| 373 |
-
|
| 374 |
-
push_to_hub,
|
| 375 |
-
inputs=[
|
| 376 |
-
chatbot,
|
| 377 |
-
dataset_name,
|
| 378 |
-
notebook_file,
|
| 379 |
-
],
|
| 380 |
-
outputs=[chatbot],
|
| 381 |
-
)
|
| 382 |
|
| 383 |
demo.launch()
|
|
|
|
| 16 |
generate_eda_system_prompt,
|
| 17 |
generate_embedding_system_prompt,
|
| 18 |
)
|
| 19 |
+
from dotenv import load_dotenv
|
| 20 |
+
import os
|
| 21 |
|
| 22 |
"""
|
| 23 |
TODOs:
|
|
|
|
| 38 |
"""
|
| 39 |
|
| 40 |
# Configuration
|
| 41 |
+
|
| 42 |
+
load_dotenv()
|
| 43 |
+
|
| 44 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 45 |
+
NOTEBOOKS_REPOSITORY = os.getenv("NOTEBOOKS_REPOSITORY")
|
| 46 |
+
assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
|
| 47 |
+
assert (
|
| 48 |
+
NOTEBOOKS_REPOSITORY is not None
|
| 49 |
+
), "You need to set NOTEBOOKS_REPOSITORY in your environment variables"
|
| 50 |
+
|
| 51 |
+
|
| 52 |
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
|
| 53 |
HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
|
| 54 |
|
|
|
|
| 114 |
return content
|
| 115 |
except Exception as e:
|
| 116 |
gr.Error("Error when parsing notebook, try again.")
|
| 117 |
+
logging.error(f"Failed to parse code: {e}")
|
| 118 |
raise
|
| 119 |
|
| 120 |
|
|
|
|
| 152 |
return match.group(1)
|
| 153 |
|
| 154 |
|
| 155 |
+
def generate_eda_cells(dataset_id):
|
| 156 |
for messages in generate_cells(dataset_id, generate_eda_system_prompt, "eda"):
|
| 157 |
+
yield messages, None # Keep button hidden
|
| 158 |
|
| 159 |
yield (
|
| 160 |
messages,
|
|
|
|
| 161 |
f"{dataset_id.replace('/', '-')}-eda.ipynb",
|
| 162 |
)
|
| 163 |
|
| 164 |
|
| 165 |
+
def generate_rag_cells(dataset_id):
|
| 166 |
for messages in generate_cells(dataset_id, generate_rag_system_prompt, "rag"):
|
| 167 |
+
yield messages, None # Keep button hidden
|
| 168 |
|
| 169 |
yield (
|
| 170 |
messages,
|
|
|
|
| 171 |
f"{dataset_id.replace('/', '-')}-rag.ipynb",
|
| 172 |
)
|
| 173 |
|
| 174 |
|
| 175 |
+
def generate_embedding_cells(dataset_id):
|
| 176 |
for messages in generate_cells(
|
| 177 |
dataset_id, generate_embedding_system_prompt, "embedding"
|
| 178 |
):
|
| 179 |
+
yield messages, None # Keep button hidden
|
| 180 |
|
| 181 |
yield (
|
| 182 |
messages,
|
|
|
|
| 183 |
f"{dataset_id.replace('/', '-')}-embedding.ipynb",
|
| 184 |
)
|
| 185 |
|
| 186 |
|
| 187 |
+
def _push_to_hub(
|
| 188 |
history,
|
| 189 |
dataset_id,
|
| 190 |
notebook_file,
|
|
|
|
|
|
|
| 191 |
):
|
| 192 |
logging.info(f"Pushing notebook to hub: {dataset_id} on file {notebook_file}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
+
notebook_name = notebook_file.split("/")[-1]
|
| 195 |
+
api = HfApi(token=HF_TOKEN)
|
| 196 |
try:
|
| 197 |
+
logging.info(f"About to push {notebook_file} - {dataset_id}")
|
| 198 |
api.upload_file(
|
| 199 |
path_or_fileobj=notebook_file,
|
| 200 |
path_in_repo=notebook_name,
|
| 201 |
+
repo_id=NOTEBOOKS_REPOSITORY,
|
| 202 |
repo_type="dataset",
|
| 203 |
)
|
| 204 |
+
link = f"https://huggingface.co/datasets/{NOTEBOOKS_REPOSITORY}/blob/main/{notebook_name}"
|
| 205 |
logging.info(f"Notebook pushed to hub: {link}")
|
| 206 |
yield history + [
|
| 207 |
gr.ChatMessage(
|
| 208 |
role="user",
|
| 209 |
+
content=f"[{notebook_name}]({link})",
|
| 210 |
)
|
| 211 |
]
|
| 212 |
+
|
| 213 |
except Exception as e:
|
| 214 |
logging.info("Failed to push notebook", e)
|
| 215 |
yield history + [gr.ChatMessage(role="assistant", content=e)]
|
|
|
|
| 295 |
notebook_name = f"{dataset_id.replace('/', '-')}-{notebook_type}.ipynb"
|
| 296 |
create_notebook_file(commands, notebook_name=notebook_name)
|
| 297 |
messages.append(
|
| 298 |
+
gr.ChatMessage(role="user", content="See the generated notebook on the Hub")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
)
|
| 300 |
yield messages
|
| 301 |
+
yield from _push_to_hub(messages, dataset_id, notebook_name)
|
| 302 |
|
| 303 |
|
| 304 |
def coming_soon_message():
|
| 305 |
return gr.Info("Coming soon")
|
| 306 |
|
| 307 |
|
| 308 |
+
def handle_example(example, button_action):
|
| 309 |
+
return button_action(example)
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
with gr.Blocks(fill_width=True) as demo:
|
| 313 |
gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
|
| 314 |
+
with gr.Row(equal_height=True):
|
| 315 |
+
with gr.Column(scale=2):
|
| 316 |
+
text_input = gr.Textbox(label="Suggested notebook type", visible=False)
|
| 317 |
+
|
| 318 |
dataset_name = HuggingfaceHubSearch(
|
| 319 |
label="Hub Dataset ID",
|
| 320 |
placeholder="Search for dataset id on Huggingface",
|
| 321 |
search_type="dataset",
|
| 322 |
+
value="jamescalam/world-cities-geo",
|
| 323 |
+
)
|
| 324 |
+
|
| 325 |
+
dataset_samples = gr.Examples(
|
| 326 |
+
examples=[
|
| 327 |
+
[
|
| 328 |
+
"infinite-dataset-hub/WorldPopCounts",
|
| 329 |
+
"Try this dataset for Exploratory Data Analysis",
|
| 330 |
+
],
|
| 331 |
+
[
|
| 332 |
+
"infinite-dataset-hub/GlobaleCuisineRecipes",
|
| 333 |
+
"Try this dataset for Embeddings generation",
|
| 334 |
+
],
|
| 335 |
+
[
|
| 336 |
+
"infinite-dataset-hub/GlobalBestSellersSummaries",
|
| 337 |
+
"Try this dataset for RAG generation",
|
| 338 |
+
],
|
| 339 |
+
],
|
| 340 |
+
inputs=[dataset_name, text_input],
|
| 341 |
+
cache_examples=False,
|
| 342 |
)
|
| 343 |
|
| 344 |
@gr.render(inputs=dataset_name)
|
|
|
|
| 356 |
return gr.HTML(value=html_code)
|
| 357 |
|
| 358 |
with gr.Row():
|
| 359 |
+
generate_eda_btn = gr.Button("Exploratory Data Analysis")
|
| 360 |
+
generate_embedding_btn = gr.Button("Data Embeddings")
|
| 361 |
+
generate_rag_btn = gr.Button("RAG")
|
| 362 |
+
generate_training_btn = gr.Button(
|
| 363 |
+
"Training - Coming soon", interactive=False
|
| 364 |
+
)
|
| 365 |
+
with gr.Column(scale=1):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
with gr.Row():
|
| 367 |
+
chatbot = gr.Chatbot(
|
| 368 |
+
label="Results",
|
| 369 |
+
type="messages",
|
| 370 |
+
height=650,
|
| 371 |
+
avatar_images=(
|
| 372 |
+
None,
|
| 373 |
+
None,
|
| 374 |
+
),
|
| 375 |
+
)
|
| 376 |
+
|
| 377 |
notebook_file = gr.File(visible=False)
|
| 378 |
generate_eda_btn.click(
|
| 379 |
generate_eda_cells,
|
| 380 |
inputs=[dataset_name],
|
| 381 |
+
outputs=[chatbot, notebook_file],
|
| 382 |
)
|
| 383 |
|
| 384 |
+
generate_embedding_btn.click(
|
| 385 |
+
generate_embedding_cells,
|
| 386 |
inputs=[dataset_name],
|
| 387 |
+
outputs=[chatbot, notebook_file],
|
| 388 |
)
|
| 389 |
|
| 390 |
+
generate_rag_btn.click(
|
| 391 |
+
generate_rag_cells,
|
| 392 |
inputs=[dataset_name],
|
| 393 |
+
outputs=[chatbot, notebook_file],
|
| 394 |
)
|
| 395 |
|
| 396 |
generate_training_btn.click(coming_soon_message, inputs=[], outputs=[])
|
| 397 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 398 |
|
| 399 |
demo.launch()
|
utils/prompts.py
CHANGED
|
@@ -6,8 +6,9 @@ def generate_mapping_prompt(code):
|
|
| 6 |
"""Format the following python code to a list of cells to be used in a jupyter notebook:
|
| 7 |
{{ code }}
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
|
|
|
| 11 |
|
| 12 |
```json
|
| 13 |
[
|
|
@@ -42,7 +43,7 @@ def generate_eda_system_prompt():
|
|
| 42 |
You create Exploratory Data Analysis jupyter notebooks with the following content:
|
| 43 |
|
| 44 |
1. Install an import libraries
|
| 45 |
-
2. Load the
|
| 46 |
3. Understand the dataset
|
| 47 |
4. Check for missing values
|
| 48 |
5. Identify the data types of each column
|
|
@@ -70,12 +71,12 @@ def generate_eda_system_prompt():
|
|
| 70 |
@outlines.prompt
|
| 71 |
def generate_embedding_system_prompt():
|
| 72 |
"""You are an expert data scientist tasked with generating a Jupyter notebook to generate embeddings on a specific dataset.
|
| 73 |
-
You
|
| 74 |
You create a jupyter notebooks with the following content:
|
| 75 |
|
| 76 |
-
1. Install libraries
|
| 77 |
2. Import libraries
|
| 78 |
-
3. Load dataset as dataframe
|
| 79 |
4. Choose column to be used for the embeddings
|
| 80 |
5. Remove duplicate data
|
| 81 |
6. Load column as a list
|
|
@@ -103,12 +104,13 @@ def generate_embedding_system_prompt():
|
|
| 103 |
def generate_rag_system_prompt():
|
| 104 |
"""You are an expert machine learning engineer tasked with generating a Jupyter notebook to showcase a Retrieval-Augmented Generation (RAG) system based on a specific dataset.
|
| 105 |
The data is provided as a pandas DataFrame with the following structure:
|
|
|
|
| 106 |
|
| 107 |
You create Exploratory RAG jupyter notebooks with the following content:
|
| 108 |
|
| 109 |
1. Install libraries
|
| 110 |
2. Import libraries
|
| 111 |
-
3. Load dataset as dataframe
|
| 112 |
4. Choose column to be used for the embeddings
|
| 113 |
5. Remove duplicate data
|
| 114 |
6. Load column as a list
|
|
@@ -116,8 +118,8 @@ def generate_rag_system_prompt():
|
|
| 116 |
8. Create FAISS index
|
| 117 |
9. Ask a query sample and encode it
|
| 118 |
10. Search similar documents based on the query sample and the FAISS index
|
| 119 |
-
11. Load HuggingFaceH4/zephyr-7b-beta model from transformers library and create a pipeline
|
| 120 |
-
12. Create a prompt with two parts: 'system' to give instructions to answer a question based on a 'context' that is the retrieved similar
|
| 121 |
13. Send the prompt to the pipeline and show answer
|
| 122 |
|
| 123 |
Ensure the notebook is well-organized, with explanations for each step.
|
|
|
|
| 6 |
"""Format the following python code to a list of cells to be used in a jupyter notebook:
|
| 7 |
{{ code }}
|
| 8 |
|
| 9 |
+
## Instruction
|
| 10 |
+
Before returning the result, evaluate if the json object is well formatted, if not, fix it.
|
| 11 |
+
The output should be a list of json objects with the following schema, including the leading and trailing "```json" and "```":
|
| 12 |
|
| 13 |
```json
|
| 14 |
[
|
|
|
|
| 43 |
You create Exploratory Data Analysis jupyter notebooks with the following content:
|
| 44 |
|
| 45 |
1. Install an import libraries
|
| 46 |
+
2. Load dataset as dataframe using the provided loading data code snippet
|
| 47 |
3. Understand the dataset
|
| 48 |
4. Check for missing values
|
| 49 |
5. Identify the data types of each column
|
|
|
|
| 71 |
@outlines.prompt
|
| 72 |
def generate_embedding_system_prompt():
|
| 73 |
"""You are an expert data scientist tasked with generating a Jupyter notebook to generate embeddings on a specific dataset.
|
| 74 |
+
You must use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model and 'faiss-cpu' to create the index.
|
| 75 |
You create a jupyter notebooks with the following content:
|
| 76 |
|
| 77 |
+
1. Install libraries as !pip install
|
| 78 |
2. Import libraries
|
| 79 |
+
3. Load dataset as dataframe using the provided loading data code snippet
|
| 80 |
4. Choose column to be used for the embeddings
|
| 81 |
5. Remove duplicate data
|
| 82 |
6. Load column as a list
|
|
|
|
| 104 |
def generate_rag_system_prompt():
|
| 105 |
"""You are an expert machine learning engineer tasked with generating a Jupyter notebook to showcase a Retrieval-Augmented Generation (RAG) system based on a specific dataset.
|
| 106 |
The data is provided as a pandas DataFrame with the following structure:
|
| 107 |
+
You can use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index and 'transformers' for inference.
|
| 108 |
|
| 109 |
You create Exploratory RAG jupyter notebooks with the following content:
|
| 110 |
|
| 111 |
1. Install libraries
|
| 112 |
2. Import libraries
|
| 113 |
+
3. Load dataset as dataframe using the provided loading data code snippet
|
| 114 |
4. Choose column to be used for the embeddings
|
| 115 |
5. Remove duplicate data
|
| 116 |
6. Load column as a list
|
|
|
|
| 118 |
8. Create FAISS index
|
| 119 |
9. Ask a query sample and encode it
|
| 120 |
10. Search similar documents based on the query sample and the FAISS index
|
| 121 |
+
11. Load 'HuggingFaceH4/zephyr-7b-beta model' from transformers library and create a pipeline
|
| 122 |
+
12. Create a prompt with two parts: 'system' to give instructions to answer a question based on a 'context' that is the retrieved similar documents and a 'user' part with the query
|
| 123 |
13. Send the prompt to the pipeline and show answer
|
| 124 |
|
| 125 |
Ensure the notebook is well-organized, with explanations for each step.
|