Spaces:
Sleeping
Sleeping
def replace_wildcards(templates, wildcards, replacements):
    """Return copies of *templates* with every wildcard substituted.

    Each template is a mapping with at least the keys ``"cell_type"`` and
    ``"source"``. Only ``"source"`` is rewritten; the input mappings are
    never mutated and the returned dicts contain just those two keys.

    Substitution happens in a single pass over each source string, so text
    inserted by one replacement is never scanned again. (Chained
    ``str.replace`` calls would re-substitute a replacement that happens to
    contain a later wildcard.)

    Args:
        templates: Iterable of cell dicts (``{"cell_type": ..., "source": ...}``).
        wildcards: Sequence of literal placeholder strings to look for.
        replacements: Sequence of strings, positionally paired with
            *wildcards*.

    Returns:
        A new list of ``{"cell_type": ..., "source": ...}`` dicts.

    Raises:
        ValueError: If *wildcards* and *replacements* differ in length.
    """
    # Function-scope import: no top-of-file import block is visible to extend.
    import re

    if len(wildcards) != len(replacements):
        raise ValueError(
            "The number of wildcards must match the number of replacements."
        )

    # For a duplicated wildcard the first pairing wins, which is what chained
    # str.replace calls did (later duplicates found nothing left to replace).
    # Empty wildcards are ignored: they would match between every character.
    mapping = {}
    for wildcard, replacement in zip(wildcards, replacements):
        if wildcard:
            mapping.setdefault(wildcard, replacement)

    if not mapping:
        return [
            {"cell_type": tmp["cell_type"], "source": tmp["source"]}
            for tmp in templates
        ]

    # Alternation keeps the caller's order, so an earlier wildcard is preferred
    # when two wildcards could match at the same position.
    pattern = re.compile("|".join(re.escape(wildcard) for wildcard in mapping))

    def substitute(match):
        # A callable replacement is inserted verbatim, so backslashes in the
        # replacement text are not interpreted as regex escapes.
        return mapping[match.group(0)]

    return [
        {
            "cell_type": tmp["cell_type"],
            "source": pattern.sub(substitute, tmp["source"]),
        }
        for tmp in templates
    ]
# Initial cells for the RAG notebook: a markdown title followed by one empty
# code cell.
rag_cells = [
    dict(
        cell_type="markdown",
        source="# Retrieval-Augmented Generation (RAG) System Notebook",
    ),
    dict(cell_type="code", source=""),
]
# Initial cells for the embeddings notebook: a markdown title followed by one
# empty code cell.
embeddings_cells = [
    {
        "cell_type": "markdown",
        "source": "# Embeddings Generation Notebook",
    },
    {"cell_type": "code", "source": ""},
]
# The original, misspelled name is kept as an alias to the same list so that
# existing references to `embeggins_cells` keep working.
embeggins_cells = embeddings_cells
# Template cells for the exploratory-data-analysis (EDA) notebook.
#
# The sources are plain strings (not f-strings) containing the literal
# placeholders {dataset_name}, {html_code} and {first_code}; presumably they
# are filled in through replace_wildcards -- confirm against the caller.
# NOTE(review): {html_code} lands inside a double-quoted Python string in the
# generated cell, so a replacement containing `"` would yield invalid code.
eda_cells = [
    {
        "cell_type": "markdown",
        "source": "# Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset",
    },
    {
        "cell_type": "code",
        "source": """
from IPython.display import HTML
display(HTML("{html_code}"))
""",
    },
    {
        "cell_type": "code",
        "source": """
# 1. Install and import necessary libraries.
!pip install pandas matplotlib seaborn
""",
    },
    {
        "cell_type": "code",
        "source": """
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
""",
    },
    {
        "cell_type": "code",
        "source": """
# 2. Load the dataset as a DataFrame using the provided code
{first_code}
""",
    },
    {
        # DataFrame.info() writes its summary itself and returns None, so it
        # is called bare; wrapping it in print() emitted a stray "None".
        "cell_type": "code",
        "source": """
# 3. Understand the dataset structure
print(df.head())
df.info()
print(df.describe())
""",
    },
    {
        "cell_type": "code",
        "source": """
# 4. Check for missing values
print(df.isnull().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# 5. Identify data types of each column
print(df.dtypes)
""",
    },
    {
        "cell_type": "code",
        "source": """
# 6. Detect duplicated rows
print(df.duplicated().sum())
""",
    },
    {
        "cell_type": "code",
        "source": """
# 7. Generate descriptive statistics
print(df.describe())
""",
    },
    {
        "cell_type": "code",
        "source": """
# 8. Visualize the distribution of each column.
# TODO: Add code to visualize the distribution of each column.
# 9. Explore relationships between columns.
# TODO: Add code to explore relationships between columns.
# 10. Perform correlation analysis.
# TODO: Add code to perform correlation analysis.
""",
    },
]
def generate_embedding_system_prompt():
    """Return the system prompt asking a model to write an embeddings notebook.

    The prompt text used to be this function's docstring, which made the
    function return ``None``; it is now returned as a string.

    Returns:
        The prompt text, ending with a newline.
    """
    # The literal's continuation lines start at column 0 on purpose, so the
    # prompt carries no indentation from this function body.
    return """You are an expert data scientist tasked with creating a Jupyter notebook to generate embeddings for a specific dataset.
Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, and 'faiss-cpu' to create the index.
The notebook should include:
1. Install necessary libraries with !pip install.
2. Import libraries.
3. Load the dataset as a DataFrame using the provided code.
4. Select the column to generate embeddings.
5. Remove duplicate data.
6. Convert the selected column to a list.
7. Load the sentence-transformers model.
8. Create a FAISS index.
9. Encode a query sample.
10. Search for similar documents using the FAISS index.
Ensure the notebook is well-organized with explanations for each step.
The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
The user will provide dataset information in the following format:
## Columns and Data Types
## Sample Data
## Loading Data code
Use the provided code to load the dataset; do not use any other method.
"""
def generate_rag_system_prompt():
    """Return the system prompt asking a model to write a RAG notebook.

    The prompt text used to be this function's docstring, which made the
    function return ``None``; it is now returned as a string.

    Returns:
        The prompt text, ending with a newline.
    """
    # The literal's continuation lines start at column 0 on purpose, so the
    # prompt carries no indentation from this function body.
    return """You are an expert machine learning engineer tasked with creating a Jupyter notebook to demonstrate a Retrieval-Augmented Generation (RAG) system using a specific dataset.
The dataset is provided as a pandas DataFrame.
Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index, and 'transformers' for inference.
The RAG notebook should include:
1. Install necessary libraries.
2. Import libraries.
3. Load the dataset as a DataFrame using the provided code.
4. Select the column for generating embeddings.
5. Remove duplicate data.
6. Convert the selected column to a list.
7. Load the sentence-transformers model.
8. Create a FAISS index.
9. Encode a query sample.
10. Search for similar documents using the FAISS index.
11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
13. Send the prompt to the pipeline and display the answer.
Ensure the notebook is well-organized with explanations for each step.
The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
The user will provide the dataset information in the following format:
## Columns and Data Types
## Sample Data
## Loading Data code
Use the provided code to load the dataset; do not use any other method.
"""