Spaces:
Runtime error
Runtime error
import os

from google import genai
# SECURITY: the API key must never be committed to source control. It is read
# from the environment instead (on Hugging Face Spaces, configure it as a
# Space secret). Any key that was previously hard-coded here should be
# considered leaked and revoked.
_API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _API_KEY:
    raise RuntimeError(
        "No Gemini API key found: set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
        "environment variable before starting the app."
    )

# Shared Gemini client used by every model call in this file.
client = genai.Client(api_key=_API_KEY)

# Identifier of the model that all prompts are sent to.
MODEL_ID = "gemini-2.0-flash-exp"
def model_response(text):
    """Send a prompt to the configured Gemini model and return its reply.

    Args:
        text: Prompt passed as the ``contents`` of the generation request.

    Returns:
        The ``text`` attribute of the response object returned by
        ``client.models.generate_content`` for ``MODEL_ID``.
    """
    reply = client.models.generate_content(model=MODEL_ID, contents=text)
    return reply.text
def generate_dataset_queries(dataset_id, generated_glossary, schema_relationships):
    """Ask the model for exploratory BigQuery queries over a dataset.

    Args:
        dataset_id: Value interpolated into the prompt as "the dataset".
        generated_glossary: Glossary text interpolated into the prompt.
        schema_relationships: Relationship text interpolated into the prompt.

    Returns:
        Whatever ``model_response`` returns for the assembled prompt.
    """
    prompt = f'''You are an expert in drafting BQ queries. Look at the dataset: {dataset_id}, look at the glossary: {generated_glossary} and {schema_relationships} and
recommend interesting data exploration queries. Format:
Query description in the form of a single line question
Actual query'''
    return model_response(prompt)
def generate_lookml(dataset_id, generated_glossary, schema_relationships):
    """Ask the model for a LookML semantic model describing a dataset.

    Args:
        dataset_id: Value interpolated into the prompt as "the dataset".
        generated_glossary: Glossary text interpolated into the prompt.
        schema_relationships: Relationship text interpolated into the prompt.

    Returns:
        Whatever ``model_response`` returns for the assembled prompt.
    """
    prompt = f'''You are an expert in drafting LookML models. Look at the dataset: {dataset_id}, look at the glossary: {generated_glossary} and {schema_relationships} and
recommend the LookML semantic model corresponding to the dataset. '''
    return model_response(prompt)
def run(DATASET_ID):
    """Generate dataset artefacts step by step, yielding progress after each.

    This is a generator. Every yielded value is a 7-tuple in the fixed order
    ``(description, relationships, knowledge_graph, glossary, queries,
    lookml, data_prep_pipeline)``; entries that have not been produced yet
    are ``None``. Each step is one call to the model, and later prompts embed
    the text returned by earlier steps.

    Args:
        DATASET_ID: Dataset name/topic interpolated into the first prompt.

    Yields:
        The 7-tuple described above, progressively filled in.

    Returns:
        The final, fully populated 7-tuple. Because this is a generator the
        value is only observable through ``StopIteration.value`` or
        ``yield from``; it is kept for backward compatibility.
    """
    # Step 0: synthesise a schema for the dataset; this text feeds later prompts.
    dataset = model_response(f'''You are an expert in BQ public datasets. Generate a dataset schema related to {DATASET_ID}. You need to come up with atleast 5 tables with each table
containing atleast 10 columns along with their descriptions.Ensure that these tables have columns that talk about data quality issues.''')

    # Step 1: short human-readable description of the generated schema.
    description = model_response(f'''Generate a succinct 3-4 line description of the dataset: {dataset}.''')
    yield description, None, None, None, None, None, None

    # Step 2: relationships between the tables, each with a validating query.
    relationships = model_response(f'''Based on the dataset provided: {dataset}, identify all the possible relationships
that exist between the tables in the dataset. Discover these relationships from
the point of view of data exploration.
Output:
List of relationships along with the description which is the business value of the relationship and a query
with description that validates the relationship.
Ensure that the column names and table names are accurate.''')
    yield description, relationships, None, None, None, None, None

    # Step 3: ASCII knowledge graph (plus description) built from the relationships.
    knowledge_graph = model_response(f'''Based on the context: {relationships}, generate a knowledge graph represented using ASCII art. Also generate a brief description of the graph.
Output:
Description of the graph listing all the relationships in markdown format
ASCII version of the knowledge graph with nodes represented by tables and edges represented by the relationships. Edges should be annotated with the type of relationships identified - many-to-one, many-to-many, one-to-one, primary key, self joins, foreign keys etc''')
    yield description, relationships, knowledge_graph, None, None, None, None

    # Step 4: business glossary derived from the graph and the description.
    glossary = model_response(f'''Based on the relationships identified: {knowledge_graph}
and the dataset: {description}, generate glossary terms that will help business users easily find the tables in the dataset.
## Task
- Your goal is to create a business glossary for the data in this dataset, aligned with the definition of business glossary specified above.
- Provide each business term in a newline, along with the definition.
- Include examples in the term definitions, wherever suitable.
- Make sure the business terms are relevant as per the table and column names and descriptions, and relevant to the domain to which the data belongs.
- Also include a few business terms around the users/clients and around 5 key metrics in the domain of the data.
- After defining the terms, identify the relationships between the business terms identified previously.
## Output format
Ensure that the output is in markdown format with proper indentation
- Output each business glossary term definition in a newline in the folowing format:
term: definition
- For the business terms which are the key metrics in the business domain, mark such terms by adding "[METRIC]" in the beginning of the line, in the following format:
[METRIC] term: definition
- Then print a header to indicate the end of this section and start of the relationships section.
- Then output the relationships between the business terms as follows:
term -> [related_term1, related_term2]
Show the relationship between the glossary term and the column broken down by each table.
''')
    yield description, relationships, knowledge_graph, glossary, None, None, None

    # Step 5: exploration queries. Note the full schema text (not the bare
    # DATASET_ID) is what gets passed as the helper's ``dataset_id`` argument.
    queries = generate_dataset_queries(dataset, glossary, knowledge_graph)
    yield description, relationships, knowledge_graph, glossary, queries, None, None

    # Step 6: LookML model. Padded to seven entries so that every yield has
    # the same shape (this yield previously emitted only six values).
    lookml = generate_lookml(dataset, glossary, knowledge_graph)
    yield description, relationships, knowledge_graph, glossary, queries, lookml, None

    # Step 7: data-preparation pipeline addressing data quality issues.
    data_prep_pipeline = model_response(f'''Given the dataset: {dataset}, schema relationships: {relationships} and graph:{knowledge_graph}, generate a data preparation pipeline that
fixes the possible data quality issues across the tables in the dataset.''')
    final = (
        description,
        relationships,
        knowledge_graph,
        glossary,
        queries,
        lookml,
        data_prep_pipeline,
    )
    yield final
    return final
def wrapper(dataset_id):
    """Adapt ``run`` to the six-output Gradio interface.

    For every progress tuple produced by ``run(dataset_id)`` this yields a
    new tuple holding only its first six entries; anything beyond index 5 is
    dropped.

    Args:
        dataset_id: Forwarded unchanged to ``run``.

    Yields:
        A 6-tuple made of entries 0 through 5 of each tuple from ``run``.
    """
    for snapshot in run(dataset_id):
        # As produced by ``run``: 0 = dataset description, 1 = table
        # relationships, 2 = knowledge graph, 3 = generated glossary,
        # 4 = queries, 5 = LookML model.
        yield tuple(snapshot[position] for position in range(6))
| import gradio as gr | |
# Gradio UI: one text input (the dataset ID) and six outputs that are filled
# in progressively as ``wrapper`` yields. The output components are matched
# to the yielded tuple by position, so their order must follow what ``run``
# produces: index 1 is the list of table relationships and index 2 is the
# ASCII knowledge graph (these two labels were previously swapped).
iface = gr.Interface(
    fn=wrapper,
    inputs=gr.Textbox(label="Dataset ID"),
    outputs=[
        gr.Markdown(label="Dataset description"),
        gr.Markdown(label="Schema Relationships"),
        gr.Markdown(label="Knowledge Graph"),
        gr.Markdown(label="Generated Glossary"),
        gr.Textbox(label="Queries"),
        gr.Markdown(label="LookML Model"),
    ],
    live=False,
    theme=gr.themes.Ocean(),
    title="Dataplex knowledge engine ⚙️💡📊 (Simulator)",
    description="Provide a dataset ID to generate LookML, schema relationships, glossary, and more...",
    examples=['ncaa_basketball2', 'thelook_ecommerce', 'geo_openstreetmap', 'google_political_ads', 'noaa_historic_severe_storms', 'stackoverflow'],
    article="This is a simulator that provides a sneak-peek into how BQ knowledge engine works.",
)

# Launch the app
iface.launch(share=True, debug=True)