Spaces:

agentharbor
/

bke

Runtime error

App Files Files Community

bke / app.py

agentharbor

Update app.py

fd7f0e9 verified 4 months ago

raw

history blame contribute delete

6.76 kB

	import os

	from google import genai
	# SECURITY: the Gemini API key used to be hard-coded on this line. A key that
	# has been committed to a repository must be treated as leaked: revoke it and
	# issue a new one. The key is now taken from the environment (for example a
	# Space/CI secret) so that it never lives in source control.
	_API_KEY = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
	if not _API_KEY:
		# Fail at start-up with an actionable message instead of on the first request.
		raise RuntimeError(
			"No Gemini API key found: set the GEMINI_API_KEY (or GOOGLE_API_KEY) "
			"environment variable."
		)
	# Shared client used by model_response() for every generate_content call.
	client = genai.Client(api_key=_API_KEY)
	# Identifier of the model that every prompt in this module is sent to.
	MODEL_ID = "gemini-2.0-flash-exp"
	def model_response(text):
		"""Send a prompt to the model and return the text of its reply.

		Args:
			text: Prompt passed as ``contents`` to ``client.models.generate_content``.

		Returns:
			The ``text`` attribute of the response returned for ``MODEL_ID``.
		"""
		reply = client.models.generate_content(model=MODEL_ID, contents=text)
		return reply.text

	def generate_dataset_queries(dataset_id, generated_glossary, schema_relationships):
		"""Ask the model to recommend data exploration queries for a dataset.

		Args:
			dataset_id: Text interpolated into the prompt as the dataset.
			generated_glossary: Text interpolated into the prompt as the glossary.
			schema_relationships: Relationship text interpolated into the prompt.

		Returns:
			Whatever ``model_response`` returns for the assembled prompt.
		"""
		# Continuation lines of the prompt keep their original leading whitespace,
		# because that whitespace is part of the string sent to the model.
		prompt = f'''You are an expert in drafting BQ queries. Look at the dataset: {dataset_id}, look at the glossary: {generated_glossary} and {schema_relationships} and
	recommend interesting data exploration queries. Format:
	Query description in the form of a single line question
	Actual query'''
		return model_response(prompt)

	def generate_lookml(dataset_id, generated_glossary, schema_relationships):
		"""Ask the model to recommend a LookML semantic model for a dataset.

		Args:
			dataset_id: Text interpolated into the prompt as the dataset.
			generated_glossary: Text interpolated into the prompt as the glossary.
			schema_relationships: Relationship text interpolated into the prompt.

		Returns:
			Whatever ``model_response`` returns for the assembled prompt.
		"""
		# The second prompt line keeps its original leading whitespace and trailing
		# space; both are part of the string sent to the model.
		prompt = f'''You are an expert in drafting LookML models. Look at the dataset: {dataset_id}, look at the glossary: {generated_glossary} and {schema_relationships} and
	recommend the LookML semantic model corresponding to the dataset. '''
		return model_response(prompt)

	def run(DATASET_ID):
		"""Generate the knowledge artifacts for a dataset, yielding progress as it goes.

		This is a generator. After each model call it yields a 7-tuple holding
		everything produced so far, with ``None`` in the slots not yet filled:

			0. dataset_description - short description of the generated schema
			1. py                  - table relationships with validation queries
			2. schema_relations    - graph description plus ASCII knowledge graph
			3. generated_glossary  - business glossary in markdown
			4. queries             - recommended exploration queries
			5. lookml              - recommended LookML semantic model
			6. lookml_explore      - data preparation pipeline for quality issues

		Every yield has the same arity (previously the yield after the LookML step
		emitted only six items, unlike all the others).

		Args:
			DATASET_ID: Text inserted into the first prompt as the dataset topic.

		Yields:
			The 7-tuple described above, progressively filled in.

		Returns:
			The final 7-tuple; for a generator this is only observable as
			``StopIteration.value`` / via ``yield from``.
		"""
		# The schema is generated by the model; it is not read from a real dataset.
		dataset = model_response(f'''You are an expert in BQ public datasets. Generate a dataset schema related to {DATASET_ID}. You need to come up with atleast 5 tables with each table
	containing atleast 10 columns along with their descriptions.Ensure that these tables have columns that talk about data quality issues.''')
		dataset_description = model_response(f'''Generate a succinct 3-4 line description of the dataset: {dataset}.''')
		yield dataset_description, None, None, None, None, None, None

		py = model_response(f'''Based on the dataset provided: {dataset}, identify all the possible relationships
	that exist between the tables in the dataset. Discover these relationships from
	the point of view of data exploration.
	Output:
	List of relationships along with the description which is the business value of the relationship and a query
	with description that validates the relationship.

	Ensure that the column names and table names are accurate.''')
		yield dataset_description, py, None, None, None, None, None

		schema_relations = model_response(f'''Based on the context: {py}, generate a knowledge graph represented using ASCII art. Also generate a brief description of the graph.
	Output:
	Description of the graph listing all the relationships in markdown format
	ASCII version of the knowledge graph with nodes represented by tables and edges represented by the relationships. Edges should be annotated with the type of relationships identified - many-to-one, many-to-many, one-to-one, primary key, self joins, foreign keys etc''')
		yield dataset_description, py, schema_relations, None, None, None, None

		generated_glossary = model_response(f'''Based on the relationships identified: {schema_relations}
	and the dataset: {dataset_description}, generate glossary terms that will help business users easily find the tables in the dataset.
	## Task
	- Your goal is to create a business glossary for the data in this dataset, aligned with the definition of business glossary specified above.
	- Provide each business term in a newline, along with the definition.
	- Include examples in the term definitions, wherever suitable.
	- Make sure the business terms are relevant as per the table and column names and descriptions, and relevant to the domain to which the data belongs.
	- Also include a few business terms around the users/clients and around 5 key metrics in the domain of the data.
	- After defining the terms, identify the relationships between the business terms identified previously.

	## Output format
	Ensure that the output is in markdown format with proper indentation
	- Output each business glossary term definition in a newline in the folowing format:
	term: definition
	- For the business terms which are the key metrics in the business domain, mark such terms by adding "[METRIC]" in the beginning of the line, in the following format:
	[METRIC] term: definition
	- Then print a header to indicate the end of this section and start of the relationships section.
	- Then output the relationships between the business terms as follows:
	term -> [related_term1, related_term2]
	Show the relationship between the glossary term and the column broken down by each table.
	''')
		yield dataset_description, py, schema_relations, generated_glossary, None, None, None

		# Both helpers receive the generated schema text (not DATASET_ID) as their
		# "dataset_id" argument, exactly as before.
		queries = generate_dataset_queries(dataset, generated_glossary, schema_relations)
		yield dataset_description, py, schema_relations, generated_glossary, queries, None, None

		lookml = generate_lookml(dataset, generated_glossary, schema_relations)
		# Fixed: pad with None so this yield is a 7-tuple like every other one.
		yield dataset_description, py, schema_relations, generated_glossary, queries, lookml, None

		lookml_explore = model_response(f'''Given the dataset: {dataset}, schema relationships: {py} and graph:{schema_relations}, generate a data preparation pipeline that
	fixes the possible data quality issues across the tables in the dataset.''')
		yield dataset_description, py, schema_relations, generated_glossary, queries, lookml, lookml_explore
		# Kept for backward compatibility with any caller that reads the generator's
		# return value; plain iteration never sees it.
		return dataset_description, py, schema_relations, generated_glossary, queries, lookml, lookml_explore

	# Adapter between run() and the Gradio interface: each progress tuple is cut
	# down to its first six entries before being handed to the UI.
	def wrapper(dataset_id):
		"""Stream the first six items of every tuple produced by ``run``.

		Args:
			dataset_id: Value forwarded unchanged to ``run``.

		Yields:
			A 6-tuple ``(outputs[0], ..., outputs[5])`` for each tuple from ``run``;
			the comments in the original label slots 2-5 as schema relationships,
			generated glossary, queries and LookML model.
		"""
		for snapshot in run(dataset_id):
			# Index each slot explicitly so a too-short tuple still raises IndexError.
			yield tuple(snapshot[slot] for slot in range(6))

	import gradio as gr

	# Single text input holding the dataset identifier typed by the user.
	_dataset_id_input = gr.Textbox(label="Dataset ID")

	# One output component per item of the 6-tuple yielded by wrapper(), in order.
	# NOTE(review): run() places the relationship list in slot 1 and the ASCII
	# knowledge graph in slot 2, so the "Knowledge Graph" / "Schema Relationships"
	# labels below look swapped relative to their content - confirm the intent.
	_output_components = [
		gr.Markdown(label="Dataset description"),
		gr.Markdown(label="Knowledge Graph"),
		gr.Markdown(label="Schema Relationships"),
		gr.Markdown(label="Generated Glossary"),
		gr.Textbox(label="Queries"),
		gr.Markdown(label="LookML Model"),
	]

	# Sample dataset IDs offered as clickable examples in the UI.
	_example_dataset_ids = [
		'ncaa_basketball2',
		'thelook_ecommerce',
		'geo_openstreetmap',
		'google_political_ads',
		'noaa_historic_severe_storms',
		'stackoverflow',
	]

	iface = gr.Interface(
		fn=wrapper,
		inputs=_dataset_id_input,
		outputs=_output_components,
		live=False,
		theme=gr.themes.Ocean(),
		title="Dataplex knowledge engine ⚙️💡📊 (Simulator)",
		description="Provide a dataset ID to generate LookML, schema relationships, glossary, and more...",
		examples=_example_dataset_ids,
		article="This is a simulator that provides a sneak-peek into how BQ knowledge engine works.",
	)

	# Start the web app with a public share link requested and debug mode on.
	iface.launch(share=True, debug=True)