Rauhan committed on
Commit
d7fd1be
·
1 Parent(s): 1021b5b

UPDATE: metadata, repl

Browse files
analyticsHub/components/__init__.py CHANGED
@@ -1,6 +1,2 @@
1
- from dataclasses import dataclass, field
2
-
3
- @dataclass
4
- class REPLManager:
5
- manager: dict = field(default_factory=dict)
6
- replManager = REPLManager()
 
1
+ from langchain_experimental.utilities import PythonREPL
2
+ replManager = PythonREPL()
 
 
 
 
analyticsHub/components/speechToText.py CHANGED
@@ -1,4 +1,4 @@
1
- from ..utils.functions import readYaml, getConfig
2
  from ..utils.exceptions import CustomException
3
  from ..utils.logger import logger
4
  from groq import Groq
 
1
+ from ..utils.functions import getConfig
2
  from ..utils.exceptions import CustomException
3
  from ..utils.logger import logger
4
  from groq import Groq
analyticsHub/pipelines/pipeline.py CHANGED
@@ -28,7 +28,7 @@ class CompletePipeline:
28
  for fileName in dataFiles:
29
  dataframeName = fileName.replace(".parquet", "")
30
  codeString = readYaml(self.yamlPath)["attributeInfoCode"].format(dataframeName = dataframeName, projectId = projectId)
31
- results += replManager.manager[projectId].run(codeString)
32
  metadataChain = self.metadataGenerator.getMetadataChain()
33
  metadata = metadataChain.invoke({"metadata": results})
34
  metadataParts = metadata.split("```")
@@ -60,9 +60,9 @@ class CompletePipeline:
60
  tablesUsed = blendConfig[dataSource].get("tables")
61
  joinTypes = blendConfig[dataSource].get("joinTypes")
62
  blendOn = blendConfig[dataSource].get("blendOn")
63
- response = replManager.manager[projectId].run(f"getDataForChart(projectId='{projectId}', chartType='{chartType}', xAxis='{xAxis}', yAxis='{yAxis}', aggregationMetric='{aggregationMetric}', tablesUsed={tablesUsed}, joinTypes={joinTypes}, blendOn={blendOn})")
64
  else:
65
- response = replManager.manager[projectId].run(f"getDataForChart(projectId='{projectId}', chartType='{chartType}', xAxis='{xAxis}', yAxis='{yAxis}', aggregationMetric='{aggregationMetric}', tablesUsed='{dataSource}')")
66
  response = orjson.loads(response.encode())
67
  return response
68
  except Exception as e:
 
28
  for fileName in dataFiles:
29
  dataframeName = fileName.replace(".parquet", "")
30
  codeString = readYaml(self.yamlPath)["attributeInfoCode"].format(dataframeName = dataframeName, projectId = projectId)
31
+ results += replManager.run(codeString)
32
  metadataChain = self.metadataGenerator.getMetadataChain()
33
  metadata = metadataChain.invoke({"metadata": results})
34
  metadataParts = metadata.split("```")
 
60
  tablesUsed = blendConfig[dataSource].get("tables")
61
  joinTypes = blendConfig[dataSource].get("joinTypes")
62
  blendOn = blendConfig[dataSource].get("blendOn")
63
+ response = replManager.run(f"getDataForChart(projectId='{projectId}', chartType='{chartType}', xAxis='{xAxis}', yAxis='{yAxis}', aggregationMetric='{aggregationMetric}', tablesUsed={tablesUsed}, joinTypes={joinTypes}, blendOn={blendOn})")
64
  else:
65
+ response = replManager.run(f"getDataForChart(projectId='{projectId}', chartType='{chartType}', xAxis='{xAxis}', yAxis='{yAxis}', aggregationMetric='{aggregationMetric}', tablesUsed='{dataSource}')")
66
  response = orjson.loads(response.encode())
67
  return response
68
  except Exception as e:
analyticsHub/routers/projectManager.py CHANGED
@@ -1,10 +1,8 @@
1
  from ..models.requestModels import UpdateProjectState, CreateProject, EditMetadata
2
  from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
3
- from langchain_experimental.utilities import PythonREPL
4
- from ..utils.functions import verifyToken, readYaml
5
  from fastapi.exceptions import HTTPException
6
  from fastapi.responses import JSONResponse
7
- from ..components import replManager
8
  from fastapi import APIRouter, Depends
9
  from supabase import create_client
10
  from urllib.request import urlopen
@@ -29,11 +27,6 @@ async def createProject(projectDetails: CreateProject, credentials: Annotated[HT
29
  try:
30
  if verifyToken(token = credentials.credentials):
31
  projectId = str(uuid.uuid4())
32
- replManager.manager[projectId] = PythonREPL()
33
- _ = replManager.manager[projectId].run(readYaml("params.yaml")["redisFunctionCode"])
34
- _ = replManager.manager[projectId].run(readYaml("params.yaml")["jsonSerializer"])
35
- _ = replManager.manager[projectId].run(("globals()['__name__'] = '__main__'"))
36
- _ = replManager.manager[projectId].run("globals().update(locals())")
37
  decodedToken = jwt.decode(
38
  credentials.credentials,
39
  os.environ["SECRET_KEY"],
@@ -126,7 +119,6 @@ async def generateMetadata(projectId: str, credentials: Annotated[HTTPAuthorizat
126
  for key in newKeys: jsonData[key] = newMetadata[key]
127
  else:
128
  jsonData = pipeline.generateMetadata(projectId = projectId)
129
- _ = replManager.manager[projectId].run(f'metadata = {jsonData}')
130
  with io.BytesIO() as buffer:
131
  buffer.write(json.dumps(jsonData, indent=4).encode("utf-8"))
132
  buffer.seek(0)
 
1
  from ..models.requestModels import UpdateProjectState, CreateProject, EditMetadata
2
  from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
 
 
3
  from fastapi.exceptions import HTTPException
4
  from fastapi.responses import JSONResponse
5
+ from ..utils.functions import verifyToken
6
  from fastapi import APIRouter, Depends
7
  from supabase import create_client
8
  from urllib.request import urlopen
 
27
  try:
28
  if verifyToken(token = credentials.credentials):
29
  projectId = str(uuid.uuid4())
 
 
 
 
 
30
  decodedToken = jwt.decode(
31
  credentials.credentials,
32
  os.environ["SECRET_KEY"],
 
119
  for key in newKeys: jsonData[key] = newMetadata[key]
120
  else:
121
  jsonData = pipeline.generateMetadata(projectId = projectId)
 
122
  with io.BytesIO() as buffer:
123
  buffer.write(json.dumps(jsonData, indent=4).encode("utf-8"))
124
  buffer.seek(0)
analyticsHub/workflows/reportingWorkflow.py CHANGED
@@ -40,7 +40,7 @@ class ReportingToolWorkflow:
40
  }
41
  def runInPythonSandbox(self, state: State):
42
  code = "\n".join(state["generatedCode"].split("```")[-2].split("\n")[1:])
43
- response = replManager.manager.get(state["projectId"]).run(code)
44
  return {
45
  "codeOutput": response
46
  }
 
40
  }
41
  def runInPythonSandbox(self, state: State):
42
  code = "\n".join(state["generatedCode"].split("```")[-2].split("\n")[1:])
43
+ response = replManager.run(code)
44
  return {
45
  "codeOutput": response
46
  }
app.py CHANGED
@@ -41,13 +41,13 @@ app.include_router(utilities.router, prefix = "/utils", tags = ["Utilities"])
41
  @app.on_event("startup")
42
  async def startupEvent():
43
  projectIds = [x["projectId"] for x in client.table("Projects").select("projectId").execute().data]
 
44
  for id in projectIds:
45
- replManager.manager[id] = PythonREPL()
46
- _ = replManager.manager[id].run(readYaml("params.yaml")["redisFunctionCode"])
47
- _ = replManager.manager[id].run(readYaml("params.yaml")["jsonSerializer"])
48
- _ = replManager.manager[id].run(readYaml("params.yaml")["panelChartDataCode"])
49
- _ = replManager.manager[id].run(("globals()['__name__'] = '__main__'"))
50
- _ = replManager.manager[id].run("globals().update(locals())")
51
 
52
  if __name__ == "__main__":
53
  uvicorn.run("app:app", host = "0.0.0.0", port = 7860, reload = True)
 
41
  @app.on_event("startup")
42
  async def startupEvent():
43
  projectIds = [x["projectId"] for x in client.table("Projects").select("projectId").execute().data]
44
+ params = readYaml("params.yaml")
45
  for id in projectIds:
46
+ _ = replManager.run(params["redisFunctionCode"])
47
+ _ = replManager.run(params["jsonSerializer"])
48
+ _ = replManager.run(params["panelChartDataCode"])
49
+ _ = replManager.run(("globals()['__name__'] = '__main__'"))
50
+ _ = replManager.run("globals().update(locals())")
 
51
 
52
  if __name__ == "__main__":
53
  uvicorn.run("app:app", host = "0.0.0.0", port = 7860, reload = True)
params.yaml CHANGED
@@ -126,7 +126,7 @@ redisFunctionCode: |
126
  return df
127
 
128
  queryRephraserAgentPrompt: |
129
- You are a **Query Rewriter AI Agent**, ensuring user queries are **clear, valid, and executable** based on dataset metadata.
130
 
131
  ### **1. Understand the Query**
132
  - Analyze the query within dataset context.
@@ -251,13 +251,13 @@ queryRephraserAgentPrompt: |
251
 
252
  ### **Environment Constraints:**
253
  - **Data is retrieved using the `fetch_data` function which takes the dataframe name as a string parameter.**
254
- - **The input metadata is available as a dictionary in the `metadata` variable.** Mention use of the `metadata` variable explicitly if used in transformations.
255
-
256
  ### **Format Instructions:**
257
  - Return **ONLY the output JSON**—no extra text or commentary.
258
 
259
  #### **Provided Inputs:**
260
- - **Metadata (Already present in the `metadata` variable):** {metadata}
261
  - **Query:** {query}
262
 
263
 
@@ -274,8 +274,8 @@ codeGeneratorAgentPrompt: |
274
  - For example: `sales_data = fetch_data("sales_data")`
275
  - **The `fetch_data` function is already defined in the environment - do not redefine it**
276
 
277
- 2. **The `metadata` variable is already present in the environment - do not redefine it or modify it**
278
- - Use the metadata variable directly to access dataset information when needed
279
 
280
  3. **DO NOT assume any new data or create placeholder/sample data.**
281
 
@@ -401,7 +401,7 @@ codeGeneratorAgentPrompt: |
401
  - **type:** The data type (e.g., "int64", "float64", "object").
402
  - **description:** A brief description of the column's content.
403
  - **sample_row:** An object representing an example record from the dataframe.
404
- - The metadata is accessible through the `metadata` variable. Use this information to verify columns and structure.
405
 
406
  ### Python Script Requirements
407
  - **Imports:** Include necessary imports (e.g., `json`, `pandas`).
@@ -530,35 +530,110 @@ codeGeneratorAgentPrompt: |
530
  ### Example 3: Metadata variable usage
531
 
532
  **User Query:**
533
- "Display total number of tables using a card. Steps: 1) Access metadata from the metadata variable, 2) Count keys using len(), 3) Create final_df with count value"
534
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  **Expected Output:**
536
- ```python
537
  import pandas as pd
538
  import json
539
 
540
- # METADATA-ONLY SOLUTION: No fetch_data() required
541
- # Access metadata directly to get table count directly, no need to load it again in any variable
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
  table_count = len(metadata.keys())
543
 
544
- # Create required final_df with count (single KPI format)
545
- final_df = pd.DataFrame({{
546
  "total_tables": [table_count]
547
- }})
548
 
549
- # Generate card JSON following strict format
550
- print(json.dumps({{
551
  "chartType": "card",
552
  "title": "Table Inventory Overview",
553
  "label": "Total Tables",
554
- "data": final_df["total_tables"].iloc[0] # Extract single value
555
- }}, indent=4))
556
- ```
557
 
558
  ## Final Guidelines
559
 
560
  - **ALWAYS use the `fetch_data` function to retrieve the dataframes you need.**
561
- - **The `metadata` variable and `fetch_data` function are already defined - DO NOT redefine them.**
 
562
  - **Only retrieve datasets that are specifically needed for the query.**
563
  - **Ensure 100% JSON serializability.**
564
  - **Return only a fully executable Python script—NO additional commentary or explanation.**
@@ -566,7 +641,7 @@ codeGeneratorAgentPrompt: |
566
  - **The final transformed dataframe must always be named `final_df`.**
567
 
568
  ## **Provided Inputs:**
569
- - **Metadata (Already present in the `metadata` variable):** {metadata}
570
  - **Query:** {query}
571
 
572
 
@@ -585,11 +660,11 @@ codeDebuggerAgentPrompt: |
585
 
586
  ## CRITICAL ENVIRONMENT KNOWLEDGE
587
 
588
- 1. **The `fetch_data` function is already defined** - Do not redefine it, but ensure it's used correctly with the exact dataframe names from metadata.
589
- 2. **The `metadata` variable is already present** - Never redefine it, just ensure it's accessed properly.
590
- 3. **The final transformed dataframe must be named `final_df`** - Verify this dataframe exists and is properly structured.
591
- 4. **Chart.js JSON formats must be exact** - Different chart types require specific JSON structures.
592
- 5. **A custom serializer function is available** - The code calls `json.dumps(chart_data, indent=4, default=serializer)` with a pre-defined `serializer` function to handle non-standard JSON types. Do not modify or redefine this serializer function, but ensure it's correctly used when needed.
593
 
594
  ## INPUT DATA
595
 
@@ -684,7 +759,8 @@ codeDebuggerAgentPrompt: |
684
 
685
  ### Environment & Setup Errors
686
  - Missing or incorrect imports
687
- - Incorrect access to metadata or fetch_data
 
688
  - Redefinition of provided functions/variables
689
 
690
  ### Data Processing Errors
 
126
  return df
127
 
128
  queryRephraserAgentPrompt: |
129
+ You are a **Query Rewriter AI Agent**, ensuring user queries are **clear, valid, and executable** based on the given **dataset metadata**.
130
 
131
  ### **1. Understand the Query**
132
  - Analyze the query within dataset context.
 
251
 
252
  ### **Environment Constraints:**
253
  - **Data is retrieved using the `fetch_data` function which takes the dataframe name as a string parameter.**
254
+ - **The `metadata` variable is not preloaded. If a query needs metadata, first define `metadata` as a dictionary using the prompt, then refer to it explicitly in any transformations.**
255
+
256
  ### **Format Instructions:**
257
  - Return **ONLY the output JSON**—no extra text or commentary.
258
 
259
  #### **Provided Inputs:**
260
+ - **Metadata (To be defined as a `metadata` variable if needed):** {metadata}
261
  - **Query:** {query}
262
 
263
 
 
274
  - For example: `sales_data = fetch_data("sales_data")`
275
  - **The `fetch_data` function is already defined in the environment - do not redefine it**
276
 
277
+ 2. **The `metadata` variable is NOT preloaded in the environment.**
278
+ - If metadata is required, you must define the `metadata` variable correctly using the information provided in the prompt before using it in your code. No room for any modification to the metadata.
279
 
280
  3. **DO NOT assume any new data or create placeholder/sample data.**
281
 
 
401
  - **type:** The data type (e.g., "int64", "float64", "object").
402
  - **description:** A brief description of the column's content.
403
  - **sample_row:** An object representing an example record from the dataframe.
404
+ - If you need to access metadata, define a `metadata` variable using the JSON/YAML structure provided in the prompt. Then use it to verify dataset structure and column names.
405
 
406
  ### Python Script Requirements
407
  - **Imports:** Include necessary imports (e.g., `json`, `pandas`).
 
530
  ### Example 3: Metadata variable usage
531
 
532
  **User Query:**
533
+ "Display total number of tables using a card. Steps: 1) Define the metadata variable using the provided input, 2) Count keys using len() to get the number of tables, 3) Create final_df with count value"
534
 
535
+ **Metadata:**
536
+ {{
537
+ "sales": {{
538
+ "description": "Sales records for 2023 and 2024",
539
+ "shape": [1000, 5],
540
+ "columns": [
541
+ {{"name": "date", "type": "datetime64", "description": "Transaction date"}},
542
+ {{"name": "region", "type": "object", "description": "Sales region"}},
543
+ {{"name": "amount", "type": "float64", "description": "Sale amount"}},
544
+ {{"name": "product", "type": "object", "description": "Product name"}},
545
+ {{"name": "channel", "type": "object", "description": "Sales channel"}}
546
+ ],
547
+ "sample_row": {{
548
+ "date": "2024-05-12",
549
+ "region": "North",
550
+ "amount": 1234.56,
551
+ "product": "Laptop",
552
+ "channel": "Online"
553
+ }}
554
+ }},
555
+ "customers": {{
556
+ "description": "Customer demographic data",
557
+ "shape": [500, 4],
558
+ "columns": [
559
+ {{"name": "customer_id", "type": "int64", "description": "Unique customer ID"}},
560
+ {{"name": "age", "type": "int64", "description": "Age of customer"}},
561
+ {{"name": "gender", "type": "object", "description": "Gender of customer"}},
562
+ {{"name": "region", "type": "object", "description": "Region of residence"}}
563
+ ],
564
+ "sample_row": {{
565
+ "customer_id": 101,
566
+ "age": 34,
567
+ "gender": "Female",
568
+ "region": "West"
569
+ }}
570
+ }}
571
+ }}
572
+
573
  **Expected Output:**
 
574
  import pandas as pd
575
  import json
576
 
577
+ # Step 1: Define metadata manually from the provided input
578
+ metadata = {{
579
+ "sales": {{
580
+ "description": "Sales records for 2023 and 2024",
581
+ "shape": [1000, 5],
582
+ "columns": [
583
+ {{"name": "date", "type": "datetime64", "description": "Transaction date"}},
584
+ {{"name": "region", "type": "object", "description": "Sales region"}},
585
+ {{"name": "amount", "type": "float64", "description": "Sale amount"}},
586
+ {{"name": "product", "type": "object", "description": "Product name"}},
587
+ {{"name": "channel", "type": "object", "description": "Sales channel"}}
588
+ ],
589
+ "sample_row": {{
590
+ "date": "2024-05-12",
591
+ "region": "North",
592
+ "amount": 1234.56,
593
+ "product": "Laptop",
594
+ "channel": "Online"
595
+ }}
596
+ }},
597
+ "customers": {{
598
+ "description": "Customer demographic data",
599
+ "shape": [500, 4],
600
+ "columns": [
601
+ {{"name": "customer_id", "type": "int64", "description": "Unique customer ID"}},
602
+ {{"name": "age", "type": "int64", "description": "Age of customer"}},
603
+ {{"name": "gender", "type": "object", "description": "Gender of customer"}},
604
+ {{"name": "region", "type": "object", "description": "Region of residence"}}
605
+ ],
606
+ "sample_row": {{
607
+ "customer_id": 101,
608
+ "age": 34,
609
+ "gender": "Female",
610
+ "region": "West"
611
+ }}
612
+ }}
613
+ }}
614
+
615
+ # Step 2: Count total number of tables
616
  table_count = len(metadata.keys())
617
 
618
+ # Step 3: Create final_df with table count
619
+ final_df = pd.DataFrame({
620
  "total_tables": [table_count]
621
+ })
622
 
623
+ # Generate card JSON
624
+ print(json.dumps({
625
  "chartType": "card",
626
  "title": "Table Inventory Overview",
627
  "label": "Total Tables",
628
+ "data": final_df["total_tables"].iloc[0]
629
+ }, indent=4))
630
+
631
 
632
  ## Final Guidelines
633
 
634
  - **ALWAYS use the `fetch_data` function to retrieve the dataframes you need.**
635
+ - **The `fetch_data` function is already defined - DO NOT redefine it.**
636
+ - **The `metadata` variable is NOT preloaded in the environment and needs to be defined correctly from the input.**
637
  - **Only retrieve datasets that are specifically needed for the query.**
638
  - **Ensure 100% JSON serializability.**
639
  - **Return only a fully executable Python script—NO additional commentary or explanation.**
 
641
  - **The final transformed dataframe must always be named `final_df`.**
642
 
643
  ## **Provided Inputs:**
644
+ - **Metadata (Define a `metadata` variable if needed):** {metadata}
645
  - **Query:** {query}
646
 
647
 
 
660
 
661
  ## CRITICAL ENVIRONMENT KNOWLEDGE
662
 
663
+ 1. **The `fetch_data` function may not be predefined** – If it is referenced but missing from the code, assume it is already defined. Do not define it yourself.
664
+ 2. **The `metadata` variable needs to be defined** – If metadata is used, it must be **explicitly defined** from the provided metadata input. Do not assume it already exists. Also, it must be defined exactly as given in the input — no modifications or mistakes.
665
+ 3. **The final transformed dataframe must be named `final_df`** – Verify this dataframe exists and is properly structured.
666
+ 4. **Chart.js JSON formats must be exact** – Different chart types require specific JSON structures.
667
+ 5. **A custom serializer function is available** – The code calls `json.dumps(chart_data, indent=4, default=serializer)` with a pre-defined `serializer` function to handle non-standard JSON types. Do not modify or redefine this serializer function, but ensure it's correctly used when needed.
668
 
669
  ## INPUT DATA
670
 
 
759
 
760
  ### Environment & Setup Errors
761
  - Missing or incorrect imports
762
+ - Incorrect or missing definition of metadata
763
+ - Incorrect access to `metadata` or `fetch_data`
764
  - Redefinition of provided functions/variables
765
 
766
  ### Data Processing Errors