Rauhan committed on
Commit
d7fd1be
·
1 Parent(s): 1021b5b

UPDATE: metadata, repl

Browse files
analyticsHub/components/__init__.py CHANGED
@@ -1,6 +1,2 @@
1
- from dataclasses import dataclass, field
2
-
3
- @dataclass
4
- class REPLManager:
5
- manager: dict = field(default_factory=dict)
6
- replManager = REPLManager()
 
1
+ from langchain_experimental.utilities import PythonREPL
2
+ replManager = PythonREPL()
 
 
 
 
analyticsHub/components/speechToText.py CHANGED
@@ -1,4 +1,4 @@
1
- from ..utils.functions import readYaml, getConfig
2
  from ..utils.exceptions import CustomException
3
  from ..utils.logger import logger
4
  from groq import Groq
 
1
+ from ..utils.functions import getConfig
2
  from ..utils.exceptions import CustomException
3
  from ..utils.logger import logger
4
  from groq import Groq
analyticsHub/pipelines/pipeline.py CHANGED
@@ -28,7 +28,7 @@ class CompletePipeline:
28
  for fileName in dataFiles:
29
  dataframeName = fileName.replace(".parquet", "")
30
  codeString = readYaml(self.yamlPath)["attributeInfoCode"].format(dataframeName = dataframeName, projectId = projectId)
31
- results += replManager.manager[projectId].run(codeString)
32
  metadataChain = self.metadataGenerator.getMetadataChain()
33
  metadata = metadataChain.invoke({"metadata": results})
34
  metadataParts = metadata.split("```")
@@ -60,9 +60,9 @@ class CompletePipeline:
60
  tablesUsed = blendConfig[dataSource].get("tables")
61
  joinTypes = blendConfig[dataSource].get("joinTypes")
62
  blendOn = blendConfig[dataSource].get("blendOn")
63
- response = replManager.manager[projectId].run(f"getDataForChart(projectId='{projectId}', chartType='{chartType}', xAxis='{xAxis}', yAxis='{yAxis}', aggregationMetric='{aggregationMetric}', tablesUsed={tablesUsed}, joinTypes={joinTypes}, blendOn={blendOn})")
64
  else:
65
- response = replManager.manager[projectId].run(f"getDataForChart(projectId='{projectId}', chartType='{chartType}', xAxis='{xAxis}', yAxis='{yAxis}', aggregationMetric='{aggregationMetric}', tablesUsed='{dataSource}')")
66
  response = orjson.loads(response.encode())
67
  return response
68
  except Exception as e:
 
28
  for fileName in dataFiles:
29
  dataframeName = fileName.replace(".parquet", "")
30
  codeString = readYaml(self.yamlPath)["attributeInfoCode"].format(dataframeName = dataframeName, projectId = projectId)
31
+ results += replManager.run(codeString)
32
  metadataChain = self.metadataGenerator.getMetadataChain()
33
  metadata = metadataChain.invoke({"metadata": results})
34
  metadataParts = metadata.split("```")
 
60
  tablesUsed = blendConfig[dataSource].get("tables")
61
  joinTypes = blendConfig[dataSource].get("joinTypes")
62
  blendOn = blendConfig[dataSource].get("blendOn")
63
+ response = replManager.run(f"getDataForChart(projectId='{projectId}', chartType='{chartType}', xAxis='{xAxis}', yAxis='{yAxis}', aggregationMetric='{aggregationMetric}', tablesUsed={tablesUsed}, joinTypes={joinTypes}, blendOn={blendOn})")
64
  else:
65
+ response = replManager.run(f"getDataForChart(projectId='{projectId}', chartType='{chartType}', xAxis='{xAxis}', yAxis='{yAxis}', aggregationMetric='{aggregationMetric}', tablesUsed='{dataSource}')")
66
  response = orjson.loads(response.encode())
67
  return response
68
  except Exception as e:
analyticsHub/routers/projectManager.py CHANGED
@@ -1,10 +1,8 @@
1
  from ..models.requestModels import UpdateProjectState, CreateProject, EditMetadata
2
  from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
3
- from langchain_experimental.utilities import PythonREPL
4
- from ..utils.functions import verifyToken, readYaml
5
  from fastapi.exceptions import HTTPException
6
  from fastapi.responses import JSONResponse
7
- from ..components import replManager
8
  from fastapi import APIRouter, Depends
9
  from supabase import create_client
10
  from urllib.request import urlopen
@@ -29,11 +27,6 @@ async def createProject(projectDetails: CreateProject, credentials: Annotated[HT
29
  try:
30
  if verifyToken(token = credentials.credentials):
31
  projectId = str(uuid.uuid4())
32
- replManager.manager[projectId] = PythonREPL()
33
- _ = replManager.manager[projectId].run(readYaml("params.yaml")["redisFunctionCode"])
34
- _ = replManager.manager[projectId].run(readYaml("params.yaml")["jsonSerializer"])
35
- _ = replManager.manager[projectId].run(("globals()['__name__'] = '__main__'"))
36
- _ = replManager.manager[projectId].run("globals().update(locals())")
37
  decodedToken = jwt.decode(
38
  credentials.credentials,
39
  os.environ["SECRET_KEY"],
@@ -126,7 +119,6 @@ async def generateMetadata(projectId: str, credentials: Annotated[HTTPAuthorizat
126
  for key in newKeys: jsonData[key] = newMetadata[key]
127
  else:
128
  jsonData = pipeline.generateMetadata(projectId = projectId)
129
- _ = replManager.manager[projectId].run(f'metadata = {jsonData}')
130
  with io.BytesIO() as buffer:
131
  buffer.write(json.dumps(jsonData, indent=4).encode("utf-8"))
132
  buffer.seek(0)
 
1
  from ..models.requestModels import UpdateProjectState, CreateProject, EditMetadata
2
  from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
 
 
3
  from fastapi.exceptions import HTTPException
4
  from fastapi.responses import JSONResponse
5
+ from ..utils.functions import verifyToken
6
  from fastapi import APIRouter, Depends
7
  from supabase import create_client
8
  from urllib.request import urlopen
 
27
  try:
28
  if verifyToken(token = credentials.credentials):
29
  projectId = str(uuid.uuid4())
 
 
 
 
 
30
  decodedToken = jwt.decode(
31
  credentials.credentials,
32
  os.environ["SECRET_KEY"],
 
119
  for key in newKeys: jsonData[key] = newMetadata[key]
120
  else:
121
  jsonData = pipeline.generateMetadata(projectId = projectId)
 
122
  with io.BytesIO() as buffer:
123
  buffer.write(json.dumps(jsonData, indent=4).encode("utf-8"))
124
  buffer.seek(0)
analyticsHub/workflows/reportingWorkflow.py CHANGED
@@ -40,7 +40,7 @@ class ReportingToolWorkflow:
40
  }
41
  def runInPythonSandbox(self, state: State):
42
  code = "\n".join(state["generatedCode"].split("```")[-2].split("\n")[1:])
43
- response = replManager.manager.get(state["projectId"]).run(code)
44
  return {
45
  "codeOutput": response
46
  }
 
40
  }
41
  def runInPythonSandbox(self, state: State):
42
  code = "\n".join(state["generatedCode"].split("```")[-2].split("\n")[1:])
43
+ response = replManager.run(code)
44
  return {
45
  "codeOutput": response
46
  }
app.py CHANGED
@@ -41,13 +41,13 @@ app.include_router(utilities.router, prefix = "/utils", tags = ["Utilities"])
41
  @app.on_event("startup")
42
  async def startupEvent():
43
  projectIds = [x["projectId"] for x in client.table("Projects").select("projectId").execute().data]
 
44
  for id in projectIds:
45
- replManager.manager[id] = PythonREPL()
46
- _ = replManager.manager[id].run(readYaml("params.yaml")["redisFunctionCode"])
47
- _ = replManager.manager[id].run(readYaml("params.yaml")["jsonSerializer"])
48
- _ = replManager.manager[id].run(readYaml("params.yaml")["panelChartDataCode"])
49
- _ = replManager.manager[id].run(("globals()['__name__'] = '__main__'"))
50
- _ = replManager.manager[id].run("globals().update(locals())")
51
 
52
  if __name__ == "__main__":
53
  uvicorn.run("app:app", host = "0.0.0.0", port = 7860, reload = True)
 
41
  @app.on_event("startup")
42
  async def startupEvent():
43
  projectIds = [x["projectId"] for x in client.table("Projects").select("projectId").execute().data]
44
+ params = readYaml("params.yaml")
45
  for id in projectIds:
46
+ _ = replManager.run(params["redisFunctionCode"])
47
+ _ = replManager.run(params["jsonSerializer"])
48
+ _ = replManager.run(params["panelChartDataCode"])
49
+ _ = replManager.run(("globals()['__name__'] = '__main__'"))
50
+ _ = replManager.run("globals().update(locals())")
 
51
 
52
  if __name__ == "__main__":
53
  uvicorn.run("app:app", host = "0.0.0.0", port = 7860, reload = True)
params.yaml CHANGED
@@ -126,7 +126,7 @@ redisFunctionCode: |
126
  return df
127
 
128
  queryRephraserAgentPrompt: |
129
- You are a **Query Rewriter AI Agent**, ensuring user queries are **clear, valid, and executable** based on dataset metadata.
130
 
131
  ### **1. Understand the Query**
132
  - Analyze the query within dataset context.
@@ -251,13 +251,13 @@ queryRephraserAgentPrompt: |
251
 
252
  ### **Environment Constraints:**
253
  - **Data is retrieved using the `fetch_data` function which takes the dataframe name as a string parameter.**
254
- - **The input metadata is available as a dictionary in the `metadata` variable.** Mention use of the `metadata` variable explicitly if used in transformations.
255
-
256
  ### **Format Instructions:**
257
  - Return **ONLY the output JSON**—no extra text or commentary.
258
 
259
  #### **Provided Inputs:**
260
- - **Metadata (Already present in the `metadata` variable):** {metadata}
261
  - **Query:** {query}
262
 
263
 
@@ -274,8 +274,8 @@ codeGeneratorAgentPrompt: |
274
  - For example: `sales_data = fetch_data("sales_data")`
275
  - **The `fetch_data` function is already defined in the environment - do not redefine it**
276
 
277
- 2. **The `metadata` variable is already present in the environment - do not redefine it or modify it**
278
- - Use the metadata variable directly to access dataset information when needed
279
 
280
  3. **DO NOT assume any new data or create placeholder/sample data.**
281
 
@@ -401,7 +401,7 @@ codeGeneratorAgentPrompt: |
401
  - **type:** The data type (e.g., "int64", "float64", "object").
402
  - **description:** A brief description of the column's content.
403
  - **sample_row:** An object representing an example record from the dataframe.
404
- - The metadata is accessible through the `metadata` variable. Use this information to verify columns and structure.
405
 
406
  ### Python Script Requirements
407
  - **Imports:** Include necessary imports (e.g., `json`, `pandas`).
@@ -530,35 +530,110 @@ codeGeneratorAgentPrompt: |
530
  ### Example 3: Metadata variable usage
531
 
532
  **User Query:**
533
- "Display total number of tables using a card. Steps: 1) Access metadata from the metadata variable, 2) Count keys using len(), 3) Create final_df with count value"
534
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
535
  **Expected Output:**
536
- ```python
537
  import pandas as pd
538
  import json
539
 
540
- # METADATA-ONLY SOLUTION: No fetch_data() required
541
- # Access metadata directly to get table count directly, no need to load it again in any variable
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
  table_count = len(metadata.keys())
543
 
544
- # Create required final_df with count (single KPI format)
545
- final_df = pd.DataFrame({{
546
  "total_tables": [table_count]
547
- }})
548
 
549
- # Generate card JSON following strict format
550
- print(json.dumps({{
551
  "chartType": "card",
552
  "title": "Table Inventory Overview",
553
  "label": "Total Tables",
554
- "data": final_df["total_tables"].iloc[0] # Extract single value
555
- }}, indent=4))
556
- ```
557
 
558
  ## Final Guidelines
559
 
560
  - **ALWAYS use the `fetch_data` function to retrieve the dataframes you need.**
561
- - **The `metadata` variable and `fetch_data` function are already defined - DO NOT redefine them.**
 
562
  - **Only retrieve datasets that are specifically needed for the query.**
563
  - **Ensure 100% JSON serializability.**
564
  - **Return only a fully executable Python script—NO additional commentary or explanation.**
@@ -566,7 +641,7 @@ codeGeneratorAgentPrompt: |
566
  - **The final transformed dataframe must always be named `final_df`.**
567
 
568
  ## **Provided Inputs:**
569
- - **Metadata (Already present in the `metadata` variable):** {metadata}
570
  - **Query:** {query}
571
 
572
 
@@ -585,11 +660,11 @@ codeDebuggerAgentPrompt: |
585
 
586
  ## CRITICAL ENVIRONMENT KNOWLEDGE
587
 
588
- 1. **The `fetch_data` function is already defined** - Do not redefine it, but ensure it's used correctly with the exact dataframe names from metadata.
589
- 2. **The `metadata` variable is already present** - Never redefine it, just ensure it's accessed properly.
590
- 3. **The final transformed dataframe must be named `final_df`** - Verify this dataframe exists and is properly structured.
591
- 4. **Chart.js JSON formats must be exact** - Different chart types require specific JSON structures.
592
- 5. **A custom serializer function is available** - The code calls `json.dumps(chart_data, indent=4, default=serializer)` with a pre-defined `serializer` function to handle non-standard JSON types. Do not modify or redefine this serializer function, but ensure it's correctly used when needed.
593
 
594
  ## INPUT DATA
595
 
@@ -684,7 +759,8 @@ codeDebuggerAgentPrompt: |
684
 
685
  ### Environment & Setup Errors
686
  - Missing or incorrect imports
687
- - Incorrect access to metadata or fetch_data
 
688
  - Redefinition of provided functions/variables
689
 
690
  ### Data Processing Errors
 
126
  return df
127
 
128
  queryRephraserAgentPrompt: |
129
+ You are a **Query Rewriter AI Agent**, ensuring user queries are **clear, valid, and executable** based on the given **dataset metadata**.
130
 
131
  ### **1. Understand the Query**
132
  - Analyze the query within dataset context.
 
251
 
252
  ### **Environment Constraints:**
253
  - **Data is retrieved using the `fetch_data` function which takes the dataframe name as a string parameter.**
254
+ - **The `metadata` variable is not preloaded. If a query needs metadata, first define `metadata` as a dictionary using the prompt, then refer to it explicitly in any transformations.**
255
+
256
  ### **Format Instructions:**
257
  - Return **ONLY the output JSON**—no extra text or commentary.
258
 
259
  #### **Provided Inputs:**
260
+ - **Metadata (To be defined as a `metadata` variable if needed):** {metadata}
261
  - **Query:** {query}
262
 
263
 
 
274
  - For example: `sales_data = fetch_data("sales_data")`
275
  - **The `fetch_data` function is already defined in the environment - do not redefine it**
276
 
277
+ 2. **The `metadata` variable is NOT preloaded in the environment.**
278
+ - If metadata is required, you must define the `metadata` variable correctly using the information provided in the prompt before using it in your code. No room for any modification to the metadata.
279
 
280
  3. **DO NOT assume any new data or create placeholder/sample data.**
281
 
 
401
  - **type:** The data type (e.g., "int64", "float64", "object").
402
  - **description:** A brief description of the column's content.
403
  - **sample_row:** An object representing an example record from the dataframe.
404
+ - If you need to access metadata, define a `metadata` variable using the JSON/YAML structure provided in the prompt. Then use it to verify dataset structure and column names.
405
 
406
  ### Python Script Requirements
407
  - **Imports:** Include necessary imports (e.g., `json`, `pandas`).
 
530
  ### Example 3: Metadata variable usage
531
 
532
  **User Query:**
533
+ "Display total number of tables using a card. Steps: 1) Define the metadata variable using the provided input, 2) Count keys using len() to get the number of tables, 3) Create final_df with count value"
534
 
535
+ **Metadata:**
536
+ {{
537
+ "sales": {{
538
+ "description": "Sales records for 2023 and 2024",
539
+ "shape": [1000, 5],
540
+ "columns": [
541
+ {{"name": "date", "type": "datetime64", "description": "Transaction date"}},
542
+ {{"name": "region", "type": "object", "description": "Sales region"}},
543
+ {{"name": "amount", "type": "float64", "description": "Sale amount"}},
544
+ {{"name": "product", "type": "object", "description": "Product name"}},
545
+ {{"name": "channel", "type": "object", "description": "Sales channel"}}
546
+ ],
547
+ "sample_row": {{
548
+ "date": "2024-05-12",
549
+ "region": "North",
550
+ "amount": 1234.56,
551
+ "product": "Laptop",
552
+ "channel": "Online"
553
+ }}
554
+ }},
555
+ "customers": {{
556
+ "description": "Customer demographic data",
557
+ "shape": [500, 4],
558
+ "columns": [
559
+ {{"name": "customer_id", "type": "int64", "description": "Unique customer ID"}},
560
+ {{"name": "age", "type": "int64", "description": "Age of customer"}},
561
+ {{"name": "gender", "type": "object", "description": "Gender of customer"}},
562
+ {{"name": "region", "type": "object", "description": "Region of residence"}}
563
+ ],
564
+ "sample_row": {{
565
+ "customer_id": 101,
566
+ "age": 34,
567
+ "gender": "Female",
568
+ "region": "West"
569
+ }}
570
+ }}
571
+ }}
572
+
573
  **Expected Output:**
 
574
  import pandas as pd
575
  import json
576
 
577
+ # Step 1: Define metadata manually from the provided input
578
+ metadata = {{
579
+ "sales": {{
580
+ "description": "Sales records for 2023 and 2024",
581
+ "shape": [1000, 5],
582
+ "columns": [
583
+ {{"name": "date", "type": "datetime64", "description": "Transaction date"}},
584
+ {{"name": "region", "type": "object", "description": "Sales region"}},
585
+ {{"name": "amount", "type": "float64", "description": "Sale amount"}},
586
+ {{"name": "product", "type": "object", "description": "Product name"}},
587
+ {{"name": "channel", "type": "object", "description": "Sales channel"}}
588
+ ],
589
+ "sample_row": {{
590
+ "date": "2024-05-12",
591
+ "region": "North",
592
+ "amount": 1234.56,
593
+ "product": "Laptop",
594
+ "channel": "Online"
595
+ }}
596
+ }},
597
+ "customers": {{
598
+ "description": "Customer demographic data",
599
+ "shape": [500, 4],
600
+ "columns": [
601
+ {{"name": "customer_id", "type": "int64", "description": "Unique customer ID"}},
602
+ {{"name": "age", "type": "int64", "description": "Age of customer"}},
603
+ {{"name": "gender", "type": "object", "description": "Gender of customer"}},
604
+ {{"name": "region", "type": "object", "description": "Region of residence"}}
605
+ ],
606
+ "sample_row": {{
607
+ "customer_id": 101,
608
+ "age": 34,
609
+ "gender": "Female",
610
+ "region": "West"
611
+ }}
612
+ }}
613
+ }}
614
+
615
+ # Step 2: Count total number of tables
616
  table_count = len(metadata.keys())
617
 
618
+ # Step 3: Create final_df with table count
619
+ final_df = pd.DataFrame({
620
  "total_tables": [table_count]
621
+ })
622
 
623
+ # Generate card JSON
624
+ print(json.dumps({
625
  "chartType": "card",
626
  "title": "Table Inventory Overview",
627
  "label": "Total Tables",
628
+ "data": final_df["total_tables"].iloc[0]
629
+ }, indent=4))
630
+
631
 
632
  ## Final Guidelines
633
 
634
  - **ALWAYS use the `fetch_data` function to retrieve the dataframes you need.**
635
+ - **The `fetch_data` function is already defined - DO NOT redefine it.**
636
+ - **The `metadata` variable is NOT preloaded in the environment and needs to be defined correctly from the input.**
637
  - **Only retrieve datasets that are specifically needed for the query.**
638
  - **Ensure 100% JSON serializability.**
639
  - **Return only a fully executable Python script—NO additional commentary or explanation.**
 
641
  - **The final transformed dataframe must always be named `final_df`.**
642
 
643
  ## **Provided Inputs:**
644
+ - **Metadata (Define a `metadata` variable if needed):** {metadata}
645
  - **Query:** {query}
646
 
647
 
 
660
 
661
  ## CRITICAL ENVIRONMENT KNOWLEDGE
662
 
663
+ 1. **The `fetch_data` function may not be predefined** – If it is referenced but missing from the code, assume it is already defined. Do not define it yourself.
664
+ 2. **The `metadata` variable needs to be defined** – If metadata is used, it must be **explicitly defined** from the provided metadata input. Do not assume it already exists. Also, it must be defined exactly as given in the input — no modifications or mistakes.
665
+ 3. **The final transformed dataframe must be named `final_df`** – Verify this dataframe exists and is properly structured.
666
+ 4. **Chart.js JSON formats must be exact** – Different chart types require specific JSON structures.
667
+ 5. **A custom serializer function is available** – The code calls `json.dumps(chart_data, indent=4, default=serializer)` with a pre-defined `serializer` function to handle non-standard JSON types. Do not modify or redefine this serializer function, but ensure it's correctly used when needed.
668
 
669
  ## INPUT DATA
670
 
 
759
 
760
  ### Environment & Setup Errors
761
  - Missing or incorrect imports
762
+ - Incorrect or missing definition of metadata
763
+ - Incorrect access to `metadata` or `fetch_data`
764
  - Redefinition of provided functions/variables
765
 
766
  ### Data Processing Errors