Spaces:

harsimran726
/

FineTune_Data_Generation_Agent

Runtime error

App Files Files Community

first_time

by harsimran726 - opened Jun 15, 2025

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+529

-1054

Files changed (8) hide show

.env +1 -1
Data_Geneartion_Agent.py +303 -318
Dockerfile +19 -19
README.md +112 -120
main.py +83 -88
requirements.txt +11 -12
runtime_env.json +0 -1
templates/index.html +0 -495

.env CHANGED Viewed

	@@ -1 +1 @@
1	- GOOGLE_API_KEY=~~<REDACTED — revoke and rotate this key>~~


1	+ GOOGLE_API_KEY=<REDACTED — revoke and rotate this key; store it as a Space secret, not in .env>

Data_Geneartion_Agent.py CHANGED Viewed

@@ -1,318 +1,303 @@
-import pandas as pd
-import numpy as np
-from langchain_core.prompts import ChatPromptTemplate
-from langchain.agents import AgentExecutor, create_react_agent , BaseMultiActionAgent , initialize_agent, AgentType , create_openai_tools_agent , create_openai_functions_agent , create_tool_calling_agent
-from langchain.tools import Tool
-from langchain_google_genai import ChatGoogleGenerativeAI
-import json
-from dotenv import load_dotenv
-from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
-from langchain.schema.agent import AgentActionMessageLog
-from langchain.agents.agent import AgentAction
-from langchain.chains import LLMChain
-from langchain_core.runnables import Runnable
-load_dotenv()
-import os
-# os.makedirs("/tmp/", exist_ok=True)
-# openai_api_key = os.getenv("OPENAI_API_KEY")
-def load_api_key():
-    try:
-        with open("runtime_env.json", "r") as f:
-            env = json.load(f)
-            return env.get("API_KEY")
-    except Exception:
-        return None
-GOOGLE_API_KEY = load_api_key()
-print(f"Here is api_key {GOOGLE_API_KEY}")
-system_prompt = """
-You are a Synthetic Data Generation Agent responsible for producing structured conversational data suitable for fine-tuning a language model.
-Your task follows this pipeline:
-1. **Understand the user's request** to determine the data domain and format.
-2. **Generate a diverse list of realistic user instructions** related to the request topic using the generate_data_tool.
-3. **Create corresponding assistant responses** using the generate_response_tool.
-4. **IMPORTANT**: After generating both instructions and responses, you MUST use the csv_tool to save the data.
-5. **Return the output in JSON format** using two keys only:
-   - `"instructions"`: An array of user queries or prompts.
-   - `"response"`: An array of assistant replies corresponding to each instruction.
-### Output format:
-Return the final output in this JSON format not any other text.:
-```json
-{{
-  "instructions": ["<user prompt 1>", "<user prompt 2>", "..."],
-  "response": ["<assistant response 1>", "<assistant response 2>", "..."]
-}}
-### Available Tools:
-1. **generate_data_tool**
-   - Use when: You need to create initial instructions for data generation
-   - Purpose: Generates structured instructions based on user input
-   - Input: User's query about what kind of data they want
-   - Output: JSON with "instructions" key
-2. **generate_response_tool**
-   - Use when: You have instructions and need to generate corresponding responses
-   - Purpose: Creates appropriate responses for the given instructions
-   - Input: Instructions from generate_data_tool
-   - Output: JSON with "response" key
-3. **csv_tool**
-   - Use when: You have complete JSON data ready to be saved
-   - Purpose: Converts JSON data to CSV format and saves it
-   - Input: Complete JSON data with both instructions and responses
-   - Output: Saves data to "Data_File.csv"
-### Tool Usage Flow:
-1. First, use generate_data_tool to create instructions
-2. Then, use generate_response_tool to create corresponding responses
-3. Finally, you MUST use csv_tool to save the complete dataset
-Remember to always maintain the correct JSON structure throughout the process.
-IMPORTANT: Only provide JSON output without any additional text before or after the JSON structure except of 'Json'. Do not include any explanatory text, markdown formatting, or other content outside the JSON object.
-If user dose not mention the number of rows then default to 10 rows.
-"""
-query_system_prompt = """You are a **Data Generation Agent** that produces **natural language instructions** to guide the creation of fine-tuning datasets based on a user request.
-### Your Task:
-1. Understand the user's input and determine the type and topic of data required.
-2. Based on the input, generate a **single, clear instruction** for creating a dataset. The instruction should describe what kind of data to generate, in natural and concise language.
-3. If the number of rows is not explicitly mentioned, default to **10 rows**.
-4. **Only return a string** with one key: `"instructions"`.
----
-### Output Format:
-  "instructions": "Generate 1000 rows of employee salary data based on..."
-"""
-response_system_prompt = """
-You are a **Data Generation Agent** responsible for generating **structured data responses** based on the given instructions.
----
-### Your Task:
-1. Read and understand the provided **instructions**.
-2. Generate the appropriate **data or description** that directly fulfills the instructions.
-3. Return only a **JSON string** with one key: `"response"`.
----
-### Output Format:
-  "response": "Here is the data..."
-"""
-def generate_data(query : str) -> str:
-    try:
-        query_llm =  ChatGoogleGenerativeAI(
-    model="gemini-2.0-flash ",   # gemini-2.0-flash     or gemini-2.5-flash-preview-04-17
-    api_key=GOOGLE_API_KEY,
-    temperature=0.9,
-    )
-        query_prompt = ChatPromptTemplate.from_messages([
-            ("system", query_system_prompt),
-            ("human", "{input}"),
-            ("assistant", "{agent_scratchpad}"),
-        ])
-        chain: Runnable = query_prompt | query_llm
-        result = chain.invoke({"input": query, "agent_scratchpad": ""})
-        return result
-    except Exception as e:
-        print(f"Error in generate_data: {str(e)}")
-        raise
-def generate_response(instructions : str) -> str:
-    try:
-        response_llm = ChatGoogleGenerativeAI(
-    model="gemini-2.0-flash",
-    api_key=GOOGLE_API_KEY,
-    temperature=0.9,
-    )
-        response_prompt = ChatPromptTemplate.from_messages([
-            ("system", response_system_prompt),
-            ("human", "{instructions}"),
-            ("assistant", "{agent_scratchpad}"),
-        ])
-        chain: Runnable = response_prompt | response_llm
-        result = chain.invoke({"instructions": instructions, "agent_scratchpad": ""})
-        return result
-    except Exception as e:
-        print(f"Error in generate_response: {str(e)}")
-        raise
-def save_to_csv(data: str):
-    try:
-        # Clean the input string by removing triple quotes
-        print(f"here is the data {data}")
-        # Parse the JSON string into a Python dictionary
-        if ('json') in data:
-            print(f"json is in data")
-            if "```" in data:
-                print(f"``` is in data")
-                data = data.replace("json", "").strip()
-                data = data.replace("```", "").strip()
-                data_dict = json.loads(data)
-                df = pd.DataFrame()
-                df['instructions'] = data_dict['instructions']
-                df['response'] = data_dict['response']
-                print(f"DataFrame shape: {df.shape}")
-                print(f"DataFrame columns: {df.columns.tolist()}")
-                print(f"here is df {df}")
-                # Save to CSV without index
-                print("\nSaving to CSV...")
-                output_path = "/tmp/Data_File.csv"
-                df.to_csv(output_path, index=False)
-                return f"File scucessfully created"
-            else:
-                print(f"``` is not in data")
-                data = data.replace("json", "").strip()
-                print(f"data is {data}")
-                data_dict = json.loads(data)
-                print(f"data_dict is {data_dict}")
-                df = pd.DataFrame()
-                df['instructions'] = data_dict['instructions']
-                df['response'] = data_dict['response']
-                print(f"DataFrame shape: {df.shape}")
-                print(f"DataFrame columns: {df.columns.tolist()}")
-                print(f"here is df {df}")
-                # Save to CSV without index
-                print("\nSaving to CSV...")
-                output_path = "/tmp/Data_File.csv"
-                df.to_csv(output_path, index=False)
-                return f"File scucessfully created"
-        # Convert the dictionary to a DataFrame
-        elif "```" in data:
-            print(f"``` is in data")
-            data = data.replace("```", "").strip()
-            data = json.loads(data)
-            df = pd.DataFrame()
-            df['instructions'] = data['instructions']
-            df['response'] = data['response']
-            # Save to CSV without index
-            print("\nSaving to CSV...")
-            output_path = "/tmp/Data_File.csv"
-            df.to_csv(output_path, index=False)
-            return f"File created successfully"
-        else:
-            print(f"''' is not in data")
-            data = json.loads(data)
-            df = pd.DataFrame()
-            df['instructions'] = data['instructions']
-            df['response'] = data['response']
-            # Save to CSV without index
-            print("\nSaving to CSV...")
-            output_path = "/tmp/Data_File.csv"
-            df.to_csv(output_path, index=False)
-            return f"File created successfully"
-    except json.JSONDecodeError as e:
-        raise ValueError(f"Invalid JSON data: {str(e)}")
-generate_data_tool =    Tool(
-        name="generate_data_tool",
-        description="Generate the data(Instructions) for the query",
-        func=generate_data
-    )
-generate_response_tool =    Tool(
-        name="generate_response_tool",
-        description="Generate the data(Response) for the instructions",
-        func=generate_response
-    )
-csv_tool = Tool(
-    name="csv_tool",
-    description="Pass the JSON data after generating both instructions and responses, convert it into csv then save the csv file",
-    func=save_to_csv
-)
-tools = [generate_data_tool, generate_response_tool,csv_tool]
-query_prompt = ChatPromptTemplate.from_messages([
-    ("system", system_prompt),
-    ("human", "{input}"),
-    MessagesPlaceholder(variable_name="agent_scratchpad")
-])
-llm = ChatGoogleGenerativeAI(
-    model="gemini-2.0-flash",
-    api_key=GOOGLE_API_KEY,
-    temperature=0.9,
-    )
-agent = create_openai_tools_agent(llm=llm, prompt=query_prompt, tools=tools)
-data_agent = AgentExecutor(agent=agent, tools=tools, verbose=True)
-def generate_data_agent(query: str):
-    try:
-        GOOGLE_API_KEY = load_api_key()
-        print(f"Here is api_key after puting {GOOGLE_API_KEY}")
-        if not query:
-            return {"status": "error", "message": "Query cannot be empty", "csv_file": None}
-        print(f"Processing query: {query}")
-        result = data_agent.invoke({"input": query})
-        print(f"Agent execution result: {result['output']}")
-        save_to_csv(result['output'])
-        print(f"here is the result {type(result['output'])}")
-        # Check if data.csv was created
-        if os.path.exists("tmp/Data_File.csv"):
-            return {
-                "status": "success",
-                "message": "Created successfully! You can download the CSV file below.",
-                "csv_file": "Data_File.csv"
-            }
-        else:
-            return {
-                "status": "error",
-                "message": "Failed to generate data file",
-                "csv_file": None
-            }
-    except Exception as e:
-        print(f"Error in generate_data_agent: {str(e)}")
-        return {
-            "status": "error",
-            "message": f"An error occurred: {str(e)}",
-            "csv_file": None
-        }

+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
+from langchain.agents import AgentExecutor, create_react_agent , BaseMultiActionAgent , initialize_agent, AgentType , create_openai_tools_agent , create_openai_functions_agent , create_tool_calling_agent
+from langchain.tools import Tool
+from langchain_google_genai import ChatGoogleGenerativeAI
+import json
+from dotenv import load_dotenv
+from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
+from langchain.schema.agent import AgentActionMessageLog
+from langchain.agents.agent import AgentAction
+from langchain.chains import LLMChain
+from langchain_nvidia_ai_endpoints import ChatNVIDIA
+from langchain_core.runnables import Runnable
+load_dotenv()
+import os
+openai_api_key = os.getenv("OPENAI_API_KEY")
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
+system_prompt = """
+You are a Synthetic Data Generation Agent responsible for producing structured conversational data suitable for fine-tuning a language model.
+Your task follows this pipeline:
+1. **Understand the user's request** to determine the data domain and format.
+2. **Generate a diverse list of realistic user instructions** related to the request topic using the generate_data_tool.
+3. **Create corresponding assistant responses** using the generate_response_tool.
+4. **IMPORTANT**: After generating both instructions and responses, you MUST use the csv_tool to save the data.
+5. **Return the output in JSON format** using two keys only:
+   - `"instructions"`: An array of user queries or prompts.
+   - `"response"`: An array of assistant replies corresponding to each instruction.
+### Output format:
+Return the final output in this JSON format not any other text.:
+```json
+{{
+  "instructions": ["<user prompt 1>", "<user prompt 2>", "..."],
+  "response": ["<assistant response 1>", "<assistant response 2>", "..."]
+}}
+### Available Tools:
+1. **generate_data_tool**
+   - Use when: You need to create initial instructions for data generation
+   - Purpose: Generates structured instructions based on user input
+   - Input: User's query about what kind of data they want
+   - Output: JSON with "instructions" key
+2. **generate_response_tool**
+   - Use when: You have instructions and need to generate corresponding responses
+   - Purpose: Creates appropriate responses for the given instructions
+   - Input: Instructions from generate_data_tool
+   - Output: JSON with "response" key
+3. **csv_tool**
+   - Use when: You have complete JSON data ready to be saved
+   - Purpose: Converts JSON data to CSV format and saves it
+   - Input: Complete JSON data with both instructions and responses
+   - Output: Saves data to "Data_File.csv"
+### Tool Usage Flow:
+1. First, use generate_data_tool to create instructions
+2. Then, use generate_response_tool to create corresponding responses
+3. Finally, you MUST use csv_tool to save the complete dataset
+Remember to always maintain the correct JSON structure throughout the process.
+IMPORTANT: Only provide JSON output without any additional text before or after the JSON structure except of 'Json'. Do not include any explanatory text, markdown formatting, or other content outside the JSON object.
+If user dose not mention the number of rows then default to 10 rows.
+"""
+query_system_prompt = """You are a **Data Generation Agent** that produces **natural language instructions** to guide the creation of fine-tuning datasets based on a user request.
+### Your Task:
+1. Understand the user's input and determine the type and topic of data required.
+2. Based on the input, generate a **single, clear instruction** for creating a dataset. The instruction should describe what kind of data to generate, in natural and concise language.
+3. If the number of rows is not explicitly mentioned, default to **10 rows**.
+4. **Only return a string** with one key: `"instructions"`.
+---
+### Output Format:
+  "instructions": "Generate 1000 rows of employee salary data based on..."
+"""
+response_system_prompt = """
+You are a **Data Generation Agent** responsible for generating **structured data responses** based on the given instructions.
+---
+### Your Task:
+1. Read and understand the provided **instructions**.
+2. Generate the appropriate **data or description** that directly fulfills the instructions.
+3. Return only a **JSON string** with one key: `"response"`.
+---
+### Output Format:
+  "response": "Here is the data..."
+"""
+def generate_data(query : str) -> str:
+    try:
+        query_llm =  ChatGoogleGenerativeAI(
+    model="gemini-2.0-flash ",   # gemini-2.0-flash     or gemini-2.5-flash-preview-04-17
+    api_key=GOOGLE_API_KEY,
+    temperature=0.9,
+    )
+        query_prompt = ChatPromptTemplate.from_messages([
+            ("system", query_system_prompt),
+            ("human", "{input}"),
+            ("assistant", "{agent_scratchpad}"),
+        ])
+        chain: Runnable = query_prompt | query_llm
+        result = chain.invoke({"input": query, "agent_scratchpad": ""})
+        return result
+    except Exception as e:
+        print(f"Error in generate_data: {str(e)}")
+        raise
+def generate_response(instructions : str) -> str:
+    try:
+        response_llm = ChatGoogleGenerativeAI(
+    model="gemini-2.0-flash",
+    api_key=GOOGLE_API_KEY,
+    temperature=0.9,
+    )
+        response_prompt = ChatPromptTemplate.from_messages([
+            ("system", response_system_prompt),
+            ("human", "{instructions}"),
+            ("assistant", "{agent_scratchpad}"),
+        ])
+        chain: Runnable = response_prompt | response_llm
+        result = chain.invoke({"instructions": instructions, "agent_scratchpad": ""})
+        return result
+    except Exception as e:
+        print(f"Error in generate_response: {str(e)}")
+        raise
+def save_to_csv(data: str):
+    try:
+        # Strip Markdown code-fence markers (``` and the "json" tag) from the input before parsing
+        print(f"here is the data {data}")
+        # Parse the JSON string into a Python dictionary
+        if ('json') in data:
+            print(f"json is in data")
+            if "```" in data:
+                print(f"``` is in data")
+                data = data.replace("json", "").strip()
+                data = data.replace("```", "").strip()
+                data_dict = json.loads(data)
+                df = pd.DataFrame()
+                df['instructions'] = data_dict['instructions']
+                df['response'] = data_dict['response']
+                print(f"DataFrame shape: {df.shape}")
+                print(f"DataFrame columns: {df.columns.tolist()}")
+                print(f"here is df {df}")
+                # Save to CSV without index
+                print("\nSaving to CSV...")
+                output_path = "Data_File.csv"
+                df.to_csv(output_path, index=False)
+                return f"File scucessfully created"
+            else:
+                print(f"``` is not in data")
+                data = data.replace("json", "").strip()
+                print(f"data is {data}")
+                data_dict = json.loads(data)
+                print(f"data_dict is {data_dict}")
+                df = pd.DataFrame()
+                df['instructions'] = data_dict['instructions']
+                df['response'] = data_dict['response']
+                print(f"DataFrame shape: {df.shape}")
+                print(f"DataFrame columns: {df.columns.tolist()}")
+                print(f"here is df {df}")
+                # Save to CSV without index
+                print("\nSaving to CSV...")
+                output_path = "Data_File.csv"
+                df.to_csv(output_path, index=False)
+                return f"File scucessfully created"
+        # Convert the dictionary to a DataFrame
+        elif "```" in data:
+            print(f"``` is in data")
+            data = data.replace("```", "").strip()
+            data = json.loads(data)
+            df = pd.DataFrame()
+            df['instructions'] = data['instructions']
+            df['response'] = data['response']
+            # Save to CSV without index
+            print("\nSaving to CSV...")
+            output_path = "Data_File.csv"
+            df.to_csv(output_path, index=False)
+            return f"File created successfully"
+        else:
+            print(f"''' is not in data")
+            data = json.loads(data)
+            df = pd.DataFrame()
+            df['instructions'] = data['instructions']
+            df['response'] = data['response']
+            # Save to CSV without index
+            print("\nSaving to CSV...")
+            output_path = "Data_File.csv"
+            df.to_csv(output_path, index=False)
+            return f"File created successfully"
+    except json.JSONDecodeError as e:
+        raise ValueError(f"Invalid JSON data: {str(e)}")
+generate_data_tool =    Tool(
+        name="generate_data_tool",
+        description="Generate the data(Instructions) for the query",
+        func=generate_data
+    )
+generate_response_tool =    Tool(
+        name="generate_response_tool",
+        description="Generate the data(Response) for the instructions",
+        func=generate_response
+    )
+csv_tool = Tool(
+    name="csv_tool",
+    description="Pass the JSON data after generating both instructions and responses, convert it into csv then save the csv file",
+    func=save_to_csv
+)
+tools = [generate_data_tool, generate_response_tool,csv_tool]
+query_prompt = ChatPromptTemplate.from_messages([
+    ("system", system_prompt),
+    ("human", "{input}"),
+    MessagesPlaceholder(variable_name="agent_scratchpad")
+])
+llm = ChatGoogleGenerativeAI(
+    model="gemini-2.0-flash",
+    api_key=GOOGLE_API_KEY,
+    temperature=0.9,
+    )
+agent = create_openai_tools_agent(llm=llm, prompt=query_prompt, tools=tools)
+data_agent = AgentExecutor(agent=agent, tools=tools, verbose=True)
+def generate_data_agent(query: str):
+    try:
+        if not query:
+            return {"status": "error", "message": "Query cannot be empty", "csv_file": None}
+        print(f"Processing query: {query}")
+        result = data_agent.invoke({"input": query})
+        print(f"Agent execution result: {result['output']}")
+        save_to_csv(result['output'])
+        print(f"here is the result {type(result['output'])}")
+        # Check that Data_File.csv was created
+        if os.path.exists("Data_File.csv"):
+            return {
+                "status": "success",
+                "message": "Created successfully! You can download the CSV file below.",
+                "csv_file": "Data_File.csv"
+            }
+        else:
+            return {
+                "status": "error",
+                "message": "Failed to generate data file",
+                "csv_file": None
+            }
+    except Exception as e:
+        print(f"Error in generate_data_agent: {str(e)}")
+        return {
+            "status": "error",
+            "message": f"An error occurred: {str(e)}",
+            "csv_file": None
+        }

Dockerfile CHANGED Viewed

@@ -1,20 +1,20 @@
-# Use Python 3.10 as base image
-FROM python:3.13-slim
-# Set working directory
-WORKDIR /app
-# Copy requirements first to leverage Docker cache
-COPY requirements.txt .
-# Install dependencies
-RUN pip install --no-cache-dir -r requirements.txt
-# Copy the rest of the application
-COPY . .
-# Expose the port the app runs on
-EXPOSE 7860
-# Command to run the application
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

+# Use Python 3.10 as base image
+FROM python:3.10-slim
+# Set working directory
+WORKDIR /app
+# Copy requirements first to leverage Docker cache
+COPY requirements.txt .
+# Install dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the rest of the application
+COPY . .
+# Expose the port the app runs on
+EXPOSE 7860
+# Command to run the application
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,120 +1,112 @@
----
-license: mit
-title: Fine Tune Data Generation Agent
-sdk: docker
-colorFrom: blue
-colorTo: blue
-short_description: Generate your Fine-Tune Dataset only with one Query using AI
----
-# Data Generation Agent
-A LangChain-based agent that automatically generates diverse training data for fine-tuning LLM models. While optimized for customer support conversations, it can generate any type of instruction-response pairs, including but not limited to:
-- Customer service interactions
-- Technical support dialogues
-- Product inquiries
-- FAQ responses
-- Educational content
-- Code explanations
-- Creative writing prompts
-![](image1.png)
-![](image2.png)
-The agent creates structured data pairs (instructions and responses) in JSON format and saves them to CSV, making it easy to prepare training data for language models.
-## Features
-- Generates structured data in JSON format
-- Supports custom data generation instructions
-- Automatically saves data to CSV format
-- Uses OpenAI's GPT models for data generation
-- Implements a two-step process: instruction generation and response generation
-- Versatile data generation for any domain or use case
-- Customizable output format and structure
-## Prerequisites
-- Python 3.x
-- OpenAI API key
-- Required Python packages (install via pip):
-  ```bash
-  pip install langchain langchain-openai pandas numpy matplotlib python-dotenv
-  ```
-## Environment Setup
-1. Create a `.env` file in the project root
-2. Add your OpenAI API key:
-  ```
-  OPENAI_API_KEY=your_api_key_here
-  ```
-## Code Structure
-### Main Components
-1. **System Prompts**
-   - `system_prompt`: Main agent prompt for overall data generation
-   - `query_system_prompt`: Prompt for generating instructions
-   - `response_system_prompt`: Prompt for generating responses
-2. **Core Functions**
-   - `generate_data(query)`: Generates instructions based on user query
-   - `generate_response(instructions)`: Generates responses based on instructions
-   - `save_to_csv(data)`: Saves generated data to CSV file
-3. **Tools**
-   - `generate_data_tool`: Tool for instruction generation
-   - `generate_response_tool`: Tool for response generation
-   - `csv_tool`: Tool for saving data to CSV
-### Usage Example
-```python
-query = "provide me amx customer support data atleast 100 rows"
-result = data_agent.invoke({"input": query})
-```
-## Output Format
-The agent generates data in the following JSON format:
-```json
-{
-    "instructions": ["instruction1", "instruction2", ...],
-    "response": ["response1", "response2", ...]
-}
-```
-## Data Generation Process
-1. **Instruction Generation**
-   - Takes user query as input
-   - Generates natural language instructions
-   - Returns JSON with "instructions" key
-2. **Response Generation**
-   - Takes instructions as input
-   - Generates corresponding responses
-   - Returns JSON with "response" key
-3. **Data Storage**
-   - Converts JSON data to DataFrame
-   - Saves to CSV file named "data.csv"
-## Configuration
-- Model: GPT-4 (configurable via `model` parameter)
-- Temperature: 0.8 (configurable)
-- Default row count: 1000 (if not specified in query)
-## Error Handling
-The code includes basic error handling for:
-- JSON parsing
-- CSV file operations
-- API calls
-## Contributing
-Feel free to submit issues and enhancement requests!

+# Data Generation Agent
+A LangChain-based agent that automatically generates diverse training data for fine-tuning LLM models. While optimized for customer support conversations, it can generate any type of instruction-response pairs, including but not limited to:
+- Customer service interactions
+- Technical support dialogues
+- Product inquiries
+- FAQ responses
+- Educational content
+- Code explanations
+- Creative writing prompts
+![](image1.png)
+![](image2.png)
+The agent creates structured data pairs (instructions and responses) in JSON format and saves them to CSV, making it easy to prepare training data for language models.
+## Features
+- Generates structured data in JSON format
+- Supports custom data generation instructions
+- Automatically saves data to CSV format
+- Uses Google Gemini models (via `langchain-google-genai`) for data generation
+- Implements a two-step process: instruction generation and response generation
+- Versatile data generation for any domain or use case
+- Customizable output format and structure
+## Prerequisites
+- Python 3.x
+- Google API key (for the Gemini API)
+- Required Python packages (install via pip):
+  ```bash
+  pip install langchain langchain-openai langchain-google-genai langchain-nvidia-ai-endpoints pandas numpy matplotlib python-dotenv
+  ```
+## Environment Setup
+1. Create a `.env` file in the project root
+2. Add your Google API key:
+  ```
+  GOOGLE_API_KEY=your_api_key_here
+  ```
+## Code Structure
+### Main Components
+1. **System Prompts**
+   - `system_prompt`: Main agent prompt for overall data generation
+   - `query_system_prompt`: Prompt for generating instructions
+   - `response_system_prompt`: Prompt for generating responses
+2. **Core Functions**
+   - `generate_data(query)`: Generates instructions based on user query
+   - `generate_response(instructions)`: Generates responses based on instructions
+   - `save_to_csv(data)`: Saves generated data to CSV file
+3. **Tools**
+   - `generate_data_tool`: Tool for instruction generation
+   - `generate_response_tool`: Tool for response generation
+   - `csv_tool`: Tool for saving data to CSV
+### Usage Example
+```python
+query = "provide me amx customer support data atleast 100 rows"
+result = data_agent.invoke({"input": query})
+```
+## Output Format
+The agent generates data in the following JSON format:
+```json
+{
+    "instructions": ["instruction1", "instruction2", ...],
+    "response": ["response1", "response2", ...]
+}
+```
+## Data Generation Process
+1. **Instruction Generation**
+   - Takes user query as input
+   - Generates natural language instructions
+   - Returns JSON with "instructions" key
+2. **Response Generation**
+   - Takes instructions as input
+   - Generates corresponding responses
+   - Returns JSON with "response" key
+3. **Data Storage**
+   - Converts JSON data to DataFrame
+   - Saves to CSV file named "Data_File.csv"
+## Configuration
+- Model: `gemini-2.0-flash` (configurable via the `model` parameter)
+- Temperature: 0.9 (configurable)
+- Default row count: 10 (if not specified in query)
+## Error Handling
+The code includes basic error handling for:
+- JSON parsing
+- CSV file operations
+- API calls
+## Contributing
+Feel free to submit issues and enhancement requests!

main.py CHANGED Viewed

@@ -1,88 +1,83 @@
-import fastapi
-from fastapi import FastAPI, HTTPException, Request
-from fastapi.responses import FileResponse, JSONResponse
-from fastapi.staticfiles import StaticFiles
-import json
-from fastapi.responses import HTMLResponse
-from fastapi.templating import Jinja2Templates
-import uvicorn
-from pydantic import BaseModel
-from typing import List, Optional
-from Data_Geneartion_Agent import generate_data_agent
-import os
-from dotenv import load_dotenv
-app = FastAPI()
-from fastapi.middleware.cors import CORSMiddleware
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],  # or your Netlify frontend URL
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-# Mount static files
-templates = Jinja2Templates(directory="templates")
-# app.mount("/static", StaticFiles(directory="static"), name="static")
-class Query(BaseModel):
-    # api_key: str
-    query: str
-class Result(BaseModel):
-    status: str
-    message: str
-    csv_file: Optional[str] = None
-def save_api_key(api_key: str):
-    """Save API key to .env file"""
-    # with open(".env", "w") as f:
-    #     f.write(f"GOOGLE_API_KEY={api_key}\n")
-    # with open("runtime_env.json", "w") as f:
-    #     json.dump({"API_KEY": api_key}, f)
-    # Reload environment variables
-    load_dotenv()
-@app.get("/",response_class=HTMLResponse)
-async def root(request: Request):
-    return templates.TemplateResponse("index.html",{"request":request})
-# class ApiKey(BaseModel):
-#     api_key: str
-# @app.post("/api/save_api_key")
-# async def save_api_key(api_key: ApiKey):
-#     with open(".env", "w") as f:
-#         f.write(f"GOOGLE_API_KEY={api_key.api_key}")
-#     return JSONResponse(content={"message": "API key saved successfully"}, status_code=200)
-@app.post("/generate")
-async def generate(query: Query):
-    try:
-        # Save the API key to .env file
-        # print(f"Here is api_key in main.py {query.api_key}")
-        # save_api_key(query.api_key)
-        # Generate data using the agent with the new API key
-        result = generate_data_agent(query.query)
-        print(f"Here is the final result {result}")
-        return result
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-@app.get("/download/{filename}")
-async def download_file(filename: str):
-    try:
-        print(f"Here is file filename in main.py  {filename}")
-        return FileResponse(
-            path=filename,
-            filename=filename,
-            media_type="text/csv"
-        )
-    except Exception as e:
-        raise HTTPException(status_code=404, detail="File not found")
-if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8000)

+import fastapi
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.responses import FileResponse, JSONResponse
+from fastapi.staticfiles import StaticFiles
+from fastapi.responses import HTMLResponse
+from fastapi.templating import Jinja2Templates
+import uvicorn
+from pydantic import BaseModel
+from typing import List, Optional
+from Data_Geneartion_Agent import generate_data_agent
+import os
+from dotenv import load_dotenv
+app = FastAPI()
+from fastapi.middleware.cors import CORSMiddleware
+# CORS is configured wide open: every origin, method and header is accepted
+# and credentials are allowed.
+# NOTE(review): wildcard origins combined with allow_credentials=True is
+# usually broader than intended — consider listing the real frontend origin.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # or your Netlify frontend URL
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Templates are read from ./templates; static assets are served under /static.
+# NOTE(review): StaticFiles verifies by default that its directory exists when
+# it is instantiated, so a missing ./static folder would fail at import time —
+# confirm the folder ships with the app.
+templates = Jinja2Templates(directory="templates")
+app.mount("/static", StaticFiles(directory="static"), name="static")
+class Query(BaseModel):
+    # Request body of POST /generate.
+    # api_key: written to .env as GOOGLE_API_KEY before the agent runs.
+    api_key: str
+    # query: free-text request handed to generate_data_agent.
+    query: str
+class Result(BaseModel):
+    # Not referenced by any endpoint in this file.
+    # NOTE(review): presumably mirrors the payload returned by
+    # generate_data_agent (status, message, optional CSV name) — confirm
+    # against the agent before using it as a response_model.
+    status: str
+    message: str
+    csv_file: Optional[str] = None
+def save_api_key(api_key: str):
+    """Save API key to .env file"""
+    with open(".env", "w") as f:
+        f.write(f"GOOGLE_API_KEY={api_key}\n")
+    # Reload environment variables
+    load_dotenv()
+@app.get("/",response_class=HTMLResponse)
+async def root(request: Request):
+    return templates.TemplateResponse("index.html",{"request":request})
+# class ApiKey(BaseModel):
+#     api_key: str
+# @app.post("/api/save_api_key")
+# async def save_api_key(api_key: ApiKey):
+#     with open(".env", "w") as f:
+#         f.write(f"GOOGLE_API_KEY={api_key.api_key}")
+#     return JSONResponse(content={"message": "API key saved successfully"}, status_code=200)
+@app.post("/generate")
+async def generate(query: Query):
+    try:
+        # Save the API key to .env file
+        save_api_key(query.api_key)
+        # Generate data using the agent with the new API key
+        result = generate_data_agent(query.query)
+        print(f"Here is the final result {result}")
+        return result
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/download/{filename}")
+async def download_file(filename: str):
+    try:
+        print(f"Here is file filename in main.py  {filename}")
+        return FileResponse(
+            path=filename,
+            filename=filename,
+            media_type="text/csv"
+        )
+    except Exception as e:
+        raise HTTPException(status_code=404, detail="File not found")
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)

requirements.txt CHANGED Viewed

@@ -1,12 +1,11 @@
-fastapi>=0.109.0
-uvicorn>=0.27.0
-python-dotenv>=1.0.0
-jinja2>=3.1.3
-pydantic>=2.6.0
-pandas>=2.2.0
-numpy>=1.26.4
-# ✅ Compatible Langchain stack
-langchain>=0.1.0
-langchain-google-genai>=0.0.6
-langchain-core>=0.1.10

+fastapi==0.104.1
+uvicorn==0.24.0
+python-dotenv==1.0.0
+jinja2==3.1.2
+pydantic==2.4.2
+pandas==1.5.3
+numpy==1.26.2
+langchain==0.0.350
+langchain-google-genai==0.0.5
+langchain-core==0.1.10
+langchain-openai==0.0.5

runtime_env.json DELETED Viewed

	@@ -1 +0,0 @@
1	- {"API_KEY": "AIzaSyCM58jXv44b9TjbvLUzsTZ2secXHcwT-AI"}

templates/index.html DELETED Viewed

@@ -1,495 +0,0 @@
-<!DOCTYPE html>
-<html lang="en">
-<head>
-    <meta charset="UTF-8">
-    <meta name="viewport" content="width=device-width, initial-scale=1.0">
-    <title>Fine Tune Data Generation Agent</title>
-    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
-    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
-    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/animate.css/4.1.1/animate.min.css">
-    <style>
-        :root {
-            --primary-color: #8B5CF6;
-            --secondary-color: #EC4899;
-            --accent-color: #10B981;
-            --text-color: #ffffff;
-            --bg-color: #0F172A;
-            --chat-bg: #1E293B;
-            --nav-bg: rgba(15, 23, 42, 0.95);
-        }
-        body {
-            background-color: var(--bg-color);
-            color: var(--text-color);
-            min-height: 100vh;
-            position: relative;
-            overflow-x: hidden;
-            font-family: 'Inter', sans-serif;
-            padding-top: 70px;
-        }
-        .navbar {
-            background-color: var(--nav-bg);
-            backdrop-filter: blur(10px);
-            border-bottom: 1px solid rgba(255, 255, 255, 0.1);
-            padding: 0.5rem 1rem;
-        }
-        .nav-note {
-            font-size: 0.85rem;
-            color: #ff6b6b;
-            padding: 0.5rem;
-            border-radius: 0.5rem;
-            background: rgba(255, 107, 107, 0.1);
-            border-left: 3px solid #ff6b6b;
-        }
-        .container {
-            position: relative;
-            z-index: 2;
-            max-width: 1200px;
-            margin: 0 auto;
-            padding: 0 20px;
-            height: calc(100vh - 70px);
-            display: flex;
-            flex-direction: column;
-        }
-        .chat-container {
-            flex: 1;
-            overflow-y: auto;
-            padding: 1.5rem 2rem;
-            margin-bottom: 100px;
-            position: relative;
-            z-index: 2;
-        }
-        .message {
-            display: flex;
-            margin-bottom: 1.5rem;
-            animation: slideIn 0.3s ease-out;
-            position: relative;
-            z-index: 3;
-        }
-        .message.user {
-            justify-content: flex-end;
-        }
-        .message-content {
-            max-width: 70%;
-            padding: 1rem 1.5rem;
-            border-radius: 1rem;
-            position: relative;
-            transform-style: preserve-3d;
-            transition: transform 0.3s ease;
-        }
-        .message.user .message-content {
-            background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
-            margin-left: auto;
-        }
-        .message.bot .message-content {
-            background: var(--chat-bg);
-            border: 1px solid rgba(255, 255, 255, 0.1);
-        }
-        .message-content:hover {
-            transform: translateZ(20px) rotateX(5deg);
-        }
-        .query-box {
-            position: fixed;
-            bottom: 0;
-            left: 0;
-            right: 0;
-            padding: 1.5rem;
-            background: rgba(15, 23, 42, 0.95);
-            backdrop-filter: blur(10px);
-            border-top: 1px solid rgba(255, 255, 255, 0.1);
-            z-index: 10;
-        }
-        .query-input {
-            background-color: var(--chat-bg);
-            border: 2px solid rgba(139, 92, 246, 0.3);
-            color: var(--text-color);
-            padding: 1.2rem;
-            border-radius: 1rem;
-            font-size: 1.1rem;
-            transition: all 0.3s ease;
-        }
-        .query-input:focus {
-            border-color: var(--primary-color);
-            box-shadow: 0 0 0 3px rgba(139, 92, 246, 0.2);
-            outline: none;
-        }
-        .btn-primary {
-            background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
-            border: none;
-            padding: 1.2rem 2rem;
-            border-radius: 1rem;
-            font-weight: 600;
-            transition: all 0.3s ease;
-        }
-        .btn-primary:hover {
-            transform: translateY(-2px);
-            box-shadow: 0 5px 15px rgba(139, 92, 246, 0.4);
-        }
-        /* Enhanced Download Button */
-        .download-btn {
-            position: relative;
-            overflow: hidden;
-            transition: all 0.4s ease;
-            display: inline-block;
-            z-index: 20;
-        }
-        .download-btn::before {
-            content: '';
-            position: absolute;
-            top: 0;
-            left: -100%;
-            width: 100%;
-            height: 100%;
-            background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent);
-            transition: 0.5s;
-            z-index: -1;
-        }
-        .download-btn:hover::before {
-            left: 100%;
-        }
-        .download-btn:hover {
-            transform: translateY(-3px);
-            box-shadow: 0 10px 20px rgba(0, 0, 0, 0.3);
-        }
-        .download-btn:active {
-            transform: translateY(1px);
-        }
-        .download-btn .fa-download {
-            transition: transform 0.3s ease;
-        }
-        .download-btn:hover .fa-download {
-            transform: translateY(-3px);
-        }
-        .pulse {
-            animation: pulse 2s infinite;
-        }
-        @keyframes pulse {
-            0% {
-                box-shadow: 0 0 0 0 rgba(16, 185, 129, 0.7);
-            }
-            70% {
-                box-shadow: 0 0 0 10px rgba(16, 185, 129, 0);
-            }
-            100% {
-                box-shadow: 0 0 0 0 rgba(16, 185, 129, 0);
-            }
-        }
-        .floating-particles {
-            position: fixed;
-            top: 0;
-            left: 0;
-            width: 100%;
-            height: 100%;
-            pointer-events: none;
-            z-index: 1;
-        }
-        /* Make sure canvas doesn't block clicks */
-        .floating-particles canvas {
-            pointer-events: none !important;
-        }
-        .loader {
-            display: inline-block;
-            width: 20px;
-            height: 20px;
-            border: 3px solid rgba(255,255,255,.3);
-            border-radius: 50%;
-            border-top-color: var(--accent-color);
-            animation: spin 1s ease-in-out infinite;
-            margin-right: 10px;
-            vertical-align: middle;
-        }
-        @keyframes spin {
-            to { transform: rotate(360deg); }
-        }
-        @keyframes slideIn {
-            from {
-                opacity: 0;
-                transform: translateY(20px);
-            }
-            to {
-                opacity: 1;
-                transform: translateY(0);
-            }
-        }
-    </style>
-</head>
-<body>
-    <!-- Navbar -->
-    <nav class="navbar fixed-top navbar-expand-lg">
-        <div class="container-fluid">
-            <a class="navbar-brand d-flex align-items-center" href="#">
-                <i class="fas fa-database me-2"></i>
-                <span class="fw-bold">Fine Tune Data Generator</span>
-            </a>
-            <div class="d-flex">
-                <div class="nav-note me-3">
-                    <i class="fas fa-exclamation-circle me-2"></i>
-                    If you find errors (similar wrong format or unable to detect model), please try again later or contact the developer
-                </div>
-                <!-- <div class="api-key-input">
-                    <input type="password" id="apiKeyInput" class="form-control" placeholder="Enter Gemini API Key" style="background-color: var(--chat-bg); color: var(--text-color); border: 1px solid rgba(255, 255, 255, 0.1);">
-                </div> -->
-            </div>
-        </div>
-    </nav>
-    <div class="floating-particles" id="particles"></div>
-    <div class="container">
-        <div class="chat-container" id="chatContainer">
-            <div class="message bot">
-                <div class="message-content">
-                    <h3><i class="fas fa-robot me-2"></i>Welcome to Fine Tune Data Generation Agent</h3>
-                    <p>I'm here to help you generate datasets. What would you like to create?</p>
-                </div>
-            </div>
-        </div>
-        <div class="query-box">
-            <form id="queryForm" class="d-flex">
-                <input type="text" id="queryInput" class="form-control query-input" placeholder="Enter your query to generate dataset...">
-                <button type="submit" class="btn btn-primary ms-2">
-                    <i class="fas fa-paper-plane me-2"></i>Send
-                </button>
-            </form>
-            <div class="contact-links mt-2 text-center" style="font-size: 0.9rem;">
-                <a href="https://www.linkedin.com/in/harsimransinghtech/" target="_blank" class="text-light me-3" style="text-decoration: none;">
-                    <i class="fab fa-linkedin"></i> LinkedIn
-                </a>
-                <a href="https://github.com/harsimran726" target="_blank" class="text-light" style="text-decoration: none;">
-                    <i class="fab fa-github"></i> GitHub
-                </a>
-            </div>
-        </div>
-    </div>
-    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"></script>
-    <script src="https://cdn.jsdelivr.net/npm/three@0.132.2/build/three.min.js"></script>
-    <script>
-        // 3D Particles Animation with fixed z-index
-        const scene = new THREE.Scene();
-        const camera = new THREE.PerspectiveCamera(75, window.innerWidth / window.innerHeight, 0.1, 1000);
-        const renderer = new THREE.WebGLRenderer({
-            alpha: true,
-            antialias: true
-        });
-        const particlesContainer = document.getElementById('particles');
-        renderer.setSize(window.innerWidth, window.innerHeight);
-        renderer.domElement.style.position = 'fixed';
-        renderer.domElement.style.top = '0';
-        renderer.domElement.style.left = '0';
-        renderer.domElement.style.zIndex = '1';
-        renderer.domElement.style.pointerEvents = 'none'; // Ensure no interaction
-        particlesContainer.appendChild(renderer.domElement);
-        const particles = [];
-        const particleCount = 25; // Reduced for performance
-        for (let i = 0; i < particleCount; i++) {
-            const geometry = new THREE.IcosahedronGeometry(0.15, 1); // Smoother shape
-            const material = new THREE.MeshBasicMaterial({
-                color: new THREE.Color(
-                    Math.random() * 0.5 + 0.5,
-                    Math.random() * 0.5,
-                    Math.random() * 0.5 + 0.5
-                ),
-                transparent: true,
-                opacity: 0.4
-            });
-            const particle = new THREE.Mesh(geometry, material);
-            particle.position.x = Math.random() * 10 - 5;
-            particle.position.y = Math.random() * 10 - 5;
-            particle.position.z = Math.random() * 10 - 5;
-            particles.push(particle);
-            scene.add(particle);
-        }
-        camera.position.z = 5;
-        function animate() {
-            requestAnimationFrame(animate);
-            particles.forEach(particle => {
-                particle.rotation.x += 0.005;
-                particle.rotation.y += 0.005;
-                particle.position.y += Math.sin(Date.now() * 0.001) * 0.005;
-            });
-            renderer.render(scene, camera);
-        }
-        animate();
-        // Form submission handling
-        document.getElementById('queryForm').addEventListener('submit', async (e) => {
-            e.preventDefault();
-            const query = document.getElementById('queryInput').value.trim();
-            // const apiKey = document.getElementById('apiKeyInput').value.trim();
-            if (!query) return;
-            // if (!apiKey) {
-            //     const errorMessage = document.createElement('div');
-            //     errorMessage.className = 'message bot';
-            //     errorMessage.innerHTML = `
-            //         <div class="message-content">
-            //             <p class="text-danger"><i class="fas fa-exclamation-triangle me-2"></i>Please enter your Gemini API Key first</p>
-            //         </div>
-            //     `;
-            //     document.getElementById('chatContainer').appendChild(errorMessage);
-            //     return;
-            // }
-            const chatContainer = document.getElementById('chatContainer');
-            // Add user message
-            const userMessage = document.createElement('div');
-            userMessage.className = 'message user';
-            userMessage.innerHTML = `
-                <div class="message-content">
-                    ${query}
-                </div>
-            `;
-            chatContainer.appendChild(userMessage);
-            try {
-                // Show loading message
-                const loadingMessage = document.createElement('div');
-                loadingMessage.className = 'message bot';
-                loadingMessage.innerHTML = `
-                    <div class="message-content">
-                        <p><span class="loader"></span>Generating your dataset...</p>
-                    </div>
-                `;
-                chatContainer.appendChild(loadingMessage);
-                // Clear input
-                document.getElementById('queryInput').value = '';
-                // Scroll to bottom
-                chatContainer.scrollTop = chatContainer.scrollHeight;
-                // Make API call
-                const response = await fetch('/generate', {
-                    method: 'POST',
-                    headers: {
-                        'Content-Type': 'application/json',
-                    },
-                    body: JSON.stringify({
-                        query: query,
-                        // api_key: apiKey
-                    })
-                });
-                if (!response.ok) {
-                    throw new Error('Failed to generate data. Server responded with status: ' + response.status);
-                }
-                const result = await response.json();
-                // Remove loading message
-                chatContainer.removeChild(loadingMessage);
-                // Add bot response message
-                const botMessage = document.createElement('div');
-                botMessage.className = 'message bot';
-                let messageContent = `
-                    <div class="message-content">
-                        <p>${result.message}</p>
-                `;
-                // Add download button if CSV file is available
-                if (result.csv_file) {
-                    messageContent += `
-                            <a href="/download/tmp/${result.csv_file}" class="btn btn-primary download-btn pulse" download="${result.csv_file}">
-                                <i class="fas fa-download me-2"></i>Download Dataset
-                            </a>
-                    `;
-                }
-                messageContent += `</div>`;
-                botMessage.innerHTML = messageContent;
-                chatContainer.appendChild(botMessage);
-                // Scroll to bottom
-                chatContainer.scrollTop = chatContainer.scrollHeight;
-                // Add click handler to download button
-                const downloadBtn = botMessage.querySelector('.download-btn');
-                if (downloadBtn) {
-                    downloadBtn.addEventListener('click', function(e) {
-                        console.log('Download initiated:', this.href);
-                        // Optional: Track download event
-                    });
-                }
-            } catch (error) {
-                // Remove loading message if it exists
-                const loadingMessages = chatContainer.querySelectorAll('.message');
-                const lastMessage = loadingMessages[loadingMessages.length - 1];
-                if (lastMessage && lastMessage.querySelector('.loader')) {
-                    chatContainer.removeChild(lastMessage);
-                }
-                // Add error message
-                const errorMessage = document.createElement('div');
-                errorMessage.className = 'message bot';
-                errorMessage.innerHTML = `
-                    <div class="message-content">
-                        <p class="text-danger"><i class="fas fa-exclamation-triangle me-2"></i>${error.message}</p>
-                    </div>
-                `;
-                chatContainer.appendChild(errorMessage);
-                // Scroll to bottom
-                chatContainer.scrollTop = chatContainer.scrollHeight;
-            }
-        });
-        // Handle window resize
-        window.addEventListener('resize', () => {
-            camera.aspect = window.innerWidth / window.innerHeight;
-            camera.updateProjectionMatrix();
-            renderer.setSize(window.innerWidth, window.innerHeight);
-        });
-    </script>
-</body>
-</html>