# Dataset-Creator / app.py
# Source page header: rohangbs · "Create app.py" · commit 30dd9be (verified)
import json
import os
import time
from typing import Dict, List, Optional

import httpx
import pandas as pd
import streamlit as st
class GroqHRGenerator:
    """Generate HR-employee conversations via Groq's chat-completions endpoint.

    Each scenario is sent to the model as a single prompt; the JSON reply is
    parsed into a list of turns and flattened into rows of a DataFrame.
    Failures are reported to the user with ``st.error`` and signalled to the
    caller by returning ``None`` rather than raising.
    """

    API_URL = "https://api.groq.com/openai/v1/chat/completions"
    # NOTE(review): confirm this model id is still served by Groq; callers can
    # pass ``model=`` to the constructor to override it.
    DEFAULT_MODEL = "mixtral-8x7b-32768"
    SYSTEM_PROMPT = (
        "You are a conversation generator for HR-employee interactions.\n"
        "Generate realistic conversations with emotional context and natural flow.\n"
        "Output should be in JSON format with the following structure for each turn:\n"
        '{"role": "employee/hr", "message": "text", "emotion": "emotion_name"}'
    )

    def __init__(self, api_key: str, model: str = DEFAULT_MODEL, timeout: float = 30.0):
        """Store the credentials and request settings.

        Args:
            api_key: Groq API key, sent as a bearer token.
            model: Model identifier placed in the request payload.
            timeout: Per-request timeout in seconds passed to ``httpx.post``.
        """
        self.api_key = api_key
        self.model = model
        self.timeout = timeout
        self.base_url = self.API_URL
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

    def _call_groq_api(self, prompt: str) -> Optional[str]:
        """POST ``prompt`` to the API and return the first choice's content.

        Returns:
            The message content, or ``None`` after showing an error in the UI
            when the request fails or the response body lacks the expected
            ``choices[0].message.content`` path.
        """
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": self.SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ],
            "temperature": 0.7,
            "max_tokens": 1000,
        }
        try:
            response = httpx.post(
                self.base_url,
                headers=self.headers,
                json=payload,
                timeout=self.timeout,
            )
            response.raise_for_status()
            return response.json()['choices'][0]['message']['content']
        except (httpx.HTTPError, KeyError, IndexError, TypeError, ValueError) as exc:
            # httpx.HTTPError covers transport failures and non-2xx statuses;
            # the remaining types cover a body that is not JSON or does not
            # have the expected shape.
            st.error(f"Error calling Groq API: {exc}")
            return None

    @staticmethod
    def _parse_conversation(raw: str) -> List[Dict]:
        """Extract the list of turn dicts from a raw model reply.

        Markdown code fences are stripped first. If the remaining text is not
        valid JSON as a whole, the span from the first ``[`` to the last ``]``
        is tried, so replies that wrap the array in prose still parse. A JSON
        object is accepted when it is a single turn or an envelope holding a
        list; items of the list that are not dicts are dropped.

        Raises:
            ValueError: If no JSON list of turns can be recovered
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        text = raw.replace("```json", "").replace("```", "").strip()
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            start, end = text.find("["), text.rfind("]")
            if start == -1 or end < start:
                raise
            data = json.loads(text[start:end + 1])

        if isinstance(data, dict):
            if "role" in data and "message" in data:
                data = [data]
            else:
                # Unwrap envelopes such as {"conversation": [...]}.
                data = next(
                    (value for value in data.values() if isinstance(value, list)),
                    None,
                )
        if not isinstance(data, list):
            raise ValueError("expected a JSON list of conversation turns")
        return [turn for turn in data if isinstance(turn, dict)]

    @staticmethod
    def _turns_to_rows(conversation, conversation_id: int, scenario: str) -> List[Dict]:
        """Flatten parsed turns into dataset rows, skipping malformed turns.

        A turn is kept only if it is a dict with non-empty ``role`` and
        ``message`` values. A missing, null or empty ``emotion`` becomes
        ``'N/A'``.
        """
        rows = []
        for turn in conversation:
            if not isinstance(turn, dict):
                continue
            role, message = turn.get('role'), turn.get('message')
            if not role or not message:
                continue
            rows.append({
                'conversation_id': conversation_id,
                'role': role,
                'message': message,
                'emotion': turn.get('emotion') or 'N/A',
                'scenario': scenario,
            })
        return rows

    def generate_conversation(self, scenario: str) -> Optional[List[Dict]]:
        """Ask the model for one conversation about ``scenario``.

        Returns:
            A list of turn dicts, or ``None`` when the API call fails, the
            reply is empty, or the reply cannot be parsed (an error is shown
            in the UI for the failure cases).
        """
        prompt = (
            "\n"
            "Generate a realistic HR-employee conversation about the following scenario:\n"
            f"{scenario}\n"
            "The conversation should:\n"
            "1. Include natural emotional responses from the employee\n"
            "2. Show professional and empathetic responses from HR\n"
            "3. Have a natural flow and progression\n"
            "4. Include 12-15 turns between the employee and HR\n"
            "Return the conversation in JSON format as a list of messages, where each message has:\n"
            "- role (employee/hr)\n"
            "- message (the actual text)\n"
            "- emotion (for employee messages only)\n"
        )
        raw = self._call_groq_api(prompt)
        if not raw:
            return None
        try:
            return self._parse_conversation(raw)
        except ValueError as exc:
            st.error(f"Error parsing JSON response: {exc}")
            return None

    def generate_dataset(self, scenarios: List[str], delay: float = 1.0) -> Optional[pd.DataFrame]:
        """Generate one conversation per scenario and collect all turns.

        Args:
            scenarios: Scenario descriptions; conversation ids are their
                1-based positions in this list.
            delay: Seconds to sleep between consecutive API calls. No sleep
                follows the final call.

        Returns:
            A DataFrame with columns ``conversation_id``, ``role``,
            ``message``, ``emotion`` and ``scenario``, or ``None`` when no
            usable turn was produced for any scenario.
        """
        all_turns = []
        total = len(scenarios)
        for scenario_idx, scenario in enumerate(scenarios, 1):
            with st.spinner(f'Generating conversation for scenario {scenario_idx}...'):
                conversation = self.generate_conversation(scenario)
            if conversation:
                all_turns.extend(
                    self._turns_to_rows(conversation, scenario_idx, scenario)
                )
            if scenario_idx < total and delay > 0:
                time.sleep(delay)  # Small delay between API calls
        if not all_turns:
            return None
        return pd.DataFrame(all_turns)
def main():
    """Render the Streamlit page: collect inputs, generate and show the dataset.

    The page asks for a Groq API key and one or more scenario descriptions,
    runs ``GroqHRGenerator.generate_dataset`` when "Generate Dataset" is
    pressed, and offers the result as a table and a CSV download.

    The generated DataFrame is kept in ``st.session_state`` so that it is
    still rendered on later reruns of the script (for example the rerun
    triggered by clicking the download button), when the "Generate Dataset"
    button no longer reports as pressed.
    """
    st.title("HR Conversation Dataset Generator")
    st.write("Generate realistic HR-employee conversations based on different scenarios.")

    # API Key input
    api_key = st.text_input("Enter your Groq API Key:", type="password")

    # Scenario input
    st.subheader("Enter Scenarios")
    st.write("Add scenarios for generating conversations. Each scenario will generate a unique conversation.")

    # Initialise per-session state on first run.
    if 'scenarios' not in st.session_state:
        st.session_state.scenarios = [""]
    if 'generated_dataset' not in st.session_state:
        st.session_state.generated_dataset = None

    # One text area per scenario; every scenario except the first can be removed.
    edited_scenarios = []
    for i, scenario in enumerate(st.session_state.scenarios):
        col1, col2 = st.columns([6, 1])
        with col1:
            edited_scenarios.append(
                st.text_area(f"Scenario {i+1}", scenario, key=f"scenario_{i}")
            )
        with col2:
            if i > 0 and st.button("Remove", key=f"remove_{i}"):
                st.session_state.scenarios.pop(i)
                st.rerun()
    st.session_state.scenarios = edited_scenarios

    if st.button("Add Another Scenario"):
        st.session_state.scenarios.append("")
        st.rerun()

    # Generate button
    if st.button("Generate Dataset"):
        # Pasted keys often carry stray whitespace, which would corrupt the
        # Authorization header.
        cleaned_key = api_key.strip()
        # Filter out empty scenarios
        scenarios = [s for s in st.session_state.scenarios if s.strip()]
        if not cleaned_key:
            st.error("Please enter your Groq API key.")
        elif not scenarios:
            st.error("Please enter at least one scenario.")
        else:
            df = GroqHRGenerator(cleaned_key).generate_dataset(scenarios)
            # Store the outcome (None on failure) so a stale dataset is not
            # shown beneath a failure message.
            st.session_state.generated_dataset = df
            if df is not None:
                st.success("Dataset generated successfully!")
            else:
                st.error("Failed to generate dataset. Please try again.")

    # Render the most recent dataset on every run, not only the run in which
    # it was generated.
    dataset = st.session_state.generated_dataset
    if dataset is not None:
        st.subheader("Generated Dataset")
        st.dataframe(dataset)
        st.download_button(
            label="Download CSV",
            data=dataset.to_csv(index=False),
            file_name="hr_conversations.csv",
            mime="text/csv",
        )


if __name__ == "__main__":
    main()