# Spaces: rohangbs / Dataset-Creator (Sleeping)
# File size: 6,165 Bytes
# 30dd9be

import json
import os
import time
from typing import Dict, List, Optional

import httpx
import pandas as pd
import streamlit as st

class GroqHRGenerator:
    """Build HR-employee conversation datasets via the Groq chat-completions API.

    Each scenario is sent to the model, the JSON reply is parsed into
    conversation turns, and the turns of all scenarios are flattened into a
    single ``pandas.DataFrame``.  Failures are reported in the Streamlit UI
    (``st.error`` / ``st.warning``) and surface to callers as ``None`` instead
    of exceptions.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "mixtral-8x7b-32768",
        temperature: float = 0.7,
        max_tokens: int = 1000,
        timeout: float = 30.0,
    ):
        """Store credentials and request settings.

        Args:
            api_key: Groq API key, sent as a bearer token.
            model: Model identifier placed in the request payload.
            temperature: Sampling temperature placed in the request payload.
            max_tokens: Completion token limit placed in the request payload.
            timeout: Per-request timeout in seconds passed to ``httpx.post``.
        """
        self.api_key = api_key
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.timeout = timeout
        self.base_url = "https://api.groq.com/openai/v1/chat/completions"
        self.headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

    def _call_groq_api(self, prompt: str) -> Optional[str]:
        """Send ``prompt`` to the chat endpoint and return the reply text.

        Returns:
            The content of the first choice, or ``None`` when the request
            fails or the response body does not have the expected shape (the
            error is shown with ``st.error``).
        """
        payload = {
            "model": self.model,
            "messages": [
                {
                    "role": "system",
                    "content": """You are a conversation generator for HR-employee interactions. 
                    Generate realistic conversations with emotional context and natural flow. 
                    Output should be in JSON format with the following structure for each turn:
                    {"role": "employee/hr", "message": "text", "emotion": "emotion_name"}"""
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "temperature": self.temperature,
            "max_tokens": self.max_tokens
        }

        try:
            response = httpx.post(
                self.base_url,
                headers=self.headers,
                json=payload,
                timeout=self.timeout
            )
            response.raise_for_status()
            return response.json()['choices'][0]['message']['content']
        # httpx.HTTPError covers transport failures, timeouts and non-2xx
        # statuses; the remaining types cover a body that is not JSON or
        # lacks the choices[0].message.content path.
        except (httpx.HTTPError, KeyError, IndexError, TypeError, ValueError) as e:
            st.error(f"Error calling Groq API: {e}")
            return None

    @staticmethod
    def _parse_conversation(raw: str) -> List[Dict]:
        """Extract the list of turn dicts from a raw model reply.

        Accepts a bare JSON array, an array wrapped in Markdown code fences
        or surrounded by prose, a single turn object, or an object that wraps
        exactly one array (e.g. ``{"conversation": [...]}``).  Entries that
        are not JSON objects are dropped.

        Raises:
            ValueError: If no usable JSON can be found (``json.JSONDecodeError``
                is a ``ValueError`` subclass).
        """
        if not isinstance(raw, str):
            raise ValueError(f"expected text reply, got {type(raw).__name__}")

        cleaned = raw.replace("```json", "").replace("```", "").strip()
        try:
            parsed = json.loads(cleaned)
        except json.JSONDecodeError:
            # The model sometimes adds prose around the array; retry on the
            # outermost [...] span before giving up.
            start, end = cleaned.find("["), cleaned.rfind("]")
            if start == -1 or end <= start:
                raise
            parsed = json.loads(cleaned[start:end + 1])

        if isinstance(parsed, dict):
            if "message" in parsed:
                # A single turn object rather than a list of turns.
                parsed = [parsed]
            else:
                list_values = [v for v in parsed.values() if isinstance(v, list)]
                if len(list_values) != 1:
                    raise ValueError("JSON object does not contain a single list of turns")
                parsed = list_values[0]

        if not isinstance(parsed, list):
            raise ValueError(f"expected a JSON list of turns, got {type(parsed).__name__}")

        return [turn for turn in parsed if isinstance(turn, dict)]

    @staticmethod
    def _turns_to_rows(conversation: Optional[List[Dict]], conversation_id: int, scenario: str) -> List[Dict]:
        """Convert parsed turns into flat dataset rows.

        Turns that are not dicts or lack ``role``/``message`` are skipped
        instead of raising; a missing or empty ``emotion`` becomes ``'N/A'``.
        """
        rows = []
        for turn in conversation or []:
            if not isinstance(turn, dict) or 'role' not in turn or 'message' not in turn:
                continue
            rows.append({
                'conversation_id': conversation_id,
                'role': turn['role'],
                'message': turn['message'],
                'emotion': turn.get('emotion') or 'N/A',
                'scenario': scenario
            })
        return rows

    def generate_conversation(self, scenario: str) -> Optional[List[Dict]]:
        """Generate one conversation for ``scenario``.

        Returns:
            A list of turn dicts as returned by the model, or ``None`` when
            the API call fails or the reply cannot be parsed (the error is
            shown with ``st.error``).
        """
        prompt = f"""
        Generate a realistic HR-employee conversation about the following scenario:
        {scenario}
        
        The conversation should:
        1. Include natural emotional responses from the employee
        2. Show professional and empathetic responses from HR
        3. Have a natural flow and progression
        4. Include 12-15 turns between the employee and HR
        
        Return the conversation in JSON format as a list of messages, where each message has:
        - role (employee/hr)
        - message (the actual text)
        - emotion (for employee messages only)
        """

        response = self._call_groq_api(prompt)
        if not response:
            return None
        try:
            return self._parse_conversation(response)
        except ValueError as e:
            st.error(f"Error parsing JSON response: {e}")
            return None

    def generate_dataset(self, scenarios: List[str], delay_seconds: float = 1.0) -> Optional[pd.DataFrame]:
        """Generate conversations for all scenarios and flatten them.

        Args:
            scenarios: Scenario descriptions; conversation ids are their
                1-based positions in this list.
            delay_seconds: Pause inserted between consecutive API calls.

        Returns:
            A DataFrame with columns ``conversation_id``, ``role``,
            ``message``, ``emotion`` and ``scenario``, or ``None`` when no
            scenario produced any usable turn.
        """
        all_turns = []

        for scenario_idx, scenario in enumerate(scenarios, 1):
            # Pause only between calls, so nothing is waited for after the
            # last scenario.
            if scenario_idx > 1 and delay_seconds > 0:
                time.sleep(delay_seconds)

            with st.spinner(f'Generating conversation for scenario {scenario_idx}...'):
                conversation = self.generate_conversation(scenario)

            rows = self._turns_to_rows(conversation, scenario_idx, scenario)
            if conversation is not None and not rows:
                # The API/parse failure paths already reported an error;
                # this covers a reply that parsed but held no valid turns.
                st.warning(f"Scenario {scenario_idx} produced no usable conversation turns.")
            all_turns.extend(rows)

        if all_turns:
            return pd.DataFrame(all_turns)
        return None

def main():
    """Render the Streamlit UI for building an HR conversation dataset.

    The page collects a Groq API key and a variable number of scenario
    texts, runs ``GroqHRGenerator.generate_dataset`` on request, and shows
    the resulting table with a CSV download button.  The last generated
    dataset is kept in ``st.session_state`` so it stays visible on later
    reruns (for example after clicking the download button).
    """
    st.title("HR Conversation Dataset Generator")
    st.write("Generate realistic HR-employee conversations based on different scenarios.")

    # API Key input; strip whitespace that often comes along when pasting.
    api_key = st.text_input("Enter your Groq API Key:", type="password").strip()

    # Scenario input
    st.subheader("Enter Scenarios")
    st.write("Add scenarios for generating conversations. Each scenario will generate a unique conversation.")

    # Initialize scenarios list in session state if it doesn't exist
    if 'scenarios' not in st.session_state:
        st.session_state.scenarios = [""]

    # Callback: append an empty scenario field
    def add_scenario():
        st.session_state.scenarios.append("")

    # Callback: remove the scenario field at ``index``
    def remove_scenario(index):
        last = len(st.session_state.scenarios) - 1
        if not 0 < index <= last:
            return
        # Text areas are keyed by position, so shift the text of every later
        # field up one slot; otherwise the removed field's text would stay
        # on screen and the last field's text would be lost instead.
        for pos in range(index, last):
            st.session_state[f"scenario_{pos}"] = st.session_state.get(f"scenario_{pos + 1}", "")
        if f"scenario_{last}" in st.session_state:
            del st.session_state[f"scenario_{last}"]
        st.session_state.scenarios.pop(index)

    # Display scenario input fields.  The widget key is the single source of
    # truth for each field's text (no ``value=`` argument), so the widget's
    # parameters stay constant from run to run.
    new_scenarios = []
    for i in range(len(st.session_state.scenarios)):
        col1, col2 = st.columns([6, 1])
        with col1:
            new_scenarios.append(st.text_area(f"Scenario {i+1}", key=f"scenario_{i}"))
        with col2:
            if i > 0:  # Don't allow removing the first scenario
                # Callbacks run before the rerun triggered by the click, so
                # no explicit rerun is needed.
                st.button("Remove", key=f"remove_{i}", on_click=remove_scenario, args=(i,))

    st.session_state.scenarios = new_scenarios

    st.button("Add Another Scenario", on_click=add_scenario)

    # Generate button
    if st.button("Generate Dataset"):
        # Filter out empty scenarios
        scenarios = [s for s in st.session_state.scenarios if s.strip()]

        if not api_key:
            st.error("Please enter your Groq API key.")
        elif not scenarios:
            st.error("Please enter at least one scenario.")
        else:
            generator = GroqHRGenerator(api_key)
            df = generator.generate_dataset(scenarios)

            if df is not None:
                st.session_state["generated_dataset"] = df
                st.success("Dataset generated successfully!")
            else:
                # Drop any earlier result so a stale table is not shown next
                # to the failure message.
                if "generated_dataset" in st.session_state:
                    del st.session_state["generated_dataset"]
                st.error("Failed to generate dataset. Please try again.")

    # Render from session state rather than from the button branch: the
    # button is only True for the run triggered by its click.
    dataset = st.session_state.get("generated_dataset")
    if dataset is not None:
        # Display the dataset
        st.subheader("Generated Dataset")
        st.dataframe(dataset)

        # Download button
        csv = dataset.to_csv(index=False)
        st.download_button(
            label="Download CSV",
            data=csv,
            file_name="hr_conversations.csv",
            mime="text/csv"
        )

# Entry point: build the UI when this file is executed as a script.
if __name__ == "__main__":
    main()