Spaces:

Wedyan2023
/

Data_Generation_LabelingCopy

Sleeping

App Files Files

Data_Generation_LabelingCopy / app110.py

Wedyan2023

Update app110.py

f63f35b verified 6 months ago

raw

history blame

51.3 kB

	import streamlit as st
	import pandas as pd
	import os
	import json
	import base64
	import random
	from streamlit_pdf_viewer import pdf_viewer
	from langchain.prompts import PromptTemplate
	from datetime import datetime
	from pathlib import Path
	from openai import OpenAI
	from dotenv import load_dotenv
	import warnings

	from transformers import AutoModelForCausalLM, AutoTokenizer
	import torch

	warnings.filterwarnings('ignore')

	# Load variables from a local .env file (if present) before the API token is read.
	load_dotenv()

	# OpenAI-compatible client for the Hugging Face inference router; the token is
	# taken from the 'LLM' environment variable.
	# The earlier client built against api-inference.huggingface.co was overwritten
	# immediately, and the "capital of France" smoke-test request ran on every
	# Streamlit script execution (i.e. on every widget interaction); both are removed.
	# NOTE(review): this base URL is pinned to Llama-3.3-70B-Instruct while the
	# sidebar lets the user pick other models - confirm the endpoint serves them.
	client = OpenAI(
	    base_url="https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3.3-70B-Instruct/v1",
	    api_key=os.environ.get('LLM'),
	)

	# Create necessary directories (exist_ok avoids the check-then-create race).
	for dir_name in ['data', 'feedback']:
	    os.makedirs(dir_name, exist_ok=True)

	# Custom CSS
	# Injects one <style> block into the page. It restyles Streamlit buttons
	# (.stButton > button) and select boxes (.stSelectbox) and declares helper
	# classes (.task-button, .output-container, .status-container, .sidebar-info,
	# .feedback-button, .feedback-container). unsafe_allow_html=True is passed so
	# the markup is emitted as HTML rather than shown as text.
	st.markdown("""
	<style>
	.stButton > button {
	width: 100%;
	margin-bottom: 10px;
	background-color: #4CAF50;
	color: white;
	border: none;
	padding: 10px;
	border-radius: 5px;
	}
	.task-button {
	background-color: #2196F3 !important;
	}
	.stSelectbox {
	margin-bottom: 20px;
	}
	.output-container {
	padding: 20px;
	border-radius: 5px;
	border: 1px solid #ddd;
	margin: 10px 0;
	}
	.status-container {
	padding: 10px;
	border-radius: 5px;
	margin: 10px 0;
	}
	.sidebar-info {
	padding: 10px;
	background-color: #f0f2f6;
	border-radius: 5px;
	margin: 10px 0;
	}
	.feedback-button {
	background-color: #ff9800 !important;
	}
	.feedback-container {
	padding: 15px;
	background-color: #f5f5f5;
	border-radius: 5px;
	margin: 15px 0;
	}
	</style>
	""", unsafe_allow_html=True)

	# Helper functions
	def read_csv_with_encoding(file):
	    """Read a CSV into a DataFrame, trying several text encodings in turn.

	    Args:
	        file: Anything ``pd.read_csv`` accepts (a path or a file-like object).

	    Returns:
	        The DataFrame produced by the first encoding that decodes the data.

	    Raises:
	        UnicodeDecodeError: If none of the candidate encodings can decode
	            the data; the last decoding failure is attached as the cause.
	    """
	    encodings = ['utf-8', 'latin1', 'iso-8859-1', 'cp1252']

	    # Remember where a file-like input started so every retry sees the same
	    # bytes; paths (no tell()) and non-seekable streams are left untouched.
	    try:
	        start_offset = file.tell()
	    except (AttributeError, OSError, ValueError):
	        start_offset = None

	    last_error = None
	    for encoding in encodings:
	        if start_offset is not None:
	            # A failed attempt may have consumed part or all of the stream.
	            file.seek(start_offset)
	        try:
	            return pd.read_csv(file, encoding=encoding)
	        except UnicodeDecodeError as exc:
	            last_error = exc

	    # UnicodeDecodeError needs (encoding, object, start, end, reason); the
	    # previous single-argument call raised TypeError instead.
	    raise UnicodeDecodeError(
	        "/".join(encodings),
	        b"",
	        0,
	        0,
	        "Failed to read file with any supported encoding",
	    ) from last_error

	#

	def reset_conversation():
	    """Start over: empty both stored histories and forget the chosen task.

	    Used as the ``on_click`` callback of the "New Conversation" button.
	    Always returns None.
	    """
	    state = st.session_state
	    # Each history gets its own fresh list.
	    for history_key in ("conversation", "messages"):
	        state[history_key] = []
	    # Removing the task choice brings the task-selection buttons back.
	    if "task_choice" in state:
	        del state["task_choice"]
	#new 24 March
	#user_input = st.text_input("Enter your prompt:")
	###########33

	# Seed the session-state entries used by the script, leaving any value
	# that already exists from an earlier run untouched.
	for state_key, initial_value in (
	    ("messages", []),
	    ("examples_to_classify", []),
	    ("system_role", ""),
	):
	    if state_key not in st.session_state:
	        st.session_state[state_key] = initial_value



	# Main app title
	st.title("🤖🦙 Text Data Labeling and Generation App")


	# Sidebar settings: instructions download, model choice, sampling
	# temperature and the conversation reset button.
	with st.sidebar:
	    st.title("⚙️ Settings")

	    st.markdown("### 📘Data Generation and Labeling Instructions")
	    # Read the PDF up front so a missing file degrades to a warning instead
	    # of crashing the whole app with FileNotFoundError.
	    try:
	        with open("User instructions.pdf", "rb") as f:
	            instructions_pdf = f.read()
	    except FileNotFoundError:
	        instructions_pdf = None

	    if instructions_pdf is None:
	        st.warning("Instructions PDF is not available.")
	    else:
	        st.download_button(
	            label="📄 Download Instructions PDF",
	            data=instructions_pdf,
	            file_name="User instructions.pdf",
	            mime="application/pdf"
	        )

	    # Model identifier passed as `model=` to the chat-completions calls.
	    selected_model = st.selectbox(
	        "Select Model",
	        ["meta-llama/Meta-Llama-3-8B-Instruct-Turbo", "meta-llama/Llama-3.3-70B-Instruct", "meta-llama/Llama-Prompt-Guard-2-86M", "meta-llama/Llama-3.2-11B-Vision-Instruct", "meta-llama/Llama-3.2-3B-Instruct","meta-llama/Llama-4-Scout-17B-16E-Instruct", "meta-llama/Meta-Llama-3-8B-Instruct",
	         "meta-llama/Llama-3.1-70B-Instruct"],
	        key='model_select'
	    )

	    # Sampling temperature in [0.0, 1.0], defaulting to 0.7.
	    temperature = st.slider(
	        "Temperature",
	        0.0, 1.0, 0.7,
	        help="Controls randomness in generation"
	    )

	    st.button("🔄 New Conversation", on_click=reset_conversation)
	    with st.container():
	        st.markdown(f"""
	<div class="sidebar-info">
	<h4>Current Model: {selected_model}</h4>
	<p><em>Note: Generated content may be inaccurate or false. Check important info.</em></p>
	</div>
	""", unsafe_allow_html=True)

	# Link to the external feedback form, rendered as a full-width sidebar button.
	feedback_url = "https://docs.google.com/forms/d/e/1FAIpQLSdZ_5mwW-pjqXHgxR0xriyVeRhqdQKgb5c-foXlYAV55Rilsg/viewform?usp=header"
	st.sidebar.markdown(
	    f'<a href="{feedback_url}" target="_blank"><button style="width: 100%;">Feedback Form</button></a>',
	    unsafe_allow_html=True
	)

	# Display conversation
	# Replays every message stored in session state; each entry is a dict with
	# "role" and "content" keys and is rendered as markdown in a chat bubble.
	for message in st.session_state.messages:
	with st.chat_message(message["role"]):
	st.markdown(message["content"])

	# Main content
	# Until a task has been chosen, show two side-by-side buttons; clicking one
	# records the choice under st.session_state.task_choice.
	if 'task_choice' not in st.session_state:
	col1, col2 = st.columns(2)
	with col1:
	if st.button("📝 Data Generation", key="gen_button", help="Generate new data"):
	st.session_state.task_choice = "Data Generation"
	with col2:
	if st.button("🏷️ Data Labeling", key="label_button", help="Label existing data"):
	st.session_state.task_choice = "Data Labeling"

	if "task_choice" in st.session_state:
	if st.session_state.task_choice == "Data Generation":
	st.header("📝 Data Generation")

	# 1. Domain selection
	domain_selection = st.selectbox("Domain", [
	"Restaurant reviews", "E-Commerce reviews", "News", "AG News", "Tourism", "Custom"
	])

	# 2. Handle custom domain input
	custom_domain_valid = True # Assume valid until proven otherwise

	if domain_selection == "Custom":
	domain = st.text_input("Specify custom domain")
	if not domain.strip():
	st.error("Please specify a domain name.")
	custom_domain_valid = False
	else:
	domain = domain_selection

	# Classification type selection
	classification_type = st.selectbox(
	"Classification Type",
	["Sentiment Analysis", "Binary Classification", "Multi-Class Classification"]
	)

	labels = []
	labels_valid = False
	errors = []

	def validate_binary_labels(labels):
	    """Check the two class names entered for binary classification.

	    Args:
	        labels: Sequence whose first two items are the raw class names.

	    Returns:
	        A list of error messages; empty when both names are filled in and
	        differ after trimming whitespace and ignoring case.
	    """
	    cleaned = [name.strip().lower() for name in labels]

	    required_checks = (
	        (labels[0], "First class name is required."),
	        (labels[1], "Second class name is required."),
	    )
	    problems = [
	        message for raw_name, message in required_checks if not raw_name.strip()
	    ]

	    # A clash is only reported when no name is blank, so two empty inputs
	    # produce the "required" messages rather than a duplicate complaint.
	    names_clash = cleaned[0] == cleaned[1]
	    if all(cleaned) and names_clash:
	        problems.append("Class names must be different.")
	    return problems

	if classification_type == "Sentiment Analysis":
	st.write("### Sentiment Analysis Labels (Fixed)")
	col1, col2, col3 = st.columns(3)
	with col1:
	st.text_input("First class", "Positive", disabled=True)
	with col2:
	st.text_input("Second class", "Negative", disabled=True)
	with col3:
	st.text_input("Third class", "Neutral", disabled=True)
	labels = ["Positive", "Negative", "Neutral"]

	elif classification_type == "Binary Classification":
	st.write("### Binary Classification Labels")
	col1, col2 = st.columns(2)
	with col1:
	label_1 = st.text_input("First class", "Positive")
	with col2:
	label_2 = st.text_input("Second class", "Negative")

	labels = [label_1, label_2]
	errors = validate_binary_labels(labels)

	if errors:
	st.error("\n".join(errors))
	else:
	st.success("Binary class names are valid and unique!")


	elif classification_type == "Multi-Class Classification":
	st.write("### Multi-Class Classification Labels")

	default_labels_by_domain = {
	"News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
	"AG News": ["World", "Sports", "Business", "Sci/Tech"],
	"Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
	"Food & Dining", "Local Experience", "Adventure Activities",
	"Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
	"Luxury Tourism"],
	"Restaurant reviews": ["Italian", "French", "American"],
	"E-Commerce reviews": ["Mobile Phones & Accessories", "Laptops & Computers","Kitchen & Dining",
	"Beauty & Personal Care", "Home & Furniture", "Clothing & Fashion",
	"Shoes & Handbags", "Health & Wellness", "Electronics & Gadgets",
	"Books & Stationery","Toys & Games", "Sports & Fitness",
	"Grocery & Gourmet Food","Watches & Accessories", "Baby Products"]
	}

	num_classes = st.slider("Number of classes", 3, 10, 3)

	# Get defaults for selected domain, or empty list
	defaults = default_labels_by_domain.get(domain, [])

	labels = []
	errors = []
	cols = st.columns(3)

	for i in range(num_classes):
	with cols[i % 3]:
	default_value = defaults[i] if i < len(defaults) else ""
	label_input = st.text_input(f"Class {i+1}", default_value)
	normalized_label = label_input.strip().title()

	if not normalized_label:
	errors.append(f"Class {i+1} name is required.")
	else:
	labels.append(normalized_label)

	# Check for duplicates (case-insensitive)
	if len(labels) != len(set(labels)):
	errors.append("Labels names must be unique (case-insensitive, normalized to Title Case).")

	# Show validation results
	if errors:
	for error in errors:
	st.error(error)
	else:
	st.success("All Labels names are valid and unique!")
	labels_valid = not errors # Will be True only if there are no label errors

	##############
	#new 22/4/2025
	# add additional attributes
	add_attributes = st.checkbox("Add additional attributes (optional)")
	additional_attributes = []

	if add_attributes:
	num_attributes = st.slider("Number of attributes to add", 1, 5, 1)
	for i in range(num_attributes):
	st.markdown(f"#### Attribute {i+1}")
	attr_name = st.text_input(f"Name of attribute {i+1}", key=f"attr_name_{i}")
	attr_topics = st.text_input(f"Topics (comma-separated) for {attr_name}", key=f"attr_topics_{i}")
	if attr_name and attr_topics:
	topics_list = [topic.strip() for topic in attr_topics.split(",") if topic.strip()]
	additional_attributes.append({"attribute": attr_name, "topics": topics_list})

	################

	# Generation parameters
	col1, col2 = st.columns(2)
	with col1:
	min_words = st.number_input("Min words", 1, 100, 20)
	with col2:
	max_words = st.number_input("Max words", min_words, 100, 50)

	# Few-shot examples
	use_few_shot = st.toggle("Use few-shot examples")
	few_shot_examples = []
	if use_few_shot:
	num_examples = st.slider("Number of few-shot examples", 1, 10, 1)
	for i in range(num_examples):
	with st.expander(f"Example {i+1}"):
	content = st.text_area(f"Content", key=f"few_shot_content_{i}")
	label = st.selectbox(f"Label", labels, key=f"few_shot_label_{i}")
	if content and label:
	few_shot_examples.append({"content": content, "label": label})

	num_to_generate = st.number_input("Number of examples", 1, 100, 10)
	#sytem role after
	# System role customization
	#default_system_role = f"You are a professional {classification_type} expert, your role is to generate text examples for {domain} domain. Always generate unique diverse examples and do not repeat the generated data. The generated text should be between {min_words} to {max_words} words long."
	# System role customization
	default_system_role = (
	f"You are a seasoned expert in {classification_type}, specializing in the {domain} domain. "
	f" Your primary responsibility is to generate high-quality, diverse, and unique text examples "
	f"tailored to this domain. Please ensure that each example adheres to the specified length "
	f"requirements, ranging from {min_words} to {max_words} words, and avoid any repetition in the generated content."
	)
	system_role = st.text_area("Modify System Role (optional)",
	value=default_system_role,
	key="system_role_input")
	st.session_state['system_role'] = system_role if system_role else default_system_role
	# Labels initialization
	#labels = []


	user_prompt = st.text_area("User Prompt (optional)")

	# # Updated prompt template including system role
	# prompt_template = PromptTemplate(
	# input_variables=["system_role", "classification_type", "domain", "num_examples",
	# "min_words", "max_words", "labels", "user_prompt", "few_shot_examples", "additional_attributes"],
	# template=(
	# "{system_role}\n"
	# "- Use the following parameters:\n"
	# "- Generate {num_examples} examples\n"
	# "- Each example should be between {min_words} to {max_words} words long\n"
	# "- Use these labels: {labels}.\n"
	# "- Use the following additional attributes:\n"
	# "- {additional_attributes}\n"
	# "- Generate the examples in this format: 'Example text. Label: label'\n"
	# "- Do not include word counts or any additional information\n"
	# "- Always use your creativity and intelligence to generate unique and diverse text data\n"
	# "- In sentiment analysis, ensure that the sentiment classification is clearly identified as Positive, Negative, or Neutral. Do not leave the sentiment ambiguous.\n"
	# "- In binary sentiment analysis, classify text strictly as either Positive or Negative. Do not include or imply Neutral as an option.\n"
	# "- Write unique examples every time.\n"
	# "- DO NOT REPEAT your gnerated text. \n"
	# "- For each Output, describe it once and move to the next.\n"
	# "- List each Output only once, and avoid repeating details.\n"
	# "- Additional instructions: {user_prompt}\n\n"
	# "- Use the following examples as a reference in the generation process\n\n {few_shot_examples}. \n"
	# "- Think step by step, generate numbered examples, and check each newly generated example to ensure it has not been generated before. If it has, modify it"

	# )
	# )

	# Updated prompt template including system role
	prompt_template = PromptTemplate(
	input_variables=["system_role", "classification_type", "domain", "num_examples",
	"min_words", "max_words", "labels", "user_prompt", "few_shot_examples", "additional_attributes"],
	template=(
	"{system_role}\n"
	"- Use the following parameters:\n"
	"- Generate {num_examples} examples\n"
	"- Each example should be between {min_words} to {max_words} words long, every 20 words can be a line long, so generate a line of text for each 20 words where if the number of {max_words} is 100, it should be 5 lines long.\n"
	"- Use these labels: {labels}.\n"
	"- Use the following additional attributes:\n"
	"- {additional_attributes}\n"
	"- Generate the examples in this format: 'Example text. Label: label'\n"
	"- Do not include word counts or any additional information\n"
	"- Always use your creativity and intelligence to generate unique and diverse text data\n"
	"- In sentiment analysis, ensure that the sentiment classification is clearly identified as Positive, Negative, or Neutral. Do not leave the sentiment ambiguous.\n"
	"- In binary sentiment analysis, classify text strictly as either Positive or Negative. Do not include or imply Neutral as an option.\n"
	"- Write unique examples every time.\n"
	"- DO NOT REPEAT gnerated examples. \n"
	"- Additional instructions: {user_prompt}\n\n"
	"- Use the following examples as a reference in the generation process\n\n {few_shot_examples}. \n"
	"- Think step by step, generate numbered examples, and check each newly generated example to ensure it has not been generated before. If it has, modify it"

	)
	)
	#every 20 words can be a line long, so write a line for each 20 words where if the number of word 100, it should be 5 lines long.
	#50 words short text and more than 50 words long text with not less than 4 lines
	##########new 22/4/2025
	formatted_attributes = "\n".join([
	f"- {attr['attribute']}: {', '.join(attr['topics'])}" for attr in additional_attributes
	])
	#######################

	# Generate system prompt
	system_prompt = prompt_template.format(
	system_role=st.session_state['system_role'],
	classification_type=classification_type,
	domain=domain,
	num_examples=num_to_generate,
	min_words=min_words,
	max_words=max_words,
	labels=", ".join(labels),
	user_prompt=user_prompt,
	few_shot_examples="\n".join([f"{ex['content']}\nLabel: {ex['label']}" for ex in few_shot_examples]) if few_shot_examples else "",
	additional_attributes=formatted_attributes
	)


	# Store system prompt in session state
	st.session_state['system_prompt'] = system_prompt

	# Display system prompt
	st.write("System Prompt:")
	st.text_area("Current System Prompt", value=st.session_state['system_prompt'],
	height=400, disabled=True)


	if st.button("🎯 Generate Examples"):
	    # Validate the inputs first; previously a warning was shown but the model
	    # was called anyway with the invalid configuration.
	    if domain_selection == "Custom" and not domain.strip():
	        validation_problem = "Custom domain name is required."
	    elif len(labels) != len(set(labels)):
	        validation_problem = "Class names must be unique."
	    elif (
	        not labels
	        or any(not lbl.strip() for lbl in labels)
	        # Blank multi-class inputs are never appended to `labels`, so a short
	        # list is the only visible sign of a missing class name.
	        or (classification_type == "Multi-Class Classification" and len(labels) < num_classes)
	    ):
	        validation_problem = "All class labels must be filled in."
	    else:
	        validation_problem = None

	    if validation_problem:
	        st.warning(validation_problem)
	    else:
	        with st.spinner("Generating examples..."):
	            try:
	                stream = client.chat.completions.create(
	                    model=selected_model,
	                    messages=[{"role": "system", "content": st.session_state['system_prompt']}],
	                    temperature=temperature,
	                    stream=True,
	                    max_tokens=4000,
	                    top_p=0.9,
	                )

	                # Record the exchange so it is replayed in the chat history.
	                st.session_state.messages.append({"role": "user", "content": system_prompt})
	                response = st.write_stream(stream)
	                st.session_state.messages.append({"role": "assistant", "content": response})

	                # Keep the latest prompt/response; the old "only if missing"
	                # guards left the first response in place forever.
	                st.session_state.system_prompt = system_prompt
	                st.session_state.response = response

	                for state_key, initial_value in (
	                    ('generated_examples', []),
	                    ('generated_examples_csv', None),
	                    ('generated_examples_json', None),
	                ):
	                    if state_key not in st.session_state:
	                        st.session_state[state_key] = initial_value

	                # Parse lines of the form "<text> Label: <label>"; the split is
	                # on the last "Label:" so the text itself may contain the word.
	                examples_list = []
	                for line in response.split('\n'):
	                    if not line.strip():
	                        continue
	                    parts = line.rsplit('Label:', 1)
	                    if len(parts) != 2:
	                        continue
	                    text = parts[0].strip()
	                    label = parts[1].strip()
	                    if text and label:
	                        examples_list.append({
	                            'text': text,
	                            'label': label,
	                            'system_prompt': st.session_state.system_prompt,
	                            'system_role': st.session_state.system_role,
	                            'task_type': 'Data Generation',
	                            'Use few-shot example?': 'Yes' if use_few_shot else 'No',
	                        })

	                if examples_list:
	                    # Update session state with new data
	                    st.session_state.generated_examples = examples_list

	                    # Generate CSV and JSON data
	                    df = pd.DataFrame(examples_list)
	                    st.session_state.generated_examples_csv = df.to_csv(index=False).encode('utf-8')
	                    st.session_state.generated_examples_json = json.dumps(examples_list, indent=2).encode('utf-8')

	                    st.download_button(
	                        "📥 Download Generated Examples (CSV)",
	                        st.session_state.generated_examples_csv,
	                        "generated_examples.csv",
	                        "text/csv",
	                        key='download-csv-persistent'
	                    )

	                    # Separator between the two download options.
	                    st.markdown("""
	<div style='text-align: left; margin:15px 0; font-weight: 600; color: #666;'>. . . . . . or</div>
	""", unsafe_allow_html=True)

	                    st.download_button(
	                        "📥 Download Generated Examples (JSON)",
	                        st.session_state.generated_examples_json,
	                        "generated_examples.json",
	                        "application/json",
	                        key='download-json-persistent'
	                    )
	                    # Display the generated examples
	                    st.markdown("##### 📋 Generated Examples Preview")
	                    st.dataframe(df, use_container_width=True)
	                else:
	                    st.warning("No examples in the expected 'text. Label: label' format were found in the response.")

	                # NOTE(review): the former nested "Continue" button was removed.
	                # It read an undefined name (`follow_up`) and called
	                # st.experimental_rerun, and a button nested under another
	                # button's branch never runs its handler: clicking it reruns
	                # the script with the outer button reporting False.

	            except Exception as e:
	                # Top-level boundary for the request: surface the failure in
	                # the UI instead of crashing the script.
	                st.error("An error occurred during generation.")
	                st.error(f"Details: {e}")


	# Labeling process
	elif st.session_state.task_choice == "Data Labeling":
	st.header("🏷️ Data Labeling")

	domain_selection = st.selectbox("Domain", ["Restaurant reviews", "E-Commerce reviews", "News", "AG News", "Tourism", "Custom"])
	# 2. Handle custom domain input
	custom_domain_valid = True # Assume valid until proven otherwise

	if domain_selection == "Custom":
	domain = st.text_input("Specify custom domain")
	if not domain.strip():
	st.error("Please specify a domain name.")
	custom_domain_valid = False
	else:
	domain = domain_selection


	# Classification type selection
	classification_type = st.selectbox(
	"Classification Type",
	["Sentiment Analysis", "Binary Classification", "Multi-Class Classification", "Named Entity Recognition (NER)"]
	)
	#NNew edit
	# Labels setup based on classification type
	labels = []
	labels_valid = False
	errors = []

	if classification_type == "Sentiment Analysis":
	st.write("### Sentiment Analysis Labels (Fixed)")
	col1, col2, col3 = st.columns(3)
	with col1:
	label_1 = st.text_input("First class", "Positive", disabled=True)
	with col2:
	label_2 = st.text_input("Second class", "Negative", disabled=True)
	with col3:
	label_3 = st.text_input("Third class", "Neutral", disabled=True)
	labels = ["Positive", "Negative", "Neutral"]


	elif classification_type == "Binary Classification":
	st.write("### Binary Classification Labels")
	col1, col2 = st.columns(2)

	with col1:
	label_1 = st.text_input("First class", "Positive")
	with col2:
	label_2 = st.text_input("Second class", "Negative")

	errors = []
	labels = [label_1.strip(), label_2.strip()]


	# Strip and lower-case labels for validation
	label_1 = labels[0].strip()
	label_2 = labels[1].strip()

	# Check for empty class names
	if not label_1:
	errors.append("First class name is required.")
	if not label_2:
	errors.append("Second class name is required.")

	# Check for duplicates (case insensitive)
	if label_1.lower() == label_2.lower() and label_1 and label_2:
	errors.append("Class names must be different.")

	# Show errors or success
	if errors:
	for error in errors:
	st.error(error)
	else:
	st.success("Binary class names are valid and unique!")


	elif classification_type == "Multi-Class Classification":
	st.write("### Multi-Class Classification Labels")

	default_labels_by_domain = {
	"News": ["Political", "Sports", "Entertainment", "Technology", "Business"],
	"AG News": ["World", "Sports", "Business", "Sci/Tech"],
	"Tourism": ["Accommodation", "Transportation", "Tourist Attractions",
	"Food & Dining", "Local Experience", "Adventure Activities",
	"Wellness & Spa", "Eco-Friendly Practices", "Family-Friendly",
	"Luxury Tourism"],
	"Restaurant reviews": ["Italian", "French", "American"],
	"E-Commerce reviews": ["Mobile Phones & Accessories", "Laptops & Computers","Kitchen & Dining",
	"Beauty & Personal Care", "Home & Furniture", "Clothing & Fashion",
	"Shoes & Handbags", "Health & Wellness", "Electronics & Gadgets",
	"Books & Stationery","Toys & Games", "Sports & Fitness",
	"Grocery & Gourmet Food","Watches & Accessories", "Baby Products"]
	}



	# Ask user how many classes they want to define
	num_classes = st.slider("Select the number of classes (labels)", min_value=3, max_value=10, value=3)

	# Use default labels based on selected domain, if available
	defaults = default_labels_by_domain.get(domain, [])

	labels = []
	errors = []
	cols = st.columns(3) # For nicely arranged label inputs

	for i in range(num_classes):
	with cols[i % 3]: # Distribute inputs across columns
	default_value = defaults[i] if i < len(defaults) else ""
	label_input = st.text_input(f"Label {i + 1}", default_value)
	normalized_label = label_input.strip().title()

	if not normalized_label:
	errors.append(f"Label {i + 1} is required.")
	else:
	labels.append(normalized_label)

	# Check for duplicates (case-insensitive)
	normalized_set = {label.lower() for label in labels}
	if len(labels) != len(normalized_set):
	errors.append("Label names must be unique (case-insensitive).")

	# Show validation results
	if errors:
	for error in errors:
	st.error(error)
	else:
	st.success("All label names are valid and unique!")

	labels_valid = not errors # True if no validation errors

	elif classification_type == "Named Entity Recognition (NER)":

	#new 22/4/2025
	#if classification_type == "Named Entity Recognition (NER)":
	use_few_shot = True
	#new 22/4/2025
	few_shot_examples = [
	{"content": "Mount Everest is the tallest mountain in the world.", "label": "LOC: Mount Everest"},
	{"content": "The President of the United States visited Paris last summer.", "label": "GPE: United States, GPE: Paris"},
	{"content": "Amazon is expanding its offices in Berlin.", "label": "ORG: Amazon, GPE: Berlin"},
	{"content": "J.K. Rowling wrote the Harry Potter books.", "label": "PERSON: J.K. Rowling"},
	{"content": "Apple was founded in California in 1976.", "label": "ORG: Apple, GPE: California, DATE: 1976"},
	{"content": "The Nile is the longest river in Africa.", "label": "LOC: Nile, GPE: Africa"},
	{"content": "He arrived at 3 PM for the meeting.", "label": "TIME: 3 PM"},
	{"content": "She bought the dress for $200.", "label": "MONEY: $200"},
	{"content": "The event is scheduled for July 4th.", "label": "DATE: July 4th"},
	{"content": "The World Health Organization is headquartered in Geneva.", "label": "ORG: World Health Organization, GPE: Geneva"}
	]
	###########

	st.write("### Named Entity Recognition (NER) Entities")

	# Predefined standard entities
	ner_entities = [
	"PERSON - Names of people, fictional characters, historical figures",
	"ORG - Companies, institutions, agencies, teams",
	"LOC - Physical locations (mountains, oceans, etc.)",
	"GPE - Countries, cities, states, political regions",
	"DATE - Calendar dates, years, centuries",
	"TIME - Times, durations",
	"MONEY - Monetary values with currency"
	]

	# User can add custom NER types
	custom_ner_entities = []
	if st.checkbox("Add custom NER entities?"):
	num_custom_ner = st.slider("Number of custom NER entities", 1, 10, 1)
	for i in range(num_custom_ner):
	st.markdown(f"#### Custom Entity {i+1}")
	custom_type = st.text_input(f"Entity type {i+1}", key=f"custom_ner_type_{i}")
	custom_description = st.text_input(f"Description for {custom_type}", key=f"custom_ner_desc_{i}")
	if custom_type and custom_description:
	custom_ner_entities.append(f"{custom_type.upper()} - {custom_description}")

	# Combine built-in and custom NERs
	all_ner_options = ner_entities + custom_ner_entities

	selected_entities = st.multiselect(
	"Select entities to recognize",
	all_ner_options,
	default=ner_entities
	)

	# Extract entity type names (before the dash)
	labels = [entity.split(" - ")[0].strip() for entity in selected_entities]

	if not labels:
	st.warning("Please select at least one entity type.")
	labels = ["PERSON"]

	use_few_shot = st.toggle("Use few-shot examples for labeling")
	few_shot_examples = []
	if use_few_shot:
	num_few_shot = st.slider("Number of few-shot examples", 1, 10, 1)
	for i in range(num_few_shot):
	with st.expander(f"Few-shot Example {i+1}"):
	content = st.text_area(f"Content", key=f"label_few_shot_content_{i}")
	label = st.selectbox(f"Label", labels, key=f"label_few_shot_label_{i}")
	if content and label:
	few_shot_examples.append(f"{content}\nLabel: {label}")

	num_examples = st.number_input("Number of examples to classify", 1, 100, 1)

	examples_to_classify = []
	if num_examples <= 10:
	for i in range(num_examples):
	example = st.text_area(f"Example {i+1}", key=f"example_{i}")
	if example:
	examples_to_classify.append(example)
	else:
	examples_text = st.text_area(
	"Enter examples (one per line)",
	height=300,
	help="Enter each example on a new line"
	)
	if examples_text:
	examples_to_classify = [ex.strip() for ex in examples_text.split('\n') if ex.strip()]
	if len(examples_to_classify) > num_examples:
	examples_to_classify = examples_to_classify[:num_examples]

	#New Wedyan
	#default_system_role = f"You are a professional {classification_type} expert, your role is to classify the provided text examples for {domain} domain."
	# System role customization
	default_system_role = (f"You are a highly skilled {classification_type} expert."
	f" Your task is to accurately classify the provided text examples within the {domain} domain."
	f" Ensure that all classifications are precise, context-aware, and aligned with domain-specific standards and best practices."
	)
	system_role = st.text_area("Modify System Role (optional)",
	value=default_system_role,
	key="system_role_input")
	st.session_state['system_role'] = system_role if system_role else default_system_role
	# Labels initialization
	#labels = []
	####

	user_prompt = st.text_area("User prompt (optional)", key="label_instructions")

	few_shot_text = "\n\n".join(few_shot_examples) if few_shot_examples else ""
	examples_text = "\n".join([f"{i+1}. {ex}" for i, ex in enumerate(examples_to_classify)])

	# Customize prompt template based on classification type
	if classification_type == "Named Entity Recognition (NER)":


	label_prompt_template = PromptTemplate(
	input_variables=["system_role", "labels", "few_shot_examples", "examples", "domain", "user_prompt"],
	template=(
	"{system_role}\n"
	"- You are an expert at Named Entity Recognition (NER) for domain: {domain}.\n"
	"- Use these entity types: {labels}.\n\n"
	"### Output Format:\n"
	"Return each example followed by the entities you found in this format:\n"
	"'Example text.\nEntity types:\n"
	"Then group the entities under each label like this:\n"
	"\nPERSON – Angela Merkel, John Smith\n"
	"ORG – Google, United Nations\n"
	"DATE – January 1st, 2023\n"
	"... and so on.\n\n"
	"Each new entities group should be in a new line.\n"
	"If entity type {labels} is not found, do not write it in your response.\n"
	"- Do NOT output them inline after the text.\n"
	"- Do NOT repeat the sentence.\n"
	"- If no entities are found for a type, skip it.\n"
	"- Keep the format consistent.\n\n"
	"User Instructions:\n{user_prompt}\n\n"
	"Few-shot Examples:\n{few_shot_examples}\n\n"
	"Examples to analyze:\n{examples}"
	)
	)

	#######
	else:
	label_prompt_template = PromptTemplate(

	input_variables=["system_role", "classification_type", "labels", "few_shot_examples", "examples","domain", "user_prompt"],
	template=(
	#"- Let'\s think step by step:"
	"{system_role}\n"
	# "- You are a professional {classification_type} expert in {domain} domain. Your role is to classify the following examples using these labels: {labels}.\n"
	"- Use the following instructions:\n"
	"- Use the following labels: {labels}.\n"
	"- Return the classified text followed by the label in this format: 'text. Label: [label]'\n"
	"- Do not provide any additional information or explanations\n"
	"- User prompt:\n {user_prompt}\n\n"
	"- Use user provided examples as guidence in the classification process:\n\n {few_shot_examples}\n"
	"- Examples to classify:\n{examples}\n\n"
	"- Think step by step then classify the examples"
	#"Output:\n"
	))

	# Build the few-shot text inserted into the prompt, according to the shape
	# of few_shot_examples. The result used to be overwritten unconditionally by
	# a comprehension that indexed ex['content'], which raised TypeError for the
	# plain strings collected from the few-shot inputs.
	if isinstance(few_shot_examples, str):
	    # Already a formatted string
	    formatted_few_shot = few_shot_examples
	elif isinstance(few_shot_examples, list) and all(isinstance(ex, str) for ex in few_shot_examples):
	    # List of already formatted strings (an empty list yields "")
	    formatted_few_shot = "\n".join(few_shot_examples)
	elif isinstance(few_shot_examples, list) and all(isinstance(ex, dict) and 'content' in ex and 'label' in ex for ex in few_shot_examples):
	    # List of dicts with 'content' and 'label' keys: entity-style layout
	    formatted_few_shot = "\n\n".join([f"{ex['content']}\n\nEntity types\n{ex['label']}" for ex in few_shot_examples])
	else:
	    formatted_few_shot = ""

	###########
	# Render the labeling template with the current settings to get the prompt preview.
	prompt_values = {
	    "system_role": st.session_state['system_role'],
	    "classification_type": classification_type,
	    "domain": domain,
	    "examples": "\n".join(examples_to_classify),
	    "labels": ", ".join(labels),
	    "user_prompt": user_prompt,
	    "few_shot_examples": formatted_few_shot,
	}
	system_prompt = label_prompt_template.format(**prompt_values)

	# Store the rendered prompt in session state, then show it in a read-only text area.
	st.session_state['system_prompt'] = system_prompt
	st.write("System Prompt:")
	st.text_area(
	    "System Prompt",
	    value=st.session_state['system_prompt'],
	    height=300,
	    disabled=True,
	)



	# "Label Data" handler.
	#
	# On click: builds the final prompt from the labeling template, streams the
	# model's answer into the page, parses the answer into rows, stores the rows
	# (plus CSV/JSON encodings) in session state and offers them as downloads.
	# Any exception raised after the prompt is built is shown with st.error.
	#
	# NOTE(review): the nesting below was reconstructed from a flattened listing;
	# confirm that the "Continue" button belongs at the try-body level and that the
	# try block sits inside the spinner.
	if st.button("🏷️ Label Data"):
	    if examples_to_classify:
	        with st.spinner("Labeling data..."):
	            is_ner = classification_type == "Named Entity Recognition (NER)"
	            # Single source of truth for the task name written to rows and metadata.
	            task_type = "Named Entity Recognition (NER)" if is_ner else "Data Labeling"

	            # The NER call does not pass `classification_type`; every other
	            # task passes it in addition to the shared fields.
	            prompt_fields = {
	                "system_role": st.session_state['system_role'],
	                "labels": ", ".join(labels),
	                "domain": domain,
	                "few_shot_examples": few_shot_text,
	                "examples": examples_text,
	                "user_prompt": user_prompt,
	            }
	            if not is_ner:
	                prompt_fields["classification_type"] = classification_type
	            system_prompt = label_prompt_template.format(**prompt_fields)

	            try:
	                stream = client.chat.completions.create(
	                    model=selected_model,
	                    messages=[{"role": "system", "content": system_prompt}],
	                    temperature=temperature,
	                    stream=True,
	                    max_tokens=4000,
	                    top_p=0.9,
	                )
	                # The prompt is logged in the chat history under the "user" role.
	                st.session_state.messages.append({"role": "user", "content": system_prompt})
	                response = st.write_stream(stream)
	                st.session_state.messages.append({"role": "assistant", "content": response})

	                # Record the prompt that was actually sent and the answer it
	                # produced. These used to be written only when the keys were
	                # missing, so later runs kept stale values and exported rows
	                # could carry a prompt other than the one sent.
	                st.session_state.system_prompt = system_prompt
	                st.session_state.response = response

	                # Create these keys only when absent so existing values are kept.
	                for state_key, initial in (
	                    ('generated_examples', []),
	                    ('generated_examples_csv', None),
	                    ('generated_examples_json', None),
	                ):
	                    if state_key not in st.session_state:
	                        st.session_state[state_key] = initial

	                # Columns shared by every exported row.
	                row_meta = {
	                    'system_prompt': st.session_state.system_prompt,
	                    'system_role': st.session_state.system_role,
	                    'task_type': task_type,
	                    'Use few-shot example?': 'Yes' if use_few_shot else 'No',
	                }

	                if is_ner:
	                    # NER output is kept as one free-form row.
	                    labeled_examples = [{'ner_output': response.strip(), **row_meta}]
	                else:
	                    # Classification output: one "text. Label: [label]" per line.
	                    # Split on the LAST "Label:" so the text itself may contain it.
	                    labeled_examples = []
	                    for line in response.split('\n'):
	                        text, marker, label = line.rpartition('Label:')
	                        text, label = text.strip(), label.strip()
	                        if marker and text and label:
	                            labeled_examples.append({'text': text, 'label': label, **row_meta})

	                # Save and provide download options
	                if labeled_examples:
	                    st.session_state.labeled_examples = labeled_examples

	                    df = pd.DataFrame(labeled_examples)
	                    st.session_state.labeled_examples_csv = df.to_csv(index=False).encode('utf-8')

	                    # Metadata now reports the task that actually ran instead of
	                    # always claiming NER.
	                    st.session_state.labeled_examples_json = json.dumps({
	                        "metadata": {
	                            "domain": domain,
	                            "labels": labels,
	                            "used_few_shot": use_few_shot,
	                            "task_type": task_type,
	                            "timestamp": datetime.now().isoformat()
	                        },
	                        "examples": labeled_examples
	                    }, indent=2).encode('utf-8')

	                    st.download_button(
	                        "📥 Download Labeled Examples (CSV)",
	                        st.session_state.labeled_examples_csv,
	                        "labeled_examples.csv",
	                        "text/csv",
	                        key='download-labeled-csv'
	                    )

	                    st.markdown("""
	<div style='text-align: left; margin:15px 0; font-weight: 600; color: #666;'>. . . . . . or</div>
	""", unsafe_allow_html=True)

	                    st.download_button(
	                        "📥 Download Labeled Examples (JSON)",
	                        st.session_state.labeled_examples_json,
	                        "labeled_examples.json",
	                        "application/json",
	                        key='download-labeled-json'
	                    )

	                    # Display the labeled examples
	                    st.markdown("##### 📋 Labeled Examples Preview")
	                    st.dataframe(df, use_container_width=True)

	                # NOTE(review): this button is created inside the "Label Data"
	                # branch; a click on it triggers a new script run in which the
	                # outer button is presumably no longer pressed, so this branch
	                # may never execute. Consider moving it out and gating on
	                # session state.
	                if st.button("Continue"):
	                    # Prefer st.rerun; fall back to the legacy name on old Streamlit.
	                    rerun = getattr(st, "rerun", None) or st.experimental_rerun
	                    if follow_up == "Label more data":
	                        st.session_state.examples_to_classify = []
	                        rerun()
	                    elif follow_up == "Data Generation":
	                        # NOTE(review): choosing "Data Generation" stores
	                        # "Data Labeling" here; looks like a copy-paste slip,
	                        # confirm the intended task_choice value.
	                        st.session_state.task_choice = "Data Labeling"
	                        rerun()

	            except Exception as e:
	                # UI boundary: surface the failure to the user instead of crashing the page.
	                st.error("An error occurred during labeling.")
	                st.error(f"Details: {e}")
	    else:
	        st.warning("Please enter at least one example to classify.")

	#st.session_state.messages.append({"role": "assistant", "content": response})




	# Page footer: a horizontal rule followed by a centred credit line rendered as raw HTML.
	footer_html = """
	<div style='text-align: center'>
	<p>Made with ❤️ by Wedyan Alsakran 2025</p>
	</div>
	"""
	st.markdown("---")
	st.markdown(footer_html, unsafe_allow_html=True)