# Synthetic Data Generator — Hugging Face Space (Gradio app)
| import os | |
| import random | |
| import re | |
| import sys | |
| from datetime import datetime | |
| from typing import Dict, List, Tuple, Any | |
| import pandas as pd | |
| from huggingface_hub import login | |
| from transformers import pipeline | |
| from huggingface_hub import HfApi | |
| import gradio as gr | |
# Inline CSS injected into gr.Blocks(css=...): gradient page header, branded
# buttons, and the info/success/error text classes emitted by update_status_ui().
CSS_STYLE = """
#header {
    font-size: 4em;
    color: #0071C5;
    background: linear-gradient(to right, #0071C5, #00BFFF);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    padding: 5px;
    border-radius: 5px;
    margin-bottom: 5px;
    text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.2);
}
.btn {
    background-color: #0071C5;
    color: white;
}
.text-center {
    text-align: center;
}
.success-text { color: green; font-weight: bold; }
.error-text { color: red; font-weight: bold; }
.info-text { color: blue; font-weight: bold; }
"""
def extract_label(input_string: str) -> Tuple[str, str]:
    """Split a 'label: description' string at the first colon.

    Returns:
        (label, description), both stripped of surrounding whitespace.

    Raises:
        ValueError: if the string contains no colon.
    """
    label, sep, description = input_string.partition(":")
    if not sep:
        raise ValueError(
            "Input string must contain a ':' separating the label and description."
        )
    return label.strip(), description.strip()
def parse_string(input_string: str) -> Tuple[str, str]:
    """Extract the OUTPUT and REASONING sections from a generated response.

    Returns:
        (output, reasoning), both stripped.

    Raises:
        ValueError: if the text does not follow 'OUTPUT:... REASONING:...'.
    """
    found = re.search(r"OUTPUT:\s*(.+?)\s*REASONING:\s*(.+)", input_string, re.DOTALL)
    if found is None:
        raise ValueError(
            "The generated response is not in the expected 'OUTPUT:... REASONING:...' format."
        )
    return found.group(1).strip(), found.group(2).strip()
def sdg(
    sample_size: int,
    labels: List[str],
    label_descriptions: str,
    categories_types: Dict[str, List[str]],
    use_case: str,
    prompt_examples: str,
    model: str,
    max_new_tokens: int,
    batch_size: int,
    output_dir: str,
    save_reasoning: bool,
) -> Tuple[str, str, str]:
    """Generate a synthetic classification dataset and stream it to a CSV.

    Draws a random (label, category, type) combination per sample, prompts the
    language model, parses the 'OUTPUT/REASONING' response, and appends each
    batch of rows to ``<output_dir>/<timestamp>.csv``.

    Args:
        sample_size: total number of rows to generate.
        labels / label_descriptions: class labels and their prose descriptions.
        categories_types: category name -> list of subtypes.
        use_case / prompt_examples: free-text context injected into the prompt.
        model: HF model id for the text-generation pipeline.
        max_new_tokens: generation cap per sample.
        batch_size: rows generated between CSV flushes.
        output_dir: directory for the CSV (created if missing).
        save_reasoning: include the model's REASONING column when True.

    Returns:
        (status message, CSV path, timestamp string).

    Raises:
        ValueError: propagated from parse_string when a response is malformed.
    """
    # Fix: the output directory may not exist on first run.
    os.makedirs(output_dir, exist_ok=True)
    categories = list(categories_types.keys())
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = os.path.join(output_dir, f"{timestamp}.csv")
    # Fix: build the pipeline ONCE. Previously it was constructed inside the
    # inner per-sample loop, reloading the model for every generated row.
    generator = pipeline("text-generation", model=model)
    num_batches = (sample_size + batch_size - 1) // batch_size  # ceil division
    for batch in range(num_batches):
        start = batch * batch_size
        end = min(start + batch_size, sample_size)
        batch_data = []
        batch_random_labels = random.choices(labels, k=end - start)
        batch_random_categories = random.choices(categories, k=end - start)
        for idx in range(end - start):
            # Fix: random.choices() (no k) returns a 1-element LIST, which
            # rendered as e.g. "TYPE: ['cafe']" in the prompt; random.choice()
            # yields the bare string.
            random_type = random.choice(
                categories_types[batch_random_categories[idx]]
            )
            prompt = f"""You should create synthetic data for specified labels and categories. 

This is especially useful for {use_case}.

*Label Descriptions*
{label_descriptions}

*Examples*
{prompt_examples}

####################

Generate one output for the classification below.
You may use the examples I have provided as a guide, but you cannot simply modify or rewrite them.
Only return the OUTPUT and REASONING. The first token in your response must be OUTPUT.
Do not return the LABEL, CATEGORY, or TYPE.

LABEL: {batch_random_labels[idx]}
CATEGORY: {batch_random_categories[idx]}
TYPE: {random_type}
OUTPUT:
REASONING:
"""
            messages = [
                {
                    "role": "system",
                    "content": f"You are a helpful assistant designed to generate synthetic data for {use_case} with labels {labels} in categories {categories}. The first token in your generated text must be OUTPUT: This must be followed by the token REASONING: as in the prompt examples.",
                },
                {"role": "user", "content": prompt},
            ]
            result = generator(messages, max_new_tokens=max_new_tokens)[0][
                "generated_text"
            ]
            # Robustness: chat pipelines may return the full message list as
            # generated_text rather than a string; take the assistant reply.
            if isinstance(result, list):
                result = result[-1]["content"]
            text, reasoning = parse_string(result)
            entry = {
                "text": text,
                "label": batch_random_labels[idx],
                "source": model,
            }
            if save_reasoning:
                entry["reasoning"] = reasoning
            batch_data.append(entry)
        batch_df = pd.DataFrame(batch_data)
        # First batch writes the header; later batches append rows only.
        if batch == 0:
            batch_df.to_csv(output_path, mode="w", index=False)
        else:
            batch_df.to_csv(output_path, mode="a", header=False, index=False)
    return f"Synthetic data saved to {output_path}", output_path, timestamp
| def update_status_ui(message: str, state_obj: Dict) -> Tuple[str, Dict]: | |
| css_class = "info-text" | |
| if "Error" in message or "error" in message: | |
| css_class = "error-text" | |
| elif "success" in message or "pushed" in message: | |
| css_class = "success-text" | |
| html_message = f'<p class="{css_class}">{message}</p>' | |
| state_obj['last_status_html'] = html_message | |
| return html_message, state_obj | |
def run_sdg(
    sample_size: int,
    model: str,
    max_new_tokens: int,
    save_reasoning: bool,
    token: str,
    state: Dict,
    label_boxes: List[Dict[str, str]],
    use_case: str,
    prompt_examples: str,
    category_boxes: List[Dict[str, str]],
    hf_repo_id: str,
) -> Tuple[str, Dict, str]:
    """Validate inputs, log into Hugging Face, run sdg(), optionally push the CSV.

    Args mirror the Gradio inputs wired in the UI; ``state`` is the session
    scratch dict, ``label_boxes``/``category_boxes`` are lists of
    ``{"content": "name: description"}`` entries.

    Returns:
        (status HTML, updated state, local CSV path or "" on error/upload).
    """
    if not all([token, use_case, label_boxes, category_boxes, prompt_examples]):
        msg = "Validation Error: Token, Use Case, Labels, Categories, and Examples are required."
        return update_status_ui(msg, state)[0], state, ""
    try:
        login(token)
        status_msg = "Logged into Hugging Face successfully."
        state = update_status_ui(status_msg, state)[1]
    except Exception as e:
        msg = f"Error logging in with token: {e}"
        return update_status_ui(msg, state)[0], state, ""
    try:
        # Flatten the saved label boxes into prompt text + a label list.
        label_descriptions = ""
        labels = []
        for box in label_boxes:
            label_descriptions += box["content"] + "\n"
            label, _ = extract_label(box["content"])
            labels.append(label)
        # "category: t1, t2" -> {"category": ["t1", "t2"]}
        categories_types = {}
        for box in category_boxes:
            category, types = extract_label(box["content"])
            categories_types[category.strip()] = [t.strip() for t in types.split(",")]
        status, output_path, timestamp = sdg(
            sample_size=sample_size,
            labels=labels,
            label_descriptions=label_descriptions,
            categories_types=categories_types,
            use_case=use_case,
            prompt_examples=prompt_examples,
            model=model,
            max_new_tokens=max_new_tokens,
            batch_size=20,
            output_dir="./output_data",
            save_reasoning=save_reasoning,
        )
        state["output_path"] = output_path
        state["timestamp"] = timestamp
        status_msg_local = update_status_ui(status, state)[0]
        if hf_repo_id and token:
            api = HfApi(token=token)
            if not api.repo_exists(hf_repo_id):
                api.create_repo(repo_id=hf_repo_id, repo_type="dataset", private=False)
                status_msg_upload = f"Created new dataset repository: {hf_repo_id}"
                status_msg_local = update_status_ui(status_msg_upload, state)[0]
            # Fix: upload the CSV that was just generated. The old code called
            # sdg() a SECOND time (regenerating every sample), omitted the
            # required output_dir argument (TypeError), and wrapped sdg()'s
            # (status, path, timestamp) tuple in a DataFrame instead of rows.
            api.upload_file(
                path_or_fileobj=output_path,
                path_in_repo="data.csv",
                repo_id=hf_repo_id,
                repo_type="dataset",
            )
            # Fix: dataset repositories live under /datasets/ on the Hub.
            dataset_url = f"https://huggingface.co/datasets/{hf_repo_id}/blob/main/data.csv"
            status_msg_upload_final = f"Successfully pushed data to HF Dataset: {hf_repo_id}. View at: {dataset_url}"
            status_msg_final = update_status_ui(status_msg_upload_final, state)[0]
            state["hf_repo_id"] = hf_repo_id
            return status_msg_final, state, ""
        return status_msg_local, state, output_path
    except Exception as e:
        msg = f"Generation Error: {e}"
        return update_status_ui(msg, state)[0], state, ""
def handle_generation_success(status, state_val, output_path_or_none):
    """Show a download widget when a CSV exists on disk; hide it otherwise."""
    has_file = bool(output_path_or_none) and os.path.exists(output_path_or_none)
    if has_file:
        return status, state_val, gr.File(output_path_or_none, visible=True)
    return status, state_val, gr.File(None, visible=False)
# ---------------------------------------------------------------------------
# Gradio UI: one "Data Generator" tab with a configuration column (left) and
# a generation/output column (right).
# ---------------------------------------------------------------------------
with gr.Blocks(css=CSS_STYLE) as demo:
    gr.Markdown(
        "# Synthetic Data Generator",
        elem_id="header",
        elem_classes="text-center",
    )
    gr.Markdown(
        "**Use Language Models to Create Datasets for Specified Labels and Categories**",
        elem_classes="text-center",
    )
    with gr.Tab("Data Generator"):
        with gr.Row():
            # Left column: use case, labels, categories, examples, model config.
            with gr.Column(scale=2):
                gr.Markdown(
                    "## Setup & Configure",
                    elem_classes="text-center",
                )
                gr.Markdown("### Use Case")
                use_case = gr.Textbox(
                    show_label=False,
                    placeholder="Describe your use case (e.g., customer service).",
                    autofocus=True,
                )
                sample_size = gr.Number(
                    label="Total Samples to Generate",
                    value=100,
                    minimum=1,
                )
                # Saved labels: session state of {"content": "label: description"}.
                label_boxes = gr.State([])
                gr.Markdown(
                    "### Labels\nUse a colon to separate each label and its description as in 'label: description.'"
                )
                with gr.Row():
                    new_label = gr.Textbox(
                        show_label=False,
                    )
                    gr.Examples(
                        examples=[
                            "polite: Text is considerate and shows respect and good manners, often including courteous phrases and a friendly tone.",
                            "somewhat polite: Text is generally respectful but lacks warmth or formality, communicating with a decent level of courtesy.",
                        ],
                        example_labels=["polite", "somewhat polite"],
                        inputs=new_label,
                    )
                add_label_button = gr.Button("Save Label", elem_classes="btn")

                def add_item(
                    label_boxes: List[Dict[str, str]], new_content: str
                ) -> Tuple[List[Dict[str, str]], str]:
                    """Append a non-empty entry to the state list and clear the textbox."""
                    if new_content.strip():
                        return (
                            label_boxes + [{"content": new_content.strip()}],
                            "",
                        )
                    return label_boxes, ""

                add_label_button.click(
                    add_item, [label_boxes, new_label], [label_boxes, new_label]
                )

                # NOTE(review): looks like this was written for
                # @gr.render(inputs=label_boxes); without that decorator it is
                # never invoked — confirm against the original app.
                def render_boxes(box_list: List[Dict[str, str]]) -> None:
                    """Render saved labels in an accordion with per-row delete buttons."""
                    with gr.Accordion(f"Saved Labels ({len(box_list)})"):
                        for box in box_list:
                            with gr.Row():
                                gr.Textbox(
                                    box["content"],
                                    lines=2,
                                    show_label=False,
                                    container=False,
                                )
                                delete_button = gr.Button(
                                    "Delete", scale=0, variant="stop"
                                )

                                # `box=box` binds the current row's entry as a
                                # default arg (avoids the late-binding-closure
                                # pitfall inside the loop).
                                def delete(
                                    box: Dict[str, str] = box,
                                ) -> List[Dict[str, str]]:
                                    box_list.remove(box)
                                    return box_list

                                delete_button.click(delete, None, [label_boxes])

                # Saved categories: session state of {"content": "category: t1, t2"}.
                category_boxes = gr.State([])
                gr.Markdown(
                    "### Categories\nUse a colon to separate each category and its subcategories as in 'category: type1, type2.'"
                )
                with gr.Row():
                    new_category = gr.Textbox(show_label=False)
                    gr.Examples(
                        examples=[
                            "travel: hotel, airline, train",
                            "finance: fees and charges, credit",
                        ],
                        example_labels=["travel", "finance"],
                        inputs=new_category,
                    )
                add_category_button = gr.Button("Save Category", elem_classes="btn")
                add_category_button.click(
                    add_item,
                    [category_boxes, new_category],
                    [category_boxes, new_category],
                )

                # NOTE(review): same as render_boxes above — appears to expect
                # @gr.render(inputs=category_boxes); never invoked as written.
                def render_boxes_cat(box_list: List[Dict[str, str]]) -> None:
                    """Render saved categories in an accordion with delete buttons."""
                    with gr.Accordion(f"Saved Categories ({len(box_list)})"):
                        for box in box_list:
                            with gr.Row():
                                gr.Textbox(
                                    box["content"],
                                    show_label=False,
                                    container=False,
                                )
                                delete_button = gr.Button(
                                    "Delete", scale=0, variant="stop"
                                )

                                def delete(
                                    box: Dict[str, str] = box,
                                ) -> List[Dict[str, str]]:
                                    box_list.remove(box)
                                    return box_list

                                delete_button.click(delete, None, [category_boxes])

                gr.Markdown(
                    "### Guiding Examples\nInclude all examples in this box. For each example, provide a LABEL, CATEGORY, TYPE, OUTPUT, and REASONING."
                )
                with gr.Row():
                    prompt_examples = gr.Textbox(
                        show_label=False,
                    )
                    gr.Examples(
                        label="Example",
                        examples=[
                            """LABEL: polite
CATEGORY: food and drink
TYPE: cafe
OUTPUT: Thank you for visiting! While we prepare your coffee, feel free to relax or browse our selection of pastries. Let us know if we can make your day even better!
REASONING: This text is polite because it expresses gratitude and encourages the customer to feel at ease with a welcoming tone. Phrases like "Let us know if we can make your day even better" show warmth and consideration, enhancing the customer experience.

LABEL: somewhat polite
CATEGORY: travel
TYPE: train
OUTPUT: I understand your concern about your booking, and I'll check what options we have for you.
REASONING: This text would be classified as "somewhat polite." The acknowledgment of the customer's concern shows a basic level of respect. The sentence is direct and lacks additional warmth or formality, but it communicates a willingness to help. The use of "I'll check" is a straightforward commitment to action without additional courteous phrases that would make it fully polite.
"""
                        ],
                        example_labels=["polite and somewhat polite"],
                        inputs=prompt_examples,
                    )
                gr.Markdown(
                    """### Language Model Configuration"""
                )
                model = gr.Dropdown(
                    label="Model",
                    choices=[
                        "google/gemma-3-1b-it",
                        "HuggingFaceTB/SmolLM2-1.7B-Instruct",
                        "meta-llama/Llama-3.2-3B-Instruct",
                    ],
                    value="google/gemma-3-1b-it",
                )
                max_new_tokens = gr.Number(
                    label="Maximum Number of New Tokens",
                    value=512,
                )
                save_reasoning = gr.Checkbox(label="Save Reasoning", value=True)
                api_token = gr.Textbox(
                    label="Hugging Face Token",
                    placeholder="Paste your HF token here (required for gated models)",
                    type="password",
                )
                hf_repo_id = gr.Textbox(
                    label="HF Dataset Repository ID (Optional, e.g., your_user/my_new_data)",
                    placeholder="Enter repo ID to push dataset here",
                    value="",
                )
            # Right column: trigger generation and display status/results.
            with gr.Column(scale=1):
                gr.Markdown("## Generation & Output")
                status_output = gr.HTML(label="Status", value="<p class='info-text'>Enter setup details and click 'Generate'.</p>")
                generate_button = gr.Button("Generate Synthetic Data", variant="primary")
                # Session scratch dict (output_path, timestamp, last status HTML).
                state = gr.State({})
                summary_output = gr.Markdown("Data summary will appear here.")
                clear_button = gr.Button("Clear All Inputs")

                def clear_all():
                    """Reset every wired component to its initial value.

                    Fix: the tuple now matches the ``outputs`` list below —
                    previously position 3 returned True into the
                    ``max_new_tokens`` Number and ``state`` was reset to ""
                    (a str where callbacks expect a dict, which made
                    ``update_status_ui`` raise TypeError after a clear).
                    """
                    return "", 100, 512, "", [], [], "", None, {}, ""

                clear_button.click(
                    clear_all,
                    inputs=[],
                    outputs=[
                        use_case, sample_size, max_new_tokens, prompt_examples, label_boxes, category_boxes, hf_repo_id, summary_output, state, api_token
                    ],
                )
                generate_button.click(
                    run_sdg,
                    inputs=[
                        sample_size,
                        model,
                        max_new_tokens,
                        save_reasoning,
                        api_token,
                        state,
                        label_boxes,
                        use_case,
                        prompt_examples,
                        category_boxes,
                        hf_repo_id,
                    ],
                    outputs=[status_output, state, summary_output],
                ).then(
                    # After generation, swap the summary slot for a download
                    # widget when the CSV exists on disk.
                    handle_generation_success,
                    inputs=[status_output, state, summary_output],
                    outputs=[status_output, state, summary_output],
                )

if __name__ == "__main__":
    demo.launch()