# app.py — Synthetic Data Generator (Hugging Face Space "zero")
# Last update by Ignaciohhhhggfgjfrffd (commit fa6403c, verified).
import os
import random
import re
import sys
from datetime import datetime
from typing import Dict, List, Tuple, Any
import pandas as pd
from huggingface_hub import login
from transformers import pipeline
from huggingface_hub import HfApi
import gradio as gr
# Inline stylesheet injected into the Gradio app via gr.Blocks(css=CSS_STYLE).
# Defines the gradient page header (#header), the blue button class (.btn),
# and the colored status-text classes used by update_status_ui().
CSS_STYLE = """
#header {
font-size: 4em;
color: #0071C5;
background: linear-gradient(to right, #0071C5, #00BFFF);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
padding: 5px;
border-radius: 5px;
margin-bottom: 5px;
text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.2);
}
.btn {
background-color: #0071C5;
color: white;
}
.text-center {
text-align: center;
}
.success-text { color: green; font-weight: bold; }
.error-text { color: red; font-weight: bold; }
.info-text { color: blue; font-weight: bold; }
"""
def extract_label(input_string: str) -> Tuple[str, str]:
    """Split a ``label: description`` string at its first colon.

    Returns:
        (label, description), both stripped of surrounding whitespace.

    Raises:
        ValueError: if the string contains no ``:`` separator.
    """
    label, sep, description = input_string.partition(":")
    if not sep:
        raise ValueError(
            "Input string must contain a ':' separating the label and description."
        )
    return label.strip(), description.strip()
def parse_string(input_string: str) -> Tuple[str, str]:
    """Pull the OUTPUT and REASONING sections out of a generated response.

    Raises:
        ValueError: if the text does not contain both markers in order.
    """
    pattern = r"OUTPUT:\s*(.+?)\s*REASONING:\s*(.+)"
    found = re.search(pattern, input_string, re.DOTALL)
    if found is None:
        raise ValueError(
            "The generated response is not in the expected 'OUTPUT:... REASONING:...' format."
        )
    return found.group(1).strip(), found.group(2).strip()
def sdg(
    sample_size: int,
    labels: List[str],
    label_descriptions: str,
    categories_types: Dict[str, List[str]],
    use_case: str,
    prompt_examples: str,
    model: str,
    max_new_tokens: int,
    batch_size: int,
    output_dir: str,
    save_reasoning: bool,
) -> Tuple[str, str, str]:
    """Generate a synthetic classification dataset with an LLM and stream it to CSV.

    Samples are produced in batches of ``batch_size``; each batch is appended to
    a timestamped CSV under ``output_dir`` so partial progress survives a crash.

    Args:
        sample_size: total number of rows to generate.
        labels: classification labels to sample from (uniform, with replacement).
        label_descriptions: combined "label: description" text shown in prompts.
        categories_types: mapping of category -> list of sub-types.
        use_case: free-text description inserted into prompts.
        prompt_examples: few-shot examples inserted into prompts.
        model: model id passed to transformers.pipeline.
        max_new_tokens: generation length cap.
        batch_size: rows per CSV flush.
        output_dir: directory for the output CSV (created if missing).
        save_reasoning: also store the model's REASONING column.

    Returns:
        (status message, output CSV path, timestamp string).
    """
    categories = list(categories_types.keys())
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    # FIX: the output directory may not exist yet (e.g. fresh container).
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"{timestamp}.csv")
    # FIX: build the generation pipeline ONCE. It was previously constructed
    # inside the per-sample loop, reloading the model weights for every row.
    generator = pipeline("text-generation", model=model)
    num_batches = (sample_size + batch_size - 1) // batch_size
    for batch in range(num_batches):
        start = batch * batch_size
        end = min(start + batch_size, sample_size)
        batch_data = []
        batch_random_labels = random.choices(labels, k=end - start)
        batch_random_categories = random.choices(categories, k=end - start)
        for idx in range(end - start):
            # FIX: random.choice, not random.choices — choices returns a
            # one-element list, which rendered as "TYPE: ['hotel']" in prompts.
            random_type = random.choice(
                categories_types[batch_random_categories[idx]]
            )
            prompt = f"""You should create synthetic data for specified labels and categories.
This is especially useful for {use_case}.
*Label Descriptions*
{label_descriptions}
*Examples*
{prompt_examples}
####################
Generate one output for the classification below.
You may use the examples I have provided as a guide, but you cannot simply modify or rewrite them.
Only return the OUTPUT and REASONING. The first token in your response must be OUTPUT.
Do not return the LABEL, CATEGORY, or TYPE.
LABEL: {batch_random_labels[idx]}
CATEGORY: {batch_random_categories[idx]}
TYPE: {random_type}
OUTPUT:
REASONING:
"""
            messages = [
                {
                    "role": "system",
                    "content": f"You are a helpful assistant designed to generate synthetic data for {use_case} with labels {labels} in categories {categories}. The first token in your generated text must be OUTPUT: This must be followed by the token REASONING: as in the prompt examples.",
                },
                {"role": "user", "content": prompt},
            ]
            result = generator(messages, max_new_tokens=max_new_tokens)[0][
                "generated_text"
            ]
            # Chat-style pipelines may return the whole conversation as a list
            # of messages; the generated text is then the last message body.
            if isinstance(result, list):
                result = result[-1]["content"]
            text, reasoning = parse_string(result)
            entry = {
                "text": text,
                "label": batch_random_labels[idx],
                "source": model,
            }
            if save_reasoning:
                entry["reasoning"] = reasoning
            batch_data.append(entry)
        batch_df = pd.DataFrame(batch_data)
        # First batch writes the header; later batches append rows only.
        if batch == 0:
            batch_df.to_csv(output_path, mode="w", index=False)
        else:
            batch_df.to_csv(output_path, mode="a", header=False, index=False)
    return f"Synthetic data saved to {output_path}", output_path, timestamp
def update_status_ui(message: str, state_obj: Dict) -> Tuple[str, Dict]:
    """Wrap *message* in a colored <p> tag and record it in the session state.

    Error wording wins over success wording; anything else is styled as info.
    """
    if "Error" in message or "error" in message:
        style = "error-text"
    elif "success" in message or "pushed" in message:
        style = "success-text"
    else:
        style = "info-text"
    rendered = f'<p class="{style}">{message}</p>'
    state_obj['last_status_html'] = rendered
    return rendered, state_obj
def run_sdg(
    sample_size: int,
    model: str,
    max_new_tokens: int,
    save_reasoning: bool,
    token: str,
    state: Dict,
    label_boxes: List[Dict[str, str]],
    use_case: str,
    prompt_examples: str,
    category_boxes: List[Dict[str, str]],
    hf_repo_id: str,
) -> Tuple[str, Dict, str]:
    """Validate inputs, log in, run sdg(), and optionally push the CSV to the Hub.

    Returns:
        (status_html, state, output_path) — output_path is "" on any failure.
    """
    # All of these are required before any expensive work starts.
    if not all([token, use_case, label_boxes, category_boxes, prompt_examples]):
        msg = "Validation Error: Token, Use Case, Labels, Categories, and Examples are required."
        return update_status_ui(msg, state)[0], state, ""
    try:
        login(token)
        status_msg = "Logged into Hugging Face successfully."
        state = update_status_ui(status_msg, state)[1]
    except Exception as e:
        msg = f"Error logging in with token: {e}"
        return update_status_ui(msg, state)[0], state, ""
    try:
        # Collect label names and the combined description text shown in prompts.
        label_descriptions = ""
        labels = []
        for box in label_boxes:
            label_descriptions += box["content"] + "\n"
            label, _ = extract_label(box["content"])
            labels.append(label)
        # Parse "category: type1, type2" entries into {category: [types]}.
        categories_types = {}
        for box in category_boxes:
            category, types = extract_label(box["content"])
            categories_types[category.strip()] = [t.strip() for t in types.split(",")]
        status, output_path, timestamp = sdg(
            sample_size=sample_size,
            labels=labels,
            label_descriptions=label_descriptions,
            categories_types=categories_types,
            use_case=use_case,
            prompt_examples=prompt_examples,
            model=model,
            max_new_tokens=max_new_tokens,
            batch_size=20,
            output_dir="./output_data",
            save_reasoning=save_reasoning,
        )
        state["output_path"] = output_path
        state["timestamp"] = timestamp
        status_msg_local = update_status_ui(status, state)[0]
        if hf_repo_id and token:
            api = HfApi(token=token)
            if not api.repo_exists(hf_repo_id):
                api.create_repo(repo_id=hf_repo_id, repo_type="dataset", private=False)
                status_msg_upload = f"Created new dataset repository: {hf_repo_id}"
                status_msg_local = update_status_ui(status_msg_upload, state)[0]
            # FIX: upload the CSV that sdg() just wrote instead of running the
            # whole generation a SECOND time. The old second call omitted the
            # required output_dir argument (so it always raised TypeError into
            # the except below) and fed sdg's (status, path, timestamp) tuple
            # into pd.DataFrame instead of the generated rows.
            api.upload_file(
                path_or_fileobj=output_path,
                path_in_repo="data.csv",
                repo_id=hf_repo_id,
                repo_type="dataset",
            )
            # FIX: dataset repos live under the /datasets/ namespace on the Hub.
            dataset_url = f"https://huggingface.co/datasets/{hf_repo_id}/blob/main/data.csv"
            status_msg_upload_final = f"Successfully pushed data to HF Dataset: {hf_repo_id}. View at: {dataset_url}"
            status_msg_final = update_status_ui(status_msg_upload_final, state)[0]
            state["hf_repo_id"] = hf_repo_id
            # The local CSV still exists, so return its path to enable download.
            return status_msg_final, state, output_path
        return status_msg_local, state, output_path
    except Exception as e:
        msg = f"Generation Error: {e}"
        return update_status_ui(msg, state)[0], state, ""
def handle_generation_success(status, state_val, output_path_or_none):
    """Reveal a download widget for the generated CSV when the file exists."""
    file_ready = bool(output_path_or_none) and os.path.exists(output_path_or_none)
    if not file_ready:
        return status, state_val, gr.File(None, visible=False)
    return status, state_val, gr.File(output_path_or_none, visible=True)
# ---------------------------------------------------------------------------
# Gradio UI. One "Data Generator" tab with two columns: the left column
# collects configuration (use case, labels, categories, few-shot examples,
# model settings, HF credentials); the right column hosts the generate/clear
# actions and the status display.
# ---------------------------------------------------------------------------
with gr.Blocks(css=CSS_STYLE) as demo:
    gr.Markdown(
        "# Synthetic Data Generator",
        elem_id="header",
        elem_classes="text-center",
    )
    gr.Markdown(
        "**Use Language Models to Create Datasets for Specified Labels and Categories**",
        elem_classes="text-center",
    )
    with gr.Tab("Data Generator"):
        with gr.Row():
            with gr.Column(scale=2):
                gr.Markdown(
                    "## Setup & Configure",
                    elem_classes="text-center",
                )
                gr.Markdown("### Use Case")
                use_case = gr.Textbox(
                    show_label=False,
                    placeholder="Describe your use case (e.g., customer service).",
                    autofocus=True,
                )
                sample_size = gr.Number(
                    label="Total Samples to Generate",
                    value=100,
                    minimum=1
                )
                # Saved "label: description" entries live in per-session state.
                label_boxes = gr.State([])
                gr.Markdown(
                    "### Labels\nUse a colon to separate each label and its description as in 'label: description.'"
                )
                with gr.Row():
                    new_label = gr.Textbox(
                        show_label=False,
                    )
                    gr.Examples(
                        examples=[
                            "polite: Text is considerate and shows respect and good manners, often including courteous phrases and a friendly tone.",
                            "somewhat polite: Text is generally respectful but lacks warmth or formality, communicating with a decent level of courtesy.",
                        ],
                        example_labels=["polite", "somewhat polite"],
                        inputs=new_label,
                    )
                add_label_button = gr.Button("Save Label", elem_classes="btn")
                # Shared click handler (reused below for categories): append the
                # stripped entry to the state list and clear the textbox.
                def add_item(
                    label_boxes: List[Dict[str, str]], new_content: str
                ) -> Tuple[List[Dict[str, str]], str]:
                    if new_content.strip():
                        return (
                            label_boxes + [{"content": new_content.strip()}],
                            "",
                        )
                    # Blank input: leave the saved list unchanged.
                    return label_boxes, ""
                add_label_button.click(
                    add_item, [label_boxes, new_label], [label_boxes, new_label]
                )
                # Re-rendered whenever label_boxes changes.
                @gr.render(inputs=label_boxes)
                def render_boxes(box_list: List[Dict[str, str]]) -> None:
                    with gr.Accordion(f"Saved Labels ({len(box_list)})"):
                        for box in box_list:
                            with gr.Row():
                                gr.Textbox(
                                    box["content"],
                                    lines=2,
                                    show_label=False,
                                    container=False,
                                )
                                delete_button = gr.Button(
                                    "Delete", scale=0, variant="stop"
                                )
                                # `box=box` binds the current item at definition
                                # time, avoiding the late-binding closure pitfall.
                                def delete(
                                    box: Dict[str, str] = box,
                                ) -> List[Dict[str, str]]:
                                    box_list.remove(box)
                                    return box_list
                                delete_button.click(delete, None, [label_boxes])
                # Same pattern as labels: "category: type1, type2" entries.
                category_boxes = gr.State([])
                gr.Markdown(
                    "### Categories\nUse a colon to separate each category and its subcategories as in 'category: type1, type2.'"
                )
                with gr.Row():
                    new_category = gr.Textbox(show_label=False)
                    gr.Examples(
                        examples=[
                            "travel: hotel, airline, train",
                            "finance: fees and charges, credit",
                        ],
                        example_labels=["travel", "finance"],
                        inputs=new_category,
                    )
                add_category_button = gr.Button("Save Category", elem_classes="btn")
                add_category_button.click(
                    add_item,
                    [category_boxes, new_category],
                    [category_boxes, new_category],
                )
                @gr.render(inputs=category_boxes)
                def render_boxes_cat(box_list: List[Dict[str, str]]) -> None:
                    with gr.Accordion(f"Saved Categories ({len(box_list)})"):
                        for box in box_list:
                            with gr.Row():
                                gr.Textbox(
                                    box["content"],
                                    show_label=False,
                                    container=False,
                                )
                                delete_button = gr.Button(
                                    "Delete", scale=0, variant="stop"
                                )
                                def delete(
                                    box: Dict[str, str] = box,
                                ) -> List[Dict[str, str]]:
                                    box_list.remove(box)
                                    return box_list
                                delete_button.click(delete, None, [category_boxes])
                gr.Markdown(
                    "### Guiding Examples\nInclude all examples in this box. For each example, provide a LABEL, CATEGORY, TYPE, OUTPUT, and REASONING."
                )
                with gr.Row():
                    prompt_examples = gr.Textbox(
                        show_label=False,
                    )
                    gr.Examples(
                        label="Example",
                        examples=[
                            """LABEL: polite
CATEGORY: food and drink
TYPE: cafe
OUTPUT: Thank you for visiting! While we prepare your coffee, feel free to relax or browse our selection of pastries. Let us know if we can make your day even better!
REASONING: This text is polite because it expresses gratitude and encourages the customer to feel at ease with a welcoming tone. Phrases like "Let us know if we can make your day even better" show warmth and consideration, enhancing the customer experience.
LABEL: somewhat polite
CATEGORY: travel
TYPE: train
OUTPUT: I understand your concern about your booking, and I'll check what options we have for you.
REASONING: This text would be classified as "somewhat polite." The acknowledgment of the customer's concern shows a basic level of respect. The sentence is direct and lacks additional warmth or formality, but it communicates a willingness to help. The use of "I'll check" is a straightforward commitment to action without additional courteous phrases that would make it fully polite.
"""
                        ],
                        example_labels=["polite and somewhat polite"],
                        inputs=prompt_examples,
                    )
                gr.Markdown(
                    """### Language Model Configuration"""
                )
                model = gr.Dropdown(
                    label="Model",
                    choices=[
                        "google/gemma-3-1b-it",
                        "HuggingFaceTB/SmolLM2-1.7B-Instruct",
                        "meta-llama/Llama-3.2-3B-Instruct",
                    ],
                    value="google/gemma-3-1b-it",
                )
                max_new_tokens = gr.Number(
                    label="Maximum Number of New Tokens",
                    value=512
                )
                save_reasoning = gr.Checkbox(label="Save Reasoning", value=True)
                api_token = gr.Textbox(
                    label="Hugging Face Token",
                    placeholder="Paste your HF token here (required for gated models)",
                    type="password"
                )
                hf_repo_id = gr.Textbox(
                    label="HF Dataset Repository ID (Optional, e.g., your_user/my_new_data)",
                    placeholder="Enter repo ID to push dataset here",
                    value=""
                )
            with gr.Column(scale=1):
                gr.Markdown("## Generation & Output")
                status_output = gr.HTML(label="Status", value="<p class='info-text'>Enter setup details and click 'Generate'.</p>")
                generate_button = gr.Button("Generate Synthetic Data", variant="primary")
                # Per-session scratch dict used by run_sdg/update_status_ui.
                state = gr.State({})
                summary_output = gr.Markdown("Data summary will appear here.")
                clear_button = gr.Button("Clear All Inputs")
                def clear_all():
                    # NOTE(review): these values map positionally onto the
                    # outputs list below — max_new_tokens receives True and
                    # state receives "" (not {}); looks like an off-by-one /
                    # mistyped mapping — confirm intended reset values.
                    return "", 100, True, "", [], [], "", None, "", ""
                clear_button.click(
                    clear_all,
                    inputs=[],
                    outputs=[
                        use_case, sample_size, max_new_tokens, prompt_examples, label_boxes, category_boxes, hf_repo_id, summary_output, state, api_token
                    ]
                )
    # Generation runs first; the .then step inspects the returned path (routed
    # through summary_output) and reveals a download widget when the CSV exists.
    generate_button.click(
        run_sdg,
        inputs=[
            sample_size,
            model,
            max_new_tokens,
            save_reasoning,
            api_token,
            state,
            label_boxes,
            use_case,
            prompt_examples,
            category_boxes,
            hf_repo_id,
        ],
        outputs=[status_output, state, summary_output],
    ).then(
        handle_generation_success,
        inputs=[status_output, state, summary_output],
        # NOTE(review): this writes a gr.File update into summary_output, which
        # was created as a gr.Markdown component — confirm it renders as intended.
        outputs=[status_output, state, summary_output]
    )
if __name__ == "__main__":
    demo.launch()