# Synthetic Data Generator — Hugging Face Space (Gradio app)
| import os | |
| import random | |
| import re | |
| import sys | |
| from datetime import datetime | |
| from typing import Dict, List, Tuple, Any | |
| import pandas as pd | |
| from huggingface_hub import login | |
| from transformers import pipeline | |
| from huggingface_hub import HfApi | |
| import gradio as gr | |
# Inline CSS injected into gr.Blocks(css=...): gradient page header, branded
# buttons, and the info/success/error text classes emitted by update_status_ui().
CSS_STYLE = """
#header {
    font-size: 4em;
    color: #0071C5;
    background: linear-gradient(to right, #0071C5, #00BFFF);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    padding: 5px;
    border-radius: 5px;
    margin-bottom: 5px;
    text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.2);
}
.btn {
    background-color: #0071C5;
    color: white;
}
.text-center {
    text-align: center;
}
.success-text { color: green; font-weight: bold; }
.error-text { color: red; font-weight: bold; }
.info-text { color: blue; font-weight: bold; }
"""
def extract_label(input_string: str) -> Tuple[str, str]:
    """Split a 'label: description' string at the first colon.

    Returns:
        (label, description), both stripped of surrounding whitespace.

    Raises:
        ValueError: if the string contains no colon.
    """
    label, sep, description = input_string.partition(":")
    if not sep:
        raise ValueError(
            "Input string must contain a ':' separating the label and description."
        )
    return label.strip(), description.strip()
def parse_string(input_string: str) -> Tuple[str, str]:
    """Extract the OUTPUT and REASONING sections from a generated response.

    Returns:
        (output, reasoning), both stripped.

    Raises:
        ValueError: if the text does not follow 'OUTPUT:... REASONING:...'.
    """
    found = re.search(r"OUTPUT:\s*(.+?)\s*REASONING:\s*(.+)", input_string, re.DOTALL)
    if found is None:
        raise ValueError(
            "The generated response is not in the expected 'OUTPUT:... REASONING:...' format."
        )
    return found.group(1).strip(), found.group(2).strip()
def sdg(
    sample_size: int,
    labels: List[str],
    label_descriptions: str,
    categories_types: Dict[str, List[str]],
    use_case: str,
    prompt_examples: str,
    model: str,
    max_new_tokens: int,
    batch_size: int,
    output_dir: str,
    save_reasoning: bool,
) -> Tuple[str, str, str]:
    """Generate a synthetic classification dataset and stream it to a CSV.

    Draws a random (label, category, type) combination per sample, prompts the
    language model, parses the 'OUTPUT/REASONING' response, and appends each
    batch of rows to ``<output_dir>/<timestamp>.csv``.

    Args:
        sample_size: total number of rows to generate.
        labels / label_descriptions: class labels and their prose descriptions.
        categories_types: category name -> list of subtypes.
        use_case / prompt_examples: free-text context injected into the prompt.
        model: HF model id for the text-generation pipeline.
        max_new_tokens: generation cap per sample.
        batch_size: rows generated between CSV flushes.
        output_dir: directory for the CSV (created if missing).
        save_reasoning: include the model's REASONING column when True.

    Returns:
        (status message, CSV path, timestamp string).

    Raises:
        ValueError: propagated from parse_string when a response is malformed.
    """
    # Fix: the output directory may not exist on first run.
    os.makedirs(output_dir, exist_ok=True)
    categories = list(categories_types.keys())
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = os.path.join(output_dir, f"{timestamp}.csv")
    # Fix: build the pipeline ONCE. Previously it was constructed inside the
    # inner per-sample loop, reloading the model for every generated row.
    generator = pipeline("text-generation", model=model)
    num_batches = (sample_size + batch_size - 1) // batch_size  # ceil division
    for batch in range(num_batches):
        start = batch * batch_size
        end = min(start + batch_size, sample_size)
        batch_data = []
        batch_random_labels = random.choices(labels, k=end - start)
        batch_random_categories = random.choices(categories, k=end - start)
        for idx in range(end - start):
            # Fix: random.choices() (no k) returns a 1-element LIST, which
            # rendered as e.g. "TYPE: ['cafe']" in the prompt; random.choice()
            # yields the bare string.
            random_type = random.choice(
                categories_types[batch_random_categories[idx]]
            )
            prompt = f"""You should create synthetic data for specified labels and categories. 

This is especially useful for {use_case}.

*Label Descriptions*
{label_descriptions}

*Examples*
{prompt_examples}

####################

Generate one output for the classification below.
You may use the examples I have provided as a guide, but you cannot simply modify or rewrite them.
Only return the OUTPUT and REASONING. The first token in your response must be OUTPUT.
Do not return the LABEL, CATEGORY, or TYPE.

LABEL: {batch_random_labels[idx]}
CATEGORY: {batch_random_categories[idx]}
TYPE: {random_type}
OUTPUT:
REASONING:
"""
            messages = [
                {
                    "role": "system",
                    "content": f"You are a helpful assistant designed to generate synthetic data for {use_case} with labels {labels} in categories {categories}. The first token in your generated text must be OUTPUT: This must be followed by the token REASONING: as in the prompt examples.",
                },
                {"role": "user", "content": prompt},
            ]
            result = generator(messages, max_new_tokens=max_new_tokens)[0][
                "generated_text"
            ]
            # Robustness: chat pipelines may return the full message list as
            # generated_text rather than a string; take the assistant reply.
            if isinstance(result, list):
                result = result[-1]["content"]
            text, reasoning = parse_string(result)
            entry = {
                "text": text,
                "label": batch_random_labels[idx],
                "source": model,
            }
            if save_reasoning:
                entry["reasoning"] = reasoning
            batch_data.append(entry)
        batch_df = pd.DataFrame(batch_data)
        # First batch writes the header; later batches append rows only.
        if batch == 0:
            batch_df.to_csv(output_path, mode="w", index=False)
        else:
            batch_df.to_csv(output_path, mode="a", header=False, index=False)
    return f"Synthetic data saved to {output_path}", output_path, timestamp
| def update_status_ui(message: str, state_obj: Dict) -> Tuple[str, Dict]: | |
| css_class = "info-text" | |
| if "Error" in message or "error" in message: | |
| css_class = "error-text" | |
| elif "success" in message or "pushed" in message: | |
| css_class = "success-text" | |
| html_message = f'<p class="{css_class}">{message}</p>' | |
| state_obj['last_status_html'] = html_message | |
| return html_message, state_obj | |
def run_sdg(
    sample_size: int,
    model: str,
    max_new_tokens: int,
    save_reasoning: bool,
    token: str,
    state: Dict,
    label_boxes: List[Dict[str, str]],
    use_case: str,
    prompt_examples: str,
    category_boxes: List[Dict[str, str]],
    hf_repo_id: str,
) -> Tuple[str, Dict, str]:
    """Validate inputs, log into Hugging Face, run sdg(), optionally push the CSV.

    Args mirror the Gradio inputs wired in the UI; ``state`` is the session
    scratch dict, ``label_boxes``/``category_boxes`` are lists of
    ``{"content": "name: description"}`` entries.

    Returns:
        (status HTML, updated state, local CSV path or "" on error/upload).
    """
    if not all([token, use_case, label_boxes, category_boxes, prompt_examples]):
        msg = "Validation Error: Token, Use Case, Labels, Categories, and Examples are required."
        return update_status_ui(msg, state)[0], state, ""
    try:
        login(token)
        status_msg = "Logged into Hugging Face successfully."
        state = update_status_ui(status_msg, state)[1]
    except Exception as e:
        msg = f"Error logging in with token: {e}"
        return update_status_ui(msg, state)[0], state, ""
    try:
        # Flatten the saved label boxes into prompt text + a label list.
        label_descriptions = ""
        labels = []
        for box in label_boxes:
            label_descriptions += box["content"] + "\n"
            label, _ = extract_label(box["content"])
            labels.append(label)
        # "category: t1, t2" -> {"category": ["t1", "t2"]}
        categories_types = {}
        for box in category_boxes:
            category, types = extract_label(box["content"])
            categories_types[category.strip()] = [t.strip() for t in types.split(",")]
        status, output_path, timestamp = sdg(
            sample_size=sample_size,
            labels=labels,
            label_descriptions=label_descriptions,
            categories_types=categories_types,
            use_case=use_case,
            prompt_examples=prompt_examples,
            model=model,
            max_new_tokens=max_new_tokens,
            batch_size=20,
            output_dir="./output_data",
            save_reasoning=save_reasoning,
        )
        state["output_path"] = output_path
        state["timestamp"] = timestamp
        status_msg_local = update_status_ui(status, state)[0]
        if hf_repo_id and token:
            api = HfApi(token=token)
            if not api.repo_exists(hf_repo_id):
                api.create_repo(repo_id=hf_repo_id, repo_type="dataset", private=False)
                status_msg_upload = f"Created new dataset repository: {hf_repo_id}"
                status_msg_local = update_status_ui(status_msg_upload, state)[0]
            # Fix: upload the CSV that was just generated. The old code called
            # sdg() a SECOND time (regenerating every sample), omitted the
            # required output_dir argument (TypeError), and wrapped sdg()'s
            # (status, path, timestamp) tuple in a DataFrame instead of rows.
            api.upload_file(
                path_or_fileobj=output_path,
                path_in_repo="data.csv",
                repo_id=hf_repo_id,
                repo_type="dataset",
            )
            # Fix: dataset repositories live under /datasets/ on the Hub.
            dataset_url = f"https://huggingface.co/datasets/{hf_repo_id}/blob/main/data.csv"
            status_msg_upload_final = f"Successfully pushed data to HF Dataset: {hf_repo_id}. View at: {dataset_url}"
            status_msg_final = update_status_ui(status_msg_upload_final, state)[0]
            state["hf_repo_id"] = hf_repo_id
            return status_msg_final, state, ""
        return status_msg_local, state, output_path
    except Exception as e:
        msg = f"Generation Error: {e}"
        return update_status_ui(msg, state)[0], state, ""
def handle_generation_success(status, state_val, output_path_or_none):
    """Show a download widget when a CSV exists on disk; hide it otherwise."""
    has_file = bool(output_path_or_none) and os.path.exists(output_path_or_none)
    if has_file:
        return status, state_val, gr.File(output_path_or_none, visible=True)
    return status, state_val, gr.File(None, visible=False)
# ---------------------------------------------------------------------------
# Gradio UI: one "Data Generator" tab with a configuration column (left) and
# a generation/output column (right).
# ---------------------------------------------------------------------------
with gr.Blocks(css=CSS_STYLE) as demo:
    gr.Markdown(
        "# Synthetic Data Generator",
        elem_id="header",
        elem_classes="text-center",
    )
    gr.Markdown(
        "**Use Language Models to Create Datasets for Specified Labels and Categories**",
        elem_classes="text-center",
    )
    with gr.Tab("Data Generator"):
        with gr.Row():
            # Left column: use case, labels, categories, examples, model config.
            with gr.Column(scale=2):
                gr.Markdown(
                    "## Setup & Configure",
                    elem_classes="text-center",
                )
                gr.Markdown("### Use Case")
                use_case = gr.Textbox(
                    show_label=False,
                    placeholder="Describe your use case (e.g., customer service).",
                    autofocus=True,
                )
                sample_size = gr.Number(
                    label="Total Samples to Generate",
                    value=100,
                    minimum=1,
                )
                # Saved labels: session state of {"content": "label: description"}.
                label_boxes = gr.State([])
                gr.Markdown(
                    "### Labels\nUse a colon to separate each label and its description as in 'label: description.'"
                )
                with gr.Row():
                    new_label = gr.Textbox(
                        show_label=False,
                    )
                    gr.Examples(
                        examples=[
                            "polite: Text is considerate and shows respect and good manners, often including courteous phrases and a friendly tone.",
                            "somewhat polite: Text is generally respectful but lacks warmth or formality, communicating with a decent level of courtesy.",
                        ],
                        example_labels=["polite", "somewhat polite"],
                        inputs=new_label,
                    )
                add_label_button = gr.Button("Save Label", elem_classes="btn")

                def add_item(
                    label_boxes: List[Dict[str, str]], new_content: str
                ) -> Tuple[List[Dict[str, str]], str]:
                    """Append a non-empty entry to the state list and clear the textbox."""
                    if new_content.strip():
                        return (
                            label_boxes + [{"content": new_content.strip()}],
                            "",
                        )
                    return label_boxes, ""

                add_label_button.click(
                    add_item, [label_boxes, new_label], [label_boxes, new_label]
                )

                # NOTE(review): looks like this was written for
                # @gr.render(inputs=label_boxes); without that decorator it is
                # never invoked — confirm against the original app.
                def render_boxes(box_list: List[Dict[str, str]]) -> None:
                    """Render saved labels in an accordion with per-row delete buttons."""
                    with gr.Accordion(f"Saved Labels ({len(box_list)})"):
                        for box in box_list:
                            with gr.Row():
                                gr.Textbox(
                                    box["content"],
                                    lines=2,
                                    show_label=False,
                                    container=False,
                                )
                                delete_button = gr.Button(
                                    "Delete", scale=0, variant="stop"
                                )

                                # `box=box` binds the current row's entry as a
                                # default arg (avoids the late-binding-closure
                                # pitfall inside the loop).
                                def delete(
                                    box: Dict[str, str] = box,
                                ) -> List[Dict[str, str]]:
                                    box_list.remove(box)
                                    return box_list

                                delete_button.click(delete, None, [label_boxes])

                # Saved categories: session state of {"content": "category: t1, t2"}.
                category_boxes = gr.State([])
                gr.Markdown(
                    "### Categories\nUse a colon to separate each category and its subcategories as in 'category: type1, type2.'"
                )
                with gr.Row():
                    new_category = gr.Textbox(show_label=False)
                    gr.Examples(
                        examples=[
                            "travel: hotel, airline, train",
                            "finance: fees and charges, credit",
                        ],
                        example_labels=["travel", "finance"],
                        inputs=new_category,
                    )
                add_category_button = gr.Button("Save Category", elem_classes="btn")
                add_category_button.click(
                    add_item,
                    [category_boxes, new_category],
                    [category_boxes, new_category],
                )

                # NOTE(review): same as render_boxes above — appears to expect
                # @gr.render(inputs=category_boxes); never invoked as written.
                def render_boxes_cat(box_list: List[Dict[str, str]]) -> None:
                    """Render saved categories in an accordion with delete buttons."""
                    with gr.Accordion(f"Saved Categories ({len(box_list)})"):
                        for box in box_list:
                            with gr.Row():
                                gr.Textbox(
                                    box["content"],
                                    show_label=False,
                                    container=False,
                                )
                                delete_button = gr.Button(
                                    "Delete", scale=0, variant="stop"
                                )

                                def delete(
                                    box: Dict[str, str] = box,
                                ) -> List[Dict[str, str]]:
                                    box_list.remove(box)
                                    return box_list

                                delete_button.click(delete, None, [category_boxes])

                gr.Markdown(
                    "### Guiding Examples\nInclude all examples in this box. For each example, provide a LABEL, CATEGORY, TYPE, OUTPUT, and REASONING."
                )
                with gr.Row():
                    prompt_examples = gr.Textbox(
                        show_label=False,
                    )
                    gr.Examples(
                        label="Example",
                        examples=[
                            """LABEL: polite
CATEGORY: food and drink
TYPE: cafe
OUTPUT: Thank you for visiting! While we prepare your coffee, feel free to relax or browse our selection of pastries. Let us know if we can make your day even better!
REASONING: This text is polite because it expresses gratitude and encourages the customer to feel at ease with a welcoming tone. Phrases like "Let us know if we can make your day even better" show warmth and consideration, enhancing the customer experience.

LABEL: somewhat polite
CATEGORY: travel
TYPE: train
OUTPUT: I understand your concern about your booking, and I'll check what options we have for you.
REASONING: This text would be classified as "somewhat polite." The acknowledgment of the customer's concern shows a basic level of respect. The sentence is direct and lacks additional warmth or formality, but it communicates a willingness to help. The use of "I'll check" is a straightforward commitment to action without additional courteous phrases that would make it fully polite.
"""
                        ],
                        example_labels=["polite and somewhat polite"],
                        inputs=prompt_examples,
                    )
                gr.Markdown(
                    """### Language Model Configuration"""
                )
                model = gr.Dropdown(
                    label="Model",
                    choices=[
                        "google/gemma-3-1b-it",
                        "HuggingFaceTB/SmolLM2-1.7B-Instruct",
                        "meta-llama/Llama-3.2-3B-Instruct",
                    ],
                    value="google/gemma-3-1b-it",
                )
                max_new_tokens = gr.Number(
                    label="Maximum Number of New Tokens",
                    value=512,
                )
                save_reasoning = gr.Checkbox(label="Save Reasoning", value=True)
                api_token = gr.Textbox(
                    label="Hugging Face Token",
                    placeholder="Paste your HF token here (required for gated models)",
                    type="password",
                )
                hf_repo_id = gr.Textbox(
                    label="HF Dataset Repository ID (Optional, e.g., your_user/my_new_data)",
                    placeholder="Enter repo ID to push dataset here",
                    value="",
                )
            # Right column: trigger generation and display status/results.
            with gr.Column(scale=1):
                gr.Markdown("## Generation & Output")
                status_output = gr.HTML(label="Status", value="<p class='info-text'>Enter setup details and click 'Generate'.</p>")
                generate_button = gr.Button("Generate Synthetic Data", variant="primary")
                # Session scratch dict (output_path, timestamp, last status HTML).
                state = gr.State({})
                summary_output = gr.Markdown("Data summary will appear here.")
                clear_button = gr.Button("Clear All Inputs")

                def clear_all():
                    """Reset every wired component to its initial value.

                    Fix: the tuple now matches the ``outputs`` list below —
                    previously position 3 returned True into the
                    ``max_new_tokens`` Number and ``state`` was reset to ""
                    (a str where callbacks expect a dict, which made
                    ``update_status_ui`` raise TypeError after a clear).
                    """
                    return "", 100, 512, "", [], [], "", None, {}, ""

                clear_button.click(
                    clear_all,
                    inputs=[],
                    outputs=[
                        use_case, sample_size, max_new_tokens, prompt_examples, label_boxes, category_boxes, hf_repo_id, summary_output, state, api_token
                    ],
                )
                generate_button.click(
                    run_sdg,
                    inputs=[
                        sample_size,
                        model,
                        max_new_tokens,
                        save_reasoning,
                        api_token,
                        state,
                        label_boxes,
                        use_case,
                        prompt_examples,
                        category_boxes,
                        hf_repo_id,
                    ],
                    outputs=[status_output, state, summary_output],
                ).then(
                    # After generation, swap the summary slot for a download
                    # widget when the CSV exists on disk.
                    handle_generation_success,
                    inputs=[status_output, state, summary_output],
                    outputs=[status_output, state, summary_output],
                )

if __name__ == "__main__":
    demo.launch()