Spaces:

PhaseTechnologies
/

Webtaset

Sleeping

App Files Files Community

Phase-Technologies committed on Mar 14, 2025

Commit

6e6a649

verified ·

1 Parent(s): 5153619

Create app.py

Browse files

Files changed (1) hide show

app.py +161 -0

app.py ADDED Viewed

	@@ -0,0 +1,161 @@

+import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import tempfile
+from faker import Faker
+import random
+from huggingface_hub import InferenceClient
+# Module-level Faker instance shared by the whole app. It supplies filler
+# sentences for the token-less fallback path and for HF calls that fail.
+fake = Faker()
+# Function to extract ALL text from a webpage
+def extract_all_text_from_url(url, timeout=10):
+    """Fetch *url* and return every non-empty text fragment found in its HTML.
+
+    Args:
+        url: Address of the page to download.
+        timeout: Seconds to wait for the server before giving up. requests
+            applies no timeout by default, so without one a stalled server
+            would block the caller indefinitely.
+
+    Returns:
+        A list of whitespace-stripped strings, in the order BeautifulSoup
+        yields them.
+
+    Raises:
+        ValueError: If the download or the parsing fails; the original
+            exception is chained as the cause.
+    """
+    try:
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, 'html.parser')
+        # stripped_strings already trims each fragment and skips blank ones,
+        # so no further strip/filter pass is needed.
+        return list(soup.stripped_strings)
+    except Exception as e:
+        # Deliberately broad: callers rely on a single ValueError for any
+        # network, HTTP-status or parsing failure.
+        raise ValueError(f"Error fetching or parsing the URL: {e}") from e
+# Function to apply common-sense filtering
+def apply_common_sense(text_list):
+    """Drop very short or blank entries and remove duplicates.
+
+    Args:
+        text_list: Iterable of strings to filter.
+
+    Returns:
+        A new list holding each string of at least three characters that is
+        not pure whitespace, once, in first-seen order.
+    """
+    kept = (text for text in text_list if len(text) >= 3 and not text.isspace())
+    # dict.fromkeys de-duplicates while preserving insertion order; a set
+    # would return the items in an arbitrary, run-dependent order.
+    return list(dict.fromkeys(kept))
+# Function to generate synthetic data using HF Inference API or Faker fallback
+def generate_synthetic_data(text_list, num_synthetic, hf_model, hf_api_token):
+    """Produce ``num_synthetic`` synthetic strings derived from ``text_list``.
+
+    Without an API token each entry is a word-shuffled copy of a random
+    source text followed by a Faker sentence. With a token each entry is
+    requested from the Hugging Face Inference API; if a request fails, that
+    entry falls back to a Faker sentence plus up to five words sampled from
+    the source text.
+
+    Args:
+        text_list: Source strings to vary. When empty, a single Faker
+            sentence is used as the seed (the caller's list is not modified).
+        num_synthetic: Number of entries to generate.
+        hf_model: Model id passed to ``InferenceClient``.
+        hf_api_token: Hugging Face token; falsy selects the Faker-only path.
+
+    Returns:
+        A list of generated strings.
+    """
+    # Coerce so a float count (e.g. 5.0) still works with range().
+    num_synthetic = int(num_synthetic)
+    if not text_list:
+        text_list = [fake.sentence()]
+    synthetic_data = []
+    if not hf_api_token:
+        # Fallback to Faker if no token provided
+        for _ in range(num_synthetic):
+            words = random.choice(text_list).split()
+            random.shuffle(words)
+            synthetic_data.append(" ".join(words) + " " + fake.sentence(nb_words=random.randint(3, 10)))
+        return synthetic_data
+    # Use HF Inference API
+    client = InferenceClient(model=hf_model, token=hf_api_token)
+    for _ in range(num_synthetic):
+        base_text = random.choice(text_list)
+        try:
+            prompt = f"Generate a creative variation of this text: '{base_text}'"
+            # text_generation() limits output with max_new_tokens; it has no
+            # max_length keyword, so passing one raised TypeError on every
+            # call and silently forced the fallback below.
+            generated = client.text_generation(prompt, max_new_tokens=50, temperature=0.7)
+            synthetic_data.append(generated.strip())
+        except Exception:
+            # Best effort: one failed request must not abort the whole batch.
+            words = base_text.split()
+            synthetic_data.append(fake.sentence() + " " + " ".join(random.sample(words, min(len(words), 5))))
+    return synthetic_data
+# Function to sort text by length
+def sort_text_by_length(text_list):
+    """Return a new list of the given strings ordered from shortest to longest.
+
+    The sort is stable, so strings of equal length keep their input order.
+    """
+    ordered = list(text_list)
+    ordered.sort(key=len)
+    return ordered
+# Function to create a DataFrame with only a text column
+def create_dataframe(text_list, column_text):
+    """Wrap ``text_list`` in a one-column DataFrame named ``column_text``."""
+    single_column = {column_text: text_list}
+    return pd.DataFrame(data=single_column)
+# Function to generate a CSV file
+def download_csv(df):
+    """Write ``df`` to a new temporary ``.csv`` file and return its path.
+
+    The file is created with ``delete=False`` so it is still on disk after
+    this function returns; nothing here removes it.
+    """
+    handle = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
+    with handle:
+        csv_path = handle.name
+    # The index is left out so the file contains only the data columns.
+    df.to_csv(csv_path, index=False)
+    return csv_path
+# Function to generate a JSON file
+def download_json(df):
+    """Write ``df`` to a new temporary ``.json`` file and return its path.
+
+    Rows are serialised with ``orient='records'``, i.e. one JSON object per
+    row. The file is created with ``delete=False`` so it is still on disk
+    after this function returns; nothing here removes it.
+    """
+    handle = tempfile.NamedTemporaryFile(delete=False, suffix='.json')
+    with handle:
+        json_path = handle.name
+    df.to_json(json_path, orient='records')
+    return json_path
+# Gradio interface
+with gr.Blocks() as demo:
+    # Header
+    gr.Markdown("# Webtaset: Website to Dataset Converter")
+    gr.Markdown("Extract all text from a URL, apply common-sense filtering, generate synthetic data with lightweight HF models, and download as a dataset. Provide your own HF API token for advanced features.")
+    # Inputs: these five components are passed, in this order, to process().
+    url = gr.Textbox(label="Enter the URL", placeholder="https://example.com")
+    column_text = gr.Textbox(label="Column name for text", value="Text")
+    # Defaults to 0, which makes process() skip synthetic generation.
+    num_synthetic = gr.Slider(label="Number of synthetic data entries", minimum=0, maximum=1000, step=1, value=0)
+    hf_model = gr.Dropdown(
+        label="Hugging Face Model (lightweight)",
+        choices=["distilgpt2", "facebook/bart-base", "gpt2"],
+        value="distilgpt2"
+    )
+    # An empty token selects the Faker-only path in generate_synthetic_data().
+    hf_api_token = gr.Textbox(
+        label="Hugging Face API Token (required for HF models)",
+        type="password",
+        placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+    )
+    # Process button
+    process_btn = gr.Button("Process")
+    # Outputs
+    df_preview = gr.Dataframe(label="Dataset Preview")
+    state = gr.State()  # To store the DataFrame
+    status = gr.Textbox(label="Status", interactive=False)
+    download_csv_btn = gr.Button("Download CSV")
+    download_json_btn = gr.Button("Download JSON")
+    csv_file = gr.File(label="Download CSV")
+    json_file = gr.File(label="Download JSON")
+    # Process function
+    def process(url, column_text, num_synthetic, hf_model, hf_api_token):
+        """Build the dataset for one URL.
+
+        Returns a 3-tuple matching the outputs wired below: the DataFrame for
+        the preview, the same DataFrame for the state, and a status message.
+        On any exception both DataFrame slots are None and the status message
+        carries the error text.
+        """
+        try:
+            # Fetch the page and extract ALL of its text fragments
+            text_list = extract_all_text_from_url(url)
+            # Add common-sense filtering
+            filtered_text = apply_common_sense(text_list)
+            # Generate synthetic data if requested
+            if num_synthetic > 0:
+                synthetic_data = generate_synthetic_data(filtered_text, num_synthetic, hf_model, hf_api_token)
+                # Synthetic entries are appended to the same list as the scraped ones.
+                filtered_text.extend(synthetic_data)
+            # Sort by increasing length
+            sorted_text = sort_text_by_length(filtered_text)
+            # Create DataFrame with user-defined column name
+            df = create_dataframe(sorted_text, column_text)
+            # Return for preview and state
+            # Label reflects the requested method only; per-entry HF failures
+            # that fell back to Faker are not visible here.
+            method = "Faker" if not hf_api_token else hf_model
+            # filtered_text now includes the synthetic entries, so subtract
+            # them to report the scraped-and-filtered count.
+            return df, df, f"Processing complete. Extracted {len(text_list)} items, filtered to {len(filtered_text) - num_synthetic}, added {num_synthetic} synthetic using {method}."
+        except Exception as e:
+            return None, None, f"Error: {e}"
+    # Connect process button
+    process_btn.click(
+        fn=process,
+        inputs=[url, column_text, num_synthetic, hf_model, hf_api_token],
+        outputs=[df_preview, state, status]
+    )
+    # Download CSV function
+    def gen_csv(state):
+        """Return the path of a CSV export of the stored DataFrame, or None if there is none."""
+        if state is None:
+            return None
+        return download_csv(state)
+    # Download JSON function
+    def gen_json(state):
+        """Return the path of a JSON export of the stored DataFrame, or None if there is none."""
+        if state is None:
+            return None
+        return download_json(state)
+    # Connect download buttons
+    download_csv_btn.click(fn=gen_csv, inputs=state, outputs=csv_file)
+    download_json_btn.click(fn=gen_json, inputs=state, outputs=json_file)
+# Launch the app
+# Runs at import time: there is no __main__ guard around this call.
+demo.launch()