File size: 6,163 Bytes
6e6a649
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import gradio as gr
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tempfile
from faker import Faker
import random
from huggingface_hub import InferenceClient

# Initialize Faker for synthetic data fallback
# Module-level singleton reused by generate_synthetic_data() whenever no
# Hugging Face API token is supplied (or an HF API call fails).
fake = Faker()

# Function to extract ALL text from a webpage
def extract_all_text_from_url(url):
    """Fetch *url* and return every visible text fragment on the page.

    Args:
        url: Absolute URL of the page to scrape.

    Returns:
        list[str]: All non-empty, whitespace-stripped text fragments,
        in document order.

    Raises:
        ValueError: If the request fails, returns a non-2xx status, or
        the response cannot be parsed.
    """
    try:
        # timeout= prevents the whole app from hanging forever on a
        # slow or dead host (requests has no default timeout).
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # stripped_strings already yields stripped, non-empty strings,
        # so no extra strip/filter pass is needed.
        return list(soup.stripped_strings)
    except Exception as e:
        # Chain the original exception so the real cause stays visible
        # in tracebacks.
        raise ValueError(f"Error fetching or parsing the URL: {e}") from e

# Function to apply common-sense filtering
def apply_common_sense(text_list):
    """Filter out trivially-short/blank fragments and deduplicate.

    Args:
        text_list: Iterable of text fragments.

    Returns:
        list[str]: Unique fragments of length >= 3 that are not pure
        whitespace, in first-seen order.
    """
    kept = (text for text in text_list if len(text) >= 3 and not text.isspace())
    # dict.fromkeys dedupes while preserving first-seen order, making
    # the output deterministic (a plain set() returned arbitrary order
    # that could differ between runs).
    return list(dict.fromkeys(kept))

# Function to generate synthetic data using HF Inference API or Faker fallback
def generate_synthetic_data(text_list, num_synthetic, hf_model, hf_api_token):
    """Produce *num_synthetic* synthetic text entries seeded from *text_list*.

    Args:
        text_list: Source fragments to base variations on.
        num_synthetic: How many synthetic entries to generate.
        hf_model: Hugging Face model id used when a token is supplied.
        hf_api_token: HF API token; falsy means "use the Faker fallback".

    Returns:
        list[str]: The generated entries (length == num_synthetic).
    """
    synthetic_data = []
    if not text_list:
        # Guarantee at least one seed text so random.choice never fails.
        text_list = [fake.sentence()]

    if not hf_api_token:
        # Fallback to Faker if no token provided: shuffle a source
        # fragment's words and append a random sentence.
        for _ in range(num_synthetic):
            base_text = random.choice(text_list)
            words = base_text.split()
            random.shuffle(words)
            synthetic_data.append(" ".join(words) + " " + fake.sentence(nb_words=random.randint(3, 10)))
    else:
        # Use HF Inference API
        client = InferenceClient(model=hf_model, token=hf_api_token)
        for _ in range(num_synthetic):
            base_text = random.choice(text_list)
            try:
                prompt = f"Generate a creative variation of this text: '{base_text}'"
                # BUG FIX: text_generation() takes max_new_tokens, not
                # max_length. The old kwarg raised TypeError on every
                # call, so the HF path silently fell through to the
                # Faker fallback below and never produced model output.
                generated = client.text_generation(prompt, max_new_tokens=50, temperature=0.7)
                synthetic_data.append(generated.strip())
            except Exception:
                # Best-effort per-entry fallback when the API call fails.
                words = base_text.split()
                synthetic_data.append(fake.sentence() + " " + " ".join(random.sample(words, min(len(words), 5))))

    return synthetic_data

# Function to sort text by length
def sort_text_by_length(text_list):
    """Return a new list with entries ordered from shortest to longest."""
    ordered = list(text_list)
    ordered.sort(key=len)
    return ordered

# Function to create a DataFrame with only a text column
def create_dataframe(text_list, column_text):
    """Wrap *text_list* in a single-column DataFrame named *column_text*."""
    return pd.DataFrame(data={column_text: text_list})

# Function to generate a CSV file
def download_csv(df):
    """Write *df* to a temporary CSV file and return its path.

    The temp-file handle is closed before pandas reopens the path:
    on Windows an open NamedTemporaryFile cannot be opened a second
    time, so writing via df.to_csv(tmp.name) inside the `with` block
    failed there.

    Args:
        df: DataFrame to export.

    Returns:
        str: Filesystem path of the written CSV (caller owns cleanup).
    """
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
    tmp.close()
    df.to_csv(tmp.name, index=False)
    return tmp.name

# Function to generate a JSON file
def download_json(df):
    """Write *df* to a temporary JSON file (records orient) and return its path.

    The temp-file handle is closed before pandas reopens the path:
    on Windows an open NamedTemporaryFile cannot be opened a second
    time, so writing via df.to_json(tmp.name) inside the `with` block
    failed there.

    Args:
        df: DataFrame to export.

    Returns:
        str: Filesystem path of the written JSON (caller owns cleanup).
    """
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.json')
    tmp.close()
    df.to_json(tmp.name, orient='records')
    return tmp.name

# Gradio interface
# Builds the UI, wires the process/download callbacks, and launches the app.
with gr.Blocks() as demo:
    # Header
    gr.Markdown("# Webtaset: Website to Dataset Converter")
    gr.Markdown("Extract all text from a URL, apply common-sense filtering, generate synthetic data with lightweight HF models, and download as a dataset. Provide your own HF API token for advanced features.")

    # Inputs
    url = gr.Textbox(label="Enter the URL", placeholder="https://example.com")
    column_text = gr.Textbox(label="Column name for text", value="Text")
    num_synthetic = gr.Slider(label="Number of synthetic data entries", minimum=0, maximum=1000, step=1, value=0)
    hf_model = gr.Dropdown(
        label="Hugging Face Model (lightweight)",
        choices=["distilgpt2", "facebook/bart-base", "gpt2"],
        value="distilgpt2"
    )
    hf_api_token = gr.Textbox(
        label="Hugging Face API Token (required for HF models)",
        type="password",
        placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    )

    # Process button
    process_btn = gr.Button("Process")

    # Outputs
    df_preview = gr.Dataframe(label="Dataset Preview")
    state = gr.State()  # To store the DataFrame between the process and download steps
    status = gr.Textbox(label="Status", interactive=False)

    download_csv_btn = gr.Button("Download CSV")
    download_json_btn = gr.Button("Download JSON")
    csv_file = gr.File(label="Download CSV")
    json_file = gr.File(label="Download JSON")

    # Process function
    def process(url, column_text, num_synthetic, hf_model, hf_api_token):
        """Run the full scrape -> filter -> augment -> sort -> DataFrame pipeline.

        Returns (preview_df, state_df, status_message); on failure returns
        (None, None, error_message) so the UI shows the error instead of
        crashing.

        NOTE(review): range() in generate_synthetic_data requires an int;
        presumably the Slider delivers one given step=1 — confirm, a float
        value would raise TypeError.
        """
        try:
            # Step 1 & 2: Get URL and extract ALL text
            text_list = extract_all_text_from_url(url)

            # Add common-sense filtering
            filtered_text = apply_common_sense(text_list)

            # Generate synthetic data if requested
            if num_synthetic > 0:
                synthetic_data = generate_synthetic_data(filtered_text, num_synthetic, hf_model, hf_api_token)
                filtered_text.extend(synthetic_data)

            # Step 5 & 6: Sort by increasing size
            sorted_text = sort_text_by_length(filtered_text)

            # Step 7: Create DataFrame with user-defined column name
            df = create_dataframe(sorted_text, column_text)

            # Step 8: Return for preview and state
            # filtered_text now includes the synthetic rows, so subtract
            # num_synthetic to report the original filtered count.
            method = "Faker" if not hf_api_token else hf_model
            return df, df, f"Processing complete. Extracted {len(text_list)} items, filtered to {len(filtered_text) - num_synthetic}, added {num_synthetic} synthetic using {method}."
        except Exception as e:
            return None, None, f"Error: {e}"

    # Connect process button
    process_btn.click(
        fn=process,
        inputs=[url, column_text, num_synthetic, hf_model, hf_api_token],
        outputs=[df_preview, state, status]
    )

    # Download CSV function
    def gen_csv(state):
        """Export the stored DataFrame as CSV; no-op if nothing processed yet."""
        if state is None:
            return None
        return download_csv(state)

    # Download JSON function
    def gen_json(state):
        """Export the stored DataFrame as JSON; no-op if nothing processed yet."""
        if state is None:
            return None
        return download_json(state)

    # Connect download buttons
    download_csv_btn.click(fn=gen_csv, inputs=state, outputs=csv_file)
    download_json_btn.click(fn=gen_json, inputs=state, outputs=json_file)

# Launch the app
demo.launch()