File size: 6,163 Bytes
6e6a649
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import gradio as gr
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tempfile
from faker import Faker
import random
from huggingface_hub import InferenceClient

# Initialize Faker for synthetic data fallback
# Module-level singleton reused by generate_synthetic_data() whenever no
# Hugging Face API token is supplied (or an HF API call fails).
fake = Faker()

# Function to extract ALL text from a webpage
def extract_all_text_from_url(url):
    """Fetch *url* and return every visible text fragment on the page.

    Args:
        url: Absolute URL of the page to scrape.

    Returns:
        list[str]: All non-empty, whitespace-stripped text fragments,
        in document order.

    Raises:
        ValueError: If the request fails, returns a non-2xx status, or
        the response cannot be parsed.
    """
    try:
        # timeout= prevents the whole app from hanging forever on a
        # slow or dead host (requests has no default timeout).
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # stripped_strings already yields stripped, non-empty strings,
        # so no extra strip/filter pass is needed.
        return list(soup.stripped_strings)
    except Exception as e:
        # Chain the original exception so the real cause stays visible
        # in tracebacks.
        raise ValueError(f"Error fetching or parsing the URL: {e}") from e

# Function to apply common-sense filtering
def apply_common_sense(text_list):
    """Filter out trivially-short/blank fragments and deduplicate.

    Args:
        text_list: Iterable of text fragments.

    Returns:
        list[str]: Unique fragments of length >= 3 that are not pure
        whitespace, in first-seen order.
    """
    kept = (text for text in text_list if len(text) >= 3 and not text.isspace())
    # dict.fromkeys dedupes while preserving first-seen order, making
    # the output deterministic (a plain set() returned arbitrary order
    # that could differ between runs).
    return list(dict.fromkeys(kept))

# Function to generate synthetic data using HF Inference API or Faker fallback
def generate_synthetic_data(text_list, num_synthetic, hf_model, hf_api_token):
    """Produce *num_synthetic* synthetic text entries seeded from *text_list*.

    Args:
        text_list: Source fragments to base variations on.
        num_synthetic: How many synthetic entries to generate.
        hf_model: Hugging Face model id used when a token is supplied.
        hf_api_token: HF API token; falsy means "use the Faker fallback".

    Returns:
        list[str]: The generated entries (length == num_synthetic).
    """
    synthetic_data = []
    if not text_list:
        # Guarantee at least one seed text so random.choice never fails.
        text_list = [fake.sentence()]

    if not hf_api_token:
        # Fallback to Faker if no token provided: shuffle a source
        # fragment's words and append a random sentence.
        for _ in range(num_synthetic):
            base_text = random.choice(text_list)
            words = base_text.split()
            random.shuffle(words)
            synthetic_data.append(" ".join(words) + " " + fake.sentence(nb_words=random.randint(3, 10)))
    else:
        # Use HF Inference API
        client = InferenceClient(model=hf_model, token=hf_api_token)
        for _ in range(num_synthetic):
            base_text = random.choice(text_list)
            try:
                prompt = f"Generate a creative variation of this text: '{base_text}'"
                # BUG FIX: text_generation() takes max_new_tokens, not
                # max_length. The old kwarg raised TypeError on every
                # call, so the HF path silently fell through to the
                # Faker fallback below and never produced model output.
                generated = client.text_generation(prompt, max_new_tokens=50, temperature=0.7)
                synthetic_data.append(generated.strip())
            except Exception:
                # Best-effort per-entry fallback when the API call fails.
                words = base_text.split()
                synthetic_data.append(fake.sentence() + " " + " ".join(random.sample(words, min(len(words), 5))))

    return synthetic_data

# Function to sort text by length
def sort_text_by_length(text_list):
    """Return a new list with entries ordered from shortest to longest."""
    ordered = list(text_list)
    ordered.sort(key=len)
    return ordered

# Function to create a DataFrame with only a text column
def create_dataframe(text_list, column_text):
    """Wrap *text_list* in a single-column DataFrame named *column_text*."""
    return pd.DataFrame(data={column_text: text_list})

# Function to generate a CSV file
def download_csv(df):
    """Write *df* to a temporary CSV file and return its path.

    The temp-file handle is closed before pandas reopens the path:
    on Windows an open NamedTemporaryFile cannot be opened a second
    time, so writing via df.to_csv(tmp.name) inside the `with` block
    failed there.

    Args:
        df: DataFrame to export.

    Returns:
        str: Filesystem path of the written CSV (caller owns cleanup).
    """
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
    tmp.close()
    df.to_csv(tmp.name, index=False)
    return tmp.name

# Function to generate a JSON file
def download_json(df):
    """Write *df* to a temporary JSON file (records orient) and return its path.

    The temp-file handle is closed before pandas reopens the path:
    on Windows an open NamedTemporaryFile cannot be opened a second
    time, so writing via df.to_json(tmp.name) inside the `with` block
    failed there.

    Args:
        df: DataFrame to export.

    Returns:
        str: Filesystem path of the written JSON (caller owns cleanup).
    """
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.json')
    tmp.close()
    df.to_json(tmp.name, orient='records')
    return tmp.name

# Gradio interface
# Builds the UI, wires the process/download callbacks, and launches the app.
with gr.Blocks() as demo:
    # Header
    gr.Markdown("# Webtaset: Website to Dataset Converter")
    gr.Markdown("Extract all text from a URL, apply common-sense filtering, generate synthetic data with lightweight HF models, and download as a dataset. Provide your own HF API token for advanced features.")

    # Inputs
    url = gr.Textbox(label="Enter the URL", placeholder="https://example.com")
    column_text = gr.Textbox(label="Column name for text", value="Text")
    num_synthetic = gr.Slider(label="Number of synthetic data entries", minimum=0, maximum=1000, step=1, value=0)
    hf_model = gr.Dropdown(
        label="Hugging Face Model (lightweight)",
        choices=["distilgpt2", "facebook/bart-base", "gpt2"],
        value="distilgpt2"
    )
    hf_api_token = gr.Textbox(
        label="Hugging Face API Token (required for HF models)",
        type="password",
        placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    )

    # Process button
    process_btn = gr.Button("Process")

    # Outputs
    df_preview = gr.Dataframe(label="Dataset Preview")
    state = gr.State()  # To store the DataFrame between the process and download steps
    status = gr.Textbox(label="Status", interactive=False)

    download_csv_btn = gr.Button("Download CSV")
    download_json_btn = gr.Button("Download JSON")
    csv_file = gr.File(label="Download CSV")
    json_file = gr.File(label="Download JSON")

    # Process function
    def process(url, column_text, num_synthetic, hf_model, hf_api_token):
        """Run the full scrape -> filter -> augment -> sort -> DataFrame pipeline.

        Returns (preview_df, state_df, status_message); on failure returns
        (None, None, error_message) so the UI shows the error instead of
        crashing.

        NOTE(review): range() in generate_synthetic_data requires an int;
        presumably the Slider delivers one given step=1 — confirm, a float
        value would raise TypeError.
        """
        try:
            # Step 1 & 2: Get URL and extract ALL text
            text_list = extract_all_text_from_url(url)

            # Add common-sense filtering
            filtered_text = apply_common_sense(text_list)

            # Generate synthetic data if requested
            if num_synthetic > 0:
                synthetic_data = generate_synthetic_data(filtered_text, num_synthetic, hf_model, hf_api_token)
                filtered_text.extend(synthetic_data)

            # Step 5 & 6: Sort by increasing size
            sorted_text = sort_text_by_length(filtered_text)

            # Step 7: Create DataFrame with user-defined column name
            df = create_dataframe(sorted_text, column_text)

            # Step 8: Return for preview and state
            # filtered_text now includes the synthetic rows, so subtract
            # num_synthetic to report the original filtered count.
            method = "Faker" if not hf_api_token else hf_model
            return df, df, f"Processing complete. Extracted {len(text_list)} items, filtered to {len(filtered_text) - num_synthetic}, added {num_synthetic} synthetic using {method}."
        except Exception as e:
            return None, None, f"Error: {e}"

    # Connect process button
    process_btn.click(
        fn=process,
        inputs=[url, column_text, num_synthetic, hf_model, hf_api_token],
        outputs=[df_preview, state, status]
    )

    # Download CSV function
    def gen_csv(state):
        """Export the stored DataFrame as CSV; no-op if nothing processed yet."""
        if state is None:
            return None
        return download_csv(state)

    # Download JSON function
    def gen_json(state):
        """Export the stored DataFrame as JSON; no-op if nothing processed yet."""
        if state is None:
            return None
        return download_json(state)

    # Connect download buttons
    download_csv_btn.click(fn=gen_csv, inputs=state, outputs=csv_file)
    download_json_btn.click(fn=gen_json, inputs=state, outputs=json_file)

# Launch the app
demo.launch()