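"""Webtaset: convert a website into a downloadable text dataset.

A Gradio app that scrapes all visible text from a URL, applies
common-sense filtering, optionally adds synthetic entries (via the
Hugging Face Inference API, with a Faker fallback), and exports the
result as CSV or JSON.
"""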
import gradio as gr
import requests
from bs4 import BeautifulSoup
import pandas as pd
import tempfile
from faker import Faker
import random
from huggingface_hub import InferenceClient
# Initialize Faker for synthetic data fallback
fake = Faker()
# Function to extract ALL text from a webpage
def extract_all_text_from_url(url):
    try:
        response = requests.get(url, timeout=15)  # timeout so unreachable hosts fail fast
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # stripped_strings already yields whitespace-trimmed, non-empty strings
        return list(soup.stripped_strings)
    except Exception as e:
        raise ValueError(f"Error fetching or parsing the URL: {e}") from e
# Function to apply common-sense filtering
def apply_common_sense(text_list):
    # De-duplicate and drop very short fragments; the set does not preserve order
    filtered = {text for text in text_list if len(text) >= 3 and not text.isspace()}
    return list(filtered)
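# e.g. apply_common_sense(["Hi", "Welcome", "Welcome", "   "]) -> ["Welcome"]
# ("Hi" is too short, duplicates collapse, and ordering is not guaranteed)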
# Function to generate synthetic data using HF Inference API or Faker fallback
def generate_synthetic_data(text_list, num_synthetic, hf_model, hf_api_token):
    synthetic_data = []
    if not text_list:
        text_list = [fake.sentence()]
    if not hf_api_token:
        # Fallback to Faker if no token is provided
        for _ in range(num_synthetic):
            base_text = random.choice(text_list)
            words = base_text.split()
            random.shuffle(words)
            synthetic_data.append(" ".join(words) + " " + fake.sentence(nb_words=random.randint(3, 10)))
    else:
        # Use the HF Inference API
        client = InferenceClient(model=hf_model, token=hf_api_token)
        for _ in range(num_synthetic):
            base_text = random.choice(text_list)
            try:
                prompt = f"Generate a creative variation of this text: '{base_text}'"
                # text_generation takes max_new_tokens, not max_length
                generated = client.text_generation(prompt, max_new_tokens=50, temperature=0.7)
                synthetic_data.append(generated.strip())
            except Exception:
                # Fall back to Faker for this entry if the API call fails
                synthetic_data.append(fake.sentence() + " " + " ".join(random.sample(base_text.split(), min(len(base_text.split()), 5))))
    return synthetic_data
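# Illustrative only: with no token, the Faker path mixes a shuffled source
# string with a random sentence, e.g.
#   generate_synthetic_data(["hello world"], 1, "distilgpt2", None)
#   -> ["world hello Sound central manager month."]
# (output is random and varies per run)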
# Function to sort text by length
def sort_text_by_length(text_list):
    return sorted(text_list, key=len)
# Function to create a DataFrame with only a text column
def create_dataframe(text_list, column_text):
    return pd.DataFrame({column_text: text_list})
# Function to generate a CSV file
def download_csv(df):
    with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as tmp:
        df.to_csv(tmp.name, index=False)
    return tmp.name
# Function to generate a JSON file
def download_json(df):
    with tempfile.NamedTemporaryFile(delete=False, suffix='.json') as tmp:
        df.to_json(tmp.name, orient='records')
    return tmp.name
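# The same pipeline can be exercised without the UI (URL is illustrative):
#   texts = apply_common_sense(extract_all_text_from_url("https://example.com"))
#   df = create_dataframe(sort_text_by_length(texts), "Text")
#   csv_path = download_csv(df)  # path to a temporary CSV file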
# Gradio interface
with gr.Blocks() as demo:
    # Header
    gr.Markdown("# Webtaset: Website to Dataset Converter")
    gr.Markdown("Extract all text from a URL, apply common-sense filtering, generate synthetic data with lightweight HF models, and download the result as a dataset. Provide your own HF API token for advanced features.")
    # Inputs
    url = gr.Textbox(label="Enter the URL", placeholder="https://example.com")
    column_text = gr.Textbox(label="Column name for text", value="Text")
    num_synthetic = gr.Slider(label="Number of synthetic data entries", minimum=0, maximum=1000, step=1, value=0)
    hf_model = gr.Dropdown(
        label="Hugging Face Model (lightweight)",
        # Note: facebook/bart-base is a seq2seq model and may not support the
        # text-generation endpoint; failed calls fall back to Faker per entry
        choices=["distilgpt2", "facebook/bart-base", "gpt2"],
        value="distilgpt2"
    )
    hf_api_token = gr.Textbox(
        label="Hugging Face API Token (required for HF models)",
        type="password",
        placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    )
    # Process button
    process_btn = gr.Button("Process")
    # Outputs
    df_preview = gr.Dataframe(label="Dataset Preview")
    state = gr.State()  # Stores the processed DataFrame
    status = gr.Textbox(label="Status", interactive=False)
    download_csv_btn = gr.Button("Download CSV")
    download_json_btn = gr.Button("Download JSON")
    csv_file = gr.File(label="Download CSV")
    json_file = gr.File(label="Download JSON")
    # Process function
    def process(url, column_text, num_synthetic, hf_model, hf_api_token):
        try:
            # Extract all text from the URL
            text_list = extract_all_text_from_url(url)
            # Apply common-sense filtering
            filtered_text = apply_common_sense(text_list)
            # Generate synthetic data if requested
            if num_synthetic > 0:
                synthetic_data = generate_synthetic_data(filtered_text, num_synthetic, hf_model, hf_api_token)
                filtered_text.extend(synthetic_data)
            # Sort by increasing length
            sorted_text = sort_text_by_length(filtered_text)
            # Build the DataFrame with the user-defined column name
            df = create_dataframe(sorted_text, column_text)
            # Return the DataFrame for both the preview and the state
            method = "Faker" if not hf_api_token else hf_model
            return df, df, f"Processing complete. Extracted {len(text_list)} items, filtered to {len(filtered_text) - num_synthetic}, added {num_synthetic} synthetic entries using {method}."
        except Exception as e:
            return None, None, f"Error: {e}"
    # Connect the process button
    process_btn.click(
        fn=process,
        inputs=[url, column_text, num_synthetic, hf_model, hf_api_token],
        outputs=[df_preview, state, status]
    )
    # Download CSV function
    def gen_csv(state):
        if state is None:
            return None
        return download_csv(state)
    # Download JSON function
    def gen_json(state):
        if state is None:
            return None
        return download_json(state)
    # Connect the download buttons
    download_csv_btn.click(fn=gen_csv, inputs=state, outputs=csv_file)
    download_json_btn.click(fn=gen_json, inputs=state, outputs=json_file)
# Launch the app
demo.launch()