Phase-Technologies committed on
Commit
6e6a649
·
verified ·
1 Parent(s): 5153619

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -0
app.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+ import pandas as pd
5
+ import tempfile
6
+ from faker import Faker
7
+ import random
8
+ from huggingface_hub import InferenceClient
9
+
10
+ # Initialize Faker for synthetic data fallback
11
+ fake = Faker()
12
+
13
+ # Function to extract ALL text from a webpage
14
def extract_all_text_from_url(url, timeout=10):
    """Download a web page and return every text fragment found in it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            an explicit timeout ``requests`` may wait indefinitely on an
            unresponsive host, which would freeze the UI request.

    Returns:
        A list of non-empty, whitespace-stripped strings, in the order
        BeautifulSoup yields them.

    Raises:
        ValueError: If the request fails (connection error, timeout,
            non-success HTTP status) or the response cannot be parsed.
            The original exception is attached as ``__cause__``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Strip and filter again defensively so callers never see blanks.
        return [text.strip() for text in soup.stripped_strings if text.strip()]
    except Exception as e:
        # Callers only catch ValueError; chain the cause for debugging.
        raise ValueError(f"Error fetching or parsing the URL: {e}") from e
23
+
24
+ # Function to apply common-sense filtering
25
def apply_common_sense(text_list):
    """Remove trivial fragments and duplicates, keeping first-seen order.

    A fragment is kept when it is at least three characters long and is
    not made up solely of whitespace.

    Args:
        text_list: Iterable of strings to filter.

    Returns:
        A new list of the surviving strings with duplicates removed. The
        order of first occurrence is preserved so results are reproducible
        between runs (a plain ``set`` would reorder strings arbitrarily).
    """
    kept = (text for text in text_list if len(text) >= 3 and not text.isspace())
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(kept))
28
+
29
+ # Function to generate synthetic data using HF Inference API or Faker fallback
30
def generate_synthetic_data(text_list, num_synthetic, hf_model, hf_api_token):
    """Create synthetic text entries derived from the extracted fragments.

    When no API token is given, each entry is a shuffled copy of a random
    source fragment followed by a Faker sentence. With a token, each entry
    is requested from the Hugging Face Inference API; if a single request
    fails, that entry falls back to a Faker sentence plus up to five words
    sampled from the source fragment.

    Args:
        text_list: Source fragments to vary. If empty, one Faker sentence
            is used as the only source.
        num_synthetic: Number of entries to produce. Coerced to ``int``;
            zero or negative values yield an empty list.
        hf_model: Model identifier passed to ``InferenceClient``.
        hf_api_token: Hugging Face API token; falsy selects the Faker path.

    Returns:
        A list with ``num_synthetic`` generated strings.
    """
    # NOTE(review): UI sliders may deliver the count as a float - coerce so
    # range() below always receives an int.
    num_synthetic = int(num_synthetic)
    if num_synthetic <= 0:
        return []

    if not text_list:
        text_list = [fake.sentence()]

    synthetic_data = []

    if not hf_api_token:
        # Fallback to Faker if no token provided
        for _ in range(num_synthetic):
            words = random.choice(text_list).split()
            random.shuffle(words)
            synthetic_data.append(" ".join(words) + " " + fake.sentence(nb_words=random.randint(3, 10)))
        return synthetic_data

    # Use HF Inference API
    client = InferenceClient(model=hf_model, token=hf_api_token)
    for _ in range(num_synthetic):
        base_text = random.choice(text_list)
        prompt = f"Generate a creative variation of this text: '{base_text}'"
        try:
            # text_generation has no ``max_length`` argument; passing it
            # raised TypeError, which the handler below swallowed, so the
            # model was never actually used. ``max_new_tokens`` is the
            # supported length limit.
            generated = client.text_generation(prompt, max_new_tokens=50, temperature=0.7)
            synthetic_data.append(generated.strip())
        except Exception:
            # Best effort: keep the requested count even if a call fails.
            words = base_text.split()
            synthetic_data.append(fake.sentence() + " " + " ".join(random.sample(words, min(len(words), 5))))

    return synthetic_data
55
+
56
+ # Function to sort text by length
57
def sort_text_by_length(text_list):
    """Return a new list of the given strings ordered from shortest to longest.

    Items of equal length keep their relative input order; the input
    itself is not modified.
    """
    ordered = list(text_list)
    ordered.sort(key=len)
    return ordered
59
+
60
+ # Function to create a DataFrame with only a text column
61
def create_dataframe(text_list, column_text):
    """Build a single-column DataFrame holding the given text entries.

    Args:
        text_list: Values for the column, one row per entry.
        column_text: Name to give the column.

    Returns:
        A pandas DataFrame with one column named ``column_text``.
    """
    columns = {column_text: text_list}
    return pd.DataFrame(columns)
64
+
65
+ # Function to generate a CSV file
66
def download_csv(df):
    """Write a DataFrame to a temporary CSV file and return the file's path.

    Args:
        df: DataFrame to serialise (written without the index column).

    Returns:
        Path of the created ``.csv`` file. The file is not deleted
        automatically; the caller owns its lifetime.
    """
    # Reserve a unique path, then close the handle before pandas opens it:
    # a NamedTemporaryFile cannot be reopened by name on Windows while the
    # original handle is still open.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as tmp:
        path = tmp.name
    df.to_csv(path, index=False)
    return path
70
+
71
+ # Function to generate a JSON file
72
def download_json(df):
    """Write a DataFrame to a temporary JSON file and return the file's path.

    Args:
        df: DataFrame to serialise as a list of row records.

    Returns:
        Path of the created ``.json`` file. The file is not deleted
        automatically; the caller owns its lifetime.
    """
    # Reserve a unique path, then close the handle before pandas opens it:
    # a NamedTemporaryFile cannot be reopened by name on Windows while the
    # original handle is still open.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.json') as tmp:
        path = tmp.name
    df.to_json(path, orient='records')
    return path
76
+
77
+ # Gradio interface
78
with gr.Blocks() as demo:
    # Header
    gr.Markdown("# Webtaset: Website to Dataset Converter")
    gr.Markdown("Extract all text from a URL, apply common-sense filtering, generate synthetic data with lightweight HF models, and download as a dataset. Provide your own HF API token for advanced features.")

    # Inputs
    url = gr.Textbox(label="Enter the URL", placeholder="https://example.com")
    column_text = gr.Textbox(label="Column name for text", value="Text")
    num_synthetic = gr.Slider(label="Number of synthetic data entries", minimum=0, maximum=1000, step=1, value=0)
    hf_model = gr.Dropdown(
        label="Hugging Face Model (lightweight)",
        choices=["distilgpt2", "facebook/bart-base", "gpt2"],
        value="distilgpt2"
    )
    hf_api_token = gr.Textbox(
        label="Hugging Face API Token (required for HF models)",
        type="password",
        placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
    )

    # Process button
    process_btn = gr.Button("Process")

    # Outputs
    df_preview = gr.Dataframe(label="Dataset Preview")
    state = gr.State()  # Holds the latest DataFrame for the download handlers
    status = gr.Textbox(label="Status", interactive=False)

    download_csv_btn = gr.Button("Download CSV")
    download_json_btn = gr.Button("Download JSON")
    csv_file = gr.File(label="Download CSV")
    json_file = gr.File(label="Download JSON")

    def process(url, column_text, num_synthetic, hf_model, hf_api_token):
        """Run the extract -> filter -> synthesise -> sort -> DataFrame pipeline.

        Returns a ``(preview, state, status)`` triple. On any failure the
        first two items are ``None`` and the status text carries the error.
        """
        try:
            # Normalise raw widget values before doing any work.
            url = (url or "").strip()
            if not url:
                raise ValueError("Please enter a URL.")
            column_text = (column_text or "").strip() or "Text"
            num_synthetic = int(num_synthetic or 0)

            # Extract ALL text from the page
            text_list = extract_all_text_from_url(url)

            # Apply common-sense filtering
            filtered_text = apply_common_sense(text_list)
            # Record the count now instead of subtracting after the merge,
            # so the status stays correct whatever the generator returns.
            filtered_count = len(filtered_text)

            # Generate synthetic data if requested
            synthetic_data = []
            if num_synthetic > 0:
                synthetic_data = generate_synthetic_data(filtered_text, num_synthetic, hf_model, hf_api_token)

            # Sort by increasing size
            sorted_text = sort_text_by_length(filtered_text + synthetic_data)

            # Create DataFrame with user-defined column name
            df = create_dataframe(sorted_text, column_text)

            # Return for preview and state
            method = "Faker" if not hf_api_token else hf_model
            return df, df, f"Processing complete. Extracted {len(text_list)} items, filtered to {filtered_count}, added {len(synthetic_data)} synthetic using {method}."
        except Exception as e:
            # UI boundary: surface the problem in the status box.
            return None, None, f"Error: {e}"

    # Connect process button
    process_btn.click(
        fn=process,
        inputs=[url, column_text, num_synthetic, hf_model, hf_api_token],
        outputs=[df_preview, state, status]
    )

    def gen_csv(state):
        """Return a CSV file path for the stored DataFrame, or None if nothing was processed."""
        if state is None:
            return None
        return download_csv(state)

    def gen_json(state):
        """Return a JSON file path for the stored DataFrame, or None if nothing was processed."""
        if state is None:
            return None
        return download_json(state)

    # Connect download buttons
    download_csv_btn.click(fn=gen_csv, inputs=state, outputs=csv_file)
    download_json_btn.click(fn=gen_json, inputs=state, outputs=json_file)

# Launch the app
demo.launch()