limitedonly41 committed on
Commit
d78fe22
·
verified ·
1 Parent(s): e3e06ae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -13
app.py CHANGED
@@ -231,7 +231,66 @@ def process_url_list(url_text: str, progress=gr.Progress()) -> str:
231
  except Exception as e:
232
  return f"Error processing URLs: {str(e)}"
233
 
234
- # Create Gradio interface
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
235
  def create_interface():
236
  with gr.Blocks(title="Website Category Classifier") as interface:
237
  gr.HTML("<h1>πŸ” Website Category Classifier</h1>")
@@ -240,35 +299,41 @@ def create_interface():
240
  with gr.Row():
241
  with gr.Column():
242
  url_input = gr.Textbox(
243
- label="URLs (one per line)",
244
- placeholder="https://example1.com\nhttps://example2.com\nhttps://example3.com",
245
- lines=10,
246
- max_lines=20
247
  )
248
 
249
- process_btn = gr.Button("πŸš€ Classify Websites", variant="primary")
250
 
251
  with gr.Column():
252
- output = gr.Textbox(
253
- label="Results",
 
 
 
 
 
 
254
  lines=15,
255
- max_lines=30,
256
  interactive=False
257
  )
258
 
259
  # Examples
260
  gr.Examples(
261
  examples=[
262
- ["https://news.google.com\nhttps://amazon.com\nhttps://github.com"],
263
- ["https://techcrunch.com\nhttps://shopify.com\nhttps://stackoverflow.com"]
 
264
  ],
265
  inputs=[url_input],
266
  )
267
 
268
  process_btn.click(
269
- fn=process_url_list,
270
  inputs=[url_input],
271
- outputs=[output],
272
  show_progress=True
273
  )
274
 
 
231
  except Exception as e:
232
  return f"Error processing URLs: {str(e)}"
233
 
234
+
235
def process_single_url(url: str, progress=gr.Progress()) -> tuple[str, str]:
    """Scrape one website, translate its text, and classify it.

    Args:
        url: Address of the site to classify. Surrounding whitespace is
            ignored and ``https://`` is prepended when no scheme is given.
        progress: Progress tracker called with a fraction and a description
            at each stage (scrape, translate, classify).

    Returns:
        A ``(prediction, scraped_text)`` pair. ``prediction`` is the value
        returned by ``predict_inference``, the literal ``"Short"`` when fewer
        than 150 characters of text were scraped, or an error message.
        ``scraped_text`` is the cleaned page text truncated to 2000
        characters for display; it is empty whenever an error is reported.
    """
    # Tolerate a missing value as well as an empty / whitespace-only one.
    url = (url or "").strip()
    if not url:
        return "Please provide a URL to process.", ""

    # Default to HTTPS when the user typed a bare host name. The check is
    # case-insensitive so "HTTP://example.com" is not prefixed a second time.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    # NOTE(review): the URL is user-supplied and fetched server-side; consider
    # rejecting private/internal hosts if this app is exposed publicly.
    try:
        progress(0.1, desc="Scraping website...")

        # Third-party imports are kept local, as in the original block.
        import httpx
        from bs4 import BeautifulSoup

        # httpx does not follow redirects unless asked to. Without this flag
        # any site answering with a 301/302 (e.g. bare domain -> www) would be
        # reported as an HTTP error instead of being scraped.
        with httpx.Client(timeout=30.0, follow_redirects=True) as client:
            response = client.get(url)

        if response.status_code != 200:
            return f"Error: HTTP {response.status_code}", ""

        soup = BeautifulSoup(response.text, "html.parser")

        # Script and style bodies are not visible page text; drop them.
        for element in soup(["script", "style"]):
            element.decompose()

        # Collapse every run of whitespace (newlines, tabs, repeated spaces)
        # into a single space.
        scraped_text = " ".join(soup.get_text().split())

        # Limit text length for display.
        if len(scraped_text) > 2000:
            scraped_display = scraped_text[:2000] + "..."
        else:
            scraped_display = scraped_text

        # Too little text to classify: report it before announcing the
        # translation step, which never runs in this case.
        if len(scraped_text) < 150:
            return "Short", scraped_display

        progress(0.5, desc="Translating text...")
        # Only the first 4990 characters are sent to the translator.
        translated = translate_text(scraped_text[:4990])

        progress(0.8, desc="Classifying website...")
        prediction = predict_inference(translated)

        return prediction, scraped_display

    except Exception as e:
        # UI boundary: surface a truncated message instead of crashing the app.
        error_msg = f"Error processing URL: {str(e)[:200]}"
        return error_msg, ""
294
  def create_interface():
295
  with gr.Blocks(title="Website Category Classifier") as interface:
296
  gr.HTML("<h1>πŸ” Website Category Classifier</h1>")
 
299
  with gr.Row():
300
  with gr.Column():
301
  url_input = gr.Textbox(
302
+ label="Website URL",
303
+ placeholder="https://example.com",
304
+ lines=1
 
305
  )
306
 
307
+ process_btn = gr.Button("πŸš€ Classify Website", variant="primary")
308
 
309
  with gr.Column():
310
+ prediction_output = gr.Textbox(
311
+ label="Classification Result",
312
+ lines=2,
313
+ interactive=False
314
+ )
315
+
316
+ scraped_output = gr.Textbox(
317
+ label="Scraped Content (first 2000 chars)",
318
  lines=15,
319
+ max_lines=20,
320
  interactive=False
321
  )
322
 
323
  # Examples
324
  gr.Examples(
325
  examples=[
326
+ ["https://news.google.com"],
327
+ ["https://amazon.com"],
328
+ ["https://github.com"]
329
  ],
330
  inputs=[url_input],
331
  )
332
 
333
  process_btn.click(
334
+ fn=process_single_url,
335
  inputs=[url_input],
336
+ outputs=[prediction_output, scraped_output],
337
  show_progress=True
338
  )
339