Update app.py
Browse files
app.py
CHANGED
|
@@ -231,7 +231,66 @@ def process_url_list(url_text: str, progress=gr.Progress()) -> str:
|
|
| 231 |
except Exception as e:
|
| 232 |
return f"Error processing URLs: {str(e)}"
|
| 233 |
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
def create_interface():
|
| 236 |
with gr.Blocks(title="Website Category Classifier") as interface:
|
| 237 |
gr.HTML("<h1>π Website Category Classifier</h1>")
|
|
@@ -240,35 +299,41 @@ def create_interface():
|
|
| 240 |
with gr.Row():
|
| 241 |
with gr.Column():
|
| 242 |
url_input = gr.Textbox(
|
| 243 |
-
label="
|
| 244 |
-
placeholder="https://
|
| 245 |
-
lines=
|
| 246 |
-
max_lines=20
|
| 247 |
)
|
| 248 |
|
| 249 |
-
process_btn = gr.Button("π Classify
|
| 250 |
|
| 251 |
with gr.Column():
|
| 252 |
-
|
| 253 |
-
label="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
lines=15,
|
| 255 |
-
max_lines=
|
| 256 |
interactive=False
|
| 257 |
)
|
| 258 |
|
| 259 |
# Examples
|
| 260 |
gr.Examples(
|
| 261 |
examples=[
|
| 262 |
-
["https://news.google.com
|
| 263 |
-
["https://
|
|
|
|
| 264 |
],
|
| 265 |
inputs=[url_input],
|
| 266 |
)
|
| 267 |
|
| 268 |
process_btn.click(
|
| 269 |
-
fn=
|
| 270 |
inputs=[url_input],
|
| 271 |
-
outputs=[
|
| 272 |
show_progress=True
|
| 273 |
)
|
| 274 |
|
|
|
|
| 231 |
except Exception as e:
|
| 232 |
return f"Error processing URLs: {str(e)}"
|
| 233 |
|
| 234 |
+
|
| 235 |
+
def process_single_url(url: str, progress=gr.Progress()) -> tuple[str, str]:
    """Process a single URL and return both scraped text and prediction.

    Args:
        url: Website address to classify; a missing scheme is defaulted
            to ``https://``.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        A ``(prediction, scraped_display)`` tuple. On failure the first
        element is an error message and the second is an empty string.
        Pages with fewer than 150 characters of text yield ``("Short", …)``.
    """
    if not url.strip():
        return "Please provide a URL to process.", ""

    # Clean the URL
    url = url.strip()
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    try:
        progress(0.1, desc="Scraping website...")

        # Scrape the URL. follow_redirects=True is required: httpx does
        # NOT follow redirects by default, so common hosts that 301 from
        # the apex domain (e.g. https://amazon.com) would otherwise fail
        # with "Error: HTTP 301".
        import httpx
        with httpx.Client(timeout=30.0, follow_redirects=True) as client:
            response = client.get(url)

        if response.status_code != 200:
            return f"Error: HTTP {response.status_code}", ""

        # Extract visible text content with BeautifulSoup
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove script and style elements so their contents don't leak
        # into the classified text
        for script in soup(["script", "style"]):
            script.decompose()

        # Get text content
        scraped_text = soup.get_text()

        # Collapse all whitespace runs (newlines, tabs, repeated spaces)
        # into single spaces
        scraped_text = ' '.join(scraped_text.split())

        # Limit text length for display
        scraped_display = scraped_text[:2000] + "..." if len(scraped_text) > 2000 else scraped_text

        progress(0.5, desc="Translating text...")

        # Pages with too little text can't be classified meaningfully
        if len(scraped_text) < 150:
            return "Short", scraped_display

        # Translate text (truncated to stay under the translator's limit)
        translated = translate_text(scraped_text[:4990])

        progress(0.8, desc="Classifying website...")

        # Get prediction using GPU
        prediction = predict_inference(translated)

        return prediction, scraped_display

    except Exception as e:
        # Top-level UI boundary: surface a truncated error instead of
        # crashing the Gradio handler
        error_msg = f"Error processing URL: {str(e)[:200]}"
        return error_msg, ""
|
| 294 |
def create_interface():
|
| 295 |
with gr.Blocks(title="Website Category Classifier") as interface:
|
| 296 |
gr.HTML("<h1>π Website Category Classifier</h1>")
|
|
|
|
| 299 |
with gr.Row():
|
| 300 |
with gr.Column():
|
| 301 |
url_input = gr.Textbox(
|
| 302 |
+
label="Website URL",
|
| 303 |
+
placeholder="https://example.com",
|
| 304 |
+
lines=1
|
|
|
|
| 305 |
)
|
| 306 |
|
| 307 |
+
process_btn = gr.Button("π Classify Website", variant="primary")
|
| 308 |
|
| 309 |
with gr.Column():
|
| 310 |
+
prediction_output = gr.Textbox(
|
| 311 |
+
label="Classification Result",
|
| 312 |
+
lines=2,
|
| 313 |
+
interactive=False
|
| 314 |
+
)
|
| 315 |
+
|
| 316 |
+
scraped_output = gr.Textbox(
|
| 317 |
+
label="Scraped Content (first 2000 chars)",
|
| 318 |
lines=15,
|
| 319 |
+
max_lines=20,
|
| 320 |
interactive=False
|
| 321 |
)
|
| 322 |
|
| 323 |
# Examples
|
| 324 |
gr.Examples(
|
| 325 |
examples=[
|
| 326 |
+
["https://news.google.com"],
|
| 327 |
+
["https://amazon.com"],
|
| 328 |
+
["https://github.com"]
|
| 329 |
],
|
| 330 |
inputs=[url_input],
|
| 331 |
)
|
| 332 |
|
| 333 |
process_btn.click(
|
| 334 |
+
fn=process_single_url,
|
| 335 |
inputs=[url_input],
|
| 336 |
+
outputs=[prediction_output, scraped_output],
|
| 337 |
show_progress=True
|
| 338 |
)
|
| 339 |
|