Update app.py
Browse files
app.py
CHANGED
|
@@ -231,7 +231,66 @@ def process_url_list(url_text: str, progress=gr.Progress()) -> str:
|
|
| 231 |
except Exception as e:
|
| 232 |
return f"Error processing URLs: {str(e)}"
|
| 233 |
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
def create_interface():
|
| 236 |
with gr.Blocks(title="Website Category Classifier") as interface:
|
| 237 |
gr.HTML("<h1>π Website Category Classifier</h1>")
|
|
@@ -240,35 +299,41 @@ def create_interface():
|
|
| 240 |
with gr.Row():
|
| 241 |
with gr.Column():
|
| 242 |
url_input = gr.Textbox(
|
| 243 |
-
label="
|
| 244 |
-
placeholder="https://
|
| 245 |
-
lines=
|
| 246 |
-
max_lines=20
|
| 247 |
)
|
| 248 |
|
| 249 |
-
process_btn = gr.Button("π Classify
|
| 250 |
|
| 251 |
with gr.Column():
|
| 252 |
-
|
| 253 |
-
label="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
lines=15,
|
| 255 |
-
max_lines=
|
| 256 |
interactive=False
|
| 257 |
)
|
| 258 |
|
| 259 |
# Examples
|
| 260 |
gr.Examples(
|
| 261 |
examples=[
|
| 262 |
-
["https://news.google.com
|
| 263 |
-
["https://
|
|
|
|
| 264 |
],
|
| 265 |
inputs=[url_input],
|
| 266 |
)
|
| 267 |
|
| 268 |
process_btn.click(
|
| 269 |
-
fn=
|
| 270 |
inputs=[url_input],
|
| 271 |
-
outputs=[
|
| 272 |
show_progress=True
|
| 273 |
)
|
| 274 |
|
|
|
|
| 231 |
except Exception as e:
|
| 232 |
return f"Error processing URLs: {str(e)}"
|
| 233 |
|
| 234 |
+
|
| 235 |
+
def process_single_url(url: str, progress=gr.Progress()) -> tuple[str, str]:
    """Process a single URL and return both scraped text and prediction.

    Args:
        url: Website address to classify; a missing scheme is defaulted
            to ``https://``.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        A ``(prediction, scraped_display)`` tuple. On failure the first
        element is an error message and the second is an empty string.
        Pages with fewer than 150 characters of text yield ``("Short", …)``.
    """
    if not url.strip():
        return "Please provide a URL to process.", ""

    # Clean the URL
    url = url.strip()
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url

    try:
        progress(0.1, desc="Scraping website...")

        # Scrape the URL. follow_redirects=True is required: httpx does
        # NOT follow redirects by default, so common hosts that 301 from
        # the apex domain (e.g. https://amazon.com) would otherwise fail
        # with "Error: HTTP 301".
        import httpx
        with httpx.Client(timeout=30.0, follow_redirects=True) as client:
            response = client.get(url)

        if response.status_code != 200:
            return f"Error: HTTP {response.status_code}", ""

        # Extract visible text content with BeautifulSoup
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove script and style elements so their contents don't leak
        # into the classified text
        for script in soup(["script", "style"]):
            script.decompose()

        # Get text content
        scraped_text = soup.get_text()

        # Collapse all whitespace runs (newlines, tabs, repeated spaces)
        # into single spaces
        scraped_text = ' '.join(scraped_text.split())

        # Limit text length for display
        scraped_display = scraped_text[:2000] + "..." if len(scraped_text) > 2000 else scraped_text

        progress(0.5, desc="Translating text...")

        # Pages with too little text can't be classified meaningfully
        if len(scraped_text) < 150:
            return "Short", scraped_display

        # Translate text (truncated to stay under the translator's limit)
        translated = translate_text(scraped_text[:4990])

        progress(0.8, desc="Classifying website...")

        # Get prediction using GPU
        prediction = predict_inference(translated)

        return prediction, scraped_display

    except Exception as e:
        # Top-level UI boundary: surface a truncated error instead of
        # crashing the Gradio handler
        error_msg = f"Error processing URL: {str(e)[:200]}"
        return error_msg, ""
|
| 294 |
def create_interface():
|
| 295 |
with gr.Blocks(title="Website Category Classifier") as interface:
|
| 296 |
gr.HTML("<h1>π Website Category Classifier</h1>")
|
|
|
|
| 299 |
with gr.Row():
|
| 300 |
with gr.Column():
|
| 301 |
url_input = gr.Textbox(
|
| 302 |
+
label="Website URL",
|
| 303 |
+
placeholder="https://example.com",
|
| 304 |
+
lines=1
|
|
|
|
| 305 |
)
|
| 306 |
|
| 307 |
+
process_btn = gr.Button("π Classify Website", variant="primary")
|
| 308 |
|
| 309 |
with gr.Column():
|
| 310 |
+
prediction_output = gr.Textbox(
|
| 311 |
+
label="Classification Result",
|
| 312 |
+
lines=2,
|
| 313 |
+
interactive=False
|
| 314 |
+
)
|
| 315 |
+
|
| 316 |
+
scraped_output = gr.Textbox(
|
| 317 |
+
label="Scraped Content (first 2000 chars)",
|
| 318 |
lines=15,
|
| 319 |
+
max_lines=20,
|
| 320 |
interactive=False
|
| 321 |
)
|
| 322 |
|
| 323 |
# Examples
|
| 324 |
gr.Examples(
|
| 325 |
examples=[
|
| 326 |
+
["https://news.google.com"],
|
| 327 |
+
["https://amazon.com"],
|
| 328 |
+
["https://github.com"]
|
| 329 |
],
|
| 330 |
inputs=[url_input],
|
| 331 |
)
|
| 332 |
|
| 333 |
process_btn.click(
|
| 334 |
+
fn=process_single_url,
|
| 335 |
inputs=[url_input],
|
| 336 |
+
outputs=[prediction_output, scraped_output],
|
| 337 |
show_progress=True
|
| 338 |
)
|
| 339 |
|