limitedonly41 committed on
Commit
3efbebc
Β·
verified Β·
1 Parent(s): 705ae9c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +121 -204
app.py CHANGED
@@ -3,66 +3,20 @@ import spaces
3
  import asyncio
4
  import json
5
  import time
6
- from typing import List, Dict, Any
7
  from datetime import datetime, timezone
8
- import httpx
9
  from deep_translator import GoogleTranslator
10
  import torch
11
  from torch.amp import autocast
12
- # from unsloth import FastLanguageModel
 
13
 
14
- # Initialize model globally (outside GPU decorator)
15
  max_seq_length = 2048
16
  dtype = None
17
  load_in_4bit = True
18
  peft_model_name = "limitedonly41/website_mistral7b_v02"
19
 
20
- # # Load model once at startup
21
- # print("Loading model...")
22
- # model, tokenizer = FastLanguageModel.from_pretrained(
23
- # model_name=peft_model_name,
24
- # max_seq_length=max_seq_length,
25
- # dtype=dtype,
26
- # load_in_4bit=load_in_4bit,
27
- # )
28
- # FastLanguageModel.for_inference(model)
29
- # print("Model loaded successfully")
30
-
31
-
32
-
33
- # In-memory storage (replacing Redis)
34
- task_storage = {}
35
- task_counter = 0
36
-
37
- class TaskManager:
38
- def __init__(self):
39
- self.tasks = {}
40
-
41
- def create_task(self, urls: List[str]) -> str:
42
- global task_counter
43
- task_counter += 1
44
- task_id = f"task_{task_counter}"
45
-
46
- self.tasks[task_id] = {
47
- "total": len(urls),
48
- "completed": 0,
49
- "scraped": 0,
50
- "status": "processing",
51
- "urls": urls,
52
- "results": {},
53
- "created_time": datetime.now(timezone.utc).isoformat()
54
- }
55
- return task_id
56
-
57
- def update_progress(self, task_id: str, field: str, value: Any):
58
- if task_id in self.tasks:
59
- self.tasks[task_id][field] = value
60
-
61
- def get_task(self, task_id: str) -> Dict:
62
- return self.tasks.get(task_id, {})
63
-
64
- task_manager = TaskManager()
65
-
66
  def translate_text(text: str) -> str:
67
  """Translate text to English"""
68
  try:
@@ -73,8 +27,6 @@ def translate_text(text: str) -> str:
73
  print(f"Translation error: {e}")
74
  return text[:4990]
75
 
76
-
77
-
78
  @spaces.GPU
79
  def predict_inference(translated_text: str) -> str:
80
  """GPU-accelerated inference function"""
@@ -85,11 +37,6 @@ def predict_inference(translated_text: str) -> str:
85
  from unsloth import FastLanguageModel
86
 
87
  # Load model INSIDE the GPU function
88
- max_seq_length = 2048
89
- dtype = None
90
- load_in_4bit = True
91
- peft_model_name = "limitedonly41/website_mistral7b_v02"
92
-
93
  model, tokenizer = FastLanguageModel.from_pretrained(
94
  model_name=peft_model_name,
95
  max_seq_length=max_seq_length,
@@ -130,109 +77,55 @@ Categorize the website into one of the 3 categories:\n\n1) OTHER \n2) NEWS/BLOG\
130
  print(f"Inference error: {e}")
131
  return 'ERROR'
132
 
133
- async def scrape_single_url(session: httpx.AsyncClient, url: str) -> Dict:
134
- """Scrape a single URL"""
135
  try:
136
- response = await session.get(url, timeout=30.0)
137
- if response.status_code == 200:
138
- # Simple text extraction (you can enhance this)
139
- text_content = response.text[:5000] # Limit content
140
- return {
141
- "url": url,
142
- "text": text_content,
143
- "status": "success"
 
 
 
144
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  else:
146
- return {
147
- "url": url,
148
- "text": "",
149
- "status": f"error_{response.status_code}"
150
- }
151
- except Exception as e:
152
- return {
153
- "url": url,
154
- "text": "",
155
- "status": f"error_{str(e)[:100]}"
156
- }
157
-
158
- async def process_urls_batch(urls: List[str], progress_callback=None) -> Dict[str, str]:
159
- """Process a batch of URLs"""
160
- task_id = task_manager.create_task(urls)
161
- results = {}
162
-
163
- async with httpx.AsyncClient() as client:
164
- for i, url in enumerate(urls):
165
- try:
166
- # Scrape URL
167
- scraped_data = await scrape_single_url(client, url)
168
- task_manager.update_progress(task_id, "scraped", i + 1)
169
-
170
- # Process text
171
- text = scraped_data.get("text", "")
172
-
173
- if len(text) < 150:
174
- prediction = "Short"
175
- else:
176
- # Translate text
177
- translated = translate_text(text)
178
- # Get prediction using GPU
179
- prediction = predict_inference(translated)
180
-
181
- results[url] = prediction
182
- task_manager.update_progress(task_id, "completed", i + 1)
183
-
184
- # Update progress
185
- if progress_callback:
186
- progress = f"Processed {i + 1}/{len(urls)} URLs"
187
- progress_callback(progress)
188
-
189
- except Exception as e:
190
- results[url] = f"Error: {str(e)[:100]}"
191
-
192
- task_manager.update_progress(task_id, "status", "completed")
193
- task_manager.update_progress(task_id, "results", results)
194
-
195
- return results
196
-
197
- def process_url_list(url_text: str, progress=gr.Progress()) -> str:
198
- """Main processing function for Gradio interface"""
199
- if not url_text.strip():
200
- return "Please provide URLs to process."
201
-
202
- # Parse URLs
203
- urls = [url.strip() for url in url_text.strip().split('\n') if url.strip()]
204
-
205
- if not urls:
206
- return "No valid URLs found."
207
-
208
- if len(urls) > 50: # Limit for demo
209
- return f"Too many URLs ({len(urls)}). Please limit to 50 URLs."
210
-
211
- try:
212
- # Process URLs
213
- progress(0, desc="Starting processing...")
214
-
215
- def progress_callback(msg):
216
- progress(None, desc=msg)
217
-
218
- # Run async function
219
- loop = asyncio.new_event_loop()
220
- asyncio.set_event_loop(loop)
221
- results = loop.run_until_complete(process_urls_batch(urls, progress_callback))
222
- loop.close()
223
-
224
- # Format results
225
- output_lines = []
226
- for url, prediction in results.items():
227
- output_lines.append(f"{url} β†’ {prediction}")
228
-
229
- return "\n".join(output_lines)
230
-
231
  except Exception as e:
232
- return f"Error processing URLs: {str(e)}"
233
 
234
-
235
- def process_single_url(url: str, progress=gr.Progress()) -> tuple[str, str]:
236
  """Process a single URL and return both scraped text and prediction"""
237
  if not url.strip():
238
  return "Please provide a URL to process.", ""
@@ -245,90 +138,114 @@ def process_single_url(url: str, progress=gr.Progress()) -> tuple[str, str]:
245
  try:
246
  progress(0.1, desc="Scraping website...")
247
 
248
- # Scrape the URL
249
- import httpx
250
- with httpx.Client(timeout=30.0) as client:
251
- response = client.get(url)
252
-
253
- if response.status_code != 200:
254
- return f"Error: HTTP {response.status_code}", ""
255
-
256
- # Extract text content (you can enhance this with BeautifulSoup)
257
- from bs4 import BeautifulSoup
258
- soup = BeautifulSoup(response.text, 'html.parser')
259
 
260
- # Remove script and style elements
261
- for script in soup(["script", "style"]):
262
- script.decompose()
263
 
264
- # Get text content
265
- scraped_text = soup.get_text()
266
-
267
- # Clean up the text
268
- lines = (line.strip() for line in scraped_text.splitlines())
269
- chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
270
- scraped_text = ' '.join(chunk for chunk in chunks if chunk)
271
 
272
  # Limit text length for display
273
  scraped_display = scraped_text[:2000] + "..." if len(scraped_text) > 2000 else scraped_text
274
 
275
- progress(0.5, desc="Translating text...")
276
 
277
- # Check if text is too short
278
  if len(scraped_text) < 150:
279
  return "Short", scraped_display
280
 
281
  # Translate text
282
  translated = translate_text(scraped_text[:4990])
283
 
284
- progress(0.8, desc="Classifying website...")
285
 
286
  # Get prediction using GPU
287
  prediction = predict_inference(translated)
288
 
 
 
289
  return prediction, scraped_display
290
 
291
  except Exception as e:
292
  error_msg = f"Error processing URL: {str(e)[:200]}"
293
  return error_msg, ""
 
294
  def create_interface():
295
- with gr.Blocks(title="Website Category Classifier") as interface:
296
- gr.HTML("<h1>πŸ” Website Category Classifier</h1>")
297
- gr.HTML("<p>Classify websites into categories: OTHER, NEWS/BLOG, or E-commerce</p>")
 
 
 
 
 
 
298
 
299
  with gr.Row():
300
- with gr.Column():
301
  url_input = gr.Textbox(
302
- label="Website URL",
303
- placeholder="https://example.com",
304
- lines=1
 
305
  )
306
 
307
- process_btn = gr.Button("πŸš€ Classify Website", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
 
309
- with gr.Column():
310
  prediction_output = gr.Textbox(
311
- label="Classification Result",
312
- lines=2,
313
- interactive=False
 
314
  )
315
 
316
  scraped_output = gr.Textbox(
317
- label="Scraped Content (first 2000 chars)",
318
- lines=15,
319
- max_lines=20,
320
- interactive=False
 
321
  )
322
 
323
- # Examples
324
- gr.Examples(
325
- examples=[
326
- ["https://news.google.com"],
327
- ["https://amazon.com"],
328
- ["https://github.com"]
329
- ],
330
- inputs=[url_input],
331
- )
 
 
 
 
 
 
 
 
332
 
333
  process_btn.click(
334
  fn=process_single_url,
 
3
  import asyncio
4
  import json
5
  import time
6
+ from typing import List, Dict, Any, Tuple
7
  from datetime import datetime, timezone
 
8
  from deep_translator import GoogleTranslator
9
  import torch
10
  from torch.amp import autocast
11
+ from curl_cffi import requests
12
+ from bs4 import BeautifulSoup
13
 
14
+ # Initialize model parameters
15
  max_seq_length = 2048
16
  dtype = None
17
  load_in_4bit = True
18
  peft_model_name = "limitedonly41/website_mistral7b_v02"
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def translate_text(text: str) -> str:
21
  """Translate text to English"""
22
  try:
 
27
  print(f"Translation error: {e}")
28
  return text[:4990]
29
 
 
 
30
  @spaces.GPU
31
  def predict_inference(translated_text: str) -> str:
32
  """GPU-accelerated inference function"""
 
37
  from unsloth import FastLanguageModel
38
 
39
  # Load model INSIDE the GPU function
 
 
 
 
 
40
  model, tokenizer = FastLanguageModel.from_pretrained(
41
  model_name=peft_model_name,
42
  max_seq_length=max_seq_length,
 
77
  print(f"Inference error: {e}")
78
  return 'ERROR'
79
 
80
def scrape_url_with_curl_cffi(url: str) -> Tuple[str, str]:
    """Scrape a web page and return ``(status, cleaned_text)``.

    Fetches *url* with curl_cffi while impersonating a Chrome browser
    (helps get past basic bot detection), strips boilerplate elements,
    and extracts the page's main text content.

    Args:
        url: Fully qualified URL of the page to scrape.

    Returns:
        ``("success", text)`` on success;
        ``("HTTP Error <code>", "")`` on a non-200 response;
        ``("Scraping error: <msg>", "")`` if the request/parse raises.
        Never raises — errors are reported through the status string.
    """
    import re  # local import, hoisted to function top so it runs once per call, up front

    try:
        # impersonate= makes curl_cffi present Chrome's TLS/HTTP fingerprint,
        # which plain `requests` cannot do.
        response = requests.get(
            url,
            timeout=30,
            impersonate="chrome110",
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
            }
        )

        if response.status_code != 200:
            return f"HTTP Error {response.status_code}", ""

        # Parse HTML and drop non-content elements before extracting text.
        soup = BeautifulSoup(response.text, 'html.parser')
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'advertisement']):
            element.decompose()

        # Prefer semantic content containers; fall back to <body>, then the
        # whole document if even <body> is missing.
        main_content = (
            soup.find('main')
            or soup.find('article')
            or soup.find('div', class_='content')
            or soup.find('body')
        )
        text = (main_content or soup).get_text(separator=' ', strip=True)

        # Collapse every run of whitespace (spaces, tabs, newlines) into a
        # single space. This single pass subsumes the previous per-line
        # strip/split/join cleanup, which was redundant work.
        cleaned_text = re.sub(r'\s+', ' ', text).strip()

        return "success", cleaned_text

    except Exception as e:
        # Best-effort scraper: surface a truncated error string to the UI
        # instead of propagating the exception.
        return f"Scraping error: {str(e)[:200]}", ""
127
 
128
+ def process_single_url(url: str, progress=gr.Progress()) -> Tuple[str, str]:
 
129
  """Process a single URL and return both scraped text and prediction"""
130
  if not url.strip():
131
  return "Please provide a URL to process.", ""
 
138
  try:
139
  progress(0.1, desc="Scraping website...")
140
 
141
+ # Scrape the URL using curl_cffi
142
+ status, scraped_text = scrape_url_with_curl_cffi(url)
 
 
 
 
 
 
 
 
 
143
 
144
+ if status != "success":
145
+ return status, ""
 
146
 
147
+ if len(scraped_text) < 50:
148
+ return "Error: Could not extract meaningful content from the website", scraped_text[:2000]
 
 
 
 
 
149
 
150
  # Limit text length for display
151
  scraped_display = scraped_text[:2000] + "..." if len(scraped_text) > 2000 else scraped_text
152
 
153
+ progress(0.4, desc="Translating text...")
154
 
155
+ # Check if text is too short for classification
156
  if len(scraped_text) < 150:
157
  return "Short", scraped_display
158
 
159
  # Translate text
160
  translated = translate_text(scraped_text[:4990])
161
 
162
+ progress(0.7, desc="Classifying website...")
163
 
164
  # Get prediction using GPU
165
  prediction = predict_inference(translated)
166
 
167
+ progress(1.0, desc="Complete!")
168
+
169
  return prediction, scraped_display
170
 
171
  except Exception as e:
172
  error_msg = f"Error processing URL: {str(e)[:200]}"
173
  return error_msg, ""
174
+
175
  def create_interface():
176
+ with gr.Blocks(title="Website Category Classifier", theme=gr.themes.Soft()) as interface:
177
+ gr.HTML("""
178
+ <div style="text-align: center; margin-bottom: 20px;">
179
+ <h1>πŸ” Website Category Classifier</h1>
180
+ <p style="font-size: 18px; color: #666;">
181
+ Classify websites into categories: <strong>OTHER</strong>, <strong>NEWS/BLOG</strong>, or <strong>E-commerce</strong>
182
+ </p>
183
+ </div>
184
+ """)
185
 
186
  with gr.Row():
187
+ with gr.Column(scale=1):
188
  url_input = gr.Textbox(
189
+ label="🌐 Website URL",
190
+ placeholder="https://example.com or just example.com",
191
+ lines=1,
192
+ info="Enter any website URL to classify"
193
  )
194
 
195
+ process_btn = gr.Button(
196
+ "πŸš€ Classify Website",
197
+ variant="primary",
198
+ size="lg"
199
+ )
200
+
201
+ gr.HTML("<br>")
202
+
203
+ # Examples
204
+ gr.Examples(
205
+ examples=[
206
+ ["https://techcrunch.com"],
207
+ ["https://amazon.com"],
208
+ ["https://github.com"],
209
+ ["https://cnn.com"],
210
+ ["https://shopify.com"]
211
+ ],
212
+ inputs=[url_input],
213
+ label="πŸ“‹ Try these examples:"
214
+ )
215
 
216
+ with gr.Column(scale=2):
217
  prediction_output = gr.Textbox(
218
+ label="🎯 Classification Result",
219
+ lines=3,
220
+ interactive=False,
221
+ info="The predicted category for this website"
222
  )
223
 
224
  scraped_output = gr.Textbox(
225
+ label="πŸ“„ Scraped Content Preview (first 2000 characters)",
226
+ lines=20,
227
+ max_lines=25,
228
+ interactive=False,
229
+ info="Raw text content extracted from the website"
230
  )
231
 
232
+ # Info section
233
+ gr.HTML("""
234
+ <div style="margin-top: 20px; padding: 15px; background-color: #f8f9fa; border-radius: 8px;">
235
+ <h3>ℹ️ How it works:</h3>
236
+ <ol>
237
+ <li><strong>Web Scraping:</strong> Extracts text content from the website using advanced scraping techniques</li>
238
+ <li><strong>Translation:</strong> Automatically translates non-English content to English</li>
239
+ <li><strong>AI Classification:</strong> Uses a fine-tuned Mistral 7B model to categorize the website</li>
240
+ </ol>
241
+ <p><strong>Categories:</strong></p>
242
+ <ul>
243
+ <li><strong>NEWS/BLOG:</strong> News websites, blogs, articles, journalism sites</li>
244
+ <li><strong>E-commerce:</strong> Online stores, shopping sites, marketplaces</li>
245
+ <li><strong>OTHER:</strong> All other types of websites (documentation, portfolios, etc.)</li>
246
+ </ul>
247
+ </div>
248
+ """)
249
 
250
  process_btn.click(
251
  fn=process_single_url,