limitedonly41 committed on
Commit
f61233e
·
verified ·
1 Parent(s): 81aab5a

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +260 -0
  2. requirements.txt +13 -0
app.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import spaces
import asyncio
import json
import time
from typing import List, Dict, Any
from datetime import datetime, timezone
import httpx
from deep_translator import GoogleTranslator
import torch
from torch.amp import autocast
from unsloth import FastLanguageModel

# Initialize model globally (outside GPU decorator)
# NOTE(review): loading at import time means the Space pays the download/
# load cost once at startup instead of on every @spaces.GPU call.
max_seq_length = 2048        # max token length passed to the tokenizer/model
dtype = None                 # presumably lets unsloth auto-select dtype — TODO confirm
load_in_4bit = True          # 4-bit quantized weights (bitsandbytes) to fit small GPUs
peft_model_name = "limitedonly41/website_mistral7b_v02"  # fine-tuned classifier checkpoint

# Load model once at startup
print("Loading model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=peft_model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
# Switch unsloth model into inference mode (disables training paths).
FastLanguageModel.for_inference(model)
print("Model loaded successfully")
30
+
31
# In-memory storage (replacing Redis)
task_storage = {}
task_counter = 0

class TaskManager:
    """In-memory registry of URL-classification jobs and their progress."""

    def __init__(self):
        # Maps task_id -> mutable task record dict.
        self.tasks = {}

    def create_task(self, urls: List[str]) -> str:
        """Register a new batch job for *urls* and return its generated id."""
        global task_counter
        task_counter += 1
        task_id = f"task_{task_counter}"
        record = {
            "total": len(urls),
            "completed": 0,
            "scraped": 0,
            "status": "processing",
            "urls": urls,
            "results": {},
            "created_time": datetime.now(timezone.utc).isoformat(),
        }
        self.tasks[task_id] = record
        return task_id

    def update_progress(self, task_id: str, field: str, value: Any):
        """Set *field* on the task record; unknown ids are silently ignored."""
        record = self.tasks.get(task_id)
        if record is not None:
            record[field] = value

    def get_task(self, task_id: str) -> Dict:
        """Return the task record, or an empty dict for an unknown id."""
        return self.tasks.get(task_id, {})

task_manager = TaskManager()
63
+
64
def translate_text(text: str) -> str:
    """Best-effort translation of *text* to English.

    The input is clamped to 4990 characters (Google Translate rejects
    inputs near the 5000-char limit). On any failure the untranslated
    clamped text is returned instead of raising.
    """
    snippet = text[:4990]
    try:
        return GoogleTranslator(source='auto', target='en').translate(snippet)
    except Exception as e:
        print(f"Translation error: {e}")
        return snippet
73
+
74
@spaces.GPU
def predict_inference(translated_text: str) -> str:
    """Classify website text with the fine-tuned model (GPU-accelerated).

    Returns one of 'OTHER', 'NEWS/BLOG', 'E-commerce', 'Short' (input
    under 150 chars), or 'ERROR' on any failure or unrecognized output.
    """
    try:
        if len(translated_text) < 150:
            return 'Short'

        prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Categorize the website into one of the 3 categories:\n\n1) OTHER \n2) NEWS/BLOG\n3) E-commerce

### Input:
{translated_text}

### Response:"""

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Bug fix: autocast must target the device actually in use. The
        # original hard-coded device_type='cuda', which raises on CPU-only
        # hosts and silently turned every prediction into 'ERROR'.
        with autocast(device_type=device.type):
            inputs = tokenizer(prompt, return_tensors="pt").to(device)
            outputs = model.generate(**inputs, max_new_tokens=16, use_cache=True)
            ans = tokenizer.batch_decode(outputs)[0]

        # Take the text between the '### Response:' marker and the first
        # special-token delimiter (e.g. '</s>').
        ans_pred = ans.split('### Response:')[1].split('<')[0].strip()

        if 'OTHER' in ans_pred:
            return 'OTHER'
        elif 'NEWS/BLOG' in ans_pred:
            return 'NEWS/BLOG'
        elif 'E-commerce' in ans_pred:
            return 'E-commerce'
        else:
            return 'ERROR'

    except Exception as e:
        print(f"Inference error: {e}")
        return 'ERROR'
112
+
113
async def scrape_single_url(session: httpx.AsyncClient, url: str) -> Dict:
    """Fetch one URL; never raises — failures are encoded in 'status'.

    Returns a dict with keys 'url', 'text' (first 5000 chars of the body,
    or '' on failure) and 'status' ('success', 'error_<code>' or
    'error_<message>').
    """
    result = {"url": url, "text": "", "status": ""}
    try:
        response = await session.get(url, timeout=30.0)
        if response.status_code == 200:
            # Simple text extraction (you can enhance this); cap content size.
            result["text"] = response.text[:5000]
            result["status"] = "success"
        else:
            result["status"] = f"error_{response.status_code}"
    except Exception as e:
        # Truncate the message so task records stay small.
        result["status"] = f"error_{str(e)[:100]}"
    return result
137
+
138
async def process_urls_batch(urls: List[str], progress_callback=None) -> Dict[str, str]:
    """Scrape, translate and classify each URL sequentially.

    Registers the batch with the global task_manager, updates its progress
    counters as it goes, and returns a mapping of url -> predicted label
    (or an 'Error: ...' string for per-URL failures).
    """
    task_id = task_manager.create_task(urls)
    predictions: Dict[str, str] = {}
    total = len(urls)

    async with httpx.AsyncClient() as client:
        for index, url in enumerate(urls, start=1):
            try:
                scraped = await scrape_single_url(client, url)
                task_manager.update_progress(task_id, "scraped", index)

                page_text = scraped.get("text", "")

                # Very short pages are labeled without hitting the model.
                if len(page_text) < 150:
                    label = "Short"
                else:
                    label = predict_inference(translate_text(page_text))

                predictions[url] = label
                task_manager.update_progress(task_id, "completed", index)

                if progress_callback:
                    progress_callback(f"Processed {index}/{total} URLs")

            except Exception as e:
                predictions[url] = f"Error: {str(e)[:100]}"

    task_manager.update_progress(task_id, "status", "completed")
    task_manager.update_progress(task_id, "results", predictions)

    return predictions
176
+
177
def process_url_list(url_text: str, progress=gr.Progress()) -> str:
    """Main processing function for the Gradio interface.

    Parses newline-separated URLs from *url_text*, classifies each, and
    returns one 'url → label' line per URL. Returns a human-readable
    message for empty input, too many URLs, or a processing failure.
    """
    if not url_text.strip():
        return "Please provide URLs to process."

    # Parse URLs: one per line, skipping blank lines.
    urls = [url.strip() for url in url_text.strip().split('\n') if url.strip()]

    if not urls:
        return "No valid URLs found."

    if len(urls) > 50:  # Limit for demo
        return f"Too many URLs ({len(urls)}). Please limit to 50 URLs."

    try:
        progress(0, desc="Starting processing...")

        def progress_callback(msg):
            progress(None, desc=msg)

        # Bug fix: the original created a loop via new_event_loop() /
        # set_event_loop() and called loop.close() outside any finally, so
        # an exception leaked the loop and left it installed as the thread's
        # current loop. asyncio.run() creates, runs, and always closes it.
        results = asyncio.run(process_urls_batch(urls, progress_callback))

        # Format results, one line per URL, preserving input order.
        return "\n".join(
            f"{url} → {prediction}" for url, prediction in results.items()
        )

    except Exception as e:
        return f"Error processing URLs: {str(e)}"
213
+
214
# Create Gradio interface
def create_interface():
    """Build and return the Blocks UI for the website classifier."""
    with gr.Blocks(title="Website Category Classifier") as demo:
        # Page header.
        gr.HTML("<h1>🔍 Website Category Classifier</h1>")
        gr.HTML("<p>Classify websites into categories: OTHER, NEWS/BLOG, or E-commerce</p>")

        with gr.Row():
            # Left column: URL input and the trigger button.
            with gr.Column():
                url_input = gr.Textbox(
                    label="URLs (one per line)",
                    placeholder="https://example1.com\nhttps://example2.com\nhttps://example3.com",
                    lines=10,
                    max_lines=20,
                )
                process_btn = gr.Button("🚀 Classify Websites", variant="primary")

            # Right column: read-only results box.
            with gr.Column():
                output = gr.Textbox(
                    label="Results",
                    lines=15,
                    max_lines=30,
                    interactive=False,
                )

        # Clickable example URL sets.
        gr.Examples(
            examples=[
                ["https://news.google.com\nhttps://amazon.com\nhttps://github.com"],
                ["https://techcrunch.com\nhttps://shopify.com\nhttps://stackoverflow.com"],
            ],
            inputs=[url_input],
        )

        # Wire the button to the classification pipeline.
        process_btn.click(
            fn=process_url_list,
            inputs=[url_input],
            outputs=[output],
            show_progress=True,
        )

    return demo
256
+
257
# Launch the app when run as a script.
if __name__ == "__main__":
    create_interface().launch()
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# UI framework and Hugging Face Spaces GPU decorator
gradio>=4.44.0
spaces
# Model runtime — torch upper bound presumably for unsloth compatibility; verify
torch>=2.1.0,<2.6.0
transformers>=4.40.0
# Fine-tuned-model loader, installed from source
unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
# Translates scraped page text to English before classification
deep-translator>=1.11.4
# Async HTTP client used for scraping
httpx>=0.25.0
# NOTE(review): not imported by app.py as uploaded — possibly for planned HTML parsing
beautifulsoup4>=4.12.0
# 4-bit quantized loading and LoRA adapter support
accelerate>=0.21.0
bitsandbytes>=0.41.0
peft>=0.5.0
datasets>=2.14.0
safetensors>=0.3.2