limitedonly41 committed on
Commit
3efbebc
Β·
verified Β·
1 Parent(s): 705ae9c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +121 -204
app.py CHANGED
@@ -3,66 +3,20 @@ import spaces
3
  import asyncio
4
  import json
5
  import time
6
- from typing import List, Dict, Any
7
  from datetime import datetime, timezone
8
- import httpx
9
  from deep_translator import GoogleTranslator
10
  import torch
11
  from torch.amp import autocast
12
- # from unsloth import FastLanguageModel
 
13
 
14
- # Initialize model globally (outside GPU decorator)
15
  max_seq_length = 2048
16
  dtype = None
17
  load_in_4bit = True
18
  peft_model_name = "limitedonly41/website_mistral7b_v02"
19
 
20
- # # Load model once at startup
21
- # print("Loading model...")
22
- # model, tokenizer = FastLanguageModel.from_pretrained(
23
- # model_name=peft_model_name,
24
- # max_seq_length=max_seq_length,
25
- # dtype=dtype,
26
- # load_in_4bit=load_in_4bit,
27
- # )
28
- # FastLanguageModel.for_inference(model)
29
- # print("Model loaded successfully")
30
-
31
-
32
-
33
- # In-memory storage (replacing Redis)
34
- task_storage = {}
35
- task_counter = 0
36
-
37
- class TaskManager:
38
- def __init__(self):
39
- self.tasks = {}
40
-
41
- def create_task(self, urls: List[str]) -> str:
42
- global task_counter
43
- task_counter += 1
44
- task_id = f"task_{task_counter}"
45
-
46
- self.tasks[task_id] = {
47
- "total": len(urls),
48
- "completed": 0,
49
- "scraped": 0,
50
- "status": "processing",
51
- "urls": urls,
52
- "results": {},
53
- "created_time": datetime.now(timezone.utc).isoformat()
54
- }
55
- return task_id
56
-
57
- def update_progress(self, task_id: str, field: str, value: Any):
58
- if task_id in self.tasks:
59
- self.tasks[task_id][field] = value
60
-
61
- def get_task(self, task_id: str) -> Dict:
62
- return self.tasks.get(task_id, {})
63
-
64
- task_manager = TaskManager()
65
-
66
  def translate_text(text: str) -> str:
67
  """Translate text to English"""
68
  try:
@@ -73,8 +27,6 @@ def translate_text(text: str) -> str:
73
  print(f"Translation error: {e}")
74
  return text[:4990]
75
 
76
-
77
-
78
  @spaces.GPU
79
  def predict_inference(translated_text: str) -> str:
80
  """GPU-accelerated inference function"""
@@ -85,11 +37,6 @@ def predict_inference(translated_text: str) -> str:
85
  from unsloth import FastLanguageModel
86
 
87
  # Load model INSIDE the GPU function
88
- max_seq_length = 2048
89
- dtype = None
90
- load_in_4bit = True
91
- peft_model_name = "limitedonly41/website_mistral7b_v02"
92
-
93
  model, tokenizer = FastLanguageModel.from_pretrained(
94
  model_name=peft_model_name,
95
  max_seq_length=max_seq_length,
@@ -130,109 +77,55 @@ Categorize the website into one of the 3 categories:\n\n1) OTHER \n2) NEWS/BLOG\
130
  print(f"Inference error: {e}")
131
  return 'ERROR'
132
 
133
- async def scrape_single_url(session: httpx.AsyncClient, url: str) -> Dict:
134
- """Scrape a single URL"""
135
  try:
136
- response = await session.get(url, timeout=30.0)
137
- if response.status_code == 200:
138
- # Simple text extraction (you can enhance this)
139
- text_content = response.text[:5000] # Limit content
140
- return {
141
- "url": url,
142
- "text": text_content,
143
- "status": "success"
 
 
 
144
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  else:
146
- return {
147
- "url": url,
148
- "text": "",
149
- "status": f"error_{response.status_code}"
150
- }
151
- except Exception as e:
152
- return {
153
- "url": url,
154
- "text": "",
155
- "status": f"error_{str(e)[:100]}"
156
- }
157
-
158
- async def process_urls_batch(urls: List[str], progress_callback=None) -> Dict[str, str]:
159
- """Process a batch of URLs"""
160
- task_id = task_manager.create_task(urls)
161
- results = {}
162
-
163
- async with httpx.AsyncClient() as client:
164
- for i, url in enumerate(urls):
165
- try:
166
- # Scrape URL
167
- scraped_data = await scrape_single_url(client, url)
168
- task_manager.update_progress(task_id, "scraped", i + 1)
169
-
170
- # Process text
171
- text = scraped_data.get("text", "")
172
-
173
- if len(text) < 150:
174
- prediction = "Short"
175
- else:
176
- # Translate text
177
- translated = translate_text(text)
178
- # Get prediction using GPU
179
- prediction = predict_inference(translated)
180
-
181
- results[url] = prediction
182
- task_manager.update_progress(task_id, "completed", i + 1)
183
-
184
- # Update progress
185
- if progress_callback:
186
- progress = f"Processed {i + 1}/{len(urls)} URLs"
187
- progress_callback(progress)
188
-
189
- except Exception as e:
190
- results[url] = f"Error: {str(e)[:100]}"
191
-
192
- task_manager.update_progress(task_id, "status", "completed")
193
- task_manager.update_progress(task_id, "results", results)
194
-
195
- return results
196
-
197
- def process_url_list(url_text: str, progress=gr.Progress()) -> str:
198
- """Main processing function for Gradio interface"""
199
- if not url_text.strip():
200
- return "Please provide URLs to process."
201
-
202
- # Parse URLs
203
- urls = [url.strip() for url in url_text.strip().split('\n') if url.strip()]
204
-
205
- if not urls:
206
- return "No valid URLs found."
207
-
208
- if len(urls) > 50: # Limit for demo
209
- return f"Too many URLs ({len(urls)}). Please limit to 50 URLs."
210
-
211
- try:
212
- # Process URLs
213
- progress(0, desc="Starting processing...")
214
-
215
- def progress_callback(msg):
216
- progress(None, desc=msg)
217
-
218
- # Run async function
219
- loop = asyncio.new_event_loop()
220
- asyncio.set_event_loop(loop)
221
- results = loop.run_until_complete(process_urls_batch(urls, progress_callback))
222
- loop.close()
223
-
224
- # Format results
225
- output_lines = []
226
- for url, prediction in results.items():
227
- output_lines.append(f"{url} β†’ {prediction}")
228
-
229
- return "\n".join(output_lines)
230
-
231
  except Exception as e:
232
- return f"Error processing URLs: {str(e)}"
233
 
234
-
235
- def process_single_url(url: str, progress=gr.Progress()) -> tuple[str, str]:
236
  """Process a single URL and return both scraped text and prediction"""
237
  if not url.strip():
238
  return "Please provide a URL to process.", ""
@@ -245,90 +138,114 @@ def process_single_url(url: str, progress=gr.Progress()) -> tuple[str, str]:
245
  try:
246
  progress(0.1, desc="Scraping website...")
247
 
248
- # Scrape the URL
249
- import httpx
250
- with httpx.Client(timeout=30.0) as client:
251
- response = client.get(url)
252
-
253
- if response.status_code != 200:
254
- return f"Error: HTTP {response.status_code}", ""
255
-
256
- # Extract text content (you can enhance this with BeautifulSoup)
257
- from bs4 import BeautifulSoup
258
- soup = BeautifulSoup(response.text, 'html.parser')
259
 
260
- # Remove script and style elements
261
- for script in soup(["script", "style"]):
262
- script.decompose()
263
 
264
- # Get text content
265
- scraped_text = soup.get_text()
266
-
267
- # Clean up the text
268
- lines = (line.strip() for line in scraped_text.splitlines())
269
- chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
270
- scraped_text = ' '.join(chunk for chunk in chunks if chunk)
271
 
272
  # Limit text length for display
273
  scraped_display = scraped_text[:2000] + "..." if len(scraped_text) > 2000 else scraped_text
274
 
275
- progress(0.5, desc="Translating text...")
276
 
277
- # Check if text is too short
278
  if len(scraped_text) < 150:
279
  return "Short", scraped_display
280
 
281
  # Translate text
282
  translated = translate_text(scraped_text[:4990])
283
 
284
- progress(0.8, desc="Classifying website...")
285
 
286
  # Get prediction using GPU
287
  prediction = predict_inference(translated)
288
 
 
 
289
  return prediction, scraped_display
290
 
291
  except Exception as e:
292
  error_msg = f"Error processing URL: {str(e)[:200]}"
293
  return error_msg, ""
 
294
  def create_interface():
295
- with gr.Blocks(title="Website Category Classifier") as interface:
296
- gr.HTML("<h1>πŸ” Website Category Classifier</h1>")
297
- gr.HTML("<p>Classify websites into categories: OTHER, NEWS/BLOG, or E-commerce</p>")
 
 
 
 
 
 
298
 
299
  with gr.Row():
300
- with gr.Column():
301
  url_input = gr.Textbox(
302
- label="Website URL",
303
- placeholder="https://example.com",
304
- lines=1
 
305
  )
306
 
307
- process_btn = gr.Button("πŸš€ Classify Website", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
308
 
309
- with gr.Column():
310
  prediction_output = gr.Textbox(
311
- label="Classification Result",
312
- lines=2,
313
- interactive=False
 
314
  )
315
 
316
  scraped_output = gr.Textbox(
317
- label="Scraped Content (first 2000 chars)",
318
- lines=15,
319
- max_lines=20,
320
- interactive=False
 
321
  )
322
 
323
- # Examples
324
- gr.Examples(
325
- examples=[
326
- ["https://news.google.com"],
327
- ["https://amazon.com"],
328
- ["https://github.com"]
329
- ],
330
- inputs=[url_input],
331
- )
 
 
 
 
 
 
 
 
332
 
333
  process_btn.click(
334
  fn=process_single_url,
 
3
  import asyncio
4
  import json
5
  import time
6
+ from typing import List, Dict, Any, Tuple
7
  from datetime import datetime, timezone
 
8
  from deep_translator import GoogleTranslator
9
  import torch
10
  from torch.amp import autocast
11
+ from curl_cffi import requests
12
+ from bs4 import BeautifulSoup
13
 
14
+ # Initialize model parameters
15
  max_seq_length = 2048
16
  dtype = None
17
  load_in_4bit = True
18
  peft_model_name = "limitedonly41/website_mistral7b_v02"
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  def translate_text(text: str) -> str:
21
  """Translate text to English"""
22
  try:
 
27
  print(f"Translation error: {e}")
28
  return text[:4990]
29
 
 
 
30
  @spaces.GPU
31
  def predict_inference(translated_text: str) -> str:
32
  """GPU-accelerated inference function"""
 
37
  from unsloth import FastLanguageModel
38
 
39
  # Load model INSIDE the GPU function
 
 
 
 
 
40
  model, tokenizer = FastLanguageModel.from_pretrained(
41
  model_name=peft_model_name,
42
  max_seq_length=max_seq_length,
 
77
  print(f"Inference error: {e}")
78
  return 'ERROR'
79
 
80
def scrape_url_with_curl_cffi(url: str) -> Tuple[str, str]:
    """Scrape a web page and return ``(status, cleaned_text)``.

    Fetches *url* with curl_cffi while impersonating a Chrome browser
    (helps get past basic bot detection), strips boilerplate elements,
    and extracts the page's main text content.

    Args:
        url: Fully qualified URL of the page to scrape.

    Returns:
        ``("success", text)`` on success;
        ``("HTTP Error <code>", "")`` on a non-200 response;
        ``("Scraping error: <msg>", "")`` if the request/parse raises.
        Never raises — errors are reported through the status string.
    """
    import re  # local import, hoisted to function top so it runs once per call, up front

    try:
        # impersonate= makes curl_cffi present Chrome's TLS/HTTP fingerprint,
        # which plain `requests` cannot do.
        response = requests.get(
            url,
            timeout=30,
            impersonate="chrome110",
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate',
                'Connection': 'keep-alive',
            }
        )

        if response.status_code != 200:
            return f"HTTP Error {response.status_code}", ""

        # Parse HTML and drop non-content elements before extracting text.
        soup = BeautifulSoup(response.text, 'html.parser')
        for element in soup(['script', 'style', 'nav', 'footer', 'header', 'aside', 'advertisement']):
            element.decompose()

        # Prefer semantic content containers; fall back to <body>, then the
        # whole document if even <body> is missing.
        main_content = (
            soup.find('main')
            or soup.find('article')
            or soup.find('div', class_='content')
            or soup.find('body')
        )
        text = (main_content or soup).get_text(separator=' ', strip=True)

        # Collapse every run of whitespace (spaces, tabs, newlines) into a
        # single space. This single pass subsumes the previous per-line
        # strip/split/join cleanup, which was redundant work.
        cleaned_text = re.sub(r'\s+', ' ', text).strip()

        return "success", cleaned_text

    except Exception as e:
        # Best-effort scraper: surface a truncated error string to the UI
        # instead of propagating the exception.
        return f"Scraping error: {str(e)[:200]}", ""
127
 
128
+ def process_single_url(url: str, progress=gr.Progress()) -> Tuple[str, str]:
 
129
  """Process a single URL and return both scraped text and prediction"""
130
  if not url.strip():
131
  return "Please provide a URL to process.", ""
 
138
  try:
139
  progress(0.1, desc="Scraping website...")
140
 
141
+ # Scrape the URL using curl_cffi
142
+ status, scraped_text = scrape_url_with_curl_cffi(url)
 
 
 
 
 
 
 
 
 
143
 
144
+ if status != "success":
145
+ return status, ""
 
146
 
147
+ if len(scraped_text) < 50:
148
+ return "Error: Could not extract meaningful content from the website", scraped_text[:2000]
 
 
 
 
 
149
 
150
  # Limit text length for display
151
  scraped_display = scraped_text[:2000] + "..." if len(scraped_text) > 2000 else scraped_text
152
 
153
+ progress(0.4, desc="Translating text...")
154
 
155
+ # Check if text is too short for classification
156
  if len(scraped_text) < 150:
157
  return "Short", scraped_display
158
 
159
  # Translate text
160
  translated = translate_text(scraped_text[:4990])
161
 
162
+ progress(0.7, desc="Classifying website...")
163
 
164
  # Get prediction using GPU
165
  prediction = predict_inference(translated)
166
 
167
+ progress(1.0, desc="Complete!")
168
+
169
  return prediction, scraped_display
170
 
171
  except Exception as e:
172
  error_msg = f"Error processing URL: {str(e)[:200]}"
173
  return error_msg, ""
174
+
175
  def create_interface():
176
+ with gr.Blocks(title="Website Category Classifier", theme=gr.themes.Soft()) as interface:
177
+ gr.HTML("""
178
+ <div style="text-align: center; margin-bottom: 20px;">
179
+ <h1>πŸ” Website Category Classifier</h1>
180
+ <p style="font-size: 18px; color: #666;">
181
+ Classify websites into categories: <strong>OTHER</strong>, <strong>NEWS/BLOG</strong>, or <strong>E-commerce</strong>
182
+ </p>
183
+ </div>
184
+ """)
185
 
186
  with gr.Row():
187
+ with gr.Column(scale=1):
188
  url_input = gr.Textbox(
189
+ label="🌐 Website URL",
190
+ placeholder="https://example.com or just example.com",
191
+ lines=1,
192
+ info="Enter any website URL to classify"
193
  )
194
 
195
+ process_btn = gr.Button(
196
+ "πŸš€ Classify Website",
197
+ variant="primary",
198
+ size="lg"
199
+ )
200
+
201
+ gr.HTML("<br>")
202
+
203
+ # Examples
204
+ gr.Examples(
205
+ examples=[
206
+ ["https://techcrunch.com"],
207
+ ["https://amazon.com"],
208
+ ["https://github.com"],
209
+ ["https://cnn.com"],
210
+ ["https://shopify.com"]
211
+ ],
212
+ inputs=[url_input],
213
+ label="πŸ“‹ Try these examples:"
214
+ )
215
 
216
+ with gr.Column(scale=2):
217
  prediction_output = gr.Textbox(
218
+ label="🎯 Classification Result",
219
+ lines=3,
220
+ interactive=False,
221
+ info="The predicted category for this website"
222
  )
223
 
224
  scraped_output = gr.Textbox(
225
+ label="πŸ“„ Scraped Content Preview (first 2000 characters)",
226
+ lines=20,
227
+ max_lines=25,
228
+ interactive=False,
229
+ info="Raw text content extracted from the website"
230
  )
231
 
232
+ # Info section
233
+ gr.HTML("""
234
+ <div style="margin-top: 20px; padding: 15px; background-color: #f8f9fa; border-radius: 8px;">
235
+ <h3>ℹ️ How it works:</h3>
236
+ <ol>
237
+ <li><strong>Web Scraping:</strong> Extracts text content from the website using advanced scraping techniques</li>
238
+ <li><strong>Translation:</strong> Automatically translates non-English content to English</li>
239
+ <li><strong>AI Classification:</strong> Uses a fine-tuned Mistral 7B model to categorize the website</li>
240
+ </ol>
241
+ <p><strong>Categories:</strong></p>
242
+ <ul>
243
+ <li><strong>NEWS/BLOG:</strong> News websites, blogs, articles, journalism sites</li>
244
+ <li><strong>E-commerce:</strong> Online stores, shopping sites, marketplaces</li>
245
+ <li><strong>OTHER:</strong> All other types of websites (documentation, portfolios, etc.)</li>
246
+ </ul>
247
+ </div>
248
+ """)
249
 
250
  process_btn.click(
251
  fn=process_single_url,