Spaces:

limitedonly41
/

website_classification

Sleeping

App Files Files Community

limitedonly41 committed on Sep 20, 2024

Commit

69e9c2b

verified ·

1 Parent(s): 295889a

Update app.py

Browse files

Files changed (1) hide show

app.py +179 -4

app.py CHANGED Viewed

@@ -2,6 +2,8 @@ import gradio as gr
 import torch
 import spaces
 import logging
 # Configure logging to write messages to a file
 logging.basicConfig(filename='app.log', level=logging.ERROR)
@@ -17,10 +19,183 @@ peft_model_name = "limitedonly41/website_mistral7b_v02_1200_finetuned_7"
 model = None
 tokenizer = None
 @spaces.GPU()
-def classify_website(site_text):
     global model, tokenizer  # Declare model and tokenizer as global variables
     try:
         # Load the model and tokenizer if they are not already loaded
         if model is None or tokenizer is None:
@@ -45,7 +220,7 @@ Categorize the website into one of the 3 categories:
 3) E-commerce
 ### Input:
-{site_text}
 ### Response:"""
@@ -60,8 +235,8 @@ Categorize the website into one of the 3 categories:
             ans_pred = 'NEWS/BLOG'
         elif 'E-commerce' in ans_pred:
             ans_pred = 'E-commerce'
-        else:
-            ans_pred = 'OTHER'
         return ans_pred

 import torch
 import spaces
 import logging
+from deep_translator import GoogleTranslator
 # Configure logging to write messages to a file
 logging.basicConfig(filename='app.log', level=logging.ERROR)
 model = None
 tokenizer = None
+import pandas as pd
+from tqdm import tqdm
+import urllib
+import aiohttp
+import asyncio
+from bs4 import BeautifulSoup
async def fetch_data(url):
    """Download *url* and extract the text fields used for classification.

    The blocking ``urllib`` request is run in the default executor of the
    currently running event loop so that several pages can be fetched
    concurrently.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with the keys ``url``, ``title``, ``description``,
        ``keywords``, ``h1``, ``h2``, ``h3``, ``paragraphs`` and ``text``.
        ``text`` is the title, description, headings and paragraphs joined
        by spaces and truncated to 4999 characters. If anything fails
        (download, parsing, ...) the error is printed and every field
        except ``url`` is ``None``.
    """
    # ``import urllib`` alone does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    headers = {
        'Accept': '*/*',
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Referer': f'{url}',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'cross-site',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"macOS"',
    }
    encoding = 'utf-8'  # handed to BeautifulSoup as ``from_encoding``
    timeout = 10  # seconds, applied to the urllib request

    def get_content():
        """Blocking download of the raw response body."""
        req = urllib.request.Request(url, headers=headers)
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return response.read()

    def meta_content(soup, name):
        """Return the ``content`` of ``<meta name=...>`` or an empty string."""
        tag = soup.find('meta', attrs={'name': name})
        if tag and "content" in tag.attrs:
            return tag.get("content")
        return ""

    def joined_text(soup, tag_name):
        """Join the text of every ``tag_name`` element with ``". "``."""
        return ". ".join(tag.text for tag in soup.find_all(tag_name))

    def flatten_whitespace(value):
        """Replace non-breaking spaces, newlines and tabs with plain spaces."""
        return value.replace('\xa0', ' ').replace('\n', ' ').replace('\t', ' ')

    try:
        # The loop must be looked up here: no ``loop`` name exists in this
        # function's scope.
        loop = asyncio.get_running_loop()
        response_content = await loop.run_in_executor(None, get_content)

        soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)

        # A page without <title> should not discard the rest of the record.
        title_tag = soup.find('title')
        title = title_tag.text if title_tag is not None else ""

        description = meta_content(soup, 'description')
        keywords = meta_content(soup, 'keywords')

        h1_all = joined_text(soup, 'h1')
        h2_all = joined_text(soup, 'h2')
        h3_all = joined_text(soup, 'h3')
        paragraphs_all = joined_text(soup, 'p')

        # ``text`` is assembled before the cleanup below, so it keeps the
        # whitespace exactly as extracted from the page.
        allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"
        allthecontent = allthecontent[:4999]

        # Clean up the individual fields. '\xa0' is the non-breaking space
        # character (a raw string here would match a literal backslash
        # sequence instead and never fire).
        h1_all = flatten_whitespace(h1_all)
        h2_all = flatten_whitespace(h2_all)
        h3_all = flatten_whitespace(h3_all)
        title = title.replace('\xa0', ' ')
        description = description.replace('\xa0', ' ')
        keywords = keywords.replace('\xa0', ' ')

        return {
            'url': url,
            'title': title,
            'description': description,
            'keywords': keywords,
            'h1': h1_all,
            'h2': h2_all,
            'h3': h3_all,
            'paragraphs': paragraphs_all,
            'text': allthecontent
        }

    except Exception as e:
        # Best-effort scraping: report the failure and return an empty
        # record so that one bad URL does not abort a whole batch.
        print(url, e)
        return {
            'url': url,
            'title': None,
            'description': None,
            'keywords': None,
            'h1': None,
            'h2': None,
            'h3': None,
            'paragraphs': None,
            'text': None
        }
async def main(urls):
    """Fetch every URL in *urls* concurrently and collect the records.

    One ``fetch_data`` coroutine is created per URL and awaited through
    ``asyncio.as_completed`` wrapped in a ``tqdm`` progress bar, so the
    returned list is in completion order, not necessarily input order.

    Args:
        urls: Iterable of page addresses.

    Returns:
        A list with one ``fetch_data`` result per URL.
    """
    pending = [fetch_data(address) for address in urls]
    progress = tqdm(asyncio.as_completed(pending), total=len(pending))
    return [await finished for finished in progress]
 @spaces.GPU()
+def classify_website(url):
     global model, tokenizer  # Declare model and tokenizer as global variables
+    urls = [url]
+    # Run asyncio event loop
+    loop = asyncio.get_event_loop()
+    results_shop = await main(urls[:])  # Instead of loop.run_until_complete(main(urls))
+    # Convert results to DataFrame
+    df_result_train_more = pd.DataFrame(results_shop)
+    text = df_result_train_more['text'][0]
+    translated = GoogleTranslator(source='auto', target='en').translate(text[:4990])
     try:
         # Load the model and tokenizer if they are not already loaded
         if model is None or tokenizer is None:
 3) E-commerce
 ### Input:
+{translated}
 ### Response:"""
             ans_pred = 'NEWS/BLOG'
         elif 'E-commerce' in ans_pred:
             ans_pred = 'E-commerce'
+        # else:
+        #     ans_pred = 'OTHER'
         return ans_pred