limitedonly41 committed on
Commit
e34c82b
·
verified ·
1 Parent(s): 5cc7129

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -37
app.py CHANGED
@@ -6,8 +6,6 @@ from deep_translator import GoogleTranslator
6
  import pandas as pd
7
  from tqdm import tqdm
8
  import urllib
9
- import aiohttp
10
- import asyncio
11
  from bs4 import BeautifulSoup
12
 
13
  # Configure logging to write messages to a file
@@ -17,14 +15,14 @@ logging.basicConfig(filename='app.log', level=logging.ERROR)
17
  max_seq_length = 2048
18
  dtype = None # Auto detection of dtype
19
  load_in_4bit = True # Use 4-bit quantization to reduce memory usage
20
- peft_model_name = "limitedonly41/website_qwen2_7b_2"
 
21
 
22
  # Initialize model and tokenizer variables
23
  model = None
24
  tokenizer = None
25
 
26
- # Async function to fetch data
27
- async def fetch_data(url):
28
  headers = {
29
  'Accept': '*/*',
30
  'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
@@ -33,7 +31,7 @@ async def fetch_data(url):
33
  'Sec-Fetch-Dest': 'empty',
34
  'Sec-Fetch-Mode': 'cors',
35
  'Sec-Fetch-Site': 'cross-site',
36
- 'User-Agent': 'Mozilla/5.0',
37
  'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
38
  'sec-ch-ua-mobile': '?0',
39
  'sec-ch-ua-platform': '"macOS"',
@@ -41,20 +39,15 @@ async def fetch_data(url):
41
 
42
  encoding = 'utf-8'
43
  timeout = 10 # Set your desired timeout value in seconds
44
-
45
  try:
46
- # Function to make the request using urllib
47
- def get_content():
48
- req = urllib.request.Request(url, headers=headers)
49
- with urllib.request.urlopen(req, timeout=timeout) as response:
50
- return response.read()
51
-
52
- # Async task using executor for blocking I/O
53
- loop = asyncio.get_event_loop()
54
- response_content = await loop.run_in_executor(None, get_content)
55
 
56
  soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)
57
- title = soup.find('title').text if soup.find('title') else ""
 
58
  description = soup.find('meta', attrs={'name': 'description'})
59
  description = description.get("content") if description and "content" in description.attrs else ""
60
 
@@ -66,8 +59,12 @@ async def fetch_data(url):
66
  h2_all = ". ".join(h.text for h in soup.find_all('h2'))
67
  h3_all = ". ".join(h.text for h in soup.find_all('h3'))
68
 
69
- allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"
70
- allthecontent = allthecontent[:4999]
 
 
 
 
71
 
72
  return {
73
  'url': url,
@@ -80,9 +77,8 @@ async def fetch_data(url):
80
  'paragraphs': paragraphs_all,
81
  'text': allthecontent
82
  }
83
-
84
  except Exception as e:
85
- logging.exception(f"Error fetching data for {url}: {e}")
86
  return {
87
  'url': url,
88
  'title': None,
@@ -95,25 +91,19 @@ async def fetch_data(url):
95
  'text': None
96
  }
97
 
98
- # Main async function to process multiple URLs
99
- async def main(urls):
100
- tasks = [fetch_data(url) for url in urls]
101
  results = []
102
- for future in tqdm(asyncio.as_completed(tasks), total=len(tasks)):
103
- result = await future
104
  results.append(result)
105
  return results
106
 
107
  @spaces.GPU()
108
  def classify_website(url):
109
- global model, tokenizer
110
 
111
  urls = [url]
112
-
113
- # Start asyncio loop for fetching data
114
- loop = asyncio.new_event_loop()
115
- asyncio.set_event_loop(loop)
116
- results_shop = loop.run_until_complete(main(urls)) # Correctly use asyncio loop
117
 
118
  # Convert results to DataFrame
119
  df_result_train_more = pd.DataFrame(results_shop)
@@ -121,17 +111,18 @@ def classify_website(url):
121
  translated = GoogleTranslator(source='auto', target='en').translate(text[:4990])
122
 
123
  try:
124
- # Load the model and tokenizer if not already loaded
125
  if model is None or tokenizer is None:
126
  from unsloth import FastLanguageModel
127
 
 
128
  model, tokenizer = FastLanguageModel.from_pretrained(
129
- model_name=peft_model_name,
130
  max_seq_length=max_seq_length,
131
  dtype=dtype,
132
  load_in_4bit=load_in_4bit,
133
  )
134
- FastLanguageModel.for_inference(model)
135
 
136
  prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
137
 
@@ -173,6 +164,7 @@ iface = gr.Interface(
173
  title="Website Categorization",
174
  description="Categorize a website into one of the 3 categories: OTHER, NEWS/BLOG, or E-commerce."
175
  )
176
- iface.queue() # Enable queue with default settings
177
- iface.launch()
178
 
 
 
 
6
  import pandas as pd
7
  from tqdm import tqdm
8
  import urllib
 
 
9
  from bs4 import BeautifulSoup
10
 
11
  # Configure logging to write messages to a file
 
15
  max_seq_length = 2048
16
  dtype = None # Auto detection of dtype
17
  load_in_4bit = True # Use 4-bit quantization to reduce memory usage
18
+
19
+ peft_model_name = "limitedonly41/website_mistral7b_v02_1200_finetuned_7"
20
 
21
  # Initialize model and tokenizer variables
22
  model = None
23
  tokenizer = None
24
 
25
+ def fetch_data(url):
 
26
  headers = {
27
  'Accept': '*/*',
28
  'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
 
31
  'Sec-Fetch-Dest': 'empty',
32
  'Sec-Fetch-Mode': 'cors',
33
  'Sec-Fetch-Site': 'cross-site',
34
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
35
  'sec-ch-ua': '"Google Chrome";v="125", "Chromium";v="125", "Not.A/Brand";v="24"',
36
  'sec-ch-ua-mobile': '?0',
37
  'sec-ch-ua-platform': '"macOS"',
 
39
 
40
  encoding = 'utf-8'
41
  timeout = 10 # Set your desired timeout value in seconds
 
42
  try:
43
+ # Make the request using urllib
44
+ req = urllib.request.Request(url, headers=headers)
45
+ with urllib.request.urlopen(req, timeout=timeout) as response:
46
+ response_content = response.read()
 
 
 
 
 
47
 
48
  soup = BeautifulSoup(response_content, 'html.parser', from_encoding=encoding)
49
+
50
+ title = soup.find('title').text
51
  description = soup.find('meta', attrs={'name': 'description'})
52
  description = description.get("content") if description and "content" in description.attrs else ""
53
 
 
59
  h2_all = ". ".join(h.text for h in soup.find_all('h2'))
60
  h3_all = ". ".join(h.text for h in soup.find_all('h3'))
61
 
62
+ allthecontent = f"{title} {description} {h1_all} {h2_all} {h3_all} {paragraphs_all}"[:4999]
63
+
64
+ # Clean up the text
65
+ h1_all = h1_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
66
+ h2_all = h2_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
67
+ h3_all = h3_all.replace(r'\xa0', ' ').replace('\n', ' ').replace('\t', ' ')
68
 
69
  return {
70
  'url': url,
 
77
  'paragraphs': paragraphs_all,
78
  'text': allthecontent
79
  }
 
80
  except Exception as e:
81
+ print(url, e)
82
  return {
83
  'url': url,
84
  'title': None,
 
91
  'text': None
92
  }
93
 
94
+ def main(urls):
 
 
95
  results = []
96
+ for url in tqdm(urls):
97
+ result = fetch_data(url)
98
  results.append(result)
99
  return results
100
 
101
  @spaces.GPU()
102
  def classify_website(url):
103
+ global model, tokenizer # Declare model and tokenizer as global variables
104
 
105
  urls = [url]
106
+ results_shop = main(urls)
 
 
 
 
107
 
108
  # Convert results to DataFrame
109
  df_result_train_more = pd.DataFrame(results_shop)
 
111
  translated = GoogleTranslator(source='auto', target='en').translate(text[:4990])
112
 
113
  try:
114
+ # Load the model and tokenizer if they are not already loaded
115
  if model is None or tokenizer is None:
116
  from unsloth import FastLanguageModel
117
 
118
+ # Load the model and tokenizer
119
  model, tokenizer = FastLanguageModel.from_pretrained(
120
+ model_name=peft_model_name, # Model used for training
121
  max_seq_length=max_seq_length,
122
  dtype=dtype,
123
  load_in_4bit=load_in_4bit,
124
  )
125
+ FastLanguageModel.for_inference(model) # Enable native 2x faster inference
126
 
127
  prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
128
 
 
164
  title="Website Categorization",
165
  description="Categorize a website into one of the 3 categories: OTHER, NEWS/BLOG, or E-commerce."
166
  )
167
+ iface.queue() # Sets up a queue with default parameters
 
168
 
169
+ # Launch the interface
170
+ iface.launch()