Spaces:

limitedonly41
/

website_topic_classify

Build error

limitedonly41 committed on Nov 24, 2024

Commit

4429406

verified ·

1 Parent(s): 84e17b0

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ import asyncio
 from curl_cffi.requests import AsyncSession
 from tqdm.asyncio import tqdm
 from fake_headers import Headers
 # Limit the number of concurrent workers
 CONCURRENT_WORKERS = 5
@@ -35,6 +35,21 @@ tokenizer = None
 async def get_page_bs4(url: str, headers):
@@ -187,7 +202,9 @@ def classify_website(url):
         FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
-    urls = [url]
     final_ans_dict = {}
     print('before scrape_websites')

 from curl_cffi.requests import AsyncSession
 from tqdm.asyncio import tqdm
 from fake_headers import Headers
+from urllib.parse import urlparse, urlunparse
 # Limit the number of concurrent workers
 CONCURRENT_WORKERS = 5
def get_main_page_url(url):
    """Return the site root (``scheme://host[:port]``) for *url*.

    Path, params, query and fragment are dropped. Input without a scheme
    (``example.com/page`` or ``//example.com/page``) is treated as a web
    address and given an ``https`` scheme, so the result can be fetched
    directly instead of collapsing to ``''`` or ``//example.com``.

    Args:
        url: Absolute, protocol-relative or scheme-less URL string.

    Returns:
        The main page URL. An empty string is returned when no host can be
        found (e.g. ``''`` or a relative path such as ``/a/b``).
        If parsing fails, a message of the form
        ``"Error processing URL: <reason>"`` is returned instead of raising;
        callers that pass the result on as a URL should check for it.
    """
    try:
        parsed_url = urlparse(url)
        scheme, netloc = parsed_url.scheme, parsed_url.netloc

        if isinstance(url, str) and not scheme:
            if not netloc:
                # urlparse only recognises a host after '//'. Without it,
                # "example.com/page" lands entirely in .path, so re-parse
                # with the marker to recover the host.
                netloc = urlparse('//' + url).netloc
            if netloc:
                # A host without a scheme is not fetchable; assume HTTPS.
                scheme = 'https'

        # Debug trace kept from the original implementation.
        print(netloc)
        return urlunparse((scheme, netloc, '', '', '', ''))
    except Exception as e:
        # Deliberate best-effort: the function never raises (e.g. on a
        # malformed IPv6 literal) and reports the failure as text.
        return f"Error processing URL: {e}"
 async def get_page_bs4(url: str, headers):
         FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
+    main_page_url = get_main_page_url(url)
+    urls = [main_page_url]
     final_ans_dict = {}
     print('before scrape_websites')